From 654d0310007146fae87b0c1a68f81e53ad519b14 Mon Sep 17 00:00:00 2001 From: Etienne Carriere Date: Thu, 20 Apr 2023 09:49:23 +0200 Subject: [PATCH 001/271] optee: fix uninited async notif value Fixes an uninitialized variable in irq_handler() that could lead to unpredictable behavior in case OP-TEE fails to handle SMC function ID OPTEE_SMC_GET_ASYNC_NOTIF_VALUE. This change ensures that in that case get_async_notif_value() properly reports there are no notification event. Reported-by: kernel test robot Link: https://lore.kernel.org/r/202304200755.OoiuclDZ-lkp@intel.com/ Reported-by: Dan Carpenter Link: https://lore.kernel.org/all/d9b7f69b-c737-4cb3-8e74-79fe00c934f9@kili.mountain/ Fixes: 6749e69c4dad ("optee: add asynchronous notifications") Signed-off-by: Etienne Carriere Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/optee/smc_abi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tee/optee/smc_abi.c b/drivers/tee/optee/smc_abi.c index a1c1fa1a9c28..e6e0428f8e7b 100644 --- a/drivers/tee/optee/smc_abi.c +++ b/drivers/tee/optee/smc_abi.c @@ -984,8 +984,10 @@ static u32 get_async_notif_value(optee_invoke_fn *invoke_fn, bool *value_valid, invoke_fn(OPTEE_SMC_GET_ASYNC_NOTIF_VALUE, 0, 0, 0, 0, 0, 0, 0, &res); - if (res.a0) + if (res.a0) { + *value_valid = false; return 0; + } *value_valid = (res.a2 & OPTEE_SMC_ASYNC_NOTIF_VALUE_VALID); *value_pending = (res.a2 & OPTEE_SMC_ASYNC_NOTIF_VALUE_PENDING); return res.a1; From 0cce06ba859a515bd06224085d3addb870608b6d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Apr 2023 17:03:13 +0200 Subject: [PATCH 002/271] debugobjects,locking: Annotate debug_object_fill_pool() wait type violation There is an explicit wait-type violation in debug_object_fill_pool() for PREEMPT_RT=n kernels which allows them to more easily fill the object pool and reduce the chance of allocation failures. Lockdep's wait-type checks are designed to check the PREEMPT_RT locking rules even for PREEMPT_RT=n kernels and object to this, so create a lockdep annotation to allow this to stand. Specifically, create a 'lock' type that overrides the inner wait-type while it is held -- allowing one to temporarily raise it, such that the violation is hidden. Reported-by: Vlastimil Babka Reported-by: Qi Zheng Signed-off-by: Peter Zijlstra (Intel) Tested-by: Qi Zheng Link: https://lkml.kernel.org/r/20230429100614.GA1489784@hirez.programming.kicks-ass.net --- include/linux/lockdep.h | 14 ++++++++++++++ include/linux/lockdep_types.h | 1 + kernel/locking/lockdep.c | 28 +++++++++++++++++++++------- lib/debugobjects.c | 15 +++++++++++++-- 4 files changed, 49 insertions(+), 9 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 1023f349af71..a3329fb49b33 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -339,6 +339,16 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) +/* + * Must use lock_map_aquire_try() with override maps to avoid + * lockdep thinking they participate in the block chain. + */ +#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ + struct lockdep_map _name = { \ + .name = #_name "-wait-type-override", \ + .wait_type_inner = _wait_type, \ + .lock_type = LD_LOCK_WAIT_OVERRIDE, } + #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) @@ -427,6 +437,9 @@ extern int lockdep_is_held(const void *); #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) +#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ + struct lockdep_map __maybe_unused _name = {} + #endif /* !LOCKDEP */ enum xhlock_context_t { @@ -551,6 +564,7 @@ do { \ #define rwsem_release(l, i) lock_release(l, i) #define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) +#define lock_map_acquire_try(l) lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_release(l) lock_release(l, _THIS_IP_) diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index d22430840b53..59f4fb1626ea 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -33,6 +33,7 @@ enum lockdep_wait_type { enum lockdep_lock_type { LD_LOCK_NORMAL = 0, /* normal, catch all */ LD_LOCK_PERCPU, /* percpu */ + LD_LOCK_WAIT_OVERRIDE, /* annotation */ LD_LOCK_MAX, }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 50d4863974e7..62ef295e07e6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2253,6 +2253,9 @@ static inline bool usage_match(struct lock_list *entry, void *mask) static inline bool usage_skip(struct lock_list *entry, void *mask) { + if (entry->class->lock_type == LD_LOCK_NORMAL) + return false; + /* * Skip local_lock() for irq inversion detection. * @@ -2279,14 +2282,16 @@ static inline bool usage_skip(struct lock_list *entry, void *mask) * As a result, we will skip local_lock(), when we search for irq * inversion bugs. */ - if (entry->class->lock_type == LD_LOCK_PERCPU) { - if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG)) - return false; + if (entry->class->lock_type == LD_LOCK_PERCPU && + DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG)) + return false; - return true; - } + /* + * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually + * a lock and only used to override the wait_type. + */ - return false; + return true; } /* @@ -4752,7 +4757,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) for (; depth < curr->lockdep_depth; depth++) { struct held_lock *prev = curr->held_locks + depth; - u8 prev_inner = hlock_class(prev)->wait_type_inner; + struct lock_class *class = hlock_class(prev); + u8 prev_inner = class->wait_type_inner; if (prev_inner) { /* @@ -4762,6 +4768,14 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) * Also due to trylocks. */ curr_inner = min(curr_inner, prev_inner); + + /* + * Allow override for annotations -- this is typically + * only valid/needed for code that only exists when + * CONFIG_PREEMPT_RT=n. + */ + if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE)) + curr_inner = prev_inner; } } diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 003edc5ebd67..826c617b10a7 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -591,10 +591,21 @@ static void debug_objects_fill_pool(void) { /* * On RT enabled kernels the pool refill must happen in preemptible - * context: + * context -- for !RT kernels we rely on the fact that spinlock_t and + * raw_spinlock_t are basically the same type and this lock-type + * inversion works just fine. */ - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to WAIT_SLEEP, matching + * the preemptible() condition above. + */ + static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_SLEEP); + lock_map_acquire_try(&fill_pool_map); fill_pool(); + lock_map_release(&fill_pool_map); + } } static void From 70a640c0efa7667453c3911b13335304ce46ad8b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 26 Apr 2023 15:35:45 +0200 Subject: [PATCH 003/271] regmap: REGMAP_KUNIT should not select REGMAP Enabling a (modular) test should not silently enable additional kernel functionality, as that may increase the attack vector of a product. Fix this by: 1. making REGMAP visible if CONFIG_KUNIT_ALL_TESTS is enabled, 2. making REGMAP_KUNIT depend on REGMAP instead of selecting it. After this, one can safely enable CONFIG_KUNIT_ALL_TESTS=m to build modules for all appropriate tests for ones system, without pulling in extra unwanted functionality, while still allowing a tester to manually enable REGMAP and its test suite on a system where REGMAP is not enabled by default. Fixes: 2238959b6ad27040 ("regmap: Add some basic kunit tests") Signed-off-by: Geert Uytterhoeven Date: Tue, 2 May 2023 12:38:09 +0200 Subject: [PATCH 004/271] phy: qcom-qmp-combo: fix init-count imbalance The init counter is not decremented on initialisation errors, which prevents retrying initialisation and can lead to the runtime suspend callback attempting to disable resources that have never been enabled. Add the missing decrement on initialisation errors so that the counter reflects the state of the device. Fixes: e78f3d15e115 ("phy: qcom-qmp: new qmp phy driver for qcom-chipsets") Cc: stable@vger.kernel.org # 4.12 Signed-off-by: Johan Hovold Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20230502103810.12061-2-johan+linaro@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c index 6850e04c329b..87b17e5877ab 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c @@ -2472,7 +2472,7 @@ static int qmp_combo_com_init(struct qmp_combo *qmp) ret = regulator_bulk_enable(cfg->num_vregs, qmp->vregs); if (ret) { dev_err(qmp->dev, "failed to enable regulators, err=%d\n", ret); - goto err_unlock; + goto err_decrement_count; } ret = reset_control_bulk_assert(cfg->num_resets, qmp->resets); @@ -2522,7 +2522,8 @@ err_assert_reset: reset_control_bulk_assert(cfg->num_resets, qmp->resets); err_disable_regulators: regulator_bulk_disable(cfg->num_vregs, qmp->vregs); -err_unlock: +err_decrement_count: + qmp->init_count--; mutex_unlock(&qmp->phy_mutex); return ret; From e42f110700ed7293700c26145e1ed07ea05ac3f6 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 2 May 2023 12:38:10 +0200 Subject: [PATCH 005/271] phy: qcom-qmp-pcie-msm8996: fix init-count imbalance The init counter is not decremented on initialisation errors, which prevents retrying initialisation. Add the missing decrement on initialisation errors so that the counter reflects the state of the device. Fixes: e78f3d15e115 ("phy: qcom-qmp: new qmp phy driver for qcom-chipsets") Cc: stable@vger.kernel.org # 4.12 Signed-off-by: Johan Hovold Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20230502103810.12061-3-johan+linaro@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-pcie-msm8996.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-pcie-msm8996.c b/drivers/phy/qualcomm/phy-qcom-qmp-pcie-msm8996.c index 09824be088c9..0c603bc06e09 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-pcie-msm8996.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-pcie-msm8996.c @@ -379,7 +379,7 @@ static int qmp_pcie_msm8996_com_init(struct qmp_phy *qphy) ret = regulator_bulk_enable(cfg->num_vregs, qmp->vregs); if (ret) { dev_err(qmp->dev, "failed to enable regulators, err=%d\n", ret); - goto err_unlock; + goto err_decrement_count; } ret = reset_control_bulk_assert(cfg->num_resets, qmp->resets); @@ -409,7 +409,8 @@ err_assert_reset: reset_control_bulk_assert(cfg->num_resets, qmp->resets); err_disable_regulators: regulator_bulk_disable(cfg->num_vregs, qmp->vregs); -err_unlock: +err_decrement_count: + qmp->init_count--; mutex_unlock(&qmp->phy_mutex); return ret; From 44e8d5ad2dc01529eb1316b1521f24ac4aac8eaf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Apr 2023 09:33:49 -1000 Subject: [PATCH 006/271] firmware: arm_scmi: Fix incorrect alloc_workqueue() invocation scmi_xfer_raw_worker_init() is specifying a flag, WQ_SYSFS, as @max_active. Fix it by or'ing WQ_SYSFS into @flags so that it actually enables sysfs interface and using 0 for @max_active for the default setting. Signed-off-by: Tejun Heo Fixes: 3c3d818a9317 ("firmware: arm_scmi: Add core raw transmission support") Link: https://lore.kernel.org/r/ZEGTnajiQm7mkkZS@slm.duckdns.org Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/raw_mode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_scmi/raw_mode.c b/drivers/firmware/arm_scmi/raw_mode.c index d40df099fd51..6971dcf72fb9 100644 --- a/drivers/firmware/arm_scmi/raw_mode.c +++ b/drivers/firmware/arm_scmi/raw_mode.c @@ -1066,7 +1066,7 @@ static int scmi_xfer_raw_worker_init(struct scmi_raw_mode_info *raw) raw->wait_wq = alloc_workqueue("scmi-raw-wait-wq-%d", WQ_UNBOUND | WQ_FREEZABLE | - WQ_HIGHPRI, WQ_SYSFS, raw->id); + WQ_HIGHPRI | WQ_SYSFS, 0, raw->id); if (!raw->wait_wq) return -ENOMEM; From 328acc5657c6197753238d7ce0a6924ead829347 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 23 Apr 2023 17:08:37 +0200 Subject: [PATCH 007/271] ARM: dts: vexpress: add missing cache properties As all level 2 and level 3 caches are unified, add required cache-unified property to fix warnings like: vexpress-v2p-ca5s.dtb: cache-controller@2c0f0000: 'cache-unified' is a required property Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20230423150837.118466-1-krzysztof.kozlowski@linaro.org Signed-off-by: Sudeep Holla --- arch/arm/boot/dts/vexpress-v2p-ca5s.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/vexpress-v2p-ca5s.dts b/arch/arm/boot/dts/vexpress-v2p-ca5s.dts index 3b88209bacea..ff1f9a1bcfcf 100644 --- a/arch/arm/boot/dts/vexpress-v2p-ca5s.dts +++ b/arch/arm/boot/dts/vexpress-v2p-ca5s.dts @@ -132,6 +132,7 @@ reg = <0x2c0f0000 0x1000>; interrupts = <0 84 4>; cache-level = <2>; + cache-unified; }; pmu { From 55b37d9c8ba23d28c584aef0801fb1e60e4a817c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 22 Apr 2023 00:32:12 +0200 Subject: [PATCH 008/271] arm64: dts: arm: add missing cache properties As all level 2 and level 3 caches are unified, add required cache-unified properties to fix warnings like: foundation-v8.dtb: l2-cache0: 'cache-unified' is a required property Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20230421223213.115639-1-krzysztof.kozlowski@linaro.org Signed-off-by: Sudeep Holla --- arch/arm64/boot/dts/arm/foundation-v8.dtsi | 1 + arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts | 1 + arch/arm64/boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts | 1 + 3 files changed, 3 insertions(+) diff --git a/arch/arm64/boot/dts/arm/foundation-v8.dtsi b/arch/arm64/boot/dts/arm/foundation-v8.dtsi index 029578072d8f..7b41537731a6 100644 --- a/arch/arm64/boot/dts/arm/foundation-v8.dtsi +++ b/arch/arm64/boot/dts/arm/foundation-v8.dtsi @@ -59,6 +59,7 @@ L2_0: l2-cache0 { compatible = "cache"; cache-level = <2>; + cache-unified; }; }; diff --git a/arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts b/arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts index ef68f5aae7dd..afdf954206f1 100644 --- a/arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts +++ b/arch/arm64/boot/dts/arm/rtsm_ve-aemv8a.dts @@ -72,6 +72,7 @@ L2_0: l2-cache0 { compatible = "cache"; cache-level = <2>; + cache-unified; }; }; diff --git a/arch/arm64/boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts b/arch/arm64/boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts index 796cd7d02eb5..7bdeb965f0a9 100644 --- a/arch/arm64/boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts +++ b/arch/arm64/boot/dts/arm/vexpress-v2f-1xv7-ca53x2.dts @@ -58,6 +58,7 @@ L2_0: l2-cache0 { compatible = "cache"; cache-level = <2>; + cache-unified; }; }; From b71b55248a580e9c9befc4ae060539f1f8e477da Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 20 Apr 2023 16:06:01 +0100 Subject: [PATCH 009/271] firmware: arm_ffa: Check if ffa_driver remove is present before executing Currently ffa_drv->remove() is called unconditionally from ffa_device_remove(). Since the driver registration doesn't check for it and allows it to be registered without .remove callback, we need to check for the presence of it before executing it from ffa_device_remove() to above a NULL pointer dereference like the one below: | Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 | Mem abort info: | ESR = 0x0000000086000004 | EC = 0x21: IABT (current EL), IL = 32 bits | SET = 0, FnV = 0 | EA = 0, S1PTW = 0 | FSC = 0x04: level 0 translation fault | user pgtable: 4k pages, 48-bit VAs, pgdp=0000000881cc8000 | [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 | Internal error: Oops: 0000000086000004 [#1] PREEMPT SMP | CPU: 3 PID: 130 Comm: rmmod Not tainted 6.3.0-rc7 #6 | Hardware name: FVP Base RevC (DT) | pstate: 63402809 (nZCv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=-c) | pc : 0x0 | lr : ffa_device_remove+0x20/0x2c | Call trace: | 0x0 | device_release_driver_internal+0x16c/0x260 | driver_detach+0x90/0xd0 | bus_remove_driver+0xdc/0x11c | driver_unregister+0x30/0x54 | ffa_driver_unregister+0x14/0x20 | cleanup_module+0x18/0xeec | __arm64_sys_delete_module+0x234/0x378 | invoke_syscall+0x40/0x108 | el0_svc_common+0xb4/0xf0 | do_el0_svc+0x30/0xa4 | el0_svc+0x2c/0x7c | el0t_64_sync_handler+0x84/0xf0 | el0t_64_sync+0x190/0x194 Fixes: 244f5d597e1e ("firmware: arm_ffa: Add missing remove callback to ffa_bus_type") Link: https://lore.kernel.org/r/20230419-ffa_fixes_6-4-v2-1-d9108e43a176@arm.com Signed-off-by: Sudeep Holla --- drivers/firmware/arm_ffa/bus.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/arm_ffa/bus.c b/drivers/firmware/arm_ffa/bus.c index f29d77ecf72d..36bd5423c2f0 100644 --- a/drivers/firmware/arm_ffa/bus.c +++ b/drivers/firmware/arm_ffa/bus.c @@ -53,7 +53,8 @@ static void ffa_device_remove(struct device *dev) { struct ffa_driver *ffa_drv = to_ffa_driver(dev->driver); - ffa_drv->remove(to_ffa_dev(dev)); + if (ffa_drv->remove) + ffa_drv->remove(to_ffa_dev(dev)); } static int ffa_device_uevent(const struct device *dev, struct kobj_uevent_env *env) From c6e045361a27ecd4fac6413164e0d091d80eee99 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 20 Apr 2023 16:06:02 +0100 Subject: [PATCH 010/271] firmware: arm_ffa: Fix usage of partition info get count flag Commit bb1be7498500 ("firmware: arm_ffa: Add v1.1 get_partition_info support") adds support to discovery the UUIDs of the partitions or just fetch the partition count using the PARTITION_INFO_GET_RETURN_COUNT_ONLY flag. However the commit doesn't handle the fact that the older version doesn't understand the flag and must be MBZ which results in firmware returning invalid parameter error. That results in the failure of the driver probe which is in correct. Limit the usage of the PARTITION_INFO_GET_RETURN_COUNT_ONLY flag for the versions above v1.0(i.e v1.1 and onwards) which fixes the issue. Fixes: bb1be7498500 ("firmware: arm_ffa: Add v1.1 get_partition_info support") Reported-by: Jens Wiklander Reported-by: Marc Bonnici Tested-by: Jens Wiklander Reviewed-by: Jens Wiklander Link: https://lore.kernel.org/r/20230419-ffa_fixes_6-4-v2-2-d9108e43a176@arm.com Signed-off-by: Sudeep Holla --- drivers/firmware/arm_ffa/driver.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c index fa85c64d3ded..4aced2e5b772 100644 --- a/drivers/firmware/arm_ffa/driver.c +++ b/drivers/firmware/arm_ffa/driver.c @@ -193,7 +193,8 @@ __ffa_partition_info_get(u32 uuid0, u32 uuid1, u32 uuid2, u32 uuid3, int idx, count, flags = 0, sz, buf_sz; ffa_value_t partition_info; - if (!buffer || !num_partitions) /* Just get the count for now */ + if (drv_info->version > FFA_VERSION_1_0 && + (!buffer || !num_partitions)) /* Just get the count for now */ flags = PARTITION_INFO_GET_RETURN_COUNT_ONLY; mutex_lock(&drv_info->rx_lock); From 19b8766459c41c6f318f8a548cc1c66dffd18363 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 20 Apr 2023 16:06:03 +0100 Subject: [PATCH 011/271] firmware: arm_ffa: Fix FFA device names for logical partitions Each physical partition can provide multiple services each with UUID. Each such service can be presented as logical partition with a unique combination of VM ID and UUID. The number of distinct UUID in a system will be less than or equal to the number of logical partitions. However, currently it fails to register more than one logical partition or service within a physical partition as the device name contains only VM ID while both VM ID and UUID are maintained in the partition information. The kernel complains with the below message: | sysfs: cannot create duplicate filename '/devices/arm-ffa-8001' | CPU: 1 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc7 #8 | Hardware name: FVP Base RevC (DT) | Call trace: | dump_backtrace+0xf8/0x118 | show_stack+0x18/0x24 | dump_stack_lvl+0x50/0x68 | dump_stack+0x18/0x24 | sysfs_create_dir_ns+0xe0/0x13c | kobject_add_internal+0x220/0x3d4 | kobject_add+0x94/0x100 | device_add+0x144/0x5d8 | device_register+0x20/0x30 | ffa_device_register+0x88/0xd8 | ffa_setup_partitions+0x108/0x1b8 | ffa_init+0x2ec/0x3a4 | do_one_initcall+0xcc/0x240 | do_initcall_level+0x8c/0xac | do_initcalls+0x54/0x94 | do_basic_setup+0x1c/0x28 | kernel_init_freeable+0x100/0x16c | kernel_init+0x20/0x1a0 | ret_from_fork+0x10/0x20 | kobject_add_internal failed for arm-ffa-8001 with -EEXIST, don't try to | register things with the same name in the same directory. | arm_ffa arm-ffa: unable to register device arm-ffa-8001 err=-17 | ARM FF-A: ffa_setup_partitions: failed to register partition ID 0x8001 By virtue of being random enough to avoid collisions when generated in a distributed system, there is no way to compress UUID keys to the number of bits required to identify each. We can eliminate '-' in the name but it is not worth eliminating 4 bytes and add unnecessary logic for doing that. Also v1.0 doesn't provide the UUID of the partitions which makes it hard to use the same for the device name. So to keep it simple, let us alloc an ID using ida_alloc() and append the same to "arm-ffa" to make up a unique device name. Also stash the id value in ffa_dev to help freeing the ID later when the device is destroyed. Fixes: e781858488b9 ("firmware: arm_ffa: Add initial FFA bus support for device enumeration") Reported-by: Lucian Paul-Trifu Link: https://lore.kernel.org/r/20230419-ffa_fixes_6-4-v2-3-d9108e43a176@arm.com Signed-off-by: Sudeep Holla --- drivers/firmware/arm_ffa/bus.c | 18 ++++++++++++++---- include/linux/arm_ffa.h | 1 + 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/firmware/arm_ffa/bus.c b/drivers/firmware/arm_ffa/bus.c index 36bd5423c2f0..2b8bfcd010f5 100644 --- a/drivers/firmware/arm_ffa/bus.c +++ b/drivers/firmware/arm_ffa/bus.c @@ -15,6 +15,8 @@ #include "common.h" +static DEFINE_IDA(ffa_bus_id); + static int ffa_device_match(struct device *dev, struct device_driver *drv) { const struct ffa_device_id *id_table; @@ -131,6 +133,7 @@ static void ffa_release_device(struct device *dev) { struct ffa_device *ffa_dev = to_ffa_dev(dev); + ida_free(&ffa_bus_id, ffa_dev->id); kfree(ffa_dev); } @@ -171,18 +174,24 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, const struct ffa_ops *ops) { - int ret; + int id, ret; struct device *dev; struct ffa_device *ffa_dev; - ffa_dev = kzalloc(sizeof(*ffa_dev), GFP_KERNEL); - if (!ffa_dev) + id = ida_alloc_min(&ffa_bus_id, 1, GFP_KERNEL); + if (id < 0) return NULL; + ffa_dev = kzalloc(sizeof(*ffa_dev), GFP_KERNEL); + if (!ffa_dev) { + ida_free(&ffa_bus_id, id); + return NULL; + } + dev = &ffa_dev->dev; dev->bus = &ffa_bus_type; dev->release = ffa_release_device; - dev_set_name(&ffa_dev->dev, "arm-ffa-%04x", vm_id); + dev_set_name(&ffa_dev->dev, "arm-ffa-%d", id); ffa_dev->vm_id = vm_id; ffa_dev->ops = ops; @@ -218,4 +227,5 @@ void arm_ffa_bus_exit(void) { ffa_devices_unregister(); bus_unregister(&ffa_bus_type); + ida_destroy(&ffa_bus_id); } diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index c87aeecaa9b2..583fe3b49a49 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -96,6 +96,7 @@ /* FFA Bus/Device/Driver related */ struct ffa_device { + u32 id; int vm_id; bool mode_32bit; uuid_t uuid; From 111a833dc5cbef3d05b2a796a7e23cb7f6ff2192 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 3 May 2023 14:12:52 +0100 Subject: [PATCH 012/271] firmware: arm_ffa: Set reserved/MBZ fields to zero in the memory descriptors The transmit buffers allocated by the driver can be used to transmit data by any messages/commands needing the buffer. However, it is not guaranteed to have been zero-ed before every new transmission and hence it will just contain residual value from the previous transmission. There are several reserved fields in the memory descriptors that must be zero(MBZ). The receiver can reject the transmission if any such MBZ fields are non-zero. While we can set the whole page to zero, it is not optimal as most of the fields get initialised to the value required for the current transmission. So, just set the reserved/MBZ fields to zero in the memory descriptors explicitly to honour the requirement and keep the receiver happy. Fixes: cc2195fe536c ("firmware: arm_ffa: Add support for MEM_* interfaces") Reported-by: Marc Bonnici Link: https://lore.kernel.org/r/20230503131252.12585-1-sudeep.holla@arm.com Signed-off-by: Sudeep Holla --- drivers/firmware/arm_ffa/driver.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c index 4aced2e5b772..e23409138667 100644 --- a/drivers/firmware/arm_ffa/driver.c +++ b/drivers/firmware/arm_ffa/driver.c @@ -421,12 +421,17 @@ ffa_setup_and_transmit(u32 func_id, void *buffer, u32 max_fragsize, ep_mem_access->receiver = args->attrs[idx].receiver; ep_mem_access->attrs = args->attrs[idx].attrs; ep_mem_access->composite_off = COMPOSITE_OFFSET(args->nattrs); + ep_mem_access->flag = 0; + ep_mem_access->reserved = 0; } + mem_region->reserved_0 = 0; + mem_region->reserved_1 = 0; mem_region->ep_count = args->nattrs; composite = buffer + COMPOSITE_OFFSET(args->nattrs); composite->total_pg_cnt = ffa_get_num_pages_sg(args->sg); composite->addr_range_cnt = num_entries; + composite->reserved = 0; length = COMPOSITE_CONSTITUENTS_OFFSET(args->nattrs, num_entries); frag_len = COMPOSITE_CONSTITUENTS_OFFSET(args->nattrs, 0); @@ -461,6 +466,7 @@ ffa_setup_and_transmit(u32 func_id, void *buffer, u32 max_fragsize, constituents->address = sg_phys(args->sg); constituents->pg_cnt = args->sg->length / FFA_PAGE_SIZE; + constituents->reserved = 0; constituents++; frag_len += sizeof(struct ffa_mem_region_addr_range); } while ((args->sg = sg_next(args->sg))); From eb4b8eca1bad98f4b8574558a74f041f9acb5a54 Mon Sep 17 00:00:00 2001 From: Milo Spadacini Date: Mon, 8 May 2023 15:18:48 +0200 Subject: [PATCH 013/271] tools: gpio: fix debounce_period_us output of lsgpio Fix incorrect output that could occur when more attributes are used and GPIO_V2_LINE_ATTR_ID_DEBOUNCE is not the first one. Signed-off-by: Milo Spadacini Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- tools/gpio/lsgpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/gpio/lsgpio.c b/tools/gpio/lsgpio.c index c61d061247e1..52a0be45410c 100644 --- a/tools/gpio/lsgpio.c +++ b/tools/gpio/lsgpio.c @@ -94,7 +94,7 @@ static void print_attributes(struct gpio_v2_line_info *info) for (i = 0; i < info->num_attrs; i++) { if (info->attrs[i].id == GPIO_V2_LINE_ATTR_ID_DEBOUNCE) fprintf(stdout, ", debounce_period=%dusec", - info->attrs[0].debounce_period_us); + info->attrs[i].debounce_period_us); } } From f67bc15e526bb9920683ad6c1891ff9e08981335 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 21 Apr 2023 13:42:41 +0300 Subject: [PATCH 014/271] coresight: Fix signedness bug in tmc_etr_buf_insert_barrier_packet() This code generates a Smatch warning: drivers/hwtracing/coresight/coresight-tmc-etr.c:947 tmc_etr_buf_insert_barrier_packet() error: uninitialized symbol 'bufp'. The problem is that if tmc_sg_table_get_data() returns -EINVAL, then when we test if "len < CORESIGHT_BARRIER_PKT_SIZE", the negative "len" value is type promoted to a high unsigned long value which is greater than CORESIGHT_BARRIER_PKT_SIZE. Fix this bug by adding an explicit check for error codes. Fixes: 75f4e3619fe2 ("coresight: tmc-etr: Add transparent buffer management") Signed-off-by: Dan Carpenter Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/7d33e244-d8b9-4c27-9653-883a13534b01@kili.mountain --- drivers/hwtracing/coresight/coresight-tmc-etr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c index 918d461fcf4a..eaa296ced167 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c @@ -942,7 +942,7 @@ tmc_etr_buf_insert_barrier_packet(struct etr_buf *etr_buf, u64 offset) len = tmc_etr_buf_get_data(etr_buf, offset, CORESIGHT_BARRIER_PKT_SIZE, &bufp); - if (WARN_ON(len < CORESIGHT_BARRIER_PKT_SIZE)) + if (WARN_ON(len < 0 || len < CORESIGHT_BARRIER_PKT_SIZE)) return -EINVAL; coresight_insert_barrier_packet(bufp); return offset + CORESIGHT_BARRIER_PKT_SIZE; From 04ac7f98b92181179ea84439642493f3826d04a2 Mon Sep 17 00:00:00 2001 From: Ruidong Tian Date: Tue, 25 Apr 2023 11:24:16 +0800 Subject: [PATCH 015/271] coresight: perf: Release Coresight path when alloc trace id failed Error handler for etm_setup_aux can not release coresight path because cpu mask was cleared when coresight_trace_id_get_cpu_id failed. Call coresight_release_path function explicitly when alloc trace id filed. Fixes: 4ff1fdb4125c4 ("coresight: perf: traceid: Add perf ID allocation and notifiers") Signed-off-by: Ruidong Tian Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20230425032416.125542-1-tianruidong@linux.alibaba.com --- drivers/hwtracing/coresight/coresight-etm-perf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c index 711f451b6946..89e8ed214ea4 100644 --- a/drivers/hwtracing/coresight/coresight-etm-perf.c +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c @@ -402,6 +402,7 @@ static void *etm_setup_aux(struct perf_event *event, void **pages, trace_id = coresight_trace_id_get_cpu_id(cpu); if (!IS_VALID_CS_TRACE_ID(trace_id)) { cpumask_clear_cpu(cpu, mask); + coresight_release_path(path); continue; } From 976d3c6778e99390c6d854d140b746d12ea18a51 Mon Sep 17 00:00:00 2001 From: Mirsad Todorovac Date: Tue, 11 Apr 2023 20:15:20 +0300 Subject: [PATCH 016/271] selftests: gpio: gpio-sim: Fix BUG: test FAILED due to recent change According to Mirsad the gpio-sim.sh test appears to FAIL in a wrong way due to missing initialisation of shell variables: 4.2. Bias settings work correctly cat: /sys/devices/platform/gpio-sim.0/gpiochip18/sim_gpio0/value: No such file or directory ./gpio-sim.sh: line 393: test: =: unary operator expected bias setting does not work GPIO gpio-sim test FAIL After this change the test passed: 4.2. Bias settings work correctly GPIO gpio-sim test PASS His testing environment is AlmaLinux 8.7 on Lenovo desktop box with the latest Linux kernel based on v6.2: Linux 6.2.0-mglru-kmlk-andy-09238-gd2980d8d8265 x86_64 Suggested-by: Mirsad Todorovac Signed-off-by: Andy Shevchenko Tested-by: Mirsad Goran Todorovac Signed-off-by: Mirsad Goran Todorovac Signed-off-by: Bartosz Golaszewski --- tools/testing/selftests/gpio/gpio-sim.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/gpio/gpio-sim.sh b/tools/testing/selftests/gpio/gpio-sim.sh index 9f539d454ee4..fa2ce2b9dd5f 100755 --- a/tools/testing/selftests/gpio/gpio-sim.sh +++ b/tools/testing/selftests/gpio/gpio-sim.sh @@ -389,6 +389,9 @@ create_chip chip create_bank chip bank set_num_lines chip bank 8 enable_chip chip +DEVNAME=`configfs_dev_name chip` +CHIPNAME=`configfs_chip_name chip bank` +SYSFS_PATH="/sys/devices/platform/$DEVNAME/$CHIPNAME/sim_gpio0/value" $BASE_DIR/gpio-mockup-cdev -b pull-up /dev/`configfs_chip_name chip bank` 0 test `cat $SYSFS_PATH` = "1" || fail "bias setting does not work" remove_chip chip From ad81e23426a651eb89a4b306e1c4169e6308c124 Mon Sep 17 00:00:00 2001 From: Jocelyn Falempe Date: Wed, 10 May 2023 15:10:34 +0200 Subject: [PATCH 017/271] drm/mgag200: Fix gamma lut not initialized. When mgag200 switched from simple KMS to regular atomic helpers, the initialization of the gamma settings was lost. This leads to a black screen, if the bios/uefi doesn't use the same pixel color depth. v2: rebase on top of drm-misc-fixes, and add Cc stable tag. Link: https://bugzilla.redhat.com/show_bug.cgi?id=2171155 Fixes: 1baf9127c482 ("drm/mgag200: Replace simple-KMS with regular atomic helpers") Cc: Tested-by: Phil Oester Reviewed-by: Thomas Zimmermann Signed-off-by: Jocelyn Falempe Link: https://patchwork.freedesktop.org/patch/msgid/20230510131034.284078-1-jfalempe@redhat.com --- drivers/gpu/drm/mgag200/mgag200_mode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/mgag200/mgag200_mode.c b/drivers/gpu/drm/mgag200/mgag200_mode.c index 0a5aaf78172a..576c4c838a33 100644 --- a/drivers/gpu/drm/mgag200/mgag200_mode.c +++ b/drivers/gpu/drm/mgag200/mgag200_mode.c @@ -640,6 +640,11 @@ void mgag200_crtc_helper_atomic_enable(struct drm_crtc *crtc, struct drm_atomic_ if (funcs->pixpllc_atomic_update) funcs->pixpllc_atomic_update(crtc, old_state); + if (crtc_state->gamma_lut) + mgag200_crtc_set_gamma(mdev, format, crtc_state->gamma_lut->data); + else + mgag200_crtc_set_gamma_linear(mdev, format); + mgag200_enable_display(mdev); if (funcs->enable_vidrst) From 3b8803494a0612acdeee714cb72aa142b1e05ce5 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 11 May 2023 15:19:05 +0300 Subject: [PATCH 018/271] PCI/DPC: Quirk PIO log size for Intel Ice Lake Root Ports Commit 5459c0b70467 ("PCI/DPC: Quirk PIO log size for certain Intel Root Ports") added quirks for Tiger and Alder Lake Root Ports but missed that the same issue exists also in the previous generation, Ice Lake. Apply the quirk for Ice Lake Root Ports as well. This prevents kernel complaints like: DPC: RP PIO log size 0 is invalid and also enables the DPC driver to dump the RP PIO Log registers when DPC is triggered. [bhelgaas: add dmesg warning and RP PIO Log dump info] Closes: https://bugzilla.kernel.org/show_bug.cgi?id=209943 Link: https://lore.kernel.org/r/20230511121905.73949-1-mika.westerberg@linux.intel.com Reported-by: Mark Blakeney Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas --- drivers/pci/quirks.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index f4e2a88729fd..c525867760bf 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -6003,8 +6003,9 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x56c1, aspm_l1_acceptable_latency #ifdef CONFIG_PCIE_DPC /* - * Intel Tiger Lake and Alder Lake BIOS has a bug that clears the DPC - * RP PIO Log Size of the integrated Thunderbolt PCIe Root Ports. + * Intel Ice Lake, Tiger Lake and Alder Lake BIOS has a bug that clears + * the DPC RP PIO Log Size of the integrated Thunderbolt PCIe Root + * Ports. */ static void dpc_log_size(struct pci_dev *dev) { @@ -6027,6 +6028,10 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x461f, dpc_log_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x462f, dpc_log_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x463f, dpc_log_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x466e, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8a1d, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8a1f, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8a21, dpc_log_size); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8a23, dpc_log_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a23, dpc_log_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a25, dpc_log_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x9a27, dpc_log_size); From 08c7f09356e45d093d1867c7a3c6ac6526e2f98b Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Sun, 7 May 2023 11:29:29 -0700 Subject: [PATCH 019/271] RDMA/bnxt_re: Fix the page_size used during the MR creation Driver populates the list of pages used for Memory region wrongly when page size is more than system page size. This is causing a failure when some of the applications that creates MR with page size as 2M. Since HW can support multiple page sizes, pass the correct page size while creating the MR. Also, driver need not adjust the number of pages when HW Queues are created with user memory. It should work with the number of dma blocks returned by ib_umem_num_dma_blocks. Fix this calculation also. Fixes: 0c4dcd602817 ("RDMA/bnxt_re: Refactor hardware queue memory allocation") Fixes: f6919d56388c ("RDMA/bnxt_re: Code refactor while populating user MRs") Link: https://lore.kernel.org/r/1683484169-9539-1-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Kalesh AP Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_res.c | 12 ++---------- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 7 +++---- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 126d4f26f75a..81b0c5e879f9 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -215,17 +215,9 @@ int bnxt_qplib_alloc_init_hwq(struct bnxt_qplib_hwq *hwq, return -EINVAL; hwq_attr->sginfo->npages = npages; } else { - unsigned long sginfo_num_pages = ib_umem_num_dma_blocks( - hwq_attr->sginfo->umem, hwq_attr->sginfo->pgsize); - + npages = ib_umem_num_dma_blocks(hwq_attr->sginfo->umem, + hwq_attr->sginfo->pgsize); hwq->is_user = true; - npages = sginfo_num_pages; - npages = (npages * PAGE_SIZE) / - BIT_ULL(hwq_attr->sginfo->pgshft); - if ((sginfo_num_pages * PAGE_SIZE) % - BIT_ULL(hwq_attr->sginfo->pgshft)) - if (!npages) - npages++; } if (npages == MAX_PBL_LVL_0_PGS && !hwq_attr->sginfo->nopte) { diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 1714a1e23113..b967a17a44be 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -617,16 +617,15 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, /* Free the hwq if it already exist, must be a rereg */ if (mr->hwq.max_elements) bnxt_qplib_free_hwq(res, &mr->hwq); - /* Use system PAGE_SIZE */ hwq_attr.res = res; hwq_attr.depth = pages; - hwq_attr.stride = buf_pg_size; + hwq_attr.stride = sizeof(dma_addr_t); hwq_attr.type = HWQ_TYPE_MR; hwq_attr.sginfo = &sginfo; hwq_attr.sginfo->umem = umem; hwq_attr.sginfo->npages = pages; - hwq_attr.sginfo->pgsize = PAGE_SIZE; - hwq_attr.sginfo->pgshft = PAGE_SHIFT; + hwq_attr.sginfo->pgsize = buf_pg_size; + hwq_attr.sginfo->pgshft = ilog2(buf_pg_size); rc = bnxt_qplib_alloc_init_hwq(&mr->hwq, &hwq_attr); if (rc) { dev_err(&res->pdev->dev, From 78b6a9a3f445e121ca08195084206cb8faa6999f Mon Sep 17 00:00:00 2001 From: Haoyue Xu Date: Sat, 6 May 2023 15:06:04 +0800 Subject: [PATCH 020/271] MAINTAINERS: Update maintainers of HiSilicon RoCE Wenpeng has moved to other technical areas, and Junxian will take over his responsibilities in maintaining this module. Link: https://lore.kernel.org/r/20230506070604.2982542-1-xuhaoyue1@hisilicon.com Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7e0b87d5aa2e..c56043737848 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9360,7 +9360,7 @@ F: drivers/crypto/hisilicon/zip/ HISILICON ROCE DRIVER M: Haoyue Xu -M: Wenpeng Liang +M: Junxian Huang L: linux-rdma@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/infiniband/hisilicon-hns-roce.txt From bdc1c5fac982845a58d28690cdb56db8c88a530d Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Fri, 5 May 2023 20:30:20 +0000 Subject: [PATCH 021/271] binder: fix UAF caused by faulty buffer cleanup In binder_transaction_buffer_release() the 'failed_at' offset indicates the number of objects to clean up. However, this function was changed by commit 44d8047f1d87 ("binder: use standard functions to allocate fds"), to release all the objects in the buffer when 'failed_at' is zero. This introduced an issue when a transaction buffer is released without any objects having been processed so far. In this case, 'failed_at' is indeed zero yet it is misinterpreted as releasing the entire buffer. This leads to use-after-free errors where nodes are incorrectly freed and subsequently accessed. Such is the case in the following KASAN report: ================================================================== BUG: KASAN: slab-use-after-free in binder_thread_read+0xc40/0x1f30 Read of size 8 at addr ffff4faf037cfc58 by task poc/474 CPU: 6 PID: 474 Comm: poc Not tainted 6.3.0-12570-g7df047b3f0aa #5 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x94/0xec show_stack+0x18/0x24 dump_stack_lvl+0x48/0x60 print_report+0xf8/0x5b8 kasan_report+0xb8/0xfc __asan_load8+0x9c/0xb8 binder_thread_read+0xc40/0x1f30 binder_ioctl+0xd9c/0x1768 __arm64_sys_ioctl+0xd4/0x118 invoke_syscall+0x60/0x188 [...] Allocated by task 474: kasan_save_stack+0x3c/0x64 kasan_set_track+0x2c/0x40 kasan_save_alloc_info+0x24/0x34 __kasan_kmalloc+0xb8/0xbc kmalloc_trace+0x48/0x5c binder_new_node+0x3c/0x3a4 binder_transaction+0x2b58/0x36f0 binder_thread_write+0x8e0/0x1b78 binder_ioctl+0x14a0/0x1768 __arm64_sys_ioctl+0xd4/0x118 invoke_syscall+0x60/0x188 [...] Freed by task 475: kasan_save_stack+0x3c/0x64 kasan_set_track+0x2c/0x40 kasan_save_free_info+0x38/0x5c __kasan_slab_free+0xe8/0x154 __kmem_cache_free+0x128/0x2bc kfree+0x58/0x70 binder_dec_node_tmpref+0x178/0x1fc binder_transaction_buffer_release+0x430/0x628 binder_transaction+0x1954/0x36f0 binder_thread_write+0x8e0/0x1b78 binder_ioctl+0x14a0/0x1768 __arm64_sys_ioctl+0xd4/0x118 invoke_syscall+0x60/0x188 [...] ================================================================== In order to avoid these issues, let's always calculate the intended 'failed_at' offset beforehand. This is renamed and wrapped in a helper function to make it clear and convenient. Fixes: 32e9f56a96d8 ("binder: don't detect sender/target during buffer cleanup") Reported-by: Zi Fan Tan Cc: stable@vger.kernel.org Signed-off-by: Carlos Llamas Acked-by: Todd Kjos Link: https://lore.kernel.org/r/20230505203020.4101154-1-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index fb56bfc45096..8fb7672021ee 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1934,24 +1934,23 @@ static void binder_deferred_fd_close(int fd) static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, - binder_size_t failed_at, + binder_size_t off_end_offset, bool is_failure) { int debug_id = buffer->debug_id; - binder_size_t off_start_offset, buffer_offset, off_end_offset; + binder_size_t off_start_offset, buffer_offset; binder_debug(BINDER_DEBUG_TRANSACTION, "%d buffer release %d, size %zd-%zd, failed at %llx\n", proc->pid, buffer->debug_id, buffer->data_size, buffer->offsets_size, - (unsigned long long)failed_at); + (unsigned long long)off_end_offset); if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); off_start_offset = ALIGN(buffer->data_size, sizeof(void *)); - off_end_offset = is_failure && failed_at ? failed_at : - off_start_offset + buffer->offsets_size; + for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; @@ -2111,6 +2110,21 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, } } +/* Clean up all the objects in the buffer */ +static inline void binder_release_entire_buffer(struct binder_proc *proc, + struct binder_thread *thread, + struct binder_buffer *buffer, + bool is_failure) +{ + binder_size_t off_end_offset; + + off_end_offset = ALIGN(buffer->data_size, sizeof(void *)); + off_end_offset += buffer->offsets_size; + + binder_transaction_buffer_release(proc, thread, buffer, + off_end_offset, is_failure); +} + static int binder_translate_binder(struct flat_binder_object *fp, struct binder_transaction *t, struct binder_thread *thread) @@ -2806,7 +2820,7 @@ static int binder_proc_transaction(struct binder_transaction *t, t_outdated->buffer = NULL; buffer->transaction = NULL; trace_binder_transaction_update_buffer_release(buffer); - binder_transaction_buffer_release(proc, NULL, buffer, 0, 0); + binder_release_entire_buffer(proc, NULL, buffer, false); binder_alloc_free_buf(&proc->alloc, buffer); kfree(t_outdated); binder_stats_deleted(BINDER_STAT_TRANSACTION); @@ -3775,7 +3789,7 @@ binder_free_buf(struct binder_proc *proc, binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); - binder_transaction_buffer_release(proc, thread, buffer, 0, is_failure); + binder_release_entire_buffer(proc, thread, buffer, is_failure); binder_alloc_free_buf(&proc->alloc, buffer); } From b15655b12ddca7ade09807f790bafb6fab61b50a Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Tue, 2 May 2023 20:12:17 +0000 Subject: [PATCH 022/271] Revert "binder_alloc: add missing mmap_lock calls when using the VMA" This reverts commit 44e602b4e52f70f04620bbbf4fe46ecb40170bde. This caused a performance regression particularly when pages are getting reclaimed. We don't need to acquire the mmap_lock to determine when the binder buffer has been fully initialized. A subsequent patch will bring back the lockless approach for this. [cmllamas: resolved trivial conflicts with renaming of alloc->mm] Fixes: 44e602b4e52f ("binder_alloc: add missing mmap_lock calls when using the VMA") Cc: Liam Howlett Cc: Suren Baghdasaryan Cc: stable@vger.kernel.org Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20230502201220.1756319-1-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 55a3c3c2409f..92c814ec44fe 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -380,15 +380,12 @@ static struct binder_buffer *binder_alloc_new_buf_locked( size_t size, data_offsets_size; int ret; - mmap_read_lock(alloc->mm); if (!binder_alloc_get_vma(alloc)) { - mmap_read_unlock(alloc->mm); binder_alloc_debug(BINDER_DEBUG_USER_ERROR, "%d: binder_alloc_buf, no vma\n", alloc->pid); return ERR_PTR(-ESRCH); } - mmap_read_unlock(alloc->mm); data_offsets_size = ALIGN(data_size, sizeof(void *)) + ALIGN(offsets_size, sizeof(void *)); @@ -916,25 +913,17 @@ void binder_alloc_print_pages(struct seq_file *m, * Make sure the binder_alloc is fully initialized, otherwise we might * read inconsistent state. */ - - mmap_read_lock(alloc->mm); - if (binder_alloc_get_vma(alloc) == NULL) { - mmap_read_unlock(alloc->mm); - goto uninitialized; + if (binder_alloc_get_vma(alloc) != NULL) { + for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { + page = &alloc->pages[i]; + if (!page->page_ptr) + free++; + else if (list_empty(&page->lru)) + active++; + else + lru++; + } } - - mmap_read_unlock(alloc->mm); - for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { - page = &alloc->pages[i]; - if (!page->page_ptr) - free++; - else if (list_empty(&page->lru)) - active++; - else - lru++; - } - -uninitialized: mutex_unlock(&alloc->mutex); seq_printf(m, " pages: %d:%d:%d\n", active, lru, free); seq_printf(m, " pages high watermark: %zu\n", alloc->pages_high); From c0fd2101781ef761b636769b2f445351f71c3626 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Tue, 2 May 2023 20:12:18 +0000 Subject: [PATCH 023/271] Revert "android: binder: stop saving a pointer to the VMA" This reverts commit a43cfc87caaf46710c8027a8c23b8a55f1078f19. This patch fixed an issue reported by syzkaller in [1]. However, this turned out to be only a band-aid in binder. The root cause, as bisected by syzkaller, was fixed by commit 5789151e48ac ("mm/mmap: undo ->mmap() when mas_preallocate() fails"). We no longer need the patch for binder. Reverting such patch allows us to have a lockless access to alloc->vma in specific cases where the mmap_lock is not required. This approach avoids the contention that caused a performance regression. [1] https://lore.kernel.org/all/0000000000004a0dbe05e1d749e0@google.com [cmllamas: resolved conflicts with rework of alloc->mm and removal of binder_alloc_set_vma() also fixed comment section] Fixes: a43cfc87caaf ("android: binder: stop saving a pointer to the VMA") Cc: Liam Howlett Cc: Suren Baghdasaryan Cc: stable@vger.kernel.org Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20230502201220.1756319-2-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 17 +++++++++-------- drivers/android/binder_alloc.h | 4 ++-- drivers/android/binder_alloc_selftest.c | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 92c814ec44fe..eb082b33115b 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -213,7 +213,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, if (mm) { mmap_read_lock(mm); - vma = vma_lookup(mm, alloc->vma_addr); + vma = alloc->vma; } if (!vma && need_mm) { @@ -314,9 +314,11 @@ static inline struct vm_area_struct *binder_alloc_get_vma( { struct vm_area_struct *vma = NULL; - if (alloc->vma_addr) - vma = vma_lookup(alloc->mm, alloc->vma_addr); - + if (alloc->vma) { + /* Look at description in binder_alloc_set_vma */ + smp_rmb(); + vma = alloc->vma; + } return vma; } @@ -775,7 +777,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, buffer->free = 1; binder_insert_free_buffer(alloc, buffer); alloc->free_async_space = alloc->buffer_size / 2; - alloc->vma_addr = vma->vm_start; + alloc->vma = vma; return 0; @@ -805,8 +807,7 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) buffers = 0; mutex_lock(&alloc->mutex); - BUG_ON(alloc->vma_addr && - vma_lookup(alloc->mm, alloc->vma_addr)); + BUG_ON(alloc->vma); while ((n = rb_first(&alloc->allocated_buffers))) { buffer = rb_entry(n, struct binder_buffer, rb_node); @@ -958,7 +959,7 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc) */ void binder_alloc_vma_close(struct binder_alloc *alloc) { - alloc->vma_addr = 0; + alloc->vma = 0; } /** diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 0f811ac4bcff..138d1d5af9ce 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -75,7 +75,7 @@ struct binder_lru_page { /** * struct binder_alloc - per-binder proc state for binder allocator * @mutex: protects binder_alloc fields - * @vma_addr: vm_area_struct->vm_start passed to mmap_handler + * @vma: vm_area_struct passed to mmap_handler * (invariant after mmap) * @mm: copy of task->mm (invariant after open) * @buffer: base of per-proc address space mapped via mmap @@ -99,7 +99,7 @@ struct binder_lru_page { */ struct binder_alloc { struct mutex mutex; - unsigned long vma_addr; + struct vm_area_struct *vma; struct mm_struct *mm; void __user *buffer; struct list_head buffers; diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index 43a881073a42..c2b323bc3b3a 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -287,7 +287,7 @@ void binder_selftest_alloc(struct binder_alloc *alloc) if (!binder_selftest_run) return; mutex_lock(&binder_selftest_lock); - if (!binder_selftest_run || !alloc->vma_addr) + if (!binder_selftest_run || !alloc->vma) goto done; pr_info("STARTED\n"); binder_selftest_alloc_offset(alloc, end_offset, 0); From 0fa53349c3acba0239369ba4cd133740a408d246 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Tue, 2 May 2023 20:12:19 +0000 Subject: [PATCH 024/271] binder: add lockless binder_alloc_(set|get)_vma() Bring back the original lockless design in binder_alloc to determine whether the buffer setup has been completed by the ->mmap() handler. However, this time use smp_load_acquire() and smp_store_release() to wrap all the ordering in a single macro call. Also, add comments to make it evident that binder uses alloc->vma to determine when the binder_alloc has been fully initialized. In these scenarios acquiring the mmap_lock is not required. Fixes: a43cfc87caaf ("android: binder: stop saving a pointer to the VMA") Cc: Liam Howlett Cc: Suren Baghdasaryan Cc: stable@vger.kernel.org Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20230502201220.1756319-3-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index eb082b33115b..e7c9d466f8e8 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -309,17 +309,18 @@ err_no_vma: return vma ? -ENOMEM : -ESRCH; } +static inline void binder_alloc_set_vma(struct binder_alloc *alloc, + struct vm_area_struct *vma) +{ + /* pairs with smp_load_acquire in binder_alloc_get_vma() */ + smp_store_release(&alloc->vma, vma); +} + static inline struct vm_area_struct *binder_alloc_get_vma( struct binder_alloc *alloc) { - struct vm_area_struct *vma = NULL; - - if (alloc->vma) { - /* Look at description in binder_alloc_set_vma */ - smp_rmb(); - vma = alloc->vma; - } - return vma; + /* pairs with smp_store_release in binder_alloc_set_vma() */ + return smp_load_acquire(&alloc->vma); } static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid) @@ -382,6 +383,7 @@ static struct binder_buffer *binder_alloc_new_buf_locked( size_t size, data_offsets_size; int ret; + /* Check binder_alloc is fully initialized */ if (!binder_alloc_get_vma(alloc)) { binder_alloc_debug(BINDER_DEBUG_USER_ERROR, "%d: binder_alloc_buf, no vma\n", @@ -777,7 +779,9 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, buffer->free = 1; binder_insert_free_buffer(alloc, buffer); alloc->free_async_space = alloc->buffer_size / 2; - alloc->vma = vma; + + /* Signal binder_alloc is fully initialized */ + binder_alloc_set_vma(alloc, vma); return 0; @@ -959,7 +963,7 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc) */ void binder_alloc_vma_close(struct binder_alloc *alloc) { - alloc->vma = 0; + binder_alloc_set_vma(alloc, NULL); } /** From 2ac6c4a637132b79863426223242e680cc4f8e27 Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Sun, 23 Apr 2023 07:35:13 -0500 Subject: [PATCH 025/271] arm64: dts: imx8mn: Fix video clock parents There are a few clocks whose parents are set in mipi_dsi and mxsfb nodes, but these clocks are used by the disp_blk_ctrl power domain which may cause an issue when re-parenting, resuling in a disp_pixel clock having the wrong parent and wrong rate. Fix this by moving the assigned-clock-parents as associate clock assignments to the power-domain node to setup these clocks before they are enabled. Fixes: d825fb6455d5 ("arm64: dts: imx8mn: Add display pipeline components") Signed-off-by: Adam Ford Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mn.dtsi | 28 ++++++++++++----------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mn.dtsi b/arch/arm64/boot/dts/freescale/imx8mn.dtsi index bd84db550053..8be8f090e8b8 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mn.dtsi @@ -1069,13 +1069,6 @@ <&clk IMX8MN_CLK_DISP_APB_ROOT>, <&clk IMX8MN_CLK_DISP_AXI_ROOT>; clock-names = "pix", "axi", "disp_axi"; - assigned-clocks = <&clk IMX8MN_CLK_DISP_PIXEL_ROOT>, - <&clk IMX8MN_CLK_DISP_AXI>, - <&clk IMX8MN_CLK_DISP_APB>; - assigned-clock-parents = <&clk IMX8MN_CLK_DISP_PIXEL>, - <&clk IMX8MN_SYS_PLL2_1000M>, - <&clk IMX8MN_SYS_PLL1_800M>; - assigned-clock-rates = <594000000>, <500000000>, <200000000>; interrupts = ; power-domains = <&disp_blk_ctrl IMX8MN_DISPBLK_PD_LCDIF>; status = "disabled"; @@ -1093,12 +1086,6 @@ clocks = <&clk IMX8MN_CLK_DSI_CORE>, <&clk IMX8MN_CLK_DSI_PHY_REF>; clock-names = "bus_clk", "sclk_mipi"; - assigned-clocks = <&clk IMX8MN_CLK_DSI_CORE>, - <&clk IMX8MN_CLK_DSI_PHY_REF>; - assigned-clock-parents = <&clk IMX8MN_SYS_PLL1_266M>, - <&clk IMX8MN_CLK_24M>; - assigned-clock-rates = <266000000>, <24000000>; - samsung,pll-clock-frequency = <24000000>; interrupts = ; power-domains = <&disp_blk_ctrl IMX8MN_DISPBLK_PD_MIPI_DSI>; status = "disabled"; @@ -1142,6 +1129,21 @@ "lcdif-axi", "lcdif-apb", "lcdif-pix", "dsi-pclk", "dsi-ref", "csi-aclk", "csi-pclk"; + assigned-clocks = <&clk IMX8MN_CLK_DSI_CORE>, + <&clk IMX8MN_CLK_DSI_PHY_REF>, + <&clk IMX8MN_CLK_DISP_PIXEL>, + <&clk IMX8MN_CLK_DISP_AXI>, + <&clk IMX8MN_CLK_DISP_APB>; + assigned-clock-parents = <&clk IMX8MN_SYS_PLL1_266M>, + <&clk IMX8MN_CLK_24M>, + <&clk IMX8MN_VIDEO_PLL1_OUT>, + <&clk IMX8MN_SYS_PLL2_1000M>, + <&clk IMX8MN_SYS_PLL1_800M>; + assigned-clock-rates = <266000000>, + <24000000>, + <594000000>, + <500000000>, + <200000000>; #power-domain-cells = <1>; }; From f161cea5a20f3aeeb637a88ad1705fc2720b4d58 Mon Sep 17 00:00:00 2001 From: Hugo Villeneuve Date: Mon, 1 May 2023 13:05:32 -0400 Subject: [PATCH 026/271] arm64: dts: imx8mn-var-som: fix PHY detection bug by adding deassert delay While testing the ethernet interface on a Variscite symphony carrier board using an imx8mn SOM with an onboard ADIN1300 PHY (EC hardware configuration), the ethernet PHY is not detected. The ADIN1300 datasheet indicate that the "Management interface active (t4)" state is reached at most 5ms after the reset signal is deasserted. The device tree in Variscite custom git repository uses the following property: phy-reset-post-delay = <20>; Add a new MDIO property 'reset-deassert-us' of 20ms to have the same delay inside the ethphy node. Adding this property fixes the problem with the PHY detection. Note that this SOM can also have an Atheros AR8033 PHY. In this case, a 1ms deassert delay is sufficient. Add a comment to that effect. Fixes: ade0176dd8a0 ("arm64: dts: imx8mn-var-som: Add Variscite VAR-SOM-MX8MN System on Module") Signed-off-by: Hugo Villeneuve Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi index 67072e6c77d5..cbd9d124c80d 100644 --- a/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mn-var-som.dtsi @@ -98,11 +98,17 @@ #address-cells = <1>; #size-cells = <0>; - ethphy: ethernet-phy@4 { + ethphy: ethernet-phy@4 { /* AR8033 or ADIN1300 */ compatible = "ethernet-phy-ieee802.3-c22"; reg = <4>; reset-gpios = <&gpio1 9 GPIO_ACTIVE_LOW>; reset-assert-us = <10000>; + /* + * Deassert delay: + * ADIN1300 requires 5ms. + * AR8033 requires 1ms. + */ + reset-deassert-us = <20000>; }; }; }; From 892943d7729bbfb2edeed9e323eba9a5cec21c49 Mon Sep 17 00:00:00 2001 From: Christoph Niedermaier Date: Tue, 2 May 2023 13:14:24 +0200 Subject: [PATCH 027/271] ARM: dts: imx6ull-dhcor: Set and limit the mode for PMIC buck 1, 2 and 3 According to Renesas Electronics (formerly Dialog Semiconductor), the standard AUTO mode of the PMIC DA9061 can lead to stability problems depending on the hardware revision. It is recommended to set a defined mode such as PFM or PWM permanently. So set and limit the mode for buck 1, 2 and 3 to a fixed one. Fixes: 611b6c891e40 ("ARM: dts: imx6ull-dhcom: Add DH electronics DHCOM i.MX6ULL SoM and PDK2 board") Signed-off-by: Christoph Niedermaier Reviewed-by: Marek Vasut Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx6ull-dhcor-som.dtsi | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm/boot/dts/imx6ull-dhcor-som.dtsi b/arch/arm/boot/dts/imx6ull-dhcor-som.dtsi index 5882c7565f64..32a6022625d9 100644 --- a/arch/arm/boot/dts/imx6ull-dhcor-som.dtsi +++ b/arch/arm/boot/dts/imx6ull-dhcor-som.dtsi @@ -8,6 +8,7 @@ #include #include #include +#include #include "imx6ull.dtsi" / { @@ -84,16 +85,20 @@ regulators { vdd_soc_in_1v4: buck1 { + regulator-allowed-modes = ; /* PFM */ regulator-always-on; regulator-boot-on; + regulator-initial-mode = ; regulator-max-microvolt = <1400000>; regulator-min-microvolt = <1400000>; regulator-name = "vdd_soc_in_1v4"; }; vcc_3v3: buck2 { + regulator-allowed-modes = ; /* PWM */ regulator-always-on; regulator-boot-on; + regulator-initial-mode = ; regulator-max-microvolt = <3300000>; regulator-min-microvolt = <3300000>; regulator-name = "vcc_3v3"; @@ -106,8 +111,10 @@ * the voltage is set to 1.5V. */ vcc_ddr_1v35: buck3 { + regulator-allowed-modes = ; /* PWM */ regulator-always-on; regulator-boot-on; + regulator-initial-mode = ; regulator-max-microvolt = <1500000>; regulator-min-microvolt = <1500000>; regulator-name = "vcc_ddr_1v35"; From 91aa4b3782448a7a13baa8cbcdfd5fd19defcbd9 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Wed, 3 May 2023 13:31:10 +0200 Subject: [PATCH 028/271] ARM: dts: imx6qdl-mba6: Add missing pvcie-supply regulator This worked before by coincidence, as the regulator was probed and enabled before PCI RC probe. But probe order changed since commit 259b93b21a9f ("regulator: Set PROBE_PREFER_ASYNCHRONOUS for drivers that existed in 4.14") and PCIe supply is enabled after RC. Fix this by adding the regulator to RC node. The PCIe vaux regulator still needs to be enabled unconditionally for Mini-PCIe USB-only devices. Fixes: ef3846247b41 ("ARM: dts: imx6qdl: add TQ-Systems MBa6x device trees") Signed-off-by: Alexander Stein Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx6qdl-mba6.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/imx6qdl-mba6.dtsi b/arch/arm/boot/dts/imx6qdl-mba6.dtsi index 78555a618851..7b7e6c2ad190 100644 --- a/arch/arm/boot/dts/imx6qdl-mba6.dtsi +++ b/arch/arm/boot/dts/imx6qdl-mba6.dtsi @@ -209,6 +209,7 @@ pinctrl-names = "default"; pinctrl-0 = <&pinctrl_pcie>; reset-gpio = <&gpio6 7 GPIO_ACTIVE_LOW>; + vpcie-supply = <®_pcie>; status = "okay"; }; From 07bb2e368820a4de9b4b586691e143976b74ea44 Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Thu, 11 May 2023 20:04:23 -0500 Subject: [PATCH 029/271] arm64: dts: imx8mp: Fix video clock parents There are a few clocks whose parents are set in mipi_dsi and lcdif nodes, but these clocks are used by the media_blk_ctrl power domain. This may cause an issue when re-parenting, because the media_blk_ctrl may start the clocks before the reparent is done resulting in a disp_pixel clock having the wrong parent and rate. Fix this by moving the assigned-clock-parents and rates to the media_blk_ctrl node to configure these clocks before they are enabled. After this patch, both disp1_pix_root and dixp2_pix_root clock become children of the video_pll1. video_pll1_ref_sel 24000000 video_pll1 1039500000 video_pll1_bypass 1039500000 video_pll1_out 1039500000 media_disp2_pix 1039500000 media_disp2_pix_root_clk 1039500000 media_disp1_pix 1039500000 media_disp1_pix_root_clk 1039500000 Fixes: eda09fe149df ("arm64: dts: imx8mp: Add display pipeline components") Signed-off-by: Adam Ford Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp.dtsi | 25 ++++++++--------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi index f81391993354..428c60462e3d 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi @@ -1211,13 +1211,6 @@ <&clk IMX8MP_CLK_MEDIA_APB_ROOT>, <&clk IMX8MP_CLK_MEDIA_AXI_ROOT>; clock-names = "pix", "axi", "disp_axi"; - assigned-clocks = <&clk IMX8MP_CLK_MEDIA_DISP1_PIX_ROOT>, - <&clk IMX8MP_CLK_MEDIA_AXI>, - <&clk IMX8MP_CLK_MEDIA_APB>; - assigned-clock-parents = <&clk IMX8MP_CLK_MEDIA_DISP1_PIX>, - <&clk IMX8MP_SYS_PLL2_1000M>, - <&clk IMX8MP_SYS_PLL1_800M>; - assigned-clock-rates = <594000000>, <500000000>, <200000000>; interrupts = ; power-domains = <&media_blk_ctrl IMX8MP_MEDIABLK_PD_LCDIF_1>; status = "disabled"; @@ -1237,11 +1230,6 @@ <&clk IMX8MP_CLK_MEDIA_APB_ROOT>, <&clk IMX8MP_CLK_MEDIA_AXI_ROOT>; clock-names = "pix", "axi", "disp_axi"; - assigned-clocks = <&clk IMX8MP_CLK_MEDIA_DISP2_PIX>, - <&clk IMX8MP_VIDEO_PLL1>; - assigned-clock-parents = <&clk IMX8MP_VIDEO_PLL1_OUT>, - <&clk IMX8MP_VIDEO_PLL1_REF_SEL>; - assigned-clock-rates = <0>, <1039500000>; power-domains = <&media_blk_ctrl IMX8MP_MEDIABLK_PD_LCDIF_2>; status = "disabled"; @@ -1296,11 +1284,16 @@ "disp1", "disp2", "isp", "phy"; assigned-clocks = <&clk IMX8MP_CLK_MEDIA_AXI>, - <&clk IMX8MP_CLK_MEDIA_APB>; + <&clk IMX8MP_CLK_MEDIA_APB>, + <&clk IMX8MP_CLK_MEDIA_DISP1_PIX>, + <&clk IMX8MP_CLK_MEDIA_DISP2_PIX>, + <&clk IMX8MP_VIDEO_PLL1>; assigned-clock-parents = <&clk IMX8MP_SYS_PLL2_1000M>, - <&clk IMX8MP_SYS_PLL1_800M>; - assigned-clock-rates = <500000000>, <200000000>; - + <&clk IMX8MP_SYS_PLL1_800M>, + <&clk IMX8MP_VIDEO_PLL1_OUT>, + <&clk IMX8MP_VIDEO_PLL1_OUT>; + assigned-clock-rates = <500000000>, <200000000>, + <0>, <0>, <1039500000>; #power-domain-cells = <1>; lvds_bridge: bridge@5c { From a346d4dc74d94ac3c197283d0a7ec673a59a0674 Mon Sep 17 00:00:00 2001 From: Emanuele Ghidoli Date: Sun, 14 May 2023 11:22:43 +0200 Subject: [PATCH 030/271] arm64: dts: colibri-imx8x: fix eval board pin configuration Fix pinctrl groups to have SODIMM 75 only in one group. Remove configuration of the pin at SoM level because it is normally used as CSI_MCLK at camera interface connector. Without this fix it is not possible, without redefining iomuxc pinctrl, to use CSI_MCLK signal and leads to the following error messages: imx8qxp-pinctrl scu:pinctrl: pin IMX8QXP_CSI_MCLK already requested imx8qxp-pinctrl scu:pinctrl: pin-147 (16-003c) status -22 Fixes: 4d2adf738169 ("arm64: dts: colibri-imx8x: Split pinctrl_hog1") Signed-off-by: Emanuele Ghidoli Signed-off-by: Andrejs Cainikovs Signed-off-by: Francesco Dolcini Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi b/arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi index 7cad79102e1a..6f88c11f16e1 100644 --- a/arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi @@ -365,7 +365,7 @@ &iomuxc { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_ext_io0>, <&pinctrl_hog0>, <&pinctrl_hog1>, - <&pinctrl_hog2>, <&pinctrl_lpspi2_cs2>; + <&pinctrl_lpspi2_cs2>; /* On-module touch pen-down interrupt */ pinctrl_ad7879_int: ad7879intgrp { @@ -499,8 +499,7 @@ }; pinctrl_hog1: hog1grp { - fsl,pins = , /* SODIMM 75 */ - ; /* SODIMM 93 */ + fsl,pins = ; /* SODIMM 93 */ }; pinctrl_hog2: hog2grp { From 25acffb00899d3ac85a9701c99f7929f97a5d9af Mon Sep 17 00:00:00 2001 From: Emanuele Ghidoli Date: Sun, 14 May 2023 11:22:44 +0200 Subject: [PATCH 031/271] arm64: dts: colibri-imx8x: move pinctrl property from SoM to eval board Each carrier board device tree except the eval board one already override iomuxc pinctrl property to configure unused pins as gpio. So move also the pinctrl property to eval board device tree. Leave the pin group definition in imx8x-colibri.dtsi to avoid duplication and simplify configuration of gpio. Signed-off-by: Emanuele Ghidoli Signed-off-by: Andrejs Cainikovs Signed-off-by: Francesco Dolcini Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8x-colibri-eval-v3.dtsi | 6 ++++++ arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8x-colibri-eval-v3.dtsi b/arch/arm64/boot/dts/freescale/imx8x-colibri-eval-v3.dtsi index 7264d784ae72..9af769ab8ceb 100644 --- a/arch/arm64/boot/dts/freescale/imx8x-colibri-eval-v3.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8x-colibri-eval-v3.dtsi @@ -33,6 +33,12 @@ }; }; +&iomuxc { + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_ext_io0>, <&pinctrl_hog0>, <&pinctrl_hog1>, + <&pinctrl_lpspi2_cs2>; +}; + /* Colibri SPI */ &lpspi2 { status = "okay"; diff --git a/arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi b/arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi index 6f88c11f16e1..b0d6f632622c 100644 --- a/arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi @@ -363,10 +363,6 @@ /* TODO VPU Encoder/Decoder */ &iomuxc { - pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_ext_io0>, <&pinctrl_hog0>, <&pinctrl_hog1>, - <&pinctrl_lpspi2_cs2>; - /* On-module touch pen-down interrupt */ pinctrl_ad7879_int: ad7879intgrp { fsl,pins = ; From 34e5c0cd55af7258d819cb203da7d202bae3b97d Mon Sep 17 00:00:00 2001 From: Emanuele Ghidoli Date: Sun, 14 May 2023 11:22:45 +0200 Subject: [PATCH 032/271] arm64: dts: colibri-imx8x: fix iris pinctrl configuration Remove GPIO3_IO10 from Iris carrier board pinctrl configuration, this is already defined in the SOM dtsi since this is a standard SOM functionality (wake-up button). Duplicating it leads to the following error message imx8qxp-pinctrl scu:pinctrl: pin IMX8QXP_QSPI0A_DATA1 already requested Fixes: aefb5e2d974d ("arm64: dts: colibri-imx8x: Add iris carrier board") Signed-off-by: Emanuele Ghidoli Signed-off-by: Andrejs Cainikovs Signed-off-by: Francesco Dolcini Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8x-colibri-iris.dtsi | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8x-colibri-iris.dtsi b/arch/arm64/boot/dts/freescale/imx8x-colibri-iris.dtsi index 5f30c88855e7..f8953067bc3b 100644 --- a/arch/arm64/boot/dts/freescale/imx8x-colibri-iris.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8x-colibri-iris.dtsi @@ -48,8 +48,7 @@ , /* SODIMM 101 */ , /* SODIMM 97 */ , /* SODIMM 85 */ - , /* SODIMM 79 */ - ; /* SODIMM 45 */ + ; /* SODIMM 79 */ }; pinctrl_uart1_forceoff: uart1forceoffgrp { From b8b23fbe93b34e595a7b0f31b4ed22400c47ec07 Mon Sep 17 00:00:00 2001 From: Andrejs Cainikovs Date: Sun, 14 May 2023 11:22:46 +0200 Subject: [PATCH 033/271] arm64: dts: colibri-imx8x: delete adc1 and dsp i.MX8, i.MX8X, i.MX8XP and i.MX8XL SOC device trees are all based on imx8-ss-*.dtsi files. For i.MX8X and i.MX8XP these device trees should be updated with some peripherals removed or updated, similar to i.MX8XL (imx8dxl-ss-*.dtsi files). However, it looks like only i.MX8 and i.MX8XL are up to date, but for i.MX8X and i.MX8XP some of the peripherals got inherited from imx8-ss-*.dtsi files, but in reality they are not present on SOC. As a result, during resource partition ownership check U-Boot receives messages from SCU firmware about these resources not owned by boot partition. In reality, these resources are not owned by anyone, as they simply does not exist, but are defined in Linux device tree. This change removes those peripherals, which are listed during U-Boot resource partition ownership check as warnings: ## Flattened Device Tree blob at 9d400000 Booting using the fdt blob at 0x9d400000 Loading Device Tree to 00000000fd652000, end 00000000fd67efff ... OK Disable clock-controller@59580000 rsrc 512 not owned Disable clock-controller@5ac90000 rsrc 102 not owned Starting kernel ... Fixes: ba5a5615d54f ("arm64: dts: freescale: add initial support for colibri imx8x") Signed-off-by: Andrejs Cainikovs Signed-off-by: Francesco Dolcini Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi b/arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi index b0d6f632622c..49d105eb4769 100644 --- a/arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8x-colibri.dtsi @@ -769,3 +769,10 @@ fsl,pins = ; }; }; + +/* Delete peripherals which are not present on SOC, but are defined in imx8-ss-*.dtsi */ + +/delete-node/ &adc1; +/delete-node/ &adc1_lpcg; +/delete-node/ &dsp; +/delete-node/ &dsp_lpcg; From 140809738d969376f26f13264b16669703956f6c Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 15 May 2023 11:29:43 +0200 Subject: [PATCH 034/271] drm/pl111: Fix FB depth on IMPD-1 framebuffer The last argument to the function drm_fbdev_dma_setup() was changed from desired BPP to desired depth. In our case the desired depth was 15 but BPP was 16, so we specified 16 as BPP and we relied on the FB emulation core to select a format with a suitable depth for the limited bandwidth and end up with e.g. XRGB1555 like in the past: [drm] Initialized pl111 1.0.0 20170317 for c1000000.display on minor 0 drm-clcd-pl111 c1000000.display: [drm] requested bpp 16, scaled depth down to 15 drm-clcd-pl111 c1000000.display: enable IM-PD1 CLCD connectors Console: switching to colour frame buffer device 80x30 drm-clcd-pl111 c1000000.display: [drm] fb0: pl111drmfb frame buffer device However the current code will fail at that: [drm] Initialized pl111 1.0.0 20170317 for c1000000.display on minor 0 drm-clcd-pl111 c1000000.display: [drm] bpp/depth value of 16/16 not supported drm-clcd-pl111 c1000000.display: [drm] No compatible format found drm-clcd-pl111 c1000000.display: [drm] *ERROR* fbdev: Failed to setup generic emulation (ret=-12) Fix this by passing the desired depth of 15 for the IM/PD-1 display instead of 16 to drm_fbdev_dma_setup(). The desired depth is however in turn used for bandwidth limiting calculations and that was done with a simple / integer division, whereas we now have to modify that to use DIV_ROUND_UP() so that we get DIV_ROUND_UP(15, 2) = 2 not 15/2 = 1. After this the display works again on the Integrator/AP IM/PD-1. Cc: Emma Anholt Cc: stable@vger.kernel.org Suggested-by: Thomas Zimmermann Fixes: 37c90d589dc0 ("drm/fb-helper: Fix single-probe color-format selection") Link: https://lore.kernel.org/dri-devel/20230102112927.26565-1-tzimmermann@suse.de/ Reviewed-by: Thomas Zimmermann Signed-off-by: Linus Walleij Link: https://patchwork.freedesktop.org/patch/msgid/20230515092943.1401558-1-linus.walleij@linaro.org --- drivers/gpu/drm/pl111/pl111_display.c | 2 +- drivers/gpu/drm/pl111/pl111_drm.h | 4 ++-- drivers/gpu/drm/pl111/pl111_drv.c | 8 ++++---- drivers/gpu/drm/pl111/pl111_versatile.c | 10 +++++----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/pl111/pl111_display.c b/drivers/gpu/drm/pl111/pl111_display.c index 6afdf260a4e2..b9fe926a49e8 100644 --- a/drivers/gpu/drm/pl111/pl111_display.c +++ b/drivers/gpu/drm/pl111/pl111_display.c @@ -53,7 +53,7 @@ pl111_mode_valid(struct drm_simple_display_pipe *pipe, { struct drm_device *drm = pipe->crtc.dev; struct pl111_drm_dev_private *priv = drm->dev_private; - u32 cpp = priv->variant->fb_bpp / 8; + u32 cpp = DIV_ROUND_UP(priv->variant->fb_depth, 8); u64 bw; /* diff --git a/drivers/gpu/drm/pl111/pl111_drm.h b/drivers/gpu/drm/pl111/pl111_drm.h index 2a46b5bd8576..d1fe756444ee 100644 --- a/drivers/gpu/drm/pl111/pl111_drm.h +++ b/drivers/gpu/drm/pl111/pl111_drm.h @@ -114,7 +114,7 @@ struct drm_minor; * extensions to the control register * @formats: array of supported pixel formats on this variant * @nformats: the length of the array of supported pixel formats - * @fb_bpp: desired bits per pixel on the default framebuffer + * @fb_depth: desired depth per pixel on the default framebuffer */ struct pl111_variant_data { const char *name; @@ -126,7 +126,7 @@ struct pl111_variant_data { bool st_bitmux_control; const u32 *formats; unsigned int nformats; - unsigned int fb_bpp; + unsigned int fb_depth; }; struct pl111_drm_dev_private { diff --git a/drivers/gpu/drm/pl111/pl111_drv.c b/drivers/gpu/drm/pl111/pl111_drv.c index 4b2a9e9753f6..43049c8028b2 100644 --- a/drivers/gpu/drm/pl111/pl111_drv.c +++ b/drivers/gpu/drm/pl111/pl111_drv.c @@ -308,7 +308,7 @@ static int pl111_amba_probe(struct amba_device *amba_dev, if (ret < 0) goto dev_put; - drm_fbdev_dma_setup(drm, priv->variant->fb_bpp); + drm_fbdev_dma_setup(drm, priv->variant->fb_depth); return 0; @@ -351,7 +351,7 @@ static const struct pl111_variant_data pl110_variant = { .is_pl110 = true, .formats = pl110_pixel_formats, .nformats = ARRAY_SIZE(pl110_pixel_formats), - .fb_bpp = 16, + .fb_depth = 16, }; /* RealView, Versatile Express etc use this modern variant */ @@ -376,7 +376,7 @@ static const struct pl111_variant_data pl111_variant = { .name = "PL111", .formats = pl111_pixel_formats, .nformats = ARRAY_SIZE(pl111_pixel_formats), - .fb_bpp = 32, + .fb_depth = 32, }; static const u32 pl110_nomadik_pixel_formats[] = { @@ -405,7 +405,7 @@ static const struct pl111_variant_data pl110_nomadik_variant = { .is_lcdc = true, .st_bitmux_control = true, .broken_vblank = true, - .fb_bpp = 16, + .fb_depth = 16, }; static const struct amba_id pl111_id_table[] = { diff --git a/drivers/gpu/drm/pl111/pl111_versatile.c b/drivers/gpu/drm/pl111/pl111_versatile.c index 1b436b75fd39..00c3ebd32359 100644 --- a/drivers/gpu/drm/pl111/pl111_versatile.c +++ b/drivers/gpu/drm/pl111/pl111_versatile.c @@ -316,7 +316,7 @@ static const struct pl111_variant_data pl110_integrator = { .broken_vblank = true, .formats = pl110_integrator_pixel_formats, .nformats = ARRAY_SIZE(pl110_integrator_pixel_formats), - .fb_bpp = 16, + .fb_depth = 16, }; /* @@ -330,7 +330,7 @@ static const struct pl111_variant_data pl110_impd1 = { .broken_vblank = true, .formats = pl110_integrator_pixel_formats, .nformats = ARRAY_SIZE(pl110_integrator_pixel_formats), - .fb_bpp = 16, + .fb_depth = 15, }; /* @@ -343,7 +343,7 @@ static const struct pl111_variant_data pl110_versatile = { .external_bgr = true, .formats = pl110_versatile_pixel_formats, .nformats = ARRAY_SIZE(pl110_versatile_pixel_formats), - .fb_bpp = 16, + .fb_depth = 16, }; /* @@ -355,7 +355,7 @@ static const struct pl111_variant_data pl111_realview = { .name = "PL111 RealView", .formats = pl111_realview_pixel_formats, .nformats = ARRAY_SIZE(pl111_realview_pixel_formats), - .fb_bpp = 16, + .fb_depth = 16, }; /* @@ -367,7 +367,7 @@ static const struct pl111_variant_data pl111_vexpress = { .name = "PL111 Versatile Express", .formats = pl111_realview_pixel_formats, .nformats = ARRAY_SIZE(pl111_realview_pixel_formats), - .fb_bpp = 16, + .fb_depth = 16, .broken_clockdivider = true, }; From 43cd3ddbff3c1635d0e09fe5b09af48d39dbb9d7 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 15 May 2023 13:13:50 -0700 Subject: [PATCH 035/271] dt-bindings: interrupt-controller: arm,gic-v3: Add quirk for Mediatek SoCs w/ broken FW When trying to turn on the "pseudo NMI" kernel feature in Linux, it was discovered that all Mediatek-based Chromebooks that ever shipped (at least ones with GICv3) had a firmware bug where they wouldn't save certain GIC "GICR" registers properly. If a processor ever entered a suspend/idle mode where the GICR registers lost state then they'd be reset to their default state. As a result of the bug, if you try to enable "pseudo NMIs" on the affected devices then certain interrupts will unexpectedly get promoted to be "pseudo NMIs" and cause crashes / freezes / general mayhem. ChromeOS is looking to start turning on "pseudo NMIs" in production to make crash reports more actionable. To do so, we will release firmware updates for at least some of the affected Mediatek Chromebooks. However, even when we update the firmware of a Chromebook it's always possible that a user will end up booting with old firmware. We need to be able to detect when we're running with firmware that will crash and burn if pseudo NMIs are enabled. The current plan is: * Update the device trees of all affected Chromebooks to include the 'mediatek,broken-save-restore-fw' property. The kernel can use this to know not to enable certain features like "pseudo NMI". NOTE: device trees for Chromebooks are never baked into the firmware but are bundled with the kernel. A kernel will never be configured to use "pseudo NMIs" and be bundled with an old device tree. * When we get a fixed firmware for one of these Chromebooks, it will patch the device tree to remove this property. For some details, you can also see the public bug Reviewed-by: Julius Werner Signed-off-by: Douglas Anderson Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230515131353.v2.1.Iabe67a827e206496efec6beb5616d5a3b99c1e65@changeid --- .../bindings/interrupt-controller/arm,gic-v3.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml index 92117261e1e1..39e64c7f6360 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml @@ -166,6 +166,12 @@ properties: resets: maxItems: 1 + mediatek,broken-save-restore-fw: + type: boolean + description: + Asserts that the firmware on this device has issues saving and restoring + GICR registers when the GIC redistributors are powered off. + dependencies: mbi-ranges: [ msi-controller ] msi-controller: [ mbi-ranges ] From 44bd78dd2b8897f59b7e3963f088caadb7e4f047 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 15 May 2023 13:13:51 -0700 Subject: [PATCH 036/271] irqchip/gic-v3: Disable pseudo NMIs on Mediatek devices w/ firmware issues Some Chromebooks with Mediatek SoCs have a problem where the firmware doesn't properly save/restore certain GICR registers. Newer Chromebooks should fix this issue and we may be able to do firmware updates for old Chromebooks. At the moment, the only known issue with these Chromebooks is that we can't enable "pseudo NMIs" since the priority register can be lost. Enabling "pseudo NMIs" on Chromebooks with the problematic firmware causes crashes and freezes. Let's detect devices with this problem and then disable "pseudo NMIs" on them. We'll detect the problem by looking for the presence of the "mediatek,broken-save-restore-fw" property in the GIC device tree node. Any devices with fixed firmware will not have this property. Our detection plan works because we never bake a Chromebook's device tree into firmware. Instead, device trees are always bundled with the kernel. We'll update the device trees of all affected Chromebooks and then we'll never enable "pseudo NMI" on a kernel that is bundled with old device trees. When a firmware update is shipped that fixes this issue it will know to patch the device tree to remove the property. In order to make this work, the quick detection mechanism of the GICv3 code is extended to be able to look for properties in addition to looking at "compatible". Reviewed-by: Julius Werner Signed-off-by: Douglas Anderson Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230515131353.v2.2.I88dc0a0eb1d9d537de61604cd8994ecc55c0cac1@changeid --- drivers/irqchip/irq-gic-common.c | 8 ++++++-- drivers/irqchip/irq-gic-common.h | 1 + drivers/irqchip/irq-gic-v3.c | 20 ++++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-gic-common.c b/drivers/irqchip/irq-gic-common.c index a610821c8ff2..de47b51cdadb 100644 --- a/drivers/irqchip/irq-gic-common.c +++ b/drivers/irqchip/irq-gic-common.c @@ -16,7 +16,11 @@ void gic_enable_of_quirks(const struct device_node *np, const struct gic_quirk *quirks, void *data) { for (; quirks->desc; quirks++) { - if (!of_device_is_compatible(np, quirks->compatible)) + if (quirks->compatible && + !of_device_is_compatible(np, quirks->compatible)) + continue; + if (quirks->property && + !of_property_read_bool(np, quirks->property)) continue; if (quirks->init(data)) pr_info("GIC: enabling workaround for %s\n", @@ -28,7 +32,7 @@ void gic_enable_quirks(u32 iidr, const struct gic_quirk *quirks, void *data) { for (; quirks->desc; quirks++) { - if (quirks->compatible) + if (quirks->compatible || quirks->property) continue; if (quirks->iidr != (quirks->mask & iidr)) continue; diff --git a/drivers/irqchip/irq-gic-common.h b/drivers/irqchip/irq-gic-common.h index 27e3d4ed4f32..3db4592cda1c 100644 --- a/drivers/irqchip/irq-gic-common.h +++ b/drivers/irqchip/irq-gic-common.h @@ -13,6 +13,7 @@ struct gic_quirk { const char *desc; const char *compatible; + const char *property; bool (*init)(void *data); u32 iidr; u32 mask; diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 6fcee221f201..a605aa79435a 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -39,6 +39,7 @@ #define FLAGS_WORKAROUND_GICR_WAKER_MSM8996 (1ULL << 0) #define FLAGS_WORKAROUND_CAVIUM_ERRATUM_38539 (1ULL << 1) +#define FLAGS_WORKAROUND_MTK_GICR_SAVE (1ULL << 2) #define GIC_IRQ_TYPE_PARTITION (GIC_IRQ_TYPE_LPI + 1) @@ -1720,6 +1721,15 @@ static bool gic_enable_quirk_msm8996(void *data) return true; } +static bool gic_enable_quirk_mtk_gicr(void *data) +{ + struct gic_chip_data *d = data; + + d->flags |= FLAGS_WORKAROUND_MTK_GICR_SAVE; + + return true; +} + static bool gic_enable_quirk_cavium_38539(void *data) { struct gic_chip_data *d = data; @@ -1792,6 +1802,11 @@ static const struct gic_quirk gic_quirks[] = { .compatible = "qcom,msm8996-gic-v3", .init = gic_enable_quirk_msm8996, }, + { + .desc = "GICv3: Mediatek Chromebook GICR save problem", + .property = "mediatek,broken-save-restore-fw", + .init = gic_enable_quirk_mtk_gicr, + }, { .desc = "GICv3: HIP06 erratum 161010803", .iidr = 0x0204043b, @@ -1834,6 +1849,11 @@ static void gic_enable_nmi_support(void) if (!gic_prio_masking_enabled()) return; + if (gic_data.flags & FLAGS_WORKAROUND_MTK_GICR_SAVE) { + pr_warn("Skipping NMI enable due to firmware issues\n"); + return; + } + ppi_nmi_refs = kcalloc(gic_data.ppi_nr, sizeof(*ppi_nmi_refs), GFP_KERNEL); if (!ppi_nmi_refs) return; From 2c6c9c049510163090b979ea5f92a68ae8d93c45 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Mon, 24 Apr 2023 11:31:55 +0100 Subject: [PATCH 037/271] irqchip/mips-gic: Don't touch vl_map if a local interrupt is not routable When a GIC local interrupt is not routable, it's vl_map will be used to control some internal states for core (providing IPTI, IPPCI, IPFDC input signal for core). Overriding it will interfere core's intetrupt controller. Do not touch vl_map if a local interrupt is not routable, we are not going to remap it. Before dd098a0e0319 (" irqchip/mips-gic: Get rid of the reliance on irq_cpu_online()"), if a local interrupt is not routable, then it won't be requested from GIC Local domain, and thus gic_all_vpes_irq_cpu_online won't be called for that particular interrupt. Fixes: dd098a0e0319 (" irqchip/mips-gic: Get rid of the reliance on irq_cpu_online()") Cc: stable@vger.kernel.org Signed-off-by: Jiaxun Yang Reviewed-by: Serge Semin Tested-by: Serge Semin Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230424103156.66753-2-jiaxun.yang@flygoat.com --- drivers/irqchip/irq-mips-gic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 046c355e120b..b568d55ef7c5 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -399,6 +399,8 @@ static void gic_all_vpes_irq_cpu_online(void) unsigned int intr = local_intrs[i]; struct gic_all_vpes_chip_data *cd; + if (!gic_local_irq_is_routable(intr)) + continue; cd = &gic_all_vpes_chip_data[intr]; write_gic_vl_map(mips_gic_vx_map_reg(intr), cd->map); if (cd->mask) From 3d6a0e4197c04599d75d85a608c8bb16a630a38c Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Mon, 24 Apr 2023 11:31:56 +0100 Subject: [PATCH 038/271] irqchip/mips-gic: Use raw spinlock for gic_lock Since we may hold gic_lock in hardirq context, use raw spinlock makes more sense given that it is for low-level interrupt handling routine and the critical section is small. Fixes BUG: [ 0.426106] ============================= [ 0.426257] [ BUG: Invalid wait context ] [ 0.426422] 6.3.0-rc7-next-20230421-dirty #54 Not tainted [ 0.426638] ----------------------------- [ 0.426766] swapper/0/1 is trying to lock: [ 0.426954] ffffffff8104e7b8 (gic_lock){....}-{3:3}, at: gic_set_type+0x30/08 Fixes: 95150ae8b330 ("irqchip: mips-gic: Implement irq_set_type callback") Cc: stable@vger.kernel.org Signed-off-by: Jiaxun Yang Reviewed-by: Serge Semin Tested-by: Serge Semin Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230424103156.66753-3-jiaxun.yang@flygoat.com --- drivers/irqchip/irq-mips-gic.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index b568d55ef7c5..6d5ecc10a870 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -50,7 +50,7 @@ void __iomem *mips_gic_base; static DEFINE_PER_CPU_READ_MOSTLY(unsigned long[GIC_MAX_LONGS], pcpu_masks); -static DEFINE_SPINLOCK(gic_lock); +static DEFINE_RAW_SPINLOCK(gic_lock); static struct irq_domain *gic_irq_domain; static int gic_shared_intrs; static unsigned int gic_cpu_pin; @@ -210,7 +210,7 @@ static int gic_set_type(struct irq_data *d, unsigned int type) irq = GIC_HWIRQ_TO_SHARED(d->hwirq); - spin_lock_irqsave(&gic_lock, flags); + raw_spin_lock_irqsave(&gic_lock, flags); switch (type & IRQ_TYPE_SENSE_MASK) { case IRQ_TYPE_EDGE_FALLING: pol = GIC_POL_FALLING_EDGE; @@ -250,7 +250,7 @@ static int gic_set_type(struct irq_data *d, unsigned int type) else irq_set_chip_handler_name_locked(d, &gic_level_irq_controller, handle_level_irq, NULL); - spin_unlock_irqrestore(&gic_lock, flags); + raw_spin_unlock_irqrestore(&gic_lock, flags); return 0; } @@ -268,7 +268,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *cpumask, return -EINVAL; /* Assumption : cpumask refers to a single CPU */ - spin_lock_irqsave(&gic_lock, flags); + raw_spin_lock_irqsave(&gic_lock, flags); /* Re-route this IRQ */ write_gic_map_vp(irq, BIT(mips_cm_vp_id(cpu))); @@ -279,7 +279,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *cpumask, set_bit(irq, per_cpu_ptr(pcpu_masks, cpu)); irq_data_update_effective_affinity(d, cpumask_of(cpu)); - spin_unlock_irqrestore(&gic_lock, flags); + raw_spin_unlock_irqrestore(&gic_lock, flags); return IRQ_SET_MASK_OK; } @@ -357,12 +357,12 @@ static void gic_mask_local_irq_all_vpes(struct irq_data *d) cd = irq_data_get_irq_chip_data(d); cd->mask = false; - spin_lock_irqsave(&gic_lock, flags); + raw_spin_lock_irqsave(&gic_lock, flags); for_each_online_cpu(cpu) { write_gic_vl_other(mips_cm_vp_id(cpu)); write_gic_vo_rmask(BIT(intr)); } - spin_unlock_irqrestore(&gic_lock, flags); + raw_spin_unlock_irqrestore(&gic_lock, flags); } static void gic_unmask_local_irq_all_vpes(struct irq_data *d) @@ -375,12 +375,12 @@ static void gic_unmask_local_irq_all_vpes(struct irq_data *d) cd = irq_data_get_irq_chip_data(d); cd->mask = true; - spin_lock_irqsave(&gic_lock, flags); + raw_spin_lock_irqsave(&gic_lock, flags); for_each_online_cpu(cpu) { write_gic_vl_other(mips_cm_vp_id(cpu)); write_gic_vo_smask(BIT(intr)); } - spin_unlock_irqrestore(&gic_lock, flags); + raw_spin_unlock_irqrestore(&gic_lock, flags); } static void gic_all_vpes_irq_cpu_online(void) @@ -393,7 +393,7 @@ static void gic_all_vpes_irq_cpu_online(void) unsigned long flags; int i; - spin_lock_irqsave(&gic_lock, flags); + raw_spin_lock_irqsave(&gic_lock, flags); for (i = 0; i < ARRAY_SIZE(local_intrs); i++) { unsigned int intr = local_intrs[i]; @@ -407,7 +407,7 @@ static void gic_all_vpes_irq_cpu_online(void) write_gic_vl_smask(BIT(intr)); } - spin_unlock_irqrestore(&gic_lock, flags); + raw_spin_unlock_irqrestore(&gic_lock, flags); } static struct irq_chip gic_all_vpes_local_irq_controller = { @@ -437,11 +437,11 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, data = irq_get_irq_data(virq); - spin_lock_irqsave(&gic_lock, flags); + raw_spin_lock_irqsave(&gic_lock, flags); write_gic_map_pin(intr, GIC_MAP_PIN_MAP_TO_PIN | gic_cpu_pin); write_gic_map_vp(intr, BIT(mips_cm_vp_id(cpu))); irq_data_update_effective_affinity(data, cpumask_of(cpu)); - spin_unlock_irqrestore(&gic_lock, flags); + raw_spin_unlock_irqrestore(&gic_lock, flags); return 0; } @@ -533,12 +533,12 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, if (!gic_local_irq_is_routable(intr)) return -EPERM; - spin_lock_irqsave(&gic_lock, flags); + raw_spin_lock_irqsave(&gic_lock, flags); for_each_online_cpu(cpu) { write_gic_vl_other(mips_cm_vp_id(cpu)); write_gic_vo_map(mips_gic_vx_map_reg(intr), map); } - spin_unlock_irqrestore(&gic_lock, flags); + raw_spin_unlock_irqrestore(&gic_lock, flags); return 0; } From 14130211be5366a91ec07c3284c183b75d8fba17 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 12 May 2023 18:45:06 +0200 Subject: [PATCH 039/271] irqchip/meson-gpio: Mark OF related data as maybe unused MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver can be compile tested with !CONFIG_OF making certain data unused: drivers/irqchip/irq-meson-gpio.c:153:34: error: ‘meson_irq_gpio_matches’ defined but not used [-Werror=unused-const-variable=] Acked-by: Martin Blumenstingl Signed-off-by: Krzysztof Kozlowski Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230512164506.212267-1-krzysztof.kozlowski@linaro.org --- drivers/irqchip/irq-meson-gpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c index 2aaa9aad3e87..7da18ef95211 100644 --- a/drivers/irqchip/irq-meson-gpio.c +++ b/drivers/irqchip/irq-meson-gpio.c @@ -150,7 +150,7 @@ static const struct meson_gpio_irq_params s4_params = { INIT_MESON_S4_COMMON_DATA(82) }; -static const struct of_device_id meson_irq_gpio_matches[] = { +static const struct of_device_id meson_irq_gpio_matches[] __maybe_unused = { { .compatible = "amlogic,meson8-gpio-intc", .data = &meson8_params }, { .compatible = "amlogic,meson8b-gpio-intc", .data = &meson8b_params }, { .compatible = "amlogic,meson-gxbb-gpio-intc", .data = &gxbb_params }, From cddb536a73ef2c82ce04059dc61c65e85a355561 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 5 May 2023 17:06:54 +0800 Subject: [PATCH 040/271] irqchip/mbigen: Unify the error handling in mbigen_of_create_domain() Dan Carpenter reported that commit fea087fc291b "irqchip/mbigen: move to use bus_get_dev_root()" leads to the following Smatch static checker warning: drivers/irqchip/irq-mbigen.c:258 mbigen_of_create_domain() error: potentially dereferencing uninitialized 'child'. It should not cause a problem on real hardware, but better to fix the warning, let's move the bus_get_dev_root() out of the loop, and unify the error handling to silence it. Reported-by: Dan Carpenter Signed-off-by: Kefeng Wang Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230505090654.12793-1-wangkefeng.wang@huawei.com --- drivers/irqchip/irq-mbigen.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/irqchip/irq-mbigen.c b/drivers/irqchip/irq-mbigen.c index eada5e0e3eb9..5101a3fb11df 100644 --- a/drivers/irqchip/irq-mbigen.c +++ b/drivers/irqchip/irq-mbigen.c @@ -240,26 +240,27 @@ static int mbigen_of_create_domain(struct platform_device *pdev, struct irq_domain *domain; struct device_node *np; u32 num_pins; + int ret = 0; + + parent = bus_get_dev_root(&platform_bus_type); + if (!parent) + return -ENODEV; for_each_child_of_node(pdev->dev.of_node, np) { if (!of_property_read_bool(np, "interrupt-controller")) continue; - parent = bus_get_dev_root(&platform_bus_type); - if (parent) { - child = of_platform_device_create(np, NULL, parent); - put_device(parent); - if (!child) { - of_node_put(np); - return -ENOMEM; - } + child = of_platform_device_create(np, NULL, parent); + if (!child) { + ret = -ENOMEM; + break; } if (of_property_read_u32(child->dev.of_node, "num-pins", &num_pins) < 0) { dev_err(&pdev->dev, "No num-pins property\n"); - of_node_put(np); - return -EINVAL; + ret = -EINVAL; + break; } domain = platform_msi_create_device_domain(&child->dev, num_pins, @@ -267,12 +268,16 @@ static int mbigen_of_create_domain(struct platform_device *pdev, &mbigen_domain_ops, mgn_chip); if (!domain) { - of_node_put(np); - return -ENOMEM; + ret = -ENOMEM; + break; } } - return 0; + put_device(parent); + if (ret) + of_node_put(np); + + return ret; } #ifdef CONFIG_ACPI From 50a1726b148ff30778cb8a6cf3736130b07c93fd Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 15 May 2023 12:20:52 -0400 Subject: [PATCH 041/271] dt-binding: cdns,usb3: Fix cdns,on-chip-buff-size type In cdns3-gadget.c, 'cdns,on-chip-buff-size' was read using device_property_read_u16(). It resulted in 0 if a 32bit value was used in dts. This commit fixes the dt binding doc to declare it as u16. Cc: stable@vger.kernel.org Fixes: 68989fe1c39d ("dt-bindings: usb: Convert cdns-usb3.txt to YAML schema") Signed-off-by: Frank Li Reviewed-by: Krzysztof Kozlowski Signed-off-by: Shawn Guo --- Documentation/devicetree/bindings/usb/cdns,usb3.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/usb/cdns,usb3.yaml b/Documentation/devicetree/bindings/usb/cdns,usb3.yaml index cae46c4982ad..69a93a0722f0 100644 --- a/Documentation/devicetree/bindings/usb/cdns,usb3.yaml +++ b/Documentation/devicetree/bindings/usb/cdns,usb3.yaml @@ -64,7 +64,7 @@ properties: description: size of memory intended as internal memory for endpoints buffers expressed in KB - $ref: /schemas/types.yaml#/definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint16 cdns,phyrst-a-enable: description: Enable resetting of PHY if Rx fail is detected From 0f554e37dad416f445cd3ec5935f5aec1b0e7ba5 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 15 May 2023 12:20:53 -0400 Subject: [PATCH 042/271] arm64: dts: imx8: fix USB 3.0 Gadget Failure in QM & QXPB0 at super speed Resolve USB 3.0 gadget failure for QM and QXPB0 in super speed mode with single IN and OUT endpoints, like mass storage devices, due to incorrect ACTUAL_MEM_SIZE in ep_cap2 (32k instead of actual 18k). Implement dt property cdns,on-chip-buff-size to override ep_cap2 and set it to 18k for imx8QM and imx8QXP chips. No adverse effects for 8QXP C0. Cc: stable@vger.kernel.org Fixes: dce49449e04f ("usb: cdns3: allocate TX FIFO size according to composite EP number") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi index 2209c1ac6e9b..e62a43591361 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-conn.dtsi @@ -171,6 +171,7 @@ conn_subsys: bus@5b000000 { interrupt-names = "host", "peripheral", "otg", "wakeup"; phys = <&usb3_phy>; phy-names = "cdns3,usb3-phy"; + cdns,on-chip-buff-size = /bits/ 16 <18>; status = "disabled"; }; }; From f7ba52f302fdc392e0047f38e50841483d997144 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 18 Apr 2023 14:49:25 -0700 Subject: [PATCH 043/271] vmlinux.lds.h: Discard .note.gnu.property section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When tooling reads ELF notes, it assumes each note entry is aligned to the value listed in the .note section header's sh_addralign field. The kernel-created ELF notes in the .note.Linux and .note.Xen sections are aligned to 4 bytes. This causes the toolchain to set those sections' sh_addralign values to 4. On the other hand, the GCC-created .note.gnu.property section has an sh_addralign value of 8 for some reason, despite being based on struct Elf32_Nhdr which only needs 4-byte alignment. When the mismatched input sections get linked together into the vmlinux .notes output section, the higher alignment "wins", resulting in an sh_addralign of 8, which confuses tooling. For example: $ readelf -n .tmp_vmlinux.btf ... readelf: .tmp_vmlinux.btf: Warning: note with invalid namesz and/or descsz found at offset 0x170 readelf: .tmp_vmlinux.btf: Warning: type: 0x4, namesize: 0x006e6558, descsize: 0x00008801, alignment: 8 In this case readelf thinks there's alignment padding where there is none, so it starts reading an ELF note in the middle. With newer toolchains (e.g., latest Fedora Rawhide), a similar mismatch triggers a build failure when combined with CONFIG_X86_KERNEL_IBT: btf_encoder__encode: btf__dedup failed! Failed to encode BTF libbpf: failed to find '.BTF' ELF section in vmlinux FAILED: load BTF from vmlinux: No data available make[1]: *** [scripts/Makefile.vmlinux:35: vmlinux] Error 255 This latter error was caused by pahole crashing when it encountered the corrupt .notes section. This crash has been fixed in dwarves version 1.25. As Tianyi Liu describes: "Pahole reads .notes to look for LINUX_ELFNOTE_BUILD_LTO. When LTO is enabled, pahole needs to call cus__merge_and_process_cu to merge compile units, at which point there should only be one unspecified type (used to represent some compilation information) in the global context. However, when the kernel is compiled without LTO, if pahole calls cus__merge_and_process_cu due to alignment issues with notes, multiple unspecified types may appear after merging the cus, and older versions of pahole only support up to one. This is why pahole 1.24 crashes, while newer versions support multiple. However, the latest version of pahole still does not solve the problem of incorrect LTO recognition, so compiling the kernel may be slower than normal." Even with the newer pahole, the note section misaligment issue still exists and pahole is misinterpreting the LTO note. Fix it by discarding the .note.gnu.property section. While GNU properties are important for user space (and VDSO), they don't seem to have any use for vmlinux. (In fact, they're already getting (inadvertently) stripped from vmlinux when CONFIG_DEBUG_INFO_BTF is enabled. The BTF data is extracted from vmlinux.o with "objcopy --only-section=.BTF" into .btf.vmlinux.bin.o. That file doesn't have .note.gnu.property, so when it gets modified and linked back into the main object, the linker automatically strips it (see "How GNU properties are merged" in the ld man page).) Reported-by: Daniel Xu Link: https://lkml.kernel.org/bpf/57830c30-cd77-40cf-9cd1-3bb608aa602e@app.fastmail.com Debugged-by: Tianyi Liu Suggested-by: Joan Bruguera Micó Link: https://lore.kernel.org/r/20230418214925.ay3jpf2zhw75kgmd@treble Signed-off-by: Josh Poimboeuf --- include/asm-generic/vmlinux.lds.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index d1f57e4868ed..cebdf1ca415d 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -891,9 +891,16 @@ /* * Discard .note.GNU-stack, which is emitted as PROGBITS by the compiler. * Otherwise, the type of .notes section would become PROGBITS instead of NOTES. + * + * Also, discard .note.gnu.property, otherwise it forces the notes section to + * be 8-byte aligned which causes alignment mismatches with the kernel's custom + * 4-byte aligned notes. */ #define NOTES \ - /DISCARD/ : { *(.note.GNU-stack) } \ + /DISCARD/ : { \ + *(.note.GNU-stack) \ + *(.note.gnu.property) \ + } \ .notes : AT(ADDR(.notes) - LOAD_OFFSET) { \ BOUNDED_SECTION_BY(.note.*, _notes) \ } NOTES_HEADERS \ From 2e4be0d011f21593c6b316806779ba1eba2cd7e0 Mon Sep 17 00:00:00 2001 From: Vernon Lovejoy Date: Fri, 12 May 2023 12:42:32 +0200 Subject: [PATCH 044/271] x86/show_trace_log_lvl: Ensure stack pointer is aligned, again The commit e335bb51cc15 ("x86/unwind: Ensure stack pointer is aligned") tried to align the stack pointer in show_trace_log_lvl(), otherwise the "stack < stack_info.end" check can't guarantee that the last read does not go past the end of the stack. However, we have the same problem with the initial value of the stack pointer, it can also be unaligned. So without this patch this trivial kernel module #include static int init(void) { asm volatile("sub $0x4,%rsp"); dump_stack(); asm volatile("add $0x4,%rsp"); return -EAGAIN; } module_init(init); MODULE_LICENSE("GPL"); crashes the kernel. Fixes: e335bb51cc15 ("x86/unwind: Ensure stack pointer is aligned") Signed-off-by: Vernon Lovejoy Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20230512104232.GA10227@redhat.com Signed-off-by: Josh Poimboeuf --- arch/x86/kernel/dumpstack.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 0bf6779187dd..f18ca44c904b 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -195,7 +195,6 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); - stack = stack ? : get_stack_pointer(task, regs); regs = unwind_get_entry_regs(&state, &partial); /* @@ -214,9 +213,13 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - hardirq stack * - entry stack */ - for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for (stack = stack ?: get_stack_pointer(task, regs); + stack; + stack = stack_info.next_sp) { const char *stack_name; + stack = PTR_ALIGN(stack, sizeof(long)); + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { /* * We weren't on a valid stack. It's possible that From 03262a3f5b5b910c7c2900c2f8884832794355f5 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 2 May 2023 10:50:05 -0400 Subject: [PATCH 045/271] phy: mediatek: rework the floating point comparisons to fixed point MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc on aarch64 reports drivers/phy/mediatek/phy-mtk-hdmi-mt8195.c: In function ‘mtk_hdmi_pll_set_rate’: drivers/phy/mediatek/phy-mtk-hdmi-mt8195.c:240:52: error: ‘-mgeneral-regs-only’ is incompatible with the use of floating-point types 240 | else if (tmds_clk >= 54 * MEGA && tmds_clk < 148.35 * MEGA) Floating point should not be used, so rework the floating point comparisons to fixed point. Signed-off-by: Tom Rix Reviewed-by: Chunfeng Yun Link: https://lore.kernel.org/r/20230502145005.2927101-1-trix@redhat.com Signed-off-by: Vinod Koul --- drivers/phy/mediatek/phy-mtk-hdmi-mt8195.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/phy/mediatek/phy-mtk-hdmi-mt8195.c b/drivers/phy/mediatek/phy-mtk-hdmi-mt8195.c index caa953780bee..8aa7251de4a9 100644 --- a/drivers/phy/mediatek/phy-mtk-hdmi-mt8195.c +++ b/drivers/phy/mediatek/phy-mtk-hdmi-mt8195.c @@ -237,11 +237,11 @@ static int mtk_hdmi_pll_calc(struct mtk_hdmi_phy *hdmi_phy, struct clk_hw *hw, */ if (tmds_clk < 54 * MEGA) txposdiv = 8; - else if (tmds_clk >= 54 * MEGA && tmds_clk < 148.35 * MEGA) + else if (tmds_clk >= 54 * MEGA && (tmds_clk * 100) < 14835 * MEGA) txposdiv = 4; - else if (tmds_clk >= 148.35 * MEGA && tmds_clk < 296.7 * MEGA) + else if ((tmds_clk * 100) >= 14835 * MEGA && (tmds_clk * 10) < 2967 * MEGA) txposdiv = 2; - else if (tmds_clk >= 296.7 * MEGA && tmds_clk <= 594 * MEGA) + else if ((tmds_clk * 10) >= 2967 * MEGA && tmds_clk <= 594 * MEGA) txposdiv = 1; else return -EINVAL; @@ -324,12 +324,12 @@ static int mtk_hdmi_pll_drv_setting(struct clk_hw *hw) clk_channel_bias = 0x34; /* 20mA */ impedance_en = 0xf; impedance = 0x36; /* 100ohm */ - } else if (pixel_clk >= 74.175 * MEGA && pixel_clk <= 300 * MEGA) { + } else if (((u64)pixel_clk * 1000) >= 74175 * MEGA && pixel_clk <= 300 * MEGA) { data_channel_bias = 0x34; /* 20mA */ clk_channel_bias = 0x2c; /* 16mA */ impedance_en = 0xf; impedance = 0x36; /* 100ohm */ - } else if (pixel_clk >= 27 * MEGA && pixel_clk < 74.175 * MEGA) { + } else if (pixel_clk >= 27 * MEGA && ((u64)pixel_clk * 1000) < 74175 * MEGA) { data_channel_bias = 0x14; /* 10mA */ clk_channel_bias = 0x14; /* 10mA */ impedance_en = 0x0; From b949193011540bb17cf1da7795ec42af1b875203 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Fri, 12 May 2023 15:11:41 +0200 Subject: [PATCH 046/271] phy: amlogic: phy-meson-g12a-mipi-dphy-analog: fix CNTL2_DIF_TX_CTL0 value Use the same CNTL2_DIF_TX_CTL0 value used by the vendor, it was reported fixing timings issues. Fixes: 2a56dc650e54 ("phy: amlogic: Add G12A Analog MIPI D-PHY driver") Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20230512-amlogic-v6-4-upstream-dsi-ccf-vim3-v4-10-2592c29ea263@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/amlogic/phy-meson-g12a-mipi-dphy-analog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/amlogic/phy-meson-g12a-mipi-dphy-analog.c b/drivers/phy/amlogic/phy-meson-g12a-mipi-dphy-analog.c index c14089fa7db4..cabdddbbabfd 100644 --- a/drivers/phy/amlogic/phy-meson-g12a-mipi-dphy-analog.c +++ b/drivers/phy/amlogic/phy-meson-g12a-mipi-dphy-analog.c @@ -70,7 +70,7 @@ static int phy_g12a_mipi_dphy_analog_power_on(struct phy *phy) HHI_MIPI_CNTL1_BANDGAP); regmap_write(priv->regmap, HHI_MIPI_CNTL2, - FIELD_PREP(HHI_MIPI_CNTL2_DIF_TX_CTL0, 0x459) | + FIELD_PREP(HHI_MIPI_CNTL2_DIF_TX_CTL0, 0x45a) | FIELD_PREP(HHI_MIPI_CNTL2_DIF_TX_CTL1, 0x2680)); reg = DSI_LANE_CLK; From 2a881183dc5ab2474ef602e48fe7af34db460d95 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 7 May 2023 16:48:18 +0200 Subject: [PATCH 047/271] phy: qcom-snps: correct struct qcom_snps_hsphy kerneldoc Update kerneldoc of struct qcom_snps_hsphy to fix: drivers/phy/qualcomm/phy-qcom-snps-femto-v2.c:135: warning: Function parameter or member 'update_seq_cfg' not described in 'qcom_snps_hsphy' Signed-off-by: Krzysztof Kozlowski Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20230507144818.193039-1-krzysztof.kozlowski@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-snps-femto-v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/qualcomm/phy-qcom-snps-femto-v2.c b/drivers/phy/qualcomm/phy-qcom-snps-femto-v2.c index a59063596214..6c237f3cc66d 100644 --- a/drivers/phy/qualcomm/phy-qcom-snps-femto-v2.c +++ b/drivers/phy/qualcomm/phy-qcom-snps-femto-v2.c @@ -115,11 +115,11 @@ struct phy_override_seq { * * @cfg_ahb_clk: AHB2PHY interface clock * @ref_clk: phy reference clock - * @iface_clk: phy interface clock * @phy_reset: phy reset control * @vregs: regulator supplies bulk data * @phy_initialized: if PHY has been initialized correctly * @mode: contains the current mode the PHY is in + * @update_seq_cfg: tuning parameters for phy init */ struct qcom_snps_hsphy { struct phy *phy; From 2d2f5f1e8fd91fe848aacde28f738fb8e8d95a27 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 3 May 2023 13:41:27 +0300 Subject: [PATCH 048/271] accel/qaic: silence some uninitialized variable warnings Smatch complains that these are not initialized if get_cntl_version() fails but we still print them in the debug message. Not the end of the world, but true enough. Let's just initialize them to a dummy value to make the checker happy. Fixes: c501ca23a6a3 ("accel/qaic: Add uapi and core driver file") Signed-off-by: Dan Carpenter Reviewed-by: Jeffrey Hugo Reviewed-by: Carl Vanderlip Reviewed-by: Pranjal Ramajor Asha Kanojiya [jhugo: Add fixes and reorder varable declarations for style] Signed-off-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/d11ee378-7b06-4b5e-b56f-d66174be1ab3@kili.mountain --- drivers/accel/qaic/qaic_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/qaic/qaic_drv.c b/drivers/accel/qaic/qaic_drv.c index ff80eb571729..2d0828db28d8 100644 --- a/drivers/accel/qaic/qaic_drv.c +++ b/drivers/accel/qaic/qaic_drv.c @@ -262,8 +262,8 @@ static void qaic_destroy_drm_device(struct qaic_device *qdev, s32 partition_id) static int qaic_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id) { + u16 major = -1, minor = -1; struct qaic_device *qdev; - u16 major, minor; int ret; /* From 17eabd6a044b57b2c1b5459eb9d11af3ea909e63 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Mon, 15 May 2023 15:10:57 -0500 Subject: [PATCH 049/271] RDMA/rxe: Fix double unlock in rxe_qp.c A recent patch can cause a double spin_unlock_bh() in rxe_qp_to_attr() at line 715 in rxe_qp.c. Move the 2nd unlock into the if statement. Fixes: f605f26ea196 ("RDMA/rxe: Protect QP state with qp->state_lock") Link: https://lore.kernel.org/r/20230515201056.1591140-1-rpearsonhpe@gmail.com Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/27773078-40ce-414f-8b97-781954da9f25@kili.mountain Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index c5451a4488ca..245dd36638c7 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -712,8 +712,9 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) if (qp->attr.sq_draining) { spin_unlock_bh(&qp->state_lock); cond_resched(); + } else { + spin_unlock_bh(&qp->state_lock); } - spin_unlock_bh(&qp->state_lock); return 0; } From b5f3fe27c54b1fe77d4dd834ccd48a6d694f872e Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 10 May 2023 11:50:56 +0800 Subject: [PATCH 050/271] RDMA/rxe: Convert spin_{lock_bh,unlock_bh} to spin_{lock_irqsave,unlock_irqrestore} We need to call spin_lock_irqsave()/spin_unlock_irqrestore() for state_lock in rxe, otherwsie the callchain: ib_post_send_mad -> spin_lock_irqsave -> ib_post_send -> rxe_post_send -> spin_lock_bh -> spin_unlock_bh -> spin_unlock_irqrestore Causes below traces during run block nvmeof-mp/001 test due to mismatched spinlock nesting: WARNING: CPU: 0 PID: 94794 at kernel/softirq.c:376 __local_bh_enable_ip+0xc2/0x140 [ ... ] CPU: 0 PID: 94794 Comm: kworker/u4:1 Tainted: G E 6.4.0-rc1 #9 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.15.0-0-g2dd4b9b-rebuilt.opensuse.org 04/01/2014 Workqueue: rdma_cm cma_work_handler [rdma_cm] RIP: 0010:__local_bh_enable_ip+0xc2/0x140 Code: 48 85 c0 74 72 5b 41 5c 5d 31 c0 89 c2 89 c1 89 c6 89 c7 41 89 c0 e9 bd 0e 11 01 65 8b 05 f2 65 72 48 85 c0 0f 85 76 ff ff ff <0f> 0b e9 6f ff ff ff e8 d2 39 1c 00 eb 80 4c 89 e7 e8 68 ad 0a 00 RSP: 0018:ffffb7cf818539f0 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000201 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000201 RDI: ffffffffc0f25f79 RBP: ffffb7cf81853a00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffffc0f25f79 R13: ffff8db1f0fa6000 R14: ffff8db2c63ff000 R15: 00000000000000e8 FS: 0000000000000000(0000) GS:ffff8db33bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559758db0f20 CR3: 0000000105124000 CR4: 00000000003506f0 Call Trace: _raw_spin_unlock_bh+0x31/0x40 rxe_post_send+0x59/0x8b0 [rdma_rxe] ib_send_mad+0x26b/0x470 [ib_core] ib_post_send_mad+0x150/0xb40 [ib_core] ? cm_form_tid+0x5b/0x90 [ib_cm] ib_send_cm_req+0x7c8/0xb70 [ib_cm] rdma_connect_locked+0x433/0x940 [rdma_cm] nvme_rdma_cm_handler+0x5d7/0x9c0 [nvme_rdma] cma_cm_event_handler+0x4f/0x170 [rdma_cm] cma_work_handler+0x6a/0xe0 [rdma_cm] process_one_work+0x2a9/0x580 worker_thread+0x52/0x3f0 ? __pfx_worker_thread+0x10/0x10 kthread+0x109/0x140 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 raw_local_irq_restore() called with IRQs enabled WARNING: CPU: 0 PID: 94794 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x37/0x60 [ ... ] CPU: 0 PID: 94794 Comm: kworker/u4:1 Tainted: G W E 6.4.0-rc1 #9 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.15.0-0-g2dd4b9b-rebuilt.opensuse.org 04/01/2014 Workqueue: rdma_cm cma_work_handler [rdma_cm] RIP: 0010:warn_bogus_irq_restore+0x37/0x60 Code: fb 01 77 36 83 e3 01 74 0e 48 8b 5d f8 c9 31 f6 89 f7 e9 ac ea 01 00 48 c7 c7 e0 52 33 b9 c6 05 bb 1c 69 01 01 e8 39 24 f0 fe <0f> 0b 48 8b 5d f8 c9 31 f6 89 f7 e9 89 ea 01 00 0f b6 f3 48 c7 c7 RSP: 0018:ffffb7cf81853a58 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffb7cf81853a60 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db2cfb1a9e8 R13: ffff8db2cfb1a9d8 R14: ffff8db2c63ff000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8db33bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559758db0f20 CR3: 0000000105124000 CR4: 00000000003506f0 Call Trace: _raw_spin_unlock_irqrestore+0x91/0xa0 ib_send_mad+0x1e3/0x470 [ib_core] ib_post_send_mad+0x150/0xb40 [ib_core] ? cm_form_tid+0x5b/0x90 [ib_cm] ib_send_cm_req+0x7c8/0xb70 [ib_cm] rdma_connect_locked+0x433/0x940 [rdma_cm] nvme_rdma_cm_handler+0x5d7/0x9c0 [nvme_rdma] cma_cm_event_handler+0x4f/0x170 [rdma_cm] cma_work_handler+0x6a/0xe0 [rdma_cm] process_one_work+0x2a9/0x580 worker_thread+0x52/0x3f0 ? __pfx_worker_thread+0x10/0x10 kthread+0x109/0x140 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 Fixes: f605f26ea196 ("RDMA/rxe: Protect QP state with qp->state_lock") Link: https://lore.kernel.org/r/20230510035056.881196-1-guoqing.jiang@linux.dev Signed-off-by: Guoqing Jiang Reviewed-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 26 +++++++++++-------- drivers/infiniband/sw/rxe/rxe_net.c | 7 +++--- drivers/infiniband/sw/rxe/rxe_qp.c | 36 +++++++++++++++++---------- drivers/infiniband/sw/rxe/rxe_recv.c | 9 ++++--- drivers/infiniband/sw/rxe/rxe_req.c | 30 ++++++++++++---------- drivers/infiniband/sw/rxe/rxe_resp.c | 14 ++++++----- drivers/infiniband/sw/rxe/rxe_verbs.c | 25 ++++++++++--------- 7 files changed, 86 insertions(+), 61 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index db18ace74d2b..f46c5a5fd0ae 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -115,15 +115,16 @@ static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode) void retransmit_timer(struct timer_list *t) { struct rxe_qp *qp = from_timer(qp, t, retrans_timer); + unsigned long flags; rxe_dbg_qp(qp, "retransmit timer fired\n"); - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp->valid) { qp->comp.timeout = 1; rxe_sched_task(&qp->comp.task); } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) @@ -481,11 +482,13 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) static void comp_check_sq_drain_done(struct rxe_qp *qp) { - spin_lock_bh(&qp->state_lock); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); if (unlikely(qp_state(qp) == IB_QPS_SQD)) { if (qp->attr.sq_draining && qp->comp.psn == qp->req.psn) { qp->attr.sq_draining = 0; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->ibqp.event_handler) { struct ib_event ev; @@ -499,7 +502,7 @@ static void comp_check_sq_drain_done(struct rxe_qp *qp) return; } } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } static inline enum comp_state complete_ack(struct rxe_qp *qp, @@ -625,13 +628,15 @@ static void free_pkt(struct rxe_pkt_info *pkt) */ static void reset_retry_timer(struct rxe_qp *qp) { + unsigned long flags; + if (qp_type(qp) == IB_QPT_RC && qp->qp_timeout_jiffies) { - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) >= IB_QPS_RTS && psn_compare(qp->req.psn, qp->comp.psn) > 0) mod_timer(&qp->retrans_timer, jiffies + qp->qp_timeout_jiffies); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } } @@ -643,18 +648,19 @@ int rxe_completer(struct rxe_qp *qp) struct rxe_pkt_info *pkt = NULL; enum comp_state state; int ret; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (!qp->valid || qp_state(qp) == IB_QPS_ERR || qp_state(qp) == IB_QPS_RESET) { bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR); drain_resp_pkts(qp); flush_send_queue(qp, notify); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); goto exit; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->comp.timeout) { qp->comp.timeout_retry = 1; diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 2bc7361152ea..a38fab19bed1 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -412,15 +412,16 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, int err; int is_request = pkt->mask & RXE_REQ_MASK; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if ((is_request && (qp_state(qp) < IB_QPS_RTS)) || (!is_request && (qp_state(qp) < IB_QPS_RTR))) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n"); goto drop; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); rxe_icrc_generate(skb, pkt); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 245dd36638c7..61a2eb77d999 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -300,6 +300,7 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, struct rxe_cq *rcq = to_rcq(init->recv_cq); struct rxe_cq *scq = to_rcq(init->send_cq); struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL; + unsigned long flags; rxe_get(pd); rxe_get(rcq); @@ -325,10 +326,10 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, if (err) goto err2; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); qp->attr.qp_state = IB_QPS_RESET; qp->valid = 1; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return 0; @@ -492,24 +493,28 @@ static void rxe_qp_reset(struct rxe_qp *qp) /* move the qp to the error state */ void rxe_qp_error(struct rxe_qp *qp) { - spin_lock_bh(&qp->state_lock); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); qp->attr.qp_state = IB_QPS_ERR; /* drain work and packet queues */ rxe_sched_task(&qp->resp.task); rxe_sched_task(&qp->comp.task); rxe_sched_task(&qp->req.task); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } static void rxe_qp_sqd(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { - spin_lock_bh(&qp->state_lock); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); qp->attr.sq_draining = 1; rxe_sched_task(&qp->comp.task); rxe_sched_task(&qp->req.task); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } /* caller should hold qp->state_lock */ @@ -555,14 +560,16 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, qp->attr.cur_qp_state = attr->qp_state; if (mask & IB_QP_STATE) { - spin_lock_bh(&qp->state_lock); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); err = __qp_chk_state(qp, attr, mask); if (!err) { qp->attr.qp_state = attr->qp_state; rxe_dbg_qp(qp, "state -> %s\n", qps2str[attr->qp_state]); } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (err) return err; @@ -688,6 +695,8 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, /* called by the query qp verb */ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { + unsigned long flags; + *attr = qp->attr; attr->rq_psn = qp->resp.psn; @@ -708,12 +717,12 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) /* Applications that get this state typically spin on it. * Yield the processor */ - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp->attr.sq_draining) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); cond_resched(); } else { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } return 0; @@ -737,10 +746,11 @@ int rxe_qp_chk_destroy(struct rxe_qp *qp) static void rxe_qp_do_cleanup(struct work_struct *work) { struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work); + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); qp->valid = 0; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); qp->qp_timeout_jiffies = 0; if (qp_type(qp) == IB_QPT_RC) { diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index 2f953cc74256..5861e4244049 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -14,6 +14,7 @@ static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct rxe_qp *qp) { unsigned int pkt_type; + unsigned long flags; if (unlikely(!qp->valid)) return -EINVAL; @@ -38,19 +39,19 @@ static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, return -EINVAL; } - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (pkt->mask & RXE_REQ_MASK) { if (unlikely(qp_state(qp) < IB_QPS_RTR)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return -EINVAL; } } else { if (unlikely(qp_state(qp) < IB_QPS_RTS)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return -EINVAL; } } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return 0; } diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 65134a9aefe7..5fe7cbae3031 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -99,17 +99,18 @@ static void req_retry(struct rxe_qp *qp) void rnr_nak_timer(struct timer_list *t) { struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer); + unsigned long flags; rxe_dbg_qp(qp, "nak timer fired\n"); - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp->valid) { /* request a send queue retry */ qp->req.need_retry = 1; qp->req.wait_for_rnr_timer = 0; rxe_sched_task(&qp->req.task); } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } static void req_check_sq_drain_done(struct rxe_qp *qp) @@ -118,8 +119,9 @@ static void req_check_sq_drain_done(struct rxe_qp *qp) unsigned int index; unsigned int cons; struct rxe_send_wqe *wqe; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_SQD) { q = qp->sq.queue; index = qp->req.wqe_index; @@ -140,7 +142,7 @@ static void req_check_sq_drain_done(struct rxe_qp *qp) break; qp->attr.sq_draining = 0; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->ibqp.event_handler) { struct ib_event ev; @@ -154,7 +156,7 @@ static void req_check_sq_drain_done(struct rxe_qp *qp) return; } while (0); } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } static struct rxe_send_wqe *__req_next_wqe(struct rxe_qp *qp) @@ -173,6 +175,7 @@ static struct rxe_send_wqe *__req_next_wqe(struct rxe_qp *qp) static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) { struct rxe_send_wqe *wqe; + unsigned long flags; req_check_sq_drain_done(qp); @@ -180,13 +183,13 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) if (wqe == NULL) return NULL; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (unlikely((qp_state(qp) == IB_QPS_SQD) && (wqe->state != wqe_state_processing))) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return NULL; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp); return wqe; @@ -676,16 +679,17 @@ int rxe_requester(struct rxe_qp *qp) struct rxe_queue *q = qp->sq.queue; struct rxe_ah *ah; struct rxe_av *av; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (unlikely(!qp->valid)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); goto exit; } if (unlikely(qp_state(qp) == IB_QPS_ERR)) { wqe = __req_next_wqe(qp); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (wqe) goto err; else @@ -700,10 +704,10 @@ int rxe_requester(struct rxe_qp *qp) qp->req.wait_psn = 0; qp->req.need_retry = 0; qp->req.wait_for_rnr_timer = 0; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); goto exit; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); /* we come here if the retransmit timer has fired * or if the rnr timer has fired. If the retransmit diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 68f6cd188d8e..1da044f6b7d4 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -1047,6 +1047,7 @@ static enum resp_states do_complete(struct rxe_qp *qp, struct ib_uverbs_wc *uwc = &cqe.uibwc; struct rxe_recv_wqe *wqe = qp->resp.wqe; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + unsigned long flags; if (!wqe) goto finish; @@ -1137,12 +1138,12 @@ static enum resp_states do_complete(struct rxe_qp *qp, return RESPST_ERR_CQ_OVERFLOW; finish: - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (unlikely(qp_state(qp) == IB_QPS_ERR)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return RESPST_CHK_RESOURCE; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (unlikely(!pkt)) return RESPST_DONE; @@ -1468,18 +1469,19 @@ int rxe_responder(struct rxe_qp *qp) enum resp_states state; struct rxe_pkt_info *pkt = NULL; int ret; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (!qp->valid || qp_state(qp) == IB_QPS_ERR || qp_state(qp) == IB_QPS_RESET) { bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR); drain_req_pkts(qp); flush_recv_queue(qp, notify); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); goto exit; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index dea605b7f683..4d8f6b8051ff 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -904,10 +904,10 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, if (!err) rxe_sched_task(&qp->req.task); - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_ERR) rxe_sched_task(&qp->comp.task); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return err; } @@ -917,22 +917,23 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, { struct rxe_qp *qp = to_rqp(ibqp); int err; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed"); return -EINVAL; } if (unlikely(qp_state(qp) < IB_QPS_RTS)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_err_qp(qp, "qp not ready to send"); return -EINVAL; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->is_user) { /* Utilize process context to do protocol processing */ @@ -1008,22 +1009,22 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, struct rxe_rq *rq = &qp->rq; unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed"); return -EINVAL; } /* see C10-97.2.1 */ if (unlikely((qp_state(qp) < IB_QPS_INIT))) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_dbg_qp(qp, "qp not ready to post recv"); return -EINVAL; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (unlikely(qp->srq)) { *bad_wr = wr; @@ -1044,10 +1045,10 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, spin_unlock_irqrestore(&rq->producer_lock, flags); - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_ERR) rxe_sched_task(&qp->resp.task); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return err; } From 866422cdddcdf59d8c68e9472d49ba1be29b5fcf Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Thu, 11 May 2023 11:51:03 +0000 Subject: [PATCH 051/271] RDMA/efa: Fix unsupported page sizes in device Device uses 4KB size blocks for user pages indirect list while the driver creates those blocks with the size of PAGE_SIZE of the kernel. On kernels with PAGE_SIZE different than 4KB (ARM RHEL), this leads to a failure on register MR with indirect list because of the miss communication between driver and device. Fixes: 40909f664d27 ("RDMA/efa: Add EFA verbs implementation") Link: https://lore.kernel.org/r/20230511115103.13876-1-ynachum@amazon.com Reviewed-by: Firas Jahjah Reviewed-by: Michael Margolin Signed-off-by: Yonatan Nachum Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 8eca6c14d0cf..2a195c4b0f17 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1403,7 +1403,7 @@ static int pbl_continuous_initialize(struct efa_dev *dev, */ static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl) { - u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, PAGE_SIZE); + u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, EFA_CHUNK_PAYLOAD_SIZE); struct scatterlist *sgl; int sg_dma_cnt, err; From 0642287e3ecdd0d1f88e6a2e63768e16153a990c Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 8 May 2023 23:07:16 -0700 Subject: [PATCH 052/271] dmaengine: idxd: Fix passing freed memory in idxd_cdev_open() Smatch warns: drivers/dma/idxd/cdev.c:327: idxd_cdev_open() warn: 'sva' was already freed. When idxd_wq_set_pasid() fails, the current code unbinds sva and then goes to 'failed_set_pasid' where iommu_sva_unbind_device is called again causing the above warning. [ device_user_pasid_enabled(idxd) is still true when calling failed_set_pasid ] Fix this by removing additional unbind when idxd_wq_set_pasid() fails Fixes: b022f59725f0 ("dmaengine: idxd: add idxd_copy_cr() to copy user completion record during page fault handling") Signed-off-by: Harshit Mogalapalli Acked-by: Fenghua Yu Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20230509060716.2830630-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/cdev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index ecbf67c2ad2b..d32deb9b4e3d 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -277,7 +277,6 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) if (wq_dedicated(wq)) { rc = idxd_wq_set_pasid(wq, pasid); if (rc < 0) { - iommu_sva_unbind_device(sva); dev_err(dev, "wq set pasid failed: %d\n", rc); goto failed_set_pasid; } From 38de368a66360f1859428d5e191b45bd01c20786 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Tue, 16 May 2023 23:13:11 +0530 Subject: [PATCH 053/271] dmaengine: ti: k3-udma: annotate pm function with __maybe_unused We get a warning when PM is not set: ../drivers/dma/ti/k3-udma.c:5552:12: warning: 'udma_pm_resume' defined but not used [-Wunused-function] 5552 | static int udma_pm_resume(struct device *dev) | ^~~~~~~~~~~~~~ ../drivers/dma/ti/k3-udma.c:5530:12: warning: 'udma_pm_suspend' defined but not used [-Wunused-function] 5530 | static int udma_pm_suspend(struct device *dev) | ^~~~~~~~~~~~~~~ Fix this by annotating pm function with __maybe_unused Fixes: fbe05149e40b ("dmaengine: ti: k3-udma: Add system suspend/resume support") Reported-by: Randy Dunlap Signed-off-by: Vinod Koul Acked-by: Randy Dunlap Tested-by: Randy Dunlap # build-tested Link: https://lore.kernel.org/r/20230516174311.117264-1-vkoul@kernel.org Signed-off-by: Vinod Koul --- drivers/dma/ti/k3-udma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c index fc3a2a05ab7b..b8329a23728d 100644 --- a/drivers/dma/ti/k3-udma.c +++ b/drivers/dma/ti/k3-udma.c @@ -5527,7 +5527,7 @@ static int udma_probe(struct platform_device *pdev) return ret; } -static int udma_pm_suspend(struct device *dev) +static int __maybe_unused udma_pm_suspend(struct device *dev) { struct udma_dev *ud = dev_get_drvdata(dev); struct dma_device *dma_dev = &ud->ddev; @@ -5549,7 +5549,7 @@ static int udma_pm_suspend(struct device *dev) return 0; } -static int udma_pm_resume(struct device *dev) +static int __maybe_unused udma_pm_resume(struct device *dev) { struct udma_dev *ud = dev_get_drvdata(dev); struct dma_device *dma_dev = &ud->ddev; From 0a1bb16e0fe6650c3841e611de374bfd5578ad70 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Tue, 16 May 2023 22:47:56 -0700 Subject: [PATCH 054/271] gpio: mockup: Fix mode of debugfs files This driver's debugfs files have had a read operation since commit 2a9e27408e12 ("gpio: mockup: rework debugfs interface"), but were still being created with write-only mode bits. Update them to indicate that the files can also be read. Signed-off-by: Zev Weiss Fixes: 2a9e27408e12 ("gpio: mockup: rework debugfs interface") Cc: stable@kernel.org # v5.1+ Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mockup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-mockup.c b/drivers/gpio/gpio-mockup.c index e6a7049bef64..b32063ac845a 100644 --- a/drivers/gpio/gpio-mockup.c +++ b/drivers/gpio/gpio-mockup.c @@ -369,7 +369,7 @@ static void gpio_mockup_debugfs_setup(struct device *dev, priv->offset = i; priv->desc = gpiochip_get_desc(gc, i); - debugfs_create_file(name, 0200, chip->dbg_dir, priv, + debugfs_create_file(name, 0600, chip->dbg_dir, priv, &gpio_mockup_debugfs_ops); } } From 1aa7f416175619e0286fddc5fc44e968b06bf2aa Mon Sep 17 00:00:00 2001 From: Maya Matuszczyk Date: Mon, 15 May 2023 20:48:43 +0200 Subject: [PATCH 055/271] drm: panel-orientation-quirks: Change Air's quirk to support Air Plus It turned out that Aya Neo Air Plus had a different board name than expected. This patch changes Aya Neo Air's quirk to account for that, as both devices share "Air" in DMI product name. Tested on Air claiming to be an Air Pro, and on Air Plus. Signed-off-by: Maya Matuszczyk Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20230515184843.1552612-1-maccraft123mc@gmail.com --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index b1a38e6ce2f8..0cb646cb04ee 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -179,7 +179,7 @@ static const struct dmi_system_id orientation_data[] = { }, { /* AYA NEO AIR */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "AYANEO"), - DMI_MATCH(DMI_BOARD_NAME, "AIR"), + DMI_MATCH(DMI_PRODUCT_NAME, "AIR"), }, .driver_data = (void *)&lcd1080x1920_leftside_up, }, { /* AYA NEO NEXT */ From 7dd3d9bd873f138675cb727eaa51a498d99f0e89 Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Thu, 4 May 2023 08:04:21 +0200 Subject: [PATCH 056/271] gpiolib: fix allocation of mixed dynamic/static GPIOs If static allocation and dynamic allocation GPIOs are present, dynamic allocation pollutes the numberspace for static allocation, causing static allocation to fail. Enforce dynamic allocation above GPIO_DYNAMIC_BASE. Seen on a GTA04 when omap-gpio (static) and twl-gpio (dynamic) raced: [some successful registrations of omap_gpio instances] [ 2.553833] twl4030_gpio twl4030-gpio: gpio (irq 145) chaining IRQs 161..178 [ 2.561401] gpiochip_find_base: found new base at 160 [ 2.564392] gpio gpiochip5: (twl4030): added GPIO chardev (254:5) [ 2.564544] gpio gpiochip5: registered GPIOs 160 to 177 on twl4030 [...] [ 2.692169] omap-gpmc 6e000000.gpmc: GPMC revision 5.0 [ 2.697357] gpmc_mem_init: disabling cs 0 mapped at 0x0-0x1000000 [ 2.703643] gpiochip_find_base: found new base at 178 [ 2.704376] gpio gpiochip6: (omap-gpmc): added GPIO chardev (254:6) [ 2.704589] gpio gpiochip6: registered GPIOs 178 to 181 on omap-gpmc [...] [ 2.840393] gpio gpiochip7: Static allocation of GPIO base is deprecated, use dynamic allocation. [ 2.849365] gpio gpiochip7: (gpio-160-191): GPIO integer space overlap, cannot add chip [ 2.857513] gpiochip_add_data_with_key: GPIOs 160..191 (gpio-160-191) failed to register, -16 [ 2.866149] omap_gpio 48310000.gpio: error -EBUSY: Could not register gpio chip On that device it is fixed invasively by commit 92bf78b33b0b4 ("gpio: omap: use dynamic allocation of base") but let's also fix that for devices where there is still a mixture of static and dynamic allocation. Fixes: 7b61212f2a07 ("gpiolib: Get rid of ARCH_NR_GPIOS") Signed-off-by: Andreas Kemnade Reviewed-by: Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 04fb05df805b..a7220e04a93e 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -209,6 +209,8 @@ static int gpiochip_find_base(int ngpio) break; /* nope, check the space right after the chip */ base = gdev->base + gdev->ngpio; + if (base < GPIO_DYNAMIC_BASE) + base = GPIO_DYNAMIC_BASE; } if (gpio_is_valid(base)) { From 7561551e7ba870b9659083b95feb520fb2dacce3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 13 Apr 2023 13:57:17 +0800 Subject: [PATCH 057/271] btrfs: scrub: try harder to mark RAID56 block groups read-only Currently we allow a block group not to be marked read-only for scrub. But for RAID56 block groups if we require the block group to be read-only, then we're allowed to use cached content from scrub stripe to reduce unnecessary RAID56 reads. So this patch would: - Make btrfs_inc_block_group_ro() try harder During my tests, for cases like btrfs/061 and btrfs/064, we can hit ENOSPC from btrfs_inc_block_group_ro() calls during scrub. The reason is if we only have one single data chunk, and trying to scrub it, we won't have any space left for any newer data writes. But this check should be done by the caller, especially for scrub cases we only temporarily mark the chunk read-only. And newer data writes would always try to allocate a new data chunk when needed. - Return error for scrub if we failed to mark a RAID56 chunk read-only Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 14 ++++++++++++-- fs/btrfs/scrub.c | 9 ++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 957ad1c31c4f..590b03560265 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2818,10 +2818,20 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, } ret = inc_block_group_ro(cache, 0); - if (!do_chunk_alloc || ret == -ETXTBSY) - goto unlock_out; if (!ret) goto out; + if (ret == -ETXTBSY) + goto unlock_out; + + /* + * Skip chunk alloction if the bg is SYSTEM, this is to avoid system + * chunk allocation storm to exhaust the system chunk array. Otherwise + * we still want to try our best to mark the block group read-only. + */ + if (!do_chunk_alloc && ret == -ENOSPC && + (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM)) + goto unlock_out; + alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 836725a19661..dd37cba58022 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2518,13 +2518,20 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (ret == 0) { ro_set = 1; - } else if (ret == -ENOSPC && !sctx->is_dev_replace) { + } else if (ret == -ENOSPC && !sctx->is_dev_replace && + !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) { /* * btrfs_inc_block_group_ro return -ENOSPC when it * failed in creating new chunk for metadata. * It is not a problem for scrub, because * metadata are always cowed, and our scrub paused * commit_transactions. + * + * For RAID56 chunks, we have to mark them read-only + * for scrub, as later we would use our own cache + * out of RAID56 realm. + * Thus we want the RAID56 bg to be marked RO to + * prevent RMW from screwing up out cache. */ ro_set = 0; } else if (ret == -ETXTBSY) { From 806570c0bb7b4847828c22c4934fcf2dc8fc572f Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 May 2023 13:58:13 +0200 Subject: [PATCH 058/271] btrfs: handle memory allocation failure in btrfs_csum_one_bio Since f8a53bb58ec7 ("btrfs: handle checksum generation in the storage layer") the failures of btrfs_csum_one_bio() are handled via bio_end_io(). This means, we can return BLK_STS_RESOURCE from btrfs_csum_one_bio() in case the allocation of the ordered sums fails. This also fixes a syzkaller report, where injecting a failure into the kvzalloc() call results in a BUG_ON(). Reported-by: syzbot+d8941552e21eac774778@syzkaller.appspotmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Anand Jain Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index cd4cce9ba443..d1cd0a692f8e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -792,7 +792,9 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bytes_left), GFP_KERNEL); memalloc_nofs_restore(nofs_flag); - BUG_ON(!sums); /* -ENOMEM */ + if (!sums) + return BLK_STS_RESOURCE; + sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, offset); From 597441b3436a43011f31ce71dc0a6c0bf5ce958a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 May 2023 12:45:59 -0400 Subject: [PATCH 059/271] btrfs: use nofs when cleaning up aborted transactions Our CI system caught a lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 6.3.0-rc7+ #1167 Not tainted ------------------------------------------------------ kswapd0/46 is trying to acquire lock: ffff8c6543abd650 (sb_internal#2){++++}-{0:0}, at: btrfs_commit_inode_delayed_inode+0x5f/0x120 but task is already holding lock: ffffffffabe61b40 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x4aa/0x7a0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0xa5/0xe0 kmem_cache_alloc+0x31/0x2c0 alloc_extent_state+0x1d/0xd0 __clear_extent_bit+0x2e0/0x4f0 try_release_extent_mapping+0x216/0x280 btrfs_release_folio+0x2e/0x90 invalidate_inode_pages2_range+0x397/0x470 btrfs_cleanup_dirty_bgs+0x9e/0x210 btrfs_cleanup_one_transaction+0x22/0x760 btrfs_commit_transaction+0x3b7/0x13a0 create_subvol+0x59b/0x970 btrfs_mksubvol+0x435/0x4f0 __btrfs_ioctl_snap_create+0x11e/0x1b0 btrfs_ioctl_snap_create_v2+0xbf/0x140 btrfs_ioctl+0xa45/0x28f0 __x64_sys_ioctl+0x88/0xc0 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc -> #0 (sb_internal#2){++++}-{0:0}: __lock_acquire+0x1435/0x21a0 lock_acquire+0xc2/0x2b0 start_transaction+0x401/0x730 btrfs_commit_inode_delayed_inode+0x5f/0x120 btrfs_evict_inode+0x292/0x3d0 evict+0xcc/0x1d0 inode_lru_isolate+0x14d/0x1e0 __list_lru_walk_one+0xbe/0x1c0 list_lru_walk_one+0x58/0x80 prune_icache_sb+0x39/0x60 super_cache_scan+0x161/0x1f0 do_shrink_slab+0x163/0x340 shrink_slab+0x1d3/0x290 shrink_node+0x300/0x720 balance_pgdat+0x35c/0x7a0 kswapd+0x205/0x410 kthread+0xf0/0x120 ret_from_fork+0x29/0x50 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(sb_internal#2); lock(fs_reclaim); lock(sb_internal#2); *** DEADLOCK *** 3 locks held by kswapd0/46: #0: ffffffffabe61b40 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x4aa/0x7a0 #1: ffffffffabe50270 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x113/0x290 #2: ffff8c6543abd0e0 (&type->s_umount_key#44){++++}-{3:3}, at: super_cache_scan+0x38/0x1f0 stack backtrace: CPU: 0 PID: 46 Comm: kswapd0 Not tainted 6.3.0-rc7+ #1167 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack_lvl+0x58/0x90 check_noncircular+0xd6/0x100 ? save_trace+0x3f/0x310 ? add_lock_to_list+0x97/0x120 __lock_acquire+0x1435/0x21a0 lock_acquire+0xc2/0x2b0 ? btrfs_commit_inode_delayed_inode+0x5f/0x120 start_transaction+0x401/0x730 ? btrfs_commit_inode_delayed_inode+0x5f/0x120 btrfs_commit_inode_delayed_inode+0x5f/0x120 btrfs_evict_inode+0x292/0x3d0 ? lock_release+0x134/0x270 ? __pfx_wake_bit_function+0x10/0x10 evict+0xcc/0x1d0 inode_lru_isolate+0x14d/0x1e0 __list_lru_walk_one+0xbe/0x1c0 ? __pfx_inode_lru_isolate+0x10/0x10 ? __pfx_inode_lru_isolate+0x10/0x10 list_lru_walk_one+0x58/0x80 prune_icache_sb+0x39/0x60 super_cache_scan+0x161/0x1f0 do_shrink_slab+0x163/0x340 shrink_slab+0x1d3/0x290 shrink_node+0x300/0x720 balance_pgdat+0x35c/0x7a0 kswapd+0x205/0x410 ? __pfx_autoremove_wake_function+0x10/0x10 ? __pfx_kswapd+0x10/0x10 kthread+0xf0/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x29/0x50 This happens because when we abort the transaction in the transaction commit path we call invalidate_inode_pages2_range on our block group cache inodes (if we have space cache v1) and any delalloc inodes we may have. The plain invalidate_inode_pages2_range() call passes through GFP_KERNEL, which makes sense in most cases, but not here. Wrap these two invalidate callees with memalloc_nofs_save/memalloc_nofs_restore to make sure we don't end up with the fs reclaim dependency under the transaction dependency. CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fbf9006c6234..6de6dcf2743e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4936,7 +4936,11 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) */ inode = igrab(&btrfs_inode->vfs_inode); if (inode) { + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); invalidate_inode_pages2(inode->i_mapping); + memalloc_nofs_restore(nofs_flag); iput(inode); } spin_lock(&root->delalloc_lock); @@ -5042,7 +5046,12 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) inode = cache->io_ctl.inode; if (inode) { + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); invalidate_inode_pages2(inode->i_mapping); + memalloc_nofs_restore(nofs_flag); + BTRFS_I(inode)->generation = 0; cache->io_ctl.inode = NULL; iput(inode); From f15afbd34d8fadbd375f1212e97837e32bc170cc Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Mon, 24 Apr 2023 13:18:35 +0800 Subject: [PATCH 060/271] fs: fix undefined behavior in bit shift for SB_NOUSER Shifting signed 32-bit value by 31 bits is undefined, so changing significant bit to unsigned. It was spotted by UBSAN. So let's just fix this by using the BIT() helper for all SB_* flags. Fixes: e462ec50cb5f ("VFS: Differentiate mount flags (MS_*) from internal superblock flags") Signed-off-by: Hao Ge Message-Id: <20230424051835.374204-1-gehao@kylinos.cn> [brauner@kernel.org: use BIT() for all SB_* flags] Signed-off-by: Christian Brauner --- include/linux/fs.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 21a981680856..133f0640fb24 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1076,29 +1076,29 @@ extern int send_sigurg(struct fown_struct *fown); * sb->s_flags. Note that these mirror the equivalent MS_* flags where * represented in both. */ -#define SB_RDONLY 1 /* Mount read-only */ -#define SB_NOSUID 2 /* Ignore suid and sgid bits */ -#define SB_NODEV 4 /* Disallow access to device special files */ -#define SB_NOEXEC 8 /* Disallow program execution */ -#define SB_SYNCHRONOUS 16 /* Writes are synced at once */ -#define SB_MANDLOCK 64 /* Allow mandatory locks on an FS */ -#define SB_DIRSYNC 128 /* Directory modifications are synchronous */ -#define SB_NOATIME 1024 /* Do not update access times. */ -#define SB_NODIRATIME 2048 /* Do not update directory access times */ -#define SB_SILENT 32768 -#define SB_POSIXACL (1<<16) /* VFS does not apply the umask */ -#define SB_INLINECRYPT (1<<17) /* Use blk-crypto for encrypted files */ -#define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */ -#define SB_I_VERSION (1<<23) /* Update inode I_version field */ -#define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ +#define SB_RDONLY BIT(0) /* Mount read-only */ +#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ +#define SB_NODEV BIT(2) /* Disallow access to device special files */ +#define SB_NOEXEC BIT(3) /* Disallow program execution */ +#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ +#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ +#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ +#define SB_NOATIME BIT(10) /* Do not update access times. */ +#define SB_NODIRATIME BIT(11) /* Do not update directory access times */ +#define SB_SILENT BIT(15) +#define SB_POSIXACL BIT(16) /* VFS does not apply the umask */ +#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ +#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ +#define SB_I_VERSION BIT(23) /* Update inode I_version field */ +#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ -#define SB_SUBMOUNT (1<<26) -#define SB_FORCE (1<<27) -#define SB_NOSEC (1<<28) -#define SB_BORN (1<<29) -#define SB_ACTIVE (1<<30) -#define SB_NOUSER (1<<31) +#define SB_SUBMOUNT BIT(26) +#define SB_FORCE BIT(27) +#define SB_NOSEC BIT(28) +#define SB_BORN BIT(29) +#define SB_ACTIVE BIT(30) +#define SB_NOUSER BIT(31) /* These flags relate to encoding and casefolding */ #define SB_ENC_STRICT_MODE_FL (1 << 0) From 3a7bb21b6f49b4d1695d4ba8ff31e9619cbcebe3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 16 May 2023 08:46:54 -0400 Subject: [PATCH 061/271] fs: don't call posix_acl_listxattr in generic_listxattr Commit f2620f166e2a caused the kernel to start emitting POSIX ACL xattrs for NFSv4 inodes, which it doesn't support. The only other user of generic_listxattr is HFS (classic) and it doesn't support POSIX ACLs either. Fixes: f2620f166e2a xattr: simplify listxattr helpers Reported-by: Ondrej Valousek Signed-off-by: Jeff Layton Message-Id: <20230516124655.82283-1-jlayton@kernel.org> Signed-off-by: Christian Brauner --- fs/xattr.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index fcf67d80d7f9..e7bbb7f57557 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -985,9 +985,16 @@ int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) return 0; } -/* +/** + * generic_listxattr - run through a dentry's xattr list() operations + * @dentry: dentry to list the xattrs + * @buffer: result buffer + * @buffer_size: size of @buffer + * * Combine the results of the list() operation from every xattr_handler in the - * list. + * xattr_handler stack. + * + * Note that this will not include the entries for POSIX ACLs. */ ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) @@ -996,10 +1003,6 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) ssize_t remaining_size = buffer_size; int err = 0; - err = posix_acl_listxattr(d_inode(dentry), &buffer, &remaining_size); - if (err) - return err; - for_each_xattr_handler(handlers, handler) { if (!handler->name || (handler->list && !handler->list(dentry))) continue; From 48524463f807ec516a291bdf717dcf2c8e059f51 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 17 May 2023 16:07:39 +0900 Subject: [PATCH 062/271] ntfs: Add myself as a reviewer I'm volunteering to help review patches for current unmaintained ntfs filesytem. Signed-off-by: Namjae Jeon Acked-by: Bagas Sanjaya Message-Id: <20230517070739.6505-1-linkinjeon@kernel.org> Signed-off-by: Christian Brauner --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e0ad886d3163..83888aa3298c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14928,6 +14928,7 @@ F: drivers/ntb/hw/intel/ NTFS FILESYSTEM M: Anton Altaparmakov +R: Namjae Jeon L: linux-ntfs-dev@lists.sourceforge.net S: Supported W: http://www.tuxera.com/ From 58caa2a51ad4fd21763696cc6c4defc9fc1b4b4f Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 12 May 2023 17:22:43 +0800 Subject: [PATCH 063/271] RDMA/hns: Fix timeout attr in query qp for HIP08 On HIP08, the queried timeout attr is different from the timeout attr configured by the user. It is found by rdma-core testcase test_rdmacm_async_traffic: ====================================================================== FAIL: test_rdmacm_async_traffic (tests.test_rdmacm.CMTestCase) ---------------------------------------------------------------------- Traceback (most recent call last): File "./tests/test_rdmacm.py", line 33, in test_rdmacm_async_traffic self.two_nodes_rdmacm_traffic(CMAsyncConnection, self.rdmacm_traffic, File "./tests/base.py", line 382, in two_nodes_rdmacm_traffic raise(res) AssertionError Fixes: 926a01dc000d ("RDMA/hns: Add QP operations support for hip08 SoC") Link: https://lore.kernel.org/r/20230512092245.344442-2-huangjunxian6@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 17 ++++++++++++++--- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 ++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 84f1167de1d9..3a1c90406ed9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5012,7 +5012,6 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, static bool check_qp_timeout_cfg_range(struct hns_roce_dev *hr_dev, u8 *timeout) { #define QP_ACK_TIMEOUT_MAX_HIP08 20 -#define QP_ACK_TIMEOUT_OFFSET 10 #define QP_ACK_TIMEOUT_MAX 31 if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) { @@ -5021,7 +5020,7 @@ static bool check_qp_timeout_cfg_range(struct hns_roce_dev *hr_dev, u8 *timeout) "local ACK timeout shall be 0 to 20.\n"); return false; } - *timeout += QP_ACK_TIMEOUT_OFFSET; + *timeout += HNS_ROCE_V2_QP_ACK_TIMEOUT_OFS_HIP08; } else if (hr_dev->pci_dev->revision > PCI_REVISION_ID_HIP08) { if (*timeout > QP_ACK_TIMEOUT_MAX) { ibdev_warn(&hr_dev->ib_dev, @@ -5307,6 +5306,18 @@ out: return ret; } +static u8 get_qp_timeout_attr(struct hns_roce_dev *hr_dev, + struct hns_roce_v2_qp_context *context) +{ + u8 timeout; + + timeout = (u8)hr_reg_read(context, QPC_AT); + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + timeout -= HNS_ROCE_V2_QP_ACK_TIMEOUT_OFS_HIP08; + + return timeout; +} + static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) @@ -5384,7 +5395,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_attr->max_dest_rd_atomic = 1 << hr_reg_read(&context, QPC_RR_MAX); qp_attr->min_rnr_timer = (u8)hr_reg_read(&context, QPC_MIN_RNR_TIME); - qp_attr->timeout = (u8)hr_reg_read(&context, QPC_AT); + qp_attr->timeout = get_qp_timeout_attr(hr_dev, &context); qp_attr->retry_cnt = hr_reg_read(&context, QPC_RETRY_NUM_INIT); qp_attr->rnr_retry = hr_reg_read(&context, QPC_RNR_NUM_INIT); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 1b44d2434ab4..7033eae2407c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -44,6 +44,8 @@ #define HNS_ROCE_V2_MAX_XRCD_NUM 0x1000000 #define HNS_ROCE_V2_RSV_XRCD_NUM 0 +#define HNS_ROCE_V2_QP_ACK_TIMEOUT_OFS_HIP08 10 + #define HNS_ROCE_V3_SCCC_SZ 64 #define HNS_ROCE_V3_GMV_ENTRY_SZ 32 From 7f3969b14f356dd65fa95b3528eb05c32e68bc06 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 12 May 2023 17:22:44 +0800 Subject: [PATCH 064/271] RDMA/hns: Fix base address table allocation For hns, the specification of an entry like resource (E.g. WQE/CQE/EQE) depends on BT page size, buf page size and hopnum. For user mode, the buf page size depends on UMEM. Therefore, the actual specification is controlled by BT page size and hopnum. The current BT page size and hopnum are obtained from firmware. This makes the driver inflexible and introduces unnecessary constraints. Resource allocation failures occur in many scenarios. This patch will calculate whether the BT page size set by firmware is sufficient before allocating BT, and increase the BT page size if it is insufficient. Fixes: 1133401412a9 ("RDMA/hns: Optimize base address table config flow for qp buffer") Link: https://lore.kernel.org/r/20230512092245.344442-3-huangjunxian6@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_mr.c | 43 +++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 37a5cf62f88b..14376490ac22 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -33,6 +33,7 @@ #include #include +#include #include "hns_roce_device.h" #include "hns_roce_cmd.h" #include "hns_roce_hem.h" @@ -909,6 +910,44 @@ static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev, return page_cnt; } +static u64 cal_pages_per_l1ba(unsigned int ba_per_bt, unsigned int hopnum) +{ + return int_pow(ba_per_bt, hopnum - 1); +} + +static unsigned int cal_best_bt_pg_sz(struct hns_roce_dev *hr_dev, + struct hns_roce_mtr *mtr, + unsigned int pg_shift) +{ + unsigned long cap = hr_dev->caps.page_size_cap; + struct hns_roce_buf_region *re; + unsigned int pgs_per_l1ba; + unsigned int ba_per_bt; + unsigned int ba_num; + int i; + + for_each_set_bit_from(pg_shift, &cap, sizeof(cap) * BITS_PER_BYTE) { + if (!(BIT(pg_shift) & cap)) + continue; + + ba_per_bt = BIT(pg_shift) / BA_BYTE_LEN; + ba_num = 0; + for (i = 0; i < mtr->hem_cfg.region_count; i++) { + re = &mtr->hem_cfg.region[i]; + if (re->hopnum == 0) + continue; + + pgs_per_l1ba = cal_pages_per_l1ba(ba_per_bt, re->hopnum); + ba_num += DIV_ROUND_UP(re->count, pgs_per_l1ba); + } + + if (ba_num <= ba_per_bt) + return pg_shift; + } + + return 0; +} + static int mtr_alloc_mtt(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, unsigned int ba_page_shift) { @@ -917,6 +956,10 @@ static int mtr_alloc_mtt(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, hns_roce_hem_list_init(&mtr->hem_list); if (!cfg->is_direct) { + ba_page_shift = cal_best_bt_pg_sz(hr_dev, mtr, ba_page_shift); + if (!ba_page_shift) + return -ERANGE; + ret = hns_roce_hem_list_request(hr_dev, &mtr->hem_list, cfg->region, cfg->region_count, ba_page_shift); From 56518a603fd2bf74762d176ac980572db84a3e14 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Fri, 12 May 2023 17:22:45 +0800 Subject: [PATCH 065/271] RDMA/hns: Modify the value of long message loopback slice Long message loopback slice is used for achieving traffic balance between QPs. It prevents the problem that QPs with large traffic occupying the hardware pipeline for a long time and QPs with small traffic cannot be scheduled. Currently, its maximum value is set to 16K, which means only after a QP sends 16K will the second QP be scheduled. This value is too large, which will lead to unbalanced traffic scheduling, and thus it needs to be modified. The setting range of the long message loopback slice is modified to be from 1024 (the lower limit supported by hardware) to mtu. Actual testing shows that this value can significantly reduce error in hardware traffic scheduling. This solution is compatible with both HIP08 and HIP09. The modified lp_pktn_ini has a maximum value of 2 (when mtu is 256), so the range checking code for lp_pktn_ini is no longer necessary and needs to be deleted. Fixes: 0e60778efb07 ("RDMA/hns: Modify the value of MAX_LP_MSG_LEN to meet hardware compatibility") Link: https://lore.kernel.org/r/20230512092245.344442-4-huangjunxian6@hisilicon.com Signed-off-by: Yangyang Li Signed-off-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 3a1c90406ed9..d4c6b9bc0a4e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4583,11 +4583,9 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, mtu = ib_mtu_enum_to_int(ib_mtu); if (WARN_ON(mtu <= 0)) return -EINVAL; -#define MAX_LP_MSG_LEN 16384 - /* MTU * (2 ^ LP_PKTN_INI) shouldn't be bigger than 16KB */ - lp_pktn_ini = ilog2(MAX_LP_MSG_LEN / mtu); - if (WARN_ON(lp_pktn_ini >= 0xF)) - return -EINVAL; +#define MIN_LP_MSG_LEN 1024 + /* mtu * (2 ^ lp_pktn_ini) should be in the range of 1024 to mtu */ + lp_pktn_ini = ilog2(max(mtu, MIN_LP_MSG_LEN) / mtu); if (attr_mask & IB_QP_PATH_MTU) { hr_reg_write(context, QPC_MTU, ib_mtu); From 3981514180c987a79ea98f0ae06a7cbf58a9ac0f Mon Sep 17 00:00:00 2001 From: Jim Wylder Date: Wed, 17 May 2023 10:20:11 -0500 Subject: [PATCH 066/271] regmap: Account for register length when chunking Currently, when regmap_raw_write() splits the data, it uses the max_raw_write value defined for the bus. For any bus that includes the target register address in the max_raw_write value, the chunked transmission will always exceed the maximum transmission length. To avoid this problem, subtract the length of the register and the padding from the maximum transmission. Signed-off-by: Jim Wylder max_raw_write - map->format.reg_bytes - + map->format.pad_bytes; int ret, i; if (!val_count) @@ -2089,8 +2091,8 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg, if (map->use_single_write) chunk_regs = 1; - else if (map->max_raw_write && val_len > map->max_raw_write) - chunk_regs = map->max_raw_write / val_bytes; + else if (map->max_raw_write && val_len > max_data) + chunk_regs = max_data / val_bytes; chunk_count = val_count / chunk_regs; chunk_bytes = chunk_regs * val_bytes; From eb0764b822b9b26880b28ccb9100b2983e01bc17 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 17 May 2023 20:19:43 -0700 Subject: [PATCH 067/271] cxl/port: Enable the HDM decoder capability for switch ports Derick noticed, when testing hot plug, that hot-add behaves nominally after a removal. However, if the hot-add is done without a prior removal, CXL.mem accesses fail. It turns out that the original implementation of the port driver and region programming wrongly assumed that platform-firmware always enables the host-bridge HDM decoder capability. Add support turning on switch-level HDM decoders in the case where platform-firmware has not. The implementation is careful to only arrange for the enable to be undone if the current instance of the driver was the one that did the enable. This is to interoperate with platform-firmware that may expect CXL.mem to remain active after the driver is shutdown. This comes at the cost of potentially not shutting down the enable on kexec flows, but it is mitigated by the fact that the related HDM decoders still need to be enabled on an individual basis. Cc: Reported-by: Derick Marks Fixes: 54cdbf845cf7 ("cxl/port: Add a driver for 'struct cxl_port' objects") Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/168437998331.403037.15719879757678389217.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/pci.c | 27 +++++++++++++++++++++++---- drivers/cxl/cxl.h | 1 + drivers/cxl/port.c | 14 +++++++++----- tools/testing/cxl/Kbuild | 1 + tools/testing/cxl/test/mock.c | 15 +++++++++++++++ 5 files changed, 49 insertions(+), 9 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index f332fe7af92b..c35c002b65dd 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -241,17 +241,36 @@ static void disable_hdm(void *_cxlhdm) hdm + CXL_HDM_DECODER_CTRL_OFFSET); } -static int devm_cxl_enable_hdm(struct device *host, struct cxl_hdm *cxlhdm) +int devm_cxl_enable_hdm(struct cxl_port *port, struct cxl_hdm *cxlhdm) { - void __iomem *hdm = cxlhdm->regs.hdm_decoder; + void __iomem *hdm; u32 global_ctrl; + /* + * If the hdm capability was not mapped there is nothing to enable and + * the caller is responsible for what happens next. For example, + * emulate a passthrough decoder. + */ + if (IS_ERR(cxlhdm)) + return 0; + + hdm = cxlhdm->regs.hdm_decoder; global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET); + + /* + * If the HDM decoder capability was enabled on entry, skip + * registering disable_hdm() since this decode capability may be + * owned by platform firmware. + */ + if (global_ctrl & CXL_HDM_DECODER_ENABLE) + return 0; + writel(global_ctrl | CXL_HDM_DECODER_ENABLE, hdm + CXL_HDM_DECODER_CTRL_OFFSET); - return devm_add_action_or_reset(host, disable_hdm, cxlhdm); + return devm_add_action_or_reset(&port->dev, disable_hdm, cxlhdm); } +EXPORT_SYMBOL_NS_GPL(devm_cxl_enable_hdm, CXL); int cxl_dvsec_rr_decode(struct device *dev, int d, struct cxl_endpoint_dvsec_info *info) @@ -425,7 +444,7 @@ int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm, if (info->mem_enabled) return 0; - rc = devm_cxl_enable_hdm(&port->dev, cxlhdm); + rc = devm_cxl_enable_hdm(port, cxlhdm); if (rc) return rc; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 044a92d9813e..f93a28538962 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -710,6 +710,7 @@ struct cxl_endpoint_dvsec_info { struct cxl_hdm; struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port, struct cxl_endpoint_dvsec_info *info); +int devm_cxl_enable_hdm(struct cxl_port *port, struct cxl_hdm *cxlhdm); int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, struct cxl_endpoint_dvsec_info *info); int devm_cxl_add_passthrough_decoder(struct cxl_port *port); diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index eb57324c4ad4..17a95f469c26 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -60,13 +60,17 @@ static int discover_region(struct device *dev, void *root) static int cxl_switch_port_probe(struct cxl_port *port) { struct cxl_hdm *cxlhdm; - int rc; + int rc, nr_dports; - rc = devm_cxl_port_enumerate_dports(port); - if (rc < 0) - return rc; + nr_dports = devm_cxl_port_enumerate_dports(port); + if (nr_dports < 0) + return nr_dports; cxlhdm = devm_cxl_setup_hdm(port, NULL); + rc = devm_cxl_enable_hdm(port, cxlhdm); + if (rc) + return rc; + if (!IS_ERR(cxlhdm)) return devm_cxl_enumerate_decoders(cxlhdm, NULL); @@ -75,7 +79,7 @@ static int cxl_switch_port_probe(struct cxl_port *port) return PTR_ERR(cxlhdm); } - if (rc == 1) { + if (nr_dports == 1) { dev_dbg(&port->dev, "Fallback to passthrough decoder\n"); return devm_cxl_add_passthrough_decoder(port); } diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index fba7bec96acd..6f9347ade82c 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -6,6 +6,7 @@ ldflags-y += --wrap=acpi_pci_find_root ldflags-y += --wrap=nvdimm_bus_register ldflags-y += --wrap=devm_cxl_port_enumerate_dports ldflags-y += --wrap=devm_cxl_setup_hdm +ldflags-y += --wrap=devm_cxl_enable_hdm ldflags-y += --wrap=devm_cxl_add_passthrough_decoder ldflags-y += --wrap=devm_cxl_enumerate_decoders ldflags-y += --wrap=cxl_await_media_ready diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index de3933a776fd..284416527644 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -149,6 +149,21 @@ struct cxl_hdm *__wrap_devm_cxl_setup_hdm(struct cxl_port *port, } EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_setup_hdm, CXL); +int __wrap_devm_cxl_enable_hdm(struct cxl_port *port, struct cxl_hdm *cxlhdm) +{ + int index, rc; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (ops && ops->is_mock_port(port->uport)) + rc = 0; + else + rc = devm_cxl_enable_hdm(port, cxlhdm); + put_cxl_mock_ops(index); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_enable_hdm, CXL); + int __wrap_devm_cxl_add_passthrough_decoder(struct cxl_port *port) { int rc, index; From ce17ad0d54985e2595a3e615fda31df61808a08c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 18 May 2023 14:54:34 -0700 Subject: [PATCH 068/271] cxl: Wait Memory_Info_Valid before access memory related info The Memory_Info_Valid bit (CXL 3.0 8.1.3.8.2) indicates that the CXL Range Size High and Size Low registers are valid. The bit must be set within 1 second of reset deassertion to the device. Check valid bit before we check the Memory_Active bit when waiting for cxl_await_media_ready() to ensure that the memory info is valid for consumption. Also ensures both DVSEC ranges 1 and 2 are ready if DVSEC Capability indicates they are both supported. Fixes: 523e594d9cc0 ("cxl/pci: Implement wait for media active") Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/168444687469.3134781.11033518965387297327.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/core/pci.c | 85 +++++++++++++++++++++++++++++++++++++----- drivers/cxl/cxlpci.h | 2 + 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index c35c002b65dd..67f4ab6daa34 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -101,23 +101,57 @@ int devm_cxl_port_enumerate_dports(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(devm_cxl_port_enumerate_dports, CXL); -/* - * Wait up to @media_ready_timeout for the device to report memory - * active. - */ -int cxl_await_media_ready(struct cxl_dev_state *cxlds) +static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec; + bool valid = false; + int rc, i; + u32 temp; + + if (id > CXL_DVSEC_RANGE_MAX) + return -EINVAL; + + /* Check MEM INFO VALID bit first, give up after 1s */ + i = 1; + do { + rc = pci_read_config_dword(pdev, + d + CXL_DVSEC_RANGE_SIZE_LOW(id), + &temp); + if (rc) + return rc; + + valid = FIELD_GET(CXL_DVSEC_MEM_INFO_VALID, temp); + if (valid) + break; + msleep(1000); + } while (i--); + + if (!valid) { + dev_err(&pdev->dev, + "Timeout awaiting memory range %d valid after 1s.\n", + id); + return -ETIMEDOUT; + } + + return 0; +} + +static int cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id) { struct pci_dev *pdev = to_pci_dev(cxlds->dev); int d = cxlds->cxl_dvsec; bool active = false; - u64 md_status; int rc, i; + u32 temp; + if (id > CXL_DVSEC_RANGE_MAX) + return -EINVAL; + + /* Check MEM ACTIVE bit, up to 60s timeout by default */ for (i = media_ready_timeout; i; i--) { - u32 temp; - rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(0), &temp); + pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(id), &temp); if (rc) return rc; @@ -134,6 +168,39 @@ int cxl_await_media_ready(struct cxl_dev_state *cxlds) return -ETIMEDOUT; } + return 0; +} + +/* + * Wait up to @media_ready_timeout for the device to report memory + * active. + */ +int cxl_await_media_ready(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec; + int rc, i, hdm_count; + u64 md_status; + u16 cap; + + rc = pci_read_config_word(pdev, + d + CXL_DVSEC_CAP_OFFSET, &cap); + if (rc) + return rc; + + hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap); + for (i = 0; i < hdm_count; i++) { + rc = cxl_dvsec_mem_range_valid(cxlds, i); + if (rc) + return rc; + } + + for (i = 0; i < hdm_count; i++) { + rc = cxl_dvsec_mem_range_active(cxlds, i); + if (rc) + return rc; + } + md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); if (!CXLMDEV_READY(md_status)) return -EIO; diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 0465ef963cd6..7c02e55b8042 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -31,6 +31,8 @@ #define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) #define CXL_DVSEC_MEM_BASE_LOW_MASK GENMASK(31, 28) +#define CXL_DVSEC_RANGE_MAX 2 + /* CXL 2.0 8.1.4: Non-CXL Function Map DVSEC */ #define CXL_DVSEC_FUNCTION_MAP 2 From e764f12208b99ac7892c4e3f6bf88d71ca71036f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 18 May 2023 16:38:20 -0700 Subject: [PATCH 069/271] cxl: Move cxl_await_media_ready() to before capacity info retrieval Move cxl_await_media_ready() to cxl_pci probe before driver starts issuing IDENTIFY and retrieving memory device information to ensure that the device is ready to provide the information. Allow cxl_pci_probe() to succeed even if media is not ready. Cache the media failure in cxlds and don't ask the device for any media information. The rationale for proceeding in the !media_ready case is to allow for mailbox operations to interrogate and/or remediate the device. After media is repaired then rebinding the cxl_pci driver is expected to restart the capacity scan. Suggested-by: Dan Williams Fixes: b39cb1052a5c ("cxl/mem: Register CXL memX devices") Reviewed-by: Ira Weiny Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/168445310026.3251520.8124296540679268206.stgit@djiang5-mobl3 [djbw: fixup cxl_test] Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 15 ++++++++++----- drivers/cxl/cxlmem.h | 2 ++ drivers/cxl/mem.c | 3 +++ drivers/cxl/pci.c | 6 ++++++ drivers/cxl/port.c | 6 ------ tools/testing/cxl/test/mem.c | 1 + 6 files changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 23b9ff920d7e..2c8dc7e2b84d 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1028,7 +1028,7 @@ static int cxl_mem_get_partition_info(struct cxl_dev_state *cxlds) * cxl_dev_state_identify() - Send the IDENTIFY command to the device. * @cxlds: The device data for the operation * - * Return: 0 if identify was executed successfully. + * Return: 0 if identify was executed successfully or media not ready. * * This will dispatch the identify command to the device and on success populate * structures to be exported to sysfs. @@ -1041,6 +1041,9 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds) u32 val; int rc; + if (!cxlds->media_ready) + return 0; + mbox_cmd = (struct cxl_mbox_cmd) { .opcode = CXL_MBOX_OP_IDENTIFY, .size_out = sizeof(id), @@ -1115,10 +1118,12 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds) cxlds->persistent_only_bytes, "pmem"); } - rc = cxl_mem_get_partition_info(cxlds); - if (rc) { - dev_err(dev, "Failed to query partition information\n"); - return rc; + if (cxlds->media_ready) { + rc = cxl_mem_get_partition_info(cxlds); + if (rc) { + dev_err(dev, "Failed to query partition information\n"); + return rc; + } } rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index db12b6313afb..a2845a7a69d8 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -266,6 +266,7 @@ struct cxl_poison_state { * @regs: Parsed register blocks * @cxl_dvsec: Offset to the PCIe device DVSEC * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) + * @media_ready: Indicate whether the device media is usable * @payload_size: Size of space for payload * (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register) * @lsa_size: Size of Label Storage Area @@ -303,6 +304,7 @@ struct cxl_dev_state { int cxl_dvsec; bool rcd; + bool media_ready; size_t payload_size; size_t lsa_size; struct mutex mbox_mutex; /* Protects device mailbox and firmware */ diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 10caf180b3fa..519edd0eb196 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -124,6 +124,9 @@ static int cxl_mem_probe(struct device *dev) struct dentry *dentry; int rc; + if (!cxlds->media_ready) + return -EBUSY; + /* * Someone is trying to reattach this device after it lost its port * connection (an endpoint port previously registered by this memdev was diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index f7a5b8e9c102..0872f2233ed0 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -708,6 +708,12 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) dev_dbg(&pdev->dev, "Failed to map RAS capability.\n"); + rc = cxl_await_media_ready(cxlds); + if (rc == 0) + cxlds->media_ready = true; + else + dev_warn(&pdev->dev, "Media not active (%d)\n", rc); + rc = cxl_pci_setup_mailbox(cxlds); if (rc) return rc; diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 17a95f469c26..c23b6164e1c0 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -117,12 +117,6 @@ static int cxl_endpoint_port_probe(struct cxl_port *port) if (rc) return rc; - rc = cxl_await_media_ready(cxlds); - if (rc) { - dev_err(&port->dev, "Media not active (%d)\n", rc); - return rc; - } - rc = devm_cxl_enumerate_decoders(cxlhdm, &info); if (rc) return rc; diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index ba572d03c687..34b48027b3de 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1256,6 +1256,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) if (rc) return rc; + cxlds->media_ready = true; rc = cxl_dev_state_identify(cxlds); if (rc) return rc; From 4d43acb145c363626d76f49febb4240c488cd1cf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 15 May 2023 13:32:10 +0300 Subject: [PATCH 070/271] dmaengine: at_xdmac: fix potential Oops in at_xdmac_prep_interleaved() There are two place if the at_xdmac_interleaved_queue_desc() fails which could lead to a NULL dereference where "first" is NULL and we call list_add_tail(&first->desc_node, ...). In the first caller, the return is not checked so add a check for that. In the next caller, the return is checked but if it fails on the first iteration through the loop then it will lead to a NULL pointer dereference. Fixes: 4e5385784e69 ("dmaengine: at_xdmac: handle numf > 1") Fixes: 62b5cb757f1d ("dmaengine: at_xdmac: fix memory leak in interleaved mode") Signed-off-by: Dan Carpenter Reviewed-by: Tudor Ambarus Link: https://lore.kernel.org/r/21282b66-9860-410a-83df-39c17fcf2f1b@kili.mountain Signed-off-by: Vinod Koul --- drivers/dma/at_xdmac.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 7da6d9b6098e..c3b37168b21f 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1102,6 +1102,8 @@ at_xdmac_prep_interleaved(struct dma_chan *chan, NULL, src_addr, dst_addr, xt, xt->sgl); + if (!first) + return NULL; /* Length of the block is (BLEN+1) microblocks. */ for (i = 0; i < xt->numf - 1; i++) @@ -1132,8 +1134,9 @@ at_xdmac_prep_interleaved(struct dma_chan *chan, src_addr, dst_addr, xt, chunk); if (!desc) { - list_splice_tail_init(&first->descs_list, - &atchan->free_descs_list); + if (first) + list_splice_tail_init(&first->descs_list, + &atchan->free_descs_list); return NULL; } From aa8bf93101a185b49f83c9137453571a08be6e76 Mon Sep 17 00:00:00 2001 From: Vladislav Efanov Date: Wed, 17 May 2023 15:52:47 +0300 Subject: [PATCH 071/271] drm/sched: Remove redundant check The rq pointer points inside the drm_gpu_scheduler structure. Thus it can't be NULL. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: c61cdbdbffc1 ("drm/scheduler: Fix hang when sched_entity released") Signed-off-by: Vladislav Efanov Link: https://lore.kernel.org/r/20230517125247.434103-1-VEfanov@ispras.ru Reviewed-by: Luben Tuikov Signed-off-by: Luben Tuikov --- drivers/gpu/drm/scheduler/sched_main.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 8c183639603e..aea5a90ff98b 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -1141,9 +1141,6 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched) for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) { struct drm_sched_rq *rq = &sched->sched_rq[i]; - if (!rq) - continue; - spin_lock(&rq->lock); list_for_each_entry(s_entity, &rq->entities, list) /* From 349e3c0cf239cc01d58a1e6c749e171de014cd6a Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 18 May 2023 01:10:59 -0700 Subject: [PATCH 072/271] RDMA/bnxt_re: Fix a possible memory leak Inside bnxt_qplib_create_cq(), when the check for NULL DPI fails, driver returns directly without freeing the memory allocated inside bnxt_qplib_alloc_init_hwq() routine. Fixed this by moving the check for NULL DPI before invoking bnxt_qplib_alloc_init_hwq(). Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Link: https://lore.kernel.org/r/1684397461-23082-2-git-send-email-selvin.xavier@broadcom.com Reviewed-by: Kashyap Desai Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index f139d4cd1712..8974f6235cfa 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2056,6 +2056,12 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) u32 pg_sz_lvl; int rc; + if (!cq->dpi) { + dev_err(&rcfw->pdev->dev, + "FP: CREATE_CQ failed due to NULL DPI\n"); + return -EINVAL; + } + hwq_attr.res = res; hwq_attr.depth = cq->max_wqe; hwq_attr.stride = sizeof(struct cq_base); @@ -2069,11 +2075,6 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) CMDQ_BASE_OPCODE_CREATE_CQ, sizeof(req)); - if (!cq->dpi) { - dev_err(&rcfw->pdev->dev, - "FP: CREATE_CQ failed due to NULL DPI\n"); - return -EINVAL; - } req.dpi = cpu_to_le32(cq->dpi->dpi); req.cq_handle = cpu_to_le64(cq->cq_handle); req.cq_size = cpu_to_le32(cq->hwq.max_elements); From 0fa0d520e2a878cb4c94c4dc84395905d3f14f54 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 18 May 2023 01:11:00 -0700 Subject: [PATCH 073/271] RDMA/bnxt_re: Fix return value of bnxt_re_process_raw_qp_pkt_rx bnxt_re_process_raw_qp_pkt_rx() always return 0 and ignores the return value of bnxt_re_post_send_shadow_qp(). Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Link: https://lore.kernel.org/r/1684397461-23082-3-git-send-email-selvin.xavier@broadcom.com Reviewed-by: Hongguang Gao Reviewed-by: Ajit Khaparde Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index e86afecfbe46..b1c36412025f 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3341,9 +3341,7 @@ static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *gsi_qp, udwr.remote_qkey = gsi_sqp->qplib_qp.qkey; /* post data received in the send queue */ - rc = bnxt_re_post_send_shadow_qp(rdev, gsi_sqp, swr); - - return 0; + return bnxt_re_post_send_shadow_qp(rdev, gsi_sqp, swr); } static void bnxt_re_process_res_rawqp1_wc(struct ib_wc *wc, From dd5fb04857d7346c9ea4901ec3ebe5b90b0e8868 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 18 May 2023 01:11:01 -0700 Subject: [PATCH 074/271] RDMA/bnxt_re: Do not enable congestion control on VFs Congestion control needs to be enabled only on the PFs. FW fails the command if issued on VFs. Avoid sending the command on VFs. Fixes: f13bcef04ba0 ("RDMA/bnxt_re: Enable congestion control by default") Link: https://lore.kernel.org/r/1684397461-23082-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Kalesh AP Reviewed-by: Selvin Thyparampil Xavier Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index b9e2f89337e8..e34eccd178d0 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1336,6 +1336,10 @@ static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable) { struct bnxt_qplib_cc_param cc_param = {}; + /* Do not enable congestion control on VFs */ + if (rdev->is_virtfn) + return; + /* Currently enabling only for GenP5 adapters */ if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) return; From a70fc4ed20a6118837b0aecbbf789074935f473b Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 19 May 2023 23:54:35 +0200 Subject: [PATCH 075/271] cxl/port: Fix NULL pointer access in devm_cxl_add_port() In devm_cxl_add_port() the port creation may fail and its associated pointer does not contain a valid address. During error message generation this invalid port address is used. Fix that wrong address access. Fixes: f3cd264c4ec1 ("cxl: Unify debug messages when calling devm_cxl_add_port()") Signed-off-by: Robert Richter Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20230519215436.3394532-1-rrichter@amd.com Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index da2068475fa2..e7c284c890bc 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -750,11 +750,10 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, parent_port = parent_dport ? parent_dport->port : NULL; if (IS_ERR(port)) { - dev_dbg(uport, "Failed to add %s%s%s%s: %ld\n", - dev_name(&port->dev), - parent_port ? " to " : "", + dev_dbg(uport, "Failed to add%s%s%s: %ld\n", + parent_port ? " port to " : "", parent_port ? dev_name(&parent_port->dev) : "", - parent_port ? "" : " (root port)", + parent_port ? "" : " root port", PTR_ERR(port)); } else { dev_dbg(uport, "%s added%s%s%s\n", From d1d8875c8c13517f6fd1ff8d4d3e1ac366a17e07 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Fri, 19 May 2023 19:59:49 +0000 Subject: [PATCH 076/271] binder: fix UAF of alloc->vma in race with munmap() [ cmllamas: clean forward port from commit 015ac18be7de ("binder: fix UAF of alloc->vma in race with munmap()") in 5.10 stable. It is needed in mainline after the revert of commit a43cfc87caaf ("android: binder: stop saving a pointer to the VMA") as pointed out by Liam. The commit log and tags have been tweaked to reflect this. ] In commit 720c24192404 ("ANDROID: binder: change down_write to down_read") binder assumed the mmap read lock is sufficient to protect alloc->vma inside binder_update_page_range(). This used to be accurate until commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap"), which now downgrades the mmap_lock after detaching the vma from the rbtree in munmap(). Then it proceeds to teardown and free the vma with only the read lock held. This means that accesses to alloc->vma in binder_update_page_range() now will race with vm_area_free() in munmap() and can cause a UAF as shown in the following KASAN trace: ================================================================== BUG: KASAN: use-after-free in vm_insert_page+0x7c/0x1f0 Read of size 8 at addr ffff16204ad00600 by task server/558 CPU: 3 PID: 558 Comm: server Not tainted 5.10.150-00001-gdc8dcf942daa #1 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x2a0 show_stack+0x18/0x2c dump_stack+0xf8/0x164 print_address_description.constprop.0+0x9c/0x538 kasan_report+0x120/0x200 __asan_load8+0xa0/0xc4 vm_insert_page+0x7c/0x1f0 binder_update_page_range+0x278/0x50c binder_alloc_new_buf+0x3f0/0xba0 binder_transaction+0x64c/0x3040 binder_thread_write+0x924/0x2020 binder_ioctl+0x1610/0x2e5c __arm64_sys_ioctl+0xd4/0x120 el0_svc_common.constprop.0+0xac/0x270 do_el0_svc+0x38/0xa0 el0_svc+0x1c/0x2c el0_sync_handler+0xe8/0x114 el0_sync+0x180/0x1c0 Allocated by task 559: kasan_save_stack+0x38/0x6c __kasan_kmalloc.constprop.0+0xe4/0xf0 kasan_slab_alloc+0x18/0x2c kmem_cache_alloc+0x1b0/0x2d0 vm_area_alloc+0x28/0x94 mmap_region+0x378/0x920 do_mmap+0x3f0/0x600 vm_mmap_pgoff+0x150/0x17c ksys_mmap_pgoff+0x284/0x2dc __arm64_sys_mmap+0x84/0xa4 el0_svc_common.constprop.0+0xac/0x270 do_el0_svc+0x38/0xa0 el0_svc+0x1c/0x2c el0_sync_handler+0xe8/0x114 el0_sync+0x180/0x1c0 Freed by task 560: kasan_save_stack+0x38/0x6c kasan_set_track+0x28/0x40 kasan_set_free_info+0x24/0x4c __kasan_slab_free+0x100/0x164 kasan_slab_free+0x14/0x20 kmem_cache_free+0xc4/0x34c vm_area_free+0x1c/0x2c remove_vma+0x7c/0x94 __do_munmap+0x358/0x710 __vm_munmap+0xbc/0x130 __arm64_sys_munmap+0x4c/0x64 el0_svc_common.constprop.0+0xac/0x270 do_el0_svc+0x38/0xa0 el0_svc+0x1c/0x2c el0_sync_handler+0xe8/0x114 el0_sync+0x180/0x1c0 [...] ================================================================== To prevent the race above, revert back to taking the mmap write lock inside binder_update_page_range(). One might expect an increase of mmap lock contention. However, binder already serializes these calls via top level alloc->mutex. Also, there was no performance impact shown when running the binder benchmark tests. Fixes: c0fd2101781e ("Revert "android: binder: stop saving a pointer to the VMA"") Fixes: dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap") Reported-by: Jann Horn Closes: https://lore.kernel.org/all/20230518144052.xkj6vmddccq4v66b@revolver Cc: Cc: Minchan Kim Cc: Yang Shi Cc: Liam Howlett Signed-off-by: Carlos Llamas Acked-by: Todd Kjos Link: https://lore.kernel.org/r/20230519195950.1775656-1-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index e7c9d466f8e8..662a2a2e2e84 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -212,7 +212,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, mm = alloc->mm; if (mm) { - mmap_read_lock(mm); + mmap_write_lock(mm); vma = alloc->vma; } @@ -270,7 +270,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, trace_binder_alloc_page_end(alloc, index); } if (mm) { - mmap_read_unlock(mm); + mmap_write_unlock(mm); mmput(mm); } return 0; @@ -303,7 +303,7 @@ err_page_ptr_cleared: } err_no_vma: if (mm) { - mmap_read_unlock(mm); + mmap_write_unlock(mm); mmput(mm); } return vma ? -ENOMEM : -ESRCH; From 358e526a1648cdd773ba169da5867874ae2408e3 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 19 May 2023 21:38:06 +1000 Subject: [PATCH 077/271] powerpc/mm: Reinstate ARCH_FORCE_MAX_ORDER ranges Commit 1e8fed873e74 ("powerpc: drop ranges for definition of ARCH_FORCE_MAX_ORDER") removed the limits on the possible values for ARCH_FORCE_MAX_ORDER. However removing the ranges entirely causes some common work flows to break. For example building a defconfig (which uses 64K pages), changing the page size to 4K, and rebuilding used to work, because ARCH_FORCE_MAX_ORDER would be clamped to 12 by the ranges. With the ranges removed it creates a kernel that builds but crashes at boot: kernel BUG at mm/huge_memory.c:470! Oops: Exception in kernel mode, sig: 5 [#1] ... NIP hugepage_init+0x9c/0x278 LR do_one_initcall+0x80/0x320 Call Trace: do_one_initcall+0x80/0x320 kernel_init_freeable+0x304/0x3ac kernel_init+0x30/0x1a0 ret_from_kernel_user_thread+0x14/0x1c The reasoning for removing the ranges was that some of the values were too large. So take that into account and limit the maximums to 10 which is the default max, except for the 4K case which uses 12. Fixes: 1e8fed873e74 ("powerpc: drop ranges for definition of ARCH_FORCE_MAX_ORDER") Signed-off-by: Michael Ellerman Link: https://msgid.link/20230519113806.370635-1-mpe@ellerman.id.au --- arch/powerpc/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 539d1f03ff42..bff5820b7cda 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -906,11 +906,17 @@ config DATA_SHIFT config ARCH_FORCE_MAX_ORDER int "Order of maximal physically contiguous allocations" + range 7 8 if PPC64 && PPC_64K_PAGES default "8" if PPC64 && PPC_64K_PAGES + range 12 12 if PPC64 && !PPC_64K_PAGES default "12" if PPC64 && !PPC_64K_PAGES + range 8 10 if PPC32 && PPC_16K_PAGES default "8" if PPC32 && PPC_16K_PAGES + range 6 10 if PPC32 && PPC_64K_PAGES default "6" if PPC32 && PPC_64K_PAGES + range 4 10 if PPC32 && PPC_256K_PAGES default "4" if PPC32 && PPC_256K_PAGES + range 10 10 default "10" help The kernel page allocator limits the size of maximal physically From c21f11d182c2180d8b90eaff84f574cfa845b250 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 19 May 2023 10:07:33 +0100 Subject: [PATCH 078/271] drm: fix drmm_mutex_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In mutex_init() lockdep identifies a lock by defining a special static key for each lock class. However if we wrap the macro in a function, like in drmm_mutex_init(), we end up generating: int drmm_mutex_init(struct drm_device *dev, struct mutex *lock) { static struct lock_class_key __key; __mutex_init((lock), "lock", &__key); .... } The static __key here is what lockdep uses to identify the lock class, however since this is just a normal function the key here will be created once, where all callers then use the same key. In effect the mutex->depmap.key will be the same pointer for different drmm_mutex_init() callers. This then results in impossible lockdep splats since lockdep thinks completely unrelated locks are the same lock class. To fix this turn drmm_mutex_init() into a macro such that it generates a different "static struct lock_class_key __key" for each invocation, which looks to be inline with what mutex_init() wants. v2: - Revamp the commit message with clearer explanation of the issue. - Rather export __drmm_mutex_release() than static inline. Reported-by: Thomas Hellström Reported-by: Sarah Walker Fixes: e13f13e039dc ("drm: Add DRM-managed mutex_init()") Cc: Stanislaw Gruszka Cc: Boris Brezillon Cc: Thomas Zimmermann Cc: Jocelyn Falempe Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Signed-off-by: Matthew Auld Reviewed-by: Boris Brezillon Reviewed-by: Stanislaw Gruszka Reviewed-by: Lucas De Marchi Acked-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20230519090733.489019-1-matthew.auld@intel.com --- drivers/gpu/drm/drm_managed.c | 22 ++-------------------- include/drm/drm_managed.h | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/drm_managed.c b/drivers/gpu/drm/drm_managed.c index 4cf214de50c4..c21c3f623033 100644 --- a/drivers/gpu/drm/drm_managed.c +++ b/drivers/gpu/drm/drm_managed.c @@ -264,28 +264,10 @@ void drmm_kfree(struct drm_device *dev, void *data) } EXPORT_SYMBOL(drmm_kfree); -static void drmm_mutex_release(struct drm_device *dev, void *res) +void __drmm_mutex_release(struct drm_device *dev, void *res) { struct mutex *lock = res; mutex_destroy(lock); } - -/** - * drmm_mutex_init - &drm_device-managed mutex_init() - * @dev: DRM device - * @lock: lock to be initialized - * - * Returns: - * 0 on success, or a negative errno code otherwise. - * - * This is a &drm_device-managed version of mutex_init(). The initialized - * lock is automatically destroyed on the final drm_dev_put(). - */ -int drmm_mutex_init(struct drm_device *dev, struct mutex *lock) -{ - mutex_init(lock); - - return drmm_add_action_or_reset(dev, drmm_mutex_release, lock); -} -EXPORT_SYMBOL(drmm_mutex_init); +EXPORT_SYMBOL(__drmm_mutex_release); diff --git a/include/drm/drm_managed.h b/include/drm/drm_managed.h index 359883942612..ad08f834af40 100644 --- a/include/drm/drm_managed.h +++ b/include/drm/drm_managed.h @@ -105,6 +105,22 @@ char *drmm_kstrdup(struct drm_device *dev, const char *s, gfp_t gfp); void drmm_kfree(struct drm_device *dev, void *data); -int drmm_mutex_init(struct drm_device *dev, struct mutex *lock); +void __drmm_mutex_release(struct drm_device *dev, void *res); + +/** + * drmm_mutex_init - &drm_device-managed mutex_init() + * @dev: DRM device + * @lock: lock to be initialized + * + * Returns: + * 0 on success, or a negative errno code otherwise. + * + * This is a &drm_device-managed version of mutex_init(). The initialized + * lock is automatically destroyed on the final drm_dev_put(). + */ +#define drmm_mutex_init(dev, lock) ({ \ + mutex_init(lock); \ + drmm_add_action_or_reset(dev, __drmm_mutex_release, lock); \ +}) \ #endif From eb799279fb1f9c63c520fe8c1c41cb9154252db6 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 11 May 2023 22:47:32 +0900 Subject: [PATCH 079/271] debugobjects: Don't wake up kswapd from fill_pool() syzbot is reporting a lockdep warning in fill_pool() because the allocation from debugobjects is using GFP_ATOMIC, which is (__GFP_HIGH | __GFP_KSWAPD_RECLAIM) and therefore tries to wake up kswapd, which acquires kswapd_wait::lock. Since fill_pool() might be called with arbitrary locks held, fill_pool() should not assume that acquiring kswapd_wait::lock is safe. Use __GFP_HIGH instead and remove __GFP_NORETRY as it is pointless for !__GFP_DIRECT_RECLAIM allocation. Fixes: 3ac7fe5a4aab ("infrastructure to debug (dynamic) objects") Reported-by: syzbot Signed-off-by: Tetsuo Handa Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/6577e1fa-b6ee-f2be-2414-a2b51b1c5e30@I-love.SAKURA.ne.jp Closes: https://syzkaller.appspot.com/bug?extid=fe0c72f0ccbb93786380 --- lib/debugobjects.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 826c617b10a7..984985c39c9b 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -126,7 +126,7 @@ static const char *obj_states[ODEBUG_STATE_MAX] = { static void fill_pool(void) { - gfp_t gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; + gfp_t gfp = __GFP_HIGH | __GFP_NOWARN; struct debug_obj *obj; unsigned long flags; From 45dfbd992923f4df174db4e23b96fca7e30d73e2 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 10 May 2023 13:31:18 +0300 Subject: [PATCH 080/271] drm/i915: Fix PIPEDMC disabling for a bigjoiner configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For a bigjoiner configuration display->crtc_disable() will be called first for the slave CRTCs and then for the master CRTC. However slave CRTCs will be actually disabled only after the master CRTC is disabled (from the encoder disable hooks called with the master CRTC state). Hence the slave PIPEDMCs can be disabled only after the master CRTC is disabled, make this so. intel_encoders_post_pll_disable() must be called only for the master CRTC, as for the other two encoder disable hooks. While at it fix this up as well. This didn't cause a problem, since intel_encoders_post_pll_disable() will call the corresponding hook only for an encoder/connector connected to the given CRTC, however slave CRTCs will have no associated encoder/connector. Fixes: 3af2ff0840be ("drm/i915: Enable a PIPEDMC whenever its corresponding pipe is enabled") Cc: Rodrigo Vivi Cc: Ville Syrjälä Reviewed-by: Ville Syrjälä Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20230510103131.1618266-2-imre.deak@intel.com (cherry picked from commit 7eeef32719f6af935a1554813e6bc206446339cd) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/intel_display.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 3c29792137a5..0aae9a1eb3d5 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -1851,9 +1851,17 @@ static void hsw_crtc_disable(struct intel_atomic_state *state, intel_disable_shared_dpll(old_crtc_state); - intel_encoders_post_pll_disable(state, crtc); + if (!intel_crtc_is_bigjoiner_slave(old_crtc_state)) { + struct intel_crtc *slave_crtc; - intel_dmc_disable_pipe(i915, crtc->pipe); + intel_encoders_post_pll_disable(state, crtc); + + intel_dmc_disable_pipe(i915, crtc->pipe); + + for_each_intel_crtc_in_pipe_mask(&i915->drm, slave_crtc, + intel_crtc_bigjoiner_slave_pipes(old_crtc_state)) + intel_dmc_disable_pipe(i915, slave_crtc->pipe); + } } static void i9xx_pfit_enable(const struct intel_crtc_state *crtc_state) From 59fa12646d9f56c842b4d5b6418ed77af625c588 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 17 May 2023 15:52:30 +0200 Subject: [PATCH 081/271] parisc: Improve cache flushing for PCXL in arch_sync_dma_for_cpu() Add comment in arch_sync_dma_for_device() and handle the direction flag in arch_sync_dma_for_cpu(). When receiving data from the device (DMA_FROM_DEVICE) unconditionally purge the data cache in arch_sync_dma_for_cpu(). Signed-off-by: Helge Deller --- arch/parisc/kernel/pci-dma.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index ba87f791323b..71ed5391f29d 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -446,11 +446,27 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, enum dma_data_direction dir) { + /* + * fdc: The data cache line is written back to memory, if and only if + * it is dirty, and then invalidated from the data cache. + */ flush_kernel_dcache_range((unsigned long)phys_to_virt(paddr), size); } void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, enum dma_data_direction dir) { - flush_kernel_dcache_range((unsigned long)phys_to_virt(paddr), size); + unsigned long addr = (unsigned long) phys_to_virt(paddr); + + switch (dir) { + case DMA_TO_DEVICE: + case DMA_BIDIRECTIONAL: + flush_kernel_dcache_range(addr, size); + return; + case DMA_FROM_DEVICE: + purge_kernel_dcache_range_asm(addr, addr + size); + return; + default: + BUG(); + } } From d703797380c540bbeac03f104ebcfc364eaf47cc Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 17 May 2023 15:54:40 +0200 Subject: [PATCH 082/271] parisc: Flush gatt writes and adjust gatt mask in parisc_agp_mask_memory() Flush caches after changing gatt entries and calculate entry according to SBA requirements. Signed-off-by: Helge Deller --- drivers/char/agp/parisc-agp.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/char/agp/parisc-agp.c b/drivers/char/agp/parisc-agp.c index d68d05d5d383..514f9f287a78 100644 --- a/drivers/char/agp/parisc-agp.c +++ b/drivers/char/agp/parisc-agp.c @@ -90,6 +90,9 @@ parisc_agp_tlbflush(struct agp_memory *mem) { struct _parisc_agp_info *info = &parisc_agp_info; + /* force fdc ops to be visible to IOMMU */ + asm_io_sync(); + writeq(info->gart_base | ilog2(info->gart_size), info->ioc_regs+IOC_PCOM); readq(info->ioc_regs+IOC_PCOM); /* flush */ } @@ -158,6 +161,7 @@ parisc_agp_insert_memory(struct agp_memory *mem, off_t pg_start, int type) info->gatt[j] = parisc_agp_mask_memory(agp_bridge, paddr, type); + asm_io_fdc(&info->gatt[j]); } } @@ -191,7 +195,16 @@ static unsigned long parisc_agp_mask_memory(struct agp_bridge_data *bridge, dma_addr_t addr, int type) { - return SBA_PDIR_VALID_BIT | addr; + unsigned ci; /* coherent index */ + dma_addr_t pa; + + pa = addr & IOVP_MASK; + asm("lci 0(%1), %0" : "=r" (ci) : "r" (phys_to_virt(pa))); + + pa |= (ci >> PAGE_SHIFT) & 0xff;/* move CI (8 bits) into lowest byte */ + pa |= SBA_PDIR_VALID_BIT; /* set "valid" bit */ + + return cpu_to_le64(pa); } static void From 3c845304d2d723f20d5b91fef5d133ff94825d76 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 17 May 2023 21:38:08 +0800 Subject: [PATCH 083/271] perf/x86/intel: Save/restore cpuc->active_pebs_data_cfg when using guest PEBS After commit b752ea0c28e3 ("perf/x86/intel/ds: Flush PEBS DS when changing PEBS_DATA_CFG"), the cpuc->pebs_data_cfg may save some bits that are not supported by real hardware, such as PEBS_UPDATE_DS_SW. This would cause the VMX hardware MSR switching mechanism to save/restore invalid values for PEBS_DATA_CFG MSR, thus crashing the host when PEBS is used for guest. Fix it by using the active host value from cpuc->active_pebs_data_cfg. Fixes: b752ea0c28e3 ("perf/x86/intel/ds: Flush PEBS DS when changing PEBS_DATA_CFG") Signed-off-by: Like Xu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20230517133808.67885-1-likexu@tencent.com --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 070cc4ef2672..89b9c1cebb61 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4074,7 +4074,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) if (x86_pmu.intel_cap.pebs_baseline) { arr[(*nr)++] = (struct perf_guest_switch_msr){ .msr = MSR_PEBS_DATA_CFG, - .host = cpuc->pebs_data_cfg, + .host = cpuc->active_pebs_data_cfg, .guest = kvm_pmu->pebs_data_cfg, }; } From 3002b8642f016d7fe3ff56240dacea1075f6b877 Mon Sep 17 00:00:00 2001 From: Henning Schild Date: Thu, 27 Apr 2023 17:20:55 +0200 Subject: [PATCH 084/271] gpio-f7188x: fix chip name and pin count on Nuvoton chip In fact the device with chip id 0xD283 is called NCT6126D, and that is the chip id the Nuvoton code was written for. Correct that name to avoid confusion, because a NCT6116D in fact exists as well but has another chip id, and is currently not supported. The look at the spec also revealed that GPIO group7 in fact has 8 pins, so correct the pin count in that group as well. Fixes: d0918a84aff0 ("gpio-f7188x: Add GPIO support for Nuvoton NCT6116") Reported-by: Xing Tong Wu Signed-off-by: Henning Schild Acked-by: Simon Guinot Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 2 +- drivers/gpio/gpio-f7188x.c | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 5521f060d58e..f45c6a36551c 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -897,7 +897,7 @@ config GPIO_F7188X help This option enables support for GPIOs found on Fintek Super-I/O chips F71869, F71869A, F71882FG, F71889F and F81866. - As well as Nuvoton Super-I/O chip NCT6116D. + As well as Nuvoton Super-I/O chip NCT6126D. To compile this driver as a module, choose M here: the module will be called f7188x-gpio. diff --git a/drivers/gpio/gpio-f7188x.c b/drivers/gpio/gpio-f7188x.c index 9effa7769bef..f54ca5a1775e 100644 --- a/drivers/gpio/gpio-f7188x.c +++ b/drivers/gpio/gpio-f7188x.c @@ -48,7 +48,7 @@ /* * Nuvoton devices. */ -#define SIO_NCT6116D_ID 0xD283 /* NCT6116D chipset ID */ +#define SIO_NCT6126D_ID 0xD283 /* NCT6126D chipset ID */ #define SIO_LD_GPIO_NUVOTON 0x07 /* GPIO logical device */ @@ -62,7 +62,7 @@ enum chips { f81866, f81804, f81865, - nct6116d, + nct6126d, }; static const char * const f7188x_names[] = { @@ -74,7 +74,7 @@ static const char * const f7188x_names[] = { "f81866", "f81804", "f81865", - "nct6116d", + "nct6126d", }; struct f7188x_sio { @@ -187,8 +187,8 @@ static int f7188x_gpio_set_config(struct gpio_chip *chip, unsigned offset, /* Output mode register (0:open drain 1:push-pull). */ #define f7188x_gpio_out_mode(base) ((base) + 3) -#define f7188x_gpio_dir_invert(type) ((type) == nct6116d) -#define f7188x_gpio_data_single(type) ((type) == nct6116d) +#define f7188x_gpio_dir_invert(type) ((type) == nct6126d) +#define f7188x_gpio_data_single(type) ((type) == nct6126d) static struct f7188x_gpio_bank f71869_gpio_bank[] = { F7188X_GPIO_BANK(0, 6, 0xF0, DRVNAME "-0"), @@ -274,7 +274,7 @@ static struct f7188x_gpio_bank f81865_gpio_bank[] = { F7188X_GPIO_BANK(60, 5, 0x90, DRVNAME "-6"), }; -static struct f7188x_gpio_bank nct6116d_gpio_bank[] = { +static struct f7188x_gpio_bank nct6126d_gpio_bank[] = { F7188X_GPIO_BANK(0, 8, 0xE0, DRVNAME "-0"), F7188X_GPIO_BANK(10, 8, 0xE4, DRVNAME "-1"), F7188X_GPIO_BANK(20, 8, 0xE8, DRVNAME "-2"), @@ -282,7 +282,7 @@ static struct f7188x_gpio_bank nct6116d_gpio_bank[] = { F7188X_GPIO_BANK(40, 8, 0xF0, DRVNAME "-4"), F7188X_GPIO_BANK(50, 8, 0xF4, DRVNAME "-5"), F7188X_GPIO_BANK(60, 8, 0xF8, DRVNAME "-6"), - F7188X_GPIO_BANK(70, 1, 0xFC, DRVNAME "-7"), + F7188X_GPIO_BANK(70, 8, 0xFC, DRVNAME "-7"), }; static int f7188x_gpio_get_direction(struct gpio_chip *chip, unsigned offset) @@ -490,9 +490,9 @@ static int f7188x_gpio_probe(struct platform_device *pdev) data->nr_bank = ARRAY_SIZE(f81865_gpio_bank); data->bank = f81865_gpio_bank; break; - case nct6116d: - data->nr_bank = ARRAY_SIZE(nct6116d_gpio_bank); - data->bank = nct6116d_gpio_bank; + case nct6126d: + data->nr_bank = ARRAY_SIZE(nct6126d_gpio_bank); + data->bank = nct6126d_gpio_bank; break; default: return -ENODEV; @@ -559,9 +559,9 @@ static int __init f7188x_find(int addr, struct f7188x_sio *sio) case SIO_F81865_ID: sio->type = f81865; break; - case SIO_NCT6116D_ID: + case SIO_NCT6126D_ID: sio->device = SIO_LD_GPIO_NUVOTON; - sio->type = nct6116d; + sio->type = nct6126d; break; default: pr_info("Unsupported Fintek device 0x%04x\n", devid); @@ -569,7 +569,7 @@ static int __init f7188x_find(int addr, struct f7188x_sio *sio) } /* double check manufacturer where possible */ - if (sio->type != nct6116d) { + if (sio->type != nct6126d) { manid = superio_inw(addr, SIO_FINTEK_MANID); if (manid != SIO_FINTEK_ID) { pr_debug("Not a Fintek device at 0x%08x\n", addr); @@ -581,7 +581,7 @@ static int __init f7188x_find(int addr, struct f7188x_sio *sio) err = 0; pr_info("Found %s at %#x\n", f7188x_names[sio->type], (unsigned int)addr); - if (sio->type != nct6116d) + if (sio->type != nct6126d) pr_info(" revision %d\n", superio_inb(addr, SIO_FINTEK_DEVREV)); err: From 75b18aac6fa39a1720677970cfcb52ecea1eb44c Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Tue, 9 May 2023 20:57:44 +0200 Subject: [PATCH 085/271] MIPS: unhide PATA_PLATFORM Alchemy DB1200/DB1300 boards can use the pata_platform driver. Unhide the config entry in all of MIPS. Signed-off-by: Manuel Lauss Signed-off-by: Thomas Bogendoerfer --- arch/mips/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index c2f5498d207f..675a8660cb85 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -79,6 +79,7 @@ config MIPS select HAVE_LD_DEAD_CODE_DATA_ELIMINATION select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI + select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP From f2041708dee30a3425f680265c337acd28293782 Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Wed, 10 May 2023 12:33:23 +0200 Subject: [PATCH 086/271] MIPS: Restore Au1300 support The Au1300, at least the one I have to test, uses the NetLogic vendor ID, but commit 95b8a5e0111a ("MIPS: Remove NETLOGIC support") also dropped Au1300 detection. Restore Au1300 detection. Tested on DB1300 with Au1380 chip. Signed-off-by: Manuel Lauss Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/cpu-probe.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index 6d15a398d389..e79adcb128e6 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -1502,6 +1502,10 @@ static inline void cpu_probe_alchemy(struct cpuinfo_mips *c, unsigned int cpu) break; } break; + case PRID_IMP_NETLOGIC_AU13XX: + c->cputype = CPU_ALCHEMY; + __cpu_name[cpu] = "Au1300"; + break; } } @@ -1863,6 +1867,7 @@ void cpu_probe(void) cpu_probe_mips(c, cpu); break; case PRID_COMP_ALCHEMY: + case PRID_COMP_NETLOGIC: cpu_probe_alchemy(c, cpu); break; case PRID_COMP_SIBYTE: From 2d645604f69f3a772d58ead702f9a8e84ab2b342 Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Thu, 11 May 2023 17:30:10 +0200 Subject: [PATCH 087/271] MIPS: Alchemy: fix dbdma2 Various fixes for the Au1200/Au1550/Au1300 DBDMA2 code: - skip cache invalidation if chip has working coherency circuitry. - invalidate KSEG0-portion of the (physical) data address. - force the dma channel doorbell write out to bus immediately with a sync. Signed-off-by: Thomas Bogendoerfer --- arch/mips/alchemy/common/dbdma.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/arch/mips/alchemy/common/dbdma.c b/arch/mips/alchemy/common/dbdma.c index 5ab043000409..6a3c890f7bbf 100644 --- a/arch/mips/alchemy/common/dbdma.c +++ b/arch/mips/alchemy/common/dbdma.c @@ -30,6 +30,7 @@ * */ +#include /* for dma_default_coherent */ #include #include #include @@ -623,17 +624,18 @@ u32 au1xxx_dbdma_put_source(u32 chanid, dma_addr_t buf, int nbytes, u32 flags) dp->dscr_cmd0 &= ~DSCR_CMD0_IE; /* - * There is an errata on the Au1200/Au1550 parts that could result - * in "stale" data being DMA'ed. It has to do with the snoop logic on - * the cache eviction buffer. DMA_NONCOHERENT is on by default for - * these parts. If it is fixed in the future, these dma_cache_inv will - * just be nothing more than empty macros. See io.h. + * There is an erratum on certain Au1200/Au1550 revisions that could + * result in "stale" data being DMA'ed. It has to do with the snoop + * logic on the cache eviction buffer. dma_default_coherent is set + * to false on these parts. */ - dma_cache_wback_inv((unsigned long)buf, nbytes); + if (!dma_default_coherent) + dma_cache_wback_inv(KSEG0ADDR(buf), nbytes); dp->dscr_cmd0 |= DSCR_CMD0_V; /* Let it rip */ wmb(); /* drain writebuffer */ dma_cache_wback_inv((unsigned long)dp, sizeof(*dp)); ctp->chan_ptr->ddma_dbell = 0; + wmb(); /* force doorbell write out to dma engine */ /* Get next descriptor pointer. */ ctp->put_ptr = phys_to_virt(DSCR_GET_NXTPTR(dp->dscr_nxtptr)); @@ -685,17 +687,18 @@ u32 au1xxx_dbdma_put_dest(u32 chanid, dma_addr_t buf, int nbytes, u32 flags) dp->dscr_source1, dp->dscr_dest0, dp->dscr_dest1); #endif /* - * There is an errata on the Au1200/Au1550 parts that could result in - * "stale" data being DMA'ed. It has to do with the snoop logic on the - * cache eviction buffer. DMA_NONCOHERENT is on by default for these - * parts. If it is fixed in the future, these dma_cache_inv will just - * be nothing more than empty macros. See io.h. + * There is an erratum on certain Au1200/Au1550 revisions that could + * result in "stale" data being DMA'ed. It has to do with the snoop + * logic on the cache eviction buffer. dma_default_coherent is set + * to false on these parts. */ - dma_cache_inv((unsigned long)buf, nbytes); + if (!dma_default_coherent) + dma_cache_inv(KSEG0ADDR(buf), nbytes); dp->dscr_cmd0 |= DSCR_CMD0_V; /* Let it rip */ wmb(); /* drain writebuffer */ dma_cache_wback_inv((unsigned long)dp, sizeof(*dp)); ctp->chan_ptr->ddma_dbell = 0; + wmb(); /* force doorbell write out to dma engine */ /* Get next descriptor pointer. */ ctp->put_ptr = phys_to_virt(DSCR_GET_NXTPTR(dp->dscr_nxtptr)); From 4897a898a216058dec55e5e5902534e6e224fcdf Mon Sep 17 00:00:00 2001 From: Liviu Dudau Date: Tue, 9 May 2023 18:29:21 +0100 Subject: [PATCH 088/271] mips: Move initrd_start check after initrd address sanitisation. PAGE_OFFSET is technically a virtual address so when checking the value of initrd_start against it we should make sure that it has been sanitised from the values passed by the bootloader. Without this change, even with a bootloader that passes correct addresses for an initrd, we are failing to load it on MT7621 boards, for example. Signed-off-by: Liviu Dudau Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/setup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index febdc5564638..c0e65135481b 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -158,10 +158,6 @@ static unsigned long __init init_initrd(void) pr_err("initrd start must be page aligned\n"); goto disable; } - if (initrd_start < PAGE_OFFSET) { - pr_err("initrd start < PAGE_OFFSET\n"); - goto disable; - } /* * Sanitize initrd addresses. For example firmware @@ -174,6 +170,11 @@ static unsigned long __init init_initrd(void) initrd_end = (unsigned long)__va(end); initrd_start = (unsigned long)__va(__pa(initrd_start)); + if (initrd_start < PAGE_OFFSET) { + pr_err("initrd start < PAGE_OFFSET\n"); + goto disable; + } + ROOT_DEV = Root_RAM0; return PFN_UP(end); disable: From d9eef346b601afb0bd74b49e0db06f6a5cebd030 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Fri, 14 Apr 2023 11:22:10 -0700 Subject: [PATCH 089/271] HID: wacom: Check for string overflow from strscpy calls The strscpy function is able to return an error code when a copy would overflow the size of the destination. The copy is stopped and the buffer terminated before overflow actually occurs so it is safe to continue execution, but we should still produce a warning should this occur. Signed-off-by: Jason Gerecke Reviewed-by: Ping Cheng Reviewed-by: Peter Hutterer Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 8214896adada..7192970d199a 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2224,7 +2224,9 @@ static void wacom_update_name(struct wacom *wacom, const char *suffix) } else if (strstr(product_name, "Wacom") || strstr(product_name, "wacom") || strstr(product_name, "WACOM")) { - strscpy(name, product_name, sizeof(name)); + if (strscpy(name, product_name, sizeof(name)) < 0) { + hid_warn(wacom->hdev, "String overflow while assembling device name"); + } } else { snprintf(name, sizeof(name), "Wacom %s", product_name); } @@ -2242,7 +2244,9 @@ static void wacom_update_name(struct wacom *wacom, const char *suffix) if (name[strlen(name)-1] == ' ') name[strlen(name)-1] = '\0'; } else { - strscpy(name, features->name, sizeof(name)); + if (strscpy(name, features->name, sizeof(name)) < 0) { + hid_warn(wacom->hdev, "String overflow while assembling device name"); + } } snprintf(wacom_wac->name, sizeof(wacom_wac->name), "%s%s", @@ -2500,8 +2504,10 @@ static void wacom_wireless_work(struct work_struct *work) goto fail; } - strscpy(wacom_wac->name, wacom_wac1->name, - sizeof(wacom_wac->name)); + if (strscpy(wacom_wac->name, wacom_wac1->name, + sizeof(wacom_wac->name)) < 0) { + hid_warn(wacom->hdev, "String overflow while assembling device name"); + } } return; From bd249b91977b768ea02bf84d04625d2690ad2b98 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Mon, 17 Apr 2023 09:01:48 -0700 Subject: [PATCH 090/271] HID: wacom: avoid integer overflow in wacom_intuos_inout() If high bit is set to 1 in ((data[3] & 0x0f << 28), after all arithmetic operations and integer promotions are done, high bits in wacom->serial[idx] will be filled with 1s as well. Avoid this, albeit unlikely, issue by specifying left operand's __u64 type for the right operand. Found by Linux Verification Center (linuxtesting.org) with static analysis tool SVACE. Fixes: 3bea733ab212 ("USB: wacom tablet driver reorganization") Signed-off-by: Nikita Zhandarovich Reviewed-by: Ping Cheng Cc: stable@vger.kernel.org Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index dc0f7d9a992c..2ccf83837134 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -831,7 +831,7 @@ static int wacom_intuos_inout(struct wacom_wac *wacom) /* Enter report */ if ((data[1] & 0xfc) == 0xc0) { /* serial number of the tool */ - wacom->serial[idx] = ((data[3] & 0x0f) << 28) + + wacom->serial[idx] = ((__u64)(data[3] & 0x0f) << 28) + (data[4] << 20) + (data[5] << 12) + (data[6] << 4) + (data[7] >> 4); From ed84c4517a5bc536e8572a01dfa11bc22a280d06 Mon Sep 17 00:00:00 2001 From: Sung-Chi Li Date: Mon, 24 Apr 2023 10:37:36 +0800 Subject: [PATCH 091/271] HID: google: add jewel USB id Add 1 additional hammer-like device. Signed-off-by: Sung-Chi Li Signed-off-by: Jiri Kosina --- drivers/hid/hid-google-hammer.c | 2 ++ drivers/hid/hid-ids.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/hid/hid-google-hammer.c b/drivers/hid/hid-google-hammer.c index 7ae5f27df54d..c6bdb9c4ef3e 100644 --- a/drivers/hid/hid-google-hammer.c +++ b/drivers/hid/hid-google-hammer.c @@ -586,6 +586,8 @@ static const struct hid_device_id hammer_devices[] = { USB_VENDOR_ID_GOOGLE, USB_DEVICE_ID_GOOGLE_EEL) }, { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, USB_VENDOR_ID_GOOGLE, USB_DEVICE_ID_GOOGLE_HAMMER) }, + { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, + USB_VENDOR_ID_GOOGLE, USB_DEVICE_ID_GOOGLE_JEWEL) }, { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, USB_VENDOR_ID_GOOGLE, USB_DEVICE_ID_GOOGLE_MAGNEMITE) }, { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index d79e946acdcb..5d29abac2300 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -529,6 +529,7 @@ #define USB_DEVICE_ID_GOOGLE_MOONBALL 0x5044 #define USB_DEVICE_ID_GOOGLE_DON 0x5050 #define USB_DEVICE_ID_GOOGLE_EEL 0x5057 +#define USB_DEVICE_ID_GOOGLE_JEWEL 0x5061 #define USB_VENDOR_ID_GOTOP 0x08f2 #define USB_DEVICE_ID_SUPER_Q2 0x007f From 16a9c24f24fbe4564284eb575b18cc20586b9270 Mon Sep 17 00:00:00 2001 From: Denis Arefev Date: Thu, 27 Apr 2023 14:47:45 +0300 Subject: [PATCH 092/271] HID: wacom: Add error check to wacom_parse_and_register() Added a variable check and transition in case of an error Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Denis Arefev Reviewed-by: Ping Cheng Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 7192970d199a..76e5353aca0c 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2414,8 +2414,13 @@ static int wacom_parse_and_register(struct wacom *wacom, bool wireless) goto fail_quirks; } - if (features->device_type & WACOM_DEVICETYPE_WL_MONITOR) + if (features->device_type & WACOM_DEVICETYPE_WL_MONITOR) { error = hid_hw_open(hdev); + if (error) { + hid_err(hdev, "hw open failed\n"); + goto fail_quirks; + } + } wacom_set_shared_values(wacom_wac); devres_close_group(&hdev->dev, wacom); From ee7751b564a90f337330efc1221df40647d68756 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Fri, 5 May 2023 13:58:55 -0700 Subject: [PATCH 093/271] tracing/user_events: Use long vs int for atomic bit ops Each event stores a int to track which bit to set/clear when enablement changes. On big endian 64-bit configurations, it's possible this could cause memory corruption when it's used for atomic bit operations. Use unsigned long for enablement values to ensure any possible corruption cannot occur. Downcast to int after mask for the bit target. Link: https://lore.kernel.org/all/6f758683-4e5e-41c3-9b05-9efc703e827c@kili.mountain/ Link: https://lore.kernel.org/linux-trace-kernel/20230505205855.6407-1-beaub@linux.microsoft.com Fixes: dcb8177c1395 ("tracing/user_events: Add ioctl for disabling addresses") Reported-by: Dan Carpenter Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index b1ecd7677642..e37c7f168c44 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -101,7 +101,7 @@ struct user_event_enabler { unsigned long addr; /* Track enable bit, flags, etc. Aligned for bitops. */ - unsigned int values; + unsigned long values; }; /* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */ @@ -116,7 +116,9 @@ struct user_event_enabler { /* Only duplicate the bit value */ #define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK -#define ENABLE_BITOPS(e) ((unsigned long *)&(e)->values) +#define ENABLE_BITOPS(e) (&(e)->values) + +#define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK)) /* Used for asynchronous faulting in of pages */ struct user_event_enabler_fault { @@ -423,9 +425,9 @@ static int user_event_enabler_write(struct user_event_mm *mm, /* Update bit atomically, user tracers must be atomic as well */ if (enabler->event && enabler->event->status) - set_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); + set_bit(ENABLE_BIT(enabler), ptr); else - clear_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); + clear_bit(ENABLE_BIT(enabler), ptr); kunmap_local(kaddr); unpin_user_pages_dirty_lock(&page, 1, true); @@ -440,8 +442,7 @@ static bool user_event_enabler_exists(struct user_event_mm *mm, struct user_event_enabler *next; list_for_each_entry_safe(enabler, next, &mm->enablers, link) { - if (enabler->addr == uaddr && - (enabler->values & ENABLE_VAL_BIT_MASK) == bit) + if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit) return true; } @@ -2272,7 +2273,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) list_for_each_entry_safe(enabler, next, &mm->enablers, link) if (enabler->addr == reg.disable_addr && - (enabler->values & ENABLE_VAL_BIT_MASK) == reg.disable_bit) { + ENABLE_BIT(enabler) == reg.disable_bit) { set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) From 0e163e54c34c12369ccf6562e74e8f0a800f4aad Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 17 May 2023 10:56:05 -0600 Subject: [PATCH 094/271] accel/qaic: initialize ret variable to 0 clang static analysis reports drivers/accel/qaic/qaic_data.c:610:2: warning: Undefined or garbage value returned to caller [core.uninitialized.UndefReturn] return ret; ^~~~~~~~~~ From a code analysis of the function, the ret variable is only set some of the time but is always returned. This suggests ret can return uninitialized garbage. However BO allocation will ensure ret is always set in reality. Initialize ret to 0 to silence the warning. Fixes: ff13be830333 ("accel/qaic: Add datapath") Signed-off-by: Tom Rix [jhugo: Reword commit text] Signed-off-by: Jeffrey Hugo Reviewed-by: Carl Vanderlip Reviewed-by: Pranjal Ramajor Asha Kanojiya Link: https://patchwork.freedesktop.org/patch/msgid/20230517165605.16770-1-quic_jhugo@quicinc.com --- drivers/accel/qaic/qaic_data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index c0a574cd1b35..b46a16fb3080 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -591,7 +591,7 @@ static int qaic_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_struc struct qaic_bo *bo = to_qaic_bo(obj); unsigned long offset = 0; struct scatterlist *sg; - int ret; + int ret = 0; if (obj->import_attach) return -EINVAL; From d3b277b7aa1c74a65c84019b8fbe7856f841841a Mon Sep 17 00:00:00 2001 From: Pranjal Ramajor Asha Kanojiya Date: Wed, 17 May 2023 13:35:36 -0600 Subject: [PATCH 095/271] accel/qaic: Validate user data before grabbing any lock Validating user data does not need to be protected by any lock and it is safe to move it out of critical region. Fixes: ff13be830333 ("accel/qaic: Add datapath") Fixes: 129776ac2e38 ("accel/qaic: Add control path") Signed-off-by: Pranjal Ramajor Asha Kanojiya Reviewed-by: Carl Vanderlip Reviewed-by: Jeffrey Hugo Signed-off-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20230517193540.14323-2-quic_jhugo@quicinc.com --- drivers/accel/qaic/qaic_control.c | 12 ++---- drivers/accel/qaic/qaic_data.c | 61 ++++++++++++------------------- 2 files changed, 27 insertions(+), 46 deletions(-) diff --git a/drivers/accel/qaic/qaic_control.c b/drivers/accel/qaic/qaic_control.c index 9f216eb6f76e..9e39b1a324f7 100644 --- a/drivers/accel/qaic/qaic_control.c +++ b/drivers/accel/qaic/qaic_control.c @@ -1249,7 +1249,7 @@ dma_cont_failed: int qaic_manage_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { - struct qaic_manage_msg *user_msg; + struct qaic_manage_msg *user_msg = data; struct qaic_device *qdev; struct manage_msg *msg; struct qaic_user *usr; @@ -1258,6 +1258,9 @@ int qaic_manage_ioctl(struct drm_device *dev, void *data, struct drm_file *file_ int usr_rcu_id; int ret; + if (user_msg->len > QAIC_MANAGE_MAX_MSG_LENGTH) + return -EINVAL; + usr = file_priv->driver_priv; usr_rcu_id = srcu_read_lock(&usr->qddev_lock); @@ -1275,13 +1278,6 @@ int qaic_manage_ioctl(struct drm_device *dev, void *data, struct drm_file *file_ return -ENODEV; } - user_msg = data; - - if (user_msg->len > QAIC_MANAGE_MAX_MSG_LENGTH) { - ret = -EINVAL; - goto out; - } - msg = kzalloc(QAIC_MANAGE_MAX_MSG_LENGTH + sizeof(*msg), GFP_KERNEL); if (!msg) { ret = -ENOMEM; diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index b46a16fb3080..5f71d76dd3a6 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -663,6 +663,10 @@ int qaic_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *fi if (args->pad) return -EINVAL; + size = PAGE_ALIGN(args->size); + if (size == 0) + return -EINVAL; + usr = file_priv->driver_priv; usr_rcu_id = srcu_read_lock(&usr->qddev_lock); if (!usr->qddev) { @@ -677,12 +681,6 @@ int qaic_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *fi goto unlock_dev_srcu; } - size = PAGE_ALIGN(args->size); - if (size == 0) { - ret = -EINVAL; - goto unlock_dev_srcu; - } - bo = qaic_alloc_init_bo(); if (IS_ERR(bo)) { ret = PTR_ERR(bo); @@ -936,6 +934,22 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi struct qaic_bo *bo; int ret; + if (args->hdr.count == 0) + return -EINVAL; + + arg_size = args->hdr.count * sizeof(*slice_ent); + if (arg_size / args->hdr.count != sizeof(*slice_ent)) + return -EINVAL; + + if (args->hdr.size == 0) + return -EINVAL; + + if (!(args->hdr.dir == DMA_TO_DEVICE || args->hdr.dir == DMA_FROM_DEVICE)) + return -EINVAL; + + if (args->data == 0) + return -EINVAL; + usr = file_priv->driver_priv; usr_rcu_id = srcu_read_lock(&usr->qddev_lock); if (!usr->qddev) { @@ -950,43 +964,17 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi goto unlock_dev_srcu; } - if (args->hdr.count == 0) { - ret = -EINVAL; - goto unlock_dev_srcu; - } - - arg_size = args->hdr.count * sizeof(*slice_ent); - if (arg_size / args->hdr.count != sizeof(*slice_ent)) { - ret = -EINVAL; - goto unlock_dev_srcu; - } - if (args->hdr.dbc_id >= qdev->num_dbc) { ret = -EINVAL; goto unlock_dev_srcu; } - if (args->hdr.size == 0) { - ret = -EINVAL; - goto unlock_dev_srcu; - } - - if (!(args->hdr.dir == DMA_TO_DEVICE || args->hdr.dir == DMA_FROM_DEVICE)) { - ret = -EINVAL; - goto unlock_dev_srcu; - } - dbc = &qdev->dbc[args->hdr.dbc_id]; if (dbc->usr != usr) { ret = -EINVAL; goto unlock_dev_srcu; } - if (args->data == 0) { - ret = -EINVAL; - goto unlock_dev_srcu; - } - user_data = u64_to_user_ptr(args->data); slice_ent = kzalloc(arg_size, GFP_KERNEL); @@ -1316,7 +1304,6 @@ static int __qaic_execute_bo_ioctl(struct drm_device *dev, void *data, struct dr received_ts = ktime_get_ns(); size = is_partial ? sizeof(*pexec) : sizeof(*exec); - n = (unsigned long)size * args->hdr.count; if (args->hdr.count == 0 || n / args->hdr.count != size) return -EINVAL; @@ -1665,6 +1652,9 @@ int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file int rcu_id; int ret; + if (args->pad != 0) + return -EINVAL; + usr = file_priv->driver_priv; usr_rcu_id = srcu_read_lock(&usr->qddev_lock); if (!usr->qddev) { @@ -1679,11 +1669,6 @@ int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file goto unlock_dev_srcu; } - if (args->pad != 0) { - ret = -EINVAL; - goto unlock_dev_srcu; - } - if (args->dbc_id >= qdev->num_dbc) { ret = -EINVAL; goto unlock_dev_srcu; From 2e0904efc945ed5b04a9b251b9cefdd6754f5e15 Mon Sep 17 00:00:00 2001 From: Pranjal Ramajor Asha Kanojiya Date: Wed, 17 May 2023 13:35:37 -0600 Subject: [PATCH 096/271] accel/qaic: Validate if BO is sliced before slicing QAIC_ATTACH_SLICE_BO attaches slicing configuration to a BO. Validate if given BO is already sliced. An already sliced BO cannot be sliced again. Fixes: ff13be830333 ("accel/qaic: Add datapath") Signed-off-by: Pranjal Ramajor Asha Kanojiya Reviewed-by: Carl Vanderlip Reviewed-by: Jeffrey Hugo Signed-off-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20230517193540.14323-3-quic_jhugo@quicinc.com --- drivers/accel/qaic/qaic_data.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index 5f71d76dd3a6..e672dd244703 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -1001,6 +1001,11 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi bo = to_qaic_bo(obj); + if (bo->sliced) { + ret = -EINVAL; + goto put_bo; + } + ret = qaic_prepare_bo(qdev, bo, &args->hdr); if (ret) goto put_bo; From faa7c4eee412ca14dd6a107cb6122d31c4277cbe Mon Sep 17 00:00:00 2001 From: Pranjal Ramajor Asha Kanojiya Date: Wed, 17 May 2023 13:35:38 -0600 Subject: [PATCH 097/271] accel/qaic: Flush the transfer list again Before calling synchronize_srcu() we clear the transfer list, this is to allow all the QAIC_WAIT_BO callers to exit otherwise the system could deadlock. There could be a corner case where more elements get added to transfer list after we have flushed it. Re-flush the transfer list once all the holders of dbc->ch_lock have completed execution i.e. synchronize_srcu() is complete. Fixes: ff13be830333 ("accel/qaic: Add datapath") Signed-off-by: Pranjal Ramajor Asha Kanojiya Reviewed-by: Carl Vanderlip Reviewed-by: Jeffrey Hugo Signed-off-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20230517193540.14323-4-quic_jhugo@quicinc.com --- drivers/accel/qaic/qaic_data.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index e672dd244703..c1af99cfd2d1 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -1845,6 +1845,11 @@ void wakeup_dbc(struct qaic_device *qdev, u32 dbc_id) dbc->usr = NULL; empty_xfer_list(qdev, dbc); synchronize_srcu(&dbc->ch_lock); + /* + * Threads holding channel lock, may add more elements in the xfer_list. + * Flush out these elements from xfer_list. + */ + empty_xfer_list(qdev, dbc); } void release_dbc(struct qaic_device *qdev, u32 dbc_id) From 75af0a585af93183ba68bb1b45d0d7a61e963712 Mon Sep 17 00:00:00 2001 From: Pranjal Ramajor Asha Kanojiya Date: Wed, 17 May 2023 13:35:39 -0600 Subject: [PATCH 098/271] accel/qaic: Grab ch_lock during QAIC_ATTACH_SLICE_BO During QAIC_ATTACH_SLICE_BO, we associate a BO to its DBC. We need to grab the dbc->ch_lock to make sure that DBC does not goes away while QAIC_ATTACH_SLICE_BO is still running. Fixes: ff13be830333 ("accel/qaic: Add datapath") Signed-off-by: Pranjal Ramajor Asha Kanojiya Reviewed-by: Carl Vanderlip Reviewed-by: Jeffrey Hugo Signed-off-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20230517193540.14323-5-quic_jhugo@quicinc.com --- drivers/accel/qaic/qaic_data.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index c1af99cfd2d1..e42c1f9ffff8 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -924,8 +924,8 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi { struct qaic_attach_slice_entry *slice_ent; struct qaic_attach_slice *args = data; + int rcu_id, usr_rcu_id, qdev_rcu_id; struct dma_bridge_chan *dbc; - int usr_rcu_id, qdev_rcu_id; struct drm_gem_object *obj; struct qaic_device *qdev; unsigned long arg_size; @@ -969,12 +969,6 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi goto unlock_dev_srcu; } - dbc = &qdev->dbc[args->hdr.dbc_id]; - if (dbc->usr != usr) { - ret = -EINVAL; - goto unlock_dev_srcu; - } - user_data = u64_to_user_ptr(args->data); slice_ent = kzalloc(arg_size, GFP_KERNEL); @@ -1006,9 +1000,16 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi goto put_bo; } + dbc = &qdev->dbc[args->hdr.dbc_id]; + rcu_id = srcu_read_lock(&dbc->ch_lock); + if (dbc->usr != usr) { + ret = -EINVAL; + goto unlock_ch_srcu; + } + ret = qaic_prepare_bo(qdev, bo, &args->hdr); if (ret) - goto put_bo; + goto unlock_ch_srcu; ret = qaic_attach_slicing_bo(qdev, bo, &args->hdr, slice_ent); if (ret) @@ -1018,6 +1019,7 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi dma_sync_sgtable_for_cpu(&qdev->pdev->dev, bo->sgt, args->hdr.dir); bo->dbc = dbc; + srcu_read_unlock(&dbc->ch_lock, rcu_id); drm_gem_object_put(obj); srcu_read_unlock(&qdev->dev_lock, qdev_rcu_id); srcu_read_unlock(&usr->qddev_lock, usr_rcu_id); @@ -1026,6 +1028,8 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi unprepare_bo: qaic_unprepare_bo(qdev, bo); +unlock_ch_srcu: + srcu_read_unlock(&dbc->ch_lock, rcu_id); put_bo: drm_gem_object_put(obj); free_slice_ent: From e997c218ad736fd6f524d73a987bad9d94128d3d Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Wed, 17 May 2023 13:35:40 -0600 Subject: [PATCH 099/271] accel/qaic: Fix NNC message corruption If msg_xfer() is unable to queue part of a NNC message because the MHI ring is full, it will attempt to give the QSM some time to drain the queue. However, if QSM fails to make any room, msg_xfer() will fail and tell the caller to try again. This is problematic because part of the message may have been committed to the ring and there is no mechanism to revoke that content. This will cause QSM to receive a corrupt message. The better way to do this is to check if the ring has enough space for the entire message before committing any of the message. Since msg_xfer() is under the cntl_mutex no one else can come in and consume the space. Fixes: 129776ac2e38 ("accel/qaic: Add control path") Signed-off-by: Jeffrey Hugo Reviewed-by: Pranjal Ramajor Asha Kanojiya Reviewed-by: Carl Vanderlip Link: https://patchwork.freedesktop.org/patch/msgid/20230517193540.14323-6-quic_jhugo@quicinc.com --- drivers/accel/qaic/qaic_control.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/accel/qaic/qaic_control.c b/drivers/accel/qaic/qaic_control.c index 9e39b1a324f7..5c57f7b4494e 100644 --- a/drivers/accel/qaic/qaic_control.c +++ b/drivers/accel/qaic/qaic_control.c @@ -997,14 +997,34 @@ static void *msg_xfer(struct qaic_device *qdev, struct wrapper_list *wrappers, u struct xfer_queue_elem elem; struct wire_msg *out_buf; struct wrapper_msg *w; + long ret = -EAGAIN; + int xfer_count = 0; int retry_count; - long ret; if (qdev->in_reset) { mutex_unlock(&qdev->cntl_mutex); return ERR_PTR(-ENODEV); } + /* Attempt to avoid a partial commit of a message */ + list_for_each_entry(w, &wrappers->list, list) + xfer_count++; + + for (retry_count = 0; retry_count < QAIC_MHI_RETRY_MAX; retry_count++) { + if (xfer_count <= mhi_get_free_desc_count(qdev->cntl_ch, DMA_TO_DEVICE)) { + ret = 0; + break; + } + msleep_interruptible(QAIC_MHI_RETRY_WAIT_MS); + if (signal_pending(current)) + break; + } + + if (ret) { + mutex_unlock(&qdev->cntl_mutex); + return ERR_PTR(ret); + } + elem.seq_num = seq_num; elem.buf = NULL; init_completion(&elem.xfer_done); @@ -1038,16 +1058,9 @@ static void *msg_xfer(struct qaic_device *qdev, struct wrapper_list *wrappers, u list_for_each_entry(w, &wrappers->list, list) { kref_get(&w->ref_count); retry_count = 0; -retry: ret = mhi_queue_buf(qdev->cntl_ch, DMA_TO_DEVICE, &w->msg, w->len, list_is_last(&w->list, &wrappers->list) ? MHI_EOT : MHI_CHAIN); if (ret) { - if (ret == -EAGAIN && retry_count++ < QAIC_MHI_RETRY_MAX) { - msleep_interruptible(QAIC_MHI_RETRY_WAIT_MS); - if (!signal_pending(current)) - goto retry; - } - qdev->cntl_lost_buf = true; kref_put(&w->ref_count, free_wrapper); mutex_unlock(&qdev->cntl_mutex); From 632478a05821bc1c9b55c3a1dd0fb1be7bfa1acc Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Thu, 11 May 2023 18:32:01 +0200 Subject: [PATCH 100/271] tracing/timerlat: Always wakeup the timerlat thread While testing rtla timerlat auto analysis, I reach a condition where the interface was not receiving tracing data. I was able to manually reproduce the problem with these steps: # echo 0 > tracing_on # disable trace # echo 1 > osnoise/stop_tracing_us # stop trace if timerlat irq > 1 us # echo timerlat > current_tracer # enable timerlat tracer # sleep 1 # wait... that is the time when rtla # apply configs like prio or cgroup # echo 1 > tracing_on # start tracing # cat trace # tracer: timerlat # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> migrate-disable # |||| / delay # ||||| ACTIVATION # TASK-PID CPU# ||||| TIMESTAMP ID CONTEXT LATENCY # | | | ||||| | | | | NOTHING! Then, trying to enable tracing again with echo 1 > tracing_on resulted in no change: the trace was still not tracing. This problem happens because the timerlat IRQ hits the stop tracing condition while tracing is off, and do not wake up the timerlat thread, so the timerlat threads are kept sleeping forever, resulting in no trace, even after re-enabling the tracer. Avoid this condition by always waking up the threads, even after stopping tracing, allowing the tracer to return to its normal operating after a new tracing on. Link: https://lore.kernel.org/linux-trace-kernel/1ed8f830638b20a39d535d27d908e319a9a3c4e2.1683822622.git.bristot@kernel.org Cc: Juri Lelli Cc: stable@vger.kernel.org Fixes: a955d7eac177 ("trace: Add timerlat tracer") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index efbbec2caff8..e97e3fa5cbed 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1652,6 +1652,8 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) osnoise_stop_tracing(); notify_new_max_latency(diff); + wake_up_process(tlat->kthread); + return HRTIMER_NORESTART; } } From b6405f0829d7b1dd926ba3ca5f691cab835abfaa Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 19 May 2023 12:12:06 +0200 Subject: [PATCH 101/271] parisc: Use num_present_cpus() in alternative patching code When patching the kernel code some alternatives depend on SMP vs. !SMP. Use the value of num_present_cpus() instead of num_online_cpus() to decide, otherwise we may run into issues if and additional CPU is enabled after having loaded a module while only one CPU was enabled. Signed-off-by: Helge Deller Cc: # v6.1+ --- arch/parisc/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/alternative.c b/arch/parisc/kernel/alternative.c index 66f5672c70bd..25c4d6c3375d 100644 --- a/arch/parisc/kernel/alternative.c +++ b/arch/parisc/kernel/alternative.c @@ -25,7 +25,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, { struct alt_instr *entry; int index = 0, applied = 0; - int num_cpus = num_online_cpus(); + int num_cpus = num_present_cpus(); u16 cond_check; cond_check = ALT_COND_ALWAYS | From 8a2b20a997a3779ae9fcae268f2959eb82ec05a1 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 22 May 2023 20:18:54 +0800 Subject: [PATCH 102/271] blk-wbt: fix that wbt can't be disabled by default commit b11d31ae01e6 ("blk-wbt: remove unnecessary check in wbt_enable_default()") removes the checking of CONFIG_BLK_WBT_MQ by mistake, which is used to control enable or disable wbt by default. Fix the problem by adding back the checking. This patch also do a litter cleanup to make related code more readable. Fixes: b11d31ae01e6 ("blk-wbt: remove unnecessary check in wbt_enable_default()") Reported-by: Lukas Bulwahn Link: https://lore.kernel.org/lkml/CAKXUXMzfKq_J9nKHGyr5P5rvUETY4B-fxoQD4sO+NYjFOfVtZA@mail.gmail.com/t/ Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230522121854.2928880-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index e49a48684532..9ec2a2f1eda3 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -730,14 +730,16 @@ void wbt_enable_default(struct gendisk *disk) { struct request_queue *q = disk->queue; struct rq_qos *rqos; - bool disable_flag = q->elevator && - test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags); + bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ); + + if (q->elevator && + test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags)) + enable = false; /* Throttling already enabled? */ rqos = wbt_rq_qos(q); if (rqos) { - if (!disable_flag && - RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) + if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; return; } @@ -746,7 +748,7 @@ void wbt_enable_default(struct gendisk *disk) if (!blk_queue_registered(q)) return; - if (queue_is_mq(q) && !disable_flag) + if (queue_is_mq(q) && enable) wbt_init(disk); } EXPORT_SYMBOL_GPL(wbt_enable_default); From 3e94d54e83cafd2b562bb6d15bb2f72d76200fb5 Mon Sep 17 00:00:00 2001 From: Tian Lan Date: Mon, 22 May 2023 17:05:55 -0400 Subject: [PATCH 103/271] blk-mq: fix race condition in active queue accounting If multiple CPUs are sharing the same hardware queue, it can cause leak in the active queue counter tracking when __blk_mq_tag_busy() is executed simultaneously. Fixes: ee78ec1077d3 ("blk-mq: blk_mq_tag_busy is no need to return a value") Signed-off-by: Tian Lan Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: John Garry Link: https://lore.kernel.org/r/20230522210555.794134-1-tilan7663@gmail.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index d6af9d431dc6..dfd81cab5788 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -39,16 +39,20 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; + /* + * calling test_bit() prior to test_and_set_bit() is intentional, + * it avoids dirtying the cacheline if the queue is already active. + */ if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; - if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) + if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || + test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; - set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags); } else { - if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || + test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; - set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state); } users = atomic_inc_return(&hctx->tags->active_queues); From da8d2fe717db06c9cf36b51e4628b2c0ece35b43 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 23 May 2023 11:17:24 +0200 Subject: [PATCH 104/271] block, bfq: update Paolo's address in maintainer list Current email address of Paolo Valente is no longer valid, use a good one. Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20230523091724.26636-1-paolo.valente@unimore.it Signed-off-by: Jens Axboe --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7e0b87d5aa2e..93c6a48ea3f3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3548,7 +3548,7 @@ F: Documentation/filesystems/befs.rst F: fs/befs/ BFQ I/O SCHEDULER -M: Paolo Valente +M: Paolo Valente M: Jens Axboe L: linux-block@vger.kernel.org S: Maintained From 46930b7cc7727271c9c27aac1fdc97a8645e2d00 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Tue, 23 May 2023 16:47:09 +0530 Subject: [PATCH 105/271] block: fix bio-cache for passthru IO commit <8af870aa5b847> ("block: enable bio caching use for passthru IO") introduced bio-cache for passthru IO. In case when nr_vecs are greater than BIO_INLINE_VECS, bio and bvecs are allocated from mempool (instead of percpu cache) and REQ_ALLOC_CACHE is cleared. This causes the side effect of not freeing bio/bvecs into mempool on completion. This patch lets the passthru IO fallback to allocation using bio_kmalloc when nr_vecs are greater than BIO_INLINE_VECS. The corresponding bio is freed during call to blk_mq_map_bio_put during completion. Cc: stable@vger.kernel.org # 6.1 fixes <8af870aa5b847> ("block: enable bio caching use for passthru IO") Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20230523111709.145676-1-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-map.c b/block/blk-map.c index 04c55f1c492e..46eed2e627c3 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -248,7 +248,7 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq, { struct bio *bio; - if (rq->cmd_flags & REQ_ALLOC_CACHE) { + if (rq->cmd_flags & REQ_ALLOC_CACHE && (nr_vecs <= BIO_INLINE_VECS)) { bio = bio_alloc_bioset(NULL, nr_vecs, rq->cmd_flags, gfp_mask, &fs_bio_set); if (!bio) From 4752354af71043e6fd72ef5490ed6da39e6cab4a Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Fri, 19 May 2023 14:58:43 +0800 Subject: [PATCH 106/271] vfio/type1: check pfn valid before converting to struct page Check physical PFN is valid before converting the PFN to a struct page pointer to be returned to caller of vfio_pin_pages(). vfio_pin_pages() pins user pages with contiguous IOVA. If the IOVA of a user page to be pinned belongs to vma of vm_flags VM_PFNMAP, pin_user_pages_remote() will return -EFAULT without returning struct page address for this PFN. This is because usually this kind of PFN (e.g. MMIO PFN) has no valid struct page address associated. Upon this error, vaddr_get_pfns() will obtain the physical PFN directly. While previously vfio_pin_pages() returns to caller PFN arrays directly, after commit 34a255e67615 ("vfio: Replace phys_pfn with pages for vfio_pin_pages()"), PFNs will be converted to "struct page *" unconditionally and therefore the returned "struct page *" array may contain invalid struct page addresses. Given current in-tree users of vfio_pin_pages() only expect "struct page * returned, check PFN validity and return -EINVAL to let the caller be aware of IOVAs to be pinned containing PFN not able to be returned in "struct page *" array. So that, the caller will not consume the returned pointer (e.g. test PageReserved()) and avoid error like "supervisor read access in kernel mode". Fixes: 34a255e67615 ("vfio: Replace phys_pfn with pages for vfio_pin_pages()") Cc: Sean Christopherson Reviewed-by: Jason Gunthorpe Signed-off-by: Yan Zhao Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20230519065843.10653-1-yan.y.zhao@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 3d4dd9420c30..0d2f805468e1 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -860,6 +860,11 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, if (ret) goto pin_unwind; + if (!pfn_valid(phys_pfn)) { + ret = -EINVAL; + goto pin_unwind; + } + ret = vfio_add_to_pfn_list(dma, iova, phys_pfn); if (ret) { if (put_pfn(phys_pfn, dma->prot) && do_accounting) From 4ef4aee67eed640064fff95a693c0184cedb7bec Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 23 May 2023 13:48:41 +0100 Subject: [PATCH 107/271] cifs: Fix cifs_limit_bvec_subset() to correctly check the maxmimum size Fix cifs_limit_bvec_subset() so that it limits the span to the maximum specified and won't return with a size greater than max_size. Fixes: d08089f649a0 ("cifs: Change the I/O paths to use an iterator rather than a page list") Cc: stable@vger.kernel.org # 6.3 Reported-by: Shyam Prasad N Reviewed-by: Shyam Prasad N Signed-off-by: David Howells cc: Steve French cc: Rohith Surabattula cc: Paulo Alcantara cc: Tom Talpey cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/cifs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ba7f2e09d6c8..df88b8c04d03 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3353,9 +3353,10 @@ static size_t cifs_limit_bvec_subset(const struct iov_iter *iter, size_t max_siz while (n && ix < nbv) { len = min3(n, bvecs[ix].bv_len - skip, max_size); span += len; + max_size -= len; nsegs++; ix++; - if (span >= max_size || nsegs >= max_segs) + if (max_size == 0 || nsegs >= max_segs) break; skip = 0; n -= len; From 3e0fea09b17fa2255f6cb0108bbffd4a505f8925 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 May 2023 16:07:38 -0700 Subject: [PATCH 108/271] tracing/user_events: Split up mm alloc and attach When a new mm is being created in a fork() path it currently is allocated and then attached in one go. This leaves the mm exposed out to the tracing register callbacks while any parent enabler locations are copied in. This should not happen. Split up mm alloc and attach as unique operations. When duplicating enablers, first alloc, then duplicate, and only upon success, attach. This prevents any timing window outside of the event_reg mutex for enablement walking. This allows for dropping RCU requirement for enablement walking in later patches. Link: https://lkml.kernel.org/r/20230519230741.669-2-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=whTBvXJuoi_kACo3qi5WZUmRrhyA-_=rRFsycTytmB6qw@mail.gmail.com/ Signed-off-by: Linus Torvalds [ change log written by Beau Belgrave ] Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index e37c7f168c44..599aab46a94b 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -539,10 +539,9 @@ static struct user_event_mm *user_event_mm_get_all(struct user_event *user) return found; } -static struct user_event_mm *user_event_mm_create(struct task_struct *t) +static struct user_event_mm *user_event_mm_alloc(struct task_struct *t) { struct user_event_mm *user_mm; - unsigned long flags; user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL_ACCOUNT); @@ -554,12 +553,6 @@ static struct user_event_mm *user_event_mm_create(struct task_struct *t) refcount_set(&user_mm->refcnt, 1); refcount_set(&user_mm->tasks, 1); - spin_lock_irqsave(&user_event_mms_lock, flags); - list_add_rcu(&user_mm->link, &user_event_mms); - spin_unlock_irqrestore(&user_event_mms_lock, flags); - - t->user_event_mm = user_mm; - /* * The lifetime of the memory descriptor can slightly outlast * the task lifetime if a ref to the user_event_mm is taken @@ -573,6 +566,17 @@ static struct user_event_mm *user_event_mm_create(struct task_struct *t) return user_mm; } +static void user_event_mm_attach(struct user_event_mm *user_mm, struct task_struct *t) +{ + unsigned long flags; + + spin_lock_irqsave(&user_event_mms_lock, flags); + list_add_rcu(&user_mm->link, &user_event_mms); + spin_unlock_irqrestore(&user_event_mms_lock, flags); + + t->user_event_mm = user_mm; +} + static struct user_event_mm *current_user_event_mm(void) { struct user_event_mm *user_mm = current->user_event_mm; @@ -580,10 +584,12 @@ static struct user_event_mm *current_user_event_mm(void) if (user_mm) goto inc; - user_mm = user_event_mm_create(current); + user_mm = user_event_mm_alloc(current); if (!user_mm) goto error; + + user_event_mm_attach(user_mm, current); inc: refcount_inc(&user_mm->refcnt); error: @@ -671,7 +677,7 @@ void user_event_mm_remove(struct task_struct *t) void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) { - struct user_event_mm *mm = user_event_mm_create(t); + struct user_event_mm *mm = user_event_mm_alloc(t); struct user_event_enabler *enabler; if (!mm) @@ -685,10 +691,11 @@ void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) rcu_read_unlock(); + user_event_mm_attach(mm, t); return; error: rcu_read_unlock(); - user_event_mm_remove(t); + user_event_mm_destroy(mm); } static bool current_user_event_enabler_exists(unsigned long uaddr, From aaecdaf922835ed9a8ce56cdd9a8d40fe630257a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 19 May 2023 16:07:39 -0700 Subject: [PATCH 109/271] tracing/user_events: Remove RCU lock while pinning pages pin_user_pages_remote() can reschedule which means we cannot hold any RCU lock while using it. Now that enablers are not exposed out to the tracing register callbacks during fork(), there is clearly no need to require the RCU lock as event_mutex is enough to protect changes. Remove unneeded RCU usages when pinning pages and walking enablers with event_mutex held. Cleanup a misleading "safe" list walk that is not needed. During fork() duplication, remove unneeded RCU list add, since the list is not exposed yet. Link: https://lkml.kernel.org/r/20230519230741.669-3-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=wiiBfT4zNS29jA0XEsy8EmbqTH1hAPdRJCDAJMD8Gxt5A@mail.gmail.com/ Fixes: 7235759084a4 ("tracing/user_events: Use remote writes for event enablement") Signed-off-by: Linus Torvalds [ change log written by Beau Belgrave ] Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 599aab46a94b..d34a59630e70 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -439,9 +439,8 @@ static bool user_event_enabler_exists(struct user_event_mm *mm, unsigned long uaddr, unsigned char bit) { struct user_event_enabler *enabler; - struct user_event_enabler *next; - list_for_each_entry_safe(enabler, next, &mm->enablers, link) { + list_for_each_entry(enabler, &mm->enablers, link) { if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit) return true; } @@ -456,19 +455,19 @@ static void user_event_enabler_update(struct user_event *user) struct user_event_mm *next; int attempt; + lockdep_assert_held(&event_mutex); + while (mm) { next = mm->next; mmap_read_lock(mm->mm); - rcu_read_lock(); - list_for_each_entry_rcu(enabler, &mm->enablers, link) { + list_for_each_entry(enabler, &mm->enablers, link) { if (enabler->event == user) { attempt = 0; user_event_enabler_write(mm, enabler, true, &attempt); } } - rcu_read_unlock(); mmap_read_unlock(mm->mm); user_event_mm_put(mm); mm = next; @@ -496,7 +495,9 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, enabler->values = orig->values & ENABLE_VAL_DUP_MASK; refcount_inc(&enabler->event->refcnt); - list_add_rcu(&enabler->link, &mm->enablers); + + /* Enablers not exposed yet, RCU not required */ + list_add(&enabler->link, &mm->enablers); return true; } From dcbd1ac2668b5fa02069ea96d581ca3f70a7543c Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Fri, 19 May 2023 16:07:40 -0700 Subject: [PATCH 110/271] tracing/user_events: Rename link fields for clarity Currently most list_head fields of various structs within user_events are simply named link. This causes folks to keep additional context in their head when working with the code, which can be confusing. Instead of using link, describe what the actual link is, for example: list_del_rcu(&mm->link); Changes into: list_del_rcu(&mm->mms_link); The reader now is given a hint the link is to the mms global list instead of having to remember or spot check within the code. Link: https://lkml.kernel.org/r/20230519230741.669-4-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=wicngggxVpbnrYHjRTwGE0WYscPRM+L2HO2BF8ia1EXgQ@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 2 +- kernel/trace/trace_events_user.c | 40 ++++++++++++++++++-------------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 2847f5a18a86..17d452b389de 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -17,7 +17,7 @@ #ifdef CONFIG_USER_EVENTS struct user_event_mm { - struct list_head link; + struct list_head mms_link; struct list_head enablers; struct mm_struct *mm; struct user_event_mm *next; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index d34a59630e70..238c7a0615fa 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -96,7 +96,7 @@ struct user_event { * these to track enablement sites that are tied to an event. */ struct user_event_enabler { - struct list_head link; + struct list_head mm_enablers_link; struct user_event *event; unsigned long addr; @@ -155,7 +155,7 @@ struct user_event_file_info { #define VALIDATOR_REL (1 << 1) struct user_event_validator { - struct list_head link; + struct list_head user_event_link; int offset; int flags; }; @@ -261,7 +261,7 @@ error: static void user_event_enabler_destroy(struct user_event_enabler *enabler) { - list_del_rcu(&enabler->link); + list_del_rcu(&enabler->mm_enablers_link); /* No longer tracking the event via the enabler */ refcount_dec(&enabler->event->refcnt); @@ -440,7 +440,7 @@ static bool user_event_enabler_exists(struct user_event_mm *mm, { struct user_event_enabler *enabler; - list_for_each_entry(enabler, &mm->enablers, link) { + list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) { if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit) return true; } @@ -461,7 +461,7 @@ static void user_event_enabler_update(struct user_event *user) next = mm->next; mmap_read_lock(mm->mm); - list_for_each_entry(enabler, &mm->enablers, link) { + list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) { if (enabler->event == user) { attempt = 0; user_event_enabler_write(mm, enabler, true, &attempt); @@ -497,7 +497,7 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, refcount_inc(&enabler->event->refcnt); /* Enablers not exposed yet, RCU not required */ - list_add(&enabler->link, &mm->enablers); + list_add(&enabler->mm_enablers_link, &mm->enablers); return true; } @@ -527,13 +527,15 @@ static struct user_event_mm *user_event_mm_get_all(struct user_event *user) */ rcu_read_lock(); - list_for_each_entry_rcu(mm, &user_event_mms, link) - list_for_each_entry_rcu(enabler, &mm->enablers, link) + list_for_each_entry_rcu(mm, &user_event_mms, mms_link) { + list_for_each_entry_rcu(enabler, &mm->enablers, mm_enablers_link) { if (enabler->event == user) { mm->next = found; found = user_event_mm_get(mm); break; } + } + } rcu_read_unlock(); @@ -572,7 +574,7 @@ static void user_event_mm_attach(struct user_event_mm *user_mm, struct task_stru unsigned long flags; spin_lock_irqsave(&user_event_mms_lock, flags); - list_add_rcu(&user_mm->link, &user_event_mms); + list_add_rcu(&user_mm->mms_link, &user_event_mms); spin_unlock_irqrestore(&user_event_mms_lock, flags); t->user_event_mm = user_mm; @@ -601,7 +603,7 @@ static void user_event_mm_destroy(struct user_event_mm *mm) { struct user_event_enabler *enabler, *next; - list_for_each_entry_safe(enabler, next, &mm->enablers, link) + list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) user_event_enabler_destroy(enabler); mmdrop(mm->mm); @@ -638,7 +640,7 @@ void user_event_mm_remove(struct task_struct *t) /* Remove the mm from the list, so it can no longer be enabled */ spin_lock_irqsave(&user_event_mms_lock, flags); - list_del_rcu(&mm->link); + list_del_rcu(&mm->mms_link); spin_unlock_irqrestore(&user_event_mms_lock, flags); /* @@ -686,9 +688,10 @@ void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) rcu_read_lock(); - list_for_each_entry_rcu(enabler, &old_mm->enablers, link) + list_for_each_entry_rcu(enabler, &old_mm->enablers, mm_enablers_link) { if (!user_event_enabler_dup(enabler, mm)) goto error; + } rcu_read_unlock(); @@ -757,7 +760,7 @@ retry: */ if (!*write_result) { refcount_inc(&enabler->event->refcnt); - list_add_rcu(&enabler->link, &user_mm->enablers); + list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers); } mutex_unlock(&event_mutex); @@ -913,8 +916,8 @@ static void user_event_destroy_validators(struct user_event *user) struct user_event_validator *validator, *next; struct list_head *head = &user->validators; - list_for_each_entry_safe(validator, next, head, link) { - list_del(&validator->link); + list_for_each_entry_safe(validator, next, head, user_event_link) { + list_del(&validator->user_event_link); kfree(validator); } } @@ -968,7 +971,7 @@ add_validator: validator->offset = offset; /* Want sequential access when validating */ - list_add_tail(&validator->link, &user->validators); + list_add_tail(&validator->user_event_link, &user->validators); add_field: field->type = type; @@ -1358,7 +1361,7 @@ static int user_event_validate(struct user_event *user, void *data, int len) void *pos, *end = data + len; u32 loc, offset, size; - list_for_each_entry(validator, head, link) { + list_for_each_entry(validator, head, user_event_link) { pos = data + validator->offset; /* Already done min_size check, no bounds check here */ @@ -2279,7 +2282,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) */ mutex_lock(&event_mutex); - list_for_each_entry_safe(enabler, next, &mm->enablers, link) + list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) { if (enabler->addr == reg.disable_addr && ENABLE_BIT(enabler) == reg.disable_bit) { set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); @@ -2290,6 +2293,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) /* Removed at least one */ ret = 0; } + } mutex_unlock(&event_mutex); From ff9e1632d69e596d8ca256deb07433a8f3565038 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Fri, 19 May 2023 16:07:41 -0700 Subject: [PATCH 111/271] tracing/user_events: Document user_event_mm one-shot list usage During 6.4 development it became clear that the one-shot list used by the user_event_mm's next field was confusing to others. It is not clear how this list is protected or what the next field usage is for unless you are familiar with the code. Add comments into the user_event_mm struct indicating lock requirement and usage. Also document how and why this approach was used via comments in both user_event_enabler_update() and user_event_mm_get_all() and the rules to properly use it. Link: https://lkml.kernel.org/r/20230519230741.669-5-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=wicngggxVpbnrYHjRTwGE0WYscPRM+L2HO2BF8ia1EXgQ@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/linux/user_events.h | 1 + kernel/trace/trace_events_user.c | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 17d452b389de..8afa8c3a0973 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -20,6 +20,7 @@ struct user_event_mm { struct list_head mms_link; struct list_head enablers; struct mm_struct *mm; + /* Used for one-shot lists, protected by event_mutex */ struct user_event_mm *next; refcount_t refcnt; refcount_t tasks; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 238c7a0615fa..dbb14705d0d3 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -451,12 +451,25 @@ static bool user_event_enabler_exists(struct user_event_mm *mm, static void user_event_enabler_update(struct user_event *user) { struct user_event_enabler *enabler; - struct user_event_mm *mm = user_event_mm_get_all(user); struct user_event_mm *next; + struct user_event_mm *mm; int attempt; lockdep_assert_held(&event_mutex); + /* + * We need to build a one-shot list of all the mms that have an + * enabler for the user_event passed in. This list is only valid + * while holding the event_mutex. The only reason for this is due + * to the global mm list being RCU protected and we use methods + * which can wait (mmap_read_lock and pin_user_pages_remote). + * + * NOTE: user_event_mm_get_all() increments the ref count of each + * mm that is added to the list to prevent removal timing windows. + * We must always put each mm after they are used, which may wait. + */ + mm = user_event_mm_get_all(user); + while (mm) { next = mm->next; mmap_read_lock(mm->mm); @@ -515,6 +528,14 @@ static struct user_event_mm *user_event_mm_get_all(struct user_event *user) struct user_event_enabler *enabler; struct user_event_mm *mm; + /* + * We use the mm->next field to build a one-shot list from the global + * RCU protected list. To build this list the event_mutex must be held. + * This lets us build a list without requiring allocs that could fail + * when user based events are most wanted for diagnostics. + */ + lockdep_assert_held(&event_mutex); + /* * We do not want to block fork/exec while enablements are being * updated, so we use RCU to walk the current tasks that have used From e30fbc618e97b38dbb49f1d44dcd0778d3f23b8c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 23 May 2023 22:11:08 -0400 Subject: [PATCH 112/271] tracing/histograms: Allow variables to have some modifiers Modifiers are used to change the behavior of keys. For instance, they can grouped into buckets, converted to syscall names (from the syscall identifier), show task->comm of the current pid, be an array of longs that represent a stacktrace, and more. It was found that nothing stopped a value from taking a modifier. As values are simple counters. If this happened, it would call code that was not expecting a modifier and crash the kernel. This was fixed by having the ___create_val_field() function test if a modifier was present and fail if one was. This fixed the crash. Now there's a problem with variables. Variables are used to pass fields from one event to another. Variables are allowed to have some modifiers, as the processing may need to happen at the time of the event (like stacktraces and comm names of the current pid). The issue is that it too uses __create_val_field(). Now that fails on modifiers, variables can no longer use them (this is a regression). As not all modifiers are for variables, have them use a separate check. Link: https://lore.kernel.org/linux-trace-kernel/20230523221108.064a5d82@rorschach.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Tom Zanussi Cc: Mark Rutland Fixes: e0213434fe3e4 ("tracing: Do not let histogram values have some modifiers") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 486cca3c2b75..543cb7dc84ad 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4238,13 +4238,19 @@ static int __create_val_field(struct hist_trigger_data *hist_data, goto out; } - /* Some types cannot be a value */ - if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | - HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 | - HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET | - HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) { - hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str)); - ret = -EINVAL; + /* values and variables should not have some modifiers */ + if (hist_field->flags & HIST_FIELD_FL_VAR) { + /* Variable */ + if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | + HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2)) + goto err; + } else { + /* Value */ + if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | + HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 | + HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET | + HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) + goto err; } hist_data->fields[val_idx] = hist_field; @@ -4256,6 +4262,9 @@ static int __create_val_field(struct hist_trigger_data *hist_data, ret = -EINVAL; out: return ret; + err: + hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str)); + return -EINVAL; } static int create_val_field(struct hist_trigger_data *hist_data, From 4b512860bdbdddcf41467ebd394f27cb8dfb528c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 23 May 2023 23:09:13 -0400 Subject: [PATCH 113/271] tracing: Rename stacktrace field to common_stacktrace The histogram and synthetic events can use a pseudo event called "stacktrace" that will create a stacktrace at the time of the event and use it just like it was a normal field. We have other pseudo events such as "common_cpu" and "common_timestamp". To stay consistent with that, convert "stacktrace" to "common_stacktrace". As this was used in older kernels, to keep backward compatibility, this will act just like "common_cpu" did with "cpu". That is, "cpu" will be the same as "common_cpu" unless the event has a "cpu" field. In which case, the event's field is used. The same is true with "stacktrace". Also update the documentation to reflect this change. Link: https://lore.kernel.org/linux-trace-kernel/20230523230913.6860e28d@rorschach.local.home Cc: Masami Hiramatsu Cc: Tom Zanussi Cc: Mark Rutland Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/histogram.rst | 64 +++++++++++++++---------------- include/linux/trace_events.h | 1 + kernel/trace/trace.c | 2 +- kernel/trace/trace_events.c | 2 + kernel/trace/trace_events_hist.c | 16 +++++--- 5 files changed, 46 insertions(+), 39 deletions(-) diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index 479c9eac6335..3c9b263de9c2 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -35,7 +35,7 @@ Documentation written by Tom Zanussi in place of an explicit value field - this is simply a count of event hits. If 'values' isn't specified, an implicit 'hitcount' value will be automatically created and used as the only value. - Keys can be any field, or the special string 'stacktrace', which + Keys can be any field, or the special string 'common_stacktrace', which will use the event's kernel stacktrace as the key. The keywords 'keys' or 'key' can be used to specify keys, and the keywords 'values', 'vals', or 'val' can be used to specify values. Compound @@ -54,7 +54,7 @@ Documentation written by Tom Zanussi 'compatible' if the fields named in the trigger share the same number and type of fields and those fields also have the same names. Note that any two events always share the compatible 'hitcount' and - 'stacktrace' fields and can therefore be combined using those + 'common_stacktrace' fields and can therefore be combined using those fields, however pointless that may be. 'hist' triggers add a 'hist' file to each event's subdirectory. @@ -547,9 +547,9 @@ Extended error information the hist trigger display symbolic call_sites, we can have the hist trigger additionally display the complete set of kernel stack traces that led to each call_site. To do that, we simply use the special - value 'stacktrace' for the key parameter:: + value 'common_stacktrace' for the key parameter:: - # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \ + # echo 'hist:keys=common_stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \ /sys/kernel/tracing/events/kmem/kmalloc/trigger The above trigger will use the kernel stack trace in effect when an @@ -561,9 +561,9 @@ Extended error information every callpath to a kmalloc for a kernel compile):: # cat /sys/kernel/tracing/events/kmem/kmalloc/hist - # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active] + # trigger info: hist:keys=common_stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active] - { stacktrace: + { common_stacktrace: __kmalloc_track_caller+0x10b/0x1a0 kmemdup+0x20/0x50 hidraw_report_event+0x8a/0x120 [hid] @@ -581,7 +581,7 @@ Extended error information cpu_startup_entry+0x315/0x3e0 rest_init+0x7c/0x80 } hitcount: 3 bytes_req: 21 bytes_alloc: 24 - { stacktrace: + { common_stacktrace: __kmalloc_track_caller+0x10b/0x1a0 kmemdup+0x20/0x50 hidraw_report_event+0x8a/0x120 [hid] @@ -596,7 +596,7 @@ Extended error information do_IRQ+0x5a/0xf0 ret_from_intr+0x0/0x30 } hitcount: 3 bytes_req: 21 bytes_alloc: 24 - { stacktrace: + { common_stacktrace: kmem_cache_alloc_trace+0xeb/0x150 aa_alloc_task_context+0x27/0x40 apparmor_cred_prepare+0x1f/0x50 @@ -608,7 +608,7 @@ Extended error information . . . - { stacktrace: + { common_stacktrace: __kmalloc+0x11b/0x1b0 i915_gem_execbuffer2+0x6c/0x2c0 [i915] drm_ioctl+0x349/0x670 [drm] @@ -616,7 +616,7 @@ Extended error information SyS_ioctl+0x81/0xa0 system_call_fastpath+0x12/0x6a } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808 - { stacktrace: + { common_stacktrace: __kmalloc+0x11b/0x1b0 load_elf_phdrs+0x76/0xa0 load_elf_binary+0x102/0x1650 @@ -625,7 +625,7 @@ Extended error information SyS_execve+0x3a/0x50 return_from_execve+0x0/0x23 } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048 - { stacktrace: + { common_stacktrace: kmem_cache_alloc_trace+0xeb/0x150 apparmor_file_alloc_security+0x27/0x40 security_file_alloc+0x16/0x20 @@ -636,7 +636,7 @@ Extended error information SyS_open+0x1e/0x20 system_call_fastpath+0x12/0x6a } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376 - { stacktrace: + { common_stacktrace: __kmalloc+0x11b/0x1b0 seq_buf_alloc+0x1b/0x50 seq_read+0x2cc/0x370 @@ -1026,7 +1026,7 @@ Extended error information First we set up an initially paused stacktrace trigger on the netif_receive_skb event:: - # echo 'hist:key=stacktrace:vals=len:pause' > \ + # echo 'hist:key=common_stacktrace:vals=len:pause' > \ /sys/kernel/tracing/events/net/netif_receive_skb/trigger Next, we set up an 'enable_hist' trigger on the sched_process_exec @@ -1060,9 +1060,9 @@ Extended error information $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz # cat /sys/kernel/tracing/events/net/netif_receive_skb/hist - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] + # trigger info: hist:keys=common_stacktrace:vals=len:sort=hitcount:size=2048 [paused] - { stacktrace: + { common_stacktrace: __netif_receive_skb_core+0x46d/0x990 __netif_receive_skb+0x18/0x60 netif_receive_skb_internal+0x23/0x90 @@ -1079,7 +1079,7 @@ Extended error information kthread+0xd2/0xf0 ret_from_fork+0x42/0x70 } hitcount: 85 len: 28884 - { stacktrace: + { common_stacktrace: __netif_receive_skb_core+0x46d/0x990 __netif_receive_skb+0x18/0x60 netif_receive_skb_internal+0x23/0x90 @@ -1097,7 +1097,7 @@ Extended error information irq_thread+0x11f/0x150 kthread+0xd2/0xf0 } hitcount: 98 len: 664329 - { stacktrace: + { common_stacktrace: __netif_receive_skb_core+0x46d/0x990 __netif_receive_skb+0x18/0x60 process_backlog+0xa8/0x150 @@ -1115,7 +1115,7 @@ Extended error information inet_sendmsg+0x64/0xa0 sock_sendmsg+0x3d/0x50 } hitcount: 115 len: 13030 - { stacktrace: + { common_stacktrace: __netif_receive_skb_core+0x46d/0x990 __netif_receive_skb+0x18/0x60 netif_receive_skb_internal+0x23/0x90 @@ -1142,14 +1142,14 @@ Extended error information into the histogram. In order to avoid having to set everything up again, we can just clear the histogram first:: - # echo 'hist:key=stacktrace:vals=len:clear' >> \ + # echo 'hist:key=common_stacktrace:vals=len:clear' >> \ /sys/kernel/tracing/events/net/netif_receive_skb/trigger Just to verify that it is in fact cleared, here's what we now see in the hist file:: # cat /sys/kernel/tracing/events/net/netif_receive_skb/hist - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] + # trigger info: hist:keys=common_stacktrace:vals=len:sort=hitcount:size=2048 [paused] Totals: Hits: 0 @@ -1485,12 +1485,12 @@ Extended error information And here's an example that shows how to combine histogram data from any two events even if they don't share any 'compatible' fields - other than 'hitcount' and 'stacktrace'. These commands create a + other than 'hitcount' and 'common_stacktrace'. These commands create a couple of triggers named 'bar' using those fields:: - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ + # echo 'hist:name=bar:key=common_stacktrace:val=hitcount' > \ /sys/kernel/tracing/events/sched/sched_process_fork/trigger - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ + # echo 'hist:name=bar:key=common_stacktrace:val=hitcount' > \ /sys/kernel/tracing/events/net/netif_rx/trigger And displaying the output of either shows some interesting if @@ -1501,16 +1501,16 @@ Extended error information # event histogram # - # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active] + # trigger info: hist:name=bar:keys=common_stacktrace:vals=hitcount:sort=hitcount:size=2048 [active] # - { stacktrace: + { common_stacktrace: kernel_clone+0x18e/0x330 kernel_thread+0x29/0x30 kthreadd+0x154/0x1b0 ret_from_fork+0x3f/0x70 } hitcount: 1 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx_ni+0x20/0x70 dev_loopback_xmit+0xaa/0xd0 @@ -1528,7 +1528,7 @@ Extended error information call_cpuidle+0x3b/0x60 cpu_startup_entry+0x22d/0x310 } hitcount: 1 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx_ni+0x20/0x70 dev_loopback_xmit+0xaa/0xd0 @@ -1543,7 +1543,7 @@ Extended error information SyS_sendto+0xe/0x10 entry_SYSCALL_64_fastpath+0x12/0x6a } hitcount: 2 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx+0x1c/0x60 loopback_xmit+0x6c/0xb0 @@ -1561,7 +1561,7 @@ Extended error information sock_sendmsg+0x38/0x50 ___sys_sendmsg+0x14e/0x270 } hitcount: 76 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx+0x1c/0x60 loopback_xmit+0x6c/0xb0 @@ -1579,7 +1579,7 @@ Extended error information sock_sendmsg+0x38/0x50 ___sys_sendmsg+0x269/0x270 } hitcount: 77 - { stacktrace: + { common_stacktrace: netif_rx_internal+0xb2/0xd0 netif_rx+0x1c/0x60 loopback_xmit+0x6c/0xb0 @@ -1597,7 +1597,7 @@ Extended error information sock_sendmsg+0x38/0x50 SYSC_sendto+0xef/0x170 } hitcount: 88 - { stacktrace: + { common_stacktrace: kernel_clone+0x18e/0x330 SyS_clone+0x19/0x20 entry_SYSCALL_64_fastpath+0x12/0x6a @@ -1949,7 +1949,7 @@ uninterruptible state:: # cd /sys/kernel/tracing # echo 's:block_lat pid_t pid; u64 delta; unsigned long[] stack;' > dynamic_events - # echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=stacktrace if prev_state == 2' >> events/sched/sched_switch/trigger + # echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=common_stacktrace if prev_state == 2' >> events/sched/sched_switch/trigger # echo 'hist:keys=prev_pid:delta=common_timestamp.usecs-$ts,s=$st:onmax($delta).trace(block_lat,prev_pid,$delta,$s)' >> events/sched/sched_switch/trigger # echo 1 > events/synthetic/block_lat/enable # cat trace diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 0e373222a6df..7c4a0b72334e 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -806,6 +806,7 @@ enum { FILTER_TRACE_FN, FILTER_COMM, FILTER_CPU, + FILTER_STACKTRACE, }; extern int trace_event_raw_init(struct trace_event_call *call); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebc59781456a..81801dc31784 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5752,7 +5752,7 @@ static const char readme_msg[] = "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" "\t correspond to fields in the event's format description. Keys\n" - "\t can be any field, or the special string 'stacktrace'.\n" + "\t can be any field, or the special string 'common_stacktrace'.\n" "\t Compound keys consisting of up to two fields can be specified\n" "\t by the 'keys' keyword. Values must correspond to numeric\n" "\t fields. Sort keys consisting of up to two fields can be\n" diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 654ffa40457a..57e539d47989 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -194,6 +194,8 @@ static int trace_define_generic_fields(void) __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); + __generic_field(char *, stacktrace, FILTER_STACKTRACE); + __generic_field(char *, STACKTRACE, FILTER_STACKTRACE); return ret; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 543cb7dc84ad..b97d3ad832f1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1364,7 +1364,7 @@ static const char *hist_field_name(struct hist_field *field, if (field->field) field_name = field->field->name; else - field_name = "stacktrace"; + field_name = "common_stacktrace"; } else if (field->flags & HIST_FIELD_FL_HITCOUNT) field_name = "hitcount"; @@ -2367,7 +2367,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->enable_timestamps = true; if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) hist_data->attrs->ts_in_usecs = true; - } else if (strcmp(field_name, "stacktrace") == 0) { + } else if (strcmp(field_name, "common_stacktrace") == 0) { *flags |= HIST_FIELD_FL_STACKTRACE; } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; @@ -2378,11 +2378,15 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, if (!field || !field->size) { /* * For backward compatibility, if field_name - * was "cpu", then we treat this the same as - * common_cpu. This also works for "CPU". + * was "cpu" or "stacktrace", then we treat this + * the same as common_cpu and common_stacktrace + * respectively. This also works for "CPU", and + * "STACKTRACE". */ if (field && field->filter_type == FILTER_CPU) { *flags |= HIST_FIELD_FL_CPU; + } else if (field && field->filter_type == FILTER_STACKTRACE) { + *flags |= HIST_FIELD_FL_STACKTRACE; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); @@ -5394,7 +5398,7 @@ static void hist_trigger_print_key(struct seq_file *m, if (key_field->field) seq_printf(m, "%s.stacktrace", key_field->field->name); else - seq_puts(m, "stacktrace:\n"); + seq_puts(m, "common_stacktrace:\n"); hist_trigger_stacktrace_print(m, key + key_field->offset, HIST_STACKTRACE_DEPTH); @@ -5977,7 +5981,7 @@ static int event_hist_trigger_print(struct seq_file *m, if (field->field) seq_printf(m, "%s.stacktrace", field->field->name); else - seq_puts(m, "stacktrace"); + seq_puts(m, "common_stacktrace"); } else hist_field_print(m, field); } From f1aab363438cd1c55a3c5420db2a17c0e29ef625 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 23 May 2023 22:54:29 -0400 Subject: [PATCH 114/271] tracing/selftests: Update synthetic event selftest to use common_stacktrace With the rename of the stacktrace field to common_stacktrace, update the selftests to reflect this change. Copy the current selftest to test the backward compatibility "stacktrace" keyword. Also the "requires" of that test was incorrect, so it would never actually ran before. That is fixed now. Link: https://lore.kernel.org/linux-trace-kernel/20230523225402.55951f2f@rorschach.local.home Cc: Masami Hiramatsu Cc: Tom Zanussi Cc: Mark Rutland Cc: Shuah Khan Cc: Shuah Khan Signed-off-by: Steven Rostedt (Google) --- .../trigger-synthetic-event-stack-legacy.tc | 24 +++++++++++++++++++ .../trigger-synthetic-event-stack.tc | 5 ++-- 2 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack-legacy.tc diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack-legacy.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack-legacy.tc new file mode 100644 index 000000000000..d0cd91a93069 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack-legacy.tc @@ -0,0 +1,24 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: event trigger - test inter-event histogram trigger trace action with dynamic string param (legacy stack) +# requires: set_event synthetic_events events/sched/sched_process_exec/hist "long[] stack' >> synthetic_events":README + +fail() { #msg + echo $1 + exit_fail +} + +echo "Test create synthetic event with stack" + +# Test the old stacktrace keyword (for backward compatibility) +echo 's:wake_lat pid_t pid; u64 delta; unsigned long[] stack;' > dynamic_events +echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=stacktrace if prev_state == 1||prev_state == 2' >> events/sched/sched_switch/trigger +echo 'hist:keys=prev_pid:delta=common_timestamp.usecs-$ts,s=$st:onmax($delta).trace(wake_lat,prev_pid,$delta,$s)' >> events/sched/sched_switch/trigger +echo 1 > events/synthetic/wake_lat/enable +sleep 1 + +if ! grep -q "=>.*sched" trace; then + fail "Failed to create synthetic event with stack" +fi + +exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc index 755dbe94ccf4..8f1cc9a86a06 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: event trigger - test inter-event histogram trigger trace action with dynamic string param -# requires: set_event synthetic_events events/sched/sched_process_exec/hist "long[]' >> synthetic_events":README +# requires: set_event synthetic_events events/sched/sched_process_exec/hist "can be any field, or the special string 'common_stacktrace'":README fail() { #msg echo $1 @@ -10,9 +10,8 @@ fail() { #msg echo "Test create synthetic event with stack" - echo 's:wake_lat pid_t pid; u64 delta; unsigned long[] stack;' > dynamic_events -echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=stacktrace if prev_state == 1||prev_state == 2' >> events/sched/sched_switch/trigger +echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=common_stacktrace if prev_state == 1||prev_state == 2' >> events/sched/sched_switch/trigger echo 'hist:keys=prev_pid:delta=common_timestamp.usecs-$ts,s=$st:onmax($delta).trace(wake_lat,prev_pid,$delta,$s)' >> events/sched/sched_switch/trigger echo 1 > events/synthetic/wake_lat/enable sleep 1 From a1a5f2c887252dec161c1e12e04303ca9ba56fa9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 23 May 2023 21:53:10 -0700 Subject: [PATCH 115/271] dmaengine: pl330: rename _start to prevent build error "_start" is used in several arches and proably should be reserved for ARCH usage. Using it in a driver for a private symbol can cause a build error when it conflicts with ARCH usage of the same symbol. Therefore rename pl330's "_start" to "pl330_start_thread" so that there is no conflict and no build error. drivers/dma/pl330.c:1053:13: error: '_start' redeclared as different kind of symbol 1053 | static bool _start(struct pl330_thread *thrd) | ^~~~~~ In file included from ../include/linux/interrupt.h:21, from ../drivers/dma/pl330.c:18: arch/riscv/include/asm/sections.h:11:13: note: previous declaration of '_start' with type 'char[]' 11 | extern char _start[]; | ^~~~~~ Fixes: b7d861d93945 ("DMA: PL330: Merge PL330 driver into drivers/dma/") Fixes: ae43b3289186 ("ARM: 8202/1: dmaengine: pl330: Add runtime Power Management support v12") Signed-off-by: Randy Dunlap Cc: Jaswinder Singh Cc: Boojin Kim Cc: Krzysztof Kozlowski Cc: Russell King Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Cc: linux-riscv@lists.infradead.org Link: https://lore.kernel.org/r/20230524045310.27923-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- drivers/dma/pl330.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index 0d9257fbdfb0..b4731fe6bbc1 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -1050,7 +1050,7 @@ static bool _trigger(struct pl330_thread *thrd) return true; } -static bool _start(struct pl330_thread *thrd) +static bool pl330_start_thread(struct pl330_thread *thrd) { switch (_state(thrd)) { case PL330_STATE_FAULT_COMPLETING: @@ -1702,7 +1702,7 @@ static int pl330_update(struct pl330_dmac *pl330) thrd->req_running = -1; /* Get going again ASAP */ - _start(thrd); + pl330_start_thread(thrd); /* For now, just make a list of callbacks to be done */ list_add_tail(&descdone->rqd, &pl330->req_done); @@ -2089,7 +2089,7 @@ static void pl330_tasklet(struct tasklet_struct *t) } else { /* Make sure the PL330 Channel thread is active */ spin_lock(&pch->thread->dmac->lock); - _start(pch->thread); + pl330_start_thread(pch->thread); spin_unlock(&pch->thread->dmac->lock); } @@ -2107,7 +2107,7 @@ static void pl330_tasklet(struct tasklet_struct *t) if (power_down) { pch->active = true; spin_lock(&pch->thread->dmac->lock); - _start(pch->thread); + pl330_start_thread(pch->thread); spin_unlock(&pch->thread->dmac->lock); power_down = false; } From 2a6c7e8cc74e58ba94b8c897035a8ef7f7349f76 Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Tue, 23 May 2023 19:20:37 +0200 Subject: [PATCH 116/271] dmaengine: at_hdmac: Repair bitfield macros for peripheral ID handling The MSB part of the peripheral IDs need to go into the ATC_SRC_PER_MSB and ATC_DST_PER_MSB fields. Not the LSB part. This fixes a severe regression for TSE-850 devices (compatible axentia,tse850v3) where output to the audio I2S codec (the main purpose of the device) simply do not work. Fixes: d8840a7edcf0 ("dmaengine: at_hdmac: Use bitfield access macros") Cc: stable@vger.kernel.org Signed-off-by: Peter Rosin Reviewed-by: Tudor Ambarus Link: https://lore.kernel.org/r/01e5dae1-d4b0-cf31-516b-423b11b077f1@axentia.se Signed-off-by: Vinod Koul --- drivers/dma/at_hdmac.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index 8858470246e1..6362013b90df 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -153,8 +153,6 @@ #define ATC_AUTO BIT(31) /* Auto multiple buffer tx enable */ /* Bitfields in CFG */ -#define ATC_PER_MSB(h) ((0x30U & (h)) >> 4) /* Extract most significant bits of a handshaking identifier */ - #define ATC_SRC_PER GENMASK(3, 0) /* Channel src rq associated with periph handshaking ifc h */ #define ATC_DST_PER GENMASK(7, 4) /* Channel dst rq associated with periph handshaking ifc h */ #define ATC_SRC_REP BIT(8) /* Source Replay Mod */ @@ -181,10 +179,15 @@ #define ATC_DPIP_HOLE GENMASK(15, 0) #define ATC_DPIP_BOUNDARY GENMASK(25, 16) -#define ATC_SRC_PER_ID(id) (FIELD_PREP(ATC_SRC_PER_MSB, (id)) | \ - FIELD_PREP(ATC_SRC_PER, (id))) -#define ATC_DST_PER_ID(id) (FIELD_PREP(ATC_DST_PER_MSB, (id)) | \ - FIELD_PREP(ATC_DST_PER, (id))) +#define ATC_PER_MSB GENMASK(5, 4) /* Extract MSBs of a handshaking identifier */ +#define ATC_SRC_PER_ID(id) \ + ({ typeof(id) _id = (id); \ + FIELD_PREP(ATC_SRC_PER_MSB, FIELD_GET(ATC_PER_MSB, _id)) | \ + FIELD_PREP(ATC_SRC_PER, _id); }) +#define ATC_DST_PER_ID(id) \ + ({ typeof(id) _id = (id); \ + FIELD_PREP(ATC_DST_PER_MSB, FIELD_GET(ATC_PER_MSB, _id)) | \ + FIELD_PREP(ATC_DST_PER, _id); }) From e14fd2af7a1d621c167dad761f729135a7a76ff4 Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Tue, 23 May 2023 19:20:47 +0200 Subject: [PATCH 117/271] dmaengine: at_hdmac: Extend the Flow Controller bitfield to three bits Some chips have two bits (e.g SAMA5D3), and some have three (e.g. SAM9G45). A field width of three is compatible as long as valid values are used for the different chips. There is no current use of any value needing three bits, so the fixed bug is relatively benign. Fixes: d8840a7edcf0 ("dmaengine: at_hdmac: Use bitfield access macros") Cc: stable@vger.kernel.org Reviewed-by: Tudor Ambarus Signed-off-by: Peter Rosin Link: https://lore.kernel.org/r/e2c898ba-c3a3-5dd3-384b-0585661c79f2@axentia.se Signed-off-by: Vinod Koul --- drivers/dma/at_hdmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index 6362013b90df..ee3a219e3a89 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -132,7 +132,7 @@ #define ATC_DST_PIP BIT(12) /* Destination Picture-in-Picture enabled */ #define ATC_SRC_DSCR_DIS BIT(16) /* Src Descriptor fetch disable */ #define ATC_DST_DSCR_DIS BIT(20) /* Dst Descriptor fetch disable */ -#define ATC_FC GENMASK(22, 21) /* Choose Flow Controller */ +#define ATC_FC GENMASK(23, 21) /* Choose Flow Controller */ #define ATC_FC_MEM2MEM 0x0 /* Mem-to-Mem (DMA) */ #define ATC_FC_MEM2PER 0x1 /* Mem-to-Periph (DMA) */ #define ATC_FC_PER2MEM 0x2 /* Periph-to-Mem (DMA) */ From 6ab39f99927eed605728b02d512438d828183c97 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 May 2023 20:14:19 +0200 Subject: [PATCH 118/271] crypto: x86/aria - Use 16 byte alignment for GFNI constant vectors The GFNI routines in the AVX version of the ARIA implementation now use explicit VMOVDQA instructions to load the constant input vectors, which means they must be 16 byte aligned. So ensure that this is the case, by dropping the section split and the incorrect .align 8 directive, and emitting the constants into the 16-byte aligned section instead. Note that the AVX2 version of this code deviates from this pattern, and does not require a similar fix, given that it loads these contants as 8-byte memory operands, for which AVX2 permits any alignment. Cc: Taehee Yoo Fixes: 8b84475318641c2b ("crypto: x86/aria-avx - Do not use avx2 instructions") Reported-by: syzbot+a6abcf08bad8b18fd198@syzkaller.appspotmail.com Tested-by: syzbot+a6abcf08bad8b18fd198@syzkaller.appspotmail.com Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/aria-aesni-avx-asm_64.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S index 7c1abc513f34..9556dacd9841 100644 --- a/arch/x86/crypto/aria-aesni-avx-asm_64.S +++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S @@ -773,8 +773,6 @@ .octa 0x3F893781E95FE1576CDA64D2BA0CB204 #ifdef CONFIG_AS_GFNI -.section .rodata.cst8, "aM", @progbits, 8 -.align 8 /* AES affine: */ #define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0) .Ltf_aff_bitmatrix: From 95856d1f3c223c015780fffb8373a827fc4efd2e Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 23 May 2023 16:47:47 +0100 Subject: [PATCH 119/271] regmap: sdw: check for invalid multi-register writes config SoundWire code as it is only supports Bulk register writes and it does not support multi-register writes. Any drivers that set can_multi_write and use regmap_multi_reg_write() will easily endup with programming the hardware incorrectly without any errors. So, add this check in bus code to be able to validate the drivers config. Fixes: 522272047dc6 ("regmap: sdw: Remove 8-bit value size restriction") Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20230523154747.5429-1-srinivas.kandagatla@linaro.org Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-sdw.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/base/regmap/regmap-sdw.c b/drivers/base/regmap/regmap-sdw.c index 09899ae99fc1..159c0b740b00 100644 --- a/drivers/base/regmap/regmap-sdw.c +++ b/drivers/base/regmap/regmap-sdw.c @@ -59,6 +59,10 @@ static int regmap_sdw_config_check(const struct regmap_config *config) if (config->pad_bits != 0) return -ENOTSUPP; + /* Only bulk writes are supported not multi-register writes */ + if (config->can_multi_write) + return -ENOTSUPP; + return 0; } From 0cc6578048e0980d254aee345130cced4912f723 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 23 May 2023 23:18:19 +0100 Subject: [PATCH 120/271] regmap: maple: Drop the RCU read lock while syncing registers Unfortunately the maple tree requires us to explicitly lock it so we need to take the RCU read lock while iterating. When syncing this means that we end up trying to write out register values while holding the RCU read lock which triggers lockdep issues since that is an atomic context but most buses can't be used in atomic context. Pause the iteration and drop the lock for each register we check to avoid this. Reported-by: Pierre-Louis Bossart Tested-by: Pierre-Louis Bossart Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20230523-regcache-maple-sync-lock-v1-1-530e4d68dfab@kernel.org Signed-off-by: Mark Brown --- drivers/base/regmap/regcache-maple.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/base/regmap/regcache-maple.c b/drivers/base/regmap/regcache-maple.c index 9b1b559107ef..c2e3a0f6c218 100644 --- a/drivers/base/regmap/regcache-maple.c +++ b/drivers/base/regmap/regcache-maple.c @@ -203,15 +203,18 @@ static int regcache_maple_sync(struct regmap *map, unsigned int min, mas_for_each(&mas, entry, max) { for (r = max(mas.index, lmin); r <= min(mas.last, lmax); r++) { + mas_pause(&mas); + rcu_read_unlock(); ret = regcache_sync_val(map, r, entry[r - mas.index]); if (ret != 0) goto out; + rcu_read_lock(); } } -out: rcu_read_unlock(); +out: map->cache_bypass = false; return ret; From 15e64ef6520ea8702998db05b87fa5c3d3d40710 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 22 May 2023 15:11:16 +0200 Subject: [PATCH 121/271] parisc: Add lightweight spinlock checks Add a lightweight spinlock check which uses only two instructions per spinlock call. It detects if a spinlock has been trashed by some memory corruption and then halts the kernel. It will not detect uninitialized spinlocks, for which CONFIG_DEBUG_SPINLOCK needs to be enabled. This lightweight spinlock check shouldn't influence runtime, so it's safe to enable it by default. The __ARCH_SPIN_LOCK_UNLOCKED_VAL constant has been choosen small enough to be able to be loaded by one LDI assembler statement. Signed-off-by: Helge Deller --- arch/parisc/Kconfig.debug | 11 +++++++ arch/parisc/include/asm/spinlock.h | 39 +++++++++++++++++++++--- arch/parisc/include/asm/spinlock_types.h | 8 +++-- arch/parisc/kernel/traps.c | 10 ++++++ 4 files changed, 61 insertions(+), 7 deletions(-) diff --git a/arch/parisc/Kconfig.debug b/arch/parisc/Kconfig.debug index f66554cd5c45..3a059cb5e112 100644 --- a/arch/parisc/Kconfig.debug +++ b/arch/parisc/Kconfig.debug @@ -1 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 +# +config LIGHTWEIGHT_SPINLOCK_CHECK + bool "Enable lightweight spinlock checks" + depends on SMP && !DEBUG_SPINLOCK + default y + help + Add checks with low performance impact to the spinlock functions + to catch memory overwrites at runtime. For more advanced + spinlock debugging you should choose the DEBUG_SPINLOCK option + which will detect unitialized spinlocks too. + If unsure say Y here. diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index a6e5d66a7656..edfcb9858bcb 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -7,10 +7,26 @@ #include #include +#define SPINLOCK_BREAK_INSN 0x0000c006 /* break 6,6 */ + +static inline void arch_spin_val_check(int lock_val) +{ + if (IS_ENABLED(CONFIG_LIGHTWEIGHT_SPINLOCK_CHECK)) + asm volatile( "andcm,= %0,%1,%%r0\n" + ".word %2\n" + : : "r" (lock_val), "r" (__ARCH_SPIN_LOCK_UNLOCKED_VAL), + "i" (SPINLOCK_BREAK_INSN)); +} + static inline int arch_spin_is_locked(arch_spinlock_t *x) { - volatile unsigned int *a = __ldcw_align(x); - return READ_ONCE(*a) == 0; + volatile unsigned int *a; + int lock_val; + + a = __ldcw_align(x); + lock_val = READ_ONCE(*a); + arch_spin_val_check(lock_val); + return (lock_val == 0); } static inline void arch_spin_lock(arch_spinlock_t *x) @@ -18,9 +34,18 @@ static inline void arch_spin_lock(arch_spinlock_t *x) volatile unsigned int *a; a = __ldcw_align(x); - while (__ldcw(a) == 0) + do { + int lock_val_old; + + lock_val_old = __ldcw(a); + arch_spin_val_check(lock_val_old); + if (lock_val_old) + return; /* got lock */ + + /* wait until we should try to get lock again */ while (*a == 0) continue; + } while (1); } static inline void arch_spin_unlock(arch_spinlock_t *x) @@ -29,15 +54,19 @@ static inline void arch_spin_unlock(arch_spinlock_t *x) a = __ldcw_align(x); /* Release with ordered store. */ - __asm__ __volatile__("stw,ma %0,0(%1)" : : "r"(1), "r"(a) : "memory"); + __asm__ __volatile__("stw,ma %0,0(%1)" + : : "r"(__ARCH_SPIN_LOCK_UNLOCKED_VAL), "r"(a) : "memory"); } static inline int arch_spin_trylock(arch_spinlock_t *x) { volatile unsigned int *a; + int lock_val; a = __ldcw_align(x); - return __ldcw(a) != 0; + lock_val = __ldcw(a); + arch_spin_val_check(lock_val); + return lock_val != 0; } /* diff --git a/arch/parisc/include/asm/spinlock_types.h b/arch/parisc/include/asm/spinlock_types.h index ca39ee350c3f..d65934079ebd 100644 --- a/arch/parisc/include/asm/spinlock_types.h +++ b/arch/parisc/include/asm/spinlock_types.h @@ -2,13 +2,17 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H +#define __ARCH_SPIN_LOCK_UNLOCKED_VAL 0x1a46 + typedef struct { #ifdef CONFIG_PA20 volatile unsigned int slock; -# define __ARCH_SPIN_LOCK_UNLOCKED { 1 } +# define __ARCH_SPIN_LOCK_UNLOCKED { __ARCH_SPIN_LOCK_UNLOCKED_VAL } #else volatile unsigned int lock[4]; -# define __ARCH_SPIN_LOCK_UNLOCKED { { 1, 1, 1, 1 } } +# define __ARCH_SPIN_LOCK_UNLOCKED \ + { { __ARCH_SPIN_LOCK_UNLOCKED_VAL, __ARCH_SPIN_LOCK_UNLOCKED_VAL, \ + __ARCH_SPIN_LOCK_UNLOCKED_VAL, __ARCH_SPIN_LOCK_UNLOCKED_VAL } } #endif } arch_spinlock_t; diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index f9696fbf646c..13df6169f9e2 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -47,6 +47,10 @@ #include #include +#if defined(CONFIG_LIGHTWEIGHT_SPINLOCK_CHECK) +#include +#endif + #include "../math-emu/math-emu.h" /* for handle_fpe() */ static void parisc_show_stack(struct task_struct *task, @@ -309,6 +313,12 @@ static void handle_break(struct pt_regs *regs) } #endif +#ifdef CONFIG_LIGHTWEIGHT_SPINLOCK_CHECK + if ((iir == SPINLOCK_BREAK_INSN) && !user_mode(regs)) { + die_if_kernel("Spinlock was trashed", regs, 1); + } +#endif + if (unlikely(iir != GDB_BREAK_INSN)) parisc_printk_ratelimited(0, regs, KERN_DEBUG "break %d,%d: pid=%d command='%s'\n", From adf8e96a7ea670d45b5de7594acc67e8f4787ae6 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 23 May 2023 09:06:40 +0200 Subject: [PATCH 122/271] parisc: Enable LOCKDEP support Cc: # v6.0+ Signed-off-by: Helge Deller --- arch/parisc/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 466a25525364..967bde65dd0e 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -130,6 +130,10 @@ config PM config STACKTRACE_SUPPORT def_bool y +config LOCKDEP_SUPPORT + bool + default y + config ISA_DMA_API bool From 2028315cf59bb899a5ac7e87dc48ecb8fac7ac24 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 22 May 2023 22:57:30 +0200 Subject: [PATCH 123/271] parisc: Allow to reboot machine after system halt In case a machine can't power-off itself on system shutdown, allow the user to reboot it by pressing the RETURN key. Cc: # v4.14+ Signed-off-by: Helge Deller --- arch/parisc/kernel/process.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 97c6f875bd0e..24411ab79c30 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -122,13 +122,18 @@ void machine_power_off(void) /* It seems we have no way to power the system off via * software. The user has to press the button himself. */ - printk(KERN_EMERG "System shut down completed.\n" - "Please power this system off now."); + printk("Power off or press RETURN to reboot.\n"); /* prevent soft lockup/stalled CPU messages for endless loop. */ rcu_sysrq_start(); lockup_detector_soft_poweroff(); - for (;;); + while (1) { + /* reboot if user presses RETURN key */ + if (pdc_iodc_getc() == 13) { + printk("Rebooting...\n"); + machine_restart(NULL); + } + } } void (*pm_power_off)(void); From df419492e428b6a2bce98d0f613c58a13da6666c Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 24 May 2023 14:31:14 +0200 Subject: [PATCH 124/271] parisc: Handle kprobes breakpoints only in kernel context The kernel kprobes break instructions should only be handled when running in kernel context. Cc: # v5.18+ Signed-off-by: Helge Deller --- arch/parisc/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 13df6169f9e2..4d5f7b167a65 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -295,11 +295,11 @@ static void handle_break(struct pt_regs *regs) } #ifdef CONFIG_KPROBES - if (unlikely(iir == PARISC_KPROBES_BREAK_INSN)) { + if (unlikely(iir == PARISC_KPROBES_BREAK_INSN && !user_mode(regs))) { parisc_kprobe_break_handler(regs); return; } - if (unlikely(iir == PARISC_KPROBES_BREAK_INSN2)) { + if (unlikely(iir == PARISC_KPROBES_BREAK_INSN2 && !user_mode(regs))) { parisc_kprobe_ss_handler(regs); return; } From 6888ff04e37d01295620a73f3f7efbc79f6ef152 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 24 May 2023 14:34:58 +0200 Subject: [PATCH 125/271] parisc: Handle kgdb breakpoints only in kernel context The kernel kgdb break instructions should only be handled when running in kernel context. Cc: # v5.4+ Signed-off-by: Helge Deller --- arch/parisc/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 4d5f7b167a65..304eebd1c83e 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -306,8 +306,8 @@ static void handle_break(struct pt_regs *regs) #endif #ifdef CONFIG_KGDB - if (unlikely(iir == PARISC_KGDB_COMPILED_BREAK_INSN || - iir == PARISC_KGDB_BREAK_INSN)) { + if (unlikely((iir == PARISC_KGDB_COMPILED_BREAK_INSN || + iir == PARISC_KGDB_BREAK_INSN)) && !user_mode(regs)) { kgdb_handle_exception(9, SIGTRAP, 0, regs); return; } From 3eb96946f0be6bf447cbdf219aba22bc42672f92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 08:05:38 +0200 Subject: [PATCH 126/271] block: make bio_check_eod work for zero sized devices Since the dawn of time bio_check_eod has a check for a non-zero size of the device. This doesn't really make any sense as we never want to send I/O to a device that's been set to zero size, or never moved out of that. I am a bit surprised we haven't caught this for a long time, but the removal of the extra validation inside of zram caused syzbot to trip over this issue recently. I've added a Fixes tag for that commit, but the issue really goes back way before git history. Fixes: 9fe95babc742 ("zram: remove valid_io_request") Reported-by: syzbot+b8d61a58b7c7ebd2c8e0@syzkaller.appspotmail.com Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230524060538.1593686-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 00c74330fa92..1da77e7d6289 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -520,7 +520,7 @@ static inline int bio_check_eod(struct bio *bio) sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); unsigned int nr_sectors = bio_sectors(bio); - if (nr_sectors && maxsector && + if (nr_sectors && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { pr_info_ratelimited("%s: attempt to access beyond end of device\n" From b6ebaa8100090092aa602530d7e8316816d0c98d Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Wed, 26 Apr 2023 17:40:05 +0100 Subject: [PATCH 127/271] xen/blkfront: Only check REQ_FUA for writes The existing code silently converts read operations with the REQ_FUA bit set into write-barrier operations. This results in data loss as the backend scribbles zeroes over the data instead of returning it. While the REQ_FUA bit doesn't make sense on a read operation, at least one well-known out-of-tree kernel module does set it and since it results in data loss, let's be safe here and only look at REQ_FUA for writes. Signed-off-by: Ross Lagerwall Acked-by: Juergen Gross Link: https://lore.kernel.org/r/20230426164005.2213139-1-ross.lagerwall@citrix.com Signed-off-by: Juergen Gross --- drivers/block/xen-blkfront.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 23ed258b57f0..c1890c8a9f6e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -780,7 +780,8 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri ring_req->u.rw.handle = info->handle; ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ; - if (req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA) { + if (req_op(req) == REQ_OP_FLUSH || + (req_op(req) == REQ_OP_WRITE && (req->cmd_flags & REQ_FUA))) { /* * Ideally we can do an unordered flush-to-disk. * In case the backend onlysupports barriers, use that. From 8fafac202d18230bb9926bda48e563fd2cce2a4f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 3 May 2023 18:11:35 +0300 Subject: [PATCH 128/271] xen/pvcalls-back: fix double frees with pvcalls_new_active_socket() In the pvcalls_new_active_socket() function, most error paths call pvcalls_back_release_active(fedata->dev, fedata, map) which calls sock_release() on "sock". The bug is that the caller also frees sock. Fix this by making every error path in pvcalls_new_active_socket() release the sock, and don't free it in the caller. Fixes: 5db4d286a8ef ("xen/pvcalls: implement connect command") Signed-off-by: Dan Carpenter Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/e5f98dc2-0305-491f-a860-71bbd1398a2f@kili.mountain Signed-off-by: Juergen Gross --- drivers/xen/pvcalls-back.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c index 1f5219e12cc3..7beaf2c41fbb 100644 --- a/drivers/xen/pvcalls-back.c +++ b/drivers/xen/pvcalls-back.c @@ -325,8 +325,10 @@ static struct sock_mapping *pvcalls_new_active_socket( void *page; map = kzalloc(sizeof(*map), GFP_KERNEL); - if (map == NULL) + if (map == NULL) { + sock_release(sock); return NULL; + } map->fedata = fedata; map->sock = sock; @@ -418,10 +420,8 @@ static int pvcalls_back_connect(struct xenbus_device *dev, req->u.connect.ref, req->u.connect.evtchn, sock); - if (!map) { + if (!map) ret = -EFAULT; - sock_release(sock); - } out: rsp = RING_GET_RESPONSE(&fedata->ring, fedata->ring.rsp_prod_pvt++); @@ -561,7 +561,6 @@ static void __pvcalls_back_accept(struct work_struct *work) sock); if (!map) { ret = -EFAULT; - sock_release(sock); goto out_error; } From 335b4223466dd75f9f3ea4918187afbadd22e5c8 Mon Sep 17 00:00:00 2001 From: Maximilian Heyne Date: Wed, 3 May 2023 13:16:53 +0000 Subject: [PATCH 129/271] x86/pci/xen: populate MSI sysfs entries Commit bf5e758f02fc ("genirq/msi: Simplify sysfs handling") reworked the creation of sysfs entries for MSI IRQs. The creation used to be in msi_domain_alloc_irqs_descs_locked after calling ops->domain_alloc_irqs. Then it moved into __msi_domain_alloc_irqs which is an implementation of domain_alloc_irqs. However, Xen comes with the only other implementation of domain_alloc_irqs and hence doesn't run the sysfs population code anymore. Commit 6c796996ee70 ("x86/pci/xen: Fixup fallout from the PCI/MSI overhaul") set the flag MSI_FLAG_DEV_SYSFS for the xen msi_domain_info but that doesn't actually have an effect because Xen uses it's own domain_alloc_irqs implementation. Fix this by making use of the fallback functions for sysfs population. Fixes: bf5e758f02fc ("genirq/msi: Simplify sysfs handling") Signed-off-by: Maximilian Heyne Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20230503131656.15928-1-mheyne@amazon.de Signed-off-by: Juergen Gross --- arch/x86/pci/xen.c | 8 +++++--- include/linux/msi.h | 9 ++++++++- kernel/irq/msi.c | 4 ++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 8babce71915f..014c508e914d 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -198,7 +198,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) i++; } kfree(v); - return 0; + return msi_device_populate_sysfs(&dev->dev); error: if (ret == -ENOSYS) @@ -254,7 +254,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) dev_dbg(&dev->dev, "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq); } - return 0; + return msi_device_populate_sysfs(&dev->dev); error: dev_err(&dev->dev, "Failed to create MSI%s! ret=%d!\n", @@ -346,7 +346,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (ret < 0) goto out; } - ret = 0; + ret = msi_device_populate_sysfs(&dev->dev); out: return ret; } @@ -394,6 +394,8 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev) xen_destroy_irq(msidesc->irq + i); msidesc->irq = 0; } + + msi_device_destroy_sysfs(&dev->dev); } static void xen_pv_teardown_msi_irqs(struct pci_dev *dev) diff --git a/include/linux/msi.h b/include/linux/msi.h index cdb14a1ef268..a50ea79522f8 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -383,6 +383,13 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc); void arch_teardown_msi_irq(unsigned int irq); int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void arch_teardown_msi_irqs(struct pci_dev *dev); +#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */ + +/* + * Xen uses non-default msi_domain_ops and hence needs a way to populate sysfs + * entries of MSI IRQs. + */ +#if defined(CONFIG_PCI_XEN) || defined(CONFIG_PCI_MSI_ARCH_FALLBACKS) #ifdef CONFIG_SYSFS int msi_device_populate_sysfs(struct device *dev); void msi_device_destroy_sysfs(struct device *dev); @@ -390,7 +397,7 @@ void msi_device_destroy_sysfs(struct device *dev); static inline int msi_device_populate_sysfs(struct device *dev) { return 0; } static inline void msi_device_destroy_sysfs(struct device *dev) { } #endif /* !CONFIG_SYSFS */ -#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */ +#endif /* CONFIG_PCI_XEN || CONFIG_PCI_MSI_ARCH_FALLBACKS */ /* * The restore hook is still available even for fully irq domain based diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7a97bcb086bf..b4c31a5c1147 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -542,7 +542,7 @@ fail: return ret; } -#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS +#if defined(CONFIG_PCI_MSI_ARCH_FALLBACKS) || defined(CONFIG_PCI_XEN) /** * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device * @dev: The device (PCI, platform etc) which will get sysfs entries @@ -574,7 +574,7 @@ void msi_device_destroy_sysfs(struct device *dev) msi_for_each_desc(desc, dev, MSI_DESC_ALL) msi_sysfs_remove_desc(dev, desc); } -#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK */ +#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK || CONFIG_PCI_XEN */ #else /* CONFIG_SYSFS */ static inline int msi_sysfs_create_group(struct device *dev) { return 0; } static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; } From 61e150fb310729c98227a5edf6e4a3619edc3702 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 24 May 2023 17:07:07 +0200 Subject: [PATCH 130/271] parisc: Fix flush_dcache_page() for usage from irq context Since at least kernel 6.1, flush_dcache_page() is called with IRQs disabled, e.g. from aio_complete(). But the current implementation for flush_dcache_page() on parisc unintentionally re-enables IRQs, which may lead to deadlocks. Fix it by using xa_lock_irqsave() and xa_unlock_irqrestore() for the flush_dcache_mmap_*lock() macros instead. Cc: linux-parisc@vger.kernel.org Cc: stable@kernel.org # 5.18+ Signed-off-by: Helge Deller --- arch/parisc/include/asm/cacheflush.h | 4 ++++ arch/parisc/kernel/cache.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 0bdee6724132..c8b6928cee1e 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -48,6 +48,10 @@ void flush_dcache_page(struct page *page); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) +#define flush_dcache_mmap_lock_irqsave(mapping, flags) \ + xa_lock_irqsave(&mapping->i_pages, flags) +#define flush_dcache_mmap_unlock_irqrestore(mapping, flags) \ + xa_unlock_irqrestore(&mapping->i_pages, flags) #define flush_icache_page(vma,page) do { \ flush_kernel_dcache_page_addr(page_address(page)); \ diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 1d3b8bc8a623..ca4a302d4365 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -399,6 +399,7 @@ void flush_dcache_page(struct page *page) unsigned long offset; unsigned long addr, old_addr = 0; unsigned long count = 0; + unsigned long flags; pgoff_t pgoff; if (mapping && !mapping_mapped(mapping)) { @@ -420,7 +421,7 @@ void flush_dcache_page(struct page *page) * to flush one address here for them all to become coherent * on machines that support equivalent aliasing */ - flush_dcache_mmap_lock(mapping); + flush_dcache_mmap_lock_irqsave(mapping, flags); vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; addr = mpnt->vm_start + offset; @@ -460,7 +461,7 @@ void flush_dcache_page(struct page *page) } WARN_ON(++count == 4096); } - flush_dcache_mmap_unlock(mapping); + flush_dcache_mmap_unlock_irqrestore(mapping, flags); } EXPORT_SYMBOL(flush_dcache_page); From 4badf2eb1e986bdbf34dd2f5d4c979553a86fe54 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Wed, 17 May 2023 16:28:15 +0000 Subject: [PATCH 131/271] cpufreq: amd-pstate: Add ->fast_switch() callback Schedutil normally calls the adjust_perf callback for drivers with adjust_perf callback available and fast_switch_possible flag set. However, when frequency invariance is disabled and schedutil tries to invoke fast_switch. So, there is a chance of kernel crash if this function pointer is not set. To protect against this scenario add fast_switch callback to amd_pstate driver. Fixes: 1d215f0319c2 ("cpufreq: amd-pstate: Add fast switch function for AMD P-State") Signed-off-by: Gautham R. Shenoy Signed-off-by: Wyes Karny Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5a3d4aa0f45a..45711fc0a856 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -444,9 +444,8 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy) return 0; } -static int amd_pstate_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) +static int amd_pstate_update_freq(struct cpufreq_policy *policy, + unsigned int target_freq, bool fast_switch) { struct cpufreq_freqs freqs; struct amd_cpudata *cpudata = policy->driver_data; @@ -465,14 +464,37 @@ static int amd_pstate_target(struct cpufreq_policy *policy, des_perf = DIV_ROUND_CLOSEST(target_freq * cap_perf, cpudata->max_freq); - cpufreq_freq_transition_begin(policy, &freqs); + WARN_ON(fast_switch && !policy->fast_switch_enabled); + /* + * If fast_switch is desired, then there aren't any registered + * transition notifiers. See comment for + * cpufreq_enable_fast_switch(). + */ + if (!fast_switch) + cpufreq_freq_transition_begin(policy, &freqs); + amd_pstate_update(cpudata, min_perf, des_perf, - max_perf, false, policy->governor->flags); - cpufreq_freq_transition_end(policy, &freqs, false); + max_perf, fast_switch, policy->governor->flags); + + if (!fast_switch) + cpufreq_freq_transition_end(policy, &freqs, false); return 0; } +static int amd_pstate_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + return amd_pstate_update_freq(policy, target_freq, false); +} + +static unsigned int amd_pstate_fast_switch(struct cpufreq_policy *policy, + unsigned int target_freq) +{ + return amd_pstate_update_freq(policy, target_freq, true); +} + static void amd_pstate_adjust_perf(unsigned int cpu, unsigned long _min_perf, unsigned long target_perf, @@ -715,6 +737,7 @@ static int amd_pstate_cpu_exit(struct cpufreq_policy *policy) freq_qos_remove_request(&cpudata->req[1]); freq_qos_remove_request(&cpudata->req[0]); + policy->fast_switch_possible = false; kfree(cpudata); return 0; @@ -1309,6 +1332,7 @@ static struct cpufreq_driver amd_pstate_driver = { .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, .verify = amd_pstate_verify, .target = amd_pstate_target, + .fast_switch = amd_pstate_fast_switch, .init = amd_pstate_cpu_init, .exit = amd_pstate_cpu_exit, .suspend = amd_pstate_cpu_suspend, From 249b62c448de7117c18531d626aed6e153cdfd75 Mon Sep 17 00:00:00 2001 From: Wyes Karny Date: Wed, 17 May 2023 16:28:16 +0000 Subject: [PATCH 132/271] cpufreq: amd-pstate: Remove fast_switch_possible flag from active driver amd_pstate active mode driver is only compatible with static governors. Therefore it doesn't need fast_switch functionality. Remove fast_switch_possible flag from amd_pstate active mode driver. Fixes: ffa5096a7c33 ("cpufreq: amd-pstate: implement Pstate EPP support for the AMD processors") Signed-off-by: Wyes Karny Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 45711fc0a856..ac54779f5a49 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1102,7 +1102,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) policy->policy = CPUFREQ_POLICY_POWERSAVE; if (boot_cpu_has(X86_FEATURE_CPPC)) { - policy->fast_switch_possible = true; ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); if (ret) return ret; @@ -1125,7 +1124,6 @@ free_cpudata1: static int amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) { pr_debug("CPU %d exiting\n", policy->cpu); - policy->fast_switch_possible = false; return 0; } From 5f7fdb0f255756b594cc45c2c08b0140bc4a1761 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 22 May 2023 15:38:44 -0700 Subject: [PATCH 133/271] thermal: intel: int340x: Add new line for UUID display Prior to the commit "763bd29fd3d1 ("thermal: int340x_thermal: Use sysfs_emit_at() instead of scnprintf()", there was a new line after each UUID string. With the newline removed, existing user space like "thermald" fails to compare each supported UUID as it is using getline() to read UUID and apply correct thermal table. To avoid breaking existing user space, add newline after each UUID string. Signed-off-by: Srinivas Pandruvada Fixes: 763bd29fd3d1 ("thermal: int340x_thermal: Use sysfs_emit_at() instead of scnprintf()") Cc: 6.3+ # 6.3+ Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/int340x_thermal/int3400_thermal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c index 810231b59dcd..5e1164226ada 100644 --- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c @@ -131,7 +131,7 @@ static ssize_t available_uuids_show(struct device *dev, for (i = 0; i < INT3400_THERMAL_MAXIMUM_UUID; i++) { if (priv->uuid_bitmap & (1 << i)) - length += sysfs_emit_at(buf, length, int3400_thermal_uuids[i]); + length += sysfs_emit_at(buf, length, "%s\n", int3400_thermal_uuids[i]); } return length; @@ -149,7 +149,7 @@ static ssize_t current_uuid_show(struct device *dev, for (i = 0; i <= INT3400_THERMAL_CRITICAL; i++) { if (priv->os_uuid_mask & BIT(i)) - length += sysfs_emit_at(buf, length, int3400_thermal_uuids[i]); + length += sysfs_emit_at(buf, length, "%s\n", int3400_thermal_uuids[i]); } if (length) From 60ecaaf54886b0642d5c4744f7fbf1ff0d6b3e42 Mon Sep 17 00:00:00 2001 From: Sukrut Bellary Date: Wed, 3 May 2023 16:15:07 -0700 Subject: [PATCH 134/271] drm:amd:amdgpu: Fix missing buffer object unlock in failure path smatch warning - 1) drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c:3615 gfx_v9_0_kiq_resume() warn: inconsistent returns 'ring->mqd_obj->tbo.base.resv'. 2) drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c:6901 gfx_v10_0_kiq_resume() warn: inconsistent returns 'ring->mqd_obj->tbo.base.resv'. Signed-off-by: Sukrut Bellary Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 +++- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 1ec076517c96..ab44c1391d52 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -6892,8 +6892,10 @@ static int gfx_v10_0_kiq_resume(struct amdgpu_device *adev) return r; r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr); - if (unlikely(r != 0)) + if (unlikely(r != 0)) { + amdgpu_bo_unreserve(ring->mqd_obj); return r; + } gfx_v10_0_kiq_init_queue(ring); amdgpu_bo_kunmap(ring->mqd_obj); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 9818743ec419..ce22f7b30416 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -3617,8 +3617,10 @@ static int gfx_v9_0_kiq_resume(struct amdgpu_device *adev) return r; r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr); - if (unlikely(r != 0)) + if (unlikely(r != 0)) { + amdgpu_bo_unreserve(ring->mqd_obj); return r; + } gfx_v9_0_kiq_init_queue(ring); amdgpu_bo_kunmap(ring->mqd_obj); From 7fc602dbfd548045862df096910b7d21e6d300bf Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Thu, 18 May 2023 09:46:22 +0800 Subject: [PATCH 135/271] drm/amdgpu: don't enable secure display on incompatible platforms [why] [drm] psp gfx command LOAD_TA(0x1) failed and response status is (0x7) [drm] psp gfx command INVOKE_CMD(0x3) failed and response status is (0x4) amdgpu 0000:04:00.0: amdgpu: Secure display: Generic Failure. [how] don't enable secure display on incompatible platforms Suggested-by: Aaron Liu Signed-off-by: Jesse zhang Reviewed-by: Aaron Liu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/psp_v10_0.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c index e1b7fca09666..5f10883da6a2 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c @@ -57,7 +57,13 @@ static int psp_v10_0_init_microcode(struct psp_context *psp) if (err) return err; - return psp_init_ta_microcode(psp, ucode_prefix); + err = psp_init_ta_microcode(psp, ucode_prefix); + if ((adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 1, 0)) && + (adev->pdev->revision == 0xa1) && + (psp->securedisplay_context.context.bin_desc.fw_version >= 0x27000008)) { + adev->psp.securedisplay_context.context.bin_desc.size_bytes = 0; + } + return err; } static int psp_v10_0_ring_create(struct psp_context *psp, From a34fc1bcd2c4d8b09dcfc0b95ac65bca1e579bd7 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 18 May 2023 12:38:22 -0400 Subject: [PATCH 136/271] drm/radeon: reintroduce radeon_dp_work_func content Put back the radeon_dp_work_func logic. It seems that handling DP RX interrupts is necessary to make some panels work. This was removed with the MST support, but it regresses some systems so add it back. While we are here, add the proper mutex locking. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2567 Fixes: 01ad1d9c2888 ("drm/radeon: Drop legacy MST support") Reviewed-by: Lyude Paul Signed-off-by: Alex Deucher Cc: Lyude Paul Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/radeon_irq_kms.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/gpu/drm/radeon/radeon_irq_kms.c b/drivers/gpu/drm/radeon/radeon_irq_kms.c index 3377fbc71f65..c4dda908666c 100644 --- a/drivers/gpu/drm/radeon/radeon_irq_kms.c +++ b/drivers/gpu/drm/radeon/radeon_irq_kms.c @@ -99,6 +99,16 @@ static void radeon_hotplug_work_func(struct work_struct *work) static void radeon_dp_work_func(struct work_struct *work) { + struct radeon_device *rdev = container_of(work, struct radeon_device, + dp_work); + struct drm_device *dev = rdev->ddev; + struct drm_mode_config *mode_config = &dev->mode_config; + struct drm_connector *connector; + + mutex_lock(&mode_config->mutex); + list_for_each_entry(connector, &mode_config->connector_list, head) + radeon_connector_hotplug(connector); + mutex_unlock(&mode_config->mutex); } /** From 0d2dd02d74e6377268f56b90261de0fae8f0d2cb Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Fri, 19 May 2023 14:20:17 +0800 Subject: [PATCH 137/271] drm/amd/pm: add missing NotifyPowerSource message mapping for SMU13.0.7 Otherwise, the power source switching will fail due to message unavailable. Fixes: bf4823267a81 ("drm/amd/pm: fix possible power mode mismatch between driver and PMFW") Signed-off-by: Evan Quan Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index 98a33f8ee209..bba621615abf 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -125,6 +125,7 @@ static struct cmn2asic_msg_mapping smu_v13_0_7_message_map[SMU_MSG_MAX_COUNT] = MSG_MAP(ArmD3, PPSMC_MSG_ArmD3, 0), MSG_MAP(AllowGpo, PPSMC_MSG_SetGpoAllow, 0), MSG_MAP(GetPptLimit, PPSMC_MSG_GetPptLimit, 0), + MSG_MAP(NotifyPowerSource, PPSMC_MSG_NotifyPowerSource, 0), }; static struct cmn2asic_mapping smu_v13_0_7_clk_map[SMU_CLK_COUNT] = { From 40baba5693b9af586dc1063af603d05a79e57a6b Mon Sep 17 00:00:00 2001 From: Jonatas Esteves Date: Sat, 20 May 2023 10:39:52 -0300 Subject: [PATCH 138/271] drm/amd/pm: Fix output of pp_od_clk_voltage Printing the other clock types should not be conditioned on being able to print OD_SCLK. Some GPUs currently have limited capability of only printing a subset of these. Since this condition was introduced in v5.18-rc1, reading from `pp_od_clk_voltage` has been returning empty on the Asus ROG Strix G15 (2021). Fixes: 79c65f3fcbb1 ("drm/amd/pm: do not expose power implementation details to amdgpu_pm.c") Reviewed-by: Evan Quan Signed-off-by: Jonatas Esteves Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 58c2246918fd..f4f40459f22b 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -871,13 +871,11 @@ static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev, } if (ret == -ENOENT) { size = amdgpu_dpm_print_clock_levels(adev, OD_SCLK, buf); - if (size > 0) { - size += amdgpu_dpm_print_clock_levels(adev, OD_MCLK, buf + size); - size += amdgpu_dpm_print_clock_levels(adev, OD_VDDC_CURVE, buf + size); - size += amdgpu_dpm_print_clock_levels(adev, OD_VDDGFX_OFFSET, buf + size); - size += amdgpu_dpm_print_clock_levels(adev, OD_RANGE, buf + size); - size += amdgpu_dpm_print_clock_levels(adev, OD_CCLK, buf + size); - } + size += amdgpu_dpm_print_clock_levels(adev, OD_MCLK, buf + size); + size += amdgpu_dpm_print_clock_levels(adev, OD_VDDC_CURVE, buf + size); + size += amdgpu_dpm_print_clock_levels(adev, OD_VDDGFX_OFFSET, buf + size); + size += amdgpu_dpm_print_clock_levels(adev, OD_RANGE, buf + size); + size += amdgpu_dpm_print_clock_levels(adev, OD_CCLK, buf + size); } if (size == 0) From 38776cc45eb7603df4735a0410f42cffff8e71a1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 8 May 2023 07:02:06 -0700 Subject: [PATCH 139/271] perf/x86/uncore: Correct the number of CHAs on SPR The number of CHAs from the discovery table on some SPR variants is incorrect, because of a firmware issue. An accurate number can be read from the MSR UNC_CBO_CONFIG. Fixes: 949b11381f81 ("perf/x86/intel/uncore: Add Sapphire Rapids server CHA support") Reported-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Stephane Eranian Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230508140206.283708-1-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index fa9b209a11fa..d49e90dc04a4 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6150,6 +6150,7 @@ static struct intel_uncore_type spr_uncore_mdf = { }; #define UNCORE_SPR_NUM_UNCORE_TYPES 12 +#define UNCORE_SPR_CHA 0 #define UNCORE_SPR_IIO 1 #define UNCORE_SPR_IMC 6 #define UNCORE_SPR_UPI 8 @@ -6460,12 +6461,22 @@ static int uncore_type_max_boxes(struct intel_uncore_type **types, return max + 1; } +#define SPR_MSR_UNC_CBO_CONFIG 0x2FFE + void spr_uncore_cpu_init(void) { + struct intel_uncore_type *type; + u64 num_cbo; + uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR, UNCORE_SPR_MSR_EXTRA_UNCORES, spr_msr_uncores); + type = uncore_find_type_by_id(uncore_msr_uncores, UNCORE_SPR_CHA); + if (type) { + rdmsrl(SPR_MSR_UNC_CBO_CONFIG, num_cbo); + type->num_boxes = num_cbo; + } spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO); } From 137f9cee113df91107cf91c130d5c414c4b191f0 Mon Sep 17 00:00:00 2001 From: Alan Liu Date: Tue, 2 May 2023 17:54:50 +0800 Subject: [PATCH 140/271] drm/amd/display: Fix warning in disabling vblank irq [Why] During gpu-reset, we toggle vblank irq by calling dc_interrupt_set() instead of amdgpu_irq_get/put() because we don't want to change the irq source's refcount. However, we see the warning when vblank irq is enabled by dc_interrupt_set() during gpu-reset but disabled by amdgpu_irq_put() after gpu-reset. [How] Only in dm_gpureset_toggle_interrupts() we toggle vblank interrupts by calling dc_interrupt_set(). Apart from this we call dm_set_vblank() which uses amdgpu_irq_get/put() to operate vblank irq. Reviewed-by: Bhawanpreet Lakha Acked-by: Tom Chung Signed-off-by: Alan Liu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 21 ++++++++++++------- .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 16 +++----------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 8b4b186c57f5..26b5cbc97046 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2479,20 +2479,25 @@ static void dm_gpureset_toggle_interrupts(struct amdgpu_device *adev, if (acrtc && state->stream_status[i].plane_count != 0) { irq_source = IRQ_TYPE_PFLIP + acrtc->otg_inst; rc = dc_interrupt_set(adev->dm.dc, irq_source, enable) ? 0 : -EBUSY; - DRM_DEBUG_VBL("crtc %d - vupdate irq %sabling: r=%d\n", - acrtc->crtc_id, enable ? "en" : "dis", rc); if (rc) DRM_WARN("Failed to %s pflip interrupts\n", enable ? "enable" : "disable"); if (enable) { - rc = amdgpu_dm_crtc_enable_vblank(&acrtc->base); - if (rc) - DRM_WARN("Failed to enable vblank interrupts\n"); - } else { - amdgpu_dm_crtc_disable_vblank(&acrtc->base); - } + if (amdgpu_dm_crtc_vrr_active(to_dm_crtc_state(acrtc->base.state))) + rc = amdgpu_dm_crtc_set_vupdate_irq(&acrtc->base, true); + } else + rc = amdgpu_dm_crtc_set_vupdate_irq(&acrtc->base, false); + if (rc) + DRM_WARN("Failed to %sable vupdate interrupt\n", enable ? "en" : "dis"); + + irq_source = IRQ_TYPE_VBLANK + acrtc->otg_inst; + /* During gpu-reset we disable and then enable vblank irq, so + * don't use amdgpu_irq_get/put() to avoid refcount change. + */ + if (!dc_interrupt_set(adev->dm.dc, irq_source, enable)) + DRM_WARN("Failed to %sable vblank interrupt\n", enable ? "en" : "dis"); } } diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c index e3762e806617..440fc0869a34 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c @@ -146,7 +146,6 @@ static void vblank_control_worker(struct work_struct *work) static inline int dm_set_vblank(struct drm_crtc *crtc, bool enable) { - enum dc_irq_source irq_source; struct amdgpu_crtc *acrtc = to_amdgpu_crtc(crtc); struct amdgpu_device *adev = drm_to_adev(crtc->dev); struct dm_crtc_state *acrtc_state = to_dm_crtc_state(crtc->state); @@ -169,18 +168,9 @@ static inline int dm_set_vblank(struct drm_crtc *crtc, bool enable) if (rc) return rc; - if (amdgpu_in_reset(adev)) { - irq_source = IRQ_TYPE_VBLANK + acrtc->otg_inst; - /* During gpu-reset we disable and then enable vblank irq, so - * don't use amdgpu_irq_get/put() to avoid refcount change. - */ - if (!dc_interrupt_set(adev->dm.dc, irq_source, enable)) - rc = -EBUSY; - } else { - rc = (enable) - ? amdgpu_irq_get(adev, &adev->crtc_irq, acrtc->crtc_id) - : amdgpu_irq_put(adev, &adev->crtc_irq, acrtc->crtc_id); - } + rc = (enable) + ? amdgpu_irq_get(adev, &adev->crtc_irq, acrtc->crtc_id) + : amdgpu_irq_put(adev, &adev->crtc_irq, acrtc->crtc_id); if (rc) return rc; From 482e6ad9adde69d9da08864b4ccf4dfd53edb2f0 Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Wed, 10 May 2023 16:43:30 -0400 Subject: [PATCH 141/271] drm/amd/display: Have Payload Properly Created After Resume At drm suspend sequence, MST dc_sink is removed. When commit cached MST stream back in drm resume sequence, the MST stream payload is not properly created and added into the payload table. After resume, topology change is reprobed by removing existing streams first. That leads to no payload is found in the existing payload table as below error "[drm] ERROR No payload for [MST PORT:] found in mst state" 1. In encoder .atomic_check routine, remove check existance of dc_sink 2. Bypass MST by checking existence of MST root port. dc_link_type cannot differentiate MST port before topology is rediscovered. Reviewed-by: Wayne Lin Acked-by: Tom Chung Signed-off-by: Fangzhi Zuo Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 26b5cbc97046..d5cec03eaa8d 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2857,7 +2857,7 @@ static int dm_resume(void *handle) * this is the case when traversing through already created * MST connectors, should be skipped */ - if (aconnector->dc_link->type == dc_connection_mst_branch) + if (aconnector && aconnector->mst_root) continue; mutex_lock(&aconnector->hpd_lock); @@ -6742,7 +6742,7 @@ static int dm_encoder_helper_atomic_check(struct drm_encoder *encoder, int clock, bpp = 0; bool is_y420 = false; - if (!aconnector->mst_output_port || !aconnector->dc_sink) + if (!aconnector->mst_output_port) return 0; mst_port = aconnector->mst_output_port; From 72a7804a667eeac98888610521179f0418883158 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 23 May 2023 17:38:38 -0300 Subject: [PATCH 142/271] cifs: fix smb1 mount regression cifs.ko maps NT_STATUS_NOT_FOUND to -EIO when SMB1 servers couldn't resolve referral paths. Proceed to tree connect when we get -EIO from dfs_get_referral() as well. Reported-by: Kris Karas (Bug Reporting) Tested-by: Woody Suwalski Fixes: 8e3554150d6c ("cifs: fix sharing of DFS connections") Cc: stable@vger.kernel.org # v6.2+ Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/dfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/dfs.c b/fs/cifs/dfs.c index a93dbca1411b..2f93bf8c3325 100644 --- a/fs/cifs/dfs.c +++ b/fs/cifs/dfs.c @@ -303,7 +303,7 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) if (!nodfs) { rc = dfs_get_referral(mnt_ctx, ctx->UNC + 1, NULL, NULL); if (rc) { - if (rc != -ENOENT && rc != -EOPNOTSUPP) + if (rc != -ENOENT && rc != -EOPNOTSUPP && rc != -EIO) goto out; nodfs = true; } From 8b4dd44f9b4702d42362b97b81c91b33a6dcea58 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 23 May 2023 20:25:47 -0500 Subject: [PATCH 143/271] smb3: display debug information better for encryption Fix /proc/fs/cifs/DebugData to use the same case for "encryption" (ie "Encryption" with init capital letter was used in one place). In addition, if gcm256 encryption (intead of gcm128) is used on a connection to a server, note that in the DebugData as well. It now displays (when gcm256 negotiated): Security type: RawNTLMSSP SessionId: 0x86125800bc000b0d encrypted(gcm256) Acked-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index d4ed200a9471..5034b862cec2 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -108,7 +108,7 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) if ((tcon->seal) || (tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) || (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA)) - seq_printf(m, " Encrypted"); + seq_puts(m, " encrypted"); if (tcon->nocase) seq_printf(m, " nocase"); if (tcon->unix_ext) @@ -415,8 +415,12 @@ skip_rdma: /* dump session id helpful for use with network trace */ seq_printf(m, " SessionId: 0x%llx", ses->Suid); - if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) { seq_puts(m, " encrypted"); + /* can help in debugging to show encryption type */ + if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM) + seq_puts(m, "(gcm256)"); + } if (ses->sign) seq_puts(m, " signed"); From cb8b02fd6343228966324528adf920bfb8b8e681 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 24 May 2023 03:26:19 -0500 Subject: [PATCH 144/271] cifs: mapchars mount option ignored There are two ways that special characters (not allowed in some other operating systems like Windows, but allowed in POSIX) have been mapped in the past ("SFU" and "SFM" mappings) to allow them to be stored in a range reserved for special chars. The default for Linux has been to use "mapposix" (ie the SFM mapping) but the conversion to the new mount API in the 5.11 kernel broke the ability to override the default mapping of the reserved characters (like '?' and '*' and '\') via "mapchars" mount option. This patch fixes that - so can now mount with "mapchars" mount option to override the default ("mapposix" ie SFM) mapping. Reported-by: Tyler Spivey Fixes: 24e0a1eff9e2 ("cifs: switch to new mount api") Signed-off-by: Steve French --- fs/cifs/fs_context.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index ace11a1a7c8a..1bda75609b64 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -904,6 +904,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->sfu_remap = false; /* disable SFU mapping */ } break; + case Opt_mapchars: + if (result.negated) + ctx->sfu_remap = false; + else { + ctx->sfu_remap = true; + ctx->remap = false; /* disable SFM (mapposix) mapping */ + } + break; case Opt_user_xattr: if (result.negated) ctx->no_xattr = 1; From 38c8a9a52082579090e34c033d439ed2cd1a462d Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 21 May 2023 20:46:30 -0500 Subject: [PATCH 145/271] smb: move client and server files to common directory fs/smb Move CIFS/SMB3 related client and server files (cifs.ko and ksmbd.ko and helper modules) to new fs/smb subdirectory: fs/cifs --> fs/smb/client fs/ksmbd --> fs/smb/server fs/smbfs_common --> fs/smb/common Suggested-by: Linus Torvalds Acked-by: Namjae Jeon Signed-off-by: Steve French --- MAINTAINERS | 8 ++++---- fs/Kconfig | 9 +-------- fs/Makefile | 4 +--- fs/smb/Kconfig | 11 +++++++++++ fs/smb/Makefile | 5 +++++ fs/{cifs => smb/client}/Kconfig | 0 fs/{cifs => smb/client}/Makefile | 0 fs/{cifs => smb/client}/asn1.c | 0 fs/{cifs => smb/client}/cached_dir.c | 0 fs/{cifs => smb/client}/cached_dir.h | 0 fs/{cifs => smb/client}/cifs_debug.c | 0 fs/{cifs => smb/client}/cifs_debug.h | 0 fs/{cifs => smb/client}/cifs_dfs_ref.c | 0 fs/{cifs => smb/client}/cifs_fs_sb.h | 0 fs/{cifs => smb/client}/cifs_ioctl.h | 0 fs/{cifs => smb/client}/cifs_spnego.c | 0 fs/{cifs => smb/client}/cifs_spnego.h | 0 fs/{cifs => smb/client}/cifs_spnego_negtokeninit.asn1 | 0 fs/{cifs => smb/client}/cifs_swn.c | 0 fs/{cifs => smb/client}/cifs_swn.h | 0 fs/{cifs => smb/client}/cifs_unicode.c | 0 fs/{cifs => smb/client}/cifs_unicode.h | 0 fs/{cifs => smb/client}/cifs_uniupr.h | 0 fs/{cifs => smb/client}/cifsacl.c | 0 fs/{cifs => smb/client}/cifsacl.h | 0 fs/{cifs => smb/client}/cifsencrypt.c | 2 +- fs/{cifs => smb/client}/cifsfs.c | 0 fs/{cifs => smb/client}/cifsfs.h | 0 fs/{cifs => smb/client}/cifsglob.h | 2 +- fs/{cifs => smb/client}/cifspdu.h | 2 +- fs/{cifs => smb/client}/cifsproto.h | 0 fs/{cifs => smb/client}/cifsroot.c | 0 fs/{cifs => smb/client}/cifssmb.c | 0 fs/{cifs => smb/client}/connect.c | 0 fs/{cifs => smb/client}/dfs.c | 0 fs/{cifs => smb/client}/dfs.h | 0 fs/{cifs => smb/client}/dfs_cache.c | 0 fs/{cifs => smb/client}/dfs_cache.h | 0 fs/{cifs => smb/client}/dir.c | 0 fs/{cifs => smb/client}/dns_resolve.c | 0 fs/{cifs => smb/client}/dns_resolve.h | 0 fs/{cifs => smb/client}/export.c | 0 fs/{cifs => smb/client}/file.c | 0 fs/{cifs => smb/client}/fs_context.c | 0 fs/{cifs => smb/client}/fs_context.h | 0 fs/{cifs => smb/client}/fscache.c | 0 fs/{cifs => smb/client}/fscache.h | 0 fs/{cifs => smb/client}/inode.c | 0 fs/{cifs => smb/client}/ioctl.c | 0 fs/{cifs => smb/client}/link.c | 0 fs/{cifs => smb/client}/misc.c | 0 fs/{cifs => smb/client}/netlink.c | 0 fs/{cifs => smb/client}/netlink.h | 0 fs/{cifs => smb/client}/netmisc.c | 0 fs/{cifs => smb/client}/nterr.c | 0 fs/{cifs => smb/client}/nterr.h | 0 fs/{cifs => smb/client}/ntlmssp.h | 0 fs/{cifs => smb/client}/readdir.c | 0 fs/{cifs => smb/client}/rfc1002pdu.h | 0 fs/{cifs => smb/client}/sess.c | 0 fs/{cifs => smb/client}/smb1ops.c | 0 fs/{cifs => smb/client}/smb2file.c | 0 fs/{cifs => smb/client}/smb2glob.h | 0 fs/{cifs => smb/client}/smb2inode.c | 0 fs/{cifs => smb/client}/smb2maperror.c | 0 fs/{cifs => smb/client}/smb2misc.c | 0 fs/{cifs => smb/client}/smb2ops.c | 0 fs/{cifs => smb/client}/smb2pdu.c | 0 fs/{cifs => smb/client}/smb2pdu.h | 0 fs/{cifs => smb/client}/smb2proto.h | 0 fs/{cifs => smb/client}/smb2status.h | 0 fs/{cifs => smb/client}/smb2transport.c | 0 fs/{cifs => smb/client}/smbdirect.c | 0 fs/{cifs => smb/client}/smbdirect.h | 0 fs/{cifs => smb/client}/smbencrypt.c | 2 +- fs/{cifs => smb/client}/smberr.h | 0 fs/{cifs => smb/client}/trace.c | 0 fs/{cifs => smb/client}/trace.h | 0 fs/{cifs => smb/client}/transport.c | 0 fs/{cifs => smb/client}/unc.c | 0 fs/{cifs => smb/client}/winucase.c | 0 fs/{cifs => smb/client}/xattr.c | 0 fs/{smbfs_common => smb/common}/Makefile | 4 ++-- fs/{smbfs_common => smb/common}/arc4.h | 0 fs/{smbfs_common => smb/common}/cifs_arc4.c | 0 fs/{smbfs_common => smb/common}/cifs_md4.c | 0 fs/{smbfs_common => smb/common}/md4.h | 0 fs/{smbfs_common => smb/common}/smb2pdu.h | 0 fs/{smbfs_common => smb/common}/smbfsctl.h | 0 fs/{ksmbd => smb/server}/Kconfig | 0 fs/{ksmbd => smb/server}/Makefile | 0 fs/{ksmbd => smb/server}/asn1.c | 0 fs/{ksmbd => smb/server}/asn1.h | 0 fs/{ksmbd => smb/server}/auth.c | 2 +- fs/{ksmbd => smb/server}/auth.h | 0 fs/{ksmbd => smb/server}/connection.c | 0 fs/{ksmbd => smb/server}/connection.h | 0 fs/{ksmbd => smb/server}/crypto_ctx.c | 0 fs/{ksmbd => smb/server}/crypto_ctx.h | 0 fs/{ksmbd => smb/server}/glob.h | 0 fs/{ksmbd => smb/server}/ksmbd_netlink.h | 0 .../server}/ksmbd_spnego_negtokeninit.asn1 | 0 .../server}/ksmbd_spnego_negtokentarg.asn1 | 0 fs/{ksmbd => smb/server}/ksmbd_work.c | 0 fs/{ksmbd => smb/server}/ksmbd_work.h | 0 fs/{ksmbd => smb/server}/mgmt/ksmbd_ida.c | 0 fs/{ksmbd => smb/server}/mgmt/ksmbd_ida.h | 0 fs/{ksmbd => smb/server}/mgmt/share_config.c | 0 fs/{ksmbd => smb/server}/mgmt/share_config.h | 0 fs/{ksmbd => smb/server}/mgmt/tree_connect.c | 0 fs/{ksmbd => smb/server}/mgmt/tree_connect.h | 0 fs/{ksmbd => smb/server}/mgmt/user_config.c | 0 fs/{ksmbd => smb/server}/mgmt/user_config.h | 0 fs/{ksmbd => smb/server}/mgmt/user_session.c | 0 fs/{ksmbd => smb/server}/mgmt/user_session.h | 0 fs/{ksmbd => smb/server}/misc.c | 0 fs/{ksmbd => smb/server}/misc.h | 0 fs/{ksmbd => smb/server}/ndr.c | 0 fs/{ksmbd => smb/server}/ndr.h | 0 fs/{ksmbd => smb/server}/nterr.h | 0 fs/{ksmbd => smb/server}/ntlmssp.h | 0 fs/{ksmbd => smb/server}/oplock.c | 0 fs/{ksmbd => smb/server}/oplock.h | 0 fs/{ksmbd => smb/server}/server.c | 0 fs/{ksmbd => smb/server}/server.h | 0 fs/{ksmbd => smb/server}/smb2misc.c | 0 fs/{ksmbd => smb/server}/smb2ops.c | 0 fs/{ksmbd => smb/server}/smb2pdu.c | 0 fs/{ksmbd => smb/server}/smb2pdu.h | 0 fs/{ksmbd => smb/server}/smb_common.c | 0 fs/{ksmbd => smb/server}/smb_common.h | 2 +- fs/{ksmbd => smb/server}/smbacl.c | 0 fs/{ksmbd => smb/server}/smbacl.h | 0 fs/{ksmbd => smb/server}/smbfsctl.h | 0 fs/{ksmbd => smb/server}/smbstatus.h | 0 fs/{ksmbd => smb/server}/transport_ipc.c | 0 fs/{ksmbd => smb/server}/transport_ipc.h | 0 fs/{ksmbd => smb/server}/transport_rdma.c | 0 fs/{ksmbd => smb/server}/transport_rdma.h | 0 fs/{ksmbd => smb/server}/transport_tcp.c | 0 fs/{ksmbd => smb/server}/transport_tcp.h | 0 fs/{ksmbd => smb/server}/unicode.c | 0 fs/{ksmbd => smb/server}/unicode.h | 0 fs/{ksmbd => smb/server}/uniupr.h | 0 fs/{ksmbd => smb/server}/vfs.c | 0 fs/{ksmbd => smb/server}/vfs.h | 0 fs/{ksmbd => smb/server}/vfs_cache.c | 0 fs/{ksmbd => smb/server}/vfs_cache.h | 0 fs/{ksmbd => smb/server}/xattr.h | 0 149 files changed, 30 insertions(+), 23 deletions(-) create mode 100644 fs/smb/Kconfig create mode 100644 fs/smb/Makefile rename fs/{cifs => smb/client}/Kconfig (100%) rename fs/{cifs => smb/client}/Makefile (100%) rename fs/{cifs => smb/client}/asn1.c (100%) rename fs/{cifs => smb/client}/cached_dir.c (100%) rename fs/{cifs => smb/client}/cached_dir.h (100%) rename fs/{cifs => smb/client}/cifs_debug.c (100%) rename fs/{cifs => smb/client}/cifs_debug.h (100%) rename fs/{cifs => smb/client}/cifs_dfs_ref.c (100%) rename fs/{cifs => smb/client}/cifs_fs_sb.h (100%) rename fs/{cifs => smb/client}/cifs_ioctl.h (100%) rename fs/{cifs => smb/client}/cifs_spnego.c (100%) rename fs/{cifs => smb/client}/cifs_spnego.h (100%) rename fs/{cifs => smb/client}/cifs_spnego_negtokeninit.asn1 (100%) rename fs/{cifs => smb/client}/cifs_swn.c (100%) rename fs/{cifs => smb/client}/cifs_swn.h (100%) rename fs/{cifs => smb/client}/cifs_unicode.c (100%) rename fs/{cifs => smb/client}/cifs_unicode.h (100%) rename fs/{cifs => smb/client}/cifs_uniupr.h (100%) rename fs/{cifs => smb/client}/cifsacl.c (100%) rename fs/{cifs => smb/client}/cifsacl.h (100%) rename fs/{cifs => smb/client}/cifsencrypt.c (99%) rename fs/{cifs => smb/client}/cifsfs.c (100%) rename fs/{cifs => smb/client}/cifsfs.h (100%) rename fs/{cifs => smb/client}/cifsglob.h (99%) rename fs/{cifs => smb/client}/cifspdu.h (99%) rename fs/{cifs => smb/client}/cifsproto.h (100%) rename fs/{cifs => smb/client}/cifsroot.c (100%) rename fs/{cifs => smb/client}/cifssmb.c (100%) rename fs/{cifs => smb/client}/connect.c (100%) rename fs/{cifs => smb/client}/dfs.c (100%) rename fs/{cifs => smb/client}/dfs.h (100%) rename fs/{cifs => smb/client}/dfs_cache.c (100%) rename fs/{cifs => smb/client}/dfs_cache.h (100%) rename fs/{cifs => smb/client}/dir.c (100%) rename fs/{cifs => smb/client}/dns_resolve.c (100%) rename fs/{cifs => smb/client}/dns_resolve.h (100%) rename fs/{cifs => smb/client}/export.c (100%) rename fs/{cifs => smb/client}/file.c (100%) rename fs/{cifs => smb/client}/fs_context.c (100%) rename fs/{cifs => smb/client}/fs_context.h (100%) rename fs/{cifs => smb/client}/fscache.c (100%) rename fs/{cifs => smb/client}/fscache.h (100%) rename fs/{cifs => smb/client}/inode.c (100%) rename fs/{cifs => smb/client}/ioctl.c (100%) rename fs/{cifs => smb/client}/link.c (100%) rename fs/{cifs => smb/client}/misc.c (100%) rename fs/{cifs => smb/client}/netlink.c (100%) rename fs/{cifs => smb/client}/netlink.h (100%) rename fs/{cifs => smb/client}/netmisc.c (100%) rename fs/{cifs => smb/client}/nterr.c (100%) rename fs/{cifs => smb/client}/nterr.h (100%) rename fs/{cifs => smb/client}/ntlmssp.h (100%) rename fs/{cifs => smb/client}/readdir.c (100%) rename fs/{cifs => smb/client}/rfc1002pdu.h (100%) rename fs/{cifs => smb/client}/sess.c (100%) rename fs/{cifs => smb/client}/smb1ops.c (100%) rename fs/{cifs => smb/client}/smb2file.c (100%) rename fs/{cifs => smb/client}/smb2glob.h (100%) rename fs/{cifs => smb/client}/smb2inode.c (100%) rename fs/{cifs => smb/client}/smb2maperror.c (100%) rename fs/{cifs => smb/client}/smb2misc.c (100%) rename fs/{cifs => smb/client}/smb2ops.c (100%) rename fs/{cifs => smb/client}/smb2pdu.c (100%) rename fs/{cifs => smb/client}/smb2pdu.h (100%) rename fs/{cifs => smb/client}/smb2proto.h (100%) rename fs/{cifs => smb/client}/smb2status.h (100%) rename fs/{cifs => smb/client}/smb2transport.c (100%) rename fs/{cifs => smb/client}/smbdirect.c (100%) rename fs/{cifs => smb/client}/smbdirect.h (100%) rename fs/{cifs => smb/client}/smbencrypt.c (98%) rename fs/{cifs => smb/client}/smberr.h (100%) rename fs/{cifs => smb/client}/trace.c (100%) rename fs/{cifs => smb/client}/trace.h (100%) rename fs/{cifs => smb/client}/transport.c (100%) rename fs/{cifs => smb/client}/unc.c (100%) rename fs/{cifs => smb/client}/winucase.c (100%) rename fs/{cifs => smb/client}/xattr.c (100%) rename fs/{smbfs_common => smb/common}/Makefile (59%) rename fs/{smbfs_common => smb/common}/arc4.h (100%) rename fs/{smbfs_common => smb/common}/cifs_arc4.c (100%) rename fs/{smbfs_common => smb/common}/cifs_md4.c (100%) rename fs/{smbfs_common => smb/common}/md4.h (100%) rename fs/{smbfs_common => smb/common}/smb2pdu.h (100%) rename fs/{smbfs_common => smb/common}/smbfsctl.h (100%) rename fs/{ksmbd => smb/server}/Kconfig (100%) rename fs/{ksmbd => smb/server}/Makefile (100%) rename fs/{ksmbd => smb/server}/asn1.c (100%) rename fs/{ksmbd => smb/server}/asn1.h (100%) rename fs/{ksmbd => smb/server}/auth.c (99%) rename fs/{ksmbd => smb/server}/auth.h (100%) rename fs/{ksmbd => smb/server}/connection.c (100%) rename fs/{ksmbd => smb/server}/connection.h (100%) rename fs/{ksmbd => smb/server}/crypto_ctx.c (100%) rename fs/{ksmbd => smb/server}/crypto_ctx.h (100%) rename fs/{ksmbd => smb/server}/glob.h (100%) rename fs/{ksmbd => smb/server}/ksmbd_netlink.h (100%) rename fs/{ksmbd => smb/server}/ksmbd_spnego_negtokeninit.asn1 (100%) rename fs/{ksmbd => smb/server}/ksmbd_spnego_negtokentarg.asn1 (100%) rename fs/{ksmbd => smb/server}/ksmbd_work.c (100%) rename fs/{ksmbd => smb/server}/ksmbd_work.h (100%) rename fs/{ksmbd => smb/server}/mgmt/ksmbd_ida.c (100%) rename fs/{ksmbd => smb/server}/mgmt/ksmbd_ida.h (100%) rename fs/{ksmbd => smb/server}/mgmt/share_config.c (100%) rename fs/{ksmbd => smb/server}/mgmt/share_config.h (100%) rename fs/{ksmbd => smb/server}/mgmt/tree_connect.c (100%) rename fs/{ksmbd => smb/server}/mgmt/tree_connect.h (100%) rename fs/{ksmbd => smb/server}/mgmt/user_config.c (100%) rename fs/{ksmbd => smb/server}/mgmt/user_config.h (100%) rename fs/{ksmbd => smb/server}/mgmt/user_session.c (100%) rename fs/{ksmbd => smb/server}/mgmt/user_session.h (100%) rename fs/{ksmbd => smb/server}/misc.c (100%) rename fs/{ksmbd => smb/server}/misc.h (100%) rename fs/{ksmbd => smb/server}/ndr.c (100%) rename fs/{ksmbd => smb/server}/ndr.h (100%) rename fs/{ksmbd => smb/server}/nterr.h (100%) rename fs/{ksmbd => smb/server}/ntlmssp.h (100%) rename fs/{ksmbd => smb/server}/oplock.c (100%) rename fs/{ksmbd => smb/server}/oplock.h (100%) rename fs/{ksmbd => smb/server}/server.c (100%) rename fs/{ksmbd => smb/server}/server.h (100%) rename fs/{ksmbd => smb/server}/smb2misc.c (100%) rename fs/{ksmbd => smb/server}/smb2ops.c (100%) rename fs/{ksmbd => smb/server}/smb2pdu.c (100%) rename fs/{ksmbd => smb/server}/smb2pdu.h (100%) rename fs/{ksmbd => smb/server}/smb_common.c (100%) rename fs/{ksmbd => smb/server}/smb_common.h (99%) rename fs/{ksmbd => smb/server}/smbacl.c (100%) rename fs/{ksmbd => smb/server}/smbacl.h (100%) rename fs/{ksmbd => smb/server}/smbfsctl.h (100%) rename fs/{ksmbd => smb/server}/smbstatus.h (100%) rename fs/{ksmbd => smb/server}/transport_ipc.c (100%) rename fs/{ksmbd => smb/server}/transport_ipc.h (100%) rename fs/{ksmbd => smb/server}/transport_rdma.c (100%) rename fs/{ksmbd => smb/server}/transport_rdma.h (100%) rename fs/{ksmbd => smb/server}/transport_tcp.c (100%) rename fs/{ksmbd => smb/server}/transport_tcp.h (100%) rename fs/{ksmbd => smb/server}/unicode.c (100%) rename fs/{ksmbd => smb/server}/unicode.h (100%) rename fs/{ksmbd => smb/server}/uniupr.h (100%) rename fs/{ksmbd => smb/server}/vfs.c (100%) rename fs/{ksmbd => smb/server}/vfs.h (100%) rename fs/{ksmbd => smb/server}/vfs_cache.c (100%) rename fs/{ksmbd => smb/server}/vfs_cache.h (100%) rename fs/{ksmbd => smb/server}/xattr.h (100%) diff --git a/MAINTAINERS b/MAINTAINERS index 27ef11624748..902f763e845d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5140,8 +5140,8 @@ S: Supported W: https://wiki.samba.org/index.php/LinuxCIFS T: git git://git.samba.org/sfrench/cifs-2.6.git F: Documentation/admin-guide/cifs/ -F: fs/cifs/ -F: fs/smbfs_common/ +F: fs/smb/client/ +F: fs/smb/common/ F: include/uapi/linux/cifs COMPACTPCI HOTPLUG CORE @@ -11301,8 +11301,8 @@ L: linux-cifs@vger.kernel.org S: Maintained T: git git://git.samba.org/ksmbd.git F: Documentation/filesystems/cifs/ksmbd.rst -F: fs/ksmbd/ -F: fs/smbfs_common/ +F: fs/smb/common/ +F: fs/smb/server/ KERNEL UNIT TESTING FRAMEWORK (KUnit) M: Brendan Higgins diff --git a/fs/Kconfig b/fs/Kconfig index cc07a0cd3172..18d034ec7953 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -368,14 +368,7 @@ config NFS_V4_2_SSC_HELPER source "net/sunrpc/Kconfig" source "fs/ceph/Kconfig" -source "fs/cifs/Kconfig" -source "fs/ksmbd/Kconfig" - -config SMBFS_COMMON - tristate - default y if CIFS=y || SMB_SERVER=y - default m if CIFS=m || SMB_SERVER=m - +source "fs/smb/Kconfig" source "fs/coda/Kconfig" source "fs/afs/Kconfig" source "fs/9p/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 834f1c3dba46..5bfdbf0d7037 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -95,9 +95,7 @@ obj-$(CONFIG_LOCKD) += lockd/ obj-$(CONFIG_NLS) += nls/ obj-y += unicode/ obj-$(CONFIG_SYSV_FS) += sysv/ -obj-$(CONFIG_SMBFS_COMMON) += smbfs_common/ -obj-$(CONFIG_CIFS) += cifs/ -obj-$(CONFIG_SMB_SERVER) += ksmbd/ +obj-$(CONFIG_SMBFS) += smb/ obj-$(CONFIG_HPFS_FS) += hpfs/ obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_NTFS3_FS) += ntfs3/ diff --git a/fs/smb/Kconfig b/fs/smb/Kconfig new file mode 100644 index 000000000000..ef425789fa6a --- /dev/null +++ b/fs/smb/Kconfig @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# smbfs configuration + +source "fs/smb/client/Kconfig" +source "fs/smb/server/Kconfig" + +config SMBFS + tristate + default y if CIFS=y || SMB_SERVER=y + default m if CIFS=m || SMB_SERVER=m diff --git a/fs/smb/Makefile b/fs/smb/Makefile new file mode 100644 index 000000000000..9a1bf59a1a65 --- /dev/null +++ b/fs/smb/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_SMBFS) += common/ +obj-$(CONFIG_CIFS) += client/ +obj-$(CONFIG_SMB_SERVER) += server/ diff --git a/fs/cifs/Kconfig b/fs/smb/client/Kconfig similarity index 100% rename from fs/cifs/Kconfig rename to fs/smb/client/Kconfig diff --git a/fs/cifs/Makefile b/fs/smb/client/Makefile similarity index 100% rename from fs/cifs/Makefile rename to fs/smb/client/Makefile diff --git a/fs/cifs/asn1.c b/fs/smb/client/asn1.c similarity index 100% rename from fs/cifs/asn1.c rename to fs/smb/client/asn1.c diff --git a/fs/cifs/cached_dir.c b/fs/smb/client/cached_dir.c similarity index 100% rename from fs/cifs/cached_dir.c rename to fs/smb/client/cached_dir.c diff --git a/fs/cifs/cached_dir.h b/fs/smb/client/cached_dir.h similarity index 100% rename from fs/cifs/cached_dir.h rename to fs/smb/client/cached_dir.h diff --git a/fs/cifs/cifs_debug.c b/fs/smb/client/cifs_debug.c similarity index 100% rename from fs/cifs/cifs_debug.c rename to fs/smb/client/cifs_debug.c diff --git a/fs/cifs/cifs_debug.h b/fs/smb/client/cifs_debug.h similarity index 100% rename from fs/cifs/cifs_debug.h rename to fs/smb/client/cifs_debug.h diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/smb/client/cifs_dfs_ref.c similarity index 100% rename from fs/cifs/cifs_dfs_ref.c rename to fs/smb/client/cifs_dfs_ref.c diff --git a/fs/cifs/cifs_fs_sb.h b/fs/smb/client/cifs_fs_sb.h similarity index 100% rename from fs/cifs/cifs_fs_sb.h rename to fs/smb/client/cifs_fs_sb.h diff --git a/fs/cifs/cifs_ioctl.h b/fs/smb/client/cifs_ioctl.h similarity index 100% rename from fs/cifs/cifs_ioctl.h rename to fs/smb/client/cifs_ioctl.h diff --git a/fs/cifs/cifs_spnego.c b/fs/smb/client/cifs_spnego.c similarity index 100% rename from fs/cifs/cifs_spnego.c rename to fs/smb/client/cifs_spnego.c diff --git a/fs/cifs/cifs_spnego.h b/fs/smb/client/cifs_spnego.h similarity index 100% rename from fs/cifs/cifs_spnego.h rename to fs/smb/client/cifs_spnego.h diff --git a/fs/cifs/cifs_spnego_negtokeninit.asn1 b/fs/smb/client/cifs_spnego_negtokeninit.asn1 similarity index 100% rename from fs/cifs/cifs_spnego_negtokeninit.asn1 rename to fs/smb/client/cifs_spnego_negtokeninit.asn1 diff --git a/fs/cifs/cifs_swn.c b/fs/smb/client/cifs_swn.c similarity index 100% rename from fs/cifs/cifs_swn.c rename to fs/smb/client/cifs_swn.c diff --git a/fs/cifs/cifs_swn.h b/fs/smb/client/cifs_swn.h similarity index 100% rename from fs/cifs/cifs_swn.h rename to fs/smb/client/cifs_swn.h diff --git a/fs/cifs/cifs_unicode.c b/fs/smb/client/cifs_unicode.c similarity index 100% rename from fs/cifs/cifs_unicode.c rename to fs/smb/client/cifs_unicode.c diff --git a/fs/cifs/cifs_unicode.h b/fs/smb/client/cifs_unicode.h similarity index 100% rename from fs/cifs/cifs_unicode.h rename to fs/smb/client/cifs_unicode.h diff --git a/fs/cifs/cifs_uniupr.h b/fs/smb/client/cifs_uniupr.h similarity index 100% rename from fs/cifs/cifs_uniupr.h rename to fs/smb/client/cifs_uniupr.h diff --git a/fs/cifs/cifsacl.c b/fs/smb/client/cifsacl.c similarity index 100% rename from fs/cifs/cifsacl.c rename to fs/smb/client/cifsacl.c diff --git a/fs/cifs/cifsacl.h b/fs/smb/client/cifsacl.h similarity index 100% rename from fs/cifs/cifsacl.h rename to fs/smb/client/cifsacl.h diff --git a/fs/cifs/cifsencrypt.c b/fs/smb/client/cifsencrypt.c similarity index 99% rename from fs/cifs/cifsencrypt.c rename to fs/smb/client/cifsencrypt.c index 357bd27a7fd1..ef4c2e3c9fa6 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -21,7 +21,7 @@ #include #include #include -#include "../smbfs_common/arc4.h" +#include "../common/arc4.h" #include /* diff --git a/fs/cifs/cifsfs.c b/fs/smb/client/cifsfs.c similarity index 100% rename from fs/cifs/cifsfs.c rename to fs/smb/client/cifsfs.c diff --git a/fs/cifs/cifsfs.h b/fs/smb/client/cifsfs.h similarity index 100% rename from fs/cifs/cifsfs.h rename to fs/smb/client/cifsfs.h diff --git a/fs/cifs/cifsglob.h b/fs/smb/client/cifsglob.h similarity index 99% rename from fs/cifs/cifsglob.h rename to fs/smb/client/cifsglob.h index 5f8fd20951af..0d84bb1a8cd9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -24,7 +24,7 @@ #include "cifsacl.h" #include #include -#include "../smbfs_common/smb2pdu.h" +#include "../common/smb2pdu.h" #include "smb2pdu.h" #include diff --git a/fs/cifs/cifspdu.h b/fs/smb/client/cifspdu.h similarity index 99% rename from fs/cifs/cifspdu.h rename to fs/smb/client/cifspdu.h index 445e3eaebcc1..e17222fec9d2 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -11,7 +11,7 @@ #include #include -#include "../smbfs_common/smbfsctl.h" +#include "../common/smbfsctl.h" #define CIFS_PROT 0 #define POSIX_PROT (CIFS_PROT+1) diff --git a/fs/cifs/cifsproto.h b/fs/smb/client/cifsproto.h similarity index 100% rename from fs/cifs/cifsproto.h rename to fs/smb/client/cifsproto.h diff --git a/fs/cifs/cifsroot.c b/fs/smb/client/cifsroot.c similarity index 100% rename from fs/cifs/cifsroot.c rename to fs/smb/client/cifsroot.c diff --git a/fs/cifs/cifssmb.c b/fs/smb/client/cifssmb.c similarity index 100% rename from fs/cifs/cifssmb.c rename to fs/smb/client/cifssmb.c diff --git a/fs/cifs/connect.c b/fs/smb/client/connect.c similarity index 100% rename from fs/cifs/connect.c rename to fs/smb/client/connect.c diff --git a/fs/cifs/dfs.c b/fs/smb/client/dfs.c similarity index 100% rename from fs/cifs/dfs.c rename to fs/smb/client/dfs.c diff --git a/fs/cifs/dfs.h b/fs/smb/client/dfs.h similarity index 100% rename from fs/cifs/dfs.h rename to fs/smb/client/dfs.h diff --git a/fs/cifs/dfs_cache.c b/fs/smb/client/dfs_cache.c similarity index 100% rename from fs/cifs/dfs_cache.c rename to fs/smb/client/dfs_cache.c diff --git a/fs/cifs/dfs_cache.h b/fs/smb/client/dfs_cache.h similarity index 100% rename from fs/cifs/dfs_cache.h rename to fs/smb/client/dfs_cache.h diff --git a/fs/cifs/dir.c b/fs/smb/client/dir.c similarity index 100% rename from fs/cifs/dir.c rename to fs/smb/client/dir.c diff --git a/fs/cifs/dns_resolve.c b/fs/smb/client/dns_resolve.c similarity index 100% rename from fs/cifs/dns_resolve.c rename to fs/smb/client/dns_resolve.c diff --git a/fs/cifs/dns_resolve.h b/fs/smb/client/dns_resolve.h similarity index 100% rename from fs/cifs/dns_resolve.h rename to fs/smb/client/dns_resolve.h diff --git a/fs/cifs/export.c b/fs/smb/client/export.c similarity index 100% rename from fs/cifs/export.c rename to fs/smb/client/export.c diff --git a/fs/cifs/file.c b/fs/smb/client/file.c similarity index 100% rename from fs/cifs/file.c rename to fs/smb/client/file.c diff --git a/fs/cifs/fs_context.c b/fs/smb/client/fs_context.c similarity index 100% rename from fs/cifs/fs_context.c rename to fs/smb/client/fs_context.c diff --git a/fs/cifs/fs_context.h b/fs/smb/client/fs_context.h similarity index 100% rename from fs/cifs/fs_context.h rename to fs/smb/client/fs_context.h diff --git a/fs/cifs/fscache.c b/fs/smb/client/fscache.c similarity index 100% rename from fs/cifs/fscache.c rename to fs/smb/client/fscache.c diff --git a/fs/cifs/fscache.h b/fs/smb/client/fscache.h similarity index 100% rename from fs/cifs/fscache.h rename to fs/smb/client/fscache.h diff --git a/fs/cifs/inode.c b/fs/smb/client/inode.c similarity index 100% rename from fs/cifs/inode.c rename to fs/smb/client/inode.c diff --git a/fs/cifs/ioctl.c b/fs/smb/client/ioctl.c similarity index 100% rename from fs/cifs/ioctl.c rename to fs/smb/client/ioctl.c diff --git a/fs/cifs/link.c b/fs/smb/client/link.c similarity index 100% rename from fs/cifs/link.c rename to fs/smb/client/link.c diff --git a/fs/cifs/misc.c b/fs/smb/client/misc.c similarity index 100% rename from fs/cifs/misc.c rename to fs/smb/client/misc.c diff --git a/fs/cifs/netlink.c b/fs/smb/client/netlink.c similarity index 100% rename from fs/cifs/netlink.c rename to fs/smb/client/netlink.c diff --git a/fs/cifs/netlink.h b/fs/smb/client/netlink.h similarity index 100% rename from fs/cifs/netlink.h rename to fs/smb/client/netlink.h diff --git a/fs/cifs/netmisc.c b/fs/smb/client/netmisc.c similarity index 100% rename from fs/cifs/netmisc.c rename to fs/smb/client/netmisc.c diff --git a/fs/cifs/nterr.c b/fs/smb/client/nterr.c similarity index 100% rename from fs/cifs/nterr.c rename to fs/smb/client/nterr.c diff --git a/fs/cifs/nterr.h b/fs/smb/client/nterr.h similarity index 100% rename from fs/cifs/nterr.h rename to fs/smb/client/nterr.h diff --git a/fs/cifs/ntlmssp.h b/fs/smb/client/ntlmssp.h similarity index 100% rename from fs/cifs/ntlmssp.h rename to fs/smb/client/ntlmssp.h diff --git a/fs/cifs/readdir.c b/fs/smb/client/readdir.c similarity index 100% rename from fs/cifs/readdir.c rename to fs/smb/client/readdir.c diff --git a/fs/cifs/rfc1002pdu.h b/fs/smb/client/rfc1002pdu.h similarity index 100% rename from fs/cifs/rfc1002pdu.h rename to fs/smb/client/rfc1002pdu.h diff --git a/fs/cifs/sess.c b/fs/smb/client/sess.c similarity index 100% rename from fs/cifs/sess.c rename to fs/smb/client/sess.c diff --git a/fs/cifs/smb1ops.c b/fs/smb/client/smb1ops.c similarity index 100% rename from fs/cifs/smb1ops.c rename to fs/smb/client/smb1ops.c diff --git a/fs/cifs/smb2file.c b/fs/smb/client/smb2file.c similarity index 100% rename from fs/cifs/smb2file.c rename to fs/smb/client/smb2file.c diff --git a/fs/cifs/smb2glob.h b/fs/smb/client/smb2glob.h similarity index 100% rename from fs/cifs/smb2glob.h rename to fs/smb/client/smb2glob.h diff --git a/fs/cifs/smb2inode.c b/fs/smb/client/smb2inode.c similarity index 100% rename from fs/cifs/smb2inode.c rename to fs/smb/client/smb2inode.c diff --git a/fs/cifs/smb2maperror.c b/fs/smb/client/smb2maperror.c similarity index 100% rename from fs/cifs/smb2maperror.c rename to fs/smb/client/smb2maperror.c diff --git a/fs/cifs/smb2misc.c b/fs/smb/client/smb2misc.c similarity index 100% rename from fs/cifs/smb2misc.c rename to fs/smb/client/smb2misc.c diff --git a/fs/cifs/smb2ops.c b/fs/smb/client/smb2ops.c similarity index 100% rename from fs/cifs/smb2ops.c rename to fs/smb/client/smb2ops.c diff --git a/fs/cifs/smb2pdu.c b/fs/smb/client/smb2pdu.c similarity index 100% rename from fs/cifs/smb2pdu.c rename to fs/smb/client/smb2pdu.c diff --git a/fs/cifs/smb2pdu.h b/fs/smb/client/smb2pdu.h similarity index 100% rename from fs/cifs/smb2pdu.h rename to fs/smb/client/smb2pdu.h diff --git a/fs/cifs/smb2proto.h b/fs/smb/client/smb2proto.h similarity index 100% rename from fs/cifs/smb2proto.h rename to fs/smb/client/smb2proto.h diff --git a/fs/cifs/smb2status.h b/fs/smb/client/smb2status.h similarity index 100% rename from fs/cifs/smb2status.h rename to fs/smb/client/smb2status.h diff --git a/fs/cifs/smb2transport.c b/fs/smb/client/smb2transport.c similarity index 100% rename from fs/cifs/smb2transport.c rename to fs/smb/client/smb2transport.c diff --git a/fs/cifs/smbdirect.c b/fs/smb/client/smbdirect.c similarity index 100% rename from fs/cifs/smbdirect.c rename to fs/smb/client/smbdirect.c diff --git a/fs/cifs/smbdirect.h b/fs/smb/client/smbdirect.h similarity index 100% rename from fs/cifs/smbdirect.h rename to fs/smb/client/smbdirect.h diff --git a/fs/cifs/smbencrypt.c b/fs/smb/client/smbencrypt.c similarity index 98% rename from fs/cifs/smbencrypt.c rename to fs/smb/client/smbencrypt.c index 4a0487753869..f0ce26414f17 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/smb/client/smbencrypt.c @@ -24,7 +24,7 @@ #include "cifsglob.h" #include "cifs_debug.h" #include "cifsproto.h" -#include "../smbfs_common/md4.h" +#include "../common/md4.h" #ifndef false #define false 0 diff --git a/fs/cifs/smberr.h b/fs/smb/client/smberr.h similarity index 100% rename from fs/cifs/smberr.h rename to fs/smb/client/smberr.h diff --git a/fs/cifs/trace.c b/fs/smb/client/trace.c similarity index 100% rename from fs/cifs/trace.c rename to fs/smb/client/trace.c diff --git a/fs/cifs/trace.h b/fs/smb/client/trace.h similarity index 100% rename from fs/cifs/trace.h rename to fs/smb/client/trace.h diff --git a/fs/cifs/transport.c b/fs/smb/client/transport.c similarity index 100% rename from fs/cifs/transport.c rename to fs/smb/client/transport.c diff --git a/fs/cifs/unc.c b/fs/smb/client/unc.c similarity index 100% rename from fs/cifs/unc.c rename to fs/smb/client/unc.c diff --git a/fs/cifs/winucase.c b/fs/smb/client/winucase.c similarity index 100% rename from fs/cifs/winucase.c rename to fs/smb/client/winucase.c diff --git a/fs/cifs/xattr.c b/fs/smb/client/xattr.c similarity index 100% rename from fs/cifs/xattr.c rename to fs/smb/client/xattr.c diff --git a/fs/smbfs_common/Makefile b/fs/smb/common/Makefile similarity index 59% rename from fs/smbfs_common/Makefile rename to fs/smb/common/Makefile index cafc61a3bfc3..c66dbbc1469c 100644 --- a/fs/smbfs_common/Makefile +++ b/fs/smb/common/Makefile @@ -3,5 +3,5 @@ # Makefile for Linux filesystem routines that are shared by client and server. # -obj-$(CONFIG_SMBFS_COMMON) += cifs_arc4.o -obj-$(CONFIG_SMBFS_COMMON) += cifs_md4.o +obj-$(CONFIG_SMBFS) += cifs_arc4.o +obj-$(CONFIG_SMBFS) += cifs_md4.o diff --git a/fs/smbfs_common/arc4.h b/fs/smb/common/arc4.h similarity index 100% rename from fs/smbfs_common/arc4.h rename to fs/smb/common/arc4.h diff --git a/fs/smbfs_common/cifs_arc4.c b/fs/smb/common/cifs_arc4.c similarity index 100% rename from fs/smbfs_common/cifs_arc4.c rename to fs/smb/common/cifs_arc4.c diff --git a/fs/smbfs_common/cifs_md4.c b/fs/smb/common/cifs_md4.c similarity index 100% rename from fs/smbfs_common/cifs_md4.c rename to fs/smb/common/cifs_md4.c diff --git a/fs/smbfs_common/md4.h b/fs/smb/common/md4.h similarity index 100% rename from fs/smbfs_common/md4.h rename to fs/smb/common/md4.h diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smb/common/smb2pdu.h similarity index 100% rename from fs/smbfs_common/smb2pdu.h rename to fs/smb/common/smb2pdu.h diff --git a/fs/smbfs_common/smbfsctl.h b/fs/smb/common/smbfsctl.h similarity index 100% rename from fs/smbfs_common/smbfsctl.h rename to fs/smb/common/smbfsctl.h diff --git a/fs/ksmbd/Kconfig b/fs/smb/server/Kconfig similarity index 100% rename from fs/ksmbd/Kconfig rename to fs/smb/server/Kconfig diff --git a/fs/ksmbd/Makefile b/fs/smb/server/Makefile similarity index 100% rename from fs/ksmbd/Makefile rename to fs/smb/server/Makefile diff --git a/fs/ksmbd/asn1.c b/fs/smb/server/asn1.c similarity index 100% rename from fs/ksmbd/asn1.c rename to fs/smb/server/asn1.c diff --git a/fs/ksmbd/asn1.h b/fs/smb/server/asn1.h similarity index 100% rename from fs/ksmbd/asn1.h rename to fs/smb/server/asn1.h diff --git a/fs/ksmbd/auth.c b/fs/smb/server/auth.c similarity index 99% rename from fs/ksmbd/auth.c rename to fs/smb/server/auth.c index df8fb076f6f1..5e5e120edcc2 100644 --- a/fs/ksmbd/auth.c +++ b/fs/smb/server/auth.c @@ -29,7 +29,7 @@ #include "mgmt/user_config.h" #include "crypto_ctx.h" #include "transport_ipc.h" -#include "../smbfs_common/arc4.h" +#include "../common/arc4.h" /* * Fixed format data defining GSS header and fixed string diff --git a/fs/ksmbd/auth.h b/fs/smb/server/auth.h similarity index 100% rename from fs/ksmbd/auth.h rename to fs/smb/server/auth.h diff --git a/fs/ksmbd/connection.c b/fs/smb/server/connection.c similarity index 100% rename from fs/ksmbd/connection.c rename to fs/smb/server/connection.c diff --git a/fs/ksmbd/connection.h b/fs/smb/server/connection.h similarity index 100% rename from fs/ksmbd/connection.h rename to fs/smb/server/connection.h diff --git a/fs/ksmbd/crypto_ctx.c b/fs/smb/server/crypto_ctx.c similarity index 100% rename from fs/ksmbd/crypto_ctx.c rename to fs/smb/server/crypto_ctx.c diff --git a/fs/ksmbd/crypto_ctx.h b/fs/smb/server/crypto_ctx.h similarity index 100% rename from fs/ksmbd/crypto_ctx.h rename to fs/smb/server/crypto_ctx.h diff --git a/fs/ksmbd/glob.h b/fs/smb/server/glob.h similarity index 100% rename from fs/ksmbd/glob.h rename to fs/smb/server/glob.h diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h similarity index 100% rename from fs/ksmbd/ksmbd_netlink.h rename to fs/smb/server/ksmbd_netlink.h diff --git a/fs/ksmbd/ksmbd_spnego_negtokeninit.asn1 b/fs/smb/server/ksmbd_spnego_negtokeninit.asn1 similarity index 100% rename from fs/ksmbd/ksmbd_spnego_negtokeninit.asn1 rename to fs/smb/server/ksmbd_spnego_negtokeninit.asn1 diff --git a/fs/ksmbd/ksmbd_spnego_negtokentarg.asn1 b/fs/smb/server/ksmbd_spnego_negtokentarg.asn1 similarity index 100% rename from fs/ksmbd/ksmbd_spnego_negtokentarg.asn1 rename to fs/smb/server/ksmbd_spnego_negtokentarg.asn1 diff --git a/fs/ksmbd/ksmbd_work.c b/fs/smb/server/ksmbd_work.c similarity index 100% rename from fs/ksmbd/ksmbd_work.c rename to fs/smb/server/ksmbd_work.c diff --git a/fs/ksmbd/ksmbd_work.h b/fs/smb/server/ksmbd_work.h similarity index 100% rename from fs/ksmbd/ksmbd_work.h rename to fs/smb/server/ksmbd_work.h diff --git a/fs/ksmbd/mgmt/ksmbd_ida.c b/fs/smb/server/mgmt/ksmbd_ida.c similarity index 100% rename from fs/ksmbd/mgmt/ksmbd_ida.c rename to fs/smb/server/mgmt/ksmbd_ida.c diff --git a/fs/ksmbd/mgmt/ksmbd_ida.h b/fs/smb/server/mgmt/ksmbd_ida.h similarity index 100% rename from fs/ksmbd/mgmt/ksmbd_ida.h rename to fs/smb/server/mgmt/ksmbd_ida.h diff --git a/fs/ksmbd/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c similarity index 100% rename from fs/ksmbd/mgmt/share_config.c rename to fs/smb/server/mgmt/share_config.c diff --git a/fs/ksmbd/mgmt/share_config.h b/fs/smb/server/mgmt/share_config.h similarity index 100% rename from fs/ksmbd/mgmt/share_config.h rename to fs/smb/server/mgmt/share_config.h diff --git a/fs/ksmbd/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c similarity index 100% rename from fs/ksmbd/mgmt/tree_connect.c rename to fs/smb/server/mgmt/tree_connect.c diff --git a/fs/ksmbd/mgmt/tree_connect.h b/fs/smb/server/mgmt/tree_connect.h similarity index 100% rename from fs/ksmbd/mgmt/tree_connect.h rename to fs/smb/server/mgmt/tree_connect.h diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/smb/server/mgmt/user_config.c similarity index 100% rename from fs/ksmbd/mgmt/user_config.c rename to fs/smb/server/mgmt/user_config.c diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/smb/server/mgmt/user_config.h similarity index 100% rename from fs/ksmbd/mgmt/user_config.h rename to fs/smb/server/mgmt/user_config.h diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c similarity index 100% rename from fs/ksmbd/mgmt/user_session.c rename to fs/smb/server/mgmt/user_session.c diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h similarity index 100% rename from fs/ksmbd/mgmt/user_session.h rename to fs/smb/server/mgmt/user_session.h diff --git a/fs/ksmbd/misc.c b/fs/smb/server/misc.c similarity index 100% rename from fs/ksmbd/misc.c rename to fs/smb/server/misc.c diff --git a/fs/ksmbd/misc.h b/fs/smb/server/misc.h similarity index 100% rename from fs/ksmbd/misc.h rename to fs/smb/server/misc.h diff --git a/fs/ksmbd/ndr.c b/fs/smb/server/ndr.c similarity index 100% rename from fs/ksmbd/ndr.c rename to fs/smb/server/ndr.c diff --git a/fs/ksmbd/ndr.h b/fs/smb/server/ndr.h similarity index 100% rename from fs/ksmbd/ndr.h rename to fs/smb/server/ndr.h diff --git a/fs/ksmbd/nterr.h b/fs/smb/server/nterr.h similarity index 100% rename from fs/ksmbd/nterr.h rename to fs/smb/server/nterr.h diff --git a/fs/ksmbd/ntlmssp.h b/fs/smb/server/ntlmssp.h similarity index 100% rename from fs/ksmbd/ntlmssp.h rename to fs/smb/server/ntlmssp.h diff --git a/fs/ksmbd/oplock.c b/fs/smb/server/oplock.c similarity index 100% rename from fs/ksmbd/oplock.c rename to fs/smb/server/oplock.c diff --git a/fs/ksmbd/oplock.h b/fs/smb/server/oplock.h similarity index 100% rename from fs/ksmbd/oplock.h rename to fs/smb/server/oplock.h diff --git a/fs/ksmbd/server.c b/fs/smb/server/server.c similarity index 100% rename from fs/ksmbd/server.c rename to fs/smb/server/server.c diff --git a/fs/ksmbd/server.h b/fs/smb/server/server.h similarity index 100% rename from fs/ksmbd/server.h rename to fs/smb/server/server.h diff --git a/fs/ksmbd/smb2misc.c b/fs/smb/server/smb2misc.c similarity index 100% rename from fs/ksmbd/smb2misc.c rename to fs/smb/server/smb2misc.c diff --git a/fs/ksmbd/smb2ops.c b/fs/smb/server/smb2ops.c similarity index 100% rename from fs/ksmbd/smb2ops.c rename to fs/smb/server/smb2ops.c diff --git a/fs/ksmbd/smb2pdu.c b/fs/smb/server/smb2pdu.c similarity index 100% rename from fs/ksmbd/smb2pdu.c rename to fs/smb/server/smb2pdu.c diff --git a/fs/ksmbd/smb2pdu.h b/fs/smb/server/smb2pdu.h similarity index 100% rename from fs/ksmbd/smb2pdu.h rename to fs/smb/server/smb2pdu.h diff --git a/fs/ksmbd/smb_common.c b/fs/smb/server/smb_common.c similarity index 100% rename from fs/ksmbd/smb_common.c rename to fs/smb/server/smb_common.c diff --git a/fs/ksmbd/smb_common.h b/fs/smb/server/smb_common.h similarity index 99% rename from fs/ksmbd/smb_common.h rename to fs/smb/server/smb_common.h index 9130d2e3cd78..6b0d5f1fe85c 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -10,7 +10,7 @@ #include "glob.h" #include "nterr.h" -#include "../smbfs_common/smb2pdu.h" +#include "../common/smb2pdu.h" #include "smb2pdu.h" /* ksmbd's Specific ERRNO */ diff --git a/fs/ksmbd/smbacl.c b/fs/smb/server/smbacl.c similarity index 100% rename from fs/ksmbd/smbacl.c rename to fs/smb/server/smbacl.c diff --git a/fs/ksmbd/smbacl.h b/fs/smb/server/smbacl.h similarity index 100% rename from fs/ksmbd/smbacl.h rename to fs/smb/server/smbacl.h diff --git a/fs/ksmbd/smbfsctl.h b/fs/smb/server/smbfsctl.h similarity index 100% rename from fs/ksmbd/smbfsctl.h rename to fs/smb/server/smbfsctl.h diff --git a/fs/ksmbd/smbstatus.h b/fs/smb/server/smbstatus.h similarity index 100% rename from fs/ksmbd/smbstatus.h rename to fs/smb/server/smbstatus.h diff --git a/fs/ksmbd/transport_ipc.c b/fs/smb/server/transport_ipc.c similarity index 100% rename from fs/ksmbd/transport_ipc.c rename to fs/smb/server/transport_ipc.c diff --git a/fs/ksmbd/transport_ipc.h b/fs/smb/server/transport_ipc.h similarity index 100% rename from fs/ksmbd/transport_ipc.h rename to fs/smb/server/transport_ipc.h diff --git a/fs/ksmbd/transport_rdma.c b/fs/smb/server/transport_rdma.c similarity index 100% rename from fs/ksmbd/transport_rdma.c rename to fs/smb/server/transport_rdma.c diff --git a/fs/ksmbd/transport_rdma.h b/fs/smb/server/transport_rdma.h similarity index 100% rename from fs/ksmbd/transport_rdma.h rename to fs/smb/server/transport_rdma.h diff --git a/fs/ksmbd/transport_tcp.c b/fs/smb/server/transport_tcp.c similarity index 100% rename from fs/ksmbd/transport_tcp.c rename to fs/smb/server/transport_tcp.c diff --git a/fs/ksmbd/transport_tcp.h b/fs/smb/server/transport_tcp.h similarity index 100% rename from fs/ksmbd/transport_tcp.h rename to fs/smb/server/transport_tcp.h diff --git a/fs/ksmbd/unicode.c b/fs/smb/server/unicode.c similarity index 100% rename from fs/ksmbd/unicode.c rename to fs/smb/server/unicode.c diff --git a/fs/ksmbd/unicode.h b/fs/smb/server/unicode.h similarity index 100% rename from fs/ksmbd/unicode.h rename to fs/smb/server/unicode.h diff --git a/fs/ksmbd/uniupr.h b/fs/smb/server/uniupr.h similarity index 100% rename from fs/ksmbd/uniupr.h rename to fs/smb/server/uniupr.h diff --git a/fs/ksmbd/vfs.c b/fs/smb/server/vfs.c similarity index 100% rename from fs/ksmbd/vfs.c rename to fs/smb/server/vfs.c diff --git a/fs/ksmbd/vfs.h b/fs/smb/server/vfs.h similarity index 100% rename from fs/ksmbd/vfs.h rename to fs/smb/server/vfs.h diff --git a/fs/ksmbd/vfs_cache.c b/fs/smb/server/vfs_cache.c similarity index 100% rename from fs/ksmbd/vfs_cache.c rename to fs/smb/server/vfs_cache.c diff --git a/fs/ksmbd/vfs_cache.h b/fs/smb/server/vfs_cache.h similarity index 100% rename from fs/ksmbd/vfs_cache.h rename to fs/smb/server/vfs_cache.h diff --git a/fs/ksmbd/xattr.h b/fs/smb/server/xattr.h similarity index 100% rename from fs/ksmbd/xattr.h rename to fs/smb/server/xattr.h From bf8a352d4997147b1e63020ed17cb4ee94392608 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 21 May 2023 22:52:04 -0500 Subject: [PATCH 146/271] cifs: correct references in Documentation to old fs/cifs path The fs/cifs directory has moved to fs/smb/client, correct mentions of this in Documentation and comments. Acked-by: Namjae Jeon Signed-off-by: Steve French --- Documentation/admin-guide/cifs/changes.rst | 4 ++-- Documentation/admin-guide/cifs/usage.rst | 8 ++++---- Documentation/filesystems/cifs/cifsroot.rst | 2 +- Documentation/userspace-api/ioctl/ioctl-number.rst | 2 +- fs/smb/server/smbfsctl.h | 2 +- fs/smb/server/smbstatus.h | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/cifs/changes.rst b/Documentation/admin-guide/cifs/changes.rst index 3147bbae9c43..8c42c4de510b 100644 --- a/Documentation/admin-guide/cifs/changes.rst +++ b/Documentation/admin-guide/cifs/changes.rst @@ -5,5 +5,5 @@ Changes See https://wiki.samba.org/index.php/LinuxCIFSKernel for summary information about fixes/improvements to CIFS/SMB2/SMB3 support (changes to cifs.ko module) by kernel version (and cifs internal module version). -This may be easier to read than parsing the output of "git log fs/cifs" -by release. +This may be easier to read than parsing the output of +"git log fs/smb/client" by release. diff --git a/Documentation/admin-guide/cifs/usage.rst b/Documentation/admin-guide/cifs/usage.rst index 2e151cd8c2e4..5f936b4b6018 100644 --- a/Documentation/admin-guide/cifs/usage.rst +++ b/Documentation/admin-guide/cifs/usage.rst @@ -45,7 +45,7 @@ Installation instructions If you have built the CIFS vfs as module (successfully) simply type ``make modules_install`` (or if you prefer, manually copy the file to -the modules directory e.g. /lib/modules/2.4.10-4GB/kernel/fs/cifs/cifs.ko). +the modules directory e.g. /lib/modules/6.3.0-060300-generic/kernel/fs/smb/client/cifs.ko). If you have built the CIFS vfs into the kernel itself, follow the instructions for your distribution on how to install a new kernel (usually you @@ -66,15 +66,15 @@ If cifs is built as a module, then the size and number of network buffers and maximum number of simultaneous requests to one server can be configured. Changing these from their defaults is not recommended. By executing modinfo:: - modinfo kernel/fs/cifs/cifs.ko + modinfo -on kernel/fs/cifs/cifs.ko the list of configuration changes that can be made +on kernel/fs/smb/client/cifs.ko the list of configuration changes that can be made at module initialization time (by running insmod cifs.ko) can be seen. Recommendations =============== -To improve security the SMB2.1 dialect or later (usually will get SMB3) is now +To improve security the SMB2.1 dialect or later (usually will get SMB3.1.1) is now the new default. To use old dialects (e.g. to mount Windows XP) use "vers=1.0" on mount (or vers=2.0 for Windows Vista). Note that the CIFS (vers=1.0) is much older and less secure than the default dialect SMB3 which includes diff --git a/Documentation/filesystems/cifs/cifsroot.rst b/Documentation/filesystems/cifs/cifsroot.rst index 4930bb443134..bf2d9db3acb9 100644 --- a/Documentation/filesystems/cifs/cifsroot.rst +++ b/Documentation/filesystems/cifs/cifsroot.rst @@ -59,7 +59,7 @@ the root file system via SMB protocol. Enables the kernel to mount the root file system via SMB that are located in the and specified in this option. -The default mount options are set in fs/cifs/cifsroot.c. +The default mount options are set in fs/smb/client/cifsroot.c. server-ip IPv4 address of the server. diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 176e8fc3f31b..4f7b23faebb9 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -363,7 +363,7 @@ Code Seq# Include File Comments 0xCC 00-0F drivers/misc/ibmvmc.h pseries VMC driver 0xCD 01 linux/reiserfs_fs.h 0xCE 01-02 uapi/linux/cxl_mem.h Compute Express Link Memory Devices -0xCF 02 fs/cifs/ioctl.c +0xCF 02 fs/smb/client/cifs_ioctl.h 0xDB 00-0F drivers/char/mwave/mwavepub.h 0xDD 00-3F ZFCP device driver see drivers/s390/scsi/ diff --git a/fs/smb/server/smbfsctl.h b/fs/smb/server/smbfsctl.h index b98418aae20c..ecdf8f6e0df4 100644 --- a/fs/smb/server/smbfsctl.h +++ b/fs/smb/server/smbfsctl.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: LGPL-2.1+ */ /* - * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions + * fs/smb/server/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions * * Copyright (c) International Business Machines Corp., 2002,2009 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/smb/server/smbstatus.h b/fs/smb/server/smbstatus.h index 108a8b6ed24a..8963deb42404 100644 --- a/fs/smb/server/smbstatus.h +++ b/fs/smb/server/smbstatus.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: LGPL-2.1+ */ /* - * fs/cifs/smb2status.h + * fs/server/smb2status.h * * SMB2 Status code (network error) definitions * Definitions are from MS-ERREF From ab6cacf833ba337b41700ee193d2c8936f1d049e Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 22 May 2023 09:50:33 -0500 Subject: [PATCH 147/271] smb3: move Documentation/filesystems/cifs to Documentation/filesystems/smb Documentation/filesystems/cifs contains both server and client information so its pathname is misleading. In addition, the directory fs/smb now contains both server and client, so move Documentation/filesystems/cifs to Documentation/filesystems/smb Suggested-by: Namjae Jeon Acked-by: Namjae Jeon Signed-off-by: Steve French --- Documentation/filesystems/index.rst | 2 +- Documentation/filesystems/{cifs => smb}/cifsroot.rst | 0 Documentation/filesystems/{cifs => smb}/index.rst | 0 Documentation/filesystems/{cifs => smb}/ksmbd.rst | 0 MAINTAINERS | 2 +- 5 files changed, 2 insertions(+), 2 deletions(-) rename Documentation/filesystems/{cifs => smb}/cifsroot.rst (100%) rename Documentation/filesystems/{cifs => smb}/index.rst (100%) rename Documentation/filesystems/{cifs => smb}/ksmbd.rst (100%) diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index fbb2b5ada95b..eb252fc972aa 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -72,7 +72,6 @@ Documentation for filesystem implementations. befs bfs btrfs - cifs/index ceph coda configfs @@ -111,6 +110,7 @@ Documentation for filesystem implementations. ramfs-rootfs-initramfs relay romfs + smb/index spufs/index squashfs sysfs diff --git a/Documentation/filesystems/cifs/cifsroot.rst b/Documentation/filesystems/smb/cifsroot.rst similarity index 100% rename from Documentation/filesystems/cifs/cifsroot.rst rename to Documentation/filesystems/smb/cifsroot.rst diff --git a/Documentation/filesystems/cifs/index.rst b/Documentation/filesystems/smb/index.rst similarity index 100% rename from Documentation/filesystems/cifs/index.rst rename to Documentation/filesystems/smb/index.rst diff --git a/Documentation/filesystems/cifs/ksmbd.rst b/Documentation/filesystems/smb/ksmbd.rst similarity index 100% rename from Documentation/filesystems/cifs/ksmbd.rst rename to Documentation/filesystems/smb/ksmbd.rst diff --git a/MAINTAINERS b/MAINTAINERS index 902f763e845d..6152a4251ce7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11300,7 +11300,7 @@ R: Tom Talpey L: linux-cifs@vger.kernel.org S: Maintained T: git git://git.samba.org/ksmbd.git -F: Documentation/filesystems/cifs/ksmbd.rst +F: Documentation/filesystems/smb/ksmbd.rst F: fs/smb/common/ F: fs/smb/server/ From e2ab5aa11f191b54514f063a5b5c29f3559f4ab7 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Wed, 1 Mar 2023 10:50:53 +0200 Subject: [PATCH 148/271] net/mlx5e: Extract remaining tunnel encap code to dedicated file Move set_encap_dests() and clean_encap_dests() to the tunnel encap dedicated file. And rename them to mlx5e_tc_tun_encap_dests_set() and mlx5e_tc_tun_encap_dests_unset(). No functional change in this patch. It is needed in the next patch. Signed-off-by: Chris Mi Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en/tc_tun_encap.c | 83 +++++++++++++++++ .../mellanox/mlx5/core/en/tc_tun_encap.h | 9 ++ .../net/ethernet/mellanox/mlx5/core/en_tc.c | 89 +------------------ 3 files changed, 94 insertions(+), 87 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c index 6a052c6cfc15..d516227b8fe8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -1016,6 +1016,89 @@ out_err: return err; } +int mlx5e_tc_tun_encap_dests_set(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + struct netlink_ext_ack *extack, + bool *vf_tun) +{ + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5_esw_flow_attr *esw_attr; + struct net_device *encap_dev = NULL; + struct mlx5e_rep_priv *rpriv; + struct mlx5e_priv *out_priv; + int out_index; + int err = 0; + + if (!mlx5e_is_eswitch_flow(flow)) + return 0; + + parse_attr = attr->parse_attr; + esw_attr = attr->esw_attr; + *vf_tun = false; + + for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { + struct net_device *out_dev; + int mirred_ifindex; + + if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP)) + continue; + + mirred_ifindex = parse_attr->mirred_ifindex[out_index]; + out_dev = dev_get_by_index(dev_net(priv->netdev), mirred_ifindex); + if (!out_dev) { + NL_SET_ERR_MSG_MOD(extack, "Requested mirred device not found"); + err = -ENODEV; + goto out; + } + err = mlx5e_attach_encap(priv, flow, attr, out_dev, out_index, + extack, &encap_dev); + dev_put(out_dev); + if (err) + goto out; + + if (esw_attr->dests[out_index].flags & + MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE && + !esw_attr->dest_int_port) + *vf_tun = true; + + out_priv = netdev_priv(encap_dev); + rpriv = out_priv->ppriv; + esw_attr->dests[out_index].rep = rpriv->rep; + esw_attr->dests[out_index].mdev = out_priv->mdev; + } + + if (*vf_tun && esw_attr->out_count > 1) { + NL_SET_ERR_MSG_MOD(extack, "VF tunnel encap with mirroring is not supported"); + err = -EOPNOTSUPP; + goto out; + } + +out: + return err; +} + +void mlx5e_tc_tun_encap_dests_unset(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr; + int out_index; + + if (!mlx5e_is_eswitch_flow(flow)) + return; + + esw_attr = attr->esw_attr; + + for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { + if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP)) + continue; + + mlx5e_detach_encap(flow->priv, flow, attr, out_index); + kfree(attr->parse_attr->tun_info[out_index]); + } +} + static int cmp_route_info(struct mlx5e_route_key *a, struct mlx5e_route_key *b) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h index 8ad273dde40e..5d7d67687cbc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h @@ -30,6 +30,15 @@ int mlx5e_attach_decap_route(struct mlx5e_priv *priv, void mlx5e_detach_decap_route(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow); +int mlx5e_tc_tun_encap_dests_set(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + struct netlink_ext_ack *extack, + bool *vf_tun); +void mlx5e_tc_tun_encap_dests_unset(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr); + struct ip_tunnel_info *mlx5e_dup_tun_info(const struct ip_tunnel_info *tun_info); int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index e95414ef1f04..8935156f8f4e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1699,91 +1699,6 @@ int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *ro return mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport); } -static int -set_encap_dests(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - struct mlx5_flow_attr *attr, - struct netlink_ext_ack *extack, - bool *vf_tun) -{ - struct mlx5e_tc_flow_parse_attr *parse_attr; - struct mlx5_esw_flow_attr *esw_attr; - struct net_device *encap_dev = NULL; - struct mlx5e_rep_priv *rpriv; - struct mlx5e_priv *out_priv; - int out_index; - int err = 0; - - if (!mlx5e_is_eswitch_flow(flow)) - return 0; - - parse_attr = attr->parse_attr; - esw_attr = attr->esw_attr; - *vf_tun = false; - - for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { - struct net_device *out_dev; - int mirred_ifindex; - - if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP)) - continue; - - mirred_ifindex = parse_attr->mirred_ifindex[out_index]; - out_dev = dev_get_by_index(dev_net(priv->netdev), mirred_ifindex); - if (!out_dev) { - NL_SET_ERR_MSG_MOD(extack, "Requested mirred device not found"); - err = -ENODEV; - goto out; - } - err = mlx5e_attach_encap(priv, flow, attr, out_dev, out_index, - extack, &encap_dev); - dev_put(out_dev); - if (err) - goto out; - - if (esw_attr->dests[out_index].flags & - MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE && - !esw_attr->dest_int_port) - *vf_tun = true; - - out_priv = netdev_priv(encap_dev); - rpriv = out_priv->ppriv; - esw_attr->dests[out_index].rep = rpriv->rep; - esw_attr->dests[out_index].mdev = out_priv->mdev; - } - - if (*vf_tun && esw_attr->out_count > 1) { - NL_SET_ERR_MSG_MOD(extack, "VF tunnel encap with mirroring is not supported"); - err = -EOPNOTSUPP; - goto out; - } - -out: - return err; -} - -static void -clean_encap_dests(struct mlx5e_priv *priv, - struct mlx5e_tc_flow *flow, - struct mlx5_flow_attr *attr) -{ - struct mlx5_esw_flow_attr *esw_attr; - int out_index; - - if (!mlx5e_is_eswitch_flow(flow)) - return; - - esw_attr = attr->esw_attr; - - for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { - if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP)) - continue; - - mlx5e_detach_encap(priv, flow, attr, out_index); - kfree(attr->parse_attr->tun_info[out_index]); - } -} - static int verify_attr_actions(u32 actions, struct netlink_ext_ack *extack) { @@ -1820,7 +1735,7 @@ post_process_attr(struct mlx5e_tc_flow *flow, if (err) goto err_out; - err = set_encap_dests(flow->priv, flow, attr, extack, &vf_tun); + err = mlx5e_tc_tun_encap_dests_set(flow->priv, flow, attr, extack, &vf_tun); if (err) goto err_out; @@ -4324,7 +4239,7 @@ mlx5_free_flow_attr_actions(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr *a if (attr->post_act_handle) mlx5e_tc_post_act_del(get_post_action(flow->priv), attr->post_act_handle); - clean_encap_dests(flow->priv, flow, attr); + mlx5e_tc_tun_encap_dests_unset(flow->priv, flow, attr); if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) mlx5_fc_destroy(counter_dev, attr->counter); From 37c3b9fa7ccf5caad6d87ba4d42bf00be46be1cf Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Tue, 21 Feb 2023 04:41:41 +0200 Subject: [PATCH 149/271] net/mlx5e: Prevent encap offload when neigh update is running The cited commit adds a compeletion to remove dependency on rtnl lock. But it causes a deadlock for multiple encapsulations: crash> bt ffff8aece8a64000 PID: 1514557 TASK: ffff8aece8a64000 CPU: 3 COMMAND: "tc" #0 [ffffa6d14183f368] __schedule at ffffffffb8ba7f45 #1 [ffffa6d14183f3f8] schedule at ffffffffb8ba8418 #2 [ffffa6d14183f418] schedule_preempt_disabled at ffffffffb8ba8898 #3 [ffffa6d14183f428] __mutex_lock at ffffffffb8baa7f8 #4 [ffffa6d14183f4d0] mutex_lock_nested at ffffffffb8baabeb #5 [ffffa6d14183f4e0] mlx5e_attach_encap at ffffffffc0f48c17 [mlx5_core] #6 [ffffa6d14183f628] mlx5e_tc_add_fdb_flow at ffffffffc0f39680 [mlx5_core] #7 [ffffa6d14183f688] __mlx5e_add_fdb_flow at ffffffffc0f3b636 [mlx5_core] #8 [ffffa6d14183f6f0] mlx5e_tc_add_flow at ffffffffc0f3bcdf [mlx5_core] #9 [ffffa6d14183f728] mlx5e_configure_flower at ffffffffc0f3c1d1 [mlx5_core] #10 [ffffa6d14183f790] mlx5e_rep_setup_tc_cls_flower at ffffffffc0f3d529 [mlx5_core] #11 [ffffa6d14183f7a0] mlx5e_rep_setup_tc_cb at ffffffffc0f3d714 [mlx5_core] #12 [ffffa6d14183f7b0] tc_setup_cb_add at ffffffffb8931bb8 #13 [ffffa6d14183f810] fl_hw_replace_filter at ffffffffc0dae901 [cls_flower] #14 [ffffa6d14183f8d8] fl_change at ffffffffc0db5c57 [cls_flower] #15 [ffffa6d14183f970] tc_new_tfilter at ffffffffb8936047 #16 [ffffa6d14183fac8] rtnetlink_rcv_msg at ffffffffb88c7c31 #17 [ffffa6d14183fb50] netlink_rcv_skb at ffffffffb8942853 #18 [ffffa6d14183fbc0] rtnetlink_rcv at ffffffffb88c1835 #19 [ffffa6d14183fbd0] netlink_unicast at ffffffffb8941f27 #20 [ffffa6d14183fc18] netlink_sendmsg at ffffffffb8942245 #21 [ffffa6d14183fc98] sock_sendmsg at ffffffffb887d482 #22 [ffffa6d14183fcb8] ____sys_sendmsg at ffffffffb887d81a #23 [ffffa6d14183fd38] ___sys_sendmsg at ffffffffb88806e2 #24 [ffffa6d14183fe90] __sys_sendmsg at ffffffffb88807a2 #25 [ffffa6d14183ff28] __x64_sys_sendmsg at ffffffffb888080f #26 [ffffa6d14183ff38] do_syscall_64 at ffffffffb8b9b6a8 #27 [ffffa6d14183ff50] entry_SYSCALL_64_after_hwframe at ffffffffb8c0007c crash> bt 0xffff8aeb07544000 PID: 1110766 TASK: ffff8aeb07544000 CPU: 0 COMMAND: "kworker/u20:9" #0 [ffffa6d14e6b7bd8] __schedule at ffffffffb8ba7f45 #1 [ffffa6d14e6b7c68] schedule at ffffffffb8ba8418 #2 [ffffa6d14e6b7c88] schedule_timeout at ffffffffb8baef88 #3 [ffffa6d14e6b7d10] wait_for_completion at ffffffffb8ba968b #4 [ffffa6d14e6b7d60] mlx5e_take_all_encap_flows at ffffffffc0f47ec4 [mlx5_core] #5 [ffffa6d14e6b7da0] mlx5e_rep_update_flows at ffffffffc0f3e734 [mlx5_core] #6 [ffffa6d14e6b7df8] mlx5e_rep_neigh_update at ffffffffc0f400bb [mlx5_core] #7 [ffffa6d14e6b7e50] process_one_work at ffffffffb80acc9c #8 [ffffa6d14e6b7ed0] worker_thread at ffffffffb80ad012 #9 [ffffa6d14e6b7f10] kthread at ffffffffb80b615d #10 [ffffa6d14e6b7f50] ret_from_fork at ffffffffb8001b2f After the first encap is attached, flow will be added to encap entry's flows list. If neigh update is running at this time, the following encaps of the flow can't hold the encap_tbl_lock and sleep. If neigh update thread is waiting for that flow's init_done, deadlock happens. Fix it by holding lock outside of the for loop. If neigh update is running, prevent encap flows from offloading. Since the lock is held outside of the for loop, concurrent creation of encap entries is not allowed. So remove unnecessary wait_for_completion call for res_ready. Fixes: 95435ad7999b ("net/mlx5e: Only access fully initialized flows in neigh update") Signed-off-by: Chris Mi Reviewed-by: Roi Dayan Reviewed-by: Vlad Buslov Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en/tc_tun_encap.c | 37 ++++++++++--------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c index d516227b8fe8..f0c3464f037f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -492,6 +492,19 @@ void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) mlx5e_encap_dealloc(priv, e); } +static void mlx5e_encap_put_locked(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + lockdep_assert_held(&esw->offloads.encap_tbl_lock); + + if (!refcount_dec_and_test(&e->refcnt)) + return; + list_del(&e->route_list); + hash_del_rcu(&e->encap_hlist); + mlx5e_encap_dealloc(priv, e); +} + static void mlx5e_decap_put(struct mlx5e_priv *priv, struct mlx5e_decap_entry *d) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; @@ -816,6 +829,8 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv, uintptr_t hash_key; int err = 0; + lockdep_assert_held(&esw->offloads.encap_tbl_lock); + parse_attr = attr->parse_attr; tun_info = parse_attr->tun_info[out_index]; mpls_info = &parse_attr->mpls_info[out_index]; @@ -829,7 +844,6 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv, hash_key = hash_encap_info(&key); - mutex_lock(&esw->offloads.encap_tbl_lock); e = mlx5e_encap_get(priv, &key, hash_key); /* must verify if encap is valid or not */ @@ -840,15 +854,6 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv, goto out_err; } - mutex_unlock(&esw->offloads.encap_tbl_lock); - wait_for_completion(&e->res_ready); - - /* Protect against concurrent neigh update. */ - mutex_lock(&esw->offloads.encap_tbl_lock); - if (e->compl_result < 0) { - err = -EREMOTEIO; - goto out_err; - } goto attach_flow; } @@ -877,15 +882,12 @@ int mlx5e_attach_encap(struct mlx5e_priv *priv, INIT_LIST_HEAD(&e->flows); hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key); tbl_time_before = mlx5e_route_tbl_get_last_update(priv); - mutex_unlock(&esw->offloads.encap_tbl_lock); if (family == AF_INET) err = mlx5e_tc_tun_create_header_ipv4(priv, mirred_dev, e); else if (family == AF_INET6) err = mlx5e_tc_tun_create_header_ipv6(priv, mirred_dev, e); - /* Protect against concurrent neigh update. */ - mutex_lock(&esw->offloads.encap_tbl_lock); complete_all(&e->res_ready); if (err) { e->compl_result = err; @@ -920,18 +922,15 @@ attach_flow: } else { flow_flag_set(flow, SLOW); } - mutex_unlock(&esw->offloads.encap_tbl_lock); return err; out_err: - mutex_unlock(&esw->offloads.encap_tbl_lock); if (e) - mlx5e_encap_put(priv, e); + mlx5e_encap_put_locked(priv, e); return err; out_err_init: - mutex_unlock(&esw->offloads.encap_tbl_lock); kfree(tun_info); kfree(e); return err; @@ -1027,6 +1026,7 @@ int mlx5e_tc_tun_encap_dests_set(struct mlx5e_priv *priv, struct net_device *encap_dev = NULL; struct mlx5e_rep_priv *rpriv; struct mlx5e_priv *out_priv; + struct mlx5_eswitch *esw; int out_index; int err = 0; @@ -1037,6 +1037,8 @@ int mlx5e_tc_tun_encap_dests_set(struct mlx5e_priv *priv, esw_attr = attr->esw_attr; *vf_tun = false; + esw = priv->mdev->priv.eswitch; + mutex_lock(&esw->offloads.encap_tbl_lock); for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { struct net_device *out_dev; int mirred_ifindex; @@ -1075,6 +1077,7 @@ int mlx5e_tc_tun_encap_dests_set(struct mlx5e_priv *priv, } out: + mutex_unlock(&esw->offloads.encap_tbl_lock); return err; } From 81fe2be062915e2a2fdc494c3cd90e946e946c25 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Mon, 1 May 2023 17:31:40 +0300 Subject: [PATCH 150/271] net/mlx5e: Consider internal buffers size in port buffer calculations Currently, when a user triggers a change in port buffer headroom (buffers 0-7), the driver checks that the requested headroom does not exceed the total port buffer size. However, this check does not take into account the internal buffers (buffers 8-9), which are also part of the total port buffer. This can result in treating invalid port buffer change requests as valid, causing unintended changes to the shared buffer. To address this, include the internal buffers size in the calculation of available port buffer space which ensures that port buffer requests do not exceed the correct limit. Furthermore, remove internal buffers (8-9) size from the total_size calculation as these buffers are reserved for internal use and are not exposed to the user. While at it, add verbosity to the debug prints in mlx5e_port_query_buffer() function to ease future debugging. Fixes: ecdf2dadee8e ("net/mlx5e: Receive buffer support for DCBX") Signed-off-by: Maher Sanalla Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en/port_buffer.c | 40 ++++++++++++------- .../mellanox/mlx5/core/en/port_buffer.h | 8 ++-- .../ethernet/mellanox/mlx5/core/en_dcbnl.c | 7 ++-- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c index 7ac1ad9c46de..0d78527451bc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c @@ -51,7 +51,7 @@ int mlx5e_port_query_buffer(struct mlx5e_priv *priv, if (err) goto out; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { buffer = MLX5_ADDR_OF(pbmc_reg, out, buffer[i]); port_buffer->buffer[i].lossy = MLX5_GET(bufferx_reg, buffer, lossy); @@ -73,14 +73,24 @@ int mlx5e_port_query_buffer(struct mlx5e_priv *priv, port_buffer->buffer[i].lossy); } - port_buffer->headroom_size = total_used; + port_buffer->internal_buffers_size = 0; + for (i = MLX5E_MAX_NETWORK_BUFFER; i < MLX5E_TOTAL_BUFFERS; i++) { + buffer = MLX5_ADDR_OF(pbmc_reg, out, buffer[i]); + port_buffer->internal_buffers_size += + MLX5_GET(bufferx_reg, buffer, size) * port_buff_cell_sz; + } + port_buffer->port_buffer_size = MLX5_GET(pbmc_reg, out, port_buffer_size) * port_buff_cell_sz; - port_buffer->spare_buffer_size = - port_buffer->port_buffer_size - total_used; + port_buffer->headroom_size = total_used; + port_buffer->spare_buffer_size = port_buffer->port_buffer_size - + port_buffer->internal_buffers_size - + port_buffer->headroom_size; - mlx5e_dbg(HW, priv, "total buffer size=%d, spare buffer size=%d\n", - port_buffer->port_buffer_size, + mlx5e_dbg(HW, priv, + "total buffer size=%u, headroom buffer size=%u, internal buffers size=%u, spare buffer size=%u\n", + port_buffer->port_buffer_size, port_buffer->headroom_size, + port_buffer->internal_buffers_size, port_buffer->spare_buffer_size); out: kfree(out); @@ -206,11 +216,11 @@ static int port_update_pool_cfg(struct mlx5_core_dev *mdev, if (!MLX5_CAP_GEN(mdev, sbcam_reg)) return 0; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) lossless_buff_count += ((port_buffer->buffer[i].size) && (!(port_buffer->buffer[i].lossy))); - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { p = select_sbcm_params(&port_buffer->buffer[i], lossless_buff_count); err = mlx5e_port_set_sbcm(mdev, 0, i, MLX5_INGRESS_DIR, @@ -293,7 +303,7 @@ static int port_set_buffer(struct mlx5e_priv *priv, if (err) goto out; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { void *buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]); u64 size = port_buffer->buffer[i].size; u64 xoff = port_buffer->buffer[i].xoff; @@ -351,7 +361,7 @@ static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, { int i; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { if (port_buffer->buffer[i].lossy) { port_buffer->buffer[i].xoff = 0; port_buffer->buffer[i].xon = 0; @@ -408,7 +418,7 @@ static int update_buffer_lossy(struct mlx5_core_dev *mdev, int err; int i; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { prio_count = 0; lossy_count = 0; @@ -515,7 +525,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, if (change & MLX5E_PORT_BUFFER_PRIO2BUFFER) { update_prio2buffer = true; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) mlx5e_dbg(HW, priv, "%s: requested to map prio[%d] to buffer %d\n", __func__, i, prio2buffer[i]); @@ -530,7 +540,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, } if (change & MLX5E_PORT_BUFFER_SIZE) { - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { mlx5e_dbg(HW, priv, "%s: buffer[%d]=%d\n", __func__, i, buffer_size[i]); if (!port_buffer.buffer[i].lossy && !buffer_size[i]) { mlx5e_dbg(HW, priv, "%s: lossless buffer[%d] size cannot be zero\n", @@ -544,7 +554,9 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, mlx5e_dbg(HW, priv, "%s: total buffer requested=%d\n", __func__, total_used); - if (total_used > port_buffer.port_buffer_size) + if (total_used > port_buffer.headroom_size && + (total_used - port_buffer.headroom_size) > + port_buffer.spare_buffer_size) return -EINVAL; update_buffer = true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h index a6ef118de758..f4a19ffbb641 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h @@ -35,7 +35,8 @@ #include "en.h" #include "port.h" -#define MLX5E_MAX_BUFFER 8 +#define MLX5E_MAX_NETWORK_BUFFER 8 +#define MLX5E_TOTAL_BUFFERS 10 #define MLX5E_DEFAULT_CABLE_LEN 7 /* 7 meters */ #define MLX5_BUFFER_SUPPORTED(mdev) (MLX5_CAP_GEN(mdev, pcam_reg) && \ @@ -60,8 +61,9 @@ struct mlx5e_bufferx_reg { struct mlx5e_port_buffer { u32 port_buffer_size; u32 spare_buffer_size; - u32 headroom_size; - struct mlx5e_bufferx_reg buffer[MLX5E_MAX_BUFFER]; + u32 headroom_size; /* Buffers 0-7 */ + u32 internal_buffers_size; /* Buffers 8-9 */ + struct mlx5e_bufferx_reg buffer[MLX5E_MAX_NETWORK_BUFFER]; }; int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index 89de92d06483..ebee52a8361a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -926,9 +926,10 @@ static int mlx5e_dcbnl_getbuffer(struct net_device *dev, if (err) return err; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) dcb_buffer->buffer_size[i] = port_buffer.buffer[i].size; - dcb_buffer->total_size = port_buffer.port_buffer_size; + dcb_buffer->total_size = port_buffer.port_buffer_size - + port_buffer.internal_buffers_size; return 0; } @@ -970,7 +971,7 @@ static int mlx5e_dcbnl_setbuffer(struct net_device *dev, if (err) return err; - for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { if (port_buffer.buffer[i].size != dcb_buffer->buffer_size[i]) { changed |= MLX5E_PORT_BUFFER_SIZE; buffer_size = dcb_buffer->buffer_size; From 623efc4cbd6115db36716e31037cb6d1f3ce6754 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Tue, 9 May 2023 17:56:01 +0300 Subject: [PATCH 151/271] net/mlx5e: Do not update SBCM when prio2buffer command is invalid The shared buffer pools configuration which are stored in the SBCM register are updated when the user changes the prio2buffer mapping. However, in case the user desired prio2buffer change is invalid, which can occur due to mapping a lossless priority to a not large enough buffer, the SBCM update should not be performed, as the user command is failed. Thus, Perform the SBCM update only after xoff threshold calculation is performed and the user prio2buffer mapping is validated. Fixes: a440030d8946 ("net/mlx5e: Update shared buffer along with device buffer changes") Signed-off-by: Maher Sanalla Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c index 0d78527451bc..7e8e96cc5cd0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c @@ -442,11 +442,11 @@ static int update_buffer_lossy(struct mlx5_core_dev *mdev, } if (changed) { - err = port_update_pool_cfg(mdev, port_buffer); + err = update_xoff_threshold(port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; - err = update_xoff_threshold(port_buffer, xoff, max_mtu, port_buff_cell_sz); + err = port_update_pool_cfg(mdev, port_buffer); if (err) return err; From 824c8dc4a470040bf0e56ba716543839c2498d49 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 24 Apr 2023 12:31:59 +0300 Subject: [PATCH 152/271] net/mlx5: Drain health before unregistering devlink mlx5 health mechanism is using devlink APIs, which are using devlink notify APIs. After the cited patch, using devlink notify APIs after devlink is unregistered triggers a WARN_ON(). Hence, drain health WQ before devlink is unregistered. Fixes: cf530217408e ("devlink: Notify users when objects are accessible") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index a7eb65cd0bdd..2132a6510639 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1802,15 +1802,16 @@ static void remove_one(struct pci_dev *pdev) struct devlink *devlink = priv_to_devlink(dev); set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state); - /* mlx5_drain_fw_reset() is using devlink APIs. Hence, we must drain - * fw_reset before unregistering the devlink. + /* mlx5_drain_fw_reset() and mlx5_drain_health_wq() are using + * devlink notify APIs. + * Hence, we must drain them before unregistering the devlink. */ mlx5_drain_fw_reset(dev); + mlx5_drain_health_wq(dev); devlink_unregister(devlink); mlx5_sriov_disable(pdev); mlx5_thermal_uninit(dev); mlx5_crdump_disable(dev); - mlx5_drain_health_wq(dev); mlx5_uninit_one(dev); mlx5_pci_close(dev); mlx5_mdev_uninit(dev); From b4646da0573fae9dfa2b8f1f10936cb6eedd7230 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 24 Apr 2023 12:46:06 +0300 Subject: [PATCH 153/271] net/mlx5: SF, Drain health before removing device There is no point in recovery during device removal. Also, if health work started need to wait for it to avoid races and NULL pointer access. Hence, drain health WQ before removing device. Fixes: 1958fc2f0712 ("net/mlx5: SF, Add auxiliary device driver") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c index e2f26d0bc615..0692363cf80e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -63,6 +63,7 @@ static void mlx5_sf_dev_remove(struct auxiliary_device *adev) struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); struct devlink *devlink = priv_to_devlink(sf_dev->mdev); + mlx5_drain_health_wq(sf_dev->mdev); devlink_unregister(devlink); mlx5_uninit_one(sf_dev->mdev); iounmap(sf_dev->mdev->iseg); From 341a80de2468f481b1f771683709b5649cbfe513 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sat, 29 Apr 2023 20:41:41 +0300 Subject: [PATCH 154/271] net/mlx5: fw_tracer, Fix event handling mlx5 driver needs to parse traces with event_id inside the range of first_string_trace and num_string_trace. However, mlx5 is parsing all events with event_id >= first_string_trace. Fix it by checking for the correct range. Fixes: c71ad41ccb0c ("net/mlx5: FW tracer, events handling") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c index f40497823e65..7c0f2adbea00 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c @@ -490,7 +490,7 @@ static void poll_trace(struct mlx5_fw_tracer *tracer, (u64)timestamp_low; break; default: - if (tracer_event->event_id >= tracer->str_db.first_string_trace || + if (tracer_event->event_id >= tracer->str_db.first_string_trace && tracer_event->event_id <= tracer->str_db.first_string_trace + tracer->str_db.num_string_trace) { tracer_event->type = TRACER_EVENT_TYPE_STRING; From 1db1f21caebbb1b6e9b1e7657df613616be3fb49 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Thu, 13 Apr 2023 15:48:30 +0300 Subject: [PATCH 155/271] net/mlx5e: Use query_special_contexts cmd only once per mdev Don't query the firmware so many times (num rqs * num wqes * wqe frags) because it slows down linearly the interface creation time when the product is larger. Do it only once per mdev and store the result in mlx5e_param. Due to helper function being called from different files, move it to an appropriate location. Rename the function with a proper prefix and add a small cleanup. This fix applies only for legacy rq. Fixes: 1b1e4868836a ("net/mlx5e: Use query_special_contexts for mkeys") Signed-off-by: Dragos Tatulea Reviewed-by: Or Har-Toov Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 24 +++---------------- drivers/net/ethernet/mellanox/mlx5/core/mr.c | 21 ++++++++++++++++ include/linux/mlx5/driver.h | 1 + 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index b8987a404d75..8e999f238194 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -327,6 +327,7 @@ struct mlx5e_params { unsigned int sw_mtu; int hard_mtu; bool ptp_rx; + __be32 terminate_lkey_be; }; static inline u8 mlx5e_get_dcb_num_tc(struct mlx5e_params *params) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 2944691f06ad..0235adcbc609 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -727,26 +727,6 @@ static void mlx5e_rq_free_shampo(struct mlx5e_rq *rq) mlx5e_rq_shampo_hd_free(rq); } -static __be32 mlx5e_get_terminate_scatter_list_mkey(struct mlx5_core_dev *dev) -{ - u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; - u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; - int res; - - if (!MLX5_CAP_GEN(dev, terminate_scatter_list_mkey)) - return MLX5_TERMINATE_SCATTER_LIST_LKEY; - - MLX5_SET(query_special_contexts_in, in, opcode, - MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - res = mlx5_cmd_exec_inout(dev, query_special_contexts, in, out); - if (res) - return MLX5_TERMINATE_SCATTER_LIST_LKEY; - - res = MLX5_GET(query_special_contexts_out, out, - terminate_scatter_list_mkey); - return cpu_to_be32(res); -} - static int mlx5e_alloc_rq(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk, struct mlx5e_rq_param *rqp, @@ -908,7 +888,7 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params, /* check if num_frags is not a pow of two */ if (rq->wqe.info.num_frags < (1 << rq->wqe.info.log_num_frags)) { wqe->data[f].byte_count = 0; - wqe->data[f].lkey = mlx5e_get_terminate_scatter_list_mkey(mdev); + wqe->data[f].lkey = params->terminate_lkey_be; wqe->data[f].addr = 0; } } @@ -5007,6 +4987,8 @@ void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 /* RQ */ mlx5e_build_rq_params(mdev, params); + params->terminate_lkey_be = mlx5_core_get_terminate_scatter_list_mkey(mdev); + params->packet_merge.timeout = mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_LRO_TIMEOUT); /* CQ moderation params */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 9d735c343a3b..678f0be81375 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -32,6 +32,7 @@ #include #include +#include #include "mlx5_core.h" int mlx5_core_create_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *in, @@ -122,3 +123,23 @@ int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num) return mlx5_cmd_exec_in(dev, destroy_psv, in); } EXPORT_SYMBOL(mlx5_core_destroy_psv); + +__be32 mlx5_core_get_terminate_scatter_list_mkey(struct mlx5_core_dev *dev) +{ + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; + u32 mkey; + + if (!MLX5_CAP_GEN(dev, terminate_scatter_list_mkey)) + return MLX5_TERMINATE_SCATTER_LIST_LKEY; + + MLX5_SET(query_special_contexts_in, in, opcode, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + if (mlx5_cmd_exec_inout(dev, query_special_contexts, in, out)) + return MLX5_TERMINATE_SCATTER_LIST_LKEY; + + mkey = MLX5_GET(query_special_contexts_out, out, + terminate_scatter_list_mkey); + return cpu_to_be32(mkey); +} +EXPORT_SYMBOL(mlx5_core_get_terminate_scatter_list_mkey); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a4c4f737f9c1..94d2be5848ae 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1093,6 +1093,7 @@ void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev); int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, int npsvs, u32 *sig_index); int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); +__be32 mlx5_core_get_terminate_scatter_list_mkey(struct mlx5_core_dev *dev); void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *odp_caps); From 5d862ec631f3d3cc3b4f8cdb5b9fc5879663f1d3 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 22 May 2023 14:48:52 +0200 Subject: [PATCH 156/271] net/mlx5: Fix post parse infra to only parse every action once Caller of mlx5e_tc_act_post_parse() needs it to parse only the subset of actions starting after previous split and ending at the current action. However, that range is not provided as arguments and mlx5e_tc_act_post_parse() uses generic flow_action_for_each() that iterates over all flow actions. Not only this is redundant, it also causes a bug when mlx5e_tc_act->post_parse() callback is not idempotent since it will be called for every split. For example, ct action tc_act_post_parse_ct() callback obtains a reference to mlx5_ct_ft instance and calling it several times during parsing stage will cause reference counter imbalance. Fix the issue by providing a proper action range of the current split subset to mlx5e_tc_act_post_parse() and only calling mlx5e_tc_act->post_parse() for actions inside the subset range. Fixes: 8300f225268b ("net/mlx5e: Create new flow attr for multi table actions") Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c | 7 ++++++- drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 8 +++++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c index fc923a99b6a4..0380a04c3691 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c @@ -84,7 +84,7 @@ mlx5e_tc_act_init_parse_state(struct mlx5e_tc_act_parse_state *parse_state, int mlx5e_tc_act_post_parse(struct mlx5e_tc_act_parse_state *parse_state, - struct flow_action *flow_action, + struct flow_action *flow_action, int from, int to, struct mlx5_flow_attr *attr, enum mlx5_flow_namespace_type ns_type) { @@ -96,6 +96,11 @@ mlx5e_tc_act_post_parse(struct mlx5e_tc_act_parse_state *parse_state, priv = parse_state->flow->priv; flow_action_for_each(i, act, flow_action) { + if (i < from) + continue; + else if (i > to) + break; + tc_act = mlx5e_tc_act_get(act->id, ns_type); if (!tc_act || !tc_act->post_parse) continue; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h index 0e6e1872ac62..d6c12d0ea55b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h @@ -112,7 +112,7 @@ mlx5e_tc_act_init_parse_state(struct mlx5e_tc_act_parse_state *parse_state, int mlx5e_tc_act_post_parse(struct mlx5e_tc_act_parse_state *parse_state, - struct flow_action *flow_action, + struct flow_action *flow_action, int from, int to, struct mlx5_flow_attr *attr, enum mlx5_flow_namespace_type ns_type); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 8935156f8f4e..8a5a8703f0a3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3859,8 +3859,8 @@ parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state, struct mlx5_flow_attr *prev_attr; struct flow_action_entry *act; struct mlx5e_tc_act *tc_act; + int err, i, i_split = 0; bool is_missable; - int err, i; ns_type = mlx5e_get_flow_namespace(flow); list_add(&attr->list, &flow->attrs); @@ -3901,7 +3901,8 @@ parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state, i < flow_action->num_entries - 1)) { is_missable = tc_act->is_missable ? tc_act->is_missable(act) : false; - err = mlx5e_tc_act_post_parse(parse_state, flow_action, attr, ns_type); + err = mlx5e_tc_act_post_parse(parse_state, flow_action, i_split, i, attr, + ns_type); if (err) goto out_free_post_acts; @@ -3911,6 +3912,7 @@ parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state, goto out_free_post_acts; } + i_split = i + 1; list_add(&attr->list, &flow->attrs); } @@ -3925,7 +3927,7 @@ parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state, } } - err = mlx5e_tc_act_post_parse(parse_state, flow_action, attr, ns_type); + err = mlx5e_tc_act_post_parse(parse_state, flow_action, i_split, i, attr, ns_type); if (err) goto out_free_post_acts; From bdf274750fca17b289404ef03453c4070725302c Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Wed, 13 Oct 2021 14:39:24 +0300 Subject: [PATCH 157/271] net/mlx5e: Don't attach netdev profile while handling internal error As part of switchdev mode disablement, driver changes port netdevice profile from uplink to nic. If this process is triggered by health recovery flow (PCI reset, for ex.) profile attach would fail because all fw commands aborted when internal error flag is set. As a result, nic netdevice profile is not attached and driver fails to rollback to uplink profile, which leave driver in broken state and cause crash later. To handle broken state do netdevice profile initialization only instead of full attachment and release mdev resources on driver suspend as expected. Actual netdevice attachment is done during driver load. Fixes: c4d7eb57687f ("net/mxl5e: Add change profile method") Signed-off-by: Dmytro Linkin Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 35 ++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 0235adcbc609..a07bbe9a61be 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5833,8 +5833,8 @@ void mlx5e_detach_netdev(struct mlx5e_priv *priv) } static int -mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mdev, - const struct mlx5e_profile *new_profile, void *new_ppriv) +mlx5e_netdev_init_profile(struct net_device *netdev, struct mlx5_core_dev *mdev, + const struct mlx5e_profile *new_profile, void *new_ppriv) { struct mlx5e_priv *priv = netdev_priv(netdev); int err; @@ -5850,6 +5850,25 @@ mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mde err = new_profile->init(priv->mdev, priv->netdev); if (err) goto priv_cleanup; + + return 0; + +priv_cleanup: + mlx5e_priv_cleanup(priv); + return err; +} + +static int +mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mdev, + const struct mlx5e_profile *new_profile, void *new_ppriv) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + err = mlx5e_netdev_init_profile(netdev, mdev, new_profile, new_ppriv); + if (err) + return err; + err = mlx5e_attach_netdev(priv); if (err) goto profile_cleanup; @@ -5857,7 +5876,6 @@ mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mde profile_cleanup: new_profile->cleanup(priv); -priv_cleanup: mlx5e_priv_cleanup(priv); return err; } @@ -5876,6 +5894,12 @@ int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, priv->profile->cleanup(priv); mlx5e_priv_cleanup(priv); + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + mlx5e_netdev_init_profile(netdev, mdev, new_profile, new_ppriv); + set_bit(MLX5E_STATE_DESTROYING, &priv->state); + return -EIO; + } + err = mlx5e_netdev_attach_profile(netdev, mdev, new_profile, new_ppriv); if (err) { /* roll back to original profile */ netdev_warn(netdev, "%s: new profile init failed, %d\n", __func__, err); @@ -5937,8 +5961,11 @@ static int mlx5e_suspend(struct auxiliary_device *adev, pm_message_t state) struct net_device *netdev = priv->netdev; struct mlx5_core_dev *mdev = priv->mdev; - if (!netif_device_present(netdev)) + if (!netif_device_present(netdev)) { + if (test_bit(MLX5E_STATE_DESTROYING, &priv->state)) + mlx5e_destroy_mdev_resources(mdev); return -ENODEV; + } mlx5e_detach_netdev(priv); mlx5e_destroy_mdev_resources(mdev); From c4c24fc30cc417ace332ceceaba4f70f81dcd521 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Tue, 16 May 2023 02:28:02 +0000 Subject: [PATCH 158/271] net/mlx5e: Move Ethernet driver debugfs to profile init callback As priv->dfs_root is cleared, and therefore missed, when change eswitch mode, move the creation of the root debugfs to the init callback of mlx5e_nic_profile and mlx5e_uplink_rep_profile, and the destruction to the cleanup callback for symmeter. Fixes: 288eca60cc31 ("net/mlx5e: Add Ethernet driver debugfs") Signed-off-by: Jianbo Liu Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 +++++----- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 6 ++++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a07bbe9a61be..a7c526ee5024 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5261,12 +5261,16 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev, mlx5e_timestamp_init(priv); + priv->dfs_root = debugfs_create_dir("nic", + mlx5_debugfs_get_dev_root(mdev)); + fs = mlx5e_fs_init(priv->profile, mdev, !test_bit(MLX5E_STATE_DESTROYING, &priv->state), priv->dfs_root); if (!fs) { err = -ENOMEM; mlx5_core_err(mdev, "FS initialization failed, %d\n", err); + debugfs_remove_recursive(priv->dfs_root); return err; } priv->fs = fs; @@ -5287,6 +5291,7 @@ static void mlx5e_nic_cleanup(struct mlx5e_priv *priv) mlx5e_health_destroy_reporters(priv); mlx5e_ktls_cleanup(priv); mlx5e_fs_cleanup(priv->fs); + debugfs_remove_recursive(priv->dfs_root); priv->fs = NULL; } @@ -6011,9 +6016,6 @@ static int mlx5e_probe(struct auxiliary_device *adev, priv->profile = profile; priv->ppriv = NULL; - priv->dfs_root = debugfs_create_dir("nic", - mlx5_debugfs_get_dev_root(priv->mdev)); - err = profile->init(mdev, netdev); if (err) { mlx5_core_err(mdev, "mlx5e_nic_profile init failed, %d\n", err); @@ -6042,7 +6044,6 @@ err_resume: err_profile_cleanup: profile->cleanup(priv); err_destroy_netdev: - debugfs_remove_recursive(priv->dfs_root); mlx5e_destroy_netdev(priv); err_devlink_port_unregister: mlx5e_devlink_port_unregister(mlx5e_dev); @@ -6062,7 +6063,6 @@ static void mlx5e_remove(struct auxiliary_device *adev) unregister_netdev(priv->netdev); mlx5e_suspend(adev, state); priv->profile->cleanup(priv); - debugfs_remove_recursive(priv->dfs_root); mlx5e_destroy_netdev(priv); mlx5e_devlink_port_unregister(mlx5e_dev); mlx5e_destroy_devlink(mlx5e_dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 1fc386eccaf8..3e7041bd5705 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -30,6 +30,7 @@ * SOFTWARE. */ +#include #include #include #include @@ -812,11 +813,15 @@ static int mlx5e_init_ul_rep(struct mlx5_core_dev *mdev, { struct mlx5e_priv *priv = netdev_priv(netdev); + priv->dfs_root = debugfs_create_dir("nic", + mlx5_debugfs_get_dev_root(mdev)); + priv->fs = mlx5e_fs_init(priv->profile, mdev, !test_bit(MLX5E_STATE_DESTROYING, &priv->state), priv->dfs_root); if (!priv->fs) { netdev_err(priv->netdev, "FS allocation failed\n"); + debugfs_remove_recursive(priv->dfs_root); return -ENOMEM; } @@ -829,6 +834,7 @@ static int mlx5e_init_ul_rep(struct mlx5_core_dev *mdev, static void mlx5e_cleanup_rep(struct mlx5e_priv *priv) { mlx5e_fs_cleanup(priv->fs); + debugfs_remove_recursive(priv->dfs_root); priv->fs = NULL; } From fe5c2d3aef9352ac36a6acbc2bff5f732211ce3b Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Wed, 17 May 2023 17:54:30 +0300 Subject: [PATCH 159/271] net/mlx5: DR, Add missing mutex init/destroy in pattern manager Add missing mutex init/destroy as caught by the lock's debug warning: DEBUG_LOCKS_WARN_ON(lock->magic != lock) Fixes: da5d0027d666 ("net/mlx5: DR, Add cache for modify header pattern") Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c index 13e06a6a6b22..d6947fe13d56 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ptrn.c @@ -213,6 +213,8 @@ struct mlx5dr_ptrn_mgr *mlx5dr_ptrn_mgr_create(struct mlx5dr_domain *dmn) } INIT_LIST_HEAD(&mgr->ptrn_list); + mutex_init(&mgr->modify_hdr_mutex); + return mgr; free_mgr: @@ -237,5 +239,6 @@ void mlx5dr_ptrn_mgr_destroy(struct mlx5dr_ptrn_mgr *mgr) } mlx5dr_icm_pool_destroy(mgr->ptrn_icm_pool); + mutex_destroy(&mgr->modify_hdr_mutex); kfree(mgr); } From d5972f1000c1e6b5185ded0fc194f21984f26e14 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 19 May 2023 22:43:03 +0300 Subject: [PATCH 160/271] net/mlx5: Fix check for allocation failure in comp_irqs_request_pci() This function accidentally dereferences "cpus" instead of returning directly. Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202305200354.KV3jU94w-lkp@intel.com/ Fixes: b48a0f72bc3e ("net/mlx5: Refactor completion irq request/release code") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index fe698c79616c..3db4866d7880 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -824,7 +824,7 @@ static int comp_irqs_request_pci(struct mlx5_core_dev *dev) ncomp_eqs = table->num_comp_eqs; cpus = kcalloc(ncomp_eqs, sizeof(*cpus), GFP_KERNEL); if (!cpus) - ret = -ENOMEM; + return -ENOMEM; i = 0; rcu_read_lock(); From c3acc46c602f1aa0449ea687057758e5224ae3da Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 10 May 2023 10:54:12 +0700 Subject: [PATCH 161/271] Documentation: net/mlx5: Wrap vnic reporter devlink commands in code blocks Sphinx reports htmldocs warnings: Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst:287: WARNING: Unexpected indentation. Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst:288: WARNING: Block quote ends without a blank line; unexpected unindent. Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst:290: WARNING: Unexpected indentation. Fix above warnings by wrapping diagnostic devlink commands in "vnic reporter" section in code blocks to be consistent with other devlink command snippets. Fixes: b0bc615df488ab ("net/mlx5: Add vnic devlink health reporter to PFs/VFs") Fixes: cf14af140a5ad0 ("net/mlx5e: Add vnic devlink health reporter to representors") Reviewed-by: Leon Romanovsky Signed-off-by: Bagas Sanjaya Reviewed-by: Simon Horman Signed-off-by: Saeed Mahameed --- .../device_drivers/ethernet/mellanox/mlx5/devlink.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst index 3a7a714cc08f..0f0598caea14 100644 --- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst @@ -283,10 +283,14 @@ nic_receive_steering_discard: number of packets that completed RX flow steering but were discarded due to a mismatch in flow table. User commands examples: -- Diagnose PF/VF vnic counters + +- Diagnose PF/VF vnic counters:: + $ devlink health diagnose pci/0000:82:00.1 reporter vnic + - Diagnose representor vnic counters (performed by supplying devlink port of the - representor, which can be obtained via devlink port command) + representor, which can be obtained via devlink port command):: + $ devlink health diagnose pci/0000:82:00.1/65537 reporter vnic NOTE: This command can run over all interfaces such as PF/VF and representor ports. From 565b8c60e90f7f2d4763f5979c7aaa43697ab950 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 10 May 2023 10:54:13 +0700 Subject: [PATCH 162/271] Documentation: net/mlx5: Use bullet and definition lists for vnic counters description "vnic reporter" section contains unformatted description for vnic counters, which is rendered as one long paragraph instead of list. Use bullet and definition lists to match other lists. Fixes: b0bc615df488ab ("net/mlx5: Add vnic devlink health reporter to PFs/VFs") Reviewed-by: Leon Romanovsky Signed-off-by: Bagas Sanjaya Reviewed-by: Simon Horman Reviewed-by: Daniel Machon Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/devlink.rst | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst index 0f0598caea14..00687425d8b7 100644 --- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst @@ -265,22 +265,26 @@ It is responsible for querying the vnic diagnostic counters from fw and displayi them in realtime. Description of the vnic counters: -total_q_under_processor_handle: number of queues in an error state due to -an async error or errored command. -send_queue_priority_update_flow: number of QP/SQ priority/SL update -events. -cq_overrun: number of times CQ entered an error state due to an -overflow. -async_eq_overrun: number of times an EQ mapped to async events was -overrun. -comp_eq_overrun: number of times an EQ mapped to completion events was -overrun. -quota_exceeded_command: number of commands issued and failed due to quota -exceeded. -invalid_command: number of commands issued and failed dues to any reason -other than quota exceeded. -nic_receive_steering_discard: number of packets that completed RX flow -steering but were discarded due to a mismatch in flow table. + +- total_q_under_processor_handle + number of queues in an error state due to + an async error or errored command. +- send_queue_priority_update_flow + number of QP/SQ priority/SL update events. +- cq_overrun + number of times CQ entered an error state due to an overflow. +- async_eq_overrun + number of times an EQ mapped to async events was overrun. + comp_eq_overrun number of times an EQ mapped to completion events was + overrun. +- quota_exceeded_command + number of commands issued and failed due to quota exceeded. +- invalid_command + number of commands issued and failed dues to any reason other than quota + exceeded. +- nic_receive_steering_discard + number of packets that completed RX flow + steering but were discarded due to a mismatch in flow table. User commands examples: From 92059da65d84227fcc7cabbd96c0cca19880f4ff Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 10 May 2023 10:54:14 +0700 Subject: [PATCH 163/271] Documentation: net/mlx5: Add blank line separator before numbered lists The doc forgets to add separator before numbered lists, which causes the lists to be appended to previous paragraph inline instead. Add the missing separator. Fixes: f2d51e579359b7 ("net/mlx5: Separate mlx5 driver documentation into multiple pages") Reviewed-by: Leon Romanovsky Signed-off-by: Bagas Sanjaya Reviewed-by: Simon Horman Signed-off-by: Saeed Mahameed --- .../device_drivers/ethernet/mellanox/mlx5/devlink.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst index 00687425d8b7..f962c0975d84 100644 --- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst @@ -40,6 +40,7 @@ flow_steering_mode: Device flow steering mode --------------------------------------------- The flow steering mode parameter controls the flow steering mode of the driver. Two modes are supported: + 1. 'dmfs' - Device managed flow steering. 2. 'smfs' - Software/Driver managed flow steering. @@ -99,6 +100,7 @@ between representors and stacked devices. By default metadata is enabled on the supported devices in E-switch. Metadata is applicable only for E-switch in switchdev mode and users may disable it when NONE of the below use cases will be in use: + 1. HCA is in Dual/multi-port RoCE mode. 2. VF/SF representor bonding (Usually used for Live migration) 3. Stacked devices From bb72b94c659f5032850e9ab070d850bc743e3a0e Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 10 May 2023 10:54:15 +0700 Subject: [PATCH 164/271] Documentation: net/mlx5: Wrap notes in admonition blocks Wrap note paragraphs in note:: directive as it better fit for the purpose of noting devlink commands. Fixes: f2d51e579359b7 ("net/mlx5: Separate mlx5 driver documentation into multiple pages") Fixes: cf14af140a5ad0 ("net/mlx5e: Add vnic devlink health reporter to representors") Reviewed-by: Leon Romanovsky Signed-off-by: Bagas Sanjaya Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/devlink.rst | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst index f962c0975d84..3354ca3608ee 100644 --- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst @@ -182,7 +182,8 @@ User commands examples: $ devlink health diagnose pci/0000:82:00.0 reporter tx -NOTE: This command has valid output only when interface is up, otherwise the command has empty output. +.. note:: + This command has valid output only when interface is up, otherwise the command has empty output. - Show number of tx errors indicated, number of recover flows ended successfully, is autorecover enabled and graceful period from last recover:: @@ -234,8 +235,9 @@ User commands examples: $ devlink health dump show pci/0000:82:00.0 reporter fw -NOTE: This command can run only on the PF which has fw tracer ownership, -running it on other PF or any VF will return "Operation not permitted". +.. note:: + This command can run only on the PF which has fw tracer ownership, + running it on other PF or any VF will return "Operation not permitted". fw fatal reporter ----------------- @@ -258,7 +260,8 @@ User commands examples: $ devlink health dump show pci/0000:82:00.1 reporter fw_fatal -NOTE: This command can run only on PF. +.. note:: + This command can run only on PF. vnic reporter ------------- @@ -299,4 +302,5 @@ User commands examples: $ devlink health diagnose pci/0000:82:00.1/65537 reporter vnic -NOTE: This command can run over all interfaces such as PF/VF and representor ports. +.. note:: + This command can run over all interfaces such as PF/VF and representor ports. From 51bba254edc79fbb1b04e207e036c0c68576f1a4 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Mon, 22 May 2023 10:37:01 +0100 Subject: [PATCH 165/271] MAINTAINERS: update arm64 Microchip entries Krzysztof noticed that patches for arch/arm64/boot/dts/microchip were getting lost & the listed tree was inactive. Nicolas and I are willing to shepherd patches to Arnd, using the existing at91 tree, so add a new entry covering arch/arm64/boot/dts/microchip, listing us as maintainers. Drop the tree from the existing sparx5 entry & narrow the devicetree pattern to just sparx devices, leaving Lars, Steen and Daniel looking after support for their SoCs. CC: Rob Herring CC: Krzysztof Kozlowski CC: Conor Dooley CC: Nicolas Ferre CC: Claudiu Beznea CC: soc@kernel.org CC: Lars Povlsen CC: Steen Hegelund CC: Daniel Machon Signed-off-by: Conor Dooley Acked-by: Nicolas Ferre Acked-by: Steen Hegelund Acked-by: Claudiu Beznea Link: https://lore.kernel.org/r/20230522-wagon-stencil-a164ec39322a@wendy Signed-off-by: Arnd Bergmann --- MAINTAINERS | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 27ef11624748..9fd43a3ef8b0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2429,6 +2429,15 @@ X: drivers/net/wireless/atmel/ N: at91 N: atmel +ARM/MICROCHIP (ARM64) SoC support +M: Conor Dooley +M: Nicolas Ferre +M: Claudiu Beznea +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +S: Supported +T: git https://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git +F: arch/arm64/boot/dts/microchip/ + ARM/Microchip Sparx5 SoC support M: Lars Povlsen M: Steen Hegelund @@ -2436,8 +2445,7 @@ M: Daniel Machon M: UNGLinuxDriver@microchip.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported -T: git git://github.com/microchip-ung/linux-upstream.git -F: arch/arm64/boot/dts/microchip/ +F: arch/arm64/boot/dts/microchip/sparx* F: drivers/net/ethernet/microchip/vcap/ F: drivers/pinctrl/pinctrl-microchip-sgpio.c N: sparx5 From 533ab73f5b5c95dcb4152b52d5482abcc824c690 Mon Sep 17 00:00:00 2001 From: Wenwen Chen Date: Thu, 25 May 2023 16:26:26 +0800 Subject: [PATCH 166/271] io_uring: unlock sqd->lock before sq thread release CPU The sq thread actively releases CPU resources by calling the cond_resched() and schedule() interfaces when it is idle. Therefore, more resources are available for other threads to run. There exists a problem in sq thread: it does not unlock sqd->lock before releasing CPU resources every time. This makes other threads pending on sqd->lock for a long time. For example, the following interfaces all require sqd->lock: io_sq_offload_create(), io_register_iowq_max_workers() and io_ring_exit_work(). Before the sq thread releases CPU resources, unlocking sqd->lock will provide the user a better experience because it can respond quickly to user requests. Signed-off-by: Kanchan Joshi Signed-off-by: Wenwen Chen Link: https://lore.kernel.org/r/20230525082626.577862-1-wenwen.chen@samsung.com Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 9db4bc1f521a..5e329e3cd470 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -255,9 +255,13 @@ static int io_sq_thread(void *data) sqt_spin = true; if (sqt_spin || !time_after(jiffies, timeout)) { - cond_resched(); if (sqt_spin) timeout = jiffies + sqd->sq_thread_idle; + if (unlikely(need_resched())) { + mutex_unlock(&sqd->lock); + cond_resched(); + mutex_lock(&sqd->lock); + } continue; } From 3bf8c6307bad5c0cc09cde982e146d847859b651 Mon Sep 17 00:00:00 2001 From: Wyes Karny Date: Thu, 18 May 2023 05:58:19 +0000 Subject: [PATCH 167/271] cpufreq: amd-pstate: Update policy->cur in amd_pstate_adjust_perf() Driver should update policy->cur after updating the frequency. Currently amd_pstate doesn't update policy->cur when `adjust_perf` is used. Which causes /proc/cpuinfo to show wrong cpu frequency. Fix this by updating policy->cur with correct frequency value in adjust_perf function callback. - Before the fix: (setting min freq to 1.5 MHz) [root@amd]# cat /proc/cpuinfo | grep "cpu MHz" | sort | uniq --count 1 cpu MHz : 1777.016 1 cpu MHz : 1797.160 1 cpu MHz : 1797.270 189 cpu MHz : 400.000 - After the fix: (setting min freq to 1.5 MHz) [root@amd]# cat /proc/cpuinfo | grep "cpu MHz" | sort | uniq --count 1 cpu MHz : 1753.353 1 cpu MHz : 1756.838 1 cpu MHz : 1776.466 1 cpu MHz : 1776.873 1 cpu MHz : 1777.308 1 cpu MHz : 1779.900 183 cpu MHz : 1805.231 1 cpu MHz : 1956.815 1 cpu MHz : 2246.203 1 cpu MHz : 2259.984 Fixes: 1d215f0319c2 ("cpufreq: amd-pstate: Add fast switch function for AMD P-State") Signed-off-by: Wyes Karny [ rjw: Subject edits ] Cc: 5.17+ # 5.17+ Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index ac54779f5a49..ddd346a239e0 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -501,12 +501,14 @@ static void amd_pstate_adjust_perf(unsigned int cpu, unsigned long capacity) { unsigned long max_perf, min_perf, des_perf, - cap_perf, lowest_nonlinear_perf; + cap_perf, lowest_nonlinear_perf, max_freq; struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); struct amd_cpudata *cpudata = policy->driver_data; + unsigned int target_freq; cap_perf = READ_ONCE(cpudata->highest_perf); lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); + max_freq = READ_ONCE(cpudata->max_freq); des_perf = cap_perf; if (target_perf < capacity) @@ -523,6 +525,10 @@ static void amd_pstate_adjust_perf(unsigned int cpu, if (max_perf < min_perf) max_perf = min_perf; + des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); + target_freq = div_u64(des_perf * max_freq, max_perf); + policy->cur = target_freq; + amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true, policy->governor->flags); cpufreq_cpu_put(policy); From edc0a2b5957652f4685ef3516f519f84807087db Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 23 Mar 2023 09:56:40 +0800 Subject: [PATCH 168/271] x86/topology: Fix erroneous smp_num_siblings on Intel Hybrid platforms Traditionally, all CPUs in a system have identical numbers of SMT siblings. That changes with hybrid processors where some logical CPUs have a sibling and others have none. Today, the CPU boot code sets the global variable smp_num_siblings when every CPU thread is brought up. The last thread to boot will overwrite it with the number of siblings of *that* thread. That last thread to boot will "win". If the thread is a Pcore, smp_num_siblings == 2. If it is an Ecore, smp_num_siblings == 1. smp_num_siblings describes if the *system* supports SMT. It should specify the maximum number of SMT threads among all cores. Ensure that smp_num_siblings represents the system-wide maximum number of siblings by always increasing its value. Never allow it to decrease. On MeteorLake-P platform, this fixes a problem that the Ecore CPUs are not updated in any cpu sibling map because the system is treated as an UP system when probing Ecore CPUs. Below shows part of the CPU topology information before and after the fix, for both Pcore and Ecore CPU (cpu0 is Pcore, cpu 12 is Ecore). ... -/sys/devices/system/cpu/cpu0/topology/package_cpus:000fff -/sys/devices/system/cpu/cpu0/topology/package_cpus_list:0-11 +/sys/devices/system/cpu/cpu0/topology/package_cpus:3fffff +/sys/devices/system/cpu/cpu0/topology/package_cpus_list:0-21 ... -/sys/devices/system/cpu/cpu12/topology/package_cpus:001000 -/sys/devices/system/cpu/cpu12/topology/package_cpus_list:12 +/sys/devices/system/cpu/cpu12/topology/package_cpus:3fffff +/sys/devices/system/cpu/cpu12/topology/package_cpus_list:0-21 Notice that the "before" 'package_cpus_list' has only one CPU. This means that userspace tools like lscpu will see a little laptop like an 11-socket system: -Core(s) per socket: 1 -Socket(s): 11 +Core(s) per socket: 16 +Socket(s): 1 This is also expected to make the scheduler do rather wonky things too. [ dhansen: remove CPUID detail from changelog, add end user effects ] CC: stable@kernel.org Fixes: bbb65d2d365e ("x86: use cpuid vector 0xb when available for detecting cpu topology") Fixes: 95f3d39ccf7a ("x86/cpu/topology: Provide detect_extended_topology_early()") Suggested-by: Len Brown Signed-off-by: Zhang Rui Signed-off-by: Dave Hansen Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20230323015640.27906-1-rui.zhang%40intel.com --- arch/x86/kernel/cpu/topology.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 5e868b62a7c4..0270925fe013 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -79,7 +79,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c) * initial apic id, which also represents 32-bit extended x2apic id. */ c->initial_apicid = edx; - smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); #endif return 0; } @@ -109,7 +109,8 @@ int detect_extended_topology(struct cpuinfo_x86 *c) */ cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); c->initial_apicid = edx; - core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); From 9828ed3f695a138f7add89fa2a186ababceb8006 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 25 May 2023 09:32:25 -0700 Subject: [PATCH 169/271] module: error out early on concurrent load of the same module file It turns out that udev under certain circumstances will concurrently try to load the same modules over-and-over excessively. This isn't a kernel bug, but it ends up affecting the kernel, to the point that under certain circumstances we can fail to boot, because the kernel uses a lot of memory to read all the module data all at once. Note that it isn't a memory leak, it's just basically a thundering herd problem happening at bootup with a lot of CPUs, with the worst cases then being pretty bad. Admittedly the worst situations are somewhat contrived: lots and lots of CPUs, not a lot of memory, and KASAN enabled to make it all slower and as such (unintentionally) exacerbate the problem. Luis explains: [1] "My best assessment of the situation is that each CPU in udev ends up triggering a load of duplicate set of modules, not just one, but *a lot*. Not sure what heuristics udev uses to load a set of modules per CPU." Petr Pavlu chimes in: [2] "My understanding is that udev workers are forked. An initial kmod context is created by the main udevd process but no sharing happens after the fork. It means that the mentioned memory pool logic doesn't really kick in. Multiple parallel load requests come from multiple udev workers, for instance, each handling an udev event for one CPU device and making the exactly same requests as all others are doing at the same time. The optimization idea would be to recognize these duplicate requests at the udevd/kmod level and converge them" Note that module loading has tried to mitigate this issue before, see for example commit 064f4536d139 ("module: avoid allocation if module is already present and ready"), which has a few ASCII graphs on memory use due to this same issue. However, while that noticed that the module was already loaded, and exited with an error early before spending any more time on setting up the module, it didn't handle the case of multiple concurrent module loads all being active - but not complete - at the same time. Yes, one of them will eventually win the race and finalize its copy, and the others will then notice that the module already exists and error out, but while this all happens, we have tons of unnecessary concurrent work being done. Again, the real fix is for udev to not do that (maybe it should use threads instead of fork, and have actual shared data structures and not cause duplicate work). That real fix is apparently not trivial. But it turns out that the kernel already has a pretty good model for dealing with concurrent access to the same file: the i_writecount of the inode. In fact, the module loading already indirectly uses 'i_writecount' , because 'kernel_file_read()' will in fact do ret = deny_write_access(file); if (ret) return ret; ... allow_write_access(file); around the read of the file data. We do not allow concurrent writes to the file, and return -ETXTBUSY if the file was open for writing at the same time as the module data is loaded from it. And the solution to the reader concurrency problem is to simply extend this "no concurrent writers" logic to simply be "exclusive access". Note that "exclusive" in this context isn't really some absolute thing: it's only exclusion from writers and from other "special readers" that do this writer denial. So we simply introduce a variation of that "deny_write_access()" logic that not only denies write access, but also requires that this is the _only_ such access that denies write access. Which means that you can't start loading a module that is already being loaded as a module by somebody else, or you will get the same -ETXTBSY error that you would get if there were writers around. [ It also means that you can't try to load a currently executing executable as a module, for the same reason: executables do that same "deny_write_access()" thing, and that's obviously where the whole ETXTBSY logic traditionally came from. This is not a problem for kernel modules, since the set of normal executable files and kernel module files is entirely disjoint. ] This new function is called "exclusive_deny_write_access()", and the implementation is trivial, in that it's just an atomic decrement of i_writecount if it was 0 before. To use that new exclusivity check, all we then do is wrap the module loading with that exclusive_deny_write_access()() / allow_write_access() pair. The actual patch is a bit bigger than that, because we want to surround not just the "load file data" part, but the whole module setup, to get maximum exclusion. So this ends up splitting up "finit_module()" into a few helper functions to make it all very clear and legible. In Luis' test-case (bringing up 255 vcpu's in a virtual machine [3]), the "wasted vmalloc" space (ie module data read into a vmalloc'ed area in order to be loaded as a module, but then discarded because somebody else loaded the same module instead) dropped from 1.8GiB to 474kB. Yes, that's gigabytes to kilobytes. It doesn't drop completely to zero, because even with this change, you can still end up having completely serial pointless module loads, where one udev process has loaded a module fully (and thus the kernel has released that exclusive lock on the module file), and then another udev process tries to load the same module again. So while we cannot fully get rid of the fundamental bug in user space, we _can_ get rid of the excessive concurrent thundering herd effect. A couple of final side notes on this all: - This tweak only affects the "finit_module()" system call, which gives the kernel a file descriptor with the module data. You can also just feed the module data as raw data from user space with "init_module()" (note the lack of 'f' at the beginning), and obviously for that case we do _not_ have any "exclusive read" logic. So if you absolutely want to do things wrong in user space, and try to load the same module multiple times, and error out only later when the kernel ends up saying "you can't load the same module name twice", you can still do that. And in fact, some distros will do exactly that, because they will uncompress the kernel module data in user space before feeding it to the kernel (mainly because they haven't started using the new kernel side decompression yet). So this is not some absolute "you can't do concurrent loads of the same module". It's literally just a very simple heuristic that will catch it early in case you try to load the exact same module file at the same time, and in that case avoid a potentially nasty situation. - There is another user of "deny_write_access()": the verity code that enables fs-verity on a file (the FS_IOC_ENABLE_VERITY ioctl). If you use fs-verity and you care about verifying the kernel modules (which does make sense), you should do it *before* loading said kernel module. That may sound obvious, but now the implementation basically requires it. Because if you try to do it concurrently, the kernel may refuse to load the module file that is being set up by the fs-verity code. - This all will obviously mean that if you insist on loading the same module in parallel, only one module load will succeed, and the others will return with an error. That was true before too, but what is different is that the -ETXTBSY error can be returned *before* the success case of another process fully loading and instantiating the module. Again, that might sound obvious, and it is indeed the whole point of the whole change: we are much quicker to notice the whole "you're already in the process of loading this module". So it's very much intentional, but it does mean that if you just spray the kernel with "finit_module()", and expect that the module is immediately loaded afterwards without checking the return value, you are doing something horribly horribly wrong. I'd like to say that that would never happen, but the whole _reason_ for this commit is that udev is currently doing something horribly horribly wrong, so ... Link: https://lore.kernel.org/all/ZEGopJ8VAYnE7LQ2@bombadil.infradead.org/ [1] Link: https://lore.kernel.org/all/23bd0ce6-ef78-1cd8-1f21-0e706a00424a@suse.com/ [2] Link: https://lore.kernel.org/lkml/ZG%2Fa+nrt4%2FAAUi5z@bombadil.infradead.org/ [3] Cc: Greg Kroah-Hartman Cc: Lucas De Marchi Cc: Petr Pavlu Tested-by: Luis Chamberlain Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 ++++ kernel/module/main.c | 72 ++++++++++++++++++++++++++++++-------------- 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 133f0640fb24..86b50271b4f7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2566,6 +2566,12 @@ static inline int deny_write_access(struct file *file) struct inode *inode = file_inode(file); return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; } +static inline int exclusive_deny_write_access(struct file *file) +{ + int old = 0; + struct inode *inode = file_inode(file); + return atomic_try_cmpxchg(&inode->i_writecount, &old, -1) ? 0 : -ETXTBSY; +} static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); diff --git a/kernel/module/main.c b/kernel/module/main.c index 044aa2c9e3cb..b4c7e925fdb0 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3057,11 +3057,53 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, return load_module(&info, uargs, 0); } -SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) +static int file_init_module(struct file *file, const char __user * uargs, int flags) { struct load_info info = { }; void *buf = NULL; int len; + + len = kernel_read_file(file, 0, &buf, INT_MAX, NULL, + READING_MODULE); + if (len < 0) { + mod_stat_inc(&failed_kreads); + mod_stat_add_long(len, &invalid_kread_bytes); + return len; + } + + if (flags & MODULE_INIT_COMPRESSED_FILE) { + int err = module_decompress(&info, buf, len); + vfree(buf); /* compressed data is no longer needed */ + if (err) { + mod_stat_inc(&failed_decompress); + mod_stat_add_long(len, &invalid_decompress_bytes); + return err; + } + } else { + info.hdr = buf; + info.len = len; + } + + return load_module(&info, uargs, flags); +} + +/* + * kernel_read_file() will already deny write access, but module + * loading wants _exclusive_ access to the file, so we do that + * here, along with basic sanity checks. + */ +static int prepare_file_for_module_load(struct file *file) +{ + if (!file || !(file->f_mode & FMODE_READ)) + return -EBADF; + if (!S_ISREG(file_inode(file)->i_mode)) + return -EINVAL; + return exclusive_deny_write_access(file); +} + +SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) +{ + struct fd f; int err; err = may_init_module(); @@ -3075,28 +3117,14 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_COMPRESSED_FILE)) return -EINVAL; - len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL, - READING_MODULE); - if (len < 0) { - mod_stat_inc(&failed_kreads); - mod_stat_add_long(len, &invalid_kread_bytes); - return len; + f = fdget(fd); + err = prepare_file_for_module_load(f.file); + if (!err) { + err = file_init_module(f.file, uargs, flags); + allow_write_access(f.file); } - - if (flags & MODULE_INIT_COMPRESSED_FILE) { - err = module_decompress(&info, buf, len); - vfree(buf); /* compressed data is no longer needed */ - if (err) { - mod_stat_inc(&failed_decompress); - mod_stat_add_long(len, &invalid_decompress_bytes); - return err; - } - } else { - info.hdr = buf; - info.len = len; - } - - return load_module(&info, uargs, flags); + fdput(f); + return err; } /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ From 1dd5483af494216e91c9a12f11992bcba483f944 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 24 May 2023 15:12:29 -0500 Subject: [PATCH 170/271] smb3: update a reviewer email in MAINTAINERS file Paulo has a new email address so update maintainers files. Signed-off-by: Steve French --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 207a65905f5e..839cc375b205 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5130,7 +5130,7 @@ X: drivers/clk/clkdev.c COMMON INTERNET FILE SYSTEM CLIENT (CIFS and SMB3) M: Steve French -R: Paulo Alcantara (DFS, global name space) +R: Paulo Alcantara (DFS, global name space) R: Ronnie Sahlberg (directory leases, sparse files) R: Shyam Prasad N (multichannel) R: Tom Talpey (RDMA, smbdirect) From b535cc796a4b4942cd189652588e8d37c1f5925a Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 25 May 2023 18:53:28 -0500 Subject: [PATCH 171/271] smb3: missing null check in SMB2_change_notify If plen is null when passed in, we only checked for null in one of the two places where it could be used. Although plen is always valid (not null) for current callers of the SMB2_change_notify function, this change makes it more consistent. Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/202305251831.3V1gbbFs-lkp@intel.com/ Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 9ed61b6f9b21..7063b395d22f 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -3725,7 +3725,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, if (*out_data == NULL) { rc = -ENOMEM; goto cnotify_exit; - } else + } else if (plen) *plen = le32_to_cpu(smb_rsp->OutputBufferLength); } From d68cb7cf1fd0ef4287bc0ecd1ed0b6ae8e05fc70 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Wed, 24 May 2023 21:49:08 +0200 Subject: [PATCH 172/271] net: mellanox: mlxbf_gige: Fix skb_panic splat under memory pressure Do skb_put() after a new skb has been successfully allocated otherwise the reused skb leads to skb_panics or incorrect packet sizes. Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver") Signed-off-by: Thomas Bogendoerfer Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230524194908.147145-1-tbogendoerfer@suse.de Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c index afa3b92a6905..0d5a41a2ae01 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c @@ -245,12 +245,6 @@ static bool mlxbf_gige_rx_packet(struct mlxbf_gige *priv, int *rx_pkts) skb = priv->rx_skb[rx_pi_rem]; - skb_put(skb, datalen); - - skb->ip_summed = CHECKSUM_NONE; /* device did not checksum packet */ - - skb->protocol = eth_type_trans(skb, netdev); - /* Alloc another RX SKB for this same index */ rx_skb = mlxbf_gige_alloc_skb(priv, MLXBF_GIGE_DEFAULT_BUF_SZ, &rx_buf_dma, DMA_FROM_DEVICE); @@ -259,6 +253,13 @@ static bool mlxbf_gige_rx_packet(struct mlxbf_gige *priv, int *rx_pkts) priv->rx_skb[rx_pi_rem] = rx_skb; dma_unmap_single(priv->dev, *rx_wqe_addr, MLXBF_GIGE_DEFAULT_BUF_SZ, DMA_FROM_DEVICE); + + skb_put(skb, datalen); + + skb->ip_summed = CHECKSUM_NONE; /* device did not checksum packet */ + + skb->protocol = eth_type_trans(skb, netdev); + *rx_wqe_addr = rx_buf_dma; } else if (rx_cqe & MLXBF_GIGE_RX_CQE_PKT_STATUS_MAC_ERR) { priv->stats.rx_mac_errors++; From ffb3322181d9e8db880202e4f00991764a35d812 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Wed, 24 May 2023 20:57:14 +0800 Subject: [PATCH 173/271] net: stmmac: fix call trace when stmmac_xdp_xmit() is invoked We encountered a kernel call trace issue which was related to ndo_xdp_xmit callback on our i.MX8MP platform. The reproduce steps show as follows. 1. The FEC port (eth0) connects to a PC port, and the PC uses pktgen_sample03_burst_single_flow.sh to generate packets and send these packets to the FEC port. Notice that the script must be executed before step 2. 2. Run the "./xdp_redirect eth0 eth1" command on i.MX8MP, the eth1 interface is the dwmac. Then there will be a call trace issue soon. Please see the log for more details. The root cause is that the NETDEV_XDP_ACT_NDO_XMIT feature is enabled by default, so when the step 2 command is exexcuted and packets have already been sent to eth0, the stmmac_xdp_xmit() starts running before the stmmac_xdp_set_prog() finishes. To resolve this issue, we disable the NETDEV_XDP_ACT_NDO_XMIT feature by default and turn on/off this feature when the bpf program is installed/uninstalled which just like the other ethernet drivers. Call Trace log: [ 306.311271] ------------[ cut here ]------------ [ 306.315910] WARNING: CPU: 0 PID: 15 at lib/timerqueue.c:55 timerqueue_del+0x68/0x70 [ 306.323590] Modules linked in: [ 306.326654] CPU: 0 PID: 15 Comm: ksoftirqd/0 Not tainted 6.4.0-rc1+ #37 [ 306.333277] Hardware name: NXP i.MX8MPlus EVK board (DT) [ 306.338591] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 306.345561] pc : timerqueue_del+0x68/0x70 [ 306.349577] lr : __remove_hrtimer+0x5c/0xa0 [ 306.353777] sp : ffff80000b7c3920 [ 306.357094] x29: ffff80000b7c3920 x28: 0000000000000000 x27: 0000000000000001 [ 306.364244] x26: ffff80000a763a40 x25: ffff0000d0285a00 x24: 0000000000000001 [ 306.371390] x23: 0000000000000001 x22: ffff000179389a40 x21: 0000000000000000 [ 306.378537] x20: ffff000179389aa0 x19: ffff0000d2951308 x18: 0000000000001000 [ 306.385686] x17: f1d3000000000000 x16: 00000000c39c1000 x15: 55e99bbe00001a00 [ 306.392835] x14: 09000900120aa8c0 x13: e49af1d300000000 x12: 000000000000c39c [ 306.399987] x11: 100055e99bbe0000 x10: ffff8000090b1048 x9 : ffff8000081603fc [ 306.407133] x8 : 000000000000003c x7 : 000000000000003c x6 : 0000000000000001 [ 306.414284] x5 : ffff0000d2950980 x4 : 0000000000000000 x3 : 0000000000000000 [ 306.421432] x2 : 0000000000000001 x1 : ffff0000d2951308 x0 : ffff0000d2951308 [ 306.428585] Call trace: [ 306.431035] timerqueue_del+0x68/0x70 [ 306.434706] __remove_hrtimer+0x5c/0xa0 [ 306.438549] hrtimer_start_range_ns+0x2bc/0x370 [ 306.443089] stmmac_xdp_xmit+0x174/0x1b0 [ 306.447021] bq_xmit_all+0x194/0x4b0 [ 306.450612] __dev_flush+0x4c/0x98 [ 306.454024] xdp_do_flush+0x18/0x38 [ 306.457522] fec_enet_rx_napi+0x6c8/0xc68 [ 306.461539] __napi_poll+0x40/0x220 [ 306.465038] net_rx_action+0xf8/0x240 [ 306.468707] __do_softirq+0x128/0x3a8 [ 306.472378] run_ksoftirqd+0x40/0x58 [ 306.475961] smpboot_thread_fn+0x1c4/0x288 [ 306.480068] kthread+0x124/0x138 [ 306.483305] ret_from_fork+0x10/0x20 [ 306.486889] ---[ end trace 0000000000000000 ]--- Fixes: 66c0e13ad236 ("drivers: net: turn on XDP features") Signed-off-by: Wei Fang Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230524125714.357337-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 +-- drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.c | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 0fca81507a77..52cab9de05f2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7233,8 +7233,7 @@ int stmmac_dvr_probe(struct device *device, ndev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM; ndev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | - NETDEV_XDP_ACT_XSK_ZEROCOPY | - NETDEV_XDP_ACT_NDO_XMIT; + NETDEV_XDP_ACT_XSK_ZEROCOPY; ret = stmmac_tc_init(priv, priv); if (!ret) { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.c index 9d4d8c3dad0a..aa6f16d3df64 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_xdp.c @@ -117,6 +117,9 @@ int stmmac_xdp_set_prog(struct stmmac_priv *priv, struct bpf_prog *prog, return -EOPNOTSUPP; } + if (!prog) + xdp_features_clear_redirect_target(dev); + need_update = !!priv->xdp_prog != !!prog; if (if_running && need_update) stmmac_xdp_release(dev); @@ -131,5 +134,8 @@ int stmmac_xdp_set_prog(struct stmmac_priv *priv, struct bpf_prog *prog, if (if_running && need_update) stmmac_xdp_open(dev); + if (prog) + xdp_features_set_redirect_target(dev, false); + return 0; } From 31642e7089df8fd3f54ca7843f7ee2952978cad1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 May 2023 14:14:56 +0000 Subject: [PATCH 174/271] netrom: fix info-leak in nr_write_internal() Simon Kapadia reported the following issue: The Online Amateur Radio Community (OARC) has recently been experimenting with building a nationwide packet network in the UK. As part of our experimentation, we have been testing out packet on 300bps HF, and playing with net/rom. For HF packet at this baud rate you really need to make sure that your MTU is relatively low; AX.25 suggests a PACLEN of 60, and a net/rom PACLEN of 40 to go with that. However the Linux net/rom support didn't work with a low PACLEN; the mkiss module would truncate packets if you set the PACLEN below about 200 or so, e.g.: Apr 19 14:00:51 radio kernel: [12985.747310] mkiss: ax1: truncating oversized transmit packet! This didn't make any sense to me (if the packets are smaller why would they be truncated?) so I started investigating. I looked at the packets using ethereal, and found that many were just huge compared to what I would expect. A simple net/rom connection request packet had the request and then a bunch of what appeared to be random data following it: Simon provided a patch that I slightly revised: Not only we must not use skb_tailroom(), we also do not want to count NR_NETWORK_LEN twice. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Co-Developed-by: Simon Kapadia Signed-off-by: Simon Kapadia Signed-off-by: Eric Dumazet Tested-by: Simon Kapadia Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230524141456.1045467-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/netrom/nr_subr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c index 3f99b432ea70..e2d2af924cff 100644 --- a/net/netrom/nr_subr.c +++ b/net/netrom/nr_subr.c @@ -123,7 +123,7 @@ void nr_write_internal(struct sock *sk, int frametype) unsigned char *dptr; int len, timeout; - len = NR_NETWORK_LEN + NR_TRANSPORT_LEN; + len = NR_TRANSPORT_LEN; switch (frametype & 0x0F) { case NR_CONNREQ: @@ -141,7 +141,8 @@ void nr_write_internal(struct sock *sk, int frametype) return; } - if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + skb = alloc_skb(NR_NETWORK_LEN + len, GFP_ATOMIC); + if (!skb) return; /* @@ -149,7 +150,7 @@ void nr_write_internal(struct sock *sk, int frametype) */ skb_reserve(skb, NR_NETWORK_LEN); - dptr = skb_put(skb, skb_tailroom(skb)); + dptr = skb_put(skb, len); switch (frametype & 0x0F) { case NR_CONNREQ: From 081e8df6819997eae236f75dd52f0c147c4be939 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 May 2023 10:07:12 -0700 Subject: [PATCH 175/271] tools: ynl: avoid dict errors on older Python versions Python 3.9.0 or newer supports combining dicts() with |, but older versions of Python are still used in the wild (e.g. on CentOS 8, which goes EoL May 31, 2024). With Python 3.6.8 we get: TypeError: unsupported operand type(s) for |: 'dict' and 'dict' Use older syntax. Tested with non-legacy families only. Fixes: f036d936ca57 ("tools: ynl: Add fixed-header support to ynl") Reviewed-by: Simon Horman Reviewed-by: Donald Hunter Tested-by: Donald Hunter Link: https://lore.kernel.org/r/20230524170712.2036128-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index aa77bcae4807..3144f33196be 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -591,8 +591,9 @@ class YnlFamily(SpecFamily): print('Unexpected message: ' + repr(gm)) continue - rsp.append(self._decode(gm.raw_attrs, op.attr_set.name) - | gm.fixed_header_attrs) + rsp_msg = self._decode(gm.raw_attrs, op.attr_set.name) + rsp_msg.update(gm.fixed_header_attrs) + rsp.append(rsp_msg) if not rsp: return None From 822b5a1c17df7e338b9f05d1cfe5764e37c7f74f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 May 2023 16:29:34 -0700 Subject: [PATCH 176/271] af_packet: Fix data-races of pkt_sk(sk)->num. syzkaller found a data race of pkt_sk(sk)->num. The value is changed under lock_sock() and po->bind_lock, so we need READ_ONCE() to access pkt_sk(sk)->num without these locks in packet_bind_spkt(), packet_bind(), and sk_diag_fill(). Note that WRITE_ONCE() is already added by commit c7d2ef5dd4b0 ("net/packet: annotate accesses to po->bind"). BUG: KCSAN: data-race in packet_bind / packet_do_bind write (marked) to 0xffff88802ffd1cee of 2 bytes by task 7322 on cpu 0: packet_do_bind+0x446/0x640 net/packet/af_packet.c:3236 packet_bind+0x99/0xe0 net/packet/af_packet.c:3321 __sys_bind+0x19b/0x1e0 net/socket.c:1803 __do_sys_bind net/socket.c:1814 [inline] __se_sys_bind net/socket.c:1812 [inline] __x64_sys_bind+0x40/0x50 net/socket.c:1812 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x72/0xdc read to 0xffff88802ffd1cee of 2 bytes by task 7318 on cpu 1: packet_bind+0xbf/0xe0 net/packet/af_packet.c:3322 __sys_bind+0x19b/0x1e0 net/socket.c:1803 __do_sys_bind net/socket.c:1814 [inline] __se_sys_bind net/socket.c:1812 [inline] __x64_sys_bind+0x40/0x50 net/socket.c:1812 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x72/0xdc value changed: 0x0300 -> 0x0000 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 7318 Comm: syz-executor.4 Not tainted 6.3.0-13380-g7fddb5b5300c #4 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Fixes: 96ec6327144e ("packet: Diag core and basic socket info dumping") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20230524232934.50950-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 4 ++-- net/packet/diag.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 94c6a1ffa459..a1f9a0e9f3c8 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3299,7 +3299,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); name[sizeof(uaddr->sa_data_min)] = 0; - return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); + return packet_do_bind(sk, name, 0, READ_ONCE(pkt_sk(sk)->num)); } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) @@ -3317,7 +3317,7 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len return -EINVAL; return packet_do_bind(sk, NULL, sll->sll_ifindex, - sll->sll_protocol ? : pkt_sk(sk)->num); + sll->sll_protocol ? : READ_ONCE(pkt_sk(sk)->num)); } static struct proto packet_proto = { diff --git a/net/packet/diag.c b/net/packet/diag.c index d0c4eda4cdc6..f6b200cb3c06 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -143,7 +143,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, rp = nlmsg_data(nlh); rp->pdiag_family = AF_PACKET; rp->pdiag_type = sk->sk_type; - rp->pdiag_num = ntohs(po->num); + rp->pdiag_num = ntohs(READ_ONCE(po->num)); rp->pdiag_ino = sk_ino; sock_diag_save_cookie(sk, rp->pdiag_cookie); From 8a0d57df8938e9fd2e99d47a85b7f37d86f91097 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 May 2023 22:17:41 -0700 Subject: [PATCH 177/271] tls: improve lockless access safety of tls_err_abort() Most protos' poll() methods insert a memory barrier between writes to sk_err and sk_error_report(). This dates back to commit a4d258036ed9 ("tcp: Fix race in tcp_poll"). I guess we should do the same thing in TLS, tcp_poll() does not hold the socket lock. Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/tls/tls_strp.c | 4 +++- net/tls/tls_sw.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index da95abbb7ea3..f37f4a0fcd3c 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -20,7 +20,9 @@ static void tls_strp_abort_strp(struct tls_strparser *strp, int err) strp->stopped = 1; /* Report an error on the lower socket */ - strp->sk->sk_err = -err; + WRITE_ONCE(strp->sk->sk_err, -err); + /* Paired with smp_rmb() in tcp_poll() */ + smp_wmb(); sk_error_report(strp->sk); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 6e6a7c37d685..1a53c8f481e9 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -70,7 +70,9 @@ noinline void tls_err_abort(struct sock *sk, int err) { WARN_ON_ONCE(err >= 0); /* sk->sk_err should contain a positive error code. */ - sk->sk_err = -err; + WRITE_ONCE(sk->sk_err, -err); + /* Paired with smp_rmb() in tcp_poll() */ + smp_wmb(); sk_error_report(sk); } From dc362e20cd6ab7a93d1b09669730c406f0910c35 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Thu, 25 May 2023 23:56:12 +0530 Subject: [PATCH 178/271] amd-xgbe: fix the false linkup in xgbe_phy_status In the event of a change in XGBE mode, the current auto-negotiation needs to be reset and the AN cycle needs to be re-triggerred. However, the current code ignores the return value of xgbe_set_mode(), leading to false information as the link is declared without checking the status register. Fix this by propagating the mode switch status information to xgbe_phy_status(). Fixes: e57f7a3feaef ("amd-xgbe: Prepare for working with more than one type of phy") Co-developed-by: Sudheesh Mavila Signed-off-by: Sudheesh Mavila Reviewed-by: Simon Horman Acked-by: Shyam Sundar S K Signed-off-by: Raju Rangoju Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-mdio.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c index 33a9574e9e04..32d2c6fac652 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c @@ -1329,7 +1329,7 @@ static enum xgbe_mode xgbe_phy_status_aneg(struct xgbe_prv_data *pdata) return pdata->phy_if.phy_impl.an_outcome(pdata); } -static void xgbe_phy_status_result(struct xgbe_prv_data *pdata) +static bool xgbe_phy_status_result(struct xgbe_prv_data *pdata) { struct ethtool_link_ksettings *lks = &pdata->phy.lks; enum xgbe_mode mode; @@ -1367,8 +1367,13 @@ static void xgbe_phy_status_result(struct xgbe_prv_data *pdata) pdata->phy.duplex = DUPLEX_FULL; - if (xgbe_set_mode(pdata, mode) && pdata->an_again) + if (!xgbe_set_mode(pdata, mode)) + return false; + + if (pdata->an_again) xgbe_phy_reconfig_aneg(pdata); + + return true; } static void xgbe_phy_status(struct xgbe_prv_data *pdata) @@ -1398,7 +1403,8 @@ static void xgbe_phy_status(struct xgbe_prv_data *pdata) return; } - xgbe_phy_status_result(pdata); + if (xgbe_phy_status_result(pdata)) + return; if (test_bit(XGBE_LINK_INIT, &pdata->dev_state)) clear_bit(XGBE_LINK_INIT, &pdata->dev_state); From 9b9e46aa07273ceb96866b2e812b46f1ee0b8d2f Mon Sep 17 00:00:00 2001 From: Osama Muhammad Date: Thu, 25 May 2023 22:27:46 +0500 Subject: [PATCH 179/271] nfcsim.c: Fix error checking for debugfs_create_dir This patch fixes the error checking in nfcsim.c. The DebugFS kernel API is developed in a way that the caller can safely ignore the errors that occur during the creation of DebugFS nodes. Signed-off-by: Osama Muhammad Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/nfc/nfcsim.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/nfc/nfcsim.c b/drivers/nfc/nfcsim.c index 44eeb17ae48d..a55381f80cd6 100644 --- a/drivers/nfc/nfcsim.c +++ b/drivers/nfc/nfcsim.c @@ -336,10 +336,6 @@ static struct dentry *nfcsim_debugfs_root; static void nfcsim_debugfs_init(void) { nfcsim_debugfs_root = debugfs_create_dir("nfcsim", NULL); - - if (!nfcsim_debugfs_root) - pr_err("Could not create debugfs entry\n"); - } static void nfcsim_debugfs_remove(void) From 8d73259ef23f449329294dc187932f7470268126 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 May 2023 14:20:38 -0700 Subject: [PATCH 180/271] perf ftrace latency: Remove unnecessary "--" from --use-nsec option The option name should not have the dashes. Current version shows four dashes for the option. $ perf ftrace latency -h Usage: perf ftrace [] [] or: perf ftrace [] -- [] [] or: perf ftrace {trace|latency} [] [] or: perf ftrace {trace|latency} [] -- [] [] -b, --use-bpf Use BPF to measure function latency -n, ----use-nsec Use nano-second histogram -T, --trace-funcs Show latency of given function Fixes: 84005bb6148618cc ("perf ftrace latency: Add -n/--use-nsec option") Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Changbin Du Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230525212038.3535851-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index 810e3376c7d6..f9906f52e4fa 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -1175,7 +1175,7 @@ int cmd_ftrace(int argc, const char **argv) OPT_BOOLEAN('b', "use-bpf", &ftrace.target.use_bpf, "Use BPF to measure function latency"), #endif - OPT_BOOLEAN('n', "--use-nsec", &ftrace.use_nsec, + OPT_BOOLEAN('n', "use-nsec", &ftrace.use_nsec, "Use nano-second histogram"), OPT_PARENT(common_options), }; From a3a9d63dcd15535e7fdf4c7c1b32bfaed762973a Mon Sep 17 00:00:00 2001 From: Tatsuki Sugiura Date: Sat, 20 May 2023 21:23:50 +0900 Subject: [PATCH 181/271] NVMe: Add MAXIO 1602 to bogus nid list. HIKSEMI FUTURE M.2 SSD uses the same dummy nguid and eui64. I confirmed it with my two devices. This patch marks the controller as NVME_QUIRK_BOGUS_NID. --------------------------------------------------------- sugi@tempest:~% sudo nvme id-ctrl /dev/nvme0 NVME Identify Controller: vid : 0x1e4b ssvid : 0x1e4b sn : 30096022612 mn : HS-SSD-FUTURE 2048G fr : SN10542 rab : 0 ieee : 000000 cmic : 0 mdts : 7 cntlid : 0 ver : 0x10400 rtd3r : 0x7a120 rtd3e : 0x1e8480 oaes : 0x200 ctratt : 0x2 rrls : 0 cntrltype : 1 fguid : 00000000-0000-0000-0000-000000000000 --------------------------------------------------------- --------------------------------------------------------- sugi@tempest:~% sudo nvme id-ns /dev/nvme0n1 NVME Identify Namespace 1: nguid : 00000000000000000000000000000000 eui64 : 0000000000000002 lbaf 0 : ms:0 lbads:9 rp:0 (in use) --------------------------------------------------------- Signed-off-by: Tatsuki Sugiura Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 32244582fdb0..492f319ebdf3 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3424,6 +3424,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1e4B, 0x1202), /* MAXIO MAP1202 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1e4B, 0x1602), /* MAXIO MAP1602 */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1cc1, 0x5350), /* ADATA XPG GAMMIX S50 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1dbe, 0x5236), /* ADATA XPG GAMMIX S70 */ From 690917c647e245da76183656400d4926427a8b93 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 24 May 2023 17:03:07 -0700 Subject: [PATCH 182/271] perf bpf filter: Fix a broken perf sample data naming for BPF CO-RE BPF CO-RE requires 3 underscores for the ignored suffix rule but it mistakenly used only 2. Let's fix it. Fixes: 3a8b8fc3174891c4 ("perf bpf filter: Support pre-5.16 kernels where 'mem_hops' isn't in 'union perf_mem_data_src'") Signed-off-by: Namhyung Kim Acked-by: Andrii Nakryiko Acked-by: John Fastabend Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Song Liu Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20230525000307.3202449-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf_skel/sample_filter.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c index cffe493af1ed..fb94f5280626 100644 --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c @@ -25,7 +25,7 @@ struct perf_sample_data___new { } __attribute__((preserve_access_index)); /* new kernel perf_mem_data_src definition */ -union perf_mem_data_src__new { +union perf_mem_data_src___new { __u64 val; struct { __u64 mem_op:5, /* type of opcode */ @@ -108,7 +108,7 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx, if (entry->part == 7) return kctx->data->data_src.mem_blk; if (entry->part == 8) { - union perf_mem_data_src__new *data = (void *)&kctx->data->data_src; + union perf_mem_data_src___new *data = (void *)&kctx->data->data_src; if (bpf_core_field_exists(data->mem_hops)) return data->mem_hops; From a0c2f92d36d20d72efb27cbe8669037321e92f22 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 15 May 2023 09:50:39 -0700 Subject: [PATCH 183/271] perf arm: Fix include path to cs-etm.h Change "../cs-etm.h" to just "../../../util/cs-etm.h" as ../cs-etm.h doesn't exist. Suggested-by: Leo Yan Reviewed-by: Leo Yan Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Mark Rutland Cc: Mathieu Poirier Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: Will Deacon Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230515165039.544045-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/arch/arm/util/pmu.c b/tools/perf/arch/arm/util/pmu.c index 860a8b42b4b5..a9623b128ece 100644 --- a/tools/perf/arch/arm/util/pmu.c +++ b/tools/perf/arch/arm/util/pmu.c @@ -12,7 +12,7 @@ #include "arm-spe.h" #include "hisi-ptt.h" #include "../../../util/pmu.h" -#include "../cs-etm.h" +#include "../../../util/cs-etm.h" struct perf_event_attr *perf_pmu__get_default_config(struct perf_pmu *pmu __maybe_unused) From 65cd8e5534f873b76b55fb17e942c512ee3699d9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 17 Apr 2023 12:25:46 -0700 Subject: [PATCH 184/271] perf build: Don't compile demangle-cxx.cpp if not necessary demangle-cxx.cpp requires a C++ compiler, but feature checks may fail because of the absence of this. Add a CONFIG_CXX_DEMANGLE so that the source isn't built if not supported. Copy libbfd and cplus demangle variants to a weak symbol-elf.c version so they aren't dependent on C++. These variants are only built with the build option BUILD_NONDISTRO=1. Committer note: This also handles this build break when a C++ compiler isn't available: CXX /tmp/build/perf/util/demangle-cxx.o /bin/sh: g++: command not found Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Qi Liu Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20230417192546.99923-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 1 + tools/perf/util/Build | 2 +- tools/perf/util/symbol-elf.c | 27 +++++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 70268442f7ee..a794d9eca93d 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -927,6 +927,7 @@ ifndef NO_DEMANGLE EXTLIBS += -lstdc++ CFLAGS += -DHAVE_CXA_DEMANGLE_SUPPORT CXXFLAGS += -DHAVE_CXA_DEMANGLE_SUPPORT + $(call detected,CONFIG_CXX_DEMANGLE) endif ifdef BUILD_NONDISTRO ifeq ($(filter -liberty,$(EXTLIBS)),) diff --git a/tools/perf/util/Build b/tools/perf/util/Build index bd18fe5f2719..f9df1df1eec0 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -214,7 +214,7 @@ perf-$(CONFIG_ZSTD) += zstd.o perf-$(CONFIG_LIBCAP) += cap.o -perf-y += demangle-cxx.o +perf-$(CONFIG_CXX_DEMANGLE) += demangle-cxx.o perf-y += demangle-ocaml.o perf-y += demangle-java.o perf-y += demangle-rust.o diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index b2ed9cc52265..63882a4db5c7 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -31,6 +31,13 @@ #include #endif +#if defined(HAVE_LIBBFD_SUPPORT) || defined(HAVE_CPLUS_DEMANGLE_SUPPORT) +#ifndef DMGL_PARAMS +#define DMGL_PARAMS (1 << 0) /* Include function args */ +#define DMGL_ANSI (1 << 1) /* Include const, volatile, etc */ +#endif +#endif + #ifndef EM_AARCH64 #define EM_AARCH64 183 /* ARM 64 bit */ #endif @@ -271,6 +278,26 @@ static bool want_demangle(bool is_kernel_sym) return is_kernel_sym ? symbol_conf.demangle_kernel : symbol_conf.demangle; } +/* + * Demangle C++ function signature, typically replaced by demangle-cxx.cpp + * version. + */ +__weak char *cxx_demangle_sym(const char *str __maybe_unused, bool params __maybe_unused, + bool modifiers __maybe_unused) +{ +#ifdef HAVE_LIBBFD_SUPPORT + int flags = (params ? DMGL_PARAMS : 0) | (modifiers ? DMGL_ANSI : 0); + + return bfd_demangle(NULL, str, flags); +#elif defined(HAVE_CPLUS_DEMANGLE_SUPPORT) + int flags = (params ? DMGL_PARAMS : 0) | (modifiers ? DMGL_ANSI : 0); + + return cplus_demangle(str, flags); +#else + return NULL; +#endif +} + static char *demangle_sym(struct dso *dso, int kmodule, const char *elf_name) { char *demangled = NULL; From 251c01e27feee83ff8fa294d33277616c9032dd0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 26 May 2023 15:30:23 -0300 Subject: [PATCH 185/271] perf bpf: Do not use llvm-strip on BPF binary llvm-strip is not really required. Remove this dependency to make it easier to build perf with BUILD_BPF_SKEL=1. Committer notes: This removes the need for the 'llvm' package just to get llvm-strip. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Song Liu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 1593c5dcaa9e..f48794816d82 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -181,7 +181,6 @@ HOSTCC ?= gcc HOSTLD ?= ld HOSTAR ?= ar CLANG ?= clang -LLVM_STRIP ?= llvm-strip PKG_CONFIG = $(CROSS_COMPILE)pkg-config @@ -1083,7 +1082,7 @@ $(BPFTOOL): | $(SKEL_TMP_OUT) $(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) | $(SKEL_TMP_OUT) $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) $(TOOLS_UAPI_INCLUDE) \ - -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@ + -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ $(SKEL_OUT)/%.skel.h: $(SKEL_TMP_OUT)/%.bpf.o | $(BPFTOOL) $(QUIET_GENSKEL)$(BPFTOOL) gen skeleton $< > $@ From 15d4371bac978bb96b36c37e822cdae24c0dcd77 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 May 2023 11:26:04 +0100 Subject: [PATCH 186/271] perf cs-etm: Copy kernel coresight-pmu.h header Copy the kernel version of the header to fix the header diff build warning. Some new definitions were only added to the tools side header, but these are only used in Perf so move them to a different header. Signed-off-by: James Clark Reviewed-by: Mike Leach Link: https://lore.kernel.org/r/20230522102604.1081416-1-james.clark@arm.com Cc: Mark Rutland Cc: Suzuki K Poulose Cc: Ian Rogers Cc: Peter Zijlstra Cc: Adrian Hunter Cc: acme@kernel.org Cc: Jiri Olsa Cc: Namhyung Kim Cc: Will Deacon Cc: Leo Yan Cc: Alexander Shishkin Cc: linux-arm-kernel@lists.infradead.org Cc: coresight@lists.linaro.org Cc: siyanteng@loongson.cn Cc: John Garry Cc: Ingo Molnar Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/coresight-pmu.h | 13 ------------- tools/perf/util/cs-etm.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tools/include/linux/coresight-pmu.h b/tools/include/linux/coresight-pmu.h index cef3b1c25335..51ac441a37c3 100644 --- a/tools/include/linux/coresight-pmu.h +++ b/tools/include/linux/coresight-pmu.h @@ -21,19 +21,6 @@ */ #define CORESIGHT_LEGACY_CPU_TRACE_ID(cpu) (0x10 + (cpu * 2)) -/* CoreSight trace ID is currently the bottom 7 bits of the value */ -#define CORESIGHT_TRACE_ID_VAL_MASK GENMASK(6, 0) - -/* - * perf record will set the legacy meta data values as unused initially. - * This allows perf report to manage the decoders created when dynamic - * allocation in operation. - */ -#define CORESIGHT_TRACE_ID_UNUSED_FLAG BIT(31) - -/* Value to set for unused trace ID values */ -#define CORESIGHT_TRACE_ID_UNUSED_VAL 0x7F - /* * Below are the definition of bit offsets for perf option, and works as * arbitrary values for all ETM versions. diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h index 70cac0375b34..ecca40787ac9 100644 --- a/tools/perf/util/cs-etm.h +++ b/tools/perf/util/cs-etm.h @@ -227,6 +227,19 @@ struct cs_etm_packet_queue { #define INFO_HEADER_SIZE (sizeof(((struct perf_record_auxtrace_info *)0)->type) + \ sizeof(((struct perf_record_auxtrace_info *)0)->reserved__)) +/* CoreSight trace ID is currently the bottom 7 bits of the value */ +#define CORESIGHT_TRACE_ID_VAL_MASK GENMASK(6, 0) + +/* + * perf record will set the legacy meta data values as unused initially. + * This allows perf report to manage the decoders created when dynamic + * allocation in operation. + */ +#define CORESIGHT_TRACE_ID_UNUSED_FLAG BIT(31) + +/* Value to set for unused trace ID values */ +#define CORESIGHT_TRACE_ID_UNUSED_VAL 0x7F + int cs_etm__process_auxtrace_info(union perf_event *event, struct perf_session *session); struct perf_event_attr *cs_etm_get_default_config(struct perf_pmu *pmu); From c8268a9b91055c992909a0845cfa59c624908da8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 26 May 2023 15:45:03 -0300 Subject: [PATCH 187/271] tools headers UAPI: Sync the linux/in.h with the kernel sources Picking the changes from: 3632679d9e4f879f ("ipv{4,6}/raw: fix output xfrm lookup wrt protocol") That includes a define (IP_PROTOCOL) that isn't being used in generating any id -> string table used by 'perf trace'. Addresses this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/in.h' differs from latest version at 'include/uapi/linux/in.h' diff -u tools/include/uapi/linux/in.h include/uapi/linux/in.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Nicolas Dichtel Cc: Paolo Abeni Link: https://lore.kernel.org/lkml/ZHD/Ms0DMq7viaq+@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/in.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/uapi/linux/in.h b/tools/include/uapi/linux/in.h index 4b7f2df66b99..e682ab628dfa 100644 --- a/tools/include/uapi/linux/in.h +++ b/tools/include/uapi/linux/in.h @@ -163,6 +163,7 @@ struct in_addr { #define IP_MULTICAST_ALL 49 #define IP_UNICAST_IF 50 #define IP_LOCAL_PORT_RANGE 51 +#define IP_PROTOCOL 52 #define MCAST_EXCLUDE 0 #define MCAST_INCLUDE 1 From c041d33bf7ec731bb71f47e4d45a7aec9e40b1b9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 19 May 2023 16:57:57 -0700 Subject: [PATCH 188/271] perf evsel: Separate bpf_counter_list and bpf_filters, can be used at the same time 'struct evsel' uses a union for the two lists. This turned out to be error prone. For example: [root@quaco ~]# perf stat --bpf-prog 5246 Error: cpu-clock event does not have sample flags 66c660 failed to set filter "(null)" on event cpu-clock with 2 (No such file or directory) [root@quaco ~]# perf stat --bpf-prog 5246 Fix this issue by separating the two lists. Fixes: 56ec9457a4a20c5e ("perf bpf filter: Implement event sample filtering") Signed-off-by: Song Liu Acked-by: Namhyung Kim Cc: Jiri Olsa Cc: Namhyung Kim Cc: kernel-team@meta.com Link: https://lore.kernel.org/r/20230519235757.3613719-1-song@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 1 + tools/perf/util/evsel.h | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 2f5910b31fa9..c2dbb5647e75 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -282,6 +282,7 @@ void evsel__init(struct evsel *evsel, evsel->bpf_fd = -1; INIT_LIST_HEAD(&evsel->config_terms); INIT_LIST_HEAD(&evsel->bpf_counter_list); + INIT_LIST_HEAD(&evsel->bpf_filters); perf_evsel__object.init(evsel); evsel->sample_size = __evsel__sample_size(attr->sample_type); evsel__calc_id_pos(evsel); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index df8928745fc6..0f54f28a69c2 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -151,10 +151,8 @@ struct evsel { */ struct bpf_counter_ops *bpf_counter_ops; - union { - struct list_head bpf_counter_list; /* for perf-stat -b */ - struct list_head bpf_filters; /* for perf-record --filter */ - }; + struct list_head bpf_counter_list; /* for perf-stat -b */ + struct list_head bpf_filters; /* for perf-record --filter */ /* for perf-stat --use-bpf */ int bperf_leader_prog_fd; From 47ee3f1dd93bcbe031539b1ecdaafb44b661c772 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 26 May 2023 12:34:20 -0700 Subject: [PATCH 189/271] x86: re-introduce support for ERMS copies for user space accesses I tried to streamline our user memory copy code fairly aggressively in commit adfcf4231b8c ("x86: don't use REP_GOOD or ERMS for user memory copies"), in order to then be able to clean up the code and inline the modern FSRM case in commit 577e6a7fd50d ("x86: inline the 'rep movs' in user copies for the FSRM case"). We had reports [1] of that causing regressions earlier with blogbench, but that turned out to be a horrible benchmark for that case, and not a sufficient reason for re-instating "rep movsb" on older machines. However, now Eric Dumazet reported [2] a regression in performance that seems to be a rather more real benchmark, where due to the removal of "rep movs" a TCP stream over a 100Gbps network no longer reaches line speed. And it turns out that with the simplified the calling convention for the non-FSRM case in commit 427fda2c8a49 ("x86: improve on the non-rep 'copy_user' function"), re-introducing the ERMS case is actually fairly simple. Of course, that "fairly simple" is glossing over several missteps due to having to fight our assembler alternative code. This code really wanted to rewrite a conditional branch to have two different targets, but that made objtool sufficiently unhappy that this instead just ended up doing a choice between "jump to the unrolled loop, or use 'rep movsb' directly". Let's see if somebody finds a case where the kernel memory copies also care (see commit 68674f94ffc9: "x86: don't use REP_GOOD or ERMS for small memory copies"). But Eric does argue that the user copies are special because networking tries to copy up to 32KB at a time, if order-3 pages allocations are possible. In-kernel memory copies are typically small, unless they are the special "copy pages at a time" kind that still use "rep movs". Link: https://lore.kernel.org/lkml/202305041446.71d46724-yujie.liu@intel.com/ [1] Link: https://lore.kernel.org/lkml/CANn89iKUbyrJ=r2+_kK+sb2ZSSHifFZ7QkPLDpAtkJ8v4WUumA@mail.gmail.com/ [2] Reported-and-tested-by: Eric Dumazet Fixes: adfcf4231b8c ("x86: don't use REP_GOOD or ERMS for user memory copies") Signed-off-by: Linus Torvalds --- arch/x86/lib/copy_user_64.S | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 4fc5c2de2de4..01c5de4c279b 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -7,6 +7,8 @@ */ #include +#include +#include #include #include @@ -29,7 +31,7 @@ */ SYM_FUNC_START(rep_movs_alternative) cmpq $64,%rcx - jae .Lunrolled + jae .Llarge cmp $8,%ecx jae .Lword @@ -65,6 +67,12 @@ SYM_FUNC_START(rep_movs_alternative) _ASM_EXTABLE_UA( 2b, .Lcopy_user_tail) _ASM_EXTABLE_UA( 3b, .Lcopy_user_tail) +.Llarge: +0: ALTERNATIVE "jmp .Lunrolled", "rep movsb", X86_FEATURE_ERMS +1: RET + + _ASM_EXTABLE_UA( 0b, 1b) + .p2align 4 .Lunrolled: 10: movq (%rsi),%r8 From 793a539ac78843ef9378bb42a44edfbc552a67d5 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 25 May 2023 17:33:01 -0700 Subject: [PATCH 190/271] cxl: Explicitly initialize resources when media is not ready When media is not ready do not assume that the capacity information from the identify command is valid, i.e. ->total_bytes ->partition_align_bytes ->{volatile,persistent}_only_bytes. Explicitly zero out the capacity resources and exit early. Given zero-init of those fields this patch is functionally equivalent to the prior state, but it improves readability and robustness going forward. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/168506118166.3004974.13523455340007852589.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- drivers/cxl/core/mbox.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 2c8dc7e2b84d..bea9cf31a12d 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1105,6 +1105,13 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds) struct device *dev = cxlds->dev; int rc; + if (!cxlds->media_ready) { + cxlds->dpa_res = DEFINE_RES_MEM(0, 0); + cxlds->ram_res = DEFINE_RES_MEM(0, 0); + cxlds->pmem_res = DEFINE_RES_MEM(0, 0); + return 0; + } + cxlds->dpa_res = (struct resource)DEFINE_RES_MEM(0, cxlds->total_bytes); @@ -1118,12 +1125,10 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds) cxlds->persistent_only_bytes, "pmem"); } - if (cxlds->media_ready) { - rc = cxl_mem_get_partition_info(cxlds); - if (rc) { - dev_err(dev, "Failed to query partition information\n"); - return rc; - } + rc = cxl_mem_get_partition_info(cxlds); + if (rc) { + dev_err(dev, "Failed to query partition information\n"); + return rc; } rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, From 45c2f36871955b51b4ce083c447388d8c72d6b91 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 May 2023 11:18:21 +0200 Subject: [PATCH 191/271] btrfs: call btrfs_orig_bbio_end_io in btrfs_end_bio_work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When I implemented the storage layer bio splitting, I was under the assumption that we'll never split metadata bios. But Qu reminded me that this can actually happen with very old file systems with unaligned metadata chunks and RAID0. I still haven't seen such a case in practice, but we better handled this case, especially as it is fairly easily to do not calling the ->end_іo method directly in btrfs_end_io_work, and using the proper btrfs_orig_bbio_end_io helper instead. In addition to the old file system with unaligned metadata chunks case documented in the commit log, the combination of the new scrub code with Johannes pending raid-stripe-tree also triggers this case. We spent some time debugging it and found that this patch solves the problem. Fixes: 103c19723c80 ("btrfs: split the bio submission path into a separate file") CC: stable@vger.kernel.org # 6.3+ Reviewed-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 5379c4714905..35d34c731d72 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -330,7 +330,7 @@ static void btrfs_end_bio_work(struct work_struct *work) if (bbio->inode && !(bbio->bio.bi_opf & REQ_META)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); else - bbio->end_io(bbio); + btrfs_orig_bbio_end_io(bbio); } static void btrfs_simple_end_io(struct bio *bio) From 8fd9f4232d8152c650fd15127f533a0f6d0a4b2b Mon Sep 17 00:00:00 2001 From: Shida Zhang Date: Tue, 16 May 2023 09:34:30 +0800 Subject: [PATCH 192/271] btrfs: fix an uninitialized variable warning in btrfs_log_inode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following warning reported by gcc 10.2.1 under x86_64: ../fs/btrfs/tree-log.c: In function ‘btrfs_log_inode’: ../fs/btrfs/tree-log.c:6211:9: error: ‘last_range_start’ may be used uninitialized in this function [-Werror=maybe-uninitialized] 6211 | ret = insert_dir_log_key(trans, log, path, key.objectid, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6212 | first_dir_index, last_dir_index); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../fs/btrfs/tree-log.c:6161:6: note: ‘last_range_start’ was declared here 6161 | u64 last_range_start; | ^~~~~~~~~~~~~~~~ This might be a false positive fixed in later compiler versions but we want to have it fixed. Reported-by: k2ci Reviewed-by: Anand Jain Signed-off-by: Shida Zhang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9b212e8c70cc..d2755d5e338b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6158,7 +6158,7 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, { struct btrfs_root *log = inode->root->log_root; const struct btrfs_delayed_item *curr; - u64 last_range_start; + u64 last_range_start = 0; u64 last_range_end = 0; struct btrfs_key key; From 5ad9b4719fc9bc4715c7e19875a962095b0577e7 Mon Sep 17 00:00:00 2001 From: pengfuyuan Date: Tue, 23 May 2023 15:09:55 +0800 Subject: [PATCH 193/271] btrfs: fix csum_tree_block page iteration to avoid tripping on -Werror=array-bounds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling on a MIPS 64-bit machine we get these warnings: In file included from ./arch/mips/include/asm/cacheflush.h:13, from ./include/linux/cacheflush.h:5, from ./include/linux/highmem.h:8, from ./include/linux/bvec.h:10, from ./include/linux/blk_types.h:10, from ./include/linux/blkdev.h:9, from fs/btrfs/disk-io.c:7: fs/btrfs/disk-io.c: In function ‘csum_tree_block’: fs/btrfs/disk-io.c:100:34: error: array subscript 1 is above array bounds of ‘struct page *[1]’ [-Werror=array-bounds] 100 | kaddr = page_address(buf->pages[i]); | ~~~~~~~~~~^~~ ./include/linux/mm.h:2135:48: note: in definition of macro ‘page_address’ 2135 | #define page_address(page) lowmem_page_address(page) | ^~~~ cc1: all warnings being treated as errors We can check if i overflows to solve the problem. However, this doesn't make much sense, since i == 1 and num_pages == 1 doesn't execute the body of the loop. In addition, i < num_pages can also ensure that buf->pages[i] will not cross the boundary. Unfortunately, this doesn't help with the problem observed here: gcc still complains. To fix this add a compile-time condition for the extent buffer page array size limit, which would eventually lead to eliminating the whole for loop. CC: stable@vger.kernel.org # 5.10+ Signed-off-by: pengfuyuan Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6de6dcf2743e..2b1b227505f3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -96,7 +96,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, first_page_part - BTRFS_CSUM_SIZE); - for (i = 1; i < num_pages; i++) { + for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) { kaddr = page_address(buf->pages[i]); crypto_shash_update(shash, kaddr, PAGE_SIZE); } From 48b47f0caaa8a9f05ed803cb4f335fa3a7bfc622 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 12 May 2023 17:05:41 +0900 Subject: [PATCH 194/271] ksmbd: fix uninitialized pointer read in ksmbd_vfs_rename() Uninitialized rd.delegated_inode can be used in vfs_rename(). Fix this by setting rd.delegated_inode to NULL to avoid the uninitialized read. Fixes: 74d7970febf7 ("ksmbd: fix racy issue from using ->d_parent and ->d_name") Reported-by: Coverity Scan Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/vfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 778c152708e4..9bdb01c5b201 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -743,6 +743,7 @@ retry: rd.new_dir = new_path.dentry->d_inode, rd.new_dentry = new_dentry, rd.flags = flags, + rd.delegated_inode = NULL, err = vfs_rename(&rd); if (err) ksmbd_debug(VFS, "vfs_rename failed err %d\n", err); From df14afeed2e6c1bbadef7d2f9c46887bbd6d8d94 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 14 May 2023 10:02:27 +0900 Subject: [PATCH 195/271] ksmbd: fix uninitialized pointer read in smb2_create_link() There is a case that file_present is true and path is uninitialized. This patch change file_present is set to false by default and set to true when patch is initialized. Fixes: 74d7970febf7 ("ksmbd: fix racy issue from using ->d_parent and ->d_name") Reported-by: Coverity Scan Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 717bcd20545b..1632b2a1e516 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -5506,7 +5506,7 @@ static int smb2_create_link(struct ksmbd_work *work, { char *link_name = NULL, *target_name = NULL, *pathname = NULL; struct path path; - bool file_present = true; + bool file_present = false; int rc; if (buf_len < (u64)sizeof(struct smb2_file_link_info) + @@ -5539,8 +5539,8 @@ static int smb2_create_link(struct ksmbd_work *work, if (rc) { if (rc != -ENOENT) goto out; - file_present = false; - } + } else + file_present = true; if (file_info->ReplaceIfExists) { if (file_present) { From 84c5aa47925a1f40d698b6a6a2bf67e99617433d Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 12 May 2023 23:29:12 +0900 Subject: [PATCH 196/271] ksmbd: fix credit count leakage This patch fix the failure from smb2.credits.single_req_credits_granted test. When client send 8192 credit request, ksmbd return 8191 credit granted. ksmbd should give maximum possible credits that must be granted within the range of not exceeding the max credit to client. Cc: stable@vger.kernel.org Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 1632b2a1e516..bec885c007ce 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -326,13 +326,9 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) if (hdr->Command == SMB2_NEGOTIATE) aux_max = 1; else - aux_max = conn->vals->max_credits - credit_charge; + aux_max = conn->vals->max_credits - conn->total_credits; credits_granted = min_t(unsigned short, credits_requested, aux_max); - if (conn->vals->max_credits - conn->total_credits < credits_granted) - credits_granted = conn->vals->max_credits - - conn->total_credits; - conn->total_credits += credits_granted; work->credits_granted += credits_granted; From d738950f112c8f40f0515fe967db998e8235a175 Mon Sep 17 00:00:00 2001 From: Kuan-Ting Chen Date: Fri, 19 May 2023 22:59:28 +0900 Subject: [PATCH 197/271] ksmbd: fix slab-out-of-bounds read in smb2_handle_negotiate Check request_buf length first to avoid out-of-bounds read by req->DialectCount. [ 3350.990282] BUG: KASAN: slab-out-of-bounds in smb2_handle_negotiate+0x35d7/0x3e60 [ 3350.990282] Read of size 2 at addr ffff88810ad61346 by task kworker/5:0/276 [ 3351.000406] Workqueue: ksmbd-io handle_ksmbd_work [ 3351.003499] Call Trace: [ 3351.006473] [ 3351.006473] dump_stack_lvl+0x8d/0xe0 [ 3351.006473] print_report+0xcc/0x620 [ 3351.006473] kasan_report+0x92/0xc0 [ 3351.006473] smb2_handle_negotiate+0x35d7/0x3e60 [ 3351.014760] ksmbd_smb_negotiate_common+0x7a7/0xf00 [ 3351.014760] handle_ksmbd_work+0x3f7/0x12d0 [ 3351.014760] process_one_work+0xa85/0x1780 Cc: stable@vger.kernel.org Signed-off-by: Kuan-Ting Chen Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index bec885c007ce..83c668243c95 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1053,16 +1053,16 @@ int smb2_handle_negotiate(struct ksmbd_work *work) return rc; } - if (req->DialectCount == 0) { - pr_err("malformed packet\n"); + smb2_buf_len = get_rfc1002_len(work->request_buf); + smb2_neg_size = offsetof(struct smb2_negotiate_req, Dialects); + if (smb2_neg_size > smb2_buf_len) { rsp->hdr.Status = STATUS_INVALID_PARAMETER; rc = -EINVAL; goto err_out; } - smb2_buf_len = get_rfc1002_len(work->request_buf); - smb2_neg_size = offsetof(struct smb2_negotiate_req, Dialects); - if (smb2_neg_size > smb2_buf_len) { + if (req->DialectCount == 0) { + pr_err("malformed packet\n"); rsp->hdr.Status = STATUS_INVALID_PARAMETER; rc = -EINVAL; goto err_out; From 0512a5f89e1fae74251fde6893ff634f1c96c6fb Mon Sep 17 00:00:00 2001 From: Kuan-Ting Chen Date: Fri, 19 May 2023 23:00:24 +0900 Subject: [PATCH 198/271] ksmbd: fix multiple out-of-bounds read during context decoding Check the remaining data length before accessing the context structure to ensure that the entire structure is contained within the packet. Additionally, since the context data length `ctxt_len` has already been checked against the total packet length `len_of_ctxts`, update the comparison to use `ctxt_len`. Cc: stable@vger.kernel.org Signed-off-by: Kuan-Ting Chen Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 53 ++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 83c668243c95..a1939ff7c742 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -845,13 +845,14 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn, static __le32 decode_preauth_ctxt(struct ksmbd_conn *conn, struct smb2_preauth_neg_context *pneg_ctxt, - int len_of_ctxts) + int ctxt_len) { /* * sizeof(smb2_preauth_neg_context) assumes SMB311_SALT_SIZE Salt, * which may not be present. Only check for used HashAlgorithms[1]. */ - if (len_of_ctxts < MIN_PREAUTH_CTXT_DATA_LEN) + if (ctxt_len < + sizeof(struct smb2_neg_context) + MIN_PREAUTH_CTXT_DATA_LEN) return STATUS_INVALID_PARAMETER; if (pneg_ctxt->HashAlgorithms != SMB2_PREAUTH_INTEGRITY_SHA512) @@ -863,15 +864,23 @@ static __le32 decode_preauth_ctxt(struct ksmbd_conn *conn, static void decode_encrypt_ctxt(struct ksmbd_conn *conn, struct smb2_encryption_neg_context *pneg_ctxt, - int len_of_ctxts) + int ctxt_len) { - int cph_cnt = le16_to_cpu(pneg_ctxt->CipherCount); - int i, cphs_size = cph_cnt * sizeof(__le16); + int cph_cnt; + int i, cphs_size; + + if (sizeof(struct smb2_encryption_neg_context) > ctxt_len) { + pr_err("Invalid SMB2_ENCRYPTION_CAPABILITIES context size\n"); + return; + } conn->cipher_type = 0; + cph_cnt = le16_to_cpu(pneg_ctxt->CipherCount); + cphs_size = cph_cnt * sizeof(__le16); + if (sizeof(struct smb2_encryption_neg_context) + cphs_size > - len_of_ctxts) { + ctxt_len) { pr_err("Invalid cipher count(%d)\n", cph_cnt); return; } @@ -919,15 +928,22 @@ static void decode_compress_ctxt(struct ksmbd_conn *conn, static void decode_sign_cap_ctxt(struct ksmbd_conn *conn, struct smb2_signing_capabilities *pneg_ctxt, - int len_of_ctxts) + int ctxt_len) { - int sign_algo_cnt = le16_to_cpu(pneg_ctxt->SigningAlgorithmCount); - int i, sign_alos_size = sign_algo_cnt * sizeof(__le16); + int sign_algo_cnt; + int i, sign_alos_size; + + if (sizeof(struct smb2_signing_capabilities) > ctxt_len) { + pr_err("Invalid SMB2_SIGNING_CAPABILITIES context length\n"); + return; + } conn->signing_negotiated = false; + sign_algo_cnt = le16_to_cpu(pneg_ctxt->SigningAlgorithmCount); + sign_alos_size = sign_algo_cnt * sizeof(__le16); if (sizeof(struct smb2_signing_capabilities) + sign_alos_size > - len_of_ctxts) { + ctxt_len) { pr_err("Invalid signing algorithm count(%d)\n", sign_algo_cnt); return; } @@ -965,18 +981,16 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn, len_of_ctxts = len_of_smb - offset; while (i++ < neg_ctxt_cnt) { - int clen; - - /* check that offset is not beyond end of SMB */ - if (len_of_ctxts == 0) - break; + int clen, ctxt_len; if (len_of_ctxts < sizeof(struct smb2_neg_context)) break; pctx = (struct smb2_neg_context *)((char *)pctx + offset); clen = le16_to_cpu(pctx->DataLength); - if (clen + sizeof(struct smb2_neg_context) > len_of_ctxts) + ctxt_len = clen + sizeof(struct smb2_neg_context); + + if (ctxt_len > len_of_ctxts) break; if (pctx->ContextType == SMB2_PREAUTH_INTEGRITY_CAPABILITIES) { @@ -987,7 +1001,7 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn, status = decode_preauth_ctxt(conn, (struct smb2_preauth_neg_context *)pctx, - len_of_ctxts); + ctxt_len); if (status != STATUS_SUCCESS) break; } else if (pctx->ContextType == SMB2_ENCRYPTION_CAPABILITIES) { @@ -998,7 +1012,7 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn, decode_encrypt_ctxt(conn, (struct smb2_encryption_neg_context *)pctx, - len_of_ctxts); + ctxt_len); } else if (pctx->ContextType == SMB2_COMPRESSION_CAPABILITIES) { ksmbd_debug(SMB, "deassemble SMB2_COMPRESSION_CAPABILITIES context\n"); @@ -1017,9 +1031,10 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn, } else if (pctx->ContextType == SMB2_SIGNING_CAPABILITIES) { ksmbd_debug(SMB, "deassemble SMB2_SIGNING_CAPABILITIES context\n"); + decode_sign_cap_ctxt(conn, (struct smb2_signing_capabilities *)pctx, - len_of_ctxts); + ctxt_len); } /* offsets must be 8 byte aligned */ From 36322523dddb11107e9f7f528675a0dec2536103 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 19 May 2023 23:09:48 +0900 Subject: [PATCH 199/271] ksmbd: fix UAF issue from opinfo->conn If opinfo->conn is another connection and while ksmbd send oplock break request to cient on current connection, The connection for opinfo->conn can be disconnect and conn could be freed. When sending oplock break request, this ksmbd_conn can be used and cause user-after-free issue. When getting opinfo from the list, ksmbd check connection is being released. If it is not released, Increase ->r_count to wait that connection is freed. Cc: stable@vger.kernel.org Reported-by: Per Forlin Tested-by: Per Forlin Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/oplock.c | 72 +++++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 25 deletions(-) diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 6d1ccb999893..db181bdad73a 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -157,13 +157,42 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci) rcu_read_lock(); opinfo = list_first_or_null_rcu(&ci->m_op_list, struct oplock_info, op_entry); - if (opinfo && !atomic_inc_not_zero(&opinfo->refcount)) - opinfo = NULL; + if (opinfo) { + if (!atomic_inc_not_zero(&opinfo->refcount)) + opinfo = NULL; + else { + atomic_inc(&opinfo->conn->r_count); + if (ksmbd_conn_releasing(opinfo->conn)) { + atomic_dec(&opinfo->conn->r_count); + atomic_dec(&opinfo->refcount); + opinfo = NULL; + } + } + } + rcu_read_unlock(); return opinfo; } +static void opinfo_conn_put(struct oplock_info *opinfo) +{ + struct ksmbd_conn *conn; + + if (!opinfo) + return; + + conn = opinfo->conn; + /* + * Checking waitqueue to dropping pending requests on + * disconnection. waitqueue_active is safe because it + * uses atomic operation for condition. + */ + if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q)) + wake_up(&conn->r_count_q); + opinfo_put(opinfo); +} + void opinfo_put(struct oplock_info *opinfo) { if (!atomic_dec_and_test(&opinfo->refcount)) @@ -666,13 +695,6 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) out: ksmbd_free_work_struct(work); - /* - * Checking waitqueue to dropping pending requests on - * disconnection. waitqueue_active is safe because it - * uses atomic operation for condition. - */ - if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q)) - wake_up(&conn->r_count_q); } /** @@ -706,7 +728,6 @@ static int smb2_oplock_break_noti(struct oplock_info *opinfo) work->conn = conn; work->sess = opinfo->sess; - atomic_inc(&conn->r_count); if (opinfo->op_state == OPLOCK_ACK_WAIT) { INIT_WORK(&work->work, __smb2_oplock_break_noti); ksmbd_queue_work(work); @@ -776,13 +797,6 @@ static void __smb2_lease_break_noti(struct work_struct *wk) out: ksmbd_free_work_struct(work); - /* - * Checking waitqueue to dropping pending requests on - * disconnection. waitqueue_active is safe because it - * uses atomic operation for condition. - */ - if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q)) - wake_up(&conn->r_count_q); } /** @@ -822,7 +836,6 @@ static int smb2_lease_break_noti(struct oplock_info *opinfo) work->conn = conn; work->sess = opinfo->sess; - atomic_inc(&conn->r_count); if (opinfo->op_state == OPLOCK_ACK_WAIT) { list_for_each_safe(tmp, t, &opinfo->interim_list) { struct ksmbd_work *in_work; @@ -1144,8 +1157,10 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid, } prev_opinfo = opinfo_get_list(ci); if (!prev_opinfo || - (prev_opinfo->level == SMB2_OPLOCK_LEVEL_NONE && lctx)) + (prev_opinfo->level == SMB2_OPLOCK_LEVEL_NONE && lctx)) { + opinfo_conn_put(prev_opinfo); goto set_lev; + } prev_op_has_lease = prev_opinfo->is_lease; if (prev_op_has_lease) prev_op_state = prev_opinfo->o_lease->state; @@ -1153,19 +1168,19 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid, if (share_ret < 0 && prev_opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE) { err = share_ret; - opinfo_put(prev_opinfo); + opinfo_conn_put(prev_opinfo); goto err_out; } if (prev_opinfo->level != SMB2_OPLOCK_LEVEL_BATCH && prev_opinfo->level != SMB2_OPLOCK_LEVEL_EXCLUSIVE) { - opinfo_put(prev_opinfo); + opinfo_conn_put(prev_opinfo); goto op_break_not_needed; } list_add(&work->interim_entry, &prev_opinfo->interim_list); err = oplock_break(prev_opinfo, SMB2_OPLOCK_LEVEL_II); - opinfo_put(prev_opinfo); + opinfo_conn_put(prev_opinfo); if (err == -ENOENT) goto set_lev; /* Check all oplock was freed by close */ @@ -1228,14 +1243,14 @@ static void smb_break_all_write_oplock(struct ksmbd_work *work, return; if (brk_opinfo->level != SMB2_OPLOCK_LEVEL_BATCH && brk_opinfo->level != SMB2_OPLOCK_LEVEL_EXCLUSIVE) { - opinfo_put(brk_opinfo); + opinfo_conn_put(brk_opinfo); return; } brk_opinfo->open_trunc = is_trunc; list_add(&work->interim_entry, &brk_opinfo->interim_list); oplock_break(brk_opinfo, SMB2_OPLOCK_LEVEL_II); - opinfo_put(brk_opinfo); + opinfo_conn_put(brk_opinfo); } /** @@ -1263,6 +1278,13 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, list_for_each_entry_rcu(brk_op, &ci->m_op_list, op_entry) { if (!atomic_inc_not_zero(&brk_op->refcount)) continue; + + atomic_inc(&brk_op->conn->r_count); + if (ksmbd_conn_releasing(brk_op->conn)) { + atomic_dec(&brk_op->conn->r_count); + continue; + } + rcu_read_unlock(); if (brk_op->is_lease && (brk_op->o_lease->state & (~(SMB2_LEASE_READ_CACHING_LE | @@ -1292,7 +1314,7 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, brk_op->open_trunc = is_trunc; oplock_break(brk_op, SMB2_OPLOCK_LEVEL_NONE); next: - opinfo_put(brk_op); + opinfo_conn_put(brk_op); rcu_read_lock(); } rcu_read_unlock(); From 6cc2268f5647cbfde3d4fc2e4ee005070ea3a8d2 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 19 May 2023 23:11:33 +0900 Subject: [PATCH 200/271] ksmbd: fix incorrect AllocationSize set in smb2_get_info If filesystem support sparse file, ksmbd should return allocated size using ->i_blocks instead of stat->size. This fix generic/694 xfstests. Cc: stable@vger.kernel.org Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index a1939ff7c742..7a81541de602 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -4369,21 +4369,6 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, return 0; } -static unsigned long long get_allocation_size(struct inode *inode, - struct kstat *stat) -{ - unsigned long long alloc_size = 0; - - if (!S_ISDIR(stat->mode)) { - if ((inode->i_blocks << 9) <= stat->size) - alloc_size = stat->size; - else - alloc_size = inode->i_blocks << 9; - } - - return alloc_size; -} - static void get_file_standard_info(struct smb2_query_info_rsp *rsp, struct ksmbd_file *fp, void *rsp_org) { @@ -4398,7 +4383,7 @@ static void get_file_standard_info(struct smb2_query_info_rsp *rsp, sinfo = (struct smb2_file_standard_info *)rsp->Buffer; delete_pending = ksmbd_inode_pending_delete(fp); - sinfo->AllocationSize = cpu_to_le64(get_allocation_size(inode, &stat)); + sinfo->AllocationSize = cpu_to_le64(inode->i_blocks << 9); sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); sinfo->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending); sinfo->DeletePending = delete_pending; @@ -4463,7 +4448,7 @@ static int get_file_all_info(struct ksmbd_work *work, file_info->Attributes = fp->f_ci->m_fattr; file_info->Pad1 = 0; file_info->AllocationSize = - cpu_to_le64(get_allocation_size(inode, &stat)); + cpu_to_le64(inode->i_blocks << 9); file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); file_info->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending); @@ -4652,7 +4637,7 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, file_info->ChangeTime = cpu_to_le64(time); file_info->Attributes = fp->f_ci->m_fattr; file_info->AllocationSize = - cpu_to_le64(get_allocation_size(inode, &stat)); + cpu_to_le64(inode->i_blocks << 9); file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); file_info->Reserved = cpu_to_le32(0); rsp->OutputBufferLength = From 6fe55c2799bc29624770c26f98ba7b06214f43e0 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 25 May 2023 00:13:38 +0900 Subject: [PATCH 201/271] ksmbd: call putname after using the last component last component point filename struct. Currently putname is called after vfs_path_parent_lookup(). And then last component is used for lookup_one_qstr_excl(). name in last component is freed by previous calling putname(). And It cause file lookup failure when testing generic/464 test of xfstest. Fixes: 74d7970febf7 ("ksmbd: fix racy issue from using ->d_parent and ->d_name") Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/vfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 9bdb01c5b201..6f302919e9f7 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -86,12 +86,14 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, err = vfs_path_parent_lookup(filename, flags, &parent_path, &last, &type, root_share_path); - putname(filename); - if (err) + if (err) { + putname(filename); return err; + } if (unlikely(type != LAST_NORM)) { path_put(&parent_path); + putname(filename); return -ENOENT; } @@ -108,12 +110,14 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, path->dentry = d; path->mnt = share_conf->vfs_path.mnt; path_put(&parent_path); + putname(filename); return 0; err_out: inode_unlock(parent_path.dentry->d_inode); path_put(&parent_path); + putname(filename); return -ENOENT; } From 396ac4c982f6d6cd7c0293a546a0ba5d77fe7f90 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 26 May 2023 15:01:33 +0300 Subject: [PATCH 202/271] smb: delete an unnecessary statement We don't need to set the list iterators to NULL before a list_for_each_entry() loop because they are assigned inside the macro. Signed-off-by: Dan Carpenter Reviewed-by: Mukesh Ojha Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 5065398665f1..6e3be58cfe49 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -618,7 +618,6 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, * Add a new one instead */ spin_lock(&ses->iface_lock); - iface = niface = NULL; list_for_each_entry_safe(iface, niface, &ses->iface_list, iface_head) { ret = iface_cmp(iface, &tmp_iface); From fdd7d1fff4e3b97c4706f4c0e000a38b134b7ae5 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 27 May 2023 03:33:23 -0500 Subject: [PATCH 203/271] cifs: address unused variable warning Fix trivial unused variable warning (when SMB1 support disabled) "ioctl.c:324:17: warning: variable 'caps' set but not used [-Wunused-but-set-variable]" Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202305250056.oZhsJmdD-lkp@intel.com/ Signed-off-by: Steve French --- fs/smb/client/ioctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index cb3be58cd55e..fff092bbc7a3 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -321,7 +321,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) struct tcon_link *tlink; struct cifs_sb_info *cifs_sb; __u64 ExtAttrBits = 0; +#ifdef CONFIG_CIFS_POSIX +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY __u64 caps; +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ +#endif /* CONFIG_CIFS_POSIX */ xid = get_xid(); @@ -331,9 +335,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) if (pSMBFile == NULL) break; tcon = tlink_tcon(pSMBFile->tlink); - caps = le64_to_cpu(tcon->fsUnixInfo.Capability); #ifdef CONFIG_CIFS_POSIX #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); if (CIFS_UNIX_EXTATTR_CAP & caps) { __u64 ExtAttrMask = 0; rc = CIFSGetExtAttr(xid, tcon, From 7877cb91f1081754a1487c144d85dc0d2e2e7fc4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 May 2023 07:49:00 -0400 Subject: [PATCH 204/271] Linux 6.4-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3cd9bc2886d6..836643eaefee 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 4 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* From b3e6bcb94590dea45396b9481e47b809b1be4afa Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 23 May 2023 23:49:48 -0400 Subject: [PATCH 205/271] ext4: add EA_INODE checking to ext4_iget() Add a new flag, EXT4_IGET_EA_INODE which indicates whether the inode is expected to have the EA_INODE flag or not. If the flag is not set/clear as expected, then fail the iget() operation and mark the file system as corrupted. This commit also makes the ext4_iget() always perform the is_bad_inode() check even when the inode is already inode cache. This allows us to remove the is_bad_inode() check from the callers of ext4_iget() in the ea_inode code. Reported-by: syzbot+cbb68193bdb95af4340a@syzkaller.appspotmail.com Reported-by: syzbot+62120febbd1ee3c3c860@syzkaller.appspotmail.com Reported-by: syzbot+edce54daffee36421b4c@syzkaller.appspotmail.com Cc: stable@kernel.org Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230524034951.779531-2-tytso@mit.edu Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 ++- fs/ext4/inode.c | 31 ++++++++++++++++++++++++++----- fs/ext4/xattr.c | 36 +++++++----------------------------- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6948d673bba2..9525c52b78dc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2901,7 +2901,8 @@ typedef enum { EXT4_IGET_NORMAL = 0, EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ - EXT4_IGET_BAD = 0x0004 /* Allow to iget a bad inode */ + EXT4_IGET_BAD = 0x0004, /* Allow to iget a bad inode */ + EXT4_IGET_EA_INODE = 0x0008 /* Inode should contain an EA value */ } ext4_iget_flags; extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ce5f21b6c2b3..258f3cbed347 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4641,6 +4641,21 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) inode_set_iversion_queried(inode, val); } +static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) + +{ + if (flags & EXT4_IGET_EA_INODE) { + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + return "missing EA_INODE flag"; + } else { + if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + return "unexpected EA_INODE flag"; + } + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) + return "unexpected bad inode w/o EXT4_IGET_BAD"; + return NULL; +} + struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) @@ -4650,6 +4665,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; + const char *err_str; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; @@ -4677,8 +4693,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->i_state & I_NEW)) { + if ((err_str = check_igot_inode(inode, flags)) != NULL) { + ext4_error_inode(inode, function, line, 0, err_str); + iput(inode); + return ERR_PTR(-EFSCORRUPTED); + } return inode; + } ei = EXT4_I(inode); iloc.bh = NULL; @@ -4944,10 +4966,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); - if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { - ext4_error_inode(inode, function, line, 0, - "bad inode without EXT4_IGET_BAD flag"); - ret = -EUCLEAN; + if ((err_str = check_igot_inode(inode, flags)) != NULL) { + ext4_error_inode(inode, function, line, 0, err_str); + ret = -EFSCORRUPTED; goto bad_inode; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index dfc2e223bd10..a27208129a80 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -433,7 +433,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, return -EFSCORRUPTED; } - inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); + inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_EA_INODE); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(parent->i_sb, @@ -441,23 +441,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, err); return err; } - - if (is_bad_inode(inode)) { - ext4_error(parent->i_sb, - "error while reading EA inode %lu is_bad_inode", - ea_ino); - err = -EIO; - goto error; - } - - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { - ext4_error(parent->i_sb, - "EA inode %lu does not have EXT4_EA_INODE_FL flag", - ea_ino); - err = -EINVAL; - goto error; - } - ext4_xattr_inode_set_class(inode); /* @@ -478,9 +461,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, *ea_inode = inode; return 0; -error: - iput(inode); - return err; } /* Remove entry from mbcache when EA inode is getting evicted */ @@ -1556,11 +1536,10 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, while (ce) { ea_inode = ext4_iget(inode->i_sb, ce->e_value, - EXT4_IGET_NORMAL); - if (!IS_ERR(ea_inode) && - !is_bad_inode(ea_inode) && - (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) && - i_size_read(ea_inode) == value_len && + EXT4_IGET_EA_INODE); + if (IS_ERR(ea_inode)) + goto next_entry; + if (i_size_read(ea_inode) == value_len && !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, value_len) && @@ -1570,9 +1549,8 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, kvfree(ea_data); return ea_inode; } - - if (!IS_ERR(ea_inode)) - iput(ea_inode); + iput(ea_inode); + next_entry: ce = mb_cache_entry_find_next(ea_inode_cache, ce); } kvfree(ea_data); From e8352cf5778c53b80fdcee086278b2048ddb8f98 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:38 -0400 Subject: [PATCH 206/271] tracing: Move setting of tracing_selftest_running out of register_tracer() The variables tracing_selftest_running and tracing_selftest_disabled are only used for when CONFIG_FTRACE_STARTUP_TEST is enabled. Make them only visible within the selftest code. The setting of those variables are in the register_tracer() call, and set in a location where they do not need to be. Create a wrapper around run_tracer_selftest() called do_run_tracer_selftest() which sets those variables, and have register_tracer() call that instead. Having those variables only set within the CONFIG_FTRACE_STARTUP_TEST scope gets rid of them (and also the ability to remove testing against them) when the startup tests are not enabled (most cases). Link: https://lkml.kernel.org/r/20230528051742.1325503-2-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 81801dc31784..87e5920b141f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2041,6 +2041,17 @@ static int run_tracer_selftest(struct tracer *type) return 0; } +static int do_run_tracer_selftest(struct tracer *type) +{ + int ret; + + tracing_selftest_running = true; + ret = run_tracer_selftest(type); + tracing_selftest_running = false; + + return ret; +} + static __init int init_trace_selftests(void) { struct trace_selftests *p, *n; @@ -2092,6 +2103,10 @@ static inline int run_tracer_selftest(struct tracer *type) { return 0; } +static inline int do_run_tracer_selftest(struct tracer *type) +{ + return 0; +} #endif /* CONFIG_FTRACE_STARTUP_TEST */ static void add_tracer_options(struct trace_array *tr, struct tracer *t); @@ -2127,8 +2142,6 @@ int __init register_tracer(struct tracer *type) mutex_lock(&trace_types_lock); - tracing_selftest_running = true; - for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ @@ -2157,7 +2170,7 @@ int __init register_tracer(struct tracer *type) /* store the tracer for __set_tracer_option */ type->flags->trace = type; - ret = run_tracer_selftest(type); + ret = do_run_tracer_selftest(type); if (ret < 0) goto out; @@ -2166,7 +2179,6 @@ int __init register_tracer(struct tracer *type) add_tracer_options(&global_trace, type); out: - tracing_selftest_running = false; mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) From 9da705d432a07927526005a0688d81fbbf30e349 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:39 -0400 Subject: [PATCH 207/271] tracing: Have tracer selftests call cond_resched() before running As there are more and more internal selftests being added to the Linux kernel (KSAN, lockdep, etc) the selftests are taking longer to run when these are enabled. Add a cond_resched() to the calling of do_run_tracer_selftest() to force a schedule if NEED_RESCHED is set, otherwise the soft lockup watchdog may trigger on boot up. Link: https://lkml.kernel.org/r/20230528051742.1325503-3-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 87e5920b141f..70f2b511b9cd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2045,6 +2045,13 @@ static int do_run_tracer_selftest(struct tracer *type) { int ret; + /* + * Tests can take a long time, especially if they are run one after the + * other, as does happen during bootup when all the tracers are + * registered. This could cause the soft lockup watchdog to trigger. + */ + cond_resched(); + tracing_selftest_running = true; ret = run_tracer_selftest(type); tracing_selftest_running = false; From a3ae76d7ff781208100e6acc58eb09afb7b4b177 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:40 -0400 Subject: [PATCH 208/271] tracing: Make tracing_selftest_running/delete nops when not used There's no reason to test the condition variables tracing_selftest_running or tracing_selftest_delete when tracing selftests are not enabled. Make them define 0s when not the selftests are not configured in. Link: https://lkml.kernel.org/r/20230528051742.1325503-4-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 70f2b511b9cd..004f5f99e943 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -60,6 +60,7 @@ */ bool ring_buffer_expanded; +#ifdef CONFIG_FTRACE_STARTUP_TEST /* * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the @@ -75,7 +76,6 @@ static bool __read_mostly tracing_selftest_running; */ bool __read_mostly tracing_selftest_disabled; -#ifdef CONFIG_FTRACE_STARTUP_TEST void __init disable_tracing_selftest(const char *reason) { if (!tracing_selftest_disabled) { @@ -83,6 +83,9 @@ void __init disable_tracing_selftest(const char *reason) pr_info("Ftrace startup test is disabled due to %s\n", reason); } } +#else +#define tracing_selftest_running 0 +#define tracing_selftest_disabled 0 #endif /* Pipe tracepoints to printk */ From ac9d2cb1d5f8e22235c399338504dadc87d14e74 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:41 -0400 Subject: [PATCH 209/271] tracing: Only make selftest conditionals affect the global_trace The tracing_selftest_running and tracing_selftest_disabled variables were to keep trace_printk() and other writes from affecting the tracing selftests, as the tracing selftests would examine the ring buffer to see if it contained what it expected or not. trace_printk() and friends could add to the ring buffer and cause the selftests to fail (and then disable the tracer that was being tested). To keep that from happening, these variables were added and would keep trace_printk() and friends from writing to the ring buffer while the tests were going on. But this was only the top level ring buffer (owned by the global_trace instance). There is no reason to prevent writing into ring buffers of other instances via the trace_array_printk() and friends. For the functions that could be used by other instances, check if the global_trace is the tracer instance that is being written to before deciding to not allow the write. Link: https://lkml.kernel.org/r/20230528051742.1325503-5-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 004f5f99e943..64a4dde073ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1054,7 +1054,10 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; - if (unlikely(tracing_selftest_running || tracing_disabled)) + if (unlikely(tracing_selftest_running && tr == &global_trace)) + return 0; + + if (unlikely(tracing_disabled)) return 0; alloc = sizeof(*entry) + size + 2; /* possible \n added */ @@ -3512,7 +3515,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, unsigned int trace_ctx; char *tbuffer; - if (tracing_disabled || tracing_selftest_running) + if (tracing_disabled) return 0; /* Don't pollute graph traces with trace_vprintk internals */ @@ -3560,6 +3563,9 @@ __printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { + if (tracing_selftest_running && tr == &global_trace) + return 0; + return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } From a2d910f02231f33118647fc438157ae69c073f89 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sun, 28 May 2023 01:17:42 -0400 Subject: [PATCH 210/271] tracing: Have function_graph selftest call cond_resched() When all kernel debugging is enabled (lockdep, KSAN, etc), the function graph enabling and disabling can take several seconds to complete. The function_graph selftest enables and disables function graph tracing several times. With full debugging enabled, the soft lockup watchdog was triggering because the selftest was running without ever scheduling. Add cond_resched() throughout the test to make sure it does not trigger the soft lockup detector. Link: https://lkml.kernel.org/r/20230528051742.1325503-6-rostedt@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_selftest.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a931d9aaea26..529590499b1f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -848,6 +848,12 @@ trace_selftest_startup_function_graph(struct tracer *trace, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* + * These tests can take some time to run. Make sure on non PREEMPT + * kernels, we do not trigger the softlockup detector. + */ + cond_resched(); + tracing_reset_online_cpus(&tr->array_buffer); set_graph_array(tr); @@ -869,6 +875,8 @@ trace_selftest_startup_function_graph(struct tracer *trace, if (ret) goto out; + cond_resched(); + ret = register_ftrace_graph(&fgraph_ops); if (ret) { warn_failed_init_tracer(trace, ret); @@ -891,6 +899,8 @@ trace_selftest_startup_function_graph(struct tracer *trace, if (ret) goto out; + cond_resched(); + tracing_start(); if (!ret && !count) { From ac2263b588dffd3a1efd7ed0b156ea6c5aea200d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 29 May 2023 06:40:33 -0400 Subject: [PATCH 211/271] Revert "module: error out early on concurrent load of the same module file" This reverts commit 9828ed3f695a138f7add89fa2a186ababceb8006. Sadly, it does seem to cause failures to load modules. Johan Hovold reports: "This change breaks module loading during boot on the Lenovo Thinkpad X13s (aarch64). Specifically it results in indefinite probe deferral of the display and USB (ethernet) which makes it a pain to debug. Typing in the dark to acquire some logs reveals that other modules are missing as well" Since this was applied late as a "let's try this", I'm reverting it asap, and we can try to figure out what goes wrong later. The excessive parallel module loading problem is annoying, but not noticeable in normal situations, and this was only meant as an optimistic workaround for a user-space bug. One possible solution may be to do the optimistic exclusive open first, and then use a lock to serialize loading if that fails. Reported-by: Johan Hovold Link: https://lore.kernel.org/lkml/ZHRpH-JXAxA6DnzR@hovoldconsulting.com/ Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 ---- kernel/module/main.c | 72 ++++++++++++++------------------------------ 2 files changed, 22 insertions(+), 56 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 86b50271b4f7..133f0640fb24 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2566,12 +2566,6 @@ static inline int deny_write_access(struct file *file) struct inode *inode = file_inode(file); return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; } -static inline int exclusive_deny_write_access(struct file *file) -{ - int old = 0; - struct inode *inode = file_inode(file); - return atomic_try_cmpxchg(&inode->i_writecount, &old, -1) ? 0 : -ETXTBSY; -} static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); diff --git a/kernel/module/main.c b/kernel/module/main.c index b4c7e925fdb0..044aa2c9e3cb 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3057,53 +3057,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, return load_module(&info, uargs, 0); } -static int file_init_module(struct file *file, const char __user * uargs, int flags) +SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { struct load_info info = { }; void *buf = NULL; int len; - - len = kernel_read_file(file, 0, &buf, INT_MAX, NULL, - READING_MODULE); - if (len < 0) { - mod_stat_inc(&failed_kreads); - mod_stat_add_long(len, &invalid_kread_bytes); - return len; - } - - if (flags & MODULE_INIT_COMPRESSED_FILE) { - int err = module_decompress(&info, buf, len); - vfree(buf); /* compressed data is no longer needed */ - if (err) { - mod_stat_inc(&failed_decompress); - mod_stat_add_long(len, &invalid_decompress_bytes); - return err; - } - } else { - info.hdr = buf; - info.len = len; - } - - return load_module(&info, uargs, flags); -} - -/* - * kernel_read_file() will already deny write access, but module - * loading wants _exclusive_ access to the file, so we do that - * here, along with basic sanity checks. - */ -static int prepare_file_for_module_load(struct file *file) -{ - if (!file || !(file->f_mode & FMODE_READ)) - return -EBADF; - if (!S_ISREG(file_inode(file)->i_mode)) - return -EINVAL; - return exclusive_deny_write_access(file); -} - -SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) -{ - struct fd f; int err; err = may_init_module(); @@ -3117,14 +3075,28 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_COMPRESSED_FILE)) return -EINVAL; - f = fdget(fd); - err = prepare_file_for_module_load(f.file); - if (!err) { - err = file_init_module(f.file, uargs, flags); - allow_write_access(f.file); + len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL, + READING_MODULE); + if (len < 0) { + mod_stat_inc(&failed_kreads); + mod_stat_add_long(len, &invalid_kread_bytes); + return len; } - fdput(f); - return err; + + if (flags & MODULE_INIT_COMPRESSED_FILE) { + err = module_decompress(&info, buf, len); + vfree(buf); /* compressed data is no longer needed */ + if (err) { + mod_stat_inc(&failed_decompress); + mod_stat_add_long(len, &invalid_decompress_bytes); + return err; + } + } else { + info.hdr = buf; + info.len = len; + } + + return load_module(&info, uargs, flags); } /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ From ffe14de983252862c91ad23bd5ca72fd9398d0e6 Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Thu, 25 May 2023 09:44:44 +0000 Subject: [PATCH 212/271] MAINTAINERS: Update maintainer of Amazon EFA driver Change EFA driver maintainer from Gal Pressman to myself. Keep Gal as a reviewer at his request. Link: https://lore.kernel.org/r/20230525094444.12570-1-mrgolin@amazon.com Signed-off-by: Michael Margolin Acked-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c56043737848..dc8c4708733b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -956,7 +956,8 @@ F: Documentation/networking/device_drivers/ethernet/amazon/ena.rst F: drivers/net/ethernet/amazon/ AMAZON RDMA EFA DRIVER -M: Gal Pressman +M: Michael Margolin +R: Gal Pressman R: Yossi Leybovich L: linux-rdma@vger.kernel.org S: Supported From c8f304d75f6c6cc679a73f89591f9a915da38f09 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 22 May 2023 10:56:53 -0500 Subject: [PATCH 213/271] RDMA/irdma: Prevent QP use after free There is a window where the poll cq may use a QP that has been freed. This can happen if a CQE is polled before irdma_clean_cqes() can clear the CQE's related to the QP and the destroy QP races to free the QP memory. then the QP structures are used in irdma_poll_cq. Fix this by moving the clearing of CQE's before the reference is removed and the QP is destroyed. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Link: https://lore.kernel.org/r/20230522155654.1309-3-shiraz.saleem@intel.com Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/verbs.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index ab5cdf782785..68ce3cd40002 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -522,11 +522,6 @@ static int irdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) if (!iwqp->user_mode) cancel_delayed_work_sync(&iwqp->dwork_flush); - irdma_qp_rem_ref(&iwqp->ibqp); - wait_for_completion(&iwqp->free_qp); - irdma_free_lsmm_rsrc(iwqp); - irdma_cqp_qp_destroy_cmd(&iwdev->rf->sc_dev, &iwqp->sc_qp); - if (!iwqp->user_mode) { if (iwqp->iwscq) { irdma_clean_cqes(iwqp, iwqp->iwscq); @@ -534,6 +529,12 @@ static int irdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) irdma_clean_cqes(iwqp, iwqp->iwrcq); } } + + irdma_qp_rem_ref(&iwqp->ibqp); + wait_for_completion(&iwqp->free_qp); + irdma_free_lsmm_rsrc(iwqp); + irdma_cqp_qp_destroy_cmd(&iwdev->rf->sc_dev, &iwqp->sc_qp); + irdma_remove_push_mmap_entries(iwqp); irdma_free_qp_rsrc(iwqp); From 5842d1d9c1b0d17e0c29eae65ae1f245f83682dd Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 22 May 2023 10:56:54 -0500 Subject: [PATCH 214/271] RDMA/irdma: Fix Local Invalidate fencing If the local invalidate fence is indicated in the WR, only the read fence is currently being set in WQE. Fix this to set both the read and local fence in the WQE. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Link: https://lore.kernel.org/r/20230522155654.1309-4-shiraz.saleem@intel.com Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/verbs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 68ce3cd40002..eaa12c124598 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3292,6 +3292,7 @@ static int irdma_post_send(struct ib_qp *ibqp, break; case IB_WR_LOCAL_INV: info.op_type = IRDMA_OP_TYPE_INV_STAG; + info.local_fence = info.read_fence; info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey; err = irdma_uk_stag_local_invalidate(ukqp, &info, true); break; From 7f875850f20a42f488840c9df7af91ef7db2d576 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 22 May 2023 20:09:57 +0900 Subject: [PATCH 215/271] ata: libata-scsi: Use correct device no in ata_find_dev() For devices not attached to a port multiplier and managed directly by libata, the device number passed to ata_find_dev() must always be lower than the maximum number of devices returned by ata_link_max_devices(). That is 1 for SATA devices or 2 for an IDE link with master+slave devices. This device number is the SCSI device ID which matches these constraints as the IDs are generated per port and so never exceed the maximum number of devices for the link being used. However, for libsas managed devices, SCSI device IDs are assigned per struct scsi_host, leading to device IDs for SATA devices that can be well in excess of libata per-link maximum number of devices. This results in ata_find_dev() to always return NULL for libsas managed devices except for the first device of the target scsi_host with ID (device number) equal to 0. This issue is visible by executing the hdparm utility, which fails. E.g.: hdparm -i /dev/sdX /dev/sdX: HDIO_GET_IDENTITY failed: No message of desired type Fix this by rewriting ata_find_dev() to ignore the device number for non-PMP attached devices with a link with at most 1 device, that is SATA devices. For these, the device number 0 is always used to return the correct pointer to the struct ata_device of the port link. This change excludes IDE master/slave setups (maximum number of devices per link is 2) and port-multiplier attached devices. Also, to be consistant with the fact that SCSI device IDs and channel numbers used as device numbers are both unsigned int, change the devno argument of ata_find_dev() to unsigned int. Reported-by: Xingui Yang Fixes: 41bda9c98035 ("libata-link: update hotplug to handle PMP links") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Jason Yan --- drivers/ata/libata-scsi.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 7bb12deab70c..8ce90284eb34 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -2694,18 +2694,36 @@ static unsigned int atapi_xlat(struct ata_queued_cmd *qc) return 0; } -static struct ata_device *ata_find_dev(struct ata_port *ap, int devno) +static struct ata_device *ata_find_dev(struct ata_port *ap, unsigned int devno) { - if (!sata_pmp_attached(ap)) { - if (likely(devno >= 0 && - devno < ata_link_max_devices(&ap->link))) + /* + * For the non-PMP case, ata_link_max_devices() returns 1 (SATA case), + * or 2 (IDE master + slave case). However, the former case includes + * libsas hosted devices which are numbered per scsi host, leading + * to devno potentially being larger than 0 but with each struct + * ata_device having its own struct ata_port and struct ata_link. + * To accommodate these, ignore devno and always use device number 0. + */ + if (likely(!sata_pmp_attached(ap))) { + int link_max_devices = ata_link_max_devices(&ap->link); + + if (link_max_devices == 1) + return &ap->link.device[0]; + + if (devno < link_max_devices) return &ap->link.device[devno]; - } else { - if (likely(devno >= 0 && - devno < ap->nr_pmp_links)) - return &ap->pmp_link[devno].device[0]; + + return NULL; } + /* + * For PMP-attached devices, the device number corresponds to C + * (channel) of SCSI [H:C:I:L], indicating the port pmp link + * for the device. + */ + if (devno < ap->nr_pmp_links) + return &ap->pmp_link[devno].device[0]; + return NULL; } From 36936a56e1814f6c526fe71fbf980beab4f5577a Mon Sep 17 00:00:00 2001 From: Sebastian Krzyszkowiak Date: Fri, 26 May 2023 16:38:11 +0200 Subject: [PATCH 216/271] net: usb: qmi_wwan: Set DTR quirk for BroadMobi BM818 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BM818 is based on Qualcomm MDM9607 chipset. Fixes: 9a07406b00cd ("net: usb: qmi_wwan: Add the BroadMobi BM818 card") Cc: stable@vger.kernel.org Signed-off-by: Sebastian Krzyszkowiak Acked-by: Bjørn Mork Link: https://lore.kernel.org/r/20230526-bm818-dtr-v1-1-64bbfa6ba8af@puri.sm Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 571e37e67f9c..f1865d047971 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1325,7 +1325,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x2001, 0x7e3d, 4)}, /* D-Link DWM-222 A2 */ {QMI_FIXED_INTF(0x2020, 0x2031, 4)}, /* Olicard 600 */ {QMI_FIXED_INTF(0x2020, 0x2033, 4)}, /* BroadMobi BM806U */ - {QMI_FIXED_INTF(0x2020, 0x2060, 4)}, /* BroadMobi BM818 */ + {QMI_QUIRK_SET_DTR(0x2020, 0x2060, 4)}, /* BroadMobi BM818 */ {QMI_FIXED_INTF(0x0f3d, 0x68a2, 8)}, /* Sierra Wireless MC7700 */ {QMI_FIXED_INTF(0x114f, 0x68a2, 8)}, /* Sierra Wireless MC7750 */ {QMI_FIXED_INTF(0x1199, 0x68a2, 8)}, /* Sierra Wireless MC7710 in QMI mode */ From 0684f29a89e58944a9bfae3a8b5c1a217e44c960 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 26 May 2023 15:06:53 -0700 Subject: [PATCH 217/271] netlink: specs: correct types of legacy arrays ethtool has some attrs which dump multiple scalars into an attribute. The spec currently expects one attr per entry. Fixes: a353318ebf24 ("tools: ynl: populate most of the ethtool spec") Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20230526220653.65538-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 32 ++++++------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 129f413ea349..3abc576ff797 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -60,22 +60,6 @@ attribute-sets: type: nest nested-attributes: bitset-bits - - - name: u64-array - attributes: - - - name: u64 - type: nest - multi-attr: true - nested-attributes: u64 - - - name: s32-array - attributes: - - - name: s32 - type: nest - multi-attr: true - nested-attributes: s32 - name: string attributes: @@ -705,16 +689,16 @@ attribute-sets: type: u8 - name: corrected - type: nest - nested-attributes: u64-array + type: binary + sub-type: u64 - name: uncorr - type: nest - nested-attributes: u64-array + type: binary + sub-type: u64 - name: corr-bits - type: nest - nested-attributes: u64-array + type: binary + sub-type: u64 - name: fec attributes: @@ -827,8 +811,8 @@ attribute-sets: type: u32 - name: index - type: nest - nested-attributes: s32-array + type: binary + sub-type: s32 - name: module attributes: From 6ffc57ea004234d9373c57b204fd10370a69f392 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 26 May 2023 15:43:42 +0000 Subject: [PATCH 218/271] af_packet: do not use READ_ONCE() in packet_bind() A recent patch added READ_ONCE() in packet_bind() and packet_bind_spkt() This is better handled by reading pkt_sk(sk)->num later in packet_do_bind() while appropriate lock is held. READ_ONCE() in writers are often an evidence of something being wrong. Fixes: 822b5a1c17df ("af_packet: Fix data-races of pkt_sk(sk)->num.") Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: Jiri Pirko Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230526154342.2533026-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a1f9a0e9f3c8..a2dbeb264f26 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3201,6 +3201,9 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, lock_sock(sk); spin_lock(&po->bind_lock); + if (!proto) + proto = po->num; + rcu_read_lock(); if (po->fanout) { @@ -3299,7 +3302,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); name[sizeof(uaddr->sa_data_min)] = 0; - return packet_do_bind(sk, name, 0, READ_ONCE(pkt_sk(sk)->num)); + return packet_do_bind(sk, name, 0, 0); } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) @@ -3316,8 +3319,7 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len if (sll->sll_family != AF_PACKET) return -EINVAL; - return packet_do_bind(sk, NULL, sll->sll_ifindex, - sll->sll_protocol ? : READ_ONCE(pkt_sk(sk)->num)); + return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol); } static struct proto packet_proto = { From 4faeee0cf8a5d88d63cdbc3bab124fb0e6aed08c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 26 May 2023 16:34:58 +0000 Subject: [PATCH 219/271] tcp: deny tcp_disconnect() when threads are waiting Historically connect(AF_UNSPEC) has been abused by syzkaller and other fuzzers to trigger various bugs. A recent one triggers a divide-by-zero [1], and Paolo Abeni was able to diagnose the issue. tcp_recvmsg_locked() has tests about sk_state being not TCP_LISTEN and TCP REPAIR mode being not used. Then later if socket lock is released in sk_wait_data(), another thread can call connect(AF_UNSPEC), then make this socket a TCP listener. When recvmsg() is resumed, it can eventually call tcp_cleanup_rbuf() and attempt a divide by 0 in tcp_rcv_space_adjust() [1] This patch adds a new socket field, counting number of threads blocked in sk_wait_event() and inet_wait_for_connect(). If this counter is not zero, tcp_disconnect() returns an error. This patch adds code in blocking socket system calls, thus should not hurt performance of non blocking ones. Note that we probably could revert commit 499350a5a6e7 ("tcp: initialize rcv_mss to TCP_MIN_MSS instead of 0") to restore original tcpi_rcv_mss meaning (was 0 if no payload was ever received on a socket) [1] divide error: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 13832 Comm: syz-executor.5 Not tainted 6.3.0-rc4-syzkaller-00224-g00c7b5f4ddc5 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/02/2023 RIP: 0010:tcp_rcv_space_adjust+0x36e/0x9d0 net/ipv4/tcp_input.c:740 Code: 00 00 00 00 fc ff df 4c 89 64 24 48 8b 44 24 04 44 89 f9 41 81 c7 80 03 00 00 c1 e1 04 44 29 f0 48 63 c9 48 01 e9 48 0f af c1 <49> f7 f6 48 8d 04 41 48 89 44 24 40 48 8b 44 24 30 48 c1 e8 03 48 RSP: 0018:ffffc900033af660 EFLAGS: 00010206 RAX: 4a66b76cbade2c48 RBX: ffff888076640cc0 RCX: 00000000c334e4ac RDX: 0000000000000000 RSI: dffffc0000000000 RDI: 0000000000000001 RBP: 00000000c324e86c R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8880766417f8 R13: ffff888028fbb980 R14: 0000000000000000 R15: 0000000000010344 FS: 00007f5bffbfe700(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b32f25000 CR3: 000000007ced0000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcp_recvmsg_locked+0x100e/0x22e0 net/ipv4/tcp.c:2616 tcp_recvmsg+0x117/0x620 net/ipv4/tcp.c:2681 inet6_recvmsg+0x114/0x640 net/ipv6/af_inet6.c:670 sock_recvmsg_nosec net/socket.c:1017 [inline] sock_recvmsg+0xe2/0x160 net/socket.c:1038 ____sys_recvmsg+0x210/0x5a0 net/socket.c:2720 ___sys_recvmsg+0xf2/0x180 net/socket.c:2762 do_recvmmsg+0x25e/0x6e0 net/socket.c:2856 __sys_recvmmsg net/socket.c:2935 [inline] __do_sys_recvmmsg net/socket.c:2958 [inline] __se_sys_recvmmsg net/socket.c:2951 [inline] __x64_sys_recvmmsg+0x20f/0x260 net/socket.c:2951 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f5c0108c0f9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 f1 19 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f5bffbfe168 EFLAGS: 00000246 ORIG_RAX: 000000000000012b RAX: ffffffffffffffda RBX: 00007f5c011ac050 RCX: 00007f5c0108c0f9 RDX: 0000000000000001 RSI: 0000000020000bc0 RDI: 0000000000000003 RBP: 00007f5c010e7b39 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000122 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f5c012cfb1f R14: 00007f5bffbfe300 R15: 0000000000022000 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Reported-by: Paolo Abeni Diagnosed-by: Paolo Abeni Signed-off-by: Eric Dumazet Tested-by: Paolo Abeni Link: https://lore.kernel.org/r/20230526163458.2880232-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 ++++ net/ipv4/af_inet.c | 2 ++ net/ipv4/inet_connection_sock.c | 1 + net/ipv4/tcp.c | 6 ++++++ 4 files changed, 13 insertions(+) diff --git a/include/net/sock.h b/include/net/sock.h index 656ea89f60ff..b418425d7230 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -336,6 +336,7 @@ struct sk_filter; * @sk_cgrp_data: cgroup data for this cgroup * @sk_memcg: this socket's memory cgroup association * @sk_write_pending: a write to stream socket waits to start + * @sk_wait_pending: number of threads blocked on this socket * @sk_state_change: callback to indicate change in the state of the sock * @sk_data_ready: callback to indicate there is data to be processed * @sk_write_space: callback to indicate there is bf sending space available @@ -428,6 +429,7 @@ struct sock { unsigned int sk_napi_id; #endif int sk_rcvbuf; + int sk_wait_pending; struct sk_filter __rcu *sk_filter; union { @@ -1174,6 +1176,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) #define sk_wait_event(__sk, __timeo, __condition, __wait) \ ({ int __rc; \ + __sk->sk_wait_pending++; \ release_sock(__sk); \ __rc = __condition; \ if (!__rc) { \ @@ -1183,6 +1186,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) } \ sched_annotate_sleep(); \ lock_sock(__sk); \ + __sk->sk_wait_pending--; \ __rc = __condition; \ __rc; \ }) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c4aab3aacbd8..4a76ebf793b8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -586,6 +586,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending += writebias; + sk->sk_wait_pending++; /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. @@ -601,6 +602,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) } remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending -= writebias; + sk->sk_wait_pending--; return timeo; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 65ad4251f6fd..1386787eaf1a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1142,6 +1142,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); + newsk->sk_wait_pending = 0; inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; newicsk->icsk_bind2_hash = NULL; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a60f6f4e7cd9..635607ac37e4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3081,6 +3081,12 @@ int tcp_disconnect(struct sock *sk, int flags) int old_state = sk->sk_state; u32 seq; + /* Deny disconnect if other threads are blocked in sk_wait_event() + * or inet_wait_for_connect(). + */ + if (sk->sk_wait_pending) + return -EBUSY; + if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); From 34dfde4ad87b84d21278a7e19d92b5b2c68e6c4d Mon Sep 17 00:00:00 2001 From: Cambda Zhu Date: Sat, 27 May 2023 12:03:17 +0800 Subject: [PATCH 220/271] tcp: Return user_mss for TCP_MAXSEG in CLOSE/LISTEN state if user_mss set This patch replaces the tp->mss_cache check in getting TCP_MAXSEG with tp->rx_opt.user_mss check for CLOSE/LISTEN sock. Since tp->mss_cache is initialized with TCP_MSS_DEFAULT, checking if it's zero is probably a bug. With this change, getting TCP_MAXSEG before connecting will return default MSS normally, and return user_mss if user_mss is set. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Jack Yang Suggested-by: Eric Dumazet Link: https://lore.kernel.org/netdev/CANn89i+3kL9pYtkxkwxwNMzvC_w3LNUum_2=3u+UyLBmGmifHA@mail.gmail.com/#t Signed-off-by: Cambda Zhu Link: https://lore.kernel.org/netdev/14D45862-36EA-4076-974C-EA67513C92F6@linux.alibaba.com/ Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230527040317.68247-1-cambda@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 635607ac37e4..8d20d9221238 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4078,7 +4078,8 @@ int do_tcp_getsockopt(struct sock *sk, int level, switch (optname) { case TCP_MAXSEG: val = tp->mss_cache; - if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + if (tp->rx_opt.user_mss && + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; if (tp->repair) val = tp->rx_opt.mss_clamp; From 020c69c1a793ed29d28793808eddd75210c858dd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 26 May 2023 12:34:54 +0100 Subject: [PATCH 221/271] rxrpc: Truncate UTS_RELEASE for rxrpc version UTS_RELEASE has a maximum length of 64 which can cause rxrpc_version to exceed the 65 byte message limit. Per the rx spec[1]: "If a server receives a packet with a type value of 13, and the client-initiated flag set, it should respond with a 65-byte payload containing a string that identifies the version of AFS software it is running." The current implementation causes a compile error when WERROR is turned on and/or UTS_RELEASE exceeds the length of 49 (making the version string more than 64 characters). Fix this by generating the string during module initialisation and limiting the UTS_RELEASE segment of the string does not exceed 49 chars. We need to make sure that the 64 bytes includes "linux-" at the front and " AF_RXRPC" at the back as this may be used in pattern matching. Fixes: 44ba06987c0b ("RxRPC: Handle VERSION Rx protocol packets") Reported-by: Kenny Ho Link: https://lore.kernel.org/r/20230523223944.691076-1-Kenny.Ho@amd.com/ Signed-off-by: David Howells Acked-by: Kenny Ho cc: Marc Dionne cc: Andrew Lunn cc: David Laight cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Link: https://web.mit.edu/kolya/afs/rx/rx-spec [1] Reviewed-by: Simon Horman Reviewed-by: Jeffrey Altman Link: https://lore.kernel.org/r/654974.1685100894@warthog.procyon.org.uk Signed-off-by: Paolo Abeni --- net/rxrpc/af_rxrpc.c | 1 + net/rxrpc/ar-internal.h | 1 + net/rxrpc/local_event.c | 11 ++++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 31f738d65f1c..da0b3b5157d5 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -980,6 +980,7 @@ static int __init af_rxrpc_init(void) BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > sizeof_field(struct sk_buff, cb)); ret = -ENOMEM; + rxrpc_gen_version_string(); rxrpc_call_jar = kmem_cache_create( "rxrpc_call_jar", sizeof(struct rxrpc_call), 0, SLAB_HWCACHE_ALIGN, NULL); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 5d44dc08f66d..e8e14c6f904d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1068,6 +1068,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t, /* * local_event.c */ +void rxrpc_gen_version_string(void); void rxrpc_send_version_request(struct rxrpc_local *local, struct rxrpc_host_header *hdr, struct sk_buff *skb); diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 5e69ea6b233d..993c69f97488 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -16,7 +16,16 @@ #include #include "ar-internal.h" -static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC"; +static char rxrpc_version_string[65]; // "linux-" UTS_RELEASE " AF_RXRPC"; + +/* + * Generate the VERSION packet string. + */ +void rxrpc_gen_version_string(void) +{ + snprintf(rxrpc_version_string, sizeof(rxrpc_version_string), + "linux-%.49s AF_RXRPC", UTS_RELEASE); +} /* * Reply to a version request From b24aa141c2ff26c919237aee61ea1818fc6780d9 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 26 May 2023 19:49:00 +0800 Subject: [PATCH 222/271] net/smc: Scan from current RMB list when no position specified When finding the first RMB of link group, it should start from the current RMB list whose index is 0. So fix it. Fixes: b4ba4652b3f8 ("net/smc: extend LLC layer for SMC-Rv2") Signed-off-by: Wen Gu Signed-off-by: Paolo Abeni --- net/smc/smc_llc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index a0840b8c935b..8423e8e0063f 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -578,7 +578,10 @@ static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, { struct smc_buf_desc *buf_next; - if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { + if (!buf_pos) + return _smc_llc_get_next_rmb(lgr, buf_lst); + + if (list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { (*buf_lst)++; return _smc_llc_get_next_rmb(lgr, buf_lst); } From 71c6aa0305e3d2365d3bfd0134b4025d9e7ba388 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 26 May 2023 19:49:01 +0800 Subject: [PATCH 223/271] net/smc: Don't use RMBs not mapped to new link in SMCRv2 ADD LINK We encountered a crash when using SMCRv2. It is caused by a logical error in smc_llc_fill_ext_v2(). BUG: kernel NULL pointer dereference, address: 0000000000000014 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 7 PID: 453 Comm: kworker/7:4 Kdump: loaded Tainted: G W E 6.4.0-rc3+ #44 Workqueue: events smc_llc_add_link_work [smc] RIP: 0010:smc_llc_fill_ext_v2+0x117/0x280 [smc] RSP: 0018:ffffacb5c064bd88 EFLAGS: 00010282 RAX: ffff9a6bc1c3c02c RBX: ffff9a6be3558000 RCX: 0000000000000000 RDX: 0000000000000002 RSI: 0000000000000002 RDI: 000000000000000a RBP: ffffacb5c064bdb8 R08: 0000000000000040 R09: 000000000000000c R10: ffff9a6bc0910300 R11: 0000000000000002 R12: 0000000000000000 R13: 0000000000000002 R14: ffff9a6bc1c3c02c R15: ffff9a6be3558250 FS: 0000000000000000(0000) GS:ffff9a6eefdc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000014 CR3: 000000010b078003 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: smc_llc_send_add_link+0x1ae/0x2f0 [smc] smc_llc_srv_add_link+0x2c9/0x5a0 [smc] ? cc_mkenc+0x40/0x60 smc_llc_add_link_work+0xb8/0x140 [smc] process_one_work+0x1e5/0x3f0 worker_thread+0x4d/0x2f0 ? __pfx_worker_thread+0x10/0x10 kthread+0xe5/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 When an alernate RNIC is available in system, SMC will try to add a new link based on the RNIC for resilience. All the RMBs in use will be mapped to the new link. Then the RMBs' MRs corresponding to the new link will be filled into SMCRv2 LLC ADD LINK messages. However, smc_llc_fill_ext_v2() mistakenly accesses to unused RMBs which haven't been mapped to the new link and have no valid MRs, thus causing a crash. So this patch fixes the logic. Fixes: b4ba4652b3f8 ("net/smc: extend LLC layer for SMC-Rv2") Signed-off-by: Wen Gu Signed-off-by: Paolo Abeni --- net/smc/smc_llc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 8423e8e0063f..7a8d9163d186 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -617,6 +617,8 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, goto out; buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); for (i = 0; i < ext->num_rkeys; i++) { + while (buf_pos && !(buf_pos)->used) + buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); if (!buf_pos) break; rmb = buf_pos; @@ -626,8 +628,6 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, cpu_to_be64((uintptr_t)rmb->cpu_addr) : cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); - while (buf_pos && !(buf_pos)->used) - buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); } len += i * sizeof(ext->rt[0]); out: From 1919b39fc6eabb9a6f9a51706ff6d03865f5df29 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Fri, 26 May 2023 08:38:57 -0700 Subject: [PATCH 224/271] net: mana: Fix perf regression: remove rx_cqes, tx_cqes counters The apc->eth_stats.rx_cqes is one per NIC (vport), and it's on the frequent and parallel code path of all queues. So, r/w into this single shared variable by many threads on different CPUs creates a lot caching and memory overhead, hence perf regression. And, it's not accurate due to the high volume concurrent r/w. For example, a workload is iperf with 128 threads, and with RPS enabled. We saw perf regression of 25% with the previous patch adding the counters. And this patch eliminates the regression. Since the error path of mana_poll_rx_cq() already has warnings, so keeping the counter and convert it to a per-queue variable is not necessary. So, just remove this counter from this high frequency code path. Also, remove the tx_cqes counter for the same reason. We have warnings & other counters for errors on that path, and don't need to count every normal cqe processing. Cc: stable@vger.kernel.org Fixes: bd7fc6e1957c ("net: mana: Add new MANA VF performance counters for easier troubleshooting") Signed-off-by: Haiyang Zhang Reviewed-by: Horatiu Vultur Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/1685115537-31675-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 10 ---------- drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 2 -- include/net/mana/mana.h | 2 -- 3 files changed, 14 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 06d6292e09b3..d907727c7b7a 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1279,8 +1279,6 @@ static void mana_poll_tx_cq(struct mana_cq *cq) if (comp_read < 1) return; - apc->eth_stats.tx_cqes = comp_read; - for (i = 0; i < comp_read; i++) { struct mana_tx_comp_oob *cqe_oob; @@ -1363,8 +1361,6 @@ static void mana_poll_tx_cq(struct mana_cq *cq) WARN_ON_ONCE(1); cq->work_done = pkt_transmitted; - - apc->eth_stats.tx_cqes -= pkt_transmitted; } static void mana_post_pkt_rxq(struct mana_rxq *rxq) @@ -1626,15 +1622,11 @@ static void mana_poll_rx_cq(struct mana_cq *cq) { struct gdma_comp *comp = cq->gdma_comp_buf; struct mana_rxq *rxq = cq->rxq; - struct mana_port_context *apc; int comp_read, i; - apc = netdev_priv(rxq->ndev); - comp_read = mana_gd_poll_cq(cq->gdma_cq, comp, CQE_POLLING_BUFFER); WARN_ON_ONCE(comp_read > CQE_POLLING_BUFFER); - apc->eth_stats.rx_cqes = comp_read; rxq->xdp_flush = false; for (i = 0; i < comp_read; i++) { @@ -1646,8 +1638,6 @@ static void mana_poll_rx_cq(struct mana_cq *cq) return; mana_process_rx_cqe(rxq, cq, &comp[i]); - - apc->eth_stats.rx_cqes--; } if (rxq->xdp_flush) diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c index a64c81410dc1..0dc78679f620 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c @@ -13,11 +13,9 @@ static const struct { } mana_eth_stats[] = { {"stop_queue", offsetof(struct mana_ethtool_stats, stop_queue)}, {"wake_queue", offsetof(struct mana_ethtool_stats, wake_queue)}, - {"tx_cqes", offsetof(struct mana_ethtool_stats, tx_cqes)}, {"tx_cq_err", offsetof(struct mana_ethtool_stats, tx_cqe_err)}, {"tx_cqe_unknown_type", offsetof(struct mana_ethtool_stats, tx_cqe_unknown_type)}, - {"rx_cqes", offsetof(struct mana_ethtool_stats, rx_cqes)}, {"rx_coalesced_err", offsetof(struct mana_ethtool_stats, rx_coalesced_err)}, {"rx_cqe_unknown_type", offsetof(struct mana_ethtool_stats, diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index cd386aa7c7cc..9eef19972845 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -347,10 +347,8 @@ struct mana_tx_qp { struct mana_ethtool_stats { u64 stop_queue; u64 wake_queue; - u64 tx_cqes; u64 tx_cqe_err; u64 tx_cqe_unknown_type; - u64 rx_cqes; u64 rx_coalesced_err; u64 rx_cqe_unknown_type; }; From d328fe87067480cf2bd0b58dab428a98d31dbb7e Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Sun, 28 May 2023 19:35:26 +0200 Subject: [PATCH 225/271] selftests: mptcp: join: avoid using 'cmp --bytes' BusyBox's 'cmp' command doesn't support the '--bytes' parameter. Some CIs -- i.e. LKFT -- use BusyBox and have the mptcp_join.sh test failing [1] because their 'cmp' command doesn't support this '--bytes' option: cmp: unrecognized option '--bytes=1024' BusyBox v1.35.0 () multi-call binary. Usage: cmp [-ls] [-n NUM] FILE1 [FILE2] Instead, 'head --bytes' can be used as this option is supported by BusyBox. A temporary file is needed for this operation. Because it is apparently quite common to use BusyBox, it is certainly better to backport this fix to impacted kernels. Fixes: 6bf41020b72b ("selftests: mptcp: update and extend fastclose test-cases") Cc: stable@vger.kernel.org Link: https://qa-reports.linaro.org/lkft/linux-mainline-master/build/v6.3-rc5-5-g148341f0a2f5/testrun/16088933/suite/kselftest-net-mptcp/test/net_mptcp_userspace_pm_sh/log [1] Suggested-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 26310c17b4c6..4f3fe45f8f71 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -17,6 +17,7 @@ sout="" cin="" cinfail="" cinsent="" +tmpfile="" cout="" capout="" ns1="" @@ -175,6 +176,7 @@ cleanup() { rm -f "$cin" "$cout" "$sinfail" rm -f "$sin" "$sout" "$cinsent" "$cinfail" + rm -f "$tmpfile" rm -rf $evts_ns1 $evts_ns2 cleanup_partial } @@ -383,9 +385,16 @@ check_transfer() fail_test return 1 fi - bytes="--bytes=${bytes}" + + # note: BusyBox's "cmp" command doesn't support --bytes + tmpfile=$(mktemp) + head --bytes="$bytes" "$in" > "$tmpfile" + mv "$tmpfile" "$in" + head --bytes="$bytes" "$out" > "$tmpfile" + mv "$tmpfile" "$out" + tmpfile="" fi - cmp -l "$in" "$out" ${bytes} | while read -r i a b; do + cmp -l "$in" "$out" | while read -r i a b; do local sum=$((0${a} + 0${b})) if [ $check_invert -eq 0 ] || [ $sum -ne $((0xff)) ]; then echo "[ FAIL ] $what does not match (in, out):" From d83013bdf90a7994a474b0e650a7fc94b0d4ded6 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Sun, 28 May 2023 19:35:27 +0200 Subject: [PATCH 226/271] selftests: mptcp: connect: skip if MPTCP is not supported Selftests are supposed to run on any kernels, including the old ones not supporting MPTCP. A new check is then added to make sure MPTCP is supported. If not, the test stops and is marked as "skipped". Note that this check can also mark the test as failed if 'SELFTESTS_MPTCP_LIB_EXPECT_ALL_FEATURES' env var is set to 1: by doing that, we can make sure a test is not being skipped by mistake. A new shared file is added here to be able to re-used the same check in the different selftests we have. Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368 Fixes: 048d19d444be ("mptcp: add basic kselftest for mptcp") Cc: stable@vger.kernel.org Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/Makefile | 2 +- .../selftests/net/mptcp/mptcp_connect.sh | 4 ++ .../testing/selftests/net/mptcp/mptcp_lib.sh | 40 +++++++++++++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/mptcp/mptcp_lib.sh diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index 43a723626126..7b936a926859 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -9,7 +9,7 @@ TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \ TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq -TEST_FILES := settings +TEST_FILES := mptcp_lib.sh settings EXTRA_CLEAN := *.pcap diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index a43d3e2f59bb..c1f7bac19942 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -1,6 +1,8 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +. "$(dirname "${0}")/mptcp_lib.sh" + time_start=$(date +%s) optstring="S:R:d:e:l:r:h4cm:f:tC" @@ -141,6 +143,8 @@ cleanup() done } +mptcp_lib_check_mptcp + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh new file mode 100644 index 000000000000..3286536b79d5 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -0,0 +1,40 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 + +readonly KSFT_FAIL=1 +readonly KSFT_SKIP=4 + +# SELFTESTS_MPTCP_LIB_EXPECT_ALL_FEATURES env var can be set when validating all +# features using the last version of the kernel and the selftests to make sure +# a test is not being skipped by mistake. +mptcp_lib_expect_all_features() { + [ "${SELFTESTS_MPTCP_LIB_EXPECT_ALL_FEATURES:-}" = "1" ] +} + +# $1: msg +mptcp_lib_fail_if_expected_feature() { + if mptcp_lib_expect_all_features; then + echo "ERROR: missing feature: ${*}" + exit ${KSFT_FAIL} + fi + + return 1 +} + +# $1: file +mptcp_lib_has_file() { + local f="${1}" + + if [ -f "${f}" ]; then + return 0 + fi + + mptcp_lib_fail_if_expected_feature "${f} file not found" +} + +mptcp_lib_check_mptcp() { + if ! mptcp_lib_has_file "/proc/sys/net/mptcp/enabled"; then + echo "SKIP: MPTCP support is not available" + exit ${KSFT_SKIP} + fi +} From 0f4955a40dafe18a1122e3714d8173e4b018e869 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Sun, 28 May 2023 19:35:28 +0200 Subject: [PATCH 227/271] selftests: mptcp: pm nl: skip if MPTCP is not supported Selftests are supposed to run on any kernels, including the old ones not supporting MPTCP. A new check is then added to make sure MPTCP is supported. If not, the test stops and is marked as "skipped". Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368 Fixes: eedbc685321b ("selftests: add PM netlink functional tests") Cc: stable@vger.kernel.org Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/pm_netlink.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 89839d1ff9d8..32f7533e0919 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -1,6 +1,8 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +. "$(dirname "${0}")/mptcp_lib.sh" + ksft_skip=4 ret=0 @@ -34,6 +36,8 @@ cleanup() ip netns del $ns1 } +mptcp_lib_check_mptcp + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" From 715c78a82e00f848f99ef76e6f6b89216ccba268 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Sun, 28 May 2023 19:35:29 +0200 Subject: [PATCH 228/271] selftests: mptcp: join: skip if MPTCP is not supported Selftests are supposed to run on any kernels, including the old ones not supporting MPTCP. A new check is then added to make sure MPTCP is supported. If not, the test stops and is marked as "skipped". Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368 Fixes: b08fbf241064 ("selftests: add test-cases for MPTCP MP_JOIN") Cc: stable@vger.kernel.org Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 4f3fe45f8f71..96f63172b8fe 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -10,6 +10,8 @@ # because it's invoked by variable name, see how the "tests" array is used #shellcheck disable=SC2317 +. "$(dirname "${0}")/mptcp_lib.sh" + ret=0 sin="" sinfail="" @@ -137,6 +139,8 @@ cleanup_partial() check_tools() { + mptcp_lib_check_mptcp + if ! ip -Version &> /dev/null; then echo "SKIP: Could not run test without ip tool" exit $ksft_skip From 46565acdd29facbf418a11e4a3791b3c8967308d Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Sun, 28 May 2023 19:35:30 +0200 Subject: [PATCH 229/271] selftests: mptcp: diag: skip if MPTCP is not supported Selftests are supposed to run on any kernels, including the old ones not supporting MPTCP. A new check is then added to make sure MPTCP is supported. If not, the test stops and is marked as "skipped". Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368 Fixes: df62f2ec3df6 ("selftests/mptcp: add diag interface tests") Cc: stable@vger.kernel.org Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/diag.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index ef628b16fe9b..4eacdb1ab962 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -1,6 +1,8 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +. "$(dirname "${0}")/mptcp_lib.sh" + sec=$(date +%s) rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) ns="ns1-$rndh" @@ -31,6 +33,8 @@ cleanup() ip netns del $ns } +mptcp_lib_check_mptcp + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" From 9161f21c74a1a0e7bb39eb84ea0c86b23c92fc87 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Sun, 28 May 2023 19:35:31 +0200 Subject: [PATCH 230/271] selftests: mptcp: simult flows: skip if MPTCP is not supported Selftests are supposed to run on any kernels, including the old ones not supporting MPTCP. A new check is then added to make sure MPTCP is supported. If not, the test stops and is marked as "skipped". Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368 Fixes: 1a418cb8e888 ("mptcp: simult flow self-tests") Cc: stable@vger.kernel.org Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/simult_flows.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 9f22f7e5027d..36a3c9d92e20 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -1,6 +1,8 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +. "$(dirname "${0}")/mptcp_lib.sh" + sec=$(date +%s) rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) ns1="ns1-$rndh" @@ -34,6 +36,8 @@ cleanup() done } +mptcp_lib_check_mptcp + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" From cf6f0fda7af7e8e016070bfee6b189e671a0c776 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Sun, 28 May 2023 19:35:32 +0200 Subject: [PATCH 231/271] selftests: mptcp: sockopt: skip if MPTCP is not supported Selftests are supposed to run on any kernels, including the old ones not supporting MPTCP. A new check is then added to make sure MPTCP is supported. If not, the test stops and is marked as "skipped". Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368 Fixes: dc65fe82fb07 ("selftests: mptcp: add packet mark test case") Cc: stable@vger.kernel.org Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/mptcp_sockopt.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index 1b70c0a304ce..ff5adbb9c7f2 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -1,6 +1,8 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +. "$(dirname "${0}")/mptcp_lib.sh" + ret=0 sin="" sout="" @@ -84,6 +86,8 @@ cleanup() rm -f "$sin" "$sout" } +mptcp_lib_check_mptcp + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" From 63212608a92a1ff10ae56dbb14e9fb685f7e4ffa Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Sun, 28 May 2023 19:35:33 +0200 Subject: [PATCH 232/271] selftests: mptcp: userspace pm: skip if MPTCP is not supported Selftests are supposed to run on any kernels, including the old ones not supporting MPTCP. A new check is then added to make sure MPTCP is supported. If not, the test stops and is marked as "skipped". Link: https://github.com/multipath-tcp/mptcp_net-next/issues/368 Fixes: 259a834fadda ("selftests: mptcp: functional tests for the userspace PM type") Cc: stable@vger.kernel.org Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/mptcp/userspace_pm.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index b1eb7bce599d..8092399d911f 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -1,6 +1,10 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +. "$(dirname "${0}")/mptcp_lib.sh" + +mptcp_lib_check_mptcp + ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Cannot not run test without ip tool" From db3e33dd8bd956f165436afdbdbf1c653fb3c8e6 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sun, 28 May 2023 16:00:41 -0700 Subject: [PATCH 233/271] module: fix module load for ia64 Frank reported boot regression in ia64 as: ELILO v3.16 for EFI/IA-64 .. Uncompressing Linux... done Loading file AC100221.initrd.img...done [ 0.000000] Linux version 6.4.0-rc3 (root@x4270) (ia64-linux-gcc (GCC) 12.2.0, GNU ld (GNU Binutils) 2.39) #1 SMP Thu May 25 15:52:20 CEST 2023 [ 0.000000] efi: EFI v1.1 by HP [ 0.000000] efi: SALsystab=0x3ee7a000 ACPI 2.0=0x3fe2a000 ESI=0x3ee7b000 SMBIOS=0x3ee7c000 HCDP=0x3fe28000 [ 0.000000] PCDP: v3 at 0x3fe28000 [ 0.000000] earlycon: uart8250 at MMIO 0x00000000f4050000 (options '9600n8') [ 0.000000] printk: bootconsole [uart8250] enabled [ 0.000000] ACPI: Early table checksum verification disabled [ 0.000000] ACPI: RSDP 0x000000003FE2A000 000028 (v02 HP ) [ 0.000000] ACPI: XSDT 0x000000003FE2A02C 0000CC (v01 HP rx2620 00000000 HP 00000000) [...] [ 3.793350] Run /init as init process Loading, please wait... Starting systemd-udevd version 252.6-1 [ 3.951100] ------------[ cut here ]------------ [ 3.951100] WARNING: CPU: 6 PID: 140 at kernel/module/main.c:1547 __layout_sections+0x370/0x3c0 [ 3.949512] Unable to handle kernel paging request at virtual address 1000000000000000 [ 3.951100] Modules linked in: [ 3.951100] CPU: 6 PID: 140 Comm: (udev-worker) Not tainted 6.4.0-rc3 #1 [ 3.956161] (udev-worker)[142]: Oops 11003706212352 [1] [ 3.951774] Hardware name: hp server rx2620 , BIOS 04.29 11/30/2007 [ 3.951774] [ 3.951774] Call Trace: [ 3.958339] Unable to handle kernel paging request at virtual address 1000000000000000 [ 3.956161] Modules linked in: [ 3.951774] [] show_stack.part.0+0x30/0x60 [ 3.951774] sp=e000000183a67b20 bsp=e000000183a61628 [ 3.956161] [ 3.956161] which bisect to module_memory change [1]. Debug showed that ia64 uses some special sections: __layout_sections: section .got (sh_flags 10000002) matched to MOD_INVALID __layout_sections: section .sdata (sh_flags 10000003) matched to MOD_INVALID __layout_sections: section .sbss (sh_flags 10000003) matched to MOD_INVALID All these sections are loaded to module core memory before [1]. Fix ia64 boot by loading these sections to MOD_DATA (core rw data). [1] commit ac3b43283923 ("module: replace module_layout with module_memory") Fixes: ac3b43283923 ("module: replace module_layout with module_memory") Reported-by: Frank Scheiner Closes: https://lists.debian.org/debian-ia64/2023/05/msg00010.html Closes: https://marc.info/?l=linux-ia64&m=168509859125505 Cc: Linus Torvalds Signed-off-by: Song Liu Tested-by: John Paul Adrian Glaubitz Signed-off-by: Luis Chamberlain --- kernel/module/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/module/main.c b/kernel/module/main.c index 044aa2c9e3cb..4e2cf784cf8c 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1521,14 +1521,14 @@ static void __layout_sections(struct module *mod, struct load_info *info, bool i MOD_RODATA, MOD_RO_AFTER_INIT, MOD_DATA, - MOD_INVALID, /* This is needed to match the masks array */ + MOD_DATA, }; static const int init_m_to_mem_type[] = { MOD_INIT_TEXT, MOD_INIT_RODATA, MOD_INVALID, MOD_INIT_DATA, - MOD_INVALID, /* This is needed to match the masks array */ + MOD_INIT_DATA, }; for (m = 0; m < ARRAY_SIZE(masks); ++m) { From b928dfdcb27d8fa59917b794cfba53052a2f050f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 23 May 2023 23:49:49 -0400 Subject: [PATCH 234/271] ext4: set lockdep subclass for the ea_inode in ext4_xattr_inode_cache_find() If the ea_inode has been pushed out of the inode cache while there is still a reference in the mb_cache, the lockdep subclass will not be set on the inode, which can lead to some lockdep false positives. Fixes: 33d201e0277b ("ext4: fix lockdep warning about recursive inode locking") Cc: stable@kernel.org Reported-by: syzbot+d4b971e744b1f5439336@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230524034951.779531-3-tytso@mit.edu Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index a27208129a80..ff7ab63c5b4f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1539,6 +1539,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, EXT4_IGET_EA_INODE); if (IS_ERR(ea_inode)) goto next_entry; + ext4_xattr_inode_set_class(ea_inode); if (i_size_read(ea_inode) == value_len && !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, From 2bc7e7c1a3bc9bd0cbf0f71006f6fe7ef24a00c2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 23 May 2023 23:49:50 -0400 Subject: [PATCH 235/271] ext4: disallow ea_inodes with extended attributes An ea_inode stores the value of an extended attribute; it can not have extended attributes itself, or this will cause recursive nightmares. Add a check in ext4_iget() to make sure this is the case. Cc: stable@kernel.org Reported-by: syzbot+e44749b6ba4d0434cd47@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230524034951.779531-4-tytso@mit.edu Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 258f3cbed347..02de439bf1f0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4647,6 +4647,9 @@ static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) if (flags & EXT4_IGET_EA_INODE) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return "missing EA_INODE flag"; + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + EXT4_I(inode)->i_file_acl) + return "ea_inode with extended attributes"; } else { if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return "unexpected EA_INODE flag"; From aff3bea95388299eec63440389b4545c8041b357 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 23 May 2023 23:49:51 -0400 Subject: [PATCH 236/271] ext4: add lockdep annotations for i_data_sem for ea_inode's Treat i_data_sem for ea_inodes as being in their own lockdep class to avoid lockdep complaints about ext4_setattr's use of inode_lock() on normal inodes potentially causing lock ordering with i_data_sem on ea_inodes in ext4_xattr_inode_write(). However, ea_inodes will be operated on by ext4_setattr(), so this isn't a problem. Cc: stable@kernel.org Link: https://syzkaller.appspot.com/bug?extid=298c5d8fb4a128bc27b0 Reported-by: syzbot+298c5d8fb4a128bc27b0@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230524034951.779531-5-tytso@mit.edu Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/xattr.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9525c52b78dc..8104a21b001a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -918,11 +918,13 @@ do { \ * where the second inode has larger inode number * than the first * I_DATA_SEM_QUOTA - Used for quota inodes only + * I_DATA_SEM_EA - Used for ea_inodes only */ enum { I_DATA_SEM_NORMAL = 0, I_DATA_SEM_OTHER, I_DATA_SEM_QUOTA, + I_DATA_SEM_EA }; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index ff7ab63c5b4f..13d7f17a9c8c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -121,7 +121,11 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, #ifdef CONFIG_LOCKDEP void ext4_xattr_inode_set_class(struct inode *ea_inode) { + struct ext4_inode_info *ei = EXT4_I(ea_inode); + lockdep_set_subclass(&ea_inode->i_rwsem, 1); + (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ + lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_EA); } #endif From 1077b2d53ef53629c14106aecf633bebd286c04c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 24 May 2023 12:44:53 +0200 Subject: [PATCH 237/271] ext4: fix fsync for non-directories Commit e360c6ed7274 ("ext4: Drop special handling of journalled data from ext4_sync_file()") simplified ext4_sync_file() by dropping special handling of journalled data mode as it was not needed anymore. However that branch was also used for directories and symlinks and since the fastcommit code does not track metadata changes to non-regular files, the change has caused e.g. fsync(2) on directories to not commit transaction as it should. Fix the problem by adding handling for non-regular files. Fixes: e360c6ed7274 ("ext4: Drop special handling of journalled data from ext4_sync_file()") Reported-by: Eric Whitney Link: https://lore.kernel.org/all/ZFqO3xVnmhL7zv1x@debian-BULLSEYE-live-builder-AMD64 Signed-off-by: Jan Kara Tested-by: Eric Whitney Link: https://lore.kernel.org/r/20230524104453.8734-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/fsync.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index f65fdb27ce14..2a143209aa0c 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -108,6 +108,13 @@ static int ext4_fsync_journal(struct inode *inode, bool datasync, journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + /* + * Fastcommit does not really support fsync on directories or other + * special files. Force a full commit. + */ + if (!S_ISREG(inode->i_mode)) + return ext4_force_commit(inode->i_sb); + if (journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(journal, commit_tid)) *needs_barrier = true; From eb1f822c76beeaa76ab8b6737ab9dc9f9798408c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 26 May 2023 23:57:29 -0400 Subject: [PATCH 238/271] ext4: enable the lazy init thread when remounting read/write In commit a44be64bbecb ("ext4: don't clear SB_RDONLY when remounting r/w until quota is re-enabled") we defer clearing tyhe SB_RDONLY flag in struct super. However, we didn't defer when we checked sb_rdonly() to determine the lazy itable init thread should be enabled, with the next result that the lazy inode table initialization would not be properly started. This can cause generic/231 to fail in ext4's nojournal mode. Fix this by moving when we decide to start or stop the lazy itable init thread to after we clear the SB_RDONLY flag when we are remounting the file system read/write. Fixes a44be64bbecb ("ext4: don't clear SB_RDONLY when remounting r/w until...") Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230527035729.1001605-1-tytso@mit.edu Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9680fe753e59..56a5d1c469fc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6588,18 +6588,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } } - /* - * Reinitialize lazy itable initialization thread based on - * current settings - */ - if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) - ext4_unregister_li_request(sb); - else { - ext4_group_t first_not_zeroed; - first_not_zeroed = ext4_has_uninit_itable(sb); - ext4_register_li_request(sb, first_not_zeroed); - } - /* * Handle creation of system zone data early because it can fail. * Releasing of existing data is done when we are sure remount will @@ -6637,6 +6625,18 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) if (enable_rw) sb->s_flags &= ~SB_RDONLY; + /* + * Reinitialize lazy itable initialization thread based on + * current settings + */ + if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) + ext4_unregister_li_request(sb); + else { + ext4_group_t first_not_zeroed; + first_not_zeroed = ext4_has_uninit_itable(sb); + ext4_register_li_request(sb, first_not_zeroed); + } + if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); From c7cfbd115001f94de9e4053657946a383147e803 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Mon, 29 May 2023 12:52:55 -0700 Subject: [PATCH 239/271] net/sched: sch_ingress: Only create under TC_H_INGRESS ingress Qdiscs are only supposed to be created under TC_H_INGRESS. Return -EOPNOTSUPP if 'parent' is not TC_H_INGRESS, similar to mq_init(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+b53a9c0d1ea4ad62da8b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/0000000000006cf87705f79acf1a@google.com/ Tested-by: Pedro Tammela Acked-by: Jamal Hadi Salim Reviewed-by: Jamal Hadi Salim Reviewed-by: Vlad Buslov Signed-off-by: Peilin Ye Signed-off-by: Jakub Kicinski --- net/sched/sch_ingress.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 84838128b9c5..f9ef6deb2770 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -80,6 +80,9 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, struct net_device *dev = qdisc_dev(sch); int err; + if (sch->parent != TC_H_INGRESS) + return -EOPNOTSUPP; + net_inc_ingress_queue(); mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); @@ -101,6 +104,9 @@ static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); + if (sch->parent != TC_H_INGRESS) + return; + tcf_block_put_ext(q->block, sch, &q->block_info); net_dec_ingress_queue(); } From 5eeebfe6c493192b10d516abfd72742900f2a162 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Mon, 29 May 2023 12:53:21 -0700 Subject: [PATCH 240/271] net/sched: sch_clsact: Only create under TC_H_CLSACT clsact Qdiscs are only supposed to be created under TC_H_CLSACT (which equals TC_H_INGRESS). Return -EOPNOTSUPP if 'parent' is not TC_H_CLSACT. Fixes: 1f211a1b929c ("net, sched: add clsact qdisc") Tested-by: Pedro Tammela Acked-by: Jamal Hadi Salim Reviewed-by: Jamal Hadi Salim Reviewed-by: Vlad Buslov Signed-off-by: Peilin Ye Signed-off-by: Jakub Kicinski --- net/sched/sch_ingress.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index f9ef6deb2770..35963929e117 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -225,6 +225,9 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, struct net_device *dev = qdisc_dev(sch); int err; + if (sch->parent != TC_H_CLSACT) + return -EOPNOTSUPP; + net_inc_ingress_queue(); net_inc_egress_queue(); @@ -254,6 +257,9 @@ static void clsact_destroy(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); + if (sch->parent != TC_H_CLSACT) + return; + tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); From f85fa45d4a9408d98c46c8fa45ba2e3b2f4bf219 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Mon, 29 May 2023 12:54:03 -0700 Subject: [PATCH 241/271] net/sched: Reserve TC_H_INGRESS (TC_H_CLSACT) for ingress (clsact) Qdiscs Currently it is possible to add e.g. an HTB Qdisc under ffff:fff1 (TC_H_INGRESS, TC_H_CLSACT): $ ip link add name ifb0 type ifb $ tc qdisc add dev ifb0 parent ffff:fff1 htb $ tc qdisc add dev ifb0 clsact Error: Exclusivity flag on, cannot modify. $ drgn ... >>> ifb0 = netdev_get_by_name(prog, "ifb0") >>> qdisc = ifb0.ingress_queue.qdisc_sleeping >>> print(qdisc.ops.id.string_().decode()) htb >>> qdisc.flags.value_() # TCQ_F_INGRESS 2 Only allow ingress and clsact Qdiscs under ffff:fff1. Return -EINVAL for everything else. Make TCQ_F_INGRESS a static flag of ingress and clsact Qdiscs. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Fixes: 1f211a1b929c ("net, sched: add clsact qdisc") Tested-by: Pedro Tammela Acked-by: Jamal Hadi Salim Reviewed-by: Jamal Hadi Salim Reviewed-by: Vlad Buslov Signed-off-by: Peilin Ye Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 7 ++++++- net/sched/sch_ingress.c | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index fdb8f429333d..383195955b7d 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1252,7 +1252,12 @@ static struct Qdisc *qdisc_create(struct net_device *dev, sch->parent = parent; if (handle == TC_H_INGRESS) { - sch->flags |= TCQ_F_INGRESS; + if (!(sch->flags & TCQ_F_INGRESS)) { + NL_SET_ERR_MSG(extack, + "Specified parent ID is reserved for ingress and clsact Qdiscs"); + err = -EINVAL; + goto err_out3; + } handle = TC_H_MAKE(TC_H_INGRESS, 0); } else { if (handle == 0) { diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 35963929e117..e43a45499372 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -140,7 +140,7 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", .priv_size = sizeof(struct ingress_sched_data), - .static_flags = TCQ_F_CPUSTATS, + .static_flags = TCQ_F_INGRESS | TCQ_F_CPUSTATS, .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, @@ -281,7 +281,7 @@ static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { .cl_ops = &clsact_class_ops, .id = "clsact", .priv_size = sizeof(struct clsact_sched_data), - .static_flags = TCQ_F_CPUSTATS, + .static_flags = TCQ_F_INGRESS | TCQ_F_CPUSTATS, .init = clsact_init, .destroy = clsact_destroy, .dump = ingress_dump, From 9de95df5d15baa956c2b70b9e794842e790a8a13 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Mon, 29 May 2023 12:54:26 -0700 Subject: [PATCH 242/271] net/sched: Prohibit regrafting ingress or clsact Qdiscs Currently, after creating an ingress (or clsact) Qdisc and grafting it under TC_H_INGRESS (TC_H_CLSACT), it is possible to graft it again under e.g. a TBF Qdisc: $ ip link add ifb0 type ifb $ tc qdisc add dev ifb0 handle 1: root tbf rate 20kbit buffer 1600 limit 3000 $ tc qdisc add dev ifb0 clsact $ tc qdisc link dev ifb0 handle ffff: parent 1:1 $ tc qdisc show dev ifb0 qdisc tbf 1: root refcnt 2 rate 20Kbit burst 1600b lat 560.0ms qdisc clsact ffff: parent ffff:fff1 refcnt 2 ^^^^^^^^ clsact's refcount has increased: it is now grafted under both TC_H_CLSACT and 1:1. ingress and clsact Qdiscs should only be used under TC_H_INGRESS (TC_H_CLSACT). Prohibit regrafting them. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Fixes: 1f211a1b929c ("net, sched: add clsact qdisc") Tested-by: Pedro Tammela Acked-by: Jamal Hadi Salim Reviewed-by: Jamal Hadi Salim Reviewed-by: Vlad Buslov Signed-off-by: Peilin Ye Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 383195955b7d..49b9c1bbfdd9 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1596,6 +1596,11 @@ replay: NL_SET_ERR_MSG(extack, "Invalid qdisc name"); return -EINVAL; } + if (q->flags & TCQ_F_INGRESS) { + NL_SET_ERR_MSG(extack, + "Cannot regraft ingress or clsact Qdiscs"); + return -EINVAL; + } if (q == p || (p && check_loop(q, p, 0))) { NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected"); From 36eec020fab668719b541f34d97f44e232ffa165 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Sat, 27 May 2023 17:37:47 +0800 Subject: [PATCH 243/271] net: sched: fix NULL pointer dereference in mq_attach When use the following command to test: 1)ip link add bond0 type bond 2)ip link set bond0 up 3)tc qdisc add dev bond0 root handle ffff: mq 4)tc qdisc replace dev bond0 parent ffff:fff1 handle ffff: mq The kernel reports NULL pointer dereference issue. The stack information is as follows: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 Internal error: Oops: 0000000096000006 [#1] SMP Modules linked in: pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : mq_attach+0x44/0xa0 lr : qdisc_graft+0x20c/0x5cc sp : ffff80000e2236a0 x29: ffff80000e2236a0 x28: ffff0000c0e59d80 x27: ffff0000c0be19c0 x26: ffff0000cae3e800 x25: 0000000000000010 x24: 00000000fffffff1 x23: 0000000000000000 x22: ffff0000cae3e800 x21: ffff0000c9df4000 x20: ffff0000c9df4000 x19: 0000000000000000 x18: ffff80000a934000 x17: ffff8000f5b56000 x16: ffff80000bb08000 x15: 0000000000000000 x14: 0000000000000000 x13: 6b6b6b6b6b6b6b6b x12: 6b6b6b6b00000001 x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 x8 : ffff0000c0be0730 x7 : bbbbbbbbbbbbbbbb x6 : 0000000000000008 x5 : ffff0000cae3e864 x4 : 0000000000000000 x3 : 0000000000000001 x2 : 0000000000000001 x1 : ffff8000090bc23c x0 : 0000000000000000 Call trace: mq_attach+0x44/0xa0 qdisc_graft+0x20c/0x5cc tc_modify_qdisc+0x1c4/0x664 rtnetlink_rcv_msg+0x354/0x440 netlink_rcv_skb+0x64/0x144 rtnetlink_rcv+0x28/0x34 netlink_unicast+0x1e8/0x2a4 netlink_sendmsg+0x308/0x4a0 sock_sendmsg+0x64/0xac ____sys_sendmsg+0x29c/0x358 ___sys_sendmsg+0x90/0xd0 __sys_sendmsg+0x7c/0xd0 __arm64_sys_sendmsg+0x2c/0x38 invoke_syscall+0x54/0x114 el0_svc_common.constprop.1+0x90/0x174 do_el0_svc+0x3c/0xb0 el0_svc+0x24/0xec el0t_64_sync_handler+0x90/0xb4 el0t_64_sync+0x174/0x178 This is because when mq is added for the first time, qdiscs in mq is set to NULL in mq_attach(). Therefore, when replacing mq after adding mq, we need to initialize qdiscs in the mq before continuing to graft. Otherwise, it will couse NULL pointer dereference issue in mq_attach(). And the same issue will occur in the attach functions of mqprio, taprio and htb. ffff:fff1 means that the repalce qdisc is ingress. Ingress does not allow any qdisc to be attached. Therefore, ffff:fff1 is incorrectly used, and the command should be dropped. Fixes: 6ec1c69a8f64 ("net_sched: add classful multiqueue dummy scheduler") Signed-off-by: Zhengchao Shao Tested-by: Peilin Ye Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20230527093747.3583502-1-shaozhengchao@huawei.com Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 49b9c1bbfdd9..014209b1dd58 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1606,6 +1606,10 @@ replay: NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected"); return -ELOOP; } + if (clid == TC_H_INGRESS) { + NL_SET_ERR_MSG(extack, "Ingress cannot graft directly"); + return -EINVAL; + } qdisc_refcount_inc(q); goto graft; } else { From f4e4534850a9d18c250a93f8d7fbb51310828110 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Mon, 29 May 2023 12:33:35 -0300 Subject: [PATCH 244/271] net/netlink: fix NETLINK_LIST_MEMBERSHIPS length report The current code for the length calculation wrongly truncates the reported length of the groups array, causing an under report of the subscribed groups. To fix this, use 'BITS_TO_BYTES()' which rounds up the division by 8. Fixes: b42be38b2778 ("netlink: add API to retrieve all group memberships") Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230529153335.389815-1-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c87804112d0c..3a1e0fd5bf14 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1779,7 +1779,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, break; } } - if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen)) + if (put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen)) err = -EFAULT; netlink_unlock_table(); return err; From 448a5ce1120c5bdbce1f1ccdabcd31c7d029f328 Mon Sep 17 00:00:00 2001 From: Vladislav Efanov Date: Tue, 30 May 2023 14:39:41 +0300 Subject: [PATCH 245/271] udp6: Fix race condition in udp6_sendmsg & connect Syzkaller got the following report: BUG: KASAN: use-after-free in sk_setup_caps+0x621/0x690 net/core/sock.c:2018 Read of size 8 at addr ffff888027f82780 by task syz-executor276/3255 The function sk_setup_caps (called by ip6_sk_dst_store_flow-> ip6_dst_store) referenced already freed memory as this memory was freed by parallel task in udpv6_sendmsg->ip6_sk_dst_lookup_flow-> sk_dst_check. task1 (connect) task2 (udp6_sendmsg) sk_setup_caps->sk_dst_set | | sk_dst_check-> | sk_dst_set | dst_release sk_setup_caps references | to already freed dst_entry| The reason for this race condition is: sk_setup_caps() keeps using the dst after transferring the ownership to the dst cache. Found by Linux Verification Center (linuxtesting.org) with syzkaller. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Vladislav Efanov Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index 5440e67bcfe3..24f2761bdb1d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2381,7 +2381,6 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; - sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk_is_tcp(sk)) sk->sk_route_caps |= NETIF_F_GSO; @@ -2400,6 +2399,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } } sk->sk_gso_max_segs = max_segs; + sk_dst_set(sk, dst); } EXPORT_SYMBOL_GPL(sk_setup_caps); From 6199d23c91ce53bfed455f09a8c5ed170d516824 Mon Sep 17 00:00:00 2001 From: Bastien Nocera Date: Wed, 31 May 2023 10:24:28 +0200 Subject: [PATCH 246/271] HID: logitech-hidpp: Handle timeout differently from busy If an attempt at contacting a receiver or a device fails because the receiver or device never responds, don't restart the communication, only restart it if the receiver or device answers that it's busy, as originally intended. This was the behaviour on communication timeout before commit 586e8fede795 ("HID: logitech-hidpp: Retry commands when device is busy"). This fixes some overly long waits in a critical path on boot, when checking whether the device is connected by getting its HID++ version. Signed-off-by: Bastien Nocera Suggested-by: Mark Lord Fixes: 586e8fede795 ("HID: logitech-hidpp: Retry commands when device is busy") Link: https://bugzilla.kernel.org/show_bug.cgi?id=217412 Signed-off-by: Jiri Kosina --- drivers/hid/hid-logitech-hidpp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c index 0fcfd85fea0f..2246044b1639 100644 --- a/drivers/hid/hid-logitech-hidpp.c +++ b/drivers/hid/hid-logitech-hidpp.c @@ -314,6 +314,7 @@ static int hidpp_send_message_sync(struct hidpp_device *hidpp, dbg_hid("%s:timeout waiting for response\n", __func__); memset(response, 0, sizeof(struct hidpp_report)); ret = -ETIMEDOUT; + goto exit; } if (response->report_id == REPORT_ID_HIDPP_SHORT && From 8fe72b76db79d694858e872370df49676bc3be8c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 May 2023 12:22:09 +0300 Subject: [PATCH 247/271] mailbox: mailbox-test: fix a locking issue in mbox_test_message_write() There was a bug where this code forgot to unlock the tdev->mutex if the kzalloc() failed. Fix this issue, by moving the allocation outside the lock. Fixes: 2d1e952a2b8e ("mailbox: mailbox-test: Fix potential double-free in mbox_test_message_write()") Signed-off-by: Dan Carpenter Reviewed-by: Lee Jones Signed-off-by: Jassi Brar --- drivers/mailbox/mailbox-test.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/mailbox/mailbox-test.c b/drivers/mailbox/mailbox-test.c index c4a705c30331..fc6a12a51b40 100644 --- a/drivers/mailbox/mailbox-test.c +++ b/drivers/mailbox/mailbox-test.c @@ -98,6 +98,7 @@ static ssize_t mbox_test_message_write(struct file *filp, size_t count, loff_t *ppos) { struct mbox_test_device *tdev = filp->private_data; + char *message; void *data; int ret; @@ -113,12 +114,13 @@ static ssize_t mbox_test_message_write(struct file *filp, return -EINVAL; } - mutex_lock(&tdev->mutex); - - tdev->message = kzalloc(MBOX_MAX_MSG_LEN, GFP_KERNEL); - if (!tdev->message) + message = kzalloc(MBOX_MAX_MSG_LEN, GFP_KERNEL); + if (!message) return -ENOMEM; + mutex_lock(&tdev->mutex); + + tdev->message = message; ret = copy_from_user(tdev->message, userbuf, count); if (ret) { ret = -EFAULT; From 1c4c769cdf682c63d3b10cb241f4a96ebad2f215 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 30 May 2023 11:59:34 +0300 Subject: [PATCH 248/271] net/mlx5: Remove rmap also in case dynamic MSIX not supported mlx5 add IRQs to rmap upon MSIX request, and mlx5 remove rmap from MSIX only if msi_map.index is populated. However, msi_map.index is populated only when dynamic MSIX is supported. This results in freeing IRQs without removing them from rmap, which triggers the bellow WARN_ON[1]. rmap is a feature which have no relation to dynamic MSIX. Hence, remove the check of msi_map.index when removing IRQ from rmap. [1] [ 200.307160 ] WARNING: CPU: 20 PID: 1702 at kernel/irq/manage.c:2034 free_irq+0x2ac/0x358 [ 200.316990 ] CPU: 20 PID: 1702 Comm: modprobe Not tainted 6.4.0-rc3_for_upstream_min_debug_2023_05_24_14_02 #1 [ 200.318939 ] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [ 200.321659 ] pc : free_irq+0x2ac/0x358 [ 200.322400 ] lr : free_irq+0x20/0x358 [ 200.337865 ] Call trace: [ 200.338360 ] free_irq+0x2ac/0x358 [ 200.339029 ] irq_release+0x58/0xd0 [mlx5_core] [ 200.340093 ] mlx5_irqs_release_vectors+0x80/0xb0 [mlx5_core] [ 200.341344 ] destroy_comp_eqs+0x120/0x170 [mlx5_core] [ 200.342469 ] mlx5_eq_table_destroy+0x1c/0x38 [mlx5_core] [ 200.343645 ] mlx5_unload+0x8c/0xc8 [mlx5_core] [ 200.344652 ] mlx5_uninit_one+0x78/0x118 [mlx5_core] [ 200.345745 ] remove_one+0x80/0x108 [mlx5_core] [ 200.346752 ] pci_device_remove+0x40/0xd8 [ 200.347554 ] device_remove+0x50/0x88 [ 200.348272 ] device_release_driver_internal+0x1c4/0x228 [ 200.349312 ] driver_detach+0x54/0xa0 [ 200.350030 ] bus_remove_driver+0x74/0x100 [ 200.350833 ] driver_unregister+0x34/0x68 [ 200.351619 ] pci_unregister_driver+0x28/0xa0 [ 200.352476 ] mlx5_cleanup+0x14/0x2210 [mlx5_core] [ 200.353536 ] __arm64_sys_delete_module+0x190/0x2e8 [ 200.354495 ] el0_svc_common.constprop.0+0x6c/0x1d0 [ 200.355455 ] do_el0_svc+0x38/0x98 [ 200.356122 ] el0_svc+0x1c/0x80 [ 200.356739 ] el0t_64_sync_handler+0xb4/0x130 [ 200.357604 ] el0t_64_sync+0x174/0x178 [ 200.358345 ] ---[ end trace 0000000000000000 ]--- Fixes: 3354822cde5a ("net/mlx5: Use dynamic msix vectors allocation") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index db5687d9fec9..86ac4a85fd87 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -141,7 +141,7 @@ static void irq_release(struct mlx5_irq *irq) irq_update_affinity_hint(irq->map.virq, NULL); #ifdef CONFIG_RFS_ACCEL rmap = mlx5_eq_table_get_rmap(pool->dev); - if (rmap && irq->map.index) + if (rmap) irq_cpu_rmap_remove(rmap, irq->map.virq); #endif From 8764bd0fa5d402c51b136f6aeaba20fc16961ba1 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 31 May 2023 10:48:56 +0200 Subject: [PATCH 249/271] net/mlx5: Fix setting of irq->map.index for static IRQ case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When dynamic IRQ allocation is not supported all IRQs are allocated up front in mlx5_irq_table_create() instead of dynamically as part of mlx5_irq_alloc(). In the latter dynamic case irq->map.index is set via the mapping returned by pci_msix_alloc_irq_at(). In the static case and prior to commit 1da438c0ae02 ("net/mlx5: Fix indexing of mlx5_irq") irq->map.index was set in mlx5_irq_alloc() twice once initially to 0 and then to the requested index before storing in the xarray. After this commit it is only set to 0 which breaks all other IRQ mappings. Fix this by setting irq->map.index to the requested index together with irq->map.virq and improve the related comment to make it clearer which cases it deals with. Cc: Chuck Lever III Tested-by: Mark Brown Reviewed-by: Mark Brown Reviewed-by: Simon Horman Reviewed-by: Eli Cohen Fixes: 1da438c0ae02 ("net/mlx5: Fix indexing of mlx5_irq") Signed-off-by: Niklas Schnelle Tested-by: Cédric Le Goater Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 86ac4a85fd87..38edd485ba6f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -232,12 +232,13 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, if (!irq) return ERR_PTR(-ENOMEM); if (!i || !pci_msix_can_alloc_dyn(dev->pdev)) { - /* The vector at index 0 was already allocated. - * Just get the irq number. If dynamic irq is not supported - * vectors have also been allocated. + /* The vector at index 0 is always statically allocated. If + * dynamic irq is not supported all vectors are statically + * allocated. In both cases just get the irq number and set + * the index. */ irq->map.virq = pci_irq_vector(dev->pdev, i); - irq->map.index = 0; + irq->map.index = i; } else { irq->map = pci_msix_alloc_irq_at(dev->pdev, MSI_ANY_INDEX, af_desc); if (!irq->map.virq) { From 368591995d010e639ad8f28b27f1b721f0872342 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 31 May 2023 15:48:25 -0400 Subject: [PATCH 250/271] net/mlx5: Ensure af_desc.mask is properly initialized [ 9.837087] mlx5_core 0000:02:00.0: firmware version: 16.35.2000 [ 9.843126] mlx5_core 0000:02:00.0: 126.016 Gb/s available PCIe bandwidth (8.0 GT/s PCIe x16 link) [ 10.311515] mlx5_core 0000:02:00.0: Rate limit: 127 rates are supported, range: 0Mbps to 97656Mbps [ 10.321948] mlx5_core 0000:02:00.0: E-Switch: Total vports 2, per vport: max uc(128) max mc(2048) [ 10.344324] mlx5_core 0000:02:00.0: mlx5_pcie_event:301:(pid 88): PCIe slot advertised sufficient power (27W). [ 10.354339] BUG: unable to handle page fault for address: ffffffff8ff0ade0 [ 10.361206] #PF: supervisor read access in kernel mode [ 10.366335] #PF: error_code(0x0000) - not-present page [ 10.371467] PGD 81ec39067 P4D 81ec39067 PUD 81ec3a063 PMD 114b07063 PTE 800ffff7e10f5062 [ 10.379544] Oops: 0000 [#1] PREEMPT SMP PTI [ 10.383721] CPU: 0 PID: 117 Comm: kworker/0:6 Not tainted 6.3.0-13028-g7222f123c983 #1 [ 10.391625] Hardware name: Supermicro X10SRA-F/X10SRA-F, BIOS 2.0b 06/12/2017 [ 10.398750] Workqueue: events work_for_cpu_fn [ 10.403108] RIP: 0010:__bitmap_or+0x10/0x26 [ 10.407286] Code: 85 c0 0f 95 c0 c3 cc cc cc cc 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 89 c9 31 c0 48 83 c1 3f 48 c1 e9 06 39 c> [ 10.426024] RSP: 0000:ffffb45a0078f7b0 EFLAGS: 00010097 [ 10.431240] RAX: 0000000000000000 RBX: ffffffff8ff0adc0 RCX: 0000000000000004 [ 10.438365] RDX: ffff9156801967d0 RSI: ffffffff8ff0ade0 RDI: ffff9156801967b0 [ 10.445489] RBP: ffffb45a0078f7e8 R08: 0000000000000030 R09: 0000000000000000 [ 10.452613] R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000000ec [ 10.459737] R13: ffffffff8ff0ade0 R14: 0000000000000001 R15: 0000000000000020 [ 10.466862] FS: 0000000000000000(0000) GS:ffff9165bfc00000(0000) knlGS:0000000000000000 [ 10.474936] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 10.480674] CR2: ffffffff8ff0ade0 CR3: 00000001011ae003 CR4: 00000000003706f0 [ 10.487800] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 10.494922] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 10.502046] Call Trace: [ 10.504493] [ 10.506589] ? matrix_alloc_area.constprop.0+0x43/0x9a [ 10.511729] ? prepare_namespace+0x84/0x174 [ 10.515914] irq_matrix_reserve_managed+0x56/0x10c [ 10.520699] x86_vector_alloc_irqs+0x1d2/0x31e [ 10.525146] irq_domain_alloc_irqs_hierarchy+0x39/0x3f [ 10.530284] irq_domain_alloc_irqs_parent+0x1a/0x2a [ 10.535155] intel_irq_remapping_alloc+0x59/0x5e9 [ 10.539859] ? kmem_cache_debug_flags+0x11/0x26 [ 10.544383] ? __radix_tree_lookup+0x39/0xb9 [ 10.548649] irq_domain_alloc_irqs_hierarchy+0x39/0x3f [ 10.553779] irq_domain_alloc_irqs_parent+0x1a/0x2a [ 10.558650] msi_domain_alloc+0x8c/0x120 [ 10.567697] irq_domain_alloc_irqs_locked+0x11d/0x286 [ 10.572741] __irq_domain_alloc_irqs+0x72/0x93 [ 10.577179] __msi_domain_alloc_irqs+0x193/0x3f1 [ 10.581789] ? __xa_alloc+0xcf/0xe2 [ 10.585273] msi_domain_alloc_irq_at+0xa8/0xfe [ 10.589711] pci_msix_alloc_irq_at+0x47/0x5c The crash is due to matrix_alloc_area() attempting to access per-CPU memory for CPUs that are not present on the system. The CPU mask passed into reserve_managed_vector() via it's @irqd parameter is corrupted because it contains uninitialized stack data. Fixes: bbac70c74183 ("net/mlx5: Use newer affinity descriptor") Reviewed-by: Thomas Gleixner Signed-off-by: Chuck Lever Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 38edd485ba6f..843da89a9035 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -571,11 +571,11 @@ int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs, af_desc.is_managed = false; for (i = 0; i < nirqs; i++) { + cpumask_clear(&af_desc.mask); cpumask_set_cpu(cpus[i], &af_desc.mask); irq = mlx5_irq_request(dev, i + 1, &af_desc, rmap); if (IS_ERR(irq)) break; - cpumask_clear(&af_desc.mask); irqs[i] = irq; } From b6193d7030e3c59f1d4c75648c9c8fa40cad2bcd Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 27 May 2023 23:07:08 -0700 Subject: [PATCH 251/271] net/mlx5e: Fix error handling in mlx5e_refresh_tirs Allocation failure is outside the critical lock section and should return immediately rather than jumping to the unlock section. Also unlock as soon as required and remove the now redundant jump label. Fixes: 80a2a9026b24 ("net/mlx5e: Add a lock on tir list") Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_common.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index 1f90594499c6..41c396e76457 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -150,10 +150,8 @@ int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb, inlen = MLX5_ST_SZ_BYTES(modify_tir_in); in = kvzalloc(inlen, GFP_KERNEL); - if (!in) { - err = -ENOMEM; - goto out; - } + if (!in) + return -ENOMEM; if (enable_uc_lb) lb_flags = MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; @@ -171,14 +169,13 @@ int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb, tirn = tir->tirn; err = mlx5_core_modify_tir(mdev, tirn, in); if (err) - goto out; + break; } + mutex_unlock(&mdev->mlx5e_res.hw_objs.td.list_lock); -out: kvfree(in); if (err) netdev_err(priv->netdev, "refresh tir(0x%x) failed, %d\n", tirn, err); - mutex_unlock(&mdev->mlx5e_res.hw_objs.td.list_lock); return err; } From bbfa4b58997e3d38ba629c9f6fc0bd1c163aaf43 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 28 Apr 2023 13:48:13 +0300 Subject: [PATCH 252/271] net/mlx5: Read embedded cpu after init bit cleared During driver load it reads embedded_cpu bit from initialization segment, but the initialization segment is readable only after initialization bit is cleared. Move the call to mlx5_read_embedded_cpu() right after initialization bit cleared. Signed-off-by: Moshe Shemesh Fixes: 591905ba9679 ("net/mlx5: Introduce Mellanox SmartNIC and modify page management logic") Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 2132a6510639..d6ee016deae1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -923,7 +923,6 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev, } mlx5_pci_vsc_init(dev); - dev->caps.embedded_cpu = mlx5_read_embedded_cpu(dev); return 0; err_clr_master: @@ -1155,6 +1154,7 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot, u64 timeout goto err_cmd_cleanup; } + dev->caps.embedded_cpu = mlx5_read_embedded_cpu(dev); mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_UP); mlx5_start_health_poll(dev); From 622ab656344a288acf4fb03d628c3bb5dd241f34 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 30 May 2023 21:25:27 +0100 Subject: [PATCH 253/271] sfc: fix error unwinds in TC offload Failure ladders weren't exactly unwinding what the function had done up to that point; most seriously, when we encountered an already offloaded rule, the failure path tried to remove the new rule from the hashtable, which would in fact remove the already-present 'old' rule (since it has the same key) from the table, and leak its resources. Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202305200745.xmIlkqjH-lkp@intel.com/ Fixes: d902e1a737d4 ("sfc: bare bones TC offload on EF100") Fixes: 17654d84b47c ("sfc: add offloading of 'foreign' TC (decap) rules") Signed-off-by: Edward Cree Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230530202527.53115-1-edward.cree@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/tc.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/sfc/tc.c b/drivers/net/ethernet/sfc/tc.c index 0327639a628a..c004443c1d58 100644 --- a/drivers/net/ethernet/sfc/tc.c +++ b/drivers/net/ethernet/sfc/tc.c @@ -624,13 +624,12 @@ static int efx_tc_flower_replace_foreign(struct efx_nic *efx, if (!found) { /* We don't care. */ netif_dbg(efx, drv, efx->net_dev, "Ignoring foreign filter that doesn't egdev us\n"); - rc = -EOPNOTSUPP; - goto release; + return -EOPNOTSUPP; } rc = efx_mae_match_check_caps(efx, &match.mask, NULL); if (rc) - goto release; + return rc; if (efx_tc_match_is_encap(&match.mask)) { enum efx_encap_type type; @@ -639,8 +638,7 @@ static int efx_tc_flower_replace_foreign(struct efx_nic *efx, if (type == EFX_ENCAP_TYPE_NONE) { NL_SET_ERR_MSG_MOD(extack, "Egress encap match on unsupported tunnel device"); - rc = -EOPNOTSUPP; - goto release; + return -EOPNOTSUPP; } rc = efx_mae_check_encap_type_supported(efx, type); @@ -648,25 +646,24 @@ static int efx_tc_flower_replace_foreign(struct efx_nic *efx, NL_SET_ERR_MSG_FMT_MOD(extack, "Firmware reports no support for %s encap match", efx_tc_encap_type_name(type)); - goto release; + return rc; } rc = efx_tc_flower_record_encap_match(efx, &match, type, extack); if (rc) - goto release; + return rc; } else { /* This is not a tunnel decap rule, ignore it */ netif_dbg(efx, drv, efx->net_dev, "Ignoring foreign filter without encap match\n"); - rc = -EOPNOTSUPP; - goto release; + return -EOPNOTSUPP; } rule = kzalloc(sizeof(*rule), GFP_USER); if (!rule) { rc = -ENOMEM; - goto release; + goto out_free; } INIT_LIST_HEAD(&rule->acts.list); rule->cookie = tc->cookie; @@ -678,7 +675,7 @@ static int efx_tc_flower_replace_foreign(struct efx_nic *efx, "Ignoring already-offloaded rule (cookie %lx)\n", tc->cookie); rc = -EEXIST; - goto release; + goto out_free; } act = kzalloc(sizeof(*act), GFP_USER); @@ -843,6 +840,7 @@ release: efx_tc_match_action_ht_params); efx_tc_free_action_set_list(efx, &rule->acts, false); } +out_free: kfree(rule); if (match.encap) efx_tc_flower_release_encap_match(efx, match.encap); @@ -899,8 +897,7 @@ static int efx_tc_flower_replace(struct efx_nic *efx, return rc; if (efx_tc_match_is_encap(&match.mask)) { NL_SET_ERR_MSG_MOD(extack, "Ingress enc_key matches not supported"); - rc = -EOPNOTSUPP; - goto release; + return -EOPNOTSUPP; } if (tc->common.chain_index) { @@ -924,9 +921,9 @@ static int efx_tc_flower_replace(struct efx_nic *efx, if (old) { netif_dbg(efx, drv, efx->net_dev, "Already offloaded rule (cookie %lx)\n", tc->cookie); - rc = -EEXIST; NL_SET_ERR_MSG_MOD(extack, "Rule already offloaded"); - goto release; + kfree(rule); + return -EEXIST; } /* Parse actions */ From 4d56304e5827c8cc8cc18c75343d283af7c4825c Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Wed, 31 May 2023 18:28:04 +0800 Subject: [PATCH 254/271] net/sched: flower: fix possible OOB write in fl_set_geneve_opt() If we send two TCA_FLOWER_KEY_ENC_OPTS_GENEVE packets and their total size is 252 bytes(key->enc_opts.len = 252) then key->enc_opts.len = opt->length = data_len / 4 = 0 when the third TCA_FLOWER_KEY_ENC_OPTS_GENEVE packet enters fl_set_geneve_opt. This bypasses the next bounds check and results in an out-of-bounds. Fixes: 0a6e77784f49 ("net/sched: allow flower to match tunnel options") Signed-off-by: Hangyu Hua Reviewed-by: Simon Horman Reviewed-by: Pieter Jansen van Vuuren Link: https://lore.kernel.org/r/20230531102805.27090-1-hbh25y@gmail.com Signed-off-by: Paolo Abeni --- net/sched/cls_flower.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9dbc43388e57..815c3e416bc5 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1153,6 +1153,9 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key, if (option_len > sizeof(struct geneve_opt)) data_len = option_len - sizeof(struct geneve_opt); + if (key->enc_opts.len > FLOW_DIS_TUN_OPTS_MAX - 4) + return -ERANGE; + opt = (struct geneve_opt *)&key->enc_opts.data[key->enc_opts.len]; memset(opt, 0xff, option_len); opt->length = data_len / 4; From 30c6f0bf9579debce27e45fac34fdc97e46acacc Mon Sep 17 00:00:00 2001 From: fuyuanli Date: Wed, 31 May 2023 16:01:50 +0800 Subject: [PATCH 255/271] tcp: fix mishandling when the sack compression is deferred. In this patch, we mainly try to handle sending a compressed ack correctly if it's deferred. Here are more details in the old logic: When sack compression is triggered in the tcp_compressed_ack_kick(), if the sock is owned by user, it will set TCP_DELACK_TIMER_DEFERRED and then defer to the release cb phrase. Later once user releases the sock, tcp_delack_timer_handler() should send a ack as expected, which, however, cannot happen due to lack of ICSK_ACK_TIMER flag. Therefore, the receiver would not sent an ack until the sender's retransmission timeout. It definitely increases unnecessary latency. Fixes: 5d9f4262b7ea ("tcp: add SACK compression") Suggested-by: Eric Dumazet Signed-off-by: fuyuanli Signed-off-by: Jason Xing Link: https://lore.kernel.org/netdev/20230529113804.GA20300@didi-ThinkCentre-M920t-N000/ Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230531080150.GA20424@didi-ThinkCentre-M920t-N000 Signed-off-by: Paolo Abeni --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_timer.c | 16 +++++++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 18a038d16434..5066e4586cf0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -632,6 +632,7 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); void tcp_fin(struct sock *sk); void tcp_check_space(struct sock *sk); +void tcp_sack_compress_send_ack(struct sock *sk); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 61b6710f337a..bf8b22218dd4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4530,7 +4530,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } -static void tcp_sack_compress_send_ack(struct sock *sk) +void tcp_sack_compress_send_ack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b839c2f91292..39eb947fe392 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -290,9 +290,19 @@ static int tcp_write_timeout(struct sock *sk) void tcp_delack_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); - if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || - !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) + return; + + /* Handling the sack compression case */ + if (tp->compressed_ack) { + tcp_mstamp_refresh(tp); + tcp_sack_compress_send_ack(sk); + return; + } + + if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) return; if (time_after(icsk->icsk_ack.timeout, jiffies)) { @@ -312,7 +322,7 @@ void tcp_delack_timer_handler(struct sock *sk) inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } - tcp_mstamp_refresh(tcp_sk(sk)); + tcp_mstamp_refresh(tp); tcp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } From be7f8012a513f5099916ee2da28420156cbb8cf3 Mon Sep 17 00:00:00 2001 From: Bert Karwatzki Date: Wed, 31 May 2023 12:36:19 +0200 Subject: [PATCH 256/271] net: ipa: Use correct value for IPA_STATUS_SIZE IPA_STATUS_SIZE was introduced in commit b8dc7d0eea5a as a replacement for the size of the removed struct ipa_status which had size sizeof(__le32[8]). Use this value as IPA_STATUS_SIZE. Fixes: b8dc7d0eea5a ("net: ipa: stop using sizeof(status)") Signed-off-by: Bert Karwatzki Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230531103618.102608-1-spasswolf@web.de Signed-off-by: Paolo Abeni --- drivers/net/ipa/ipa_endpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 2ee80ed140b7..afa1d56d9095 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -119,7 +119,7 @@ enum ipa_status_field_id { }; /* Size in bytes of an IPA packet status structure */ -#define IPA_STATUS_SIZE sizeof(__le32[4]) +#define IPA_STATUS_SIZE sizeof(__le32[8]) /* IPA status structure decoder; looks up field values for a structure */ static u32 ipa_status_extract(struct ipa *ipa, const void *data, From 4420528254153189c70b6267593e445dc8654e37 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 29 May 2023 12:52:07 -0600 Subject: [PATCH 257/271] firewire: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zero-length and one-element arrays are deprecated, and we are moving towards adopting C99 flexible-array members, instead. Address the following warnings found with GCC-13 and -fstrict-flex-arrays=3 enabled: sound/firewire/amdtp-stream.c: In function ‘build_it_pkt_header’: sound/firewire/amdtp-stream.c:694:17: warning: ‘generate_cip_header’ accessing 8 bytes in a region of size 0 [-Wstringop-overflow=] 694 | generate_cip_header(s, cip_header, data_block_counter, syt); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sound/firewire/amdtp-stream.c:694:17: note: referencing argument 2 of type ‘__be32[2]’ {aka ‘unsigned int[2]’} sound/firewire/amdtp-stream.c:667:13: note: in a call to function ‘generate_cip_header’ 667 | static void generate_cip_header(struct amdtp_stream *s, __be32 cip_header[2], | ^~~~~~~~~~~~~~~~~~~ This helps with the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy() and help us make progress towards globally enabling -fstrict-flex-arrays=3 [1]. Link: https://github.com/KSPP/linux/issues/21 Link: https://github.com/KSPP/linux/issues/303 Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/602902.html [1] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/ZHT0V3SpvHyxCv5W@work Signed-off-by: Takashi Sakamoto --- include/linux/firewire.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 1716c01c4e54..efb6e2cf2034 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -391,7 +391,7 @@ struct fw_iso_packet { u32 tag:2; /* tx: Tag in packet header */ u32 sy:4; /* tx: Sy in packet header */ u32 header_length:8; /* Length of immediate header */ - u32 header[0]; /* tx: Top of 1394 isoch. data_block */ + u32 header[]; /* tx: Top of 1394 isoch. data_block */ }; #define FW_ISO_CONTEXT_TRANSMIT 0 From 3c27f3d53d588618d81d30d6712459a3cc9489b8 Mon Sep 17 00:00:00 2001 From: Andreas Svensson Date: Tue, 30 May 2023 16:52:23 +0200 Subject: [PATCH 258/271] net: dsa: mv88e6xxx: Increase wait after reset deactivation A switch held in reset by default needs to wait longer until we can reliably detect it. An issue was observed when testing on the Marvell 88E6393X (Link Street). The driver failed to detect the switch on some upstarts. Increasing the wait time after reset deactivation solves this issue. The updated wait time is now also the same as the wait time in the mv88e6xxx_hardware_reset function. Fixes: 7b75e49de424 ("net: dsa: mv88e6xxx: wait after reset deactivation") Signed-off-by: Andreas Svensson Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20230530145223.1223993-1-andreas.svensson@axis.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 64a2f2f83735..08a46ffd53af 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -7170,7 +7170,7 @@ static int mv88e6xxx_probe(struct mdio_device *mdiodev) goto out; } if (chip->reset) - usleep_range(1000, 2000); + usleep_range(10000, 20000); /* Detect if the device is configured in single chip addressing mode, * otherwise continue with address specific smi init/detection. From a60caf039e96d806b1ced893242bae82ba3ccf0d Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 29 May 2023 16:38:17 +0900 Subject: [PATCH 259/271] net: renesas: rswitch: Fix return value in error path of xmit Fix return value in the error path of rswitch_start_xmit(). If TX queues are full, this function should return NETDEV_TX_BUSY. Fixes: 3590918b5d07 ("net: ethernet: renesas: Add support for "Ethernet Switch"") Signed-off-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20230529073817.1145208-1-yoshihiro.shimoda.uh@renesas.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rswitch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 29afaddb598d..aace87139cea 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -1485,7 +1485,7 @@ static netdev_tx_t rswitch_start_xmit(struct sk_buff *skb, struct net_device *nd if (rswitch_get_num_cur_queues(gq) >= gq->ring_size - 1) { netif_stop_subqueue(ndev, 0); - return ret; + return NETDEV_TX_BUSY; } if (skb_put_padto(skb, ETH_ZLEN)) From 519d6487640835d19461817c75907e6308074a73 Mon Sep 17 00:00:00 2001 From: Xu Liang Date: Wed, 31 May 2023 15:48:22 +0800 Subject: [PATCH 260/271] net: phy: mxl-gpy: extend interrupt fix to all impacted variants The interrupt fix in commit 97a89ed101bb should be applied on all variants of GPY2xx PHY and GPY115C. Fixes: 97a89ed101bb ("net: phy: mxl-gpy: disable interrupts on GPY215 by default") Signed-off-by: Xu Liang Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230531074822.39136-1-lxu@maxlinear.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mxl-gpy.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c index 6301a9abfb95..ea1073adc5a1 100644 --- a/drivers/net/phy/mxl-gpy.c +++ b/drivers/net/phy/mxl-gpy.c @@ -274,13 +274,6 @@ static int gpy_config_init(struct phy_device *phydev) return ret < 0 ? ret : 0; } -static bool gpy_has_broken_mdint(struct phy_device *phydev) -{ - /* At least these PHYs are known to have broken interrupt handling */ - return phydev->drv->phy_id == PHY_ID_GPY215B || - phydev->drv->phy_id == PHY_ID_GPY215C; -} - static int gpy_probe(struct phy_device *phydev) { struct device *dev = &phydev->mdio.dev; @@ -300,8 +293,7 @@ static int gpy_probe(struct phy_device *phydev) phydev->priv = priv; mutex_init(&priv->mbox_lock); - if (gpy_has_broken_mdint(phydev) && - !device_property_present(dev, "maxlinear,use-broken-interrupts")) + if (!device_property_present(dev, "maxlinear,use-broken-interrupts")) phydev->dev_flags |= PHY_F_NO_IRQ; fw_version = phy_read(phydev, PHY_FWV); @@ -659,11 +651,9 @@ static irqreturn_t gpy_handle_interrupt(struct phy_device *phydev) * frame. Therefore, polling is the best we can do and won't do any more * harm. * It was observed that this bug happens on link state and link speed - * changes on a GPY215B and GYP215C independent of the firmware version - * (which doesn't mean that this list is exhaustive). + * changes independent of the firmware version. */ - if (gpy_has_broken_mdint(phydev) && - (reg & (PHY_IMASK_LSTC | PHY_IMASK_LSPC))) { + if (reg & (PHY_IMASK_LSTC | PHY_IMASK_LSPC)) { reg = gpy_mbox_read(phydev, REG_GPIO0_OUT); if (reg < 0) { phy_error(phydev); From abaf8d51b0cedb16af51fb6b2189370d7515977c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 31 May 2023 08:44:57 -0700 Subject: [PATCH 261/271] ice: recycle/free all of the fragments from multi-buffer frame The ice driver caches next_to_clean value at the beginning of ice_clean_rx_irq() in order to remember the first buffer that has to be freed/recycled after main Rx processing loop. The end boundary is indicated by first descriptor of frame that Rx processing loop has ended its duties. Note that if mentioned loop ended in the middle of gathering multi-buffer frame, next_to_clean would be pointing to the descriptor in the middle of the frame BUT freeing/recycling stage will stop at the first descriptor. This means that next iteration of ice_clean_rx_irq() will miss the (first_desc, next_to_clean - 1) entries. When running various 9K MTU workloads, such splats were observed: [ 540.780716] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 540.787787] #PF: supervisor read access in kernel mode [ 540.793002] #PF: error_code(0x0000) - not-present page [ 540.798218] PGD 0 P4D 0 [ 540.800801] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 540.805231] CPU: 18 PID: 3984 Comm: xskxceiver Tainted: G W 6.3.0-rc7+ #96 [ 540.813619] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019 [ 540.824209] RIP: 0010:ice_clean_rx_irq+0x2b6/0xf00 [ice] [ 540.829678] Code: 74 24 10 e9 aa 00 00 00 8b 55 78 41 31 57 10 41 09 c4 4d 85 ff 0f 84 83 00 00 00 49 8b 57 08 41 8b 4f 1c 65 8b 35 1a fa 4b 3f <48> 8b 02 48 c1 e8 3a 39 c6 0f 85 a2 00 00 00 f6 42 08 02 0f 85 98 [ 540.848717] RSP: 0018:ffffc9000f42fc50 EFLAGS: 00010282 [ 540.854029] RAX: 0000000000000004 RBX: 0000000000000002 RCX: 000000000000fffe [ 540.861272] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 00000000ffffffff [ 540.868519] RBP: ffff88984a05ac00 R08: 0000000000000000 R09: dead000000000100 [ 540.875760] R10: ffff88983fffcd00 R11: 000000000010f2b8 R12: 0000000000000004 [ 540.883008] R13: 0000000000000003 R14: 0000000000000800 R15: ffff889847a10040 [ 540.890253] FS: 00007f6ddf7fe640(0000) GS:ffff88afdf800000(0000) knlGS:0000000000000000 [ 540.898465] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 540.904299] CR2: 0000000000000000 CR3: 000000010d3da001 CR4: 00000000007706e0 [ 540.911542] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 540.918789] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 540.926032] PKRU: 55555554 [ 540.928790] Call Trace: [ 540.931276] [ 540.933418] ice_napi_poll+0x4ca/0x6d0 [ice] [ 540.937804] ? __pfx_ice_napi_poll+0x10/0x10 [ice] [ 540.942716] napi_busy_loop+0xd7/0x320 [ 540.946537] xsk_recvmsg+0x143/0x170 [ 540.950178] sock_recvmsg+0x99/0xa0 [ 540.953729] __sys_recvfrom+0xa8/0x120 [ 540.957543] ? do_futex+0xbd/0x1d0 [ 540.961008] ? __x64_sys_futex+0x73/0x1d0 [ 540.965083] __x64_sys_recvfrom+0x20/0x30 [ 540.969155] do_syscall_64+0x38/0x90 [ 540.972796] entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 540.977934] RIP: 0033:0x7f6de5f27934 To fix this, set cached_ntc to first_desc so that at the end, when freeing/recycling buffers, descriptors from first to ntc are not missed. Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side") Signed-off-by: Maciej Fijalkowski Reviewed-by: Simon Horman Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20230531154457.3216621-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_txrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 059bd911c51d..52d0a126eb61 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1152,11 +1152,11 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) unsigned int total_rx_bytes = 0, total_rx_pkts = 0; unsigned int offset = rx_ring->rx_offset; struct xdp_buff *xdp = &rx_ring->xdp; + u32 cached_ntc = rx_ring->first_desc; struct ice_tx_ring *xdp_ring = NULL; struct bpf_prog *xdp_prog = NULL; u32 ntc = rx_ring->next_to_clean; u32 cnt = rx_ring->count; - u32 cached_ntc = ntc; u32 xdp_xmit = 0; u32 cached_ntu; bool failure; From b0ad3c179059089d809b477a1d445c1183a7b8fe Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 31 May 2023 12:01:42 -0400 Subject: [PATCH 262/271] rtnetlink: call validate_linkmsg in rtnl_create_link validate_linkmsg() was introduced by commit 1840bb13c22f5b ("[RTNL]: Validate hardware and broadcast address attribute for RTM_NEWLINK") to validate tb[IFLA_ADDRESS/BROADCAST] for existing links. The same check should also be done for newly created links. This patch adds validate_linkmsg() call in rtnl_create_link(), to avoid the invalid address set when creating some devices like: # ip link add dummy0 type dummy # ip link add link dummy0 name mac0 address 01:02 type macsec Fixes: 0e06877c6fdb ("[RTNETLINK]: rtnl_link: allow specifying initial device address") Signed-off-by: Xin Long Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 653901a1bf75..824688edb722 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3285,6 +3285,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, struct net_device *dev; unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; + int err; if (tb[IFLA_NUM_TX_QUEUES]) num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); @@ -3320,13 +3321,18 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, if (!dev) return ERR_PTR(-ENOMEM); + err = validate_linkmsg(dev, tb, extack); + if (err < 0) { + free_netdev(dev); + return ERR_PTR(err); + } + dev_net_set(dev, net); dev->rtnl_link_ops = ops; dev->rtnl_link_state = RTNL_LINK_INITIALIZING; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); - int err; err = dev_validate_mtu(dev, mtu, extack); if (err) { From fef5b228dd38378148bc850f7e69a7783f3b95a4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 31 May 2023 12:01:43 -0400 Subject: [PATCH 263/271] rtnetlink: move IFLA_GSO_ tb check to validate_linkmsg These IFLA_GSO_* tb check should also be done for the new created link, otherwise, they can be set to a huge value when creating links: # ip link add dummy1 gso_max_size 4294967295 type dummy # ip -d link show dummy1 dummy addrgenmode eui64 ... gso_max_size 4294967295 Fixes: 46e6b992c250 ("rtnetlink: allow GSO maximums to be set on device creation") Fixes: 9eefedd58ae1 ("net: add gso_ipv4_max_size and gro_ipv4_max_size per device") Signed-off-by: Xin Long Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 824688edb722..bc068a857219 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2385,6 +2385,25 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], if (tb[IFLA_BROADCAST] && nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) return -EINVAL; + + if (tb[IFLA_GSO_MAX_SIZE] && + nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) { + NL_SET_ERR_MSG(extack, "too big gso_max_size"); + return -EINVAL; + } + + if (tb[IFLA_GSO_MAX_SEGS] && + (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS || + nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) { + NL_SET_ERR_MSG(extack, "too big gso_max_segs"); + return -EINVAL; + } + + if (tb[IFLA_GSO_IPV4_MAX_SIZE] && + nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { + NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); + return -EINVAL; + } } if (tb[IFLA_AF_SPEC]) { @@ -2858,11 +2877,6 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); - if (max_size > dev->tso_max_size) { - err = -EINVAL; - goto errout; - } - if (dev->gso_max_size ^ max_size) { netif_set_gso_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; @@ -2872,11 +2886,6 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); - if (max_segs > GSO_MAX_SEGS || max_segs > dev->tso_max_segs) { - err = -EINVAL; - goto errout; - } - if (dev->gso_max_segs ^ max_segs) { netif_set_gso_max_segs(dev, max_segs); status |= DO_SETLINK_MODIFIED; @@ -2895,11 +2904,6 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_IPV4_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]); - if (max_size > dev->tso_max_size) { - err = -EINVAL; - goto errout; - } - if (dev->gso_ipv4_max_size ^ max_size) { netif_set_gso_ipv4_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; From 65d6914e253f3d83b724a9bbfc889ae95711e512 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 31 May 2023 12:01:44 -0400 Subject: [PATCH 264/271] rtnetlink: add the missing IFLA_GRO_ tb check in validate_linkmsg This fixes the issue that dev gro_max_size and gso_ipv4_max_size can be set to a huge value: # ip link add dummy1 type dummy # ip link set dummy1 gro_max_size 4294967295 # ip -d link show dummy1 dummy addrgenmode eui64 ... gro_max_size 4294967295 Fixes: 0fe79f28bfaf ("net: allow gro_max_size to exceed 65536") Fixes: 9eefedd58ae1 ("net: add gso_ipv4_max_size and gro_ipv4_max_size per device") Reported-by: Xiumei Mu Signed-off-by: Xin Long Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bc068a857219..41de3a2f29e1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2399,11 +2399,23 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], return -EINVAL; } + if (tb[IFLA_GRO_MAX_SIZE] && + nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "too big gro_max_size"); + return -EINVAL; + } + if (tb[IFLA_GSO_IPV4_MAX_SIZE] && nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); return -EINVAL; } + + if (tb[IFLA_GRO_IPV4_MAX_SIZE] && + nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size"); + return -EINVAL; + } } if (tb[IFLA_AF_SPEC]) { From 786fc12457268cc9b555dde6c22ae7300d4b40e1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 31 May 2023 12:37:03 -0700 Subject: [PATCH 265/271] mptcp: fix connect timeout handling Ondrej reported a functional issue WRT timeout handling on connect with a nice reproducer. The problem is that the current mptcp connect waits for both the MPTCP socket level timeout, and the first subflow socket timeout. The latter is not influenced/touched by the exposed setsockopt(). Overall the above makes the SO_SNDTIMEO a no-op on connect. Since mptcp_connect is invoked via inet_stream_connect and the latter properly handle the MPTCP level timeout, we can address the issue making the nested subflow level connect always unblocking. This also allow simplifying a bit the code, dropping an ugly hack to handle the fastopen and custom proto_ops connect. The issues predates the blamed commit below, but the current resolution requires the infrastructure introduced there. Fixes: 54f1944ed6d2 ("mptcp: factor out mptcp_connect()") Reported-by: Ondrej Mosnacek Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/399 Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 29 +++++++---------------------- net/mptcp/protocol.h | 1 - 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 08dc53f56bc2..9cafd3b89908 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1702,7 +1702,6 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, lock_sock(ssk); msg->msg_flags |= MSG_DONTWAIT; - msk->connect_flags = O_NONBLOCK; msk->fastopening = 1; ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); msk->fastopening = 0; @@ -3617,9 +3616,9 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * acquired the subflow socket lock, too. */ if (msk->fastopening) - err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1); + err = __inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK, 1); else - err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags); + err = inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK); inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect; /* on successful connect, the msk state will be moved to established by @@ -3632,12 +3631,10 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) mptcp_copy_inaddrs(sk, ssock->sk); - /* unblocking connect, mptcp-level inet_stream_connect will error out - * without changing the socket state, update it here. + /* silence EINPROGRESS and let the caller inet_stream_connect + * handle the connection in progress */ - if (err == -EINPROGRESS) - sk->sk_socket->state = ssock->state; - return err; + return 0; } static struct proto mptcp_prot = { @@ -3696,18 +3693,6 @@ unlock: return err; } -static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) -{ - int ret; - - lock_sock(sock->sk); - mptcp_sk(sock->sk)->connect_flags = flags; - ret = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); - release_sock(sock->sk); - return ret; -} - static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); @@ -3859,7 +3844,7 @@ static const struct proto_ops mptcp_stream_ops = { .owner = THIS_MODULE, .release = inet_release, .bind = mptcp_bind, - .connect = mptcp_stream_connect, + .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet_getname, @@ -3954,7 +3939,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .owner = THIS_MODULE, .release = inet6_release, .bind = mptcp_bind, - .connect = mptcp_stream_connect, + .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet6_getname, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 2d7b2c80a164..de4667dafe59 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -297,7 +297,6 @@ struct mptcp_sock { nodelay:1, fastopening:1, in_accept_queue:1; - int connect_flags; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; From 5b825727d0871b23e8867f6371183e61628b4a26 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 31 May 2023 12:37:04 -0700 Subject: [PATCH 266/271] mptcp: add annotations around msk->subflow accesses The MPTCP can access the first subflow socket in a few spots outside the socket lock scope. That is actually safe, as MPTCP will delete the socket itself only after the msk sock close(). Still the such accesses causes a few KCSAN splats, as reported by Christoph. Silence the harmless warning adding a few annotation around the relevant accesses. Fixes: 71ba088ce0aa ("mptcp: cleanup accept and poll") Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/402 Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 18 ++++++++++-------- net/mptcp/protocol.h | 6 +++++- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9cafd3b89908..ce9de2c946b0 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -91,7 +91,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) return err; msk->first = ssock->sk; - msk->subflow = ssock; + WRITE_ONCE(msk->subflow, ssock); subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); @@ -2282,7 +2282,7 @@ static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk) { if (msk->subflow) { iput(SOCK_INODE(msk->subflow)); - msk->subflow = NULL; + WRITE_ONCE(msk->subflow, NULL); } } @@ -3136,7 +3136,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk = mptcp_sk(nsk); msk->local_key = subflow_req->local_key; msk->token = subflow_req->token; - msk->subflow = NULL; + WRITE_ONCE(msk->subflow, NULL); msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) @@ -3184,7 +3184,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, struct socket *listener; struct sock *newsk; - listener = msk->subflow; + listener = READ_ONCE(msk->subflow); if (WARN_ON_ONCE(!listener)) { *err = -EINVAL; return NULL; @@ -3736,10 +3736,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, pr_debug("msk=%p", msk); - /* buggy applications can call accept on socket states other then LISTEN + /* Buggy applications can call accept on socket states other then LISTEN * but no need to allocate the first subflow just to error out. */ - ssock = msk->subflow; + ssock = READ_ONCE(msk->subflow); if (!ssock) return -EINVAL; @@ -3813,10 +3813,12 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); if (state == TCP_LISTEN) { - if (WARN_ON_ONCE(!msk->subflow || !msk->subflow->sk)) + struct socket *ssock = READ_ONCE(msk->subflow); + + if (WARN_ON_ONCE(!ssock || !ssock->sk)) return 0; - return inet_csk_listen_poll(msk->subflow->sk); + return inet_csk_listen_poll(ssock->sk); } if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index de4667dafe59..7a1a3c35470f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -305,7 +305,11 @@ struct mptcp_sock { struct list_head rtx_queue; struct mptcp_data_frag *first_pending; struct list_head join_list; - struct socket *subflow; /* outgoing connect/listener/!mp_capable */ + struct socket *subflow; /* outgoing connect/listener/!mp_capable + * The mptcp ops can safely dereference, using suitable + * ONCE annotation, the subflow outside the socket + * lock as such sock is freed after close(). + */ struct sock *first; struct mptcp_pm_data pm; struct { From 7e8b88ec35eef363040e08d99536d2bebef83774 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 31 May 2023 12:37:05 -0700 Subject: [PATCH 267/271] mptcp: consolidate passive msk socket initialization When the msk socket is cloned at MPC handshake time, a few fields are initialized in a racy way outside mptcp_sk_clone() and the msk socket lock. The above is due historical reasons: before commit a88d0092b24b ("mptcp: simplify subflow_syn_recv_sock()") as the first subflow socket carrying all the needed date was not available yet at msk creation time We can now refactor the code moving the missing initialization bit under the socket lock, removing the init race and avoiding some code duplication. This will also simplify the next patch, as all msk->first write access are now under the msk socket lock. Fixes: 0397c6d85f9c ("mptcp: keep unaccepted MPC subflow into join list") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 35 ++++++++++++++++++++++++++++------- net/mptcp/protocol.h | 8 ++++---- net/mptcp/subflow.c | 28 +--------------------------- 3 files changed, 33 insertions(+), 38 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ce9de2c946b0..2ecd0117ab1b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3038,7 +3038,7 @@ static void mptcp_close(struct sock *sk, long timeout) sock_put(sk); } -void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) +static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); @@ -3115,9 +3115,10 @@ static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) } #endif -struct sock *mptcp_sk_clone(const struct sock *sk, - const struct mptcp_options_received *mp_opt, - struct request_sock *req) +struct sock *mptcp_sk_clone_init(const struct sock *sk, + const struct mptcp_options_received *mp_opt, + struct sock *ssk, + struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); @@ -3149,10 +3150,30 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; sock_reset_flag(nsk, SOCK_RCU_FREE); - /* will be fully established after successful MPC subflow creation */ - inet_sk_state_store(nsk, TCP_SYN_RECV); - security_inet_csk_clone(nsk, req); + + /* this can't race with mptcp_close(), as the msk is + * not yet exposted to user-space + */ + inet_sk_state_store(nsk, TCP_ESTABLISHED); + + /* The msk maintain a ref to each subflow in the connections list */ + WRITE_ONCE(msk->first, ssk); + list_add(&mptcp_subflow_ctx(ssk)->node, &msk->conn_list); + sock_hold(ssk); + + /* new mpc subflow takes ownership of the newly + * created mptcp socket + */ + mptcp_token_accept(subflow_req, msk); + + /* set msk addresses early to ensure mptcp_pm_get_local_id() + * uses the correct data + */ + mptcp_copy_inaddrs(nsk, ssk); + mptcp_propagate_sndbuf(nsk, ssk); + + mptcp_rcv_space_init(msk, ssk); bh_unlock_sock(nsk); /* note: the newly allocated socket refcount is 2 now */ diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7a1a3c35470f..c5255258bfb3 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -616,7 +616,6 @@ int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); int mptcp_get_pm_type(const struct net *net); -void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); @@ -686,9 +685,10 @@ void __init mptcp_proto_init(void); int __init mptcp_proto_v6_init(void); #endif -struct sock *mptcp_sk_clone(const struct sock *sk, - const struct mptcp_options_received *mp_opt, - struct request_sock *req); +struct sock *mptcp_sk_clone_init(const struct sock *sk, + const struct mptcp_options_received *mp_opt, + struct sock *ssk, + struct request_sock *req); void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ba065b66551a..4688daa6b38b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -815,38 +815,12 @@ create_child: ctx->setsockopt_seq = listener->setsockopt_seq; if (ctx->mp_capable) { - ctx->conn = mptcp_sk_clone(listener->conn, &mp_opt, req); + ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req); if (!ctx->conn) goto fallback; owner = mptcp_sk(ctx->conn); - - /* this can't race with mptcp_close(), as the msk is - * not yet exposted to user-space - */ - inet_sk_state_store(ctx->conn, TCP_ESTABLISHED); - - /* record the newly created socket as the first msk - * subflow, but don't link it yet into conn_list - */ - WRITE_ONCE(owner->first, child); - - /* new mpc subflow takes ownership of the newly - * created mptcp socket - */ - owner->setsockopt_seq = ctx->setsockopt_seq; mptcp_pm_new_connection(owner, child, 1); - mptcp_token_accept(subflow_req, owner); - - /* set msk addresses early to ensure mptcp_pm_get_local_id() - * uses the correct data - */ - mptcp_copy_inaddrs(ctx->conn, child); - mptcp_propagate_sndbuf(ctx->conn, child); - - mptcp_rcv_space_init(owner, child); - list_add(&ctx->node, &owner->conn_list); - sock_hold(child); /* with OoO packets we can reach here without ingress * mpc option From 1b1b43ee7a208096ecd79e626f2fc90d4a321111 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 31 May 2023 12:37:06 -0700 Subject: [PATCH 268/271] mptcp: fix data race around msk->first access The first subflow socket is accessed outside the msk socket lock by mptcp_subflow_fail(), we need to annotate each write access with WRITE_ONCE, but a few spots still lacks it. Fixes: 76a13b315709 ("mptcp: invoke MP_FAIL response when needed") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2ecd0117ab1b..a7dd7d8c9af2 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -90,7 +90,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) if (err) return err; - msk->first = ssock->sk; + WRITE_ONCE(msk->first, ssock->sk); WRITE_ONCE(msk->subflow, ssock); subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); @@ -2419,7 +2419,7 @@ out_release: sock_put(ssk); if (ssk == msk->first) - msk->first = NULL; + WRITE_ONCE(msk->first, NULL); out: if (ssk == msk->last_snd) @@ -2720,7 +2720,7 @@ static int __mptcp_init_sock(struct sock *sk) WRITE_ONCE(msk->rmem_released, 0); msk->timer_ival = TCP_RTO_MIN; - msk->first = NULL; + WRITE_ONCE(msk->first, NULL); inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); WRITE_ONCE(msk->allow_infinite_fallback, true); From 6b9831bfd9322b297eb6d44257808cc055fdc586 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 31 May 2023 12:37:07 -0700 Subject: [PATCH 269/271] mptcp: add annotations around sk->sk_shutdown accesses Christoph reported the mptcp variant of a recently addressed plain TCP issue. Similar to commit e14cadfd80d7 ("tcp: add annotations around sk->sk_shutdown accesses") add READ/WRITE ONCE annotations to silence KCSAN reports around lockless sk_shutdown access. Fixes: 71ba088ce0aa ("mptcp: cleanup accept and poll") Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/401 Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a7dd7d8c9af2..af54a878ac27 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -603,7 +603,7 @@ static bool mptcp_check_data_fin(struct sock *sk) WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); WRITE_ONCE(msk->rcv_data_fin, 0); - sk->sk_shutdown |= RCV_SHUTDOWN; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ switch (sk->sk_state) { @@ -910,7 +910,7 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) /* hopefully temporary hack: propagate shutdown status * to msk, when all subflows agree on it */ - sk->sk_shutdown |= RCV_SHUTDOWN; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ sk->sk_data_ready(sk); @@ -2526,7 +2526,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) } inet_sk_state_store(sk, TCP_CLOSE); - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); @@ -2958,7 +2958,7 @@ bool __mptcp_close(struct sock *sk, long timeout) bool do_cancel_work = false; int subflows_alive = 0; - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { mptcp_listen_inuse_dec(sk); @@ -3101,7 +3101,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); - sk->sk_shutdown = 0; + WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); return 0; } @@ -3806,9 +3806,6 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; - if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) - return EPOLLOUT | EPOLLWRNORM; - if (sk_stream_is_writeable(sk)) return EPOLLOUT | EPOLLWRNORM; @@ -3826,6 +3823,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; struct mptcp_sock *msk; __poll_t mask = 0; + u8 shutdown; int state; msk = mptcp_sk(sk); @@ -3842,17 +3840,22 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, return inet_csk_listen_poll(ssock->sk); } + shutdown = READ_ONCE(sk->sk_shutdown); + if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) + mask |= EPOLLHUP; + if (shutdown & RCV_SHUTDOWN) + mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); - mask |= mptcp_check_writeable(msk); + if (shutdown & SEND_SHUTDOWN) + mask |= EPOLLOUT | EPOLLWRNORM; + else + mask |= mptcp_check_writeable(msk); } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { /* cf tcp_poll() note about TFO */ mask |= EPOLLOUT | EPOLLWRNORM; } - if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) - mask |= EPOLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); From 55b47ca7d80814ceb63d64e032e96cd6777811e5 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 31 May 2023 12:37:08 -0700 Subject: [PATCH 270/271] mptcp: fix active subflow finalization Active subflow are inserted into the connection list at creation time. When the MPJ handshake completes successfully, a new subflow creation netlink event is generated correctly, but the current code wrongly avoid initializing a couple of subflow data. The above will cause misbehavior on a few exceptional events: unneeded mptcp-level retransmission on msk-level sequence wrap-around and infinite mapping fallback even when a MPJ socket is present. Address the issue factoring out the needed initialization in a new helper and invoking the latter from __mptcp_finish_join() time for passive subflow and from mptcp_finish_join() for active ones. Fixes: 0530020a7c8f ("mptcp: track and update contiguous data status") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index af54a878ac27..67311e7d5b21 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -825,6 +825,13 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) mptcp_data_unlock(sk); } +static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk) +{ + mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq); + WRITE_ONCE(msk->allow_infinite_fallback, false); + mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); +} + static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; @@ -839,6 +846,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) mptcp_sock_graft(ssk, sk->sk_socket); mptcp_sockopt_sync_locked(msk, ssk); + mptcp_subflow_joined(msk, ssk); return true; } @@ -3485,14 +3493,16 @@ bool mptcp_finish_join(struct sock *ssk) return false; } - if (!list_empty(&subflow->node)) - goto out; + /* active subflow, already present inside the conn_list */ + if (!list_empty(&subflow->node)) { + mptcp_subflow_joined(msk, ssk); + return true; + } if (!mptcp_pm_allow_new_subflow(msk)) goto err_prohibited; - /* active connections are already on conn_list. - * If we can't acquire msk socket lock here, let the release callback + /* If we can't acquire msk socket lock here, let the release callback * handle it */ mptcp_data_lock(parent); @@ -3515,11 +3525,6 @@ err_prohibited: return false; } - subflow->map_seq = READ_ONCE(msk->ack_seq); - WRITE_ONCE(msk->allow_infinite_fallback, false); - -out: - mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); return true; } From f9010dbdce911ee1f1af1398a24b1f9f992e0080 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 1 Jun 2023 13:32:32 -0500 Subject: [PATCH 271/271] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression When switching from kthreads to vhost_tasks two bugs were added: 1. The vhost worker tasks's now show up as processes so scripts doing ps or ps a would not incorrectly detect the vhost task as another process. 2. kthreads disabled freeze by setting PF_NOFREEZE, but vhost tasks's didn't disable or add support for them. To fix both bugs, this switches the vhost task to be thread in the process that does the VHOST_SET_OWNER ioctl, and has vhost_worker call get_signal to support SIGKILL/SIGSTOP and freeze signals. Note that SIGKILL/STOP support is required because CLONE_THREAD requires CLONE_SIGHAND which requires those 2 signals to be supported. This is a modified version of the patch written by Mike Christie which was a modified version of patch originally written by Linus. Much of what depended upon PF_IO_WORKER now depends on PF_USER_WORKER. Including ignoring signals, setting up the register state, and having get_signal return instead of calling do_group_exit. Tidied up the vhost_task abstraction so that the definition of vhost_task only needs to be visible inside of vhost_task.c. Making it easier to review the code and tell what needs to be done where. As part of this the main loop has been moved from vhost_worker into vhost_task_fn. vhost_worker now returns true if work was done. The main loop has been updated to call get_signal which handles SIGSTOP, freezing, and collects the message that tells the thread to exit as part of process exit. This collection clears __fatal_signal_pending. This collection is not guaranteed to clear signal_pending() so clear that explicitly so the schedule() sleeps. For now the vhost thread continues to exist and run work until the last file descriptor is closed and the release function is called as part of freeing struct file. To avoid hangs in the coredump rendezvous and when killing threads in a multi-threaded exec. The coredump code and de_thread have been modified to ignore vhost threads. Remvoing the special case for exec appears to require teaching vhost_dev_flush how to directly complete transactions in case the vhost thread is no longer running. Removing the special case for coredump rendezvous requires either the above fix needed for exec or moving the coredump rendezvous into get_signal. Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads") Signed-off-by: Eric W. Biederman Co-developed-by: Mike Christie Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Linus Torvalds --- arch/x86/include/asm/fpu/sched.h | 2 +- arch/x86/kernel/fpu/context.h | 2 +- arch/x86/kernel/fpu/core.c | 2 +- drivers/vhost/vhost.c | 22 ++------ fs/coredump.c | 4 +- include/linux/sched/task.h | 1 - include/linux/sched/vhost_task.h | 15 ++---- kernel/exit.c | 5 +- kernel/fork.c | 13 ++--- kernel/signal.c | 8 +-- kernel/vhost_task.c | 92 +++++++++++++++++++++----------- 11 files changed, 89 insertions(+), 77 deletions(-) diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index c2d6cd78ed0c..78fcde7b1f07 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -39,7 +39,7 @@ extern void fpu_flush_thread(void); static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { if (cpu_feature_enabled(X86_FEATURE_FPU) && - !(current->flags & (PF_KTHREAD | PF_IO_WORKER))) { + !(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { save_fpregs_to_fpstate(old_fpu); /* * The save operation preserved register state, so the diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index 9fcfa5c4dad7..af5cbdd9bd29 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -57,7 +57,7 @@ static inline void fpregs_restore_userregs(void) struct fpu *fpu = ¤t->thread.fpu; int cpu = smp_processor_id(); - if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_IO_WORKER))) + if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) return; if (!fpregs_state_valid(fpu, cpu)) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index caf33486dc5e..1015af1ae562 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -426,7 +426,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) this_cpu_write(in_kernel_fpu, true); - if (!(current->flags & (PF_KTHREAD | PF_IO_WORKER)) && + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(¤t->thread.fpu); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index a92af08e7864..074273020849 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -256,7 +256,7 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) * test_and_set_bit() implies a memory barrier. */ llist_add(&work->node, &dev->worker->work_list); - wake_up_process(dev->worker->vtsk->task); + vhost_task_wake(dev->worker->vtsk); } } EXPORT_SYMBOL_GPL(vhost_work_queue); @@ -333,31 +333,19 @@ static void vhost_vq_reset(struct vhost_dev *dev, __vhost_vq_meta_reset(vq); } -static int vhost_worker(void *data) +static bool vhost_worker(void *data) { struct vhost_worker *worker = data; struct vhost_work *work, *work_next; struct llist_node *node; - for (;;) { - /* mb paired w/ kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - - if (vhost_task_should_stop(worker->vtsk)) { - __set_current_state(TASK_RUNNING); - break; - } - - node = llist_del_all(&worker->work_list); - if (!node) - schedule(); - + node = llist_del_all(&worker->work_list); + if (node) { node = llist_reverse_order(node); /* make sure flag is seen after deletion */ smp_wmb(); llist_for_each_entry_safe(work, work_next, node, node) { clear_bit(VHOST_WORK_QUEUED, &work->flags); - __set_current_state(TASK_RUNNING); kcov_remote_start_common(worker->kcov_handle); work->fn(work); kcov_remote_stop(); @@ -365,7 +353,7 @@ static int vhost_worker(void *data) } } - return 0; + return !!node; } static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq) diff --git a/fs/coredump.c b/fs/coredump.c index ece7badf701b..88740c51b942 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -371,7 +371,9 @@ static int zap_process(struct task_struct *start, int exit_code) if (t != current && !(t->flags & PF_POSTCOREDUMP)) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); - nr++; + /* The vhost_worker does not particpate in coredumps */ + if ((t->flags & (PF_USER_WORKER | PF_IO_WORKER)) != PF_USER_WORKER) + nr++; } } diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 537cbf9a2ade..e0f5ac90a228 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -29,7 +29,6 @@ struct kernel_clone_args { u32 io_thread:1; u32 user_worker:1; u32 no_files:1; - u32 ignore_signals:1; unsigned long stack; unsigned long stack_size; unsigned long tls; diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h index 6123c10b99cf..837a23624a66 100644 --- a/include/linux/sched/vhost_task.h +++ b/include/linux/sched/vhost_task.h @@ -2,22 +2,13 @@ #ifndef _LINUX_VHOST_TASK_H #define _LINUX_VHOST_TASK_H -#include -struct task_struct; +struct vhost_task; -struct vhost_task { - int (*fn)(void *data); - void *data; - struct completion exited; - unsigned long flags; - struct task_struct *task; -}; - -struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, +struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg, const char *name); void vhost_task_start(struct vhost_task *vtsk); void vhost_task_stop(struct vhost_task *vtsk); -bool vhost_task_should_stop(struct vhost_task *vtsk); +void vhost_task_wake(struct vhost_task *vtsk); #endif diff --git a/kernel/exit.c b/kernel/exit.c index 34b90e2e7cf7..edb50b4c9972 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -411,7 +411,10 @@ static void coredump_task_exit(struct task_struct *tsk) tsk->flags |= PF_POSTCOREDUMP; core_state = tsk->signal->core_state; spin_unlock_irq(&tsk->sighand->siglock); - if (core_state) { + + /* The vhost_worker does not particpate in coredumps */ + if (core_state && + ((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) { struct core_thread self; self.task = current; diff --git a/kernel/fork.c b/kernel/fork.c index ed4e01daccaa..81cba91f30bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2336,16 +2336,16 @@ __latent_entropy struct task_struct *copy_process( p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; - if (args->user_worker) - p->flags |= PF_USER_WORKER; - if (args->io_thread) { + if (args->user_worker) { /* - * Mark us an IO worker, and block any signal that isn't + * Mark us a user worker, and block any signal that isn't * fatal or STOP */ - p->flags |= PF_IO_WORKER; + p->flags |= PF_USER_WORKER; siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } + if (args->io_thread) + p->flags |= PF_IO_WORKER; if (args->name) strscpy_pad(p->comm, args->name, sizeof(p->comm)); @@ -2517,9 +2517,6 @@ __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cleanup_io; - if (args->ignore_signals) - ignore_signals(p); - stackleak_task_init(p); if (pid != &init_struct_pid) { diff --git a/kernel/signal.c b/kernel/signal.c index 8f6330f0e9ca..2547fa73bde5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1368,7 +1368,9 @@ int zap_other_threads(struct task_struct *p) while_each_thread(p, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); - count++; + /* Don't require de_thread to wait for the vhost_worker */ + if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER) + count++; /* Don't bother with already dead threads */ if (t->exit_state) @@ -2861,11 +2863,11 @@ relock: } /* - * PF_IO_WORKER threads will catch and exit on fatal signals + * PF_USER_WORKER threads will catch and exit on fatal signals * themselves. They have cleanup that must be performed, so * we cannot call do_exit() on their behalf. */ - if (current->flags & PF_IO_WORKER) + if (current->flags & PF_USER_WORKER) goto out; /* diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index b7cbd66f889e..f80d5c51ae67 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -12,58 +12,88 @@ enum vhost_task_flags { VHOST_TASK_FLAGS_STOP, }; +struct vhost_task { + bool (*fn)(void *data); + void *data; + struct completion exited; + unsigned long flags; + struct task_struct *task; +}; + static int vhost_task_fn(void *data) { struct vhost_task *vtsk = data; - int ret; + bool dead = false; + + for (;;) { + bool did_work; + + /* mb paired w/ vhost_task_stop */ + if (test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) + break; + + if (!dead && signal_pending(current)) { + struct ksignal ksig; + /* + * Calling get_signal will block in SIGSTOP, + * or clear fatal_signal_pending, but remember + * what was set. + * + * This thread won't actually exit until all + * of the file descriptors are closed, and + * the release function is called. + */ + dead = get_signal(&ksig); + if (dead) + clear_thread_flag(TIF_SIGPENDING); + } + + did_work = vtsk->fn(vtsk->data); + if (!did_work) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + } - ret = vtsk->fn(vtsk->data); complete(&vtsk->exited); - do_exit(ret); + do_exit(0); } +/** + * vhost_task_wake - wakeup the vhost_task + * @vtsk: vhost_task to wake + * + * wake up the vhost_task worker thread + */ +void vhost_task_wake(struct vhost_task *vtsk) +{ + wake_up_process(vtsk->task); +} +EXPORT_SYMBOL_GPL(vhost_task_wake); + /** * vhost_task_stop - stop a vhost_task * @vtsk: vhost_task to stop * - * Callers must call vhost_task_should_stop and return from their worker - * function when it returns true; + * vhost_task_fn ensures the worker thread exits after + * VHOST_TASK_FLAGS_SOP becomes true. */ void vhost_task_stop(struct vhost_task *vtsk) { - pid_t pid = vtsk->task->pid; - set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); - wake_up_process(vtsk->task); + vhost_task_wake(vtsk); /* * Make sure vhost_task_fn is no longer accessing the vhost_task before - * freeing it below. If userspace crashed or exited without closing, - * then the vhost_task->task could already be marked dead so - * kernel_wait will return early. + * freeing it below. */ wait_for_completion(&vtsk->exited); - /* - * If we are just closing/removing a device and the parent process is - * not exiting then reap the task. - */ - kernel_wait4(pid, NULL, __WCLONE, NULL); kfree(vtsk); } EXPORT_SYMBOL_GPL(vhost_task_stop); /** - * vhost_task_should_stop - should the vhost task return from the work function - * @vtsk: vhost_task to stop - */ -bool vhost_task_should_stop(struct vhost_task *vtsk) -{ - return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); -} -EXPORT_SYMBOL_GPL(vhost_task_should_stop); - -/** - * vhost_task_create - create a copy of a process to be used by the kernel - * @fn: thread stack + * vhost_task_create - create a copy of a task to be used by the kernel + * @fn: vhost worker function * @arg: data to be passed to fn * @name: the thread's name * @@ -71,17 +101,17 @@ EXPORT_SYMBOL_GPL(vhost_task_should_stop); * failure. The returned task is inactive, and the caller must fire it up * through vhost_task_start(). */ -struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, +struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg, const char *name) { struct kernel_clone_args args = { - .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM, + .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM | + CLONE_THREAD | CLONE_SIGHAND, .exit_signal = 0, .fn = vhost_task_fn, .name = name, .user_worker = 1, .no_files = 1, - .ignore_signals = 1, }; struct vhost_task *vtsk; struct task_struct *tsk;