From 897614f90f7cd9fd7f5b7acca24dfb55b6c0c4ae Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 14 Nov 2024 16:46:02 +0100 Subject: [PATCH 01/27] s390/debug: Pass in and enforce output buffer size for format handlers The s390dbf format handlers rely on being passed an output buffer sized such that their output will always fit, and then use plain sprintf() to write to it. While they are only supplied data from other kernel components, this still potentially allows a buffer overwrite if callers are not careful. Instead, just pass in the size of the output buffer and use scnprintf() instead of sprintf() and strscpy() instead of strcpy(). The latter also allows us to get rid of a separate strlen() call. Signed-off-by: Niklas Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/include/asm/debug.h | 8 ++-- arch/s390/kernel/debug.c | 83 +++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 37 deletions(-) diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index ccd4e148b5ed..a7f7bdc9e19c 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -66,14 +66,15 @@ typedef int (debug_header_proc_t) (debug_info_t *id, struct debug_view *view, int area, debug_entry_t *entry, - char *out_buf); + char *out_buf, size_t out_buf_size); typedef int (debug_format_proc_t) (debug_info_t *id, struct debug_view *view, char *out_buf, + size_t out_buf_size, const char *in_buf); typedef int (debug_prolog_proc_t) (debug_info_t *id, struct debug_view *view, - char *out_buf); + char *out_buf, size_t out_buf_size); typedef int (debug_input_proc_t) (debug_info_t *id, struct debug_view *view, struct file *file, @@ -81,7 +82,8 @@ typedef int (debug_input_proc_t) (debug_info_t *id, size_t in_buf_size, loff_t *offset); int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, - int area, debug_entry_t *entry, char *out_buf); + int area, debug_entry_t *entry, + char *out_buf, size_t out_buf_size); struct debug_view { char name[DEBUG_MAX_NAME_LEN]; diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index b3f2103694e4..de19fd8a6a95 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -77,12 +77,14 @@ static debug_info_t *debug_info_create(const char *name, int pages_per_area, static void debug_info_get(debug_info_t *); static void debug_info_put(debug_info_t *); static int debug_prolog_level_fn(debug_info_t *id, - struct debug_view *view, char *out_buf); + struct debug_view *view, char *out_buf, + size_t out_buf_size); static int debug_input_level_fn(debug_info_t *id, struct debug_view *view, struct file *file, const char __user *user_buf, size_t user_buf_size, loff_t *offset); static int debug_prolog_pages_fn(debug_info_t *id, - struct debug_view *view, char *out_buf); + struct debug_view *view, char *out_buf, + size_t out_buf_size); static int debug_input_pages_fn(debug_info_t *id, struct debug_view *view, struct file *file, const char __user *user_buf, size_t user_buf_size, loff_t *offset); @@ -90,9 +92,11 @@ static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, struct file *file, const char __user *user_buf, size_t user_buf_size, loff_t *offset); static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, const char *in_buf); + char *out_buf, size_t out_buf_size, + const char *in_buf); static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, const char *inbuf); + char *out_buf, size_t out_buf_size, +
const char *inbuf); static void debug_areas_swap(debug_info_t *a, debug_info_t *b); static void debug_events_append(debug_info_t *dest, debug_info_t *src); @@ -391,8 +395,10 @@ static int debug_format_entry(file_private_info_t *p_info) if (p_info->act_entry == DEBUG_PROLOG_ENTRY) { /* print prolog */ - if (view->prolog_proc) - len += view->prolog_proc(id_snap, view, p_info->temp_buf); + if (view->prolog_proc) { + len += view->prolog_proc(id_snap, view, p_info->temp_buf, + sizeof(p_info->temp_buf)); + } goto out; } if (!id_snap->areas) /* this is true, if we have a prolog only view */ @@ -402,12 +408,16 @@ static int debug_format_entry(file_private_info_t *p_info) if (act_entry->clock == 0LL) goto out; /* empty entry */ - if (view->header_proc) + if (view->header_proc) { len += view->header_proc(id_snap, view, p_info->act_area, - act_entry, p_info->temp_buf + len); - if (view->format_proc) + act_entry, p_info->temp_buf + len, + sizeof(p_info->temp_buf) - len); + } + if (view->format_proc) { len += view->format_proc(id_snap, view, p_info->temp_buf + len, + sizeof(p_info->temp_buf) - len, DEBUG_DATA(act_entry)); + } out: return len; } @@ -1292,9 +1302,9 @@ static inline int debug_get_uint(char *buf) */ static int debug_prolog_pages_fn(debug_info_t *id, struct debug_view *view, - char *out_buf) + char *out_buf, size_t out_buf_size) { - return sprintf(out_buf, "%i\n", id->pages_per_area); + return scnprintf(out_buf, out_buf_size, "%i\n", id->pages_per_area); } /* @@ -1341,14 +1351,14 @@ static int debug_input_pages_fn(debug_info_t *id, struct debug_view *view, * prints out actual debug level */ static int debug_prolog_level_fn(debug_info_t *id, struct debug_view *view, - char *out_buf) + char *out_buf, size_t out_buf_size) { int rc = 0; if (id->level == DEBUG_OFF_LEVEL) - rc = sprintf(out_buf, "-\n"); + rc = scnprintf(out_buf, out_buf_size, "-\n"); else - rc = sprintf(out_buf, "%i\n", id->level); + rc = scnprintf(out_buf, out_buf_size, "%i\n", id->level); return rc; } @@ -1465,22 +1475,24 @@ static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, * prints debug data in hex/ascii format */ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, const char *in_buf) + char *out_buf, size_t out_buf_size, const char *in_buf) { int i, rc = 0; - for (i = 0; i < id->buf_size; i++) - rc += sprintf(out_buf + rc, "%02x ", ((unsigned char *) in_buf)[i]); - rc += sprintf(out_buf + rc, "| "); + for (i = 0; i < id->buf_size; i++) { + rc += scnprintf(out_buf + rc, out_buf_size - rc, + "%02x ", ((unsigned char *)in_buf)[i]); + } + rc += scnprintf(out_buf + rc, out_buf_size - rc, "| "); for (i = 0; i < id->buf_size; i++) { unsigned char c = in_buf[i]; if (isascii(c) && isprint(c)) - rc += sprintf(out_buf + rc, "%c", c); + rc += scnprintf(out_buf + rc, out_buf_size - rc, "%c", c); else - rc += sprintf(out_buf + rc, "."); + rc += scnprintf(out_buf + rc, out_buf_size - rc, "."); } - rc += sprintf(out_buf + rc, "\n"); + rc += scnprintf(out_buf + rc, out_buf_size - rc, "\n"); return rc; } @@ -1488,7 +1500,8 @@ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, * prints header for debug entry */ int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, - int area, debug_entry_t *entry, char *out_buf) + int area, debug_entry_t *entry, char *out_buf, + size_t out_buf_size) { unsigned long sec, usec; unsigned long caller; @@ -1505,9 +1518,9 @@ int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, else 
except_str = "-"; caller = (unsigned long) entry->caller; - rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %04u %px ", - area, sec, usec, level, except_str, - entry->cpu, (void *)caller); + rc += scnprintf(out_buf, out_buf_size, "%02i %011ld:%06lu %1u %1s %04u %px ", + area, sec, usec, level, except_str, + entry->cpu, (void *)caller); return rc; } EXPORT_SYMBOL(debug_dflt_header_fn); @@ -1520,7 +1533,7 @@ EXPORT_SYMBOL(debug_dflt_header_fn); #define DEBUG_SPRINTF_MAX_ARGS 10 static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, const char *inbuf) + char *out_buf, size_t out_buf_size, const char *inbuf) { debug_sprintf_entry_t *curr_event = (debug_sprintf_entry_t *)inbuf; int num_longs, num_used_args = 0, i, rc = 0; @@ -1533,8 +1546,9 @@ static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, goto out; /* bufsize of entry too small */ if (num_longs == 1) { /* no args, we use only the string */ - strcpy(out_buf, curr_event->string); - rc = strlen(curr_event->string); + rc = strscpy(out_buf, curr_event->string, out_buf_size); + if (rc == -E2BIG) + rc = out_buf_size; goto out; } @@ -1546,12 +1560,13 @@ static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, for (i = 0; i < num_used_args; i++) index[i] = i; - rc = sprintf(out_buf, curr_event->string, curr_event->args[index[0]], - curr_event->args[index[1]], curr_event->args[index[2]], - curr_event->args[index[3]], curr_event->args[index[4]], - curr_event->args[index[5]], curr_event->args[index[6]], - curr_event->args[index[7]], curr_event->args[index[8]], - curr_event->args[index[9]]); + rc = scnprintf(out_buf, out_buf_size, + curr_event->string, curr_event->args[index[0]], + curr_event->args[index[1]], curr_event->args[index[2]], + curr_event->args[index[3]], curr_event->args[index[4]], + curr_event->args[index[5]], curr_event->args[index[6]], + curr_event->args[index[7]], curr_event->args[index[8]], + curr_event->args[index[9]]); out: return rc; } From 2f32cc40f1440a6aa9e7396af41db79bece67bb2 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 8 Nov 2024 14:27:43 +0100 Subject: [PATCH 02/27] s390/mm: Remove bogus comment in __tlb_flush_mm() Since commit b3e5dc45fd1e ("s390/mm: fix local TLB flushing vs. detach of an mm address space") __tlb_flush_mm() does not flush local CPU, but the comment suggests it does, which is misleading. Signed-off-by: Alexander Gordeev Acked-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/include/asm/tlbflush.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index a6e2cd89b609..9dfd46dd03c6 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -46,11 +46,6 @@ static inline void __tlb_flush_mm(struct mm_struct *mm) { unsigned long gmap_asce; - /* - * If the machine has IDTE we prefer to do a per mm flush - * on all cpus instead of doing a local flush if the mm - * only ran on the local cpu. - */ preempt_disable(); atomic_inc(&mm->context.flush_count); /* Reset TLB flush mask */ From 588a9836a4ef7ec3bfcffda526dfa399637e6cfc Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 18 Nov 2024 13:14:07 +0100 Subject: [PATCH 03/27] s390/stacktrace: Use break instead of return statement arch_stack_walk_user_common() contains a return statement instead of a break statement in case store_ip() fails while trying to store a callchain entry of a user space process. 
This may lead to a missing pagefault_enable() call. If this happens, any subsequent page fault of the process won't be resolved by the page fault handler and this in turn will lead to the process being killed. Use a break instead of a return statement to fix this. Fixes: ebd912ff9919 ("s390/stacktrace: Merge perf_callchain_user() and arch_stack_walk_user()") Cc: stable@vger.kernel.org Reviewed-by: Jens Remus Signed-off-by: Heiko Carstens --- arch/s390/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 9f59837d159e..40edfde25f5b 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -151,7 +151,7 @@ void arch_stack_walk_user_common(stack_trace_consume_fn consume_entry, void *coo break; } if (!store_ip(consume_entry, cookie, entry, perf, ip)) - return; + break; first = false; } pagefault_enable(); From 9c7260b527f0f7c75a9c0fee297663d1acc40937 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 15 Nov 2024 14:56:11 +0100 Subject: [PATCH 04/27] s390/vfio-ap: Remove gmap_convert_to_secure() from vfio_ap_ops If the page has been exported, do not re-import it. Imports should only be triggered by the guest. The guest will import the page automatically when it needs it again; there is no advantage in importing it manually. Moreover, vfio_pin_pages() will take an extra reference on the page and thus will cause the import to always fail. The extra reference would be dropped only after pointlessly trying to import the page. Fixes: f88fb1335733 ("s390/vfio-ap: make sure nib is shared") Signed-off-by: Claudio Imbrenda Reviewed-by: Halil Pasic Link: https://lore.kernel.org/r/20241115135611.87836-1-imbrenda@linux.ibm.com Signed-off-by: Heiko Carstens --- drivers/s390/crypto/vfio_ap_ops.c | 32 +++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 8c0b40d8eb39..a52c2690933f 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -360,10 +360,26 @@ static int vfio_ap_validate_nib(struct kvm_vcpu *vcpu, dma_addr_t *nib) return 0; } -static int ensure_nib_shared(unsigned long addr, struct gmap *gmap) +/** + * ensure_nib_shared() - Ensure the address of the NIB is secure and shared + * @addr: the physical (absolute) address of the NIB + * + * This function checks whether the NIB page, which has been pinned with + * vfio_pin_pages(), is a shared page belonging to a secure guest. + * + * It will call uv_pin_shared() on it; if the page was already pinned shared + * (i.e. if the NIB belongs to a secure guest and is shared), then 0 + * (success) is returned. If the NIB was not shared, vfio_pin_pages() had + * exported it and now it does not belong to the secure guest anymore. In + * that case, an error is returned. + * + * Context: the NIB (at physical address @addr) has to be pinned with + * vfio_pin_pages() before calling this function. + * + * Return: 0 in case of success, otherwise an error < 0. + */ +static int ensure_nib_shared(unsigned long addr) { - int ret; - /* * The nib has to be located in shared storage since guest and * host access it. vfio_pin_pages() will do a pin shared and * * If the page is already pinned shared the UV will return a success.
*/ - ret = uv_pin_shared(addr); - if (ret) { - /* vfio_pin_pages() likely exported the page so let's re-import */ - gmap_convert_to_secure(gmap, addr); - } - return ret; + return uv_pin_shared(addr); } /** @@ -425,6 +436,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q, return status; } + /* The pin will probably be successful even if the NIB was not shared */ ret = vfio_pin_pages(&q->matrix_mdev->vdev, nib, 1, IOMMU_READ | IOMMU_WRITE, &h_page); switch (ret) { @@ -447,7 +459,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q, /* NIB in non-shared storage is a rc 6 for PV guests */ if (kvm_s390_pv_cpu_is_protected(vcpu) && - ensure_nib_shared(h_nib & PAGE_MASK, kvm->arch.gmap)) { + ensure_nib_shared(h_nib & PAGE_MASK)) { vfio_unpin_pages(&q->matrix_mdev->vdev, nib, 1); status.response_code = AP_RESPONSE_INVALID_ADDRESS; return status; From 7bc1ee28f4d21fc1e2f3d09534baf86ce7f3ba5e Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Tue, 19 Nov 2024 13:22:47 +0100 Subject: [PATCH 05/27] s390/cpum_sf: Simplify release of SDBs and SDBTs free_sampling_buffer() releases the Sampling Data Buffers (SDBs) and Sampling Data Buffer Tables (SDBTs) allocated at event initialization. Both buffers are PAGE_SIZE bytes. Each SDBT consists of 512 entries. The first 511 entries point to SDBs; the last entry points to a successor SDBT. The last SDBT in the list points to the origin of all SDBTs. SDBTs do not contain holes, that is, an entry always points to an SDB. If fewer than 511 SDBs have been allocated, the last entry points to the origin SDBT. Simplify the release of the SDBs and SDBTs: walk along the SDBT chain, release SDBs and SDBTs, and stop when reaching the origin again. Signed-off-by: Thomas Richter Reviewed-by: Sumanth Korikkar Signed-off-by: Heiko Carstens --- arch/s390/kernel/perf_cpum_sf.c | 38 +++++++++++---------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 0cde42f8af6e..1e99514fb7ae 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -180,39 +180,27 @@ static int sf_buffer_available(struct cpu_hw_sf *cpuhw) */ static void free_sampling_buffer(struct sf_buffer *sfb) { - unsigned long *sdbt, *curr; - - if (!sfb->sdbt) - return; + unsigned long *sdbt, *curr, *head; sdbt = sfb->sdbt; - curr = sdbt; - + if (!sdbt) + return; + sfb->sdbt = NULL; /* Free the SDBT after all SDBs are processed... */ - while (1) { - if (!*curr || !sdbt) - break; - - /* Process table-link entries */ + head = sdbt; + curr = sdbt; + do { if (is_link_entry(curr)) { + /* Process table-link entries */ curr = get_next_sdbt(curr); - if (sdbt) - free_page((unsigned long)sdbt); - - /* If the origin is reached, sampling buffer is freed */ - if (curr == sfb->sdbt) - break; - else - sdbt = curr; + free_page((unsigned long)sdbt); + sdbt = curr; } else { /* Process SDB pointer */ - if (*curr) { - free_page((unsigned long)phys_to_virt(*curr)); - curr++; - } + free_page((unsigned long)phys_to_virt(*curr)); + curr++; } - } - + } while (curr != head); memset(sfb, 0, sizeof(*sfb)); } From 45c9f2b856a075a34873d00788d2e8a250c1effd Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Tue, 19 Nov 2024 14:54:07 +0100 Subject: [PATCH 06/27] s390/entry: Mark IRQ entries to fix stack depot warnings The stack depot filters out everything outside of the top interrupt context as an uninteresting or irrelevant part of the stack traces.
This helps with stack trace de-duplication, avoiding an explosion of saved stack traces that share the same IRQ context code path but originate from different randomly interrupted points, eventually exhausting the stack depot. Filtering uses in_irqentry_text() to identify functions within the .irqentry.text and .softirqentry.text sections, which then become the last stack trace entries being saved. While __do_softirq() is placed into the .softirqentry.text section by common code, populating .irqentry.text is architecture-specific. Currently, the .irqentry.text section on s390 is empty, which prevents stack depot filtering and de-duplication and could result in warnings like: Stack depot reached limit capacity WARNING: CPU: 0 PID: 286113 at lib/stackdepot.c:252 depot_alloc_stack+0x39a/0x3c8 with PREEMPT and KASAN enabled. Fix this by moving the IO/EXT interrupt handlers from .kprobes.text into the .irqentry.text section and updating the kprobes blacklist to include the .irqentry.text section. This is done only for asynchronous interrupts and explicitly not for program checks, which are synchronous and where the context beyond the program check is important to preserve. Despite machine checks being somewhat in between, they are extremely rare, and preserving context when possible is also of value. SVCs and Restart Interrupts are not relevant, one being always at the boundary to user space and the other being a one-time thing. IRQ entries filtering is also optionally used in ftrace function graph, where the same logic applies. Cc: stable@vger.kernel.org # 5.15+ Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens --- arch/s390/kernel/entry.S | 4 ++++ arch/s390/kernel/kprobes.c | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 1ff13239d4e5..960c08700cf6 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -430,9 +430,13 @@ SYM_CODE_START(\name) SYM_CODE_END(\name) .endm + .section .irqentry.text, "ax" + INT_HANDLER ext_int_handler,__LC_EXT_OLD_PSW,do_ext_irq INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq + .section .kprobes.text, "ax" + /* * Machine check handler routines */ diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 6295faf0987d..8b80ea57125f 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -489,6 +489,12 @@ int __init arch_init_kprobes(void) return 0; } +int __init arch_populate_kprobe_blacklist(void) +{ + return kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, + (unsigned long)__irqentry_text_end); +} + int arch_trampoline_kprobe(struct kprobe *p) { return 0; From 546d7bd47973790073b824d69ee59440337b407b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 21 Nov 2024 12:41:07 +0100 Subject: [PATCH 07/27] s390: Add missing _TIF defines Add missing _TIF defines so that for every TIF bit its corresponding _TIF mask exists. Sort the _TIF list to match the TIF order. Also remove two leftover comments from the pre generic entry time. 
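To illustrate the convention this patch completes, here is a minimal stand-alone C sketch of how each TIF bit number pairs with a _TIF mask built via BIT(); the bit values are taken from thread_info.h, while the test program itself is illustrative:

#include <assert.h>

#define BIT(nr)		(1UL << (nr))

/* TIF bit numbers as defined in arch/s390/include/asm/thread_info.h */
#define TIF_SIGPENDING	1
#define TIF_MEMDIE	17
#define TIF_BLOCK_STEP	20	/* one of the bits that gains a _TIF mask here */

/* every TIF bit gets a corresponding _TIF mask */
#define _TIF_SIGPENDING	BIT(TIF_SIGPENDING)
#define _TIF_MEMDIE	BIT(TIF_MEMDIE)
#define _TIF_BLOCK_STEP	BIT(TIF_BLOCK_STEP)

int main(void)
{
	unsigned long ti_flags = _TIF_SIGPENDING | _TIF_BLOCK_STEP;

	/* with a full set of masks, flag tests work uniformly */
	assert(ti_flags & _TIF_BLOCK_STEP);
	assert(!(ti_flags & _TIF_MEMDIE));
	return 0;
}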
Signed-off-by: Heiko Carstens --- arch/s390/include/asm/thread_info.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 00ac01874a12..3802f281eeed 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -61,7 +61,6 @@ void arch_setup_new_exec(void); /* * thread information flags bit numbers */ -/* _TIF_WORK bits */ #define TIF_NOTIFY_RESUME 0 /* callback before returning to user */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ @@ -72,33 +71,33 @@ void arch_setup_new_exec(void); #define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ #define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ #define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */ - #define TIF_31BIT 16 /* 32bit process */ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ #define TIF_RESTORE_SIGMASK 18 /* restore signal mask in do_signal() */ #define TIF_SINGLE_STEP 19 /* This task is single stepped */ #define TIF_BLOCK_STEP 20 /* This task is block stepped */ #define TIF_UPROBE_SINGLESTEP 21 /* This task is uprobe single stepped */ - -/* _TIF_TRACE bits */ #define TIF_SYSCALL_TRACE 24 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 25 /* syscall auditing active */ #define TIF_SECCOMP 26 /* secure computing */ #define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */ #define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) -#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) #define _TIF_SIGPENDING BIT(TIF_SIGPENDING) #define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) #define _TIF_UPROBE BIT(TIF_UPROBE) #define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) +#define _TIF_PGSTE BIT(TIF_PGSTE) +#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) #define _TIF_PER_TRAP BIT(TIF_PER_TRAP) - #define _TIF_31BIT BIT(TIF_31BIT) +#define _TIF_MEMDIE BIT(TIF_MEMDIE) +#define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP) - +#define _TIF_BLOCK_STEP BIT(TIF_BLOCK_STEP) +#define _TIF_UPROBE_SINGLESTEP BIT(TIF_UPROBE_SINGLESTEP) #define _TIF_SYSCALL_TRACE BIT(TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT BIT(TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP BIT(TIF_SECCOMP) From 9de3e4bf6cfbcb62e9ec658e3b291cd479018b43 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 21 Nov 2024 12:41:08 +0100 Subject: [PATCH 08/27] s390: Add ARCH_HAS_PREEMPT_LAZY support Just add the required TIF bit for ARCH_HAS_PREEMPT_LAZY support. Shuffle TIF bits to get TIF_NEED_RESCHED_LAZY next to TIF_NEED_RESCHED. 
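Keeping the lazy bit adjacent to TIF_NEED_RESCHED lets entry code test both reschedule flavors with one combined mask; a minimal sketch of that idea follows (the combined mask and helper names are illustrative, not part of this patch):

#define BIT(nr)			(1UL << (nr))

#define TIF_NEED_RESCHED	2
#define TIF_NEED_RESCHED_LAZY	3	/* deliberately adjacent */

#define _TIF_NEED_RESCHED	BIT(TIF_NEED_RESCHED)
#define _TIF_NEED_RESCHED_LAZY	BIT(TIF_NEED_RESCHED_LAZY)

/* illustrative combined mask: one test covers both flavors */
#define _TIF_NEED_RESCHED_ANY	(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)

static inline int resched_requested(unsigned long ti_flags)
{
	return (ti_flags & _TIF_NEED_RESCHED_ANY) != 0;
}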
Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 1 + arch/s390/include/asm/thread_info.h | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a45259d4c0a5..b7e1eec7903e 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -87,6 +87,7 @@ config S390 select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS + select ARCH_HAS_PREEMPT_LAZY select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SCALED_CPUTIME select ARCH_HAS_SET_DIRECT_MAP diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 3802f281eeed..c33f7144d1b9 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -64,11 +64,12 @@ void arch_setup_new_exec(void); #define TIF_NOTIFY_RESUME 0 /* callback before returning to user */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_UPROBE 3 /* breakpointed or single-stepping */ -#define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */ +#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling needed */ +#define TIF_UPROBE 4 /* breakpointed or single-stepping */ #define TIF_PATCH_PENDING 5 /* pending live patching update */ #define TIF_PGSTE 6 /* New mm's will use 4K page tables */ #define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ +#define TIF_GUARDED_STORAGE 8 /* load guarded storage control block */ #define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ #define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */ #define TIF_31BIT 16 /* 32bit process */ @@ -85,11 +86,12 @@ void arch_setup_new_exec(void); #define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING BIT(TIF_SIGPENDING) #define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY) #define _TIF_UPROBE BIT(TIF_UPROBE) -#define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) #define _TIF_PGSTE BIT(TIF_PGSTE) #define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) +#define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) #define _TIF_PER_TRAP BIT(TIF_PER_TRAP) #define _TIF_31BIT BIT(TIF_31BIT) From ff123eb7741638d55abf82fac090bb3a543c1e74 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Fri, 22 Nov 2024 13:41:33 +0100 Subject: [PATCH 09/27] s390/mm: Allow large pages for KASAN shadow mapping Commit c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces") introduced a large_allowed() helper that restricts which mapping modes can use large pages. This change unintentionally prevented KASAN shadow mappings from using large pages, despite there being no reason to avoid them. In fact, large pages are preferred for performance. Add POPULATE_KASAN_MAP_SHADOW to the allowed list in large_allowed() to restore large page mappings for KASAN shadows. While large_allowed() isn't strictly necessary with current mapping modes since disallowed modes either don't map anything or fail alignment and size checks, keep it for clarity. 
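For context, a populate path consults a helper like large_allowed() before even attempting a large mapping; the following is a simplified sketch of that decision, with illustrative structure rather than the actual boot code (which additionally checks physical availability):

/*
 * Sketch: attempt a large (1 MB segment) mapping only if the populate
 * mode allows it and the remaining range is aligned and big enough.
 */
static bool use_large_segment(unsigned long addr, unsigned long end,
			      enum populate_mode mode)
{
	if (!large_allowed(mode))
		return false;
	if (addr & ~PMD_MASK)
		return false;
	if (end - addr < PMD_SIZE)
		return false;
	return true;
}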
Fixes: c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces") Acked-by: Alexander Gordeev Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens --- arch/s390/boot/vmem.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 145035f84a0e..8476b6f965a9 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -264,7 +264,17 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m static bool large_allowed(enum populate_mode mode) { - return (mode == POPULATE_DIRECT) || (mode == POPULATE_IDENTITY) || (mode == POPULATE_KERNEL); + switch (mode) { + case POPULATE_DIRECT: + case POPULATE_IDENTITY: + case POPULATE_KERNEL: +#ifdef CONFIG_KASAN + case POPULATE_KASAN_MAP_SHADOW: +#endif + return true; + default: + return false; + } } static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end, From 7726b55b5d6c22dc0e66570597bf6e43d410d093 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Nov 2024 12:54:30 +0100 Subject: [PATCH 10/27] s390/ap: Replace xchg() with WRITE_ONCE() The result of xchg() is not used, and in addition it is used on a one byte memory area which leads to inefficient code. Use WRITE_ONCE() instead to achieve the same result with much less generated code. Acked-by: Harald Freudenberger Acked-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- drivers/s390/crypto/ap_bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index e14638936de6..26e1ea1940ec 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -453,7 +453,7 @@ static void ap_tasklet_fn(unsigned long dummy) * important that no requests on any AP get lost. */ if (ap_irq_flag) - xchg(ap_airq.lsi_ptr, 0); + WRITE_ONCE(*ap_airq.lsi_ptr, 0); spin_lock_bh(&ap_queues_lock); hash_for_each(ap_queues, bkt, aq, hnode) { From 5618c53d96d1bf2df07df8b9204773db28acbc1b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 Nov 2024 11:25:13 +0100 Subject: [PATCH 11/27] KVM: s390: Use try_cmpxchg() instead of cmpxchg() loops Convert all cmpxchg() loops to try_cmpxchg() loops. With gcc 14 and the usage of flag output operands in try_cmpxchg() this allows the compiler to generate slightly better code. 
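The conversion always has the same mechanical shape; a generic before/after sketch (not any single converted site):

/* before: the old value is re-read on every loop iteration */
do {
	old = READ_ONCE(*ptr);
	new = compute(old);
} while (cmpxchg(ptr, old, new) != old);

/* after: read once up front; on failure try_cmpxchg() updates 'old'
 * with the current value, and with gcc 14 flag output operands the
 * compare-and-swap condition code is used directly.
 */
old = READ_ONCE(*ptr);
do {
	new = compute(old);
} while (!try_cmpxchg(ptr, &old, new));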
Acked-by: Claudio Imbrenda Acked-by: Janosch Frank Link: https://lore.kernel.org/r/20241126102515.3178914-2-hca@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/kvm/gaccess.c | 16 ++++++++-------- arch/s390/kvm/interrupt.c | 12 ++++++------ arch/s390/kvm/kvm-s390.c | 4 ++-- arch/s390/kvm/pci.c | 5 ++--- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index a688351f4ab5..9816b0060fbe 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -129,8 +129,8 @@ static void ipte_lock_simple(struct kvm *kvm) retry: read_lock(&kvm->arch.sca_lock); ic = kvm_s390_get_ipte_control(kvm); + old = READ_ONCE(*ic); do { - old = READ_ONCE(*ic); if (old.k) { read_unlock(&kvm->arch.sca_lock); cond_resched(); @@ -138,7 +138,7 @@ static void ipte_lock_simple(struct kvm *kvm) } new = old; new.k = 1; - } while (cmpxchg(&ic->val, old.val, new.val) != old.val); + } while (!try_cmpxchg(&ic->val, &old.val, new.val)); read_unlock(&kvm->arch.sca_lock); out: mutex_unlock(&kvm->arch.ipte_mutex); @@ -154,11 +154,11 @@ static void ipte_unlock_simple(struct kvm *kvm) goto out; read_lock(&kvm->arch.sca_lock); ic = kvm_s390_get_ipte_control(kvm); + old = READ_ONCE(*ic); do { - old = READ_ONCE(*ic); new = old; new.k = 0; - } while (cmpxchg(&ic->val, old.val, new.val) != old.val); + } while (!try_cmpxchg(&ic->val, &old.val, new.val)); read_unlock(&kvm->arch.sca_lock); wake_up(&kvm->arch.ipte_wq); out: @@ -172,8 +172,8 @@ static void ipte_lock_siif(struct kvm *kvm) retry: read_lock(&kvm->arch.sca_lock); ic = kvm_s390_get_ipte_control(kvm); + old = READ_ONCE(*ic); do { - old = READ_ONCE(*ic); if (old.kg) { read_unlock(&kvm->arch.sca_lock); cond_resched(); @@ -182,7 +182,7 @@ static void ipte_lock_siif(struct kvm *kvm) new = old; new.k = 1; new.kh++; - } while (cmpxchg(&ic->val, old.val, new.val) != old.val); + } while (!try_cmpxchg(&ic->val, &old.val, new.val)); read_unlock(&kvm->arch.sca_lock); } @@ -192,13 +192,13 @@ static void ipte_unlock_siif(struct kvm *kvm) read_lock(&kvm->arch.sca_lock); ic = kvm_s390_get_ipte_control(kvm); + old = READ_ONCE(*ic); do { - old = READ_ONCE(*ic); new = old; new.kh--; if (!new.kh) new.k = 0; - } while (cmpxchg(&ic->val, old.val, new.val) != old.val); + } while (!try_cmpxchg(&ic->val, &old.val, new.val)); read_unlock(&kvm->arch.sca_lock); if (!new.kh) wake_up(&kvm->arch.ipte_wq); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 4f0e7f61edf7..eff69018cbeb 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -247,12 +247,12 @@ static inline int gisa_set_iam(struct kvm_s390_gisa *gisa, u8 iam) { u64 word, _word; + word = READ_ONCE(gisa->u64.word[0]); do { - word = READ_ONCE(gisa->u64.word[0]); if ((u64)gisa != word >> 32) return -EBUSY; _word = (word & ~0xffUL) | iam; - } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); + } while (!try_cmpxchg(&gisa->u64.word[0], &word, _word)); return 0; } @@ -270,10 +270,10 @@ static inline void gisa_clear_ipm(struct kvm_s390_gisa *gisa) { u64 word, _word; + word = READ_ONCE(gisa->u64.word[0]); do { - word = READ_ONCE(gisa->u64.word[0]); _word = word & ~(0xffUL << 24); - } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); + } while (!try_cmpxchg(&gisa->u64.word[0], &word, _word)); } /** @@ -291,14 +291,14 @@ static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi) u8 pending_mask, alert_mask; u64 word, _word; + word = READ_ONCE(gi->origin->u64.word[0]); do { - word = 
READ_ONCE(gi->origin->u64.word[0]); alert_mask = READ_ONCE(gi->alert.mask); pending_mask = (u8)(word >> 24) & alert_mask; if (pending_mask) return pending_mask; _word = (word & ~0xffUL) | alert_mask; - } while (cmpxchg(&gi->origin->u64.word[0], word, _word) != word); + } while (!try_cmpxchg(&gi->origin->u64.word[0], &word, _word)); return 0; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index deeb32034ad5..a1fcdb2479a1 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1907,11 +1907,11 @@ static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) read_lock(&kvm->arch.sca_lock); sca = kvm->arch.sca; + old = READ_ONCE(sca->utility); do { - old = READ_ONCE(sca->utility); new = old; new.mtcr = val; - } while (cmpxchg(&sca->utility.val, old.val, new.val) != old.val); + } while (!try_cmpxchg(&sca->utility.val, &old.val, new.val)); read_unlock(&kvm->arch.sca_lock); } diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index a61518b549f0..9b9e7fdd5380 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -208,13 +208,12 @@ static inline int account_mem(unsigned long nr_pages) page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur_pages = atomic_long_read(&user->locked_vm); do { - cur_pages = atomic_long_read(&user->locked_vm); new_pages = cur_pages + nr_pages; if (new_pages > page_limit) return -ENOMEM; - } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, - new_pages) != cur_pages); + } while (!atomic_long_try_cmpxchg(&user->locked_vm, &cur_pages, new_pages)); atomic64_add(nr_pages, ¤t->mm->pinned_vm); From 7061c63919bde3bb0daa63e700a117ab1f2c97b1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 Nov 2024 11:25:14 +0100 Subject: [PATCH 12/27] KVM: s390: Remove one byte cmpxchg() usage Within sca_clear_ext_call() cmpxchg() is used to clear one or two bytes (depending on sca format). The cmpxchg() calls are not supposed to fail; if so that would be a bug. Given that cmpxchg() usage on one and two byte areas generates very inefficient code, replace them with block concurrent WRITE_ONCE() calls, and remove the WARN_ON(). Acked-by: Claudio Imbrenda Acked-by: Janosch Frank Link: https://lore.kernel.org/r/20241126102515.3178914-3-hca@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/kvm/interrupt.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index eff69018cbeb..ea8dce299954 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -118,8 +118,6 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) static void sca_clear_ext_call(struct kvm_vcpu *vcpu) { - int rc, expect; - if (!kvm_s390_use_sca_entries()) return; kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND); @@ -128,23 +126,16 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu) struct esca_block *sca = vcpu->kvm->arch.sca; union esca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl old; - old = READ_ONCE(*sigp_ctrl); - expect = old.value; - rc = cmpxchg(&sigp_ctrl->value, old.value, 0); + WRITE_ONCE(sigp_ctrl->value, 0); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; union bsca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl old; - old = READ_ONCE(*sigp_ctrl); - expect = old.value; - rc = cmpxchg(&sigp_ctrl->value, old.value, 0); + WRITE_ONCE(sigp_ctrl->value, 0); } read_unlock(&vcpu->kvm->arch.sca_lock); - WARN_ON(rc != expect); /* cannot clear? 
*/ } int psw_extint_disabled(struct kvm_vcpu *vcpu) From f93d6d62e4697b0df1474f0b1314c3c158335f0a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 Nov 2024 11:25:15 +0100 Subject: [PATCH 13/27] KVM: s390: Increase size of union sca_utility to four bytes kvm_s390_update_topology_change_report() modifies a single bit within sca_utility using cmpxchg(). Given that the size of the sca_utility union is two bytes this generates very inefficient code. Change the size to four bytes, so better code can be generated. Even though the size of sca_utility doesn't reflect architecture anymore this seems to be the easiest and most pragmatic approach to avoid inefficient code. Acked-by: Claudio Imbrenda Acked-by: Janosch Frank Link: https://lore.kernel.org/r/20241126102515.3178914-4-hca@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/include/asm/kvm_host.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 51201b4ac93a..7ef37f746d47 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -94,11 +94,16 @@ union ipte_control { }; }; +/* + * Utility is defined as two bytes but having it four bytes wide + * generates more efficient code. Since the following bytes are + * reserved this makes no functional difference. + */ union sca_utility { - __u16 val; + __u32 val; struct { - __u16 mtcr : 1; - __u16 reserved : 15; + __u32 mtcr : 1; + __u32 : 31; }; }; @@ -107,7 +112,7 @@ struct bsca_block { __u64 reserved[5]; __u64 mcn; union sca_utility utility; - __u8 reserved2[6]; + __u8 reserved2[4]; struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; }; @@ -115,7 +120,7 @@ struct esca_block { union ipte_control ipte_control; __u64 reserved1[6]; union sca_utility utility; - __u8 reserved2[6]; + __u8 reserved2[4]; __u64 mcn[4]; __u64 reserved3[20]; struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; From ae1b9fb2d556e95001399dcd4e228525aead5122 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 21 Nov 2024 18:45:20 +0100 Subject: [PATCH 14/27] s390/mm: Rearrange region-third and segment table entry SW bits Rearrange region-third and segment table entry SW bits, in order to make room for future encoding of region/segment table swap entries. Also adjust _SEGMENT_ENTRY_GMAP_UC and _SEGMENT_ENTRY_GMAP_IN bits in gmap code. Those should only apply for gmap PMDs, and not really depend on or conflict with host PMD bits, but for consistency also adjust them: - _SEGMENT_ENTRY_GMAP_UC "dirty (migration)" was using the same bit as _SEGMENT_ENTRY_SOFT_DIRTY in the host PMD -> make it use the new SOFT_DIRTY bit 63 (0x0002) - _SEGMENT_ENTRY_GMAP_IN "invalidation notify bit" was using 0x8000, which was an unused bit in the host PMD, that is now used for _SEGMENT_ENTRY_WRITE -> make it use bit 52 (0x0800) instead, which is still unused in the host PMD This is a prerequisite for hugetlbfs PTE_MARKER support on s390, which is needed to fix a regression introduced with commit 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs"). That commit depends on the availability of swap entries for hugetlbfs, which were not available for s390 so far. 
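Because several SW bits change position at once, the intended non-overlap can be documented with compile-time checks; a hedged sketch using the new values (the assertions themselves are illustrative and not part of the patch):

#include <assert.h>

#define _SEGMENT_ENTRY_WRITE		0x8000	/* was 0x0002 */
#define _SEGMENT_ENTRY_READ		0x4000	/* was 0x0001 */
#define _SEGMENT_ENTRY_SOFT_DIRTY	0x0002	/* was 0x4000 */
#define _SEGMENT_ENTRY_GMAP_IN		0x0800	/* was 0x8000 */
#define _SEGMENT_ENTRY_GMAP_UC		0x0002	/* gmap-only, reuses the SOFT_DIRTY bit */

static_assert((_SEGMENT_ENTRY_WRITE & _SEGMENT_ENTRY_READ) == 0,
	      "read and write SW bits must not overlap");
static_assert((_SEGMENT_ENTRY_SOFT_DIRTY &
	       (_SEGMENT_ENTRY_WRITE | _SEGMENT_ENTRY_READ)) == 0,
	      "soft dirty bit must not alias the read/write bits");
static_assert((_SEGMENT_ENTRY_GMAP_IN &
	       (_SEGMENT_ENTRY_WRITE | _SEGMENT_ENTRY_READ)) == 0,
	      "gmap notify bit must stay clear of host SW bits");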
Reviewed-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Signed-off-by: Heiko Carstens --- arch/s390/include/asm/gmap.h | 4 ++-- arch/s390/include/asm/pgtable.h | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 64761c78f774..13f51a6a5bb1 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -17,8 +17,8 @@ #define GMAP_NOTIFY_MPROT 0x1 /* Status bits only for huge segment entries */ -#define _SEGMENT_ENTRY_GMAP_IN 0x8000 /* invalidation notify bit */ -#define _SEGMENT_ENTRY_GMAP_UC 0x4000 /* dirty (migration) */ +#define _SEGMENT_ENTRY_GMAP_IN 0x0800 /* invalidation notify bit */ +#define _SEGMENT_ENTRY_GMAP_UC 0x0002 /* dirty (migration) */ /** * struct gmap_struct - guest address space diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0ffbaf741955..ee5d047a2d67 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -286,11 +286,11 @@ static inline int is_module_addr(void *addr) #define _REGION3_ENTRY_DIRTY 0x2000 /* SW region dirty bit */ #define _REGION3_ENTRY_YOUNG 0x1000 /* SW region young bit */ #define _REGION3_ENTRY_LARGE 0x0400 /* RTTE-format control, large page */ -#define _REGION3_ENTRY_WRITE 0x0002 /* SW region write bit */ -#define _REGION3_ENTRY_READ 0x0001 /* SW region read bit */ +#define _REGION3_ENTRY_WRITE 0x8000 /* SW region write bit */ +#define _REGION3_ENTRY_READ 0x4000 /* SW region read bit */ #ifdef CONFIG_MEM_SOFT_DIRTY -#define _REGION3_ENTRY_SOFT_DIRTY 0x4000 /* SW region soft dirty bit */ +#define _REGION3_ENTRY_SOFT_DIRTY 0x0002 /* SW region soft dirty bit */ #else #define _REGION3_ENTRY_SOFT_DIRTY 0x0000 /* SW region soft dirty bit */ #endif @@ -313,12 +313,13 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ #define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */ + #define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */ -#define _SEGMENT_ENTRY_WRITE 0x0002 /* SW segment write bit */ -#define _SEGMENT_ENTRY_READ 0x0001 /* SW segment read bit */ +#define _SEGMENT_ENTRY_WRITE 0x8000 /* SW segment write bit */ +#define _SEGMENT_ENTRY_READ 0x4000 /* SW segment read bit */ #ifdef CONFIG_MEM_SOFT_DIRTY -#define _SEGMENT_ENTRY_SOFT_DIRTY 0x4000 /* SW segment soft dirty bit */ +#define _SEGMENT_ENTRY_SOFT_DIRTY 0x0002 /* SW segment soft dirty bit */ #else #define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */ #endif From 03e6db16b808afbe23e10617f2d18578846bdce0 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 21 Nov 2024 18:45:21 +0100 Subject: [PATCH 15/27] s390/mm: Introduce region-third and segment table entry present bits Introduce region-third and segment table entry present SW bits, and adjust pmd/pud_present() accordingly. Also add pmd/pud_present() checks to pmd/pud_leaf(), to return false for future swap entries. Same logic applies to pmd_trans_huge(), make that return pmd_leaf() instead of duplicating the same check. huge_pte_offset() also needs to be adjusted, current code would return NULL for !pud_present(). Use the same logic as in the generic version, which allows for !pud_present() swap entries. Similar to PTE, bit 63 can be used for the new SW present bit in region and segment table entries. For segment-table entries (PMD) the architecture says that "Bits 62-63 are available for programming", so they are safe to use. 
The same is true for large leaf region-third-table entries (PUD). However, for non-leaf region-third-table entries, bits 62-63 indicate the TABLE LENGTH and both must be set to 1. But such entries would always be considered as present, so it is safe to use bit 63 as PRESENT bit for PUD. They also should not conflict with bit 62 potentially later used for preserving SOFT_DIRTY in swap entries, because they are not swap entries. Valid PMDs / PUDs should always have the present bit set, so add it to the various pgprot defines, and also _SEGMENT_ENTRY which is OR'ed e.g. in pmd_populate(). _REGION3_ENTRY wouldn't need any change, as the present bit is already included in the TABLE LENGTH, but also explicitly add it there, for completeness, and just in case the bit would ever be changed. gmap code needs some adjustment, to also OR the _SEGMENT_ENTRY, like it is already done in gmap_shadow_pgt() when creating new PMDs, but not in __gmap_link(). Otherwise, the gmap PMDs would not be considered present, e.g. when using pmd_leaf() checks in gmap code. The various WARN_ON checks in gmap code also need adjustment, to tolerate the new present bit. This is a prerequisite for hugetlbfs PTE_MARKER support on s390, which is needed to fix a regression introduced with commit 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs"). That commit depends on the availability of swap entries for hugetlbfs, which were not available for s390 so far. Reviewed-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Signed-off-by: Heiko Carstens --- arch/s390/include/asm/pgtable.h | 51 ++++++++++++++++++++++----------- arch/s390/mm/gmap.c | 12 +++++--- arch/s390/mm/hugetlbpage.c | 8 +++--- 3 files changed, 47 insertions(+), 24 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index ee5d047a2d67..8213e05770f8 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -277,7 +277,8 @@ static inline int is_module_addr(void *addr) #define _REGION1_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID) #define _REGION2_ENTRY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_LENGTH) #define _REGION2_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID) -#define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH) +#define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH | \ + _REGION3_ENTRY_PRESENT) #define _REGION3_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID) #define _REGION3_ENTRY_HARDWARE_BITS 0xfffffffffffff6ffUL @@ -297,6 +298,14 @@ static inline int is_module_addr(void *addr) #define _REGION_ENTRY_BITS 0xfffffffffffff22fUL +/* + * SW region present bit. For non-leaf region-third-table entries, bits 62-63 + * indicate the TABLE LENGTH and both must be set to 1. But such entries + * would always be considered as present, so it is safe to use bit 63 as + * PRESENT bit for PUD.
+ */ +#define _REGION3_ENTRY_PRESENT 0x0001 + /* Bits in the segment table entry */ #define _SEGMENT_ENTRY_BITS 0xfffffffffffffe3fUL #define _SEGMENT_ENTRY_HARDWARE_BITS 0xfffffffffffffe3cUL @@ -308,7 +317,7 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */ #define _SEGMENT_ENTRY_TYPE_MASK 0x0c /* segment table type mask */ -#define _SEGMENT_ENTRY (0) +#define _SEGMENT_ENTRY (_SEGMENT_ENTRY_PRESENT) #define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INVALID) #define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ @@ -324,6 +333,8 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */ #endif +#define _SEGMENT_ENTRY_PRESENT 0x0001 /* SW segment present bit */ + #define _CRST_ENTRIES 2048 /* number of region/segment table entries */ #define _PAGE_ENTRIES 256 /* number of page table entries */ @@ -455,17 +466,22 @@ static inline int is_module_addr(void *addr) /* * Segment entry (large page) protection definitions. */ -#define SEGMENT_NONE __pgprot(_SEGMENT_ENTRY_INVALID | \ +#define SEGMENT_NONE __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_INVALID | \ _SEGMENT_ENTRY_PROTECT) -#define SEGMENT_RO __pgprot(_SEGMENT_ENTRY_PROTECT | \ +#define SEGMENT_RO __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_PROTECT | \ _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_NOEXEC) -#define SEGMENT_RX __pgprot(_SEGMENT_ENTRY_PROTECT | \ +#define SEGMENT_RX __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_PROTECT | \ _SEGMENT_ENTRY_READ) -#define SEGMENT_RW __pgprot(_SEGMENT_ENTRY_READ | \ +#define SEGMENT_RW __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_WRITE | \ _SEGMENT_ENTRY_NOEXEC) -#define SEGMENT_RWX __pgprot(_SEGMENT_ENTRY_READ | \ +#define SEGMENT_RWX __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_WRITE) #define SEGMENT_KERNEL __pgprot(_SEGMENT_ENTRY | \ _SEGMENT_ENTRY_LARGE | \ @@ -492,6 +508,7 @@ static inline int is_module_addr(void *addr) */ #define REGION3_KERNEL __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_PRESENT | \ _REGION3_ENTRY_LARGE | \ _REGION3_ENTRY_READ | \ _REGION3_ENTRY_WRITE | \ @@ -499,12 +516,14 @@ static inline int is_module_addr(void *addr) _REGION3_ENTRY_DIRTY | \ _REGION_ENTRY_NOEXEC) #define REGION3_KERNEL_RO __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_PRESENT | \ _REGION3_ENTRY_LARGE | \ _REGION3_ENTRY_READ | \ _REGION3_ENTRY_YOUNG | \ _REGION_ENTRY_PROTECT | \ _REGION_ENTRY_NOEXEC) #define REGION3_KERNEL_EXEC __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_PRESENT | \ _REGION3_ENTRY_LARGE | \ _REGION3_ENTRY_READ | \ _REGION3_ENTRY_WRITE | \ @@ -747,7 +766,7 @@ static inline int pud_present(pud_t pud) { if (pud_folded(pud)) return 1; - return (pud_val(pud) & _REGION_ENTRY_ORIGIN) != 0UL; + return (pud_val(pud) & _REGION3_ENTRY_PRESENT) != 0; } static inline int pud_none(pud_t pud) @@ -762,13 +781,18 @@ static inline bool pud_leaf(pud_t pud) { if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) != _REGION_ENTRY_TYPE_R3) return 0; - return !!(pud_val(pud) & _REGION3_ENTRY_LARGE); + return (pud_present(pud) && (pud_val(pud) & _REGION3_ENTRY_LARGE) != 0); +} + +static inline int pmd_present(pmd_t pmd) +{ + return (pmd_val(pmd) & _SEGMENT_ENTRY_PRESENT) != 0; } #define pmd_leaf pmd_leaf static inline bool pmd_leaf(pmd_t pmd) { - return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0; + return (pmd_present(pmd) && (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0); } static inline int 
pmd_bad(pmd_t pmd) @@ -800,11 +824,6 @@ static inline int p4d_bad(p4d_t p4d) return (p4d_val(p4d) & ~_REGION_ENTRY_BITS) != 0; } -static inline int pmd_present(pmd_t pmd) -{ - return pmd_val(pmd) != _SEGMENT_ENTRY_EMPTY; -} - static inline int pmd_none(pmd_t pmd) { return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY; @@ -1852,7 +1871,7 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, static inline int pmd_trans_huge(pmd_t pmd) { - return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE; + return pmd_leaf(pmd); } #define has_transparent_hugepage has_transparent_hugepage diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 329682655af2..404489ecaf8a 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -587,7 +587,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) if (pmd_leaf(*pmd)) { *table = (pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) - | _SEGMENT_ENTRY_GMAP_UC; + | _SEGMENT_ENTRY_GMAP_UC + | _SEGMENT_ENTRY; } else *table = pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS; @@ -2396,7 +2397,8 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (purge) __pmdp_csp(pmdp); set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); @@ -2450,7 +2452,8 @@ void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) gaddr = __gmap_segment_gaddr(entry); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_LOCAL); @@ -2485,7 +2488,8 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) gaddr = __gmap_segment_gaddr(entry); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index ded0eff58a19..a65bebbb5a34 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -48,6 +48,7 @@ static inline unsigned long __pte_to_rste(pte_t pte) */ if (pte_present(pte)) { rste = pte_val(pte) & PAGE_MASK; + rste |= _SEGMENT_ENTRY_PRESENT; rste |= move_set_bit(pte_val(pte), _PAGE_READ, _SEGMENT_ENTRY_READ); rste |= move_set_bit(pte_val(pte), _PAGE_WRITE, @@ -223,11 +224,10 @@ pte_t *huge_pte_offset(struct mm_struct *mm, p4dp = p4d_offset(pgdp, addr); if (p4d_present(*p4dp)) { pudp = pud_offset(p4dp, addr); - if (pud_present(*pudp)) { - if (pud_leaf(*pudp)) - return (pte_t *) pudp; + if (sz == PUD_SIZE) + return (pte_t *)pudp; + if (pud_present(*pudp)) pmdp = pmd_offset(pudp, addr); - } } } return (pte_t *) pmdp; From f934f6be76c1d033692f7fbfcbb06ec44a25bf3f Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 21 Nov 2024 18:45:22 +0100 Subject: [PATCH 16/27] s390/mm: Introduce region-third and segment table swap entries Introduce region-third (PUD) and segment table (PMD) swap entries, and make hugetlbfs RSTE <-> PTE conversion code aware of them, so that they can be used for hugetlbfs PTE_MARKER entries. Future work could also build on this to enable THP_SWAP and THP_MIGRATION for s390. 
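As a quick illustration of the encode/decode helpers added further down in this patch, a (type, offset) pair round-trips through an RSTE-format swap entry unchanged; the check function below is a sketch with arbitrary values:

static void check_swap_rste_roundtrip(void)
{
	unsigned long type = 3, offset = 0x1234;
	unsigned long rste = mk_swap_rste(type, offset);
	swp_entry_t entry = __rste_to_swp_entry(rste);

	WARN_ON(__swp_type_rste(entry) != type);
	WARN_ON(__swp_offset_rste(entry) != offset);
	/* swap entries must be invalid and carry the common bit */
	WARN_ON(!(rste & _RST_ENTRY_INVALID));
	WARN_ON(!(rste & _RST_ENTRY_COMM));
}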
Similar to PTE swap entries, bits 0-51 can be used to store the swap offset, but bits 57-61 cannot be used for swap type because that overlaps with the INVALID and TABLE TYPE bits. PMD/PUD swap entries must be invalid, and have a correct table type so that pud_folded() check still works. Bits 53-57 can be used for swap type, but those include the PROTECT bit. So unlike swap PTEs, the PROTECT bit cannot be used to mark the swap entry. Use the "Common-Segment/Region" bit 59 instead for that. Also remove the !MACHINE_HAS_NX check in __set_huge_pte_at(). Otherwise, that would clear the _SEGMENT_ENTRY_NOEXEC bit also for swap entries, where it is used for encoding the swap type. The architecture only requires this bit to be 0 for PTEs, with !MACHINE_HAS_NX, not for segment or region-third entries. And the check is also redundant, because after __pte_to_rste() conversion, for non-swap PTEs it would only be set if it was already set in the PTE, which should never be the case for !MACHINE_HAS_NX. This is a prerequisite for hugetlbfs PTE_MARKER support on s390, which is needed to fix a regression introduced with commit 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs"). That commit depends on the availability of swap entries for hugetlbfs, which were not available for s390 so far. Reviewed-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Signed-off-by: Heiko Carstens --- arch/s390/include/asm/pgtable.h | 53 +++++++++++++++++++++++++++++++++ arch/s390/mm/hugetlbpage.c | 23 ++++++++++---- 2 files changed, 71 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 8213e05770f8..300affac1698 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -286,6 +286,7 @@ static inline int is_module_addr(void *addr) #define _REGION3_ENTRY_ORIGIN_LARGE ~0x7fffffffUL /* large page address */ #define _REGION3_ENTRY_DIRTY 0x2000 /* SW region dirty bit */ #define _REGION3_ENTRY_YOUNG 0x1000 /* SW region young bit */ +#define _REGION3_ENTRY_COMM 0x0010 /* Common-Region, marks swap entry */ #define _REGION3_ENTRY_LARGE 0x0400 /* RTTE-format control, large page */ #define _REGION3_ENTRY_WRITE 0x8000 /* SW region write bit */ #define _REGION3_ENTRY_READ 0x4000 /* SW region read bit */ @@ -323,6 +324,7 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ #define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */ +#define _SEGMENT_ENTRY_COMM 0x0010 /* Common-Segment, marks swap entry */ #define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */ #define _SEGMENT_ENTRY_WRITE 0x8000 /* SW segment write bit */ #define _SEGMENT_ENTRY_READ 0x4000 /* SW segment read bit */ @@ -335,6 +337,10 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_PRESENT 0x0001 /* SW segment present bit */ +/* Common bits in region and segment table entries, for swap entries */ +#define _RST_ENTRY_COMM 0x0010 /* Common-Region/Segment, marks swap entry */ +#define _RST_ENTRY_INVALID 0x0020 /* invalid region/segment table entry */ + #define _CRST_ENTRIES 2048 /* number of region/segment table entries */ #define _PAGE_ENTRIES 256 /* number of page table entries */ @@ -1931,6 +1937,53 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +/* + * 64 bit swap entry format for REGION3 and SEGMENT table entries 
(RSTE) + * Bits 59 and 63 are used to indicate the swap entry. Bit 58 marks the rste + * as invalid. + * A swap entry is indicated by bit pattern (rste & 0x011) == 0x010 + * | offset |Xtype |11TT|S0| + * |0000000000111111111122222222223333333333444444444455|555555|5566|66| + * |0123456789012345678901234567890123456789012345678901|234567|8901|23| + * + * Bits 0-51 store the offset. + * Bits 53-57 store the type. + * Bit 62 (S) is used for softdirty tracking. + * Bits 60-61 (TT) indicate the table type: 0x01 for REGION3 and 0x00 for SEGMENT. + * Bit 52 (X) is unused. + */ + +#define __SWP_OFFSET_MASK_RSTE ((1UL << 52) - 1) +#define __SWP_OFFSET_SHIFT_RSTE 12 +#define __SWP_TYPE_MASK_RSTE ((1UL << 5) - 1) +#define __SWP_TYPE_SHIFT_RSTE 6 + +/* + * TT bits set to 0x00 == SEGMENT. For REGION3 entries, caller must add R3 + * bits 0x01. See also __set_huge_pte_at(). + */ +static inline unsigned long mk_swap_rste(unsigned long type, unsigned long offset) +{ + unsigned long rste; + + rste = _RST_ENTRY_INVALID | _RST_ENTRY_COMM; + rste |= (offset & __SWP_OFFSET_MASK_RSTE) << __SWP_OFFSET_SHIFT_RSTE; + rste |= (type & __SWP_TYPE_MASK_RSTE) << __SWP_TYPE_SHIFT_RSTE; + return rste; +} + +static inline unsigned long __swp_type_rste(swp_entry_t entry) +{ + return (entry.val >> __SWP_TYPE_SHIFT_RSTE) & __SWP_TYPE_MASK_RSTE; +} + +static inline unsigned long __swp_offset_rste(swp_entry_t entry) +{ + return (entry.val >> __SWP_OFFSET_SHIFT_RSTE) & __SWP_OFFSET_MASK_RSTE; +} + +#define __rste_to_swp_entry(rste) ((swp_entry_t) { rste }) + extern int vmem_add_mapping(unsigned long start, unsigned long size); extern void vmem_remove_mapping(unsigned long start, unsigned long size); extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index a65bebbb5a34..c58a67416a04 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -24,6 +24,7 @@ static inline unsigned long __pte_to_rste(pte_t pte) { + swp_entry_t arch_entry; unsigned long rste; /* @@ -67,6 +68,10 @@ static inline unsigned long __pte_to_rste(pte_t pte) #endif rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC, _SEGMENT_ENTRY_NOEXEC); + } else if (!pte_none(pte)) { + /* swap pte */ + arch_entry = __pte_to_swp_entry(pte); + rste = mk_swap_rste(__swp_type(arch_entry), __swp_offset(arch_entry)); } else rste = _SEGMENT_ENTRY_EMPTY; return rste; @@ -74,13 +79,18 @@ static inline unsigned long __pte_to_rste(pte_t pte) static inline pte_t __rste_to_pte(unsigned long rste) { + swp_entry_t arch_entry; unsigned long pteval; - int present; + int present, none; + pte_t pte; - if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { present = pud_present(__pud(rste)); - else + none = pud_none(__pud(rste)); + } else { present = pmd_present(__pmd(rste)); + none = pmd_none(__pmd(rste)); + } /* * Convert encoding pmd / pud bits pte bits @@ -115,6 +125,11 @@ static inline pte_t __rste_to_pte(unsigned long rste) pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY); #endif pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC); + } else if (!none) { + /* swap rste */ + arch_entry = __rste_to_swp_entry(rste); + pte = mk_swap_pte(__swp_type_rste(arch_entry), __swp_offset_rste(arch_entry)); + pteval = pte_val(pte); } else pteval = _PAGE_INVALID; return __pte(pteval); @@ -149,8 +164,6 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long 
From 487ef5d4d912f6a32556d8a61cede17870925295 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer
Date: Thu, 21 Nov 2024 18:45:23 +0100
Subject: [PATCH 17/27] s390/mm: Add PTE_MARKER support for hugetlbfs mappings

Commit 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for
hugetlbfs") added support for PTE_MARKER_POISONED for hugetlbfs, but
PTE_MARKER also needs support for swap entries. For s390, swap entries
were only supported on PTE level, not on the PMD/PUD levels that are
used for large hugetlbfs mappings. Therefore, when writing a
PTE_MARKER_POISONED entry, the resulting entry on PMD/PUD level would
be an invalid / empty entry. Further access would then generate a
pagefault loop, instead of the expected SIGBUS. The loop is inside the
kernel, but it is interruptible, and uffd fault handling also calls
schedule() in between, so at least it will not completely block the
system.

Previous commits prepared support for swap entries on PMD/PUD levels.
PTE_MARKER support for hugetlbfs can now be enabled by simply adding
an extra is_pte_marker() check to huge_pte_none_mostly(). Fault
handling code also needs to be adjusted to expect the
VM_FAULT_HWPOISON_LARGE fault flag, which previously could not occur
on s390.

Reviewed-by: Alexander Gordeev
Signed-off-by: Gerald Schaefer
Signed-off-by: Heiko Carstens
---
 arch/s390/include/asm/hugetlb.h | 2 +-
 arch/s390/mm/fault.c            | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index cf1b5d6fb1a6..0f621c99b216 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -91,7 +91,7 @@ static inline int huge_pte_none(pte_t pte)
 
 static inline int huge_pte_none_mostly(pte_t pte)
 {
-	return huge_pte_none(pte);
+	return huge_pte_none(pte) || is_pte_marker(pte);
 }
 
 static inline int huge_pte_write(pte_t pte)
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 94cb2e092075..4ca35545010f 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -338,7 +338,8 @@ static void do_exception(struct pt_regs *regs, int access)
 			handle_fault_error_nolock(regs, 0);
 		else
 			do_sigsegv(regs, SEGV_MAPERR);
-	} else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)) {
+	} else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON |
+			    VM_FAULT_HWPOISON_LARGE)) {
 		if (!user_mode(regs))
 			handle_fault_error_nolock(regs, 0);
 		else
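
To see what the one-line change to huge_pte_none_mostly() buys: an
entry carrying a marker such as PTE_MARKER_POISONED is "mostly none" -
there is no page behind it, so callers looking for an empty slot treat
it as empty - but it is not truly none, so the fault path can still
find the marker and deliver SIGBUS. A toy user-space model of just this
distinction (the one-bit marker encoding is invented for the sketch and
has nothing to do with the real s390 layout):

  #include <assert.h>
  #include <stdbool.h>
  #include <stdint.h>

  typedef struct { uint64_t val; } pte_t;	/* toy model only */

  #define TOY_MARKER_BIT	0x1UL		/* invented encoding */

  static bool huge_pte_none(pte_t pte)	{ return pte.val == 0; }
  static bool is_pte_marker(pte_t pte)	{ return pte.val & TOY_MARKER_BIT; }

  static bool huge_pte_none_mostly(pte_t pte)
  {
  	return huge_pte_none(pte) || is_pte_marker(pte);
  }

  int main(void)
  {
  	pte_t empty = { 0 }, poisoned = { TOY_MARKER_BIT };

  	assert(huge_pte_none_mostly(empty));
  	assert(huge_pte_none_mostly(poisoned));	/* no page to map in... */
  	assert(!huge_pte_none(poisoned));	/* ...but not truly empty */
  	return 0;
  }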
From adb44a4bfc8a3312d2a13cc09d2b89d5828e7702 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Thu, 28 Nov 2024 09:43:29 +0100
Subject: [PATCH 18/27] s390/mm/hugetlbfs: Add missing includes

Add missing includes to fix this randconfig compile error:

All errors (new ones prefixed by >>):

   In file included from mm/pagewalk.c:5:
   In file included from include/linux/hugetlb.h:798:
>> arch/s390/include/asm/hugetlb.h:94:31: error: call to undeclared function 'is_pte_marker'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
      94 |         return huge_pte_none(pte) || is_pte_marker(pte);
         |                                      ^

Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202411281002.IPkRpIcR-lkp@intel.com/
Fixes: 487ef5d4d912 ("s390/mm: Add PTE_MARKER support for hugetlbfs mappings")
Signed-off-by: Heiko Carstens
---
 arch/s390/include/asm/hugetlb.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 0f621c99b216..5bdb35991ecb 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -10,6 +10,8 @@
 #define _ASM_S390_HUGETLB_H
 
 #include <linux/pgtable.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 #include <asm/page.h>
 
 #define hugetlb_free_pgd_range			free_pgd_range
From 48796104c864cf4dafa80bd8c2ce88f9c92a65ea Mon Sep 17 00:00:00 2001
From: Niklas Schnelle
Date: Mon, 25 Nov 2024 10:35:10 +0100
Subject: [PATCH 19/27] s390/pci: Fix leak of struct zpci_dev when zpci_add_device() fails

Prior to commit 0467cdde8c43 ("s390/pci: Sort PCI functions prior to
creating virtual busses") the IOMMU was initialized and the device was
registered as part of zpci_create_device(), with the struct zpci_dev
freed if either resulted in an error. With that commit this was moved
into a separate function called zpci_add_device().

While this new function logs when adding failed, it expects the caller
to not use, and to free, the struct zpci_dev on error. This difference
between it and zpci_create_device() was missed while changing the
callers, and the incompletely initialized struct zpci_dev may get used
in zpci_scan_configured_device() in the error path. This then leads to
a crash due to the device not being registered with the zbus. It was
also not freed in this case.

Fix this by handling the error return of zpci_add_device(). Since in
this case the zdev was not added to the zpci_list it can simply be
discarded and freed. Also make this more explicit by moving the
kref_init() into zpci_add_device() and document that
zpci_zdev_get()/zpci_zdev_put() must be used after adding.

Cc: stable@vger.kernel.org
Fixes: 0467cdde8c43 ("s390/pci: Sort PCI functions prior to creating virtual busses")
Reviewed-by: Gerd Bayer
Reviewed-by: Matthew Rosato
Signed-off-by: Niklas Schnelle
Signed-off-by: Heiko Carstens
---
 arch/s390/pci/pci.c       | 21 +++++++++++++++++----
 arch/s390/pci/pci_event.c | 10 ++++++++--
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index b7efa96776ea..6a011d040dfe 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -776,8 +776,9 @@ int zpci_hot_reset_device(struct zpci_dev *zdev)
  * @fh: Current Function Handle of the device to be created
  * @state: Initial state after creation either Standby or Configured
  *
- * Creates a new zpci device and adds it to its, possibly newly created, zbus
- * as well as zpci_list.
+ * Allocates a new struct zpci_dev and queries the platform for its details.
+ * If successful the device can subsequently be added to the zPCI subsystem
+ * using zpci_add_device().
  *
  * Returns: the zdev on success or an error pointer otherwise
  */
@@ -800,7 +801,6 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
 		goto error;
 	zdev->state = state;
 
-	kref_init(&zdev->kref);
 	mutex_init(&zdev->state_lock);
 	mutex_init(&zdev->fmb_lock);
 	mutex_init(&zdev->kzdev_lock);
@@ -813,6 +813,17 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
 	return ERR_PTR(rc);
 }
 
+/**
+ * zpci_add_device() - Add a previously created zPCI device to the zPCI subsystem
+ * @zdev: The zPCI device to be added
+ *
+ * A struct zpci_dev is added to the zPCI subsystem and to a virtual PCI bus creating
+ * a new one as necessary. A hotplug slot is created and events start to be handled.
+ * If successful from this point on zpci_zdev_get() and zpci_zdev_put() must be used.
+ * If adding the struct zpci_dev fails the device was not added and should be freed.
+ *
+ * Return: 0 on success, or an error code otherwise
+ */
 int zpci_add_device(struct zpci_dev *zdev)
 {
 	int rc;
@@ -826,6 +837,7 @@ int zpci_add_device(struct zpci_dev *zdev)
 	if (rc)
 		goto error_destroy_iommu;
 
+	kref_init(&zdev->kref);
 	spin_lock(&zpci_list_lock);
 	list_add_tail(&zdev->entry, &zpci_list);
 	spin_unlock(&zpci_list_lock);
@@ -1118,7 +1130,8 @@ static void zpci_add_devices(struct list_head *scan_list)
 	list_sort(NULL, scan_list, &zpci_cmp_rid);
 	list_for_each_entry_safe(zdev, tmp, scan_list, entry) {
 		list_del_init(&zdev->entry);
-		zpci_add_device(zdev);
+		if (zpci_add_device(zdev))
+			kfree(zdev);
 	}
 }
 
diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
index 47f934f4e828..7f7b732b3f3e 100644
--- a/arch/s390/pci/pci_event.c
+++ b/arch/s390/pci/pci_event.c
@@ -340,7 +340,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 			zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_CONFIGURED);
 			if (IS_ERR(zdev))
 				break;
-			zpci_add_device(zdev);
+			if (zpci_add_device(zdev)) {
+				kfree(zdev);
+				break;
+			}
 		} else {
 			/* the configuration request may be stale */
 			if (zdev->state != ZPCI_FN_STATE_STANDBY)
@@ -354,7 +357,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 			zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY);
 			if (IS_ERR(zdev))
 				break;
-			zpci_add_device(zdev);
+			if (zpci_add_device(zdev)) {
+				kfree(zdev);
+				break;
+			}
 		} else {
 			zpci_update_fh(zdev, ccdf->fh);
 		}
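
The ownership rule the patch documents can be summed up in a caller
sketch (hypothetical function and error value, written purely for
illustration; the real callers are in the hunks above):

  /* Hypothetical consumer of zpci_create_device()/zpci_add_device() */
  static int example_bring_up(u32 fid, u32 fh)
  {
  	struct zpci_dev *zdev;

  	zdev = zpci_create_device(fid, fh, ZPCI_FN_STATE_STANDBY);
  	if (IS_ERR(zdev))
  		return PTR_ERR(zdev);

  	if (zpci_add_device(zdev)) {
  		/* not added: no kref taken, not on zpci_list, plain kfree() */
  		kfree(zdev);
  		return -EIO;	/* arbitrary error code for the sketch */
  	}

  	/* added: reference counted, use zpci_zdev_get()/zpci_zdev_put() */
  	return 0;
  }

Moving kref_init() into zpci_add_device() is what makes the kfree() in
the error branch obviously correct: before a successful add there is no
reference count to violate.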
From c4a585e952ca403a370586d3f16e8331a7564901 Mon Sep 17 00:00:00 2001
From: Niklas Schnelle
Date: Mon, 25 Nov 2024 16:02:38 +0100
Subject: [PATCH 20/27] s390/pci: Fix potential double remove of hotplug slot

In commit 6ee600bfbe0f ("s390/pci: remove hotplug slot when releasing
the device") the zpci_exit_slot() was moved from zpci_device_reserved()
to zpci_release_device(), with the intention of keeping the hotplug
slot around until the device is actually removed.

Now zpci_release_device() is only called once all references are
dropped. Since the zPCI subsystem only drops its reference once the
device is in the reserved state, it follows that zpci_release_device()
must only deal with devices in the reserved state. Despite that, it
contains code to tear down from both configured and standby state. For
the standby case this already includes the removal of the hotplug
slot, so it would cause a double removal if a device were ever removed
in either configured or standby state.

Instead of causing a potential double removal in a case that should
never happen, explicitly WARN_ON() if a device in non-reserved state
is released, and get rid of the dead code cases.

Fixes: 6ee600bfbe0f ("s390/pci: remove hotplug slot when releasing the device")
Reviewed-by: Matthew Rosato
Reviewed-by: Gerd Bayer
Tested-by: Gerd Bayer
Signed-off-by: Niklas Schnelle
Signed-off-by: Heiko Carstens
---
 arch/s390/pci/pci.c | 34 +++++++++-------------------------
 1 file changed, 9 insertions(+), 25 deletions(-)

diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 6a011d040dfe..7aeab522f28d 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -937,10 +937,8 @@ void zpci_device_reserved(struct zpci_dev *zdev)
 void zpci_release_device(struct kref *kref)
 {
 	struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref);
-	int ret;
 
-	if (zdev->has_hp_slot)
-		zpci_exit_slot(zdev);
+	WARN_ON(zdev->state != ZPCI_FN_STATE_RESERVED);
 
 	if (zdev->zbus->bus)
 		zpci_bus_remove_device(zdev, false);
@@ -948,28 +946,14 @@ void zpci_release_device(struct kref *kref)
 	if (zdev_enabled(zdev))
 		zpci_disable_device(zdev);
 
-	switch (zdev->state) {
-	case ZPCI_FN_STATE_CONFIGURED:
-		ret = sclp_pci_deconfigure(zdev->fid);
-		zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, ret);
-		fallthrough;
-	case ZPCI_FN_STATE_STANDBY:
-		if (zdev->has_hp_slot)
-			zpci_exit_slot(zdev);
-		spin_lock(&zpci_list_lock);
-		list_del(&zdev->entry);
-		spin_unlock(&zpci_list_lock);
-		zpci_dbg(3, "rsv fid:%x\n", zdev->fid);
-		fallthrough;
-	case ZPCI_FN_STATE_RESERVED:
-		if (zdev->has_resources)
-			zpci_cleanup_bus_resources(zdev);
-		zpci_bus_device_unregister(zdev);
-		zpci_destroy_iommu(zdev);
-		fallthrough;
-	default:
-		break;
-	}
+	if (zdev->has_hp_slot)
+		zpci_exit_slot(zdev);
+
+	if (zdev->has_resources)
+		zpci_cleanup_bus_resources(zdev);
+
+	zpci_bus_device_unregister(zdev);
+	zpci_destroy_iommu(zdev);
 	zpci_dbg(3, "rem fid:%x\n", zdev->fid);
 	kfree_rcu(zdev, rcu);
 }
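
In miniature, the result is the usual kref pattern plus a state
invariant (generic sketch of the pattern, not the patched function
itself): the subsystem drops its own reference only once a device
reaches the reserved state, so the release callback may assert that
state instead of re-implementing teardown for states that can no
longer reach it.

  static void example_release(struct kref *kref)
  {
  	struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref);

  	/* the last reference can only be dropped in reserved state */
  	WARN_ON(zdev->state != ZPCI_FN_STATE_RESERVED);
  	/* tear down only what a reserved device still owns */
  }

  static void example_put(struct zpci_dev *zdev)
  {
  	kref_put(&zdev->kref, example_release);
  }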
From b5f463486b212c56d837c2592d87de7fb4833662 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Wed, 27 Nov 2024 17:17:12 +0100
Subject: [PATCH 21/27] s390: Support PREEMPT_DYNAMIC

Select HAVE_PREEMPT_DYNAMIC_KEY and add the pieces which are required
to support PREEMPT_DYNAMIC.

See commit 99cf983cc8bc ("sched/preempt: Add PREEMPT_DYNAMIC using
static keys") and commit 1b2d3451ee50 ("arm64: Support
PREEMPT_DYNAMIC") for more details.

Signed-off-by: Heiko Carstens
---
 arch/s390/Kconfig               |  1 +
 arch/s390/include/asm/preempt.h | 22 ++++++++++++++++++----
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b7e1eec7903e..da77f876ce5b 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -219,6 +219,7 @@ config S390
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
+	select HAVE_PREEMPT_DYNAMIC_KEY
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE
 	select HAVE_RETHOOK
diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h
index 0cde7e240373..2c29bdf12127 100644
--- a/arch/s390/include/asm/preempt.h
+++ b/arch/s390/include/asm/preempt.h
@@ -130,10 +130,24 @@ static __always_inline bool should_resched(int preempt_offset)
 #define init_idle_preempt_count(p, cpu)	do { } while (0)
 
 #ifdef CONFIG_PREEMPTION
-extern void preempt_schedule(void);
-#define __preempt_schedule() preempt_schedule()
-extern void preempt_schedule_notrace(void);
-#define __preempt_schedule_notrace() preempt_schedule_notrace()
+
+void preempt_schedule(void);
+void preempt_schedule_notrace(void);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+void dynamic_preempt_schedule(void);
+void dynamic_preempt_schedule_notrace(void);
+#define __preempt_schedule()		dynamic_preempt_schedule()
+#define __preempt_schedule_notrace()	dynamic_preempt_schedule_notrace()
+
+#else /* CONFIG_PREEMPT_DYNAMIC */
+
+#define __preempt_schedule()		preempt_schedule()
+#define __preempt_schedule_notrace()	preempt_schedule_notrace()
+
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
 #endif /* CONFIG_PREEMPTION */
 
 #endif /* __ASM_PREEMPT_H */
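
For background, HAVE_PREEMPT_DYNAMIC_KEY selects the static-key flavor
of PREEMPT_DYNAMIC: the dynamic_* entry points referenced in the
preempt.h hunk are provided by generic scheduler code, roughly along
these lines (sketch modeled on kernel/sched/core.c as introduced by
commit 99cf983cc8bc; details may differ):

  static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);

  void __sched notrace dynamic_preempt_schedule(void)
  {
  	/* the key is flipped at boot via the "preempt=" parameter,
  	 * turning this into a near-free branch for none/voluntary */
  	if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
  		return;
  	preempt_schedule();
  }

So all the architecture has to supply is the routing above, which is
exactly what the __preempt_schedule() macros do.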
From 78486ed9e76b72d81e7bb142adb47194f95e188d Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Tue, 26 Nov 2024 14:28:23 +0100
Subject: [PATCH 22/27] s390/spinlock: Use symbolic names in inline assemblies

Use symbolic names for the inline assembly operands to improve
readability.

Signed-off-by: Heiko Carstens
---
 arch/s390/include/asm/spinlock.h |  7 ++++---
 arch/s390/lib/spinlock.c         | 10 +++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index ac868a9bb0d1..f2722775baf6 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -82,9 +82,10 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp)
 	kcsan_release();
 	asm_inline volatile(
 		ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", ALT_FACILITY(49)) /* NIAI 7 */
-		"	sth	%1,%0\n"
-		: "=R" (((unsigned short *) &lp->lock)[1])
-		: "d" (0) : "cc", "memory");
+		"	sth	%[zero],%[lock]\n"
+		: [lock] "=R" (((unsigned short *)&lp->lock)[1])
+		: [zero] "d" (0)
+		: "cc", "memory");
 }
 
 /*
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index 09d735010ee1..255c0a8202e7 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -76,8 +76,8 @@ static inline int arch_load_niai4(int *lock)
 
 	asm_inline volatile(
 		ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", ALT_FACILITY(49)) /* NIAI 4 */
-		"	l	%0,%1\n"
-		: "=d" (owner) : "Q" (*lock) : "memory");
+		"	l	%[owner],%[lock]\n"
+		: [owner] "=d" (owner) : [lock] "Q" (*lock) : "memory");
 	return owner;
 }
 
@@ -87,9 +87,9 @@ static inline int arch_cmpxchg_niai8(int *lock, int old, int new)
 
 	asm_inline volatile(
 		ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", ALT_FACILITY(49)) /* NIAI 8 */
-		"	cs	%0,%3,%1\n"
-		: "=d" (old), "=Q" (*lock)
-		: "0" (old), "d" (new), "Q" (*lock)
+		"	cs	%[old],%[new],%[lock]\n"
+		: [old] "+d" (old), [lock] "+Q" (*lock)
+		: [new] "d" (new)
 		: "cc", "memory");
 	return expected == old;
 }
From 2c3bc137f1e339c4fa9485ec4028433b8cb7374b Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Tue, 26 Nov 2024 14:28:24 +0100
Subject: [PATCH 23/27] s390/spinlock: Remove condition code clobber from arch_spin_unlock()

Neither of the two instructions in arch_spin_unlock() clobbers the
condition code. Therefore remove the condition code clobber from the
inline assembly.

Signed-off-by: Heiko Carstens
---
 arch/s390/include/asm/spinlock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index f2722775baf6..9601e27d3291 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -85,7 +85,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp)
 		"	sth	%[zero],%[lock]\n"
 		: [lock] "=R" (((unsigned short *)&lp->lock)[1])
 		: [zero] "d" (0)
-		: "cc", "memory");
+		: "memory");
 }
 
 /*
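
The two spinlock cleanups above follow general GNU C inline-assembly
rules rather than anything s390-specific: %[name] in the template
refers to the operand declared as [name] "constraint" (expression),
and "cc" belongs in the clobber list only when an instruction in the
template actually modifies the condition code. A stand-alone sketch
(illustrative s390 instructions; ahi does set the condition code,
hence the clobber here):

  static inline int add_one(int x)
  {
  	int res;

  	asm("lr	%[out],%[in]\n"		/* copy x into the result reg */
  	    "ahi	%[out],1\n"	/* add 1; this sets the cc */
  	    : [out] "=d" (res)
  	    : [in] "d" (x)
  	    : "cc");
  	return res;
  }

Had the template used only instructions that leave the condition code
alone, listing "cc" would merely inhibit the compiler from keeping a
live condition code across the statement - which is exactly the waste
patch 23 removes from arch_spin_unlock().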
From 1200f216a3043ad78e89ce1f573fe6d62ed5e5d0 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Tue, 26 Nov 2024 14:28:25 +0100
Subject: [PATCH 24/27] s390/spinlock: Generate shorter code for arch_spin_unlock()

Use mvhhi instead of sth to write a zero to spinlocks. Compared to the
sth variant this avoids loading zero into a register first, and thus
reduces register pressure.

Signed-off-by: Heiko Carstens
---
 arch/s390/include/asm/spinlock.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 9601e27d3291..f87dd0a84855 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -82,9 +82,9 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp)
 	kcsan_release();
 	asm_inline volatile(
 		ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", ALT_FACILITY(49)) /* NIAI 7 */
-		"	sth	%[zero],%[lock]\n"
-		: [lock] "=R" (((unsigned short *)&lp->lock)[1])
-		: [zero] "d" (0)
+		"	mvhhi	%[lock],0\n"
+		: [lock] "=Q" (((unsigned short *)&lp->lock)[1])
+		:
 		: "memory");
 }
From 84ac96587b2a7a27d2aba250009c45dffb8ab4b6 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Tue, 26 Nov 2024 14:28:26 +0100
Subject: [PATCH 25/27] s390/spinlock: Use R constraint for arch_load_niai4()

The load instruction used within arch_load_niai4() supports a short
displacement and an index register. Therefore use the R constraint to
reflect this; the previously used Q constraint does not allow for an
index register.

Signed-off-by: Heiko Carstens
---
 arch/s390/lib/spinlock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index 255c0a8202e7..c27c0f2a8018 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -77,7 +77,7 @@ static inline int arch_load_niai4(int *lock)
 	asm_inline volatile(
 		ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", ALT_FACILITY(49)) /* NIAI 4 */
 		"	l	%[owner],%[lock]\n"
-		: [owner] "=d" (owner) : [lock] "Q" (*lock) : "memory");
+		: [owner] "=d" (owner) : [lock] "R" (*lock) : "memory");
 	return owner;
 }
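
The constraint letters involved are GCC machine constraints for s390:
"Q" describes a memory operand with a base register and short
displacement only, while "R" additionally allows an index register. A
hypothetical example of the difference (assumed illustration, not from
the patch): with "R" the compiler may keep an array index in an index
register, whereas "Q" forces it to fold the index into a single base
register first.

  static inline int load_elem(int *base, long idx)
  {
  	int v;

  	/* "R" permits addressing like "l %r1,0(%r2,%r3)"; with "Q"
  	 * the compiler would first compute base+idx into one register */
  	asm("l	%[v],%[mem]"
  	    : [v] "=d" (v)
  	    : [mem] "R" (base[idx]));
  	return v;
  }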
From 889221c4d78d754023bec8edec13d11a13b3fb51 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Tue, 26 Nov 2024 14:28:27 +0100
Subject: [PATCH 26/27] s390/spinlock: Use flag output constraint for arch_cmpxchg_niai8()

Add a new variant of arch_cmpxchg_niai8() which makes use of the flag
output constraint, which allows the compiler to generate slightly
better code.

Also rename arch_cmpxchg_niai8() to arch_try_cmpxchg_niai8(), which
reflects the purpose of the function and makes it consistent with
other "try" variants.

Signed-off-by: Heiko Carstens
---
 arch/s390/lib/spinlock.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index c27c0f2a8018..a81a01c44927 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -15,6 +15,7 @@
 #include <linux/percpu.h>
 #include <linux/io.h>
 #include <asm/alternative.h>
+#include <asm/asm.h>
 
 int spin_retry = -1;
 
@@ -81,7 +82,24 @@ static inline int arch_load_niai4(int *lock)
 	return owner;
 }
 
-static inline int arch_cmpxchg_niai8(int *lock, int old, int new)
+#ifdef __HAVE_ASM_FLAG_OUTPUTS__
+
+static inline int arch_try_cmpxchg_niai8(int *lock, int old, int new)
+{
+	int cc;
+
+	asm_inline volatile(
+		ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", ALT_FACILITY(49)) /* NIAI 8 */
+		"	cs	%[old],%[new],%[lock]\n"
+		: [old] "+d" (old), [lock] "+Q" (*lock), "=@cc" (cc)
+		: [new] "d" (new)
+		: "memory");
+	return cc == 0;
+}
+
+#else /* __HAVE_ASM_FLAG_OUTPUTS__ */
+
+static inline int arch_try_cmpxchg_niai8(int *lock, int old, int new)
 {
 	int expected = old;
 
@@ -94,6 +112,8 @@ static inline int arch_try_cmpxchg_niai8(int *lock, int old, int new)
 	return expected == old;
 }
 
+#endif /* __HAVE_ASM_FLAG_OUTPUTS__ */
+
 static inline struct spin_wait *arch_spin_decode_tail(int lock)
 {
 	int ix, cpu;
@@ -226,7 +246,7 @@ static inline void arch_spin_lock_classic(arch_spinlock_t *lp)
 		/* Try to get the lock if it is free. */
 		if (!owner) {
 			new = (old & _Q_TAIL_MASK) | lockval;
-			if (arch_cmpxchg_niai8(&lp->lock, old, new)) {
+			if (arch_try_cmpxchg_niai8(&lp->lock, old, new)) {
 				/* Got the lock */
 				return;
 			}
From cc00550b2ae7ab1c7c56669fc004a13d880aaf0a Mon Sep 17 00:00:00 2001
From: Vasily Gorbik
Date: Fri, 29 Nov 2024 01:07:01 +0100
Subject: [PATCH 27/27] Revert "s390/mm: Allow large pages for KASAN shadow mapping"

This reverts commit ff123eb7741638d55abf82fac090bb3a543c1e74.

Allowing large pages for KASAN shadow mappings isn't inherently wrong,
but adding POPULATE_KASAN_MAP_SHADOW to large_allowed() exposes an
issue in can_large_pud() and can_large_pmd().

Since commit d8073dc6bc04 ("s390/mm: Allow large pages only for
aligned physical addresses"), both can_large_pud() and can_large_pmd()
call _pa() to check if large page physical addresses are aligned.
However, _pa() has a side effect: it allocates memory in
POPULATE_KASAN_MAP_SHADOW mode. This results in massive memory leaks.

The proper fix would be to address the side effects of both
large_allowed() and _pa(), but for now, revert this change to avoid
the leaks.

Fixes: ff123eb77416 ("s390/mm: Allow large pages for KASAN shadow mapping")
Signed-off-by: Vasily Gorbik
Signed-off-by: Heiko Carstens
---
 arch/s390/boot/vmem.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c
index 8476b6f965a9..145035f84a0e 100644
--- a/arch/s390/boot/vmem.c
+++ b/arch/s390/boot/vmem.c
@@ -264,17 +264,7 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m
 
 static bool large_allowed(enum populate_mode mode)
 {
-	switch (mode) {
-	case POPULATE_DIRECT:
-	case POPULATE_IDENTITY:
-	case POPULATE_KERNEL:
-#ifdef CONFIG_KASAN
-	case POPULATE_KASAN_MAP_SHADOW:
-#endif
-		return true;
-	default:
-		return false;
-	}
+	return (mode == POPULATE_DIRECT) || (mode == POPULATE_IDENTITY) || (mode == POPULATE_KERNEL);
 }
 
 static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end,