diff --git a/Documentation/riscv/hwprobe.rst b/Documentation/riscv/hwprobe.rst index 20eff9650da9..a52996b22f75 100644 --- a/Documentation/riscv/hwprobe.rst +++ b/Documentation/riscv/hwprobe.rst @@ -87,13 +87,12 @@ The following keys are defined: emulated via software, either in or below the kernel. These accesses are always extremely slow. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_SLOW`: Misaligned accesses are supported - in hardware, but are slower than the corresponding aligned accesses - sequences. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SLOW`: Misaligned accesses are slower + than equivalent byte accesses. Misaligned accesses may be supported + directly in hardware, or trapped and emulated by software. - * :c:macro:`RISCV_HWPROBE_MISALIGNED_FAST`: Misaligned accesses are supported - in hardware and are faster than the corresponding aligned accesses - sequences. + * :c:macro:`RISCV_HWPROBE_MISALIGNED_FAST`: Misaligned accesses are faster + than equivalent byte accesses. * :c:macro:`RISCV_HWPROBE_MISALIGNED_UNSUPPORTED`: Misaligned accesses are not supported at all and will generate a misaligned address fault. diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index be84b14f0118..0554ed4bf087 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -120,11 +120,3 @@ void thead_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) local_flush_icache_all(); } - -void thead_feature_probe_func(unsigned int cpu, - unsigned long archid, - unsigned long impid) -{ - if ((archid == 0) && (impid == 0)) - per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST; -} diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h index 6a41537826a7..58ccd2f8cab7 100644 --- a/arch/riscv/include/asm/alternative.h +++ b/arch/riscv/include/asm/alternative.h @@ -30,7 +30,6 @@ #define ALT_OLD_PTR(a) __ALT_PTR(a, old_offset) #define ALT_ALT_PTR(a) __ALT_PTR(a, alt_offset) -void probe_vendor_features(unsigned int cpu); void __init apply_boot_alternatives(void); void __init apply_early_boot_alternatives(void); void apply_module_alternatives(void *start, size_t length); @@ -53,15 +52,11 @@ void thead_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, unsigned long archid, unsigned long impid, unsigned int stage); -void thead_feature_probe_func(unsigned int cpu, unsigned long archid, - unsigned long impid); - void riscv_cpufeature_patch_func(struct alt_entry *begin, struct alt_entry *end, unsigned int stage); #else /* CONFIG_RISCV_ALTERNATIVE */ -static inline void probe_vendor_features(unsigned int cpu) { } static inline void apply_boot_alternatives(void) { } static inline void apply_early_boot_alternatives(void) { } static inline void apply_module_alternatives(void *start, size_t length) { } diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index 23fed53b8815..d0345bd659c9 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -30,4 +30,6 @@ DECLARE_PER_CPU(long, misaligned_access_speed); /* Per-cpu ISA extensions. */ extern struct riscv_isainfo hart_isa[NR_CPUS]; +void check_unaligned_access(int cpu); + #endif diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 6ac56af42f4a..95cf25d48405 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -38,6 +38,7 @@ extra-y += vmlinux.lds obj-y += head.o obj-y += soc.o obj-$(CONFIG_RISCV_ALTERNATIVE) += alternative.o +obj-y += copy-unaligned.o obj-y += cpu.o obj-y += cpufeature.o obj-y += entry.o diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c index 6b75788c18e6..85056153fa23 100644 --- a/arch/riscv/kernel/alternative.c +++ b/arch/riscv/kernel/alternative.c @@ -27,8 +27,6 @@ struct cpu_manufacturer_info_t { void (*patch_func)(struct alt_entry *begin, struct alt_entry *end, unsigned long archid, unsigned long impid, unsigned int stage); - void (*feature_probe_func)(unsigned int cpu, unsigned long archid, - unsigned long impid); }; static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info) @@ -43,7 +41,6 @@ static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info cpu_mfr_info->imp_id = sbi_get_mimpid(); #endif - cpu_mfr_info->feature_probe_func = NULL; switch (cpu_mfr_info->vendor_id) { #ifdef CONFIG_ERRATA_SIFIVE case SIFIVE_VENDOR_ID: @@ -53,7 +50,6 @@ static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info #ifdef CONFIG_ERRATA_THEAD case THEAD_VENDOR_ID: cpu_mfr_info->patch_func = thead_errata_patch_func; - cpu_mfr_info->feature_probe_func = thead_feature_probe_func; break; #endif default: @@ -143,20 +139,6 @@ void riscv_alternative_fix_offsets(void *alt_ptr, unsigned int len, } } -/* Called on each CPU as it starts */ -void probe_vendor_features(unsigned int cpu) -{ - struct cpu_manufacturer_info_t cpu_mfr_info; - - riscv_fill_cpu_mfr_info(&cpu_mfr_info); - if (!cpu_mfr_info.feature_probe_func) - return; - - cpu_mfr_info.feature_probe_func(cpu, - cpu_mfr_info.arch_id, - cpu_mfr_info.imp_id); -} - /* * This is called very early in the boot process (directly after we run * a feature detect on the boot CPU). No need to worry about other CPUs @@ -211,7 +193,6 @@ void __init apply_boot_alternatives(void) /* If called on non-boot cpu things could go wrong */ WARN_ON(smp_processor_id() != 0); - probe_vendor_features(0); _apply_alternatives((struct alt_entry *)__alt_start, (struct alt_entry *)__alt_end, RISCV_ALTERNATIVES_BOOT); diff --git a/arch/riscv/kernel/copy-unaligned.S b/arch/riscv/kernel/copy-unaligned.S new file mode 100644 index 000000000000..cfdecfbaad62 --- /dev/null +++ b/arch/riscv/kernel/copy-unaligned.S @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2023 Rivos Inc. */ + +#include +#include + + .text + +/* void __riscv_copy_words_unaligned(void *, const void *, size_t) */ +/* Performs a memcpy without aligning buffers, using word loads and stores. */ +/* Note: The size is truncated to a multiple of 8 * SZREG */ +ENTRY(__riscv_copy_words_unaligned) + andi a4, a2, ~((8*SZREG)-1) + beqz a4, 2f + add a3, a1, a4 +1: + REG_L a4, 0(a1) + REG_L a5, SZREG(a1) + REG_L a6, 2*SZREG(a1) + REG_L a7, 3*SZREG(a1) + REG_L t0, 4*SZREG(a1) + REG_L t1, 5*SZREG(a1) + REG_L t2, 6*SZREG(a1) + REG_L t3, 7*SZREG(a1) + REG_S a4, 0(a0) + REG_S a5, SZREG(a0) + REG_S a6, 2*SZREG(a0) + REG_S a7, 3*SZREG(a0) + REG_S t0, 4*SZREG(a0) + REG_S t1, 5*SZREG(a0) + REG_S t2, 6*SZREG(a0) + REG_S t3, 7*SZREG(a0) + addi a0, a0, 8*SZREG + addi a1, a1, 8*SZREG + bltu a1, a3, 1b + +2: + ret +END(__riscv_copy_words_unaligned) + +/* void __riscv_copy_bytes_unaligned(void *, const void *, size_t) */ +/* Performs a memcpy without aligning buffers, using only byte accesses. */ +/* Note: The size is truncated to a multiple of 8 */ +ENTRY(__riscv_copy_bytes_unaligned) + andi a4, a2, ~(8-1) + beqz a4, 2f + add a3, a1, a4 +1: + lb a4, 0(a1) + lb a5, 1(a1) + lb a6, 2(a1) + lb a7, 3(a1) + lb t0, 4(a1) + lb t1, 5(a1) + lb t2, 6(a1) + lb t3, 7(a1) + sb a4, 0(a0) + sb a5, 1(a0) + sb a6, 2(a0) + sb a7, 3(a0) + sb t0, 4(a0) + sb t1, 5(a0) + sb t2, 6(a0) + sb t3, 7(a0) + addi a0, a0, 8 + addi a1, a1, 8 + bltu a1, a3, 1b + +2: + ret +END(__riscv_copy_bytes_unaligned) diff --git a/arch/riscv/kernel/copy-unaligned.h b/arch/riscv/kernel/copy-unaligned.h new file mode 100644 index 000000000000..e3d70d35b708 --- /dev/null +++ b/arch/riscv/kernel/copy-unaligned.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Rivos, Inc. + */ +#ifndef __RISCV_KERNEL_COPY_UNALIGNED_H +#define __RISCV_KERNEL_COPY_UNALIGNED_H + +#include + +void __riscv_copy_words_unaligned(void *dst, const void *src, size_t size); +void __riscv_copy_bytes_unaligned(void *dst, const void *src, size_t size); + +#endif /* __RISCV_KERNEL_COPY_UNALIGNED_H */ diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index ef7b4fd9e876..1cfbba65d11a 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -18,12 +18,19 @@ #include #include #include +#include #include #include #include +#include "copy-unaligned.h" + #define NUM_ALPHA_EXTS ('z' - 'a' + 1) +#define MISALIGNED_ACCESS_JIFFIES_LG2 1 +#define MISALIGNED_BUFFER_SIZE 0x4000 +#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) + unsigned long elf_hwcap __read_mostly; /* Host ISA bitmap */ @@ -549,6 +556,103 @@ unsigned long riscv_get_elf_hwcap(void) return hwcap; } +void check_unaligned_access(int cpu) +{ + u64 start_cycles, end_cycles; + u64 word_cycles; + u64 byte_cycles; + int ratio; + unsigned long start_jiffies, now; + struct page *page; + void *dst; + void *src; + long speed = RISCV_HWPROBE_MISALIGNED_SLOW; + + page = alloc_pages(GFP_NOWAIT, get_order(MISALIGNED_BUFFER_SIZE)); + if (!page) { + pr_warn("Can't alloc pages to measure memcpy performance"); + return; + } + + /* Make an unaligned destination buffer. */ + dst = (void *)((unsigned long)page_address(page) | 0x1); + /* Unalign src as well, but differently (off by 1 + 2 = 3). */ + src = dst + (MISALIGNED_BUFFER_SIZE / 2); + src += 2; + word_cycles = -1ULL; + /* Do a warmup. */ + __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); + preempt_disable(); + start_jiffies = jiffies; + while ((now = jiffies) == start_jiffies) + cpu_relax(); + + /* + * For a fixed amount of time, repeatedly try the function, and take + * the best time in cycles as the measurement. + */ + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { + start_cycles = get_cycles64(); + /* Ensure the CSR read can't reorder WRT to the copy. */ + mb(); + __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); + /* Ensure the copy ends before the end time is snapped. */ + mb(); + end_cycles = get_cycles64(); + if ((end_cycles - start_cycles) < word_cycles) + word_cycles = end_cycles - start_cycles; + } + + byte_cycles = -1ULL; + __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); + start_jiffies = jiffies; + while ((now = jiffies) == start_jiffies) + cpu_relax(); + + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { + start_cycles = get_cycles64(); + mb(); + __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); + mb(); + end_cycles = get_cycles64(); + if ((end_cycles - start_cycles) < byte_cycles) + byte_cycles = end_cycles - start_cycles; + } + + preempt_enable(); + + /* Don't divide by zero. */ + if (!word_cycles || !byte_cycles) { + pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n", + cpu); + + goto out; + } + + if (word_cycles < byte_cycles) + speed = RISCV_HWPROBE_MISALIGNED_FAST; + + ratio = div_u64((byte_cycles * 100), word_cycles); + pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n", + cpu, + ratio / 100, + ratio % 100, + (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow"); + + per_cpu(misaligned_access_speed, cpu) = speed; + +out: + __free_pages(page, get_order(MISALIGNED_BUFFER_SIZE)); +} + +static int check_unaligned_access_boot_cpu(void) +{ + check_unaligned_access(0); + return 0; +} + +arch_initcall(check_unaligned_access_boot_cpu); + #ifdef CONFIG_RISCV_ALTERNATIVE /* * Alternative patch sites consider 48 bits when determining when to patch diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index f4d6acb38dd0..1b8da4e40a4d 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -245,7 +246,7 @@ asmlinkage __visible void smp_callin(void) numa_add_cpu(curr_cpuid); set_cpu_online(curr_cpuid, 1); - probe_vendor_features(curr_cpuid); + check_unaligned_access(curr_cpuid); if (has_vector()) { if (riscv_v_setup_vsize())