LoongArch: Use alternative to optimize libraries

Use the alternative to optimize common libraries according whether CPU
has UAL (hardware unaligned access support) feature, including memset(),
memcopy(), memmove(), copy_user() and clear_user().

We have tested UnixBench on a Loongson-3A5000 quad-core machine (1.6GHz):

1, One copy, before patch:

System Benchmarks Index Values               BASELINE       RESULT    INDEX
Dhrystone 2 using register variables         116700.0    9566582.0    819.8
Double-Precision Whetstone                       55.0       2805.3    510.1
Execl Throughput                                 43.0       2120.0    493.0
File Copy 1024 bufsize 2000 maxblocks          3960.0     209833.0    529.9
File Copy 256 bufsize 500 maxblocks            1655.0      89400.0    540.2
File Copy 4096 bufsize 8000 maxblocks          5800.0     320036.0    551.8
Pipe Throughput                               12440.0     340624.0    273.8
Pipe-based Context Switching                   4000.0     109939.1    274.8
Process Creation                                126.0       4728.7    375.3
Shell Scripts (1 concurrent)                     42.4       2223.1    524.3
Shell Scripts (8 concurrent)                      6.0        883.1   1471.9
System Call Overhead                          15000.0     518639.1    345.8
                                                                   ========
System Benchmarks Index Score                                         500.2

2, One copy, after patch:

System Benchmarks Index Values               BASELINE       RESULT    INDEX
Dhrystone 2 using register variables         116700.0    9567674.7    819.9
Double-Precision Whetstone                       55.0       2805.5    510.1
Execl Throughput                                 43.0       2392.7    556.4
File Copy 1024 bufsize 2000 maxblocks          3960.0     417804.0   1055.1
File Copy 256 bufsize 500 maxblocks            1655.0     112909.5    682.2
File Copy 4096 bufsize 8000 maxblocks          5800.0    1255207.4   2164.2
Pipe Throughput                               12440.0     555712.0    446.7
Pipe-based Context Switching                   4000.0      99964.5    249.9
Process Creation                                126.0       5192.5    412.1
Shell Scripts (1 concurrent)                     42.4       2302.4    543.0
Shell Scripts (8 concurrent)                      6.0        919.6   1532.6
System Call Overhead                          15000.0     511159.3    340.8
                                                                   ========
System Benchmarks Index Score                                         640.1

3, Four copies, before patch:

System Benchmarks Index Values               BASELINE       RESULT    INDEX
Dhrystone 2 using register variables         116700.0   38268610.5   3279.2
Double-Precision Whetstone                       55.0      11222.2   2040.4
Execl Throughput                                 43.0       7892.0   1835.3
File Copy 1024 bufsize 2000 maxblocks          3960.0     235149.6    593.8
File Copy 256 bufsize 500 maxblocks            1655.0      74959.6    452.9
File Copy 4096 bufsize 8000 maxblocks          5800.0     545048.5    939.7
Pipe Throughput                               12440.0    1337359.0   1075.0
Pipe-based Context Switching                   4000.0     473663.9   1184.2
Process Creation                                126.0      17491.2   1388.2
Shell Scripts (1 concurrent)                     42.4       6865.7   1619.3
Shell Scripts (8 concurrent)                      6.0       1015.9   1693.1
System Call Overhead                          15000.0    1899535.2   1266.4
                                                                   ========
System Benchmarks Index Score                                        1278.3

4, Four copies, after patch:

System Benchmarks Index Values               BASELINE       RESULT    INDEX
Dhrystone 2 using register variables         116700.0   38272815.5   3279.6
Double-Precision Whetstone                       55.0      11222.8   2040.5
Execl Throughput                                 43.0       8839.2   2055.6
File Copy 1024 bufsize 2000 maxblocks          3960.0     313912.9    792.7
File Copy 256 bufsize 500 maxblocks            1655.0      80976.1    489.3
File Copy 4096 bufsize 8000 maxblocks          5800.0    1176594.3   2028.6
Pipe Throughput                               12440.0    2100941.9   1688.9
Pipe-based Context Switching                   4000.0     476696.4   1191.7
Process Creation                                126.0      18394.7   1459.9
Shell Scripts (1 concurrent)                     42.4       7172.2   1691.6
Shell Scripts (8 concurrent)                      6.0       1058.3   1763.9
System Call Overhead                          15000.0    1874714.7   1249.8
                                                                   ========
System Benchmarks Index Score                                        1488.8

Signed-off-by: Jun Yi <yijun@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
This commit is contained in:
Huacai Chen 2022-12-10 22:39:59 +08:00
parent 19e5eb15b0
commit a275a82dcd
7 changed files with 465 additions and 11 deletions

View File

@ -5,8 +5,13 @@
#ifndef _ASM_STRING_H
#define _ASM_STRING_H
#define __HAVE_ARCH_MEMSET
extern void *memset(void *__s, int __c, size_t __count);
#define __HAVE_ARCH_MEMCPY
extern void *memcpy(void *__to, __const__ void *__from, size_t __n);
#define __HAVE_ARCH_MEMMOVE
extern void *memmove(void *__dest, __const__ void *__src, size_t __n);
#endif /* _ASM_STRING_H */

View File

@ -3,4 +3,5 @@
# Makefile for LoongArch-specific library files.
#
lib-y += delay.o clear_user.o copy_user.o dump_tlb.o unaligned.o
lib-y += delay.o memset.o memcpy.o memmove.o \
clear_user.o copy_user.o dump_tlb.o unaligned.o

View File

@ -3,25 +3,37 @@
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/asm-extable.h>
#include <asm/cpu.h>
#include <asm/export.h>
#include <asm/regdef.h>
.irp to, 0
.irp to, 0, 1, 2, 3, 4, 5, 6, 7
.L_fixup_handle_\to\():
addi.d a0, a1, (\to) * (-8)
jr ra
.endr
SYM_FUNC_START(__clear_user)
/*
* Some CPUs support hardware unaligned access
*/
ALTERNATIVE "b __clear_user_generic", \
"b __clear_user_fast", CPU_FEATURE_UAL
SYM_FUNC_END(__clear_user)
EXPORT_SYMBOL(__clear_user)
/*
* unsigned long __clear_user(void *addr, size_t size)
* unsigned long __clear_user_generic(void *addr, size_t size)
*
* a0: addr
* a1: size
*/
SYM_FUNC_START(__clear_user)
SYM_FUNC_START(__clear_user_generic)
beqz a1, 2f
1: st.b zero, a0, 0
@ -33,6 +45,54 @@ SYM_FUNC_START(__clear_user)
jr ra
_asm_extable 1b, .L_fixup_handle_0
SYM_FUNC_END(__clear_user)
SYM_FUNC_END(__clear_user_generic)
EXPORT_SYMBOL(__clear_user)
/*
* unsigned long __clear_user_fast(void *addr, unsigned long size)
*
* a0: addr
* a1: size
*/
SYM_FUNC_START(__clear_user_fast)
beqz a1, 10f
ori a2, zero, 64
blt a1, a2, 9f
/* set 64 bytes at a time */
1: st.d zero, a0, 0
2: st.d zero, a0, 8
3: st.d zero, a0, 16
4: st.d zero, a0, 24
5: st.d zero, a0, 32
6: st.d zero, a0, 40
7: st.d zero, a0, 48
8: st.d zero, a0, 56
addi.d a0, a0, 64
addi.d a1, a1, -64
bge a1, a2, 1b
beqz a1, 10f
/* set the remaining bytes */
9: st.b zero, a0, 0
addi.d a0, a0, 1
addi.d a1, a1, -1
bgt a1, zero, 9b
/* return */
10: move a0, a1
jr ra
/* fixup and ex_table */
_asm_extable 1b, .L_fixup_handle_0
_asm_extable 2b, .L_fixup_handle_1
_asm_extable 3b, .L_fixup_handle_2
_asm_extable 4b, .L_fixup_handle_3
_asm_extable 5b, .L_fixup_handle_4
_asm_extable 6b, .L_fixup_handle_5
_asm_extable 7b, .L_fixup_handle_6
_asm_extable 8b, .L_fixup_handle_7
_asm_extable 9b, .L_fixup_handle_0
SYM_FUNC_END(__clear_user_fast)

View File

@ -3,26 +3,38 @@
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/asm-extable.h>
#include <asm/cpu.h>
#include <asm/export.h>
#include <asm/regdef.h>
.irp to, 0
.irp to, 0, 1, 2, 3, 4, 5, 6, 7
.L_fixup_handle_\to\():
addi.d a0, a2, (\to) * (-8)
jr ra
.endr
SYM_FUNC_START(__copy_user)
/*
* Some CPUs support hardware unaligned access
*/
ALTERNATIVE "b __copy_user_generic", \
"b __copy_user_fast", CPU_FEATURE_UAL
SYM_FUNC_END(__copy_user)
EXPORT_SYMBOL(__copy_user)
/*
* unsigned long __copy_user(void *to, const void *from, size_t n)
* unsigned long __copy_user_generic(void *to, const void *from, size_t n)
*
* a0: to
* a1: from
* a2: n
*/
SYM_FUNC_START(__copy_user)
SYM_FUNC_START(__copy_user_generic)
beqz a2, 3f
1: ld.b t0, a1, 0
@ -37,6 +49,75 @@ SYM_FUNC_START(__copy_user)
_asm_extable 1b, .L_fixup_handle_0
_asm_extable 2b, .L_fixup_handle_0
SYM_FUNC_END(__copy_user)
SYM_FUNC_END(__copy_user_generic)
EXPORT_SYMBOL(__copy_user)
/*
* unsigned long __copy_user_fast(void *to, const void *from, unsigned long n)
*
* a0: to
* a1: from
* a2: n
*/
SYM_FUNC_START(__copy_user_fast)
beqz a2, 19f
ori a3, zero, 64
blt a2, a3, 17f
/* copy 64 bytes at a time */
1: ld.d t0, a1, 0
2: ld.d t1, a1, 8
3: ld.d t2, a1, 16
4: ld.d t3, a1, 24
5: ld.d t4, a1, 32
6: ld.d t5, a1, 40
7: ld.d t6, a1, 48
8: ld.d t7, a1, 56
9: st.d t0, a0, 0
10: st.d t1, a0, 8
11: st.d t2, a0, 16
12: st.d t3, a0, 24
13: st.d t4, a0, 32
14: st.d t5, a0, 40
15: st.d t6, a0, 48
16: st.d t7, a0, 56
addi.d a0, a0, 64
addi.d a1, a1, 64
addi.d a2, a2, -64
bge a2, a3, 1b
beqz a2, 19f
/* copy the remaining bytes */
17: ld.b t0, a1, 0
18: st.b t0, a0, 0
addi.d a0, a0, 1
addi.d a1, a1, 1
addi.d a2, a2, -1
bgt a2, zero, 17b
/* return */
19: move a0, a2
jr ra
/* fixup and ex_table */
_asm_extable 1b, .L_fixup_handle_0
_asm_extable 2b, .L_fixup_handle_1
_asm_extable 3b, .L_fixup_handle_2
_asm_extable 4b, .L_fixup_handle_3
_asm_extable 5b, .L_fixup_handle_4
_asm_extable 6b, .L_fixup_handle_5
_asm_extable 7b, .L_fixup_handle_6
_asm_extable 8b, .L_fixup_handle_7
_asm_extable 9b, .L_fixup_handle_0
_asm_extable 10b, .L_fixup_handle_1
_asm_extable 11b, .L_fixup_handle_2
_asm_extable 12b, .L_fixup_handle_3
_asm_extable 13b, .L_fixup_handle_4
_asm_extable 14b, .L_fixup_handle_5
_asm_extable 15b, .L_fixup_handle_6
_asm_extable 16b, .L_fixup_handle_7
_asm_extable 17b, .L_fixup_handle_0
_asm_extable 18b, .L_fixup_handle_0
SYM_FUNC_END(__copy_user_fast)

View File

@ -0,0 +1,95 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/cpu.h>
#include <asm/export.h>
#include <asm/regdef.h>
SYM_FUNC_START(memcpy)
/*
* Some CPUs support hardware unaligned access
*/
ALTERNATIVE "b __memcpy_generic", \
"b __memcpy_fast", CPU_FEATURE_UAL
SYM_FUNC_END(memcpy)
EXPORT_SYMBOL(memcpy)
/*
* void *__memcpy_generic(void *dst, const void *src, size_t n)
*
* a0: dst
* a1: src
* a2: n
*/
SYM_FUNC_START(__memcpy_generic)
move a3, a0
beqz a2, 2f
1: ld.b t0, a1, 0
st.b t0, a0, 0
addi.d a0, a0, 1
addi.d a1, a1, 1
addi.d a2, a2, -1
bgt a2, zero, 1b
2: move a0, a3
jr ra
SYM_FUNC_END(__memcpy_generic)
/*
* void *__memcpy_fast(void *dst, const void *src, size_t n)
*
* a0: dst
* a1: src
* a2: n
*/
SYM_FUNC_START(__memcpy_fast)
move a3, a0
beqz a2, 3f
ori a4, zero, 64
blt a2, a4, 2f
/* copy 64 bytes at a time */
1: ld.d t0, a1, 0
ld.d t1, a1, 8
ld.d t2, a1, 16
ld.d t3, a1, 24
ld.d t4, a1, 32
ld.d t5, a1, 40
ld.d t6, a1, 48
ld.d t7, a1, 56
st.d t0, a0, 0
st.d t1, a0, 8
st.d t2, a0, 16
st.d t3, a0, 24
st.d t4, a0, 32
st.d t5, a0, 40
st.d t6, a0, 48
st.d t7, a0, 56
addi.d a0, a0, 64
addi.d a1, a1, 64
addi.d a2, a2, -64
bge a2, a4, 1b
beqz a2, 3f
/* copy the remaining bytes */
2: ld.b t0, a1, 0
st.b t0, a0, 0
addi.d a0, a0, 1
addi.d a1, a1, 1
addi.d a2, a2, -1
bgt a2, zero, 2b
/* return */
3: move a0, a3
jr ra
SYM_FUNC_END(__memcpy_fast)

View File

@ -0,0 +1,121 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/cpu.h>
#include <asm/export.h>
#include <asm/regdef.h>
SYM_FUNC_START(memmove)
blt a0, a1, 1f /* dst < src, memcpy */
blt a1, a0, 3f /* src < dst, rmemcpy */
jr ra /* dst == src, return */
/* if (src - dst) < 64, copy 1 byte at a time */
1: ori a3, zero, 64
sub.d t0, a1, a0
blt t0, a3, 2f
b memcpy
2: b __memcpy_generic
/* if (dst - src) < 64, copy 1 byte at a time */
3: ori a3, zero, 64
sub.d t0, a0, a1
blt t0, a3, 4f
b rmemcpy
4: b __rmemcpy_generic
SYM_FUNC_END(memmove)
EXPORT_SYMBOL(memmove)
SYM_FUNC_START(rmemcpy)
/*
* Some CPUs support hardware unaligned access
*/
ALTERNATIVE "b __rmemcpy_generic", \
"b __rmemcpy_fast", CPU_FEATURE_UAL
SYM_FUNC_END(rmemcpy)
/*
* void *__rmemcpy_generic(void *dst, const void *src, size_t n)
*
* a0: dst
* a1: src
* a2: n
*/
SYM_FUNC_START(__rmemcpy_generic)
move a3, a0
beqz a2, 2f
add.d a0, a0, a2
add.d a1, a1, a2
1: ld.b t0, a1, -1
st.b t0, a0, -1
addi.d a0, a0, -1
addi.d a1, a1, -1
addi.d a2, a2, -1
bgt a2, zero, 1b
2: move a0, a3
jr ra
SYM_FUNC_END(__rmemcpy_generic)
/*
* void *__rmemcpy_fast(void *dst, const void *src, size_t n)
*
* a0: dst
* a1: src
* a2: n
*/
SYM_FUNC_START(__rmemcpy_fast)
move a3, a0
beqz a2, 3f
add.d a0, a0, a2
add.d a1, a1, a2
ori a4, zero, 64
blt a2, a4, 2f
/* copy 64 bytes at a time */
1: ld.d t0, a1, -8
ld.d t1, a1, -16
ld.d t2, a1, -24
ld.d t3, a1, -32
ld.d t4, a1, -40
ld.d t5, a1, -48
ld.d t6, a1, -56
ld.d t7, a1, -64
st.d t0, a0, -8
st.d t1, a0, -16
st.d t2, a0, -24
st.d t3, a0, -32
st.d t4, a0, -40
st.d t5, a0, -48
st.d t6, a0, -56
st.d t7, a0, -64
addi.d a0, a0, -64
addi.d a1, a1, -64
addi.d a2, a2, -64
bge a2, a4, 1b
beqz a2, 3f
/* copy the remaining bytes */
2: ld.b t0, a1, -1
st.b t0, a0, -1
addi.d a0, a0, -1
addi.d a1, a1, -1
addi.d a2, a2, -1
bgt a2, zero, 2b
/* return */
3: move a0, a3
jr ra
SYM_FUNC_END(__rmemcpy_fast)

View File

@ -0,0 +1,91 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/cpu.h>
#include <asm/export.h>
#include <asm/regdef.h>
.macro fill_to_64 r0
bstrins.d \r0, \r0, 15, 8
bstrins.d \r0, \r0, 31, 16
bstrins.d \r0, \r0, 63, 32
.endm
SYM_FUNC_START(memset)
/*
* Some CPUs support hardware unaligned access
*/
ALTERNATIVE "b __memset_generic", \
"b __memset_fast", CPU_FEATURE_UAL
SYM_FUNC_END(memset)
EXPORT_SYMBOL(memset)
/*
* void *__memset_generic(void *s, int c, size_t n)
*
* a0: s
* a1: c
* a2: n
*/
SYM_FUNC_START(__memset_generic)
move a3, a0
beqz a2, 2f
1: st.b a1, a0, 0
addi.d a0, a0, 1
addi.d a2, a2, -1
bgt a2, zero, 1b
2: move a0, a3
jr ra
SYM_FUNC_END(__memset_generic)
/*
* void *__memset_fast(void *s, int c, size_t n)
*
* a0: s
* a1: c
* a2: n
*/
SYM_FUNC_START(__memset_fast)
move a3, a0
beqz a2, 3f
ori a4, zero, 64
blt a2, a4, 2f
/* fill a1 to 64 bits */
fill_to_64 a1
/* set 64 bytes at a time */
1: st.d a1, a0, 0
st.d a1, a0, 8
st.d a1, a0, 16
st.d a1, a0, 24
st.d a1, a0, 32
st.d a1, a0, 40
st.d a1, a0, 48
st.d a1, a0, 56
addi.d a0, a0, 64
addi.d a2, a2, -64
bge a2, a4, 1b
beqz a2, 3f
/* set the remaining bytes */
2: st.b a1, a0, 0
addi.d a0, a0, 1
addi.d a2, a2, -1
bgt a2, zero, 2b
/* return */
3: move a0, a3
jr ra
SYM_FUNC_END(__memset_fast)