linux-stable/arch/arm64/kernel/io.c
Jason Gunthorpe ead79118da arm64/io: Provide a WC friendly __iowriteXX_copy()
The kernel provides driver support for using write combining IO memory
through the __iowriteXX_copy() API which is commonly used as an optional
optimization to generate 16/32/64 byte MemWr TLPs in a PCIe environment.

iomap_copy.c provides a generic implementation as a simple 4/8 byte at a
time copy loop that has worked well with past ARM64 CPUs, giving a high
frequency of large TLPs being successfully formed.

However modern ARM64 CPUs are quite sensitive to how the write combining
CPU HW is operated and a compiler generated loop with intermixed
load/store is not sufficient to frequently generate a large TLP. The CPUs
would like to see the entire TLP generated by consecutive store
instructions from registers. Compilers like gcc tend to intermix loads and
stores and have poor code generation, in part, due to the ARM64 situation
that writeq() does not codegen anything other than "[xN]". However even
with that resolved compilers like clang still do not have good code
generation.

This means on modern ARM64 CPUs the rate at which __iowriteXX_copy()
successfully generates large TLPs is very small (less than 1 in 10,000)
tries), to the point that the use of WC is pointless.

Implement __iowrite32/64_copy() specifically for ARM64 and use inline
assembly to build consecutive blocks of STR instructions. Provide direct
support for 64/32/16 large TLP generation in this manner. Optimize for
common constant lengths so that the compiler can directly inline the store
blocks.

This brings the frequency of large TLP generation up to a high level that
is comparable with older CPU generations.

As the __iowriteXX_copy() family of APIs is intended for use with WC
incorporate the DGH hint directly into the function.

Link: https://lore.kernel.org/r/4-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arch@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2024-04-22 17:11:20 -03:00

140 lines
3.4 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Based on arch/arm/kernel/io.c
*
* Copyright (C) 2012 ARM Ltd.
*/
#include <linux/export.h>
#include <linux/types.h>
#include <linux/io.h>
/*
* Copy data from IO memory space to "real" memory space.
*/
void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
{
while (count && !IS_ALIGNED((unsigned long)from, 8)) {
*(u8 *)to = __raw_readb(from);
from++;
to++;
count--;
}
while (count >= 8) {
*(u64 *)to = __raw_readq(from);
from += 8;
to += 8;
count -= 8;
}
while (count) {
*(u8 *)to = __raw_readb(from);
from++;
to++;
count--;
}
}
EXPORT_SYMBOL(__memcpy_fromio);
/*
* This generates a memcpy that works on a from/to address which is aligned to
* bits. Count is in terms of the number of bits sized quantities to copy. It
* optimizes to use the STR groupings when possible so that it is WC friendly.
*/
#define memcpy_toio_aligned(to, from, count, bits) \
({ \
volatile u##bits __iomem *_to = to; \
const u##bits *_from = from; \
size_t _count = count; \
const u##bits *_end_from = _from + ALIGN_DOWN(_count, 8); \
\
for (; _from < _end_from; _from += 8, _to += 8) \
__const_memcpy_toio_aligned##bits(_to, _from, 8); \
if ((_count % 8) >= 4) { \
__const_memcpy_toio_aligned##bits(_to, _from, 4); \
_from += 4; \
_to += 4; \
} \
if ((_count % 4) >= 2) { \
__const_memcpy_toio_aligned##bits(_to, _from, 2); \
_from += 2; \
_to += 2; \
} \
if (_count % 2) \
__const_memcpy_toio_aligned##bits(_to, _from, 1); \
})
void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count)
{
memcpy_toio_aligned(to, from, count, 64);
dgh();
}
EXPORT_SYMBOL(__iowrite64_copy_full);
void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count)
{
memcpy_toio_aligned(to, from, count, 32);
dgh();
}
EXPORT_SYMBOL(__iowrite32_copy_full);
/*
* Copy data from "real" memory space to IO memory space.
*/
void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
{
while (count && !IS_ALIGNED((unsigned long)to, 8)) {
__raw_writeb(*(u8 *)from, to);
from++;
to++;
count--;
}
while (count >= 8) {
__raw_writeq(*(u64 *)from, to);
from += 8;
to += 8;
count -= 8;
}
while (count) {
__raw_writeb(*(u8 *)from, to);
from++;
to++;
count--;
}
}
EXPORT_SYMBOL(__memcpy_toio);
/*
* "memset" on IO memory space.
*/
void __memset_io(volatile void __iomem *dst, int c, size_t count)
{
u64 qc = (u8)c;
qc |= qc << 8;
qc |= qc << 16;
qc |= qc << 32;
while (count && !IS_ALIGNED((unsigned long)dst, 8)) {
__raw_writeb(c, dst);
dst++;
count--;
}
while (count >= 8) {
__raw_writeq(qc, dst);
dst += 8;
count -= 8;
}
while (count) {
__raw_writeb(c, dst);
dst++;
count--;
}
}
EXPORT_SYMBOL(__memset_io);