mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-07 21:53:44 +00:00
ead79118da
The kernel provides driver support for using write combining IO memory through the __iowriteXX_copy() API which is commonly used as an optional optimization to generate 16/32/64 byte MemWr TLPs in a PCIe environment. iomap_copy.c provides a generic implementation as a simple 4/8 byte at a time copy loop that has worked well with past ARM64 CPUs, giving a high frequency of large TLPs being successfully formed. However, modern ARM64 CPUs are quite sensitive to how the write combining CPU HW is operated and a compiler generated loop with intermixed load/store is not sufficient to frequently generate a large TLP. The CPUs would like to see the entire TLP generated by consecutive store instructions from registers. Compilers like gcc tend to intermix loads and stores and have poor code generation, in part, due to the ARM64 situation that writeq() does not codegen anything other than "[xN]". However, even with that resolved, compilers like clang still do not have good code generation. This means on modern ARM64 CPUs the rate at which __iowriteXX_copy() successfully generates large TLPs is very small (less than 1 in 10,000 tries), to the point that the use of WC is pointless. Implement __iowrite32/64_copy() specifically for ARM64 and use inline assembly to build consecutive blocks of STR instructions. Provide direct support for 64/32/16 large TLP generation in this manner. Optimize for common constant lengths so that the compiler can directly inline the store blocks. This brings the frequency of large TLP generation up to a high level that is comparable with older CPU generations. As the __iowriteXX_copy() family of APIs is intended for use with WC, incorporate the DGH hint directly into the function.
Link: https://lore.kernel.org/r/4-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com Cc: Arnd Bergmann <arnd@arndb.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Will Deacon <will@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: linux-arch@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
140 lines
3.4 KiB
C
140 lines
3.4 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Based on arch/arm/kernel/io.c
|
|
*
|
|
* Copyright (C) 2012 ARM Ltd.
|
|
*/
|
|
|
|
#include <linux/export.h>
|
|
#include <linux/types.h>
|
|
#include <linux/io.h>
|
|
|
|
/*
|
|
* Copy data from IO memory space to "real" memory space.
|
|
*/
|
|
void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
{
	const volatile u8 __iomem *src = from;
	u8 *dst = to;

	/* Head: byte reads until the IO source address is 8-byte aligned. */
	for (; count && !IS_ALIGNED((unsigned long)src, 8); count--) {
		*dst = __raw_readb(src);
		src++;
		dst++;
	}

	/* Body: one 64-bit IO read per iteration while >= 8 bytes remain. */
	for (; count >= 8; count -= 8) {
		*(u64 *)dst = __raw_readq(src);
		src += 8;
		dst += 8;
	}

	/* Tail: whatever is left (fewer than 8 bytes), one byte at a time. */
	for (; count; count--) {
		*dst = __raw_readb(src);
		src++;
		dst++;
	}
}
EXPORT_SYMBOL(__memcpy_fromio);
|
|
|
|
/*
|
|
* This generates a memcpy that works on a from/to address which is aligned to
|
|
* 'bits' bits. Count is the number of 'bits'-sized quantities to copy. It
|
|
* optimizes to use the STR groupings when possible so that it is WC friendly.
|
|
*/
|
|
#define memcpy_toio_aligned(to, from, count, bits)				\
	({									\
		/* Typed cursors: "+= k" advances by k u<bits> elements. */	\
		volatile u##bits __iomem *_dst = to;				\
		const u##bits *_src = from;					\
		size_t _n = count;						\
		/* End of the part copied in whole groups of 8 elements. */	\
		const u##bits *_src_stop = _src + ALIGN_DOWN(_n, 8);		\
										\
		while (_src < _src_stop) {					\
			__const_memcpy_toio_aligned##bits(_dst, _src, 8);	\
			_src += 8;						\
			_dst += 8;						\
		}								\
		/* Leftover 0-7 elements: at most one group of 4, 2 and 1. */	\
		if (_n & 4) {							\
			__const_memcpy_toio_aligned##bits(_dst, _src, 4);	\
			_src += 4;						\
			_dst += 4;						\
		}								\
		if (_n & 2) {							\
			__const_memcpy_toio_aligned##bits(_dst, _src, 2);	\
			_src += 2;						\
			_dst += 2;						\
		}								\
		if (_n & 1)							\
			__const_memcpy_toio_aligned##bits(_dst, _src, 1);	\
	})
|
|
|
|
void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count)
{
	/* count is a number of 64-bit quantities, not a byte length. */
	memcpy_toio_aligned(to, from, count, 64);

	/* Issue the DGH hint once the whole copy has been emitted. */
	dgh();
}
EXPORT_SYMBOL(__iowrite64_copy_full);
|
|
|
|
void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count)
{
	/* count is a number of 32-bit quantities, not a byte length. */
	memcpy_toio_aligned(to, from, count, 32);

	/* Issue the DGH hint once the whole copy has been emitted. */
	dgh();
}
EXPORT_SYMBOL(__iowrite32_copy_full);
|
|
|
|
/*
|
|
* Copy data from "real" memory space to IO memory space.
|
|
*/
|
|
void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
{
	volatile u8 __iomem *dst = to;
	const u8 *src = from;

	/* Head: byte writes until the IO destination is 8-byte aligned. */
	for (; count && !IS_ALIGNED((unsigned long)dst, 8); count--) {
		__raw_writeb(*src, dst);
		src++;
		dst++;
	}

	/* Body: one 64-bit IO write per iteration while >= 8 bytes remain. */
	for (; count >= 8; count -= 8) {
		__raw_writeq(*(const u64 *)src, dst);
		src += 8;
		dst += 8;
	}

	/* Tail: whatever is left (fewer than 8 bytes), one byte at a time. */
	for (; count; count--) {
		__raw_writeb(*src, dst);
		src++;
		dst++;
	}
}
EXPORT_SYMBOL(__memcpy_toio);
|
|
|
|
/*
|
|
* "memset" on IO memory space.
|
|
*/
|
|
void __memset_io(volatile void __iomem *dst, int c, size_t count)
{
	volatile u8 __iomem *p = dst;
	/* Low byte of c replicated into all eight bytes of a 64-bit word. */
	u64 pattern = (u8)c * 0x0101010101010101ULL;

	/* Head: byte writes until the IO destination is 8-byte aligned. */
	for (; count && !IS_ALIGNED((unsigned long)p, 8); count--) {
		__raw_writeb(c, p);
		p++;
	}

	/* Body: one 64-bit write of the pattern while >= 8 bytes remain. */
	for (; count >= 8; count -= 8) {
		__raw_writeq(pattern, p);
		p += 8;
	}

	/* Tail: whatever is left (fewer than 8 bytes), one byte at a time. */
	for (; count; count--) {
		__raw_writeb(c, p);
		p++;
	}
}
EXPORT_SYMBOL(__memset_io);
|