2020-09-22 15:34:22 +02:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
|
|
|
/*
|
|
|
|
* Copyright (C) 2018 Christoph Hellwig.
|
|
|
|
*
|
|
|
|
* DMA operations that map physical memory directly without using an IOMMU.
|
|
|
|
*/
|
|
|
|
#ifndef _KERNEL_DMA_DIRECT_H
|
|
|
|
#define _KERNEL_DMA_DIRECT_H
|
|
|
|
|
|
|
|
#include <linux/dma-direct.h>
|
2022-07-08 10:50:56 -06:00
|
|
|
#include <linux/memremap.h>
|
2020-09-22 15:34:22 +02:00
|
|
|
|
|
|
|
/*
 * Out-of-line entry points of the direct-mapping DMA code.  Only the
 * prototypes are present in this header; the definitions are not visible
 * from here, so the per-function notes below are assumptions to confirm.
 */
/* NOTE(review): presumably describes the buffer (cpu_addr/dma_addr/size) through *sgt; int status return - confirm. */
int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
|
|
|
|
void *cpu_addr, dma_addr_t dma_addr, size_t size,
|
|
|
|
unsigned long attrs);
|
|
|
|
/* NOTE(review): presumably reports whether dma_direct_mmap() is usable for dev - confirm. */
bool dma_direct_can_mmap(struct device *dev);
|
|
|
|
/* Takes the same (cpu_addr, dma_addr, size, attrs) tuple as dma_direct_get_sgtable(), plus the target vma. */
int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
|
|
|
|
void *cpu_addr, dma_addr_t dma_addr, size_t size,
|
|
|
|
unsigned long attrs);
|
|
|
|
/* NOTE(review): presumably tells whether dma_addr requires explicit sync calls for dev - confirm. */
bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr);
|
|
|
|
/* Scatterlist mapping entry point: nents entries starting at sgl; int return (meaning not visible here). */
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
|
|
|
|
enum dma_data_direction dir, unsigned long attrs);
|
dma-mapping: fix dma_addressing_limited() if dma_range_map can't cover all system RAM
There is an unusual case where the range map covers right up to the top
of system RAM but leaves a hole somewhere lower down. The phys_to_dma()
check then rejects DMA mappings for the NVMe device, which causes hangs
at boot.
E.g. on an Armv8 Ampere server, the DSDT ACPI table is:
Method (_DMA, 0, Serialized) // _DMA: Direct Memory Access
{
Name (RBUF, ResourceTemplate ()
{
QWordMemory (ResourceConsumer, PosDecode, MinFixed,
MaxFixed, Cacheable, ReadWrite,
0x0000000000000000, // Granularity
0x0000000000000000, // Range Minimum
0x00000000FFFFFFFF, // Range Maximum
0x0000000000000000, // Translation Offset
0x0000000100000000, // Length
,, , AddressRangeMemory, TypeStatic)
QWordMemory (ResourceConsumer, PosDecode, MinFixed,
MaxFixed, Cacheable, ReadWrite,
0x0000000000000000, // Granularity
0x0000006010200000, // Range Minimum
0x000000602FFFFFFF, // Range Maximum
0x0000000000000000, // Translation Offset
0x000000001FE00000, // Length
,, , AddressRangeMemory, TypeStatic)
QWordMemory (ResourceConsumer, PosDecode, MinFixed,
MaxFixed, Cacheable, ReadWrite,
0x0000000000000000, // Granularity
0x00000060F0000000, // Range Minimum
0x00000060FFFFFFFF, // Range Maximum
0x0000000000000000, // Translation Offset
0x0000000010000000, // Length
,, , AddressRangeMemory, TypeStatic)
QWordMemory (ResourceConsumer, PosDecode, MinFixed,
MaxFixed, Cacheable, ReadWrite,
0x0000000000000000, // Granularity
0x0000007000000000, // Range Minimum
0x000003FFFFFFFFFF, // Range Maximum
0x0000000000000000, // Translation Offset
0x0000039000000000, // Length
,, , AddressRangeMemory, TypeStatic)
})
But the System RAM ranges are:
cat /proc/iomem |grep -i ram
90000000-91ffffff : System RAM
92900000-fffbffff : System RAM
880000000-fffffffff : System RAM
8800000000-bff5990fff : System RAM
bff59d0000-bff5a4ffff : System RAM
bff8000000-bfffffffff : System RAM
So some RAM ranges are out of dma_range_map.
Fix it by checking whether each of the system RAM resources can be
properly encompassed within the dma_range_map.
Signed-off-by: Jia He <justin.he@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2023-10-28 10:20:59 +00:00
|
|
|
/*
 * Per the change that introduced it, this checks whether each System RAM
 * resource is encompassed by the device's dma_range_map, so that
 * dma_addressing_limited() copes with range maps that leave holes in RAM.
 * NOTE(review): definition not visible here - confirm the exact return
 * semantics against the implementation.
 */
bool dma_direct_all_ram_mapped(struct device *dev);
|
2020-09-22 15:34:22 +02:00
|
|
|
/* NOTE(review): presumably the largest single mapping size supported for dev - confirm. */
size_t dma_direct_max_mapping_size(struct device *dev);
|
|
|
|
|
|
|
|
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
|
|
|
|
defined(CONFIG_SWIOTLB)
|
|
|
|
void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
|
|
|
|
int nents, enum dma_data_direction dir);
|
|
|
|
#else
|
|
|
|
static inline void dma_direct_sync_sg_for_device(struct device *dev,
|
|
|
|
struct scatterlist *sgl, int nents, enum dma_data_direction dir)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
|
|
|
|
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
|
|
|
|
defined(CONFIG_SWIOTLB)
|
|
|
|
void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
|
|
|
|
int nents, enum dma_data_direction dir, unsigned long attrs);
|
|
|
|
void dma_direct_sync_sg_for_cpu(struct device *dev,
|
|
|
|
struct scatterlist *sgl, int nents, enum dma_data_direction dir);
|
|
|
|
#else
|
|
|
|
static inline void dma_direct_unmap_sg(struct device *dev,
|
|
|
|
struct scatterlist *sgl, int nents, enum dma_data_direction dir,
|
|
|
|
unsigned long attrs)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
|
|
|
|
struct scatterlist *sgl, int nents, enum dma_data_direction dir)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static inline void dma_direct_sync_single_for_device(struct device *dev,
|
|
|
|
dma_addr_t addr, size_t size, enum dma_data_direction dir)
|
|
|
|
{
|
|
|
|
phys_addr_t paddr = dma_to_phys(dev, addr);
|
|
|
|
|
swiotlb: reduce swiotlb pool lookups
With CONFIG_SWIOTLB_DYNAMIC enabled, each round-trip map/unmap pair
in the swiotlb results in 6 calls to swiotlb_find_pool(). In multiple
places, the pool is found and used in one function, and then must
be found again in the next function that is called because only the
tlb_addr is passed as an argument. These are the six call sites:
dma_direct_map_page:
1. swiotlb_map -> swiotlb_tbl_map_single -> swiotlb_bounce
dma_direct_unmap_page:
2. dma_direct_sync_single_for_cpu -> is_swiotlb_buffer
3. dma_direct_sync_single_for_cpu -> swiotlb_sync_single_for_cpu ->
swiotlb_bounce
4. is_swiotlb_buffer
5. swiotlb_tbl_unmap_single -> swiotlb_del_transient
6. swiotlb_tbl_unmap_single -> swiotlb_release_slots
Reduce the number of calls by finding the pool at a higher level, and
passing it as an argument instead of searching again. A key change is
for is_swiotlb_buffer() to return a pool pointer instead of a boolean,
and then pass this pool pointer to subsequent swiotlb functions.
There are 9 occurrences of is_swiotlb_buffer() used to test if a buffer
is a swiotlb buffer before calling a swiotlb function. To reduce code
duplication in getting the pool pointer and passing it as an argument,
introduce inline wrappers for this pattern. The generated code is
essentially unchanged.
Since is_swiotlb_buffer() no longer returns a boolean, rename some
functions to reflect the change:
* swiotlb_find_pool() becomes __swiotlb_find_pool()
* is_swiotlb_buffer() becomes swiotlb_find_pool()
* is_xen_swiotlb_buffer() becomes xen_swiotlb_find_pool()
With these changes, a round-trip map/unmap pair requires only 2 pool
lookups (listed using the new names and wrappers):
dma_direct_unmap_page:
1. dma_direct_sync_single_for_cpu -> swiotlb_find_pool
2. swiotlb_tbl_unmap_single -> swiotlb_find_pool
These changes come from noticing the inefficiencies in a code review,
not from performance measurements. With CONFIG_SWIOTLB_DYNAMIC,
__swiotlb_find_pool() is not trivial, and it uses an RCU read lock,
so avoiding the redundant calls helps performance in a hot path.
When CONFIG_SWIOTLB_DYNAMIC is *not* set, the code size reduction
is minimal and the perf benefits are likely negligible, but no
harm is done.
No functional change is intended.
Signed-off-by: Michael Kelley <mhklinux@outlook.com>
Reviewed-by: Petr Tesarik <petr@tesarici.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2024-07-08 12:41:00 -07:00
|
|
|
swiotlb_sync_single_for_device(dev, paddr, size, dir);
|
2020-09-22 15:34:22 +02:00
|
|
|
|
|
|
|
if (!dev_is_dma_coherent(dev))
|
|
|
|
arch_sync_dma_for_device(paddr, size, dir);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void dma_direct_sync_single_for_cpu(struct device *dev,
|
|
|
|
dma_addr_t addr, size_t size, enum dma_data_direction dir)
|
|
|
|
{
|
|
|
|
phys_addr_t paddr = dma_to_phys(dev, addr);
|
|
|
|
|
|
|
|
if (!dev_is_dma_coherent(dev)) {
|
|
|
|
arch_sync_dma_for_cpu(paddr, size, dir);
|
|
|
|
arch_sync_dma_for_cpu_all();
|
|
|
|
}
|
|
|
|
|
swiotlb: reduce swiotlb pool lookups
With CONFIG_SWIOTLB_DYNAMIC enabled, each round-trip map/unmap pair
in the swiotlb results in 6 calls to swiotlb_find_pool(). In multiple
places, the pool is found and used in one function, and then must
be found again in the next function that is called because only the
tlb_addr is passed as an argument. These are the six call sites:
dma_direct_map_page:
1. swiotlb_map -> swiotlb_tbl_map_single -> swiotlb_bounce
dma_direct_unmap_page:
2. dma_direct_sync_single_for_cpu -> is_swiotlb_buffer
3. dma_direct_sync_single_for_cpu -> swiotlb_sync_single_for_cpu ->
swiotlb_bounce
4. is_swiotlb_buffer
5. swiotlb_tbl_unmap_single -> swiotlb_del_transient
6. swiotlb_tbl_unmap_single -> swiotlb_release_slots
Reduce the number of calls by finding the pool at a higher level, and
passing it as an argument instead of searching again. A key change is
for is_swiotlb_buffer() to return a pool pointer instead of a boolean,
and then pass this pool pointer to subsequent swiotlb functions.
There are 9 occurrences of is_swiotlb_buffer() used to test if a buffer
is a swiotlb buffer before calling a swiotlb function. To reduce code
duplication in getting the pool pointer and passing it as an argument,
introduce inline wrappers for this pattern. The generated code is
essentially unchanged.
Since is_swiotlb_buffer() no longer returns a boolean, rename some
functions to reflect the change:
* swiotlb_find_pool() becomes __swiotlb_find_pool()
* is_swiotlb_buffer() becomes swiotlb_find_pool()
* is_xen_swiotlb_buffer() becomes xen_swiotlb_find_pool()
With these changes, a round-trip map/unmap pair requires only 2 pool
lookups (listed using the new names and wrappers):
dma_direct_unmap_page:
1. dma_direct_sync_single_for_cpu -> swiotlb_find_pool
2. swiotlb_tbl_unmap_single -> swiotlb_find_pool
These changes come from noticing the inefficiencies in a code review,
not from performance measurements. With CONFIG_SWIOTLB_DYNAMIC,
__swiotlb_find_pool() is not trivial, and it uses an RCU read lock,
so avoiding the redundant calls helps performance in a hot path.
When CONFIG_SWIOTLB_DYNAMIC is *not* set, the code size reduction
is minimal and the perf benefits are likely negligible, but no
harm is done.
No functional change is intended.
Signed-off-by: Michael Kelley <mhklinux@outlook.com>
Reviewed-by: Petr Tesarik <petr@tesarici.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2024-07-08 12:41:00 -07:00
|
|
|
swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
|
2020-09-22 15:34:22 +02:00
|
|
|
|
|
|
|
if (dir == DMA_FROM_DEVICE)
|
|
|
|
arch_dma_mark_clean(paddr, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline dma_addr_t dma_direct_map_page(struct device *dev,
|
|
|
|
struct page *page, unsigned long offset, size_t size,
|
|
|
|
enum dma_data_direction dir, unsigned long attrs)
|
|
|
|
{
|
|
|
|
phys_addr_t phys = page_to_phys(page) + offset;
|
|
|
|
dma_addr_t dma_addr = phys_to_dma(dev, phys);
|
|
|
|
|
2022-07-08 10:50:56 -06:00
|
|
|
if (is_swiotlb_force_bounce(dev)) {
|
|
|
|
if (is_pci_p2pdma_page(page))
|
|
|
|
return DMA_MAPPING_ERROR;
|
2020-09-22 15:34:22 +02:00
|
|
|
return swiotlb_map(dev, phys, size, dir, attrs);
|
2022-07-08 10:50:56 -06:00
|
|
|
}
|
2020-09-22 15:34:22 +02:00
|
|
|
|
2023-06-12 16:31:58 +01:00
|
|
|
if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
|
|
|
|
dma_kmalloc_needs_bounce(dev, size, dir)) {
|
2022-07-08 10:50:56 -06:00
|
|
|
if (is_pci_p2pdma_page(page))
|
|
|
|
return DMA_MAPPING_ERROR;
|
2022-02-14 09:44:32 +01:00
|
|
|
if (is_swiotlb_active(dev))
|
2020-09-22 15:34:22 +02:00
|
|
|
return swiotlb_map(dev, phys, size, dir, attrs);
|
|
|
|
|
|
|
|
dev_WARN_ONCE(dev, 1,
|
|
|
|
"DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
|
|
|
|
&dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
|
|
|
|
return DMA_MAPPING_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
|
|
|
|
arch_sync_dma_for_device(phys, size, dir);
|
|
|
|
return dma_addr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Undo a mapping set up by the direct-mapping code.
 *
 * @dev:   device the mapping belongs to
 * @addr:  DMA address of the mapping
 * @size:  length of the mapping in bytes
 * @dir:   DMA transfer direction
 * @attrs: DMA attributes; DMA_ATTR_SKIP_CPU_SYNC suppresses the CPU sync
 *
 * Unless the caller opted out, the region is first synced for the CPU.
 * swiotlb_tbl_unmap_single() is then called with DMA_ATTR_SKIP_CPU_SYNC
 * always set, since any wanted sync has already happened here.
 */
static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	const phys_addr_t paddr = dma_to_phys(dev, addr);
	const bool skip_cpu_sync = attrs & DMA_ATTR_SKIP_CPU_SYNC;

	if (!skip_cpu_sync)
		dma_direct_sync_single_for_cpu(dev, addr, size, dir);

	/* Force the skip flag so the callee does not sync a second time. */
	attrs |= DMA_ATTR_SKIP_CPU_SYNC;
	swiotlb_tbl_unmap_single(dev, paddr, size, dir, attrs);
}
|
|
|
|
#endif /* _KERNEL_DMA_DIRECT_H */
|