mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-16 01:54:00 +00:00
fe52c64943
One alternative to the fix Christian proposed in https://lore.kernel.org/dri-devel/20241024124159.4519-3-christian.koenig@amd.com/ is to replace the rather complex open coded sorting loops with the kernel standard sort followed by a context squashing pass. Proposed advantage of this would be readability but one concern Christian raised was that there could be many fences, that they are typically mostly sorted, and so the kernel's heap sort would be much worse by the proposed algorithm. I had a look running some games and vkcube to see what are the typical number of input fences. Tested scenarios: 1) Hogwarts Legacy under Gamescope 450 calls per second to __dma_fence_unwrap_merge. Percentages per number of fences buckets, before and after checking for signalled status, sorting and flattening: N Before After 0 0.91% 1 69.40% 2-3 28.72% 9.4% (90.6% resolved to one fence) 4-5 0.93% 6-9 0.03% 10+ 2) Cyberpunk 2077 under Gamescope 1050 calls per second, amounting to 0.01% CPU time according to perf top. N Before After 0 1.13% 1 52.30% 2-3 40.34% 55.57% 4-5 1.46% 0.50% 6-9 2.44% 10+ 2.34% 3) vkcube under Plasma 90 calls per second. N Before After 0 1 2-3 100% 0% (Ie. all resolved to a single fence) 4-5 6-9 10+ In the case of vkcube all invocations in the 2-3 bucket were actually just two input fences. From these numbers it looks like the heap sort should not be a disadvantage, given how the dominant case is <= 2 input fences which heap sort solves with just one compare and swap. (And for the case of one input fence we have a fast path in the previous patch.) A complementary possibility is to implement a different sorting algorithm under the same API as the kernel's sort() and so keep the simplicity, potentially moving the new sort under lib/ if it would be found more widely useful. v2: * Hold on to fence references and reduce commentary. (Christian) * Record and use latest signaled timestamp in the 2nd loop too. * Consolidate zero or one fences fast paths. v3: * Reverse the seqno sort order for a simpler squashing pass. (Christian) Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> Fixes: 245a4a7b531c ("dma-buf: generalize dma_fence unwrap & merging v3") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3617 Cc: Christian König <christian.koenig@amd.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: Gustavo Padovan <gustavo@padovan.org> Cc: Friedrich Vock <friedrich.vock@gmx.de> Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: <stable@vger.kernel.org> # v6.0+ Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241115102153.1980-3-tursulin@igalia.com
173 lines
4.1 KiB
C
173 lines
4.1 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* dma-fence-util: misc functions for dma_fence objects
|
|
*
|
|
* Copyright (C) 2022 Advanced Micro Devices, Inc.
|
|
* Authors:
|
|
* Christian König <christian.koenig@amd.com>
|
|
*/
|
|
|
|
#include <linux/dma-fence.h>
|
|
#include <linux/dma-fence-array.h>
|
|
#include <linux/dma-fence-chain.h>
|
|
#include <linux/dma-fence-unwrap.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/sort.h>
|
|
|
|
/* Internal helper to start new array iteration, don't use directly */
|
|
static struct dma_fence *
|
|
__dma_fence_unwrap_array(struct dma_fence_unwrap *cursor)
|
|
{
|
|
cursor->array = dma_fence_chain_contained(cursor->chain);
|
|
cursor->index = 0;
|
|
return dma_fence_array_first(cursor->array);
|
|
}
|
|
|
|
/**
|
|
* dma_fence_unwrap_first - return the first fence from fence containers
|
|
* @head: the entrypoint into the containers
|
|
* @cursor: current position inside the containers
|
|
*
|
|
* Unwraps potential dma_fence_chain/dma_fence_array containers and return the
|
|
* first fence.
|
|
*/
|
|
struct dma_fence *dma_fence_unwrap_first(struct dma_fence *head,
|
|
struct dma_fence_unwrap *cursor)
|
|
{
|
|
cursor->chain = dma_fence_get(head);
|
|
return __dma_fence_unwrap_array(cursor);
|
|
}
|
|
EXPORT_SYMBOL_GPL(dma_fence_unwrap_first);
|
|
|
|
/**
|
|
* dma_fence_unwrap_next - return the next fence from a fence containers
|
|
* @cursor: current position inside the containers
|
|
*
|
|
* Continue unwrapping the dma_fence_chain/dma_fence_array containers and return
|
|
* the next fence from them.
|
|
*/
|
|
struct dma_fence *dma_fence_unwrap_next(struct dma_fence_unwrap *cursor)
|
|
{
|
|
struct dma_fence *tmp;
|
|
|
|
++cursor->index;
|
|
tmp = dma_fence_array_next(cursor->array, cursor->index);
|
|
if (tmp)
|
|
return tmp;
|
|
|
|
cursor->chain = dma_fence_chain_walk(cursor->chain);
|
|
return __dma_fence_unwrap_array(cursor);
|
|
}
|
|
EXPORT_SYMBOL_GPL(dma_fence_unwrap_next);
|
|
|
|
|
|
static int fence_cmp(const void *_a, const void *_b)
|
|
{
|
|
struct dma_fence *a = *(struct dma_fence **)_a;
|
|
struct dma_fence *b = *(struct dma_fence **)_b;
|
|
|
|
if (a->context < b->context)
|
|
return -1;
|
|
else if (a->context > b->context)
|
|
return 1;
|
|
|
|
if (dma_fence_is_later(b, a))
|
|
return 1;
|
|
else if (dma_fence_is_later(a, b))
|
|
return -1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* Implementation for the dma_fence_merge() marco, don't use directly */
|
|
struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences,
|
|
struct dma_fence **fences,
|
|
struct dma_fence_unwrap *iter)
|
|
{
|
|
struct dma_fence_array *result;
|
|
struct dma_fence *tmp, **array;
|
|
ktime_t timestamp;
|
|
int i, j, count;
|
|
|
|
count = 0;
|
|
timestamp = ns_to_ktime(0);
|
|
for (i = 0; i < num_fences; ++i) {
|
|
dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) {
|
|
if (!dma_fence_is_signaled(tmp)) {
|
|
++count;
|
|
} else {
|
|
ktime_t t = dma_fence_timestamp(tmp);
|
|
|
|
if (ktime_after(t, timestamp))
|
|
timestamp = t;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If we couldn't find a pending fence just return a private signaled
|
|
* fence with the timestamp of the last signaled one.
|
|
*/
|
|
if (count == 0)
|
|
return dma_fence_allocate_private_stub(timestamp);
|
|
|
|
array = kmalloc_array(count, sizeof(*array), GFP_KERNEL);
|
|
if (!array)
|
|
return NULL;
|
|
|
|
count = 0;
|
|
for (i = 0; i < num_fences; ++i) {
|
|
dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) {
|
|
if (!dma_fence_is_signaled(tmp)) {
|
|
array[count++] = dma_fence_get(tmp);
|
|
} else {
|
|
ktime_t t = dma_fence_timestamp(tmp);
|
|
|
|
if (ktime_after(t, timestamp))
|
|
timestamp = t;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (count == 0 || count == 1)
|
|
goto return_fastpath;
|
|
|
|
sort(array, count, sizeof(*array), fence_cmp, NULL);
|
|
|
|
/*
|
|
* Only keep the most recent fence for each context.
|
|
*/
|
|
j = 0;
|
|
for (i = 1; i < count; i++) {
|
|
if (array[i]->context == array[j]->context)
|
|
dma_fence_put(array[i]);
|
|
else
|
|
array[++j] = array[i];
|
|
}
|
|
count = ++j;
|
|
|
|
if (count > 1) {
|
|
result = dma_fence_array_create(count, array,
|
|
dma_fence_context_alloc(1),
|
|
1, false);
|
|
if (!result) {
|
|
for (i = 0; i < count; i++)
|
|
dma_fence_put(array[i]);
|
|
tmp = NULL;
|
|
goto return_tmp;
|
|
}
|
|
return &result->base;
|
|
}
|
|
|
|
return_fastpath:
|
|
if (count == 0)
|
|
tmp = dma_fence_allocate_private_stub(timestamp);
|
|
else
|
|
tmp = array[0];
|
|
|
|
return_tmp:
|
|
kfree(array);
|
|
return tmp;
|
|
}
|
|
EXPORT_SYMBOL_GPL(__dma_fence_unwrap_merge);
|