From 85dbfe47d07cddeac959ccc9352c4b0f1683225b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?=
Date: Mon, 5 Jun 2023 15:04:54 +0200
Subject: [PATCH] drm/xe: Invalidate TLB also on bind if in scratch page mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For scratch table mode we need to cover the case where a scratch PTE
might have been pre-fetched and cached and used instead of that of the
newly bound vma.

For compute vms, invalidate the TLB globally using GuC before signalling
bind complete. For !long-running vms, invalidate the TLB at batch start.

Also document how TLB invalidation works.

v2:
- Fix a pointer to the comment about TLB invalidation (Jose Souza).
- Add a bool to the vm indicating whether we want to invalidate the TLB
  at batch start.
- Invalidate the TLB also on BCS and video engines at batch start where
  needed.
- Use the BIT() macro instead of an explicit shift.

Signed-off-by: Thomas Hellström
Tested-by: José Roberto de Souza #v1
Reported-by: José Roberto de Souza #v1
Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/291
Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/291
Acked-by: José Roberto de Souza
Reviewed-by: Matthew Brost
Signed-off-by: Rodrigo Vivi
---
 drivers/gpu/drm/xe/regs/xe_gpu_commands.h |  1 +
 drivers/gpu/drm/xe/xe_pt.c                | 17 ++++++++-
 drivers/gpu/drm/xe/xe_ring_ops.c          | 45 ++++++++++++++++++-----
 drivers/gpu/drm/xe/xe_vm.c                |  2 +
 drivers/gpu/drm/xe/xe_vm_types.h          |  3 ++
 5 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
index 0f9c5b0b8a3b..1a744c508174 100644
--- a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
+++ b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h
@@ -73,6 +73,7 @@
 #define PIPE_CONTROL_STORE_DATA_INDEX		(1<<21)
 #define PIPE_CONTROL_CS_STALL			(1<<20)
 #define PIPE_CONTROL_GLOBAL_SNAPSHOT_RESET	(1<<19)
+#define PIPE_CONTROL_TLB_INVALIDATE		BIT(18)
 #define PIPE_CONTROL_PSD_SYNC			(1<<17)
 #define PIPE_CONTROL_QW_WRITE			(1<<14)
 #define PIPE_CONTROL_DEPTH_STALL		(1<<13)
diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index bef265715000..2c472fafc811 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -1297,7 +1297,20 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_engine *e,
 
 	xe_vm_dbg_print_entries(tile_to_xe(tile), entries, num_entries);
 
-	if (rebind && !xe_vm_no_dma_fences(vma->vm)) {
+	/*
+	 * If rebind, we have to invalidate the TLB on !LR vms, since
+	 * cached PTEs may point to freed memory. On LR vms this is done
+	 * automatically when the context is re-enabled by the rebind worker,
+	 * or in fault mode it was invalidated on PTE zapping.
+	 *
+	 * If !rebind, and on scratch-enabled VMs, there is a chance the
+	 * scratch PTE is already cached in the TLB so it needs to be
+	 * invalidated. On !LR VMs this is done in the ring ops preceding
+	 * a batch, but on non-faulting LR, in particular on user-space
+	 * batch buffer chaining, it needs to be done here.
+	 */
+	if ((rebind && !xe_vm_no_dma_fences(vm) && !vm->batch_invalidate_tlb) ||
+	    (!rebind && vm->scratch_bo[tile->id] && xe_vm_in_compute_mode(vm))) {
 		ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
 		if (!ifence)
 			return ERR_PTR(-ENOMEM);
@@ -1313,7 +1326,7 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_engine *e,
 		LLIST_HEAD(deferred);
 
 		/* TLB invalidation must be done before signaling rebind */
-		if (rebind && !xe_vm_no_dma_fences(vma->vm)) {
+		if (ifence) {
 			int err = invalidation_fence_init(tile->primary_gt,
 							  ifence, fence, vma);
 			if (err) {
diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
index 45117a2ab1a0..215606b5fae0 100644
--- a/drivers/gpu/drm/xe/xe_ring_ops.c
+++ b/drivers/gpu/drm/xe/xe_ring_ops.c
@@ -15,6 +15,7 @@
 #include "xe_macros.h"
 #include "xe_sched_job.h"
 #include "xe_vm_types.h"
+#include "xe_vm.h"
 
 /*
  * 3D-related flags that can't be set on _engines_ that lack access to the 3D
@@ -74,9 +75,11 @@ static int emit_store_imm_ggtt(u32 addr, u32 value, u32 *dw, int i)
 	return i;
 }
 
-static int emit_flush_imm_ggtt(u32 addr, u32 value, u32 *dw, int i)
+static int emit_flush_imm_ggtt(u32 addr, u32 value, bool invalidate_tlb,
+			       u32 *dw, int i)
 {
-	dw[i++] = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+	dw[i++] = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW |
+		(invalidate_tlb ? MI_INVALIDATE_TLB : 0);
 	dw[i++] = addr | MI_FLUSH_DW_USE_GTT;
 	dw[i++] = 0;
 	dw[i++] = value;
@@ -107,7 +110,8 @@ static int emit_flush_invalidate(u32 flag, u32 *dw, int i)
 	return i;
 }
 
-static int emit_pipe_invalidate(u32 mask_flags, u32 *dw, int i)
+static int emit_pipe_invalidate(u32 mask_flags, bool invalidate_tlb, u32 *dw,
+				int i)
 {
 	u32 flags = PIPE_CONTROL_CS_STALL |
 		PIPE_CONTROL_COMMAND_CACHE_INVALIDATE |
@@ -119,6 +123,9 @@
 		PIPE_CONTROL_QW_WRITE |
 		PIPE_CONTROL_STORE_DATA_INDEX;
 
+	if (invalidate_tlb)
+		flags |= PIPE_CONTROL_TLB_INVALIDATE;
+
 	flags &= ~mask_flags;
 
 	dw[i++] = GFX_OP_PIPE_CONTROL(6);
@@ -170,9 +177,17 @@ static void __emit_job_gen12_copy(struct xe_sched_job *job, struct xe_lrc *lrc,
 {
 	u32 dw[MAX_JOB_SIZE_DW], i = 0;
 	u32 ppgtt_flag = get_ppgtt_flag(job);
+	struct xe_vm *vm = job->engine->vm;
 
-	i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
-				seqno, dw, i);
+	if (vm->batch_invalidate_tlb) {
+		dw[i++] = preparser_disable(true);
+		i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
+					seqno, true, dw, i);
+		dw[i++] = preparser_disable(false);
+	} else {
+		i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
+					seqno, dw, i);
+	}
 
 	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);
 
@@ -181,7 +196,7 @@ static void __emit_job_gen12_copy(struct xe_sched_job *job, struct xe_lrc *lrc,
 					job->user_fence.value,
 					dw, i);
 
-	i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, dw, i);
+	i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, false, dw, i);
 
 	i = emit_user_interrupt(dw, i);
 
@@ -210,6 +225,7 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
 	struct xe_gt *gt = job->engine->gt;
 	struct xe_device *xe = gt_to_xe(gt);
 	bool decode = job->engine->class == XE_ENGINE_CLASS_VIDEO_DECODE;
+	struct xe_vm *vm = job->engine->vm;
 
 	dw[i++] = preparser_disable(true);
 
@@ -220,10 +236,16 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
 		else
 			i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i);
 	}
+
+	if (vm->batch_invalidate_tlb)
+		i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
+					seqno, true, dw, i);
+
 	dw[i++] = preparser_disable(false);
 
-	i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
-				seqno, dw, i);
+	if (!vm->batch_invalidate_tlb)
+		i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
+					seqno, dw, i);
 
 	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);
 
@@ -232,7 +254,7 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
 					job->user_fence.value,
 					dw, i);
 
-	i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, dw, i);
+	i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, false, dw, i);
 
 	i = emit_user_interrupt(dw, i);
 
@@ -250,6 +272,7 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
 	struct xe_gt *gt = job->engine->gt;
 	struct xe_device *xe = gt_to_xe(gt);
 	bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
+	struct xe_vm *vm = job->engine->vm;
 	u32 mask_flags = 0;
 
 	dw[i++] = preparser_disable(true);
 	if (lacks_render)
 		mask_flags = PIPE_CONTROL_3D_ARCH_FLAGS;
 	else if (job->engine->class == XE_ENGINE_CLASS_COMPUTE)
 		mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS;
-	i = emit_pipe_invalidate(mask_flags, dw, i);
+
+	/* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */
+	i = emit_pipe_invalidate(mask_flags, vm->batch_invalidate_tlb, dw, i);
 
 	/* hsdes: 1809175790 */
 	if (has_aux_ccs(xe))
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 17b7d543ae49..ea205244fcf9 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -1237,11 +1237,13 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
 			if (err)
 				goto err_scratch_pt;
 		}
+		vm->batch_invalidate_tlb = true;
 	}
 
 	if (flags & DRM_XE_VM_CREATE_COMPUTE_MODE) {
 		INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
 		vm->flags |= XE_VM_FLAG_COMPUTE_MODE;
+		vm->batch_invalidate_tlb = false;
 	}
 
 	if (flags & DRM_XE_VM_CREATE_ASYNC_BIND_OPS) {
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index 76af6ac0fa84..5242236b4b0e 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -337,6 +337,9 @@ struct xe_vm {
 		/** @capture_once: capture only one error per VM */
 		bool capture_once;
 	} error_capture;
+
+	/** @batch_invalidate_tlb: Always invalidate TLB before batch start */
+	bool batch_invalidate_tlb;
 };
 
 #endif
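
For reference, the bind-time condition added in __xe_pt_bind_vma() can be
read as the minimal standalone C sketch below (not part of the applied
patch; struct vm_model and bind_needs_tlb_invalidation() are hypothetical
names, with plain booleans standing in for xe_vm_no_dma_fences(),
xe_vm_in_compute_mode(), vm->scratch_bo[tile->id] and
vm->batch_invalidate_tlb):

#include <stdbool.h>

/* Hypothetical stand-in for the xe_vm state consulted at bind time. */
struct vm_model {
	bool no_dma_fences;        /* models xe_vm_no_dma_fences(vm): LR vm */
	bool compute_mode;         /* models xe_vm_in_compute_mode(vm) */
	bool has_scratch_bo;       /* models vm->scratch_bo[tile->id] != NULL */
	bool batch_invalidate_tlb; /* models vm->batch_invalidate_tlb */
};

/* True when the bind must attach a GuC TLB-invalidation fence. */
static bool bind_needs_tlb_invalidation(const struct vm_model *vm, bool rebind)
{
	/*
	 * Rebind on a !LR vm whose batches do not invalidate the TLB at
	 * batch start: stale cached PTEs may point at freed memory, so
	 * the TLB must be invalidated before the bind fence signals.
	 */
	if (rebind && !vm->no_dma_fences && !vm->batch_invalidate_tlb)
		return true;

	/*
	 * First bind over a scratch PTE on a compute (LR) vm: the scratch
	 * PTE may already sit in the TLB and no batch-start invalidation
	 * will run for this vm, so invalidate here as well.
	 */
	return !rebind && vm->has_scratch_bo && vm->compute_mode;
}

For every other combination the TLB is covered elsewhere: scratch-enabled
!LR vms get vm->batch_invalidate_tlb set in xe_vm_create(), so the ring
ops above emit an MI_FLUSH_DW or PIPE_CONTROL TLB invalidation before
every batch instead of at bind time.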