From 5659b0c93a1ea02c662a030b322093203f299185 Mon Sep 17 00:00:00 2001 From: ZhenGuo Yin Date: Fri, 19 Jul 2024 16:10:40 +0800 Subject: [PATCH] drm/amdgpu: reset vm state machine after gpu reset(vram lost) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Why] Page table of compute VM in the VRAM will lost after gpu reset. VRAM won't be restored since compute VM has no shadows. [How] Use higher 32-bit of vm->generation to record a vram_lost_counter. Reset the VM state machine when vm->genertaion is not equal to the new generation token. v2: Check vm->generation instead of calling drm_sched_entity_error in amdgpu_vm_validate. v3: Use new generation token instead of vram_lost_counter for check. Signed-off-by: ZhenGuo Yin Reviewed-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org (cherry picked from commit 47c0388b0589cb481c294dcb857d25a214c46eb3) --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 3abfa66d72a2..a060c28f0877 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -434,7 +434,7 @@ uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm) if (!vm) return result; - result += vm->generation; + result += lower_32_bits(vm->generation); /* Add one if the page tables will be re-generated on next CS */ if (drm_sched_entity_error(&vm->delayed)) ++result; @@ -463,13 +463,14 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, int (*validate)(void *p, struct amdgpu_bo *bo), void *param) { + uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm); struct amdgpu_vm_bo_base *bo_base; struct amdgpu_bo *shadow; struct amdgpu_bo *bo; int r; - if (drm_sched_entity_error(&vm->delayed)) { - ++vm->generation; + if (vm->generation != new_vm_generation) { + vm->generation = new_vm_generation; amdgpu_vm_bo_reset_state_machine(vm); amdgpu_vm_fini_entities(vm); r = amdgpu_vm_init_entities(adev, vm); @@ -2439,7 +2440,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, vm->last_update = dma_fence_get_stub(); vm->last_unlocked = dma_fence_get_stub(); vm->last_tlb_flush = dma_fence_get_stub(); - vm->generation = 0; + vm->generation = amdgpu_vm_generation(adev, NULL); mutex_init(&vm->eviction_lock); vm->evicting = false;