From 536ae08d7b6ae16872f0b3c2679e656a7fc9d5e2 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 11 Dec 2024 09:56:01 -0600 Subject: [PATCH 01/11] drm/amd: Require CONFIG_HOTPLUG_PCI_PCIE for BOCO If the kernel hasn't been compiled with PCIe hotplug support this can lead to problems with dGPUs that use BOCO because they effectively drop off the bus. To prevent issues, disable BOCO support when compiled without PCIe hotplug. Reported-by: Gabriel Marcano Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/1707#note_2696862 Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20241211155601.3585256-1-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 1ad5bdc28bafa66db0f041cc6cdd278a80426aae) --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d272d95dd5b2..cd4fac120834 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -417,6 +417,9 @@ bool amdgpu_device_supports_boco(struct drm_device *dev) { struct amdgpu_device *adev = drm_to_adev(dev); + if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) + return false; + if (adev->has_pr3 || ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) return true; From a93b1020eb9386d7da11608477121b10079c076a Mon Sep 17 00:00:00 2001 From: Pierre-Eric Pelloux-Prayer Date: Fri, 6 Dec 2024 13:17:45 +0100 Subject: [PATCH 02/11] drm/amdgpu: don't access invalid sched MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 2320c9e6a768 ("drm/sched: memset() 'job' in drm_sched_job_init()") accessing job->base.sched can produce unexpected results as the initialisation of (*job)->base.sched done in amdgpu_job_alloc is overwritten by the memset. This commit fixes an issue when a CS would fail validation and would be rejected after job->num_ibs is incremented. In this case, amdgpu_ib_free(ring->adev, ...) will be called, which would crash the machine because the ring value is bogus. To fix this, pass a NULL pointer to amdgpu_ib_free(): we can do this because the device is actually not used in this function. The next commit will remove the ring argument completely. Fixes: 2320c9e6a768 ("drm/sched: memset() 'job' in drm_sched_job_init()") Signed-off-by: Pierre-Eric Pelloux-Prayer Reviewed-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 2ae520cb12831d264ceb97c61f72c59d33c0dbd7) --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index b9d08bc96581..a21c510c408e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -255,7 +255,6 @@ void amdgpu_job_set_resources(struct amdgpu_job *job, struct amdgpu_bo *gds, void amdgpu_job_free_resources(struct amdgpu_job *job) { - struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched); struct dma_fence *f; unsigned i; @@ -268,7 +267,7 @@ void amdgpu_job_free_resources(struct amdgpu_job *job) f = NULL; for (i = 0; i < job->num_ibs; ++i) - amdgpu_ib_free(ring->adev, &job->ibs[i], f); + amdgpu_ib_free(NULL, &job->ibs[i], f); } static void amdgpu_job_free_cb(struct drm_sched_job *s_job) From 458600da793da12e0f3724ecbea34a80703f4d5b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 16:47:48 -0500 Subject: [PATCH 03/11] drm/amdgpu/nbio7.7: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 22b9555bc90df22b585bdd1f161b61584b13af51) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c index 1ac730328516..3fb6d2aa7e3b 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c @@ -247,7 +247,7 @@ static void nbio_v7_7_init_registers(struct amdgpu_device *adev) if (def != data) WREG32_SOC15(NBIO, 0, regBIF0_PCIE_MST_CTRL_3, data); - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(7, 7, 0): data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4) & ~BIT(23); WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4, data); From 8c1ecc7197a88c6ae62de56e1c0887f220712a32 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:00:07 -0500 Subject: [PATCH 04/11] drm/amdgpu/nbio7.11: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 2c8eeaaa0fe5841ccf07a0eb51b1426f34ef39f7) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c index 814ab59fdd4a..41421da63a08 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_11.c @@ -275,7 +275,7 @@ static void nbio_v7_11_init_registers(struct amdgpu_device *adev) if (def != data) WREG32_SOC15(NBIO, 0, regBIF_BIF256_CI256_RC3X4_USB4_PCIE_MST_CTRL_3, data); - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(7, 11, 0): case IP_VERSION(7, 11, 1): case IP_VERSION(7, 11, 2): From 6ebc5b92190e01dd48313b68cbf752c9adcfefa8 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:03:20 -0500 Subject: [PATCH 05/11] drm/amdgpu/mmhub4.1: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 63bfd24088b42c6f55c2096bfc41b50213d419b2) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c index 0fbc3be81f14..f2ab5001b492 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c @@ -108,7 +108,7 @@ mmhub_v4_1_0_print_l2_protection_fault_status(struct amdgpu_device *adev, dev_err(adev->dev, "MMVM_L2_PROTECTION_FAULT_STATUS_LO32:0x%08X\n", status); - switch (adev->ip_versions[MMHUB_HWIP][0]) { + switch (amdgpu_ip_version(adev, MMHUB_HWIP, 0)) { case IP_VERSION(4, 1, 0): mmhub_cid = mmhub_client_ids_v4_1_0[cid][rw]; break; From 41be00f839e9ee7753892a73a36ce4c14c6f5cbf Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:04:58 -0500 Subject: [PATCH 06/11] drm/amdgpu/gfx12: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit f1fd1d0f40272948aa6ab82a3a82ecbbc76dff53) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index fe7c48f2fb2a..da327ab48a57 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -4123,7 +4123,7 @@ static int gfx_v12_0_set_clockgating_state(void *handle, if (amdgpu_sriov_vf(adev)) return 0; - switch (adev->ip_versions[GC_HWIP][0]) { + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): gfx_v12_0_update_gfx_clock_gating(adev, From 9e752ee26c1031312a01d2afc281f5f6fdfca176 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 17:06:26 -0500 Subject: [PATCH 07/11] drm/amdgpu/smu14.0.2: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 8f2cd1067afe68372a1723e05e19b68ed187676a) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 6a565ce74d5b..5cad09c5f2ff 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -2096,7 +2096,7 @@ static int smu_v14_0_2_enable_gfx_features(struct smu_context *smu) { struct amdgpu_device *adev = smu->adev; - if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(14, 0, 2)) + if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 2)) return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_EnableAllSmuFeatures, FEATURE_PWR_GFX, NULL); else From 8d1a13816e59254bd3b18f5ae0895230922bd120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 12 Dec 2024 16:29:18 +0100 Subject: [PATCH 08/11] drm/amdgpu: fix amdgpu_coredump MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VM pointer might already be outdated when that function is called. Use the PASID instead to gather the information instead. Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 57f812d171af4ba233d3ed7c94dfa5b8e92dcc04) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index 946c48829f19..824f9da5b6ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -343,11 +343,10 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, coredump->skip_vram_check = skip_vram_check; coredump->reset_vram_lost = vram_lost; - if (job && job->vm) { - struct amdgpu_vm *vm = job->vm; + if (job && job->pasid) { struct amdgpu_task_info *ti; - ti = amdgpu_vm_get_task_info_vm(vm); + ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); if (ti) { coredump->reset_task_info = *ti; amdgpu_vm_put_task_info(ti); From 85230ee36d88e7a09fb062d43203035659dd10a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= Date: Tue, 17 Dec 2024 18:22:56 +0100 Subject: [PATCH 09/11] drm/amdgpu: Handle NULL bo->tbo.resource (again) in amdgpu_vm_bo_update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Third time's the charm, I hope? Fixes: d3116756a710 ("drm/ttm: rename bo->mem and make it a pointer") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3837 Reviewed-by: Christian König Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher (cherry picked from commit 695c2c745e5dff201b75da8a1d237ce403600d04) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index ddd7f05e4db9..c9c48b782ec1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1266,10 +1266,9 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, * next command submission. */ if (amdgpu_vm_is_bo_always_valid(vm, bo)) { - uint32_t mem_type = bo->tbo.resource->mem_type; - - if (!(bo->preferred_domains & - amdgpu_mem_type_to_domain(mem_type))) + if (bo->tbo.resource && + !(bo->preferred_domains & + amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type))) amdgpu_vm_bo_evicted(&bo_va->base); else amdgpu_vm_bo_idle(&bo_va->base); From a7f9d98eb1202132014ba760c26ad8608ffc9caf Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 10 Dec 2024 20:44:14 -0600 Subject: [PATCH 10/11] drm/amd: Update strapping for NBIO 2.5.0 This helps to avoid a spurious PME event on hotplug to Azalia. Cc: Vijendar Mukunda Reported-and-tested-by: ionut_n2001@yahoo.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=215884 Tested-by: Gabriel Marcano Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20241211024414.7840-1-mario.limonciello@amd.com Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 3f6f237b9dd189e1fb85b8a3f7c97a8f27c1e49a) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c index b1b57dcc5a73..49e953f86ced 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c @@ -271,8 +271,19 @@ const struct nbio_hdp_flush_reg nbio_v7_0_hdp_flush_reg = { .ref_and_mask_sdma1 = GPU_HDP_FLUSH_DONE__SDMA1_MASK, }; +#define regRCC_DEV0_EPF6_STRAP4 0xd304 +#define regRCC_DEV0_EPF6_STRAP4_BASE_IDX 5 + static void nbio_v7_0_init_registers(struct amdgpu_device *adev) { + uint32_t data; + + switch (adev->ip_versions[NBIO_HWIP][0]) { + case IP_VERSION(2, 5, 0): + data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4) & ~BIT(23); + WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4, data); + break; + } } #define MMIO_REG_HOLE_OFFSET (0x80000 - PAGE_SIZE) From 3abb660f9e18925468685591a3702bda05faba4f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 Dec 2024 16:49:20 -0500 Subject: [PATCH 11/11] drm/amdgpu/nbio7.0: fix IP version check Use the helper function rather than reading it directly. Reviewed-by: Yang Wang Signed-off-by: Alex Deucher (cherry picked from commit 0ec43fbece784215d3c4469973e4556d70bce915) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c index 49e953f86ced..d1032e9992b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c @@ -278,7 +278,7 @@ static void nbio_v7_0_init_registers(struct amdgpu_device *adev) { uint32_t data; - switch (adev->ip_versions[NBIO_HWIP][0]) { + switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { case IP_VERSION(2, 5, 0): data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4) & ~BIT(23); WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4, data);