mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-09 22:50:41 +00:00
drm/amdgpu: Store CU info from all XCCs for GFX v9.4.3
Currently, we store CU info only for a single XCC assuming that it is the same for all XCCs. However, that may not be true. As a result, store CU info for all XCCs. This info is later used for CU masking. Signed-off-by: Mukul Joshi <mukul.joshi@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
2f06b27444
commit
97e3c6a853
@ -478,7 +478,7 @@ void amdgpu_amdkfd_get_cu_info(struct amdgpu_device *adev, struct kfd_cu_info *c
|
||||
cu_info->cu_active_number = acu_info.number;
|
||||
cu_info->cu_ao_mask = acu_info.ao_cu_mask;
|
||||
memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0],
|
||||
sizeof(acu_info.bitmap));
|
||||
sizeof(cu_info->cu_bitmap));
|
||||
cu_info->num_shader_engines = adev->gfx.config.max_shader_engines;
|
||||
cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se;
|
||||
cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh;
|
||||
|
@ -43,6 +43,7 @@
|
||||
#define AMDGPU_GFX_LBPW_DISABLED_MODE 0x00000008L
|
||||
|
||||
#define AMDGPU_MAX_GC_INSTANCES 8
|
||||
#define KGD_MAX_QUEUES 128
|
||||
|
||||
#define AMDGPU_MAX_GFX_QUEUES KGD_MAX_QUEUES
|
||||
#define AMDGPU_MAX_COMPUTE_QUEUES KGD_MAX_QUEUES
|
||||
@ -257,7 +258,7 @@ struct amdgpu_cu_info {
|
||||
uint32_t number;
|
||||
uint32_t ao_cu_mask;
|
||||
uint32_t ao_cu_bitmap[4][4];
|
||||
uint32_t bitmap[4][4];
|
||||
uint32_t bitmap[AMDGPU_MAX_GC_INSTANCES][4][4];
|
||||
};
|
||||
|
||||
struct amdgpu_gfx_ras {
|
||||
|
@ -839,7 +839,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
|
||||
memcpy(&dev_info->cu_ao_bitmap[0], &adev->gfx.cu_info.ao_cu_bitmap[0],
|
||||
sizeof(adev->gfx.cu_info.ao_cu_bitmap));
|
||||
memcpy(&dev_info->cu_bitmap[0], &adev->gfx.cu_info.bitmap[0],
|
||||
sizeof(adev->gfx.cu_info.bitmap));
|
||||
sizeof(dev_info->cu_bitmap));
|
||||
dev_info->vram_type = adev->gmc.vram_type;
|
||||
dev_info->vram_bit_width = adev->gmc.vram_width;
|
||||
dev_info->vce_harvest_config = adev->vce.harvest_config;
|
||||
|
@ -9449,7 +9449,7 @@ static int gfx_v10_0_get_cu_info(struct amdgpu_device *adev,
|
||||
gfx_v10_0_set_user_wgp_inactive_bitmap_per_sh(
|
||||
adev, disable_masks[i * 2 + j]);
|
||||
bitmap = gfx_v10_0_get_cu_active_bitmap_per_sh(adev);
|
||||
cu_info->bitmap[i][j] = bitmap;
|
||||
cu_info->bitmap[0][i][j] = bitmap;
|
||||
|
||||
for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
|
||||
if (bitmap & mask) {
|
||||
|
@ -6368,7 +6368,7 @@ static int gfx_v11_0_get_cu_info(struct amdgpu_device *adev,
|
||||
* SE6: {SH0,SH1} --> {bitmap[2][2], bitmap[2][3]}
|
||||
* SE7: {SH0,SH1} --> {bitmap[3][2], bitmap[3][3]}
|
||||
*/
|
||||
cu_info->bitmap[i % 4][j + (i / 4) * 2] = bitmap;
|
||||
cu_info->bitmap[0][i % 4][j + (i / 4) * 2] = bitmap;
|
||||
|
||||
for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
|
||||
if (bitmap & mask)
|
||||
|
@ -3577,7 +3577,7 @@ static void gfx_v6_0_get_cu_info(struct amdgpu_device *adev)
|
||||
gfx_v6_0_set_user_cu_inactive_bitmap(
|
||||
adev, disable_masks[i * 2 + j]);
|
||||
bitmap = gfx_v6_0_get_cu_enabled(adev);
|
||||
cu_info->bitmap[i][j] = bitmap;
|
||||
cu_info->bitmap[0][i][j] = bitmap;
|
||||
|
||||
for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
|
||||
if (bitmap & mask) {
|
||||
|
@ -5119,7 +5119,7 @@ static void gfx_v7_0_get_cu_info(struct amdgpu_device *adev)
|
||||
gfx_v7_0_set_user_cu_inactive_bitmap(
|
||||
adev, disable_masks[i * 2 + j]);
|
||||
bitmap = gfx_v7_0_get_cu_active_bitmap(adev);
|
||||
cu_info->bitmap[i][j] = bitmap;
|
||||
cu_info->bitmap[0][i][j] = bitmap;
|
||||
|
||||
for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
|
||||
if (bitmap & mask) {
|
||||
|
@ -7121,7 +7121,7 @@ static void gfx_v8_0_get_cu_info(struct amdgpu_device *adev)
|
||||
gfx_v8_0_set_user_cu_inactive_bitmap(
|
||||
adev, disable_masks[i * 2 + j]);
|
||||
bitmap = gfx_v8_0_get_cu_active_bitmap(adev);
|
||||
cu_info->bitmap[i][j] = bitmap;
|
||||
cu_info->bitmap[0][i][j] = bitmap;
|
||||
|
||||
for (k = 0; k < adev->gfx.config.max_cu_per_sh; k ++) {
|
||||
if (bitmap & mask) {
|
||||
|
@ -1499,7 +1499,7 @@ static void gfx_v9_0_init_always_on_cu_mask(struct amdgpu_device *adev)
|
||||
amdgpu_gfx_select_se_sh(adev, i, j, 0xffffffff, 0);
|
||||
|
||||
for (k = 0; k < adev->gfx.config.max_cu_per_sh; k ++) {
|
||||
if (cu_info->bitmap[i][j] & mask) {
|
||||
if (cu_info->bitmap[0][i][j] & mask) {
|
||||
if (counter == pg_always_on_cu_num)
|
||||
WREG32_SOC15(GC, 0, mmRLC_PG_ALWAYS_ON_CU_MASK, cu_bitmap);
|
||||
if (counter < always_on_cu_num)
|
||||
@ -7233,7 +7233,7 @@ static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
|
||||
* SE6,SH0 --> bitmap[2][1]
|
||||
* SE7,SH0 --> bitmap[3][1]
|
||||
*/
|
||||
cu_info->bitmap[i % 4][j + i / 4] = bitmap;
|
||||
cu_info->bitmap[0][i % 4][j + i / 4] = bitmap;
|
||||
|
||||
for (k = 0; k < adev->gfx.config.max_cu_per_sh; k ++) {
|
||||
if (bitmap & mask) {
|
||||
|
@ -4259,7 +4259,7 @@ static void gfx_v9_4_3_set_gds_init(struct amdgpu_device *adev)
|
||||
}
|
||||
|
||||
static void gfx_v9_4_3_set_user_cu_inactive_bitmap(struct amdgpu_device *adev,
|
||||
u32 bitmap)
|
||||
u32 bitmap, int xcc_id)
|
||||
{
|
||||
u32 data;
|
||||
|
||||
@ -4269,15 +4269,15 @@ static void gfx_v9_4_3_set_user_cu_inactive_bitmap(struct amdgpu_device *adev,
|
||||
data = bitmap << GC_USER_SHADER_ARRAY_CONFIG__INACTIVE_CUS__SHIFT;
|
||||
data &= GC_USER_SHADER_ARRAY_CONFIG__INACTIVE_CUS_MASK;
|
||||
|
||||
WREG32_SOC15(GC, GET_INST(GC, 0), regGC_USER_SHADER_ARRAY_CONFIG, data);
|
||||
WREG32_SOC15(GC, GET_INST(GC, xcc_id), regGC_USER_SHADER_ARRAY_CONFIG, data);
|
||||
}
|
||||
|
||||
static u32 gfx_v9_4_3_get_cu_active_bitmap(struct amdgpu_device *adev)
|
||||
static u32 gfx_v9_4_3_get_cu_active_bitmap(struct amdgpu_device *adev, int xcc_id)
|
||||
{
|
||||
u32 data, mask;
|
||||
|
||||
data = RREG32_SOC15(GC, GET_INST(GC, 0), regCC_GC_SHADER_ARRAY_CONFIG);
|
||||
data |= RREG32_SOC15(GC, GET_INST(GC, 0), regGC_USER_SHADER_ARRAY_CONFIG);
|
||||
data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCC_GC_SHADER_ARRAY_CONFIG);
|
||||
data |= RREG32_SOC15(GC, GET_INST(GC, xcc_id), regGC_USER_SHADER_ARRAY_CONFIG);
|
||||
|
||||
data &= CC_GC_SHADER_ARRAY_CONFIG__INACTIVE_CUS_MASK;
|
||||
data >>= CC_GC_SHADER_ARRAY_CONFIG__INACTIVE_CUS__SHIFT;
|
||||
@ -4290,7 +4290,7 @@ static u32 gfx_v9_4_3_get_cu_active_bitmap(struct amdgpu_device *adev)
|
||||
static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev,
|
||||
struct amdgpu_cu_info *cu_info)
|
||||
{
|
||||
int i, j, k, counter, active_cu_number = 0;
|
||||
int i, j, k, counter, xcc_id, active_cu_number = 0;
|
||||
u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0;
|
||||
unsigned disable_masks[4 * 4];
|
||||
|
||||
@ -4309,46 +4309,38 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev,
|
||||
adev->gfx.config.max_sh_per_se);
|
||||
|
||||
mutex_lock(&adev->grbm_idx_mutex);
|
||||
for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
|
||||
for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
|
||||
mask = 1;
|
||||
ao_bitmap = 0;
|
||||
counter = 0;
|
||||
gfx_v9_4_3_xcc_select_se_sh(adev, i, j, 0xffffffff, 0);
|
||||
gfx_v9_4_3_set_user_cu_inactive_bitmap(
|
||||
adev, disable_masks[i * adev->gfx.config.max_sh_per_se + j]);
|
||||
bitmap = gfx_v9_4_3_get_cu_active_bitmap(adev);
|
||||
for (xcc_id = 0; xcc_id < NUM_XCC(adev->gfx.xcc_mask); xcc_id++) {
|
||||
for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
|
||||
for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
|
||||
mask = 1;
|
||||
ao_bitmap = 0;
|
||||
counter = 0;
|
||||
gfx_v9_4_3_xcc_select_se_sh(adev, i, j, 0xffffffff, xcc_id);
|
||||
gfx_v9_4_3_set_user_cu_inactive_bitmap(
|
||||
adev,
|
||||
disable_masks[i * adev->gfx.config.max_sh_per_se + j],
|
||||
xcc_id);
|
||||
bitmap = gfx_v9_4_3_get_cu_active_bitmap(adev, xcc_id);
|
||||
|
||||
/*
|
||||
* The bitmap(and ao_cu_bitmap) in cu_info structure is
|
||||
* 4x4 size array, and it's usually suitable for Vega
|
||||
* ASICs which has 4*2 SE/SH layout.
|
||||
* But for Arcturus, SE/SH layout is changed to 8*1.
|
||||
* To mostly reduce the impact, we make it compatible
|
||||
* with current bitmap array as below:
|
||||
* SE4,SH0 --> bitmap[0][1]
|
||||
* SE5,SH0 --> bitmap[1][1]
|
||||
* SE6,SH0 --> bitmap[2][1]
|
||||
* SE7,SH0 --> bitmap[3][1]
|
||||
*/
|
||||
cu_info->bitmap[i % 4][j + i / 4] = bitmap;
|
||||
cu_info->bitmap[xcc_id][i][j] = bitmap;
|
||||
|
||||
for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
|
||||
if (bitmap & mask) {
|
||||
if (counter < adev->gfx.config.max_cu_per_sh)
|
||||
ao_bitmap |= mask;
|
||||
counter++;
|
||||
for (k = 0; k < adev->gfx.config.max_cu_per_sh; k++) {
|
||||
if (bitmap & mask) {
|
||||
if (counter < adev->gfx.config.max_cu_per_sh)
|
||||
ao_bitmap |= mask;
|
||||
counter++;
|
||||
}
|
||||
mask <<= 1;
|
||||
}
|
||||
mask <<= 1;
|
||||
active_cu_number += counter;
|
||||
if (i < 2 && j < 2)
|
||||
ao_cu_mask |= (ao_bitmap << (i * 16 + j * 8));
|
||||
cu_info->ao_cu_bitmap[i][j] = ao_bitmap;
|
||||
}
|
||||
active_cu_number += counter;
|
||||
if (i < 2 && j < 2)
|
||||
ao_cu_mask |= (ao_bitmap << (i * 16 + j * 8));
|
||||
cu_info->ao_cu_bitmap[i % 4][j + i / 4] = ao_bitmap;
|
||||
}
|
||||
gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff,
|
||||
xcc_id);
|
||||
}
|
||||
gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff,
|
||||
0);
|
||||
mutex_unlock(&adev->grbm_idx_mutex);
|
||||
|
||||
cu_info->number = active_cu_number;
|
||||
|
@ -2087,7 +2087,8 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
|
||||
|
||||
amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
|
||||
cu->num_simd_per_cu = cu_info.simd_per_cu;
|
||||
cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number;
|
||||
cu->num_simd_cores = cu_info.simd_per_cu *
|
||||
(cu_info.cu_active_number / kdev->kfd->num_nodes);
|
||||
cu->max_waves_simd = cu_info.max_waves_per_simd;
|
||||
|
||||
cu->wave_front_size = cu_info.wave_front_size;
|
||||
|
@ -104,11 +104,13 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
|
||||
bool wgp_mode_req = KFD_GC_VERSION(mm->dev) >= IP_VERSION(10, 0, 0);
|
||||
uint32_t en_mask = wgp_mode_req ? 0x3 : 0x1;
|
||||
int i, se, sh, cu, cu_bitmap_sh_mul, inc = wgp_mode_req ? 2 : 1;
|
||||
uint32_t cu_active_per_node;
|
||||
|
||||
amdgpu_amdkfd_get_cu_info(mm->dev->adev, &cu_info);
|
||||
|
||||
if (cu_mask_count > cu_info.cu_active_number)
|
||||
cu_mask_count = cu_info.cu_active_number;
|
||||
cu_active_per_node = cu_info.cu_active_number / mm->dev->kfd->num_nodes;
|
||||
if (cu_mask_count > cu_active_per_node)
|
||||
cu_mask_count = cu_active_per_node;
|
||||
|
||||
/* Exceeding these bounds corrupts the stack and indicates a coding error.
|
||||
* Returning with no CU's enabled will hang the queue, which should be
|
||||
@ -141,7 +143,7 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
|
||||
for (se = 0; se < cu_info.num_shader_engines; se++)
|
||||
for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++)
|
||||
cu_per_sh[se][sh] = hweight32(
|
||||
cu_info.cu_bitmap[se % 4][sh + (se / 4) * cu_bitmap_sh_mul]);
|
||||
cu_info.cu_bitmap[0][se % 4][sh + (se / 4) * cu_bitmap_sh_mul]);
|
||||
|
||||
/* Symmetrically map cu_mask to all SEs & SHs:
|
||||
* se_mask programs up to 2 SH in the upper and lower 16 bits.
|
||||
|
@ -450,8 +450,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
|
||||
sysfs_show_32bit_prop(buffer, offs, "cpu_cores_count",
|
||||
dev->node_props.cpu_cores_count);
|
||||
sysfs_show_32bit_prop(buffer, offs, "simd_count",
|
||||
dev->gpu ? (dev->node_props.simd_count *
|
||||
NUM_XCC(dev->gpu->xcc_mask)) : 0);
|
||||
dev->gpu ? dev->node_props.simd_count : 0);
|
||||
sysfs_show_32bit_prop(buffer, offs, "mem_banks_count",
|
||||
dev->node_props.mem_banks_count);
|
||||
sysfs_show_32bit_prop(buffer, offs, "caches_count",
|
||||
@ -1604,7 +1603,7 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
|
||||
int i, j, k;
|
||||
struct kfd_cache_properties *pcache = NULL;
|
||||
|
||||
cu_sibling_map_mask = cu_info->cu_bitmap[0][0];
|
||||
cu_sibling_map_mask = cu_info->cu_bitmap[0][0][0];
|
||||
cu_sibling_map_mask &=
|
||||
((1 << pcache_info[cache_type].num_cu_shared) - 1);
|
||||
first_active_cu = ffs(cu_sibling_map_mask);
|
||||
@ -1647,7 +1646,7 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
|
||||
pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
|
||||
k += 4;
|
||||
|
||||
cu_sibling_map_mask = cu_info->cu_bitmap[i % 4][j + i / 4];
|
||||
cu_sibling_map_mask = cu_info->cu_bitmap[0][i % 4][j + i / 4];
|
||||
cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
|
||||
}
|
||||
}
|
||||
@ -1708,8 +1707,8 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct
|
||||
for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
|
||||
|
||||
ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
|
||||
pcu_info->cu_bitmap[i % 4][j + i / 4], ct,
|
||||
cu_processor_id, k);
|
||||
pcu_info->cu_bitmap[0][i % 4][j + i / 4], ct,
|
||||
cu_processor_id, k);
|
||||
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
@ -31,12 +31,12 @@
|
||||
#include <linux/types.h>
|
||||
#include <linux/bitmap.h>
|
||||
#include <linux/dma-fence.h>
|
||||
#include "amdgpu_irq.h"
|
||||
#include "amdgpu_gfx.h"
|
||||
|
||||
struct pci_dev;
|
||||
struct amdgpu_device;
|
||||
|
||||
#define KGD_MAX_QUEUES 128
|
||||
|
||||
struct kfd_dev;
|
||||
struct kgd_mem;
|
||||
|
||||
@ -68,7 +68,7 @@ struct kfd_cu_info {
|
||||
uint32_t wave_front_size;
|
||||
uint32_t max_scratch_slots_per_cu;
|
||||
uint32_t lds_size;
|
||||
uint32_t cu_bitmap[4][4];
|
||||
uint32_t cu_bitmap[AMDGPU_MAX_GC_INSTANCES][4][4];
|
||||
};
|
||||
|
||||
/* For getting GPU local memory information from KGD */
|
||||
|
Loading…
x
Reference in New Issue
Block a user