This tag contains the following changes for kernel 5.8:

- Improve MMU cache invalidation code and handle case where the
   invalidation doesn't finish in a reasonable time.
 
 - Remove the option to perform soft-reset to GAUDI. Soft-reset is where the
   driver only resets the compute and DMA engines of the ASIC. This is not
   relevant to GAUDI as we must also reset the NIC ports. And when we reset
   the NIC ports, we must also reset other stuff so we prefer to just do
   hard-reset (where we reset the entire ASIC except for PCIe).
 
 - Fail the hard-reset procedure in case we still have user processes which
   have active file-descriptors on a device. Doing hard-reset in that case
   can result in a kernel panic because of gen_pool checks
 
 - Don't initialize the default wait callback of dma_buf with the default
   wait function as that's the default...
 -----BEGIN PGP SIGNATURE-----
 
 iQFKBAABCgA0FiEE7TEboABC71LctBLFZR1NuKta54AFAl7LVuUWHG9kZWQuZ2Fi
 YmF5QGdtYWlsLmNvbQAKCRBlHU24q1rngIEkCACWLDEhnP1oQtz8OWZiD42rgo6r
 /OuFmRFS0BmAbBveAP5It8qVgtgvZtMnrzcwT5k6NWSiyyldSn/1yQ59qfknWqtN
 2v3slS1pkvH0gLustR3vTeUgM9cAn7NPeYbfU0Z47Zha0fl2Bje2Z65TF37wOUlb
 I99ElTvlPY6jPjCraLK2JhhyYtdbAP4tEtLyAxc/ld+AqdLiNh26f2XjthCaFFIp
 GekUMr6QCAjj4tHQLpTKOOaO07ChbVm9rFcC2QL7NAgdT4gjAGJ6EMAuEozCsQng
 wDnxOZ0HRyE1jFmouQM1AhV8+nCCAW3CbRgNkDDWyF3SmGZ9dhC6PMrXWZ/S
 =g9rL
 -----END PGP SIGNATURE-----

Merge tag 'misc-habanalabs-next-2020-05-25' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next

Oded writes:

This tag contains the following changes for kernel 5.8:

- Improve MMU cache invalidation code and handle case where the
  invalidation doesn't finish in a reasonable time.

- Remove the option to perform soft-reset to GAUDI. Soft-reset is where the
  driver only resets the compute and DMA engines of the ASIC. This is not
  relevant to GAUDI as we must also reset the NIC ports. And when we reset
  the NIC ports, we must also reset other stuff so we prefer to just do
  hard-reset (where we reset the entire ASIC except for PCIe).

- Fail the hard-reset procedure in case we still have user processes which
  have active file-descriptors on a device. Doing hard-reset in that case
  can result in a kernel panic because of gen_pool checks.

- Don't explicitly set the wait callback of the driver's dma_fence ops to the
  default wait function, as that function is already used by default.

* tag 'misc-habanalabs-next-2020-05-25' of git://people.freedesktop.org/~gabbayo/linux:
  habanalabs: handle MMU cache invalidation timeout
  habanalabs: don't allow hard reset with open processes
  habanalabs: GAUDI does not support soft-reset
  habanalabs: add print for soft reset due to event
  habanalabs: improve MMU cache invalidation code
  habanalabs: don't set default fence_ops->wait
This commit is contained in:
Greg Kroah-Hartman 2020-05-25 08:49:43 +02:00
commit 18cbc336ec
7 changed files with 126 additions and 57 deletions

View File

@ -99,7 +99,6 @@ static const struct dma_fence_ops hl_fence_ops = {
.get_driver_name = hl_fence_get_driver_name,
.get_timeline_name = hl_fence_get_timeline_name,
.enable_signaling = hl_fence_enable_signaling,
.wait = dma_fence_default_wait,
.release = hl_fence_release
};

View File

@ -726,7 +726,7 @@ int hl_device_resume(struct hl_device *hdev)
return rc;
}
static void device_kill_open_processes(struct hl_device *hdev)
static int device_kill_open_processes(struct hl_device *hdev)
{
u16 pending_total, pending_cnt;
struct hl_fpriv *hpriv;
@ -779,9 +779,7 @@ static void device_kill_open_processes(struct hl_device *hdev)
ssleep(1);
}
if (!list_empty(&hdev->fpriv_list))
dev_crit(hdev->dev,
"Going to hard reset with open user contexts\n");
return list_empty(&hdev->fpriv_list) ? 0 : -EBUSY;
}
static void device_hard_reset_pending(struct work_struct *work)
@ -801,6 +799,7 @@ static void device_hard_reset_pending(struct work_struct *work)
* @hdev: pointer to habanalabs device structure
* @hard_reset: should we do hard reset to all engines or just reset the
* compute/dma engines
* @from_hard_reset_thread: is the caller the hard-reset thread
*
* Block future CS and wait for pending CS to be enqueued
* Call ASIC H/W fini
@ -823,6 +822,11 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
return 0;
}
if ((!hard_reset) && (!hdev->supports_soft_reset)) {
dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
hard_reset = true;
}
/*
* Prevent concurrency in this function - only one reset should be
* done at any given time. Only need to perform this if we didn't
@ -902,7 +906,12 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
* process can't really exit until all its CSs are done, which
* is what we do in cs rollback
*/
device_kill_open_processes(hdev);
rc = device_kill_open_processes(hdev);
if (rc) {
dev_crit(hdev->dev,
"Failed to kill all open processes, stopping hard reset\n");
goto out_err;
}
/* Flush the Event queue workers to make sure no other thread is
* reading or writing to registers during the reset
@ -1385,7 +1394,9 @@ void hl_device_fini(struct hl_device *hdev)
* can't really exit until all its CSs are done, which is what we
* do in cs rollback
*/
device_kill_open_processes(hdev);
rc = device_kill_open_processes(hdev);
if (rc)
dev_crit(hdev->dev, "Failed to kill all open processes\n");
hl_cb_pool_fini(hdev);

View File

@ -5774,7 +5774,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
>> EQ_CTL_EVENT_TYPE_SHIFT);
u8 cause;
bool soft_reset_required;
bool reset_required;
gaudi->events_stat[event_type]++;
gaudi->events_stat_aggregate[event_type]++;
@ -5840,12 +5840,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_TPC6_DEC:
case GAUDI_EVENT_TPC7_DEC:
gaudi_print_irq_info(hdev, event_type, true);
soft_reset_required = gaudi_tpc_read_interrupts(hdev,
reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_dec_event_to_tpc_id(event_type),
"AXI_SLV_DEC_Error");
if (soft_reset_required)
hl_device_reset(hdev, false, false);
hl_fw_unmask_irq(hdev, event_type);
if (reset_required) {
dev_err(hdev->dev, "hard reset required due to %s\n",
gaudi_irq_map_table[event_type].name);
if (hdev->hard_reset_on_fw_events)
hl_device_reset(hdev, true, false);
} else {
hl_fw_unmask_irq(hdev, event_type);
}
break;
case GAUDI_EVENT_TPC0_KRN_ERR:
@ -5857,12 +5863,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_TPC6_KRN_ERR:
case GAUDI_EVENT_TPC7_KRN_ERR:
gaudi_print_irq_info(hdev, event_type, true);
soft_reset_required = gaudi_tpc_read_interrupts(hdev,
reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_krn_event_to_tpc_id(event_type),
"KRN_ERR");
if (soft_reset_required)
hl_device_reset(hdev, false, false);
hl_fw_unmask_irq(hdev, event_type);
if (reset_required) {
dev_err(hdev->dev, "hard reset required due to %s\n",
gaudi_irq_map_table[event_type].name);
if (hdev->hard_reset_on_fw_events)
hl_device_reset(hdev, true, false);
} else {
hl_fw_unmask_irq(hdev, event_type);
}
break;
case GAUDI_EVENT_PCIE_CORE_SERR:
@ -5913,8 +5925,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_RAZWI_OR_ADC_SW:
gaudi_print_irq_info(hdev, event_type, true);
hl_device_reset(hdev, false, false);
hl_fw_unmask_irq(hdev, event_type);
if (hdev->hard_reset_on_fw_events)
hl_device_reset(hdev, true, false);
break;
case GAUDI_EVENT_TPC0_BMON_SPMU:
@ -5963,7 +5975,7 @@ static void *gaudi_get_events_stat(struct hl_device *hdev, bool aggregate,
return gaudi->events_stat;
}
static void gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
u32 flags)
{
struct gaudi_device *gaudi = hdev->asic_specific;
@ -5972,34 +5984,40 @@ static void gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
return;
mutex_lock(&hdev->mmu_cache_lock);
return 0;
if (hdev->pldm)
timeout_usec = GAUDI_PLDM_MMU_TIMEOUT_USEC;
else
timeout_usec = MMU_CONFIG_TIMEOUT_USEC;
mutex_lock(&hdev->mmu_cache_lock);
/* L0 & L1 invalidation */
WREG32(mmSTLB_INV_ALL_START, 1);
WREG32(mmSTLB_INV_PS, 2);
rc = hl_poll_timeout(
hdev,
mmSTLB_INV_ALL_START,
mmSTLB_INV_PS,
status,
!status,
1000,
timeout_usec);
if (rc)
dev_notice_ratelimited(hdev->dev,
"Timeout when waiting for MMU cache invalidation\n");
WREG32(mmSTLB_INV_SET, 0);
mutex_unlock(&hdev->mmu_cache_lock);
if (rc) {
dev_err_ratelimited(hdev->dev,
"MMU cache invalidation timeout\n");
hl_device_reset(hdev, true, false);
}
return rc;
}
static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
static int gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
bool is_hard, u32 asid, u64 va, u64 size)
{
struct gaudi_device *gaudi = hdev->asic_specific;
@ -6010,7 +6028,7 @@ static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
return;
return 0;
mutex_lock(&hdev->mmu_cache_lock);
@ -6041,11 +6059,15 @@ static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
1000,
timeout_usec);
if (rc)
dev_notice_ratelimited(hdev->dev,
"Timeout when waiting for MMU cache invalidation\n");
mutex_unlock(&hdev->mmu_cache_lock);
if (rc) {
dev_err_ratelimited(hdev->dev,
"MMU cache invalidation timeout\n");
hl_device_reset(hdev, true, false);
}
return rc;
}
static int gaudi_mmu_update_asid_hop0_addr(struct hl_device *hdev,

View File

@ -752,6 +752,7 @@ static int goya_sw_init(struct hl_device *hdev)
spin_lock_init(&goya->hw_queues_lock);
hdev->supports_coresight = true;
hdev->supports_soft_reset = true;
return 0;
@ -4883,7 +4884,7 @@ static void goya_mmu_prepare(struct hl_device *hdev, u32 asid)
goya_mmu_prepare_reg(hdev, goya_mmu_regs[i], asid);
}
static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
static int goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
u32 flags)
{
struct goya_device *goya = hdev->asic_specific;
@ -4892,11 +4893,11 @@ static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
return;
return 0;
/* no need in L1 only invalidation in Goya */
if (!is_hard)
return;
return 0;
if (hdev->pldm)
timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC;
@ -4918,13 +4919,17 @@ static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
mutex_unlock(&hdev->mmu_cache_lock);
if (rc)
dev_notice_ratelimited(hdev->dev,
"Timeout when waiting for MMU cache invalidation\n");
if (rc) {
dev_err_ratelimited(hdev->dev,
"MMU cache invalidation timeout\n");
hl_device_reset(hdev, true, false);
}
return rc;
}
static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
bool is_hard, u32 asid, u64 va, u64 size)
static int goya_mmu_invalidate_cache_range(struct hl_device *hdev,
bool is_hard, u32 asid, u64 va, u64 size)
{
struct goya_device *goya = hdev->asic_specific;
u32 status, timeout_usec, inv_data, pi;
@ -4932,11 +4937,11 @@ static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
return;
return 0;
/* no need in L1 only invalidation in Goya */
if (!is_hard)
return;
return 0;
if (hdev->pldm)
timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC;
@ -4969,9 +4974,13 @@ static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
mutex_unlock(&hdev->mmu_cache_lock);
if (rc)
dev_notice_ratelimited(hdev->dev,
"Timeout when waiting for MMU cache invalidation\n");
if (rc) {
dev_err_ratelimited(hdev->dev,
"MMU cache invalidation timeout\n");
hl_device_reset(hdev, true, false);
}
return rc;
}
int goya_send_heartbeat(struct hl_device *hdev)

View File

@ -675,9 +675,9 @@ struct hl_asic_funcs {
u32 *size);
u64 (*read_pte)(struct hl_device *hdev, u64 addr);
void (*write_pte)(struct hl_device *hdev, u64 addr, u64 val);
void (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard,
int (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard,
u32 flags);
void (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
int (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
u32 asid, u64 va, u64 size);
int (*send_heartbeat)(struct hl_device *hdev);
void (*enable_clock_gating)(struct hl_device *hdev);
@ -755,8 +755,8 @@ struct hl_va_range {
* with huge pages.
* @dram_va_range: holds available virtual addresses for DRAM mappings.
* @mem_hash_lock: protects the mem_hash.
* @mmu_lock: protects the MMU page tables. Any change to the PGT, modifing the
* MMU hash or walking the PGT requires talking this lock
* @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the
* MMU hash or walking the PGT requires taking this lock.
* @debugfs_list: node in debugfs list of contexts.
* @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
* to user so user could inquire about CS. It is used as
@ -1436,6 +1436,7 @@ struct hl_device_idle_busy_ts {
* @stop_on_err: true if engines should stop on error.
* @supports_sync_stream: is sync stream supported.
* @supports_coresight: is CoreSight supported.
* @supports_soft_reset: is soft reset supported.
*/
struct hl_device {
struct pci_dev *pdev;
@ -1522,6 +1523,7 @@ struct hl_device {
u8 stop_on_err;
u8 supports_sync_stream;
u8 supports_coresight;
u8 supports_soft_reset;
/* Parameters for bring-up */
u8 mmu_enable;

View File

@ -886,6 +886,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
vm_type = (enum vm_type_t *) userptr;
hint_addr = args->map_host.hint_addr;
handle = phys_pg_pack->handle;
} else {
handle = lower_32_bits(args->map_device.handle);
@ -954,10 +955,17 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
goto map_err;
}
hdev->asic_funcs->mmu_invalidate_cache(hdev, false, *vm_type);
rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, false, *vm_type);
mutex_unlock(&ctx->mmu_lock);
if (rc) {
dev_err(hdev->dev,
"mapping handle %u failed due to MMU cache invalidation\n",
handle);
goto map_err;
}
ret_vaddr += phys_pg_pack->offset;
hnode->ptr = vm_type;
@ -1083,21 +1091,34 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr, bool ctx_free)
* at the loop end rather than for each iteration
*/
if (!ctx_free)
hdev->asic_funcs->mmu_invalidate_cache(hdev, true, *vm_type);
rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, true,
*vm_type);
mutex_unlock(&ctx->mmu_lock);
/*
* No point in maintaining the free VA block list if the context is
* closing as the list will be freed anyway
* If the context is closing we don't need to check for the MMU cache
* invalidation return code and update the VA free list as in this flow
* we invalidate the MMU cache outside of this unmap function and the VA
* free list will be freed anyway.
*/
if (!ctx_free) {
rc = add_va_block(hdev, va_range, vaddr,
vaddr + phys_pg_pack->total_size - 1);
int tmp_rc;
if (rc)
dev_err(hdev->dev,
"unmapping vaddr 0x%llx failed due to MMU cache invalidation\n",
vaddr);
tmp_rc = add_va_block(hdev, va_range, vaddr,
vaddr + phys_pg_pack->total_size - 1);
if (tmp_rc) {
dev_warn(hdev->dev,
"add va block failed for vaddr: 0x%llx\n",
vaddr);
if (!rc)
rc = tmp_rc;
}
}
atomic_dec(&phys_pg_pack->mapping_cnt);
@ -1108,7 +1129,7 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr, bool ctx_free)
dma_unmap_host_va(hdev, userptr);
}
return 0;
return rc;
mapping_cnt_err:
if (is_userptr)

View File

@ -183,6 +183,11 @@ static ssize_t soft_reset_store(struct device *dev,
goto out;
}
if (!hdev->supports_soft_reset) {
dev_err(hdev->dev, "Device does not support soft-reset\n");
goto out;
}
dev_warn(hdev->dev, "Soft-Reset requested through sysfs\n");
hl_device_reset(hdev, false, false);