From ad689233bee854dced741c91aff12a8771a22f6f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 26 Jan 2007 14:08:52 -0800 Subject: [PATCH] [SCSI] libsas: Handle SCSI commands that complete with failure codes This patch moves the code that handles SAS failures out of the main EH function and into a separate function. It also detects commands that have no sas_task (i.e. they completed, but with error data) and sends them into scsi_error for processing. This allows us to handle SCSI errors (and enables auto-spinup as a side effect) instead of dropping them on the floor and falling into an infinite loop. It also requires the implementation of a device reset function, which the SAS failure code has been modified to employ for REQ_DEVICE_RESET. Signed-off-by: Darrick J. Wong Signed-off-by: James Bottomley --- drivers/scsi/libsas/sas_scsi_host.c | 148 +++++++++++++++++++--------- include/scsi/libsas.h | 1 + 2 files changed, 105 insertions(+), 44 deletions(-) diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 9ffe7605fb..eac1d2dde7 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -34,6 +34,7 @@ #include #include "../scsi_sas_internal.h" #include "../scsi_transport_api.h" +#include "../scsi_priv.h" #include #include @@ -396,54 +397,80 @@ static int sas_recover_I_T(struct domain_device *dev) return res; } -static int eh_reset_phy_helper(struct sas_phy *phy) +/* Find the sas_phy that's attached to this device */ +struct sas_phy *find_local_sas_phy(struct domain_device *dev) { - int tmf_resp; + struct domain_device *pdev = dev->parent; + struct ex_phy *exphy = NULL; + int i; + + /* Directly attached device */ + if (!pdev) + return dev->port->phy; - tmf_resp = sas_phy_reset(phy, 1); - if (tmf_resp) - SAS_DPRINTK("Hard reset of phy %d failed 0x%x\n", - phy->identify.phy_identifier, - tmf_resp); + /* Otherwise look in the expander */ + for (i = 0; i < pdev->ex_dev.num_phys; i++) + if (!memcmp(dev->sas_addr, + pdev->ex_dev.ex_phy[i].attached_sas_addr, + SAS_ADDR_SIZE)) { + exphy = &pdev->ex_dev.ex_phy[i]; + break; + } - return tmf_resp; + BUG_ON(!exphy); + return exphy->phy; } -void sas_scsi_recover_host(struct Scsi_Host *shost) +/* Attempt to send a target reset message to a device */ +int sas_eh_device_reset_handler(struct scsi_cmnd *cmd) +{ + struct domain_device *dev = cmd_to_domain_dev(cmd); + struct sas_phy *phy = find_local_sas_phy(dev); + int res; + + res = sas_phy_reset(phy, 1); + if (res) + SAS_DPRINTK("Device reset of %s failed 0x%x\n", + phy->dev.kobj.k_name, + res); + if (res == TMF_RESP_FUNC_SUCC || res == TMF_RESP_FUNC_COMPLETE) + return SUCCESS; + + return FAILED; +} + +/* Try to reset a device */ +static int try_to_reset_cmd_device(struct Scsi_Host *shost, + struct scsi_cmnd *cmd) +{ + if (!shost->hostt->eh_device_reset_handler) + return FAILED; + + return shost->hostt->eh_device_reset_handler(cmd); +} + +static int sas_eh_handle_sas_errors(struct Scsi_Host *shost, + struct list_head *work_q, + struct list_head *done_q) { - struct sas_ha_struct *ha = SHOST_TO_SAS_HA(shost); - unsigned long flags; - LIST_HEAD(error_q); struct scsi_cmnd *cmd, *n; enum task_disposition res = TASK_IS_DONE; int tmf_resp, need_reset; struct sas_internal *i = to_sas_internal(shost->transportt); - struct sas_phy *task_sas_phy = NULL; - - spin_lock_irqsave(shost->host_lock, flags); - list_splice_init(&shost->eh_cmd_q, &error_q); - spin_unlock_irqrestore(shost->host_lock, flags); - - SAS_DPRINTK("Enter %s\n", __FUNCTION__); + unsigned long flags; + struct sas_ha_struct *ha = SHOST_TO_SAS_HA(shost); - /* All tasks on this list were marked SAS_TASK_STATE_ABORTED - * by sas_scsi_timed_out() callback. - */ Again: - SAS_DPRINTK("going over list...\n"); - list_for_each_entry_safe(cmd, n, &error_q, eh_entry) { + list_for_each_entry_safe(cmd, n, work_q, eh_entry) { struct sas_task *task = TO_SAS_TASK(cmd); - list_del_init(&cmd->eh_entry); - if (!task) { - SAS_DPRINTK("%s: taskless cmd?!\n", __FUNCTION__); + if (!task) continue; - } + + list_del_init(&cmd->eh_entry); spin_lock_irqsave(&task->task_state_lock, flags); need_reset = task->task_state_flags & SAS_TASK_NEED_DEV_RESET; - if (need_reset) - task_sas_phy = task->dev->port->phy; spin_unlock_irqrestore(&task->task_state_lock, flags); SAS_DPRINTK("trying to find task 0x%p\n", task); @@ -457,14 +484,14 @@ Again: task); task->task_done(task); if (need_reset) - eh_reset_phy_helper(task_sas_phy); + try_to_reset_cmd_device(shost, cmd); continue; case TASK_IS_ABORTED: SAS_DPRINTK("%s: task 0x%p is aborted\n", __FUNCTION__, task); task->task_done(task); if (need_reset) - eh_reset_phy_helper(task_sas_phy); + try_to_reset_cmd_device(shost, cmd); continue; case TASK_IS_AT_LU: SAS_DPRINTK("task 0x%p is at LU: lu recover\n", task); @@ -476,8 +503,8 @@ Again: cmd->device->lun); task->task_done(task); if (need_reset) - eh_reset_phy_helper(task_sas_phy); - sas_scsi_clear_queue_lu(&error_q, cmd); + try_to_reset_cmd_device(shost, cmd); + sas_scsi_clear_queue_lu(work_q, cmd); goto Again; } /* fallthrough */ @@ -491,8 +518,8 @@ Again: SAS_ADDR(task->dev->sas_addr)); task->task_done(task); if (need_reset) - eh_reset_phy_helper(task_sas_phy); - sas_scsi_clear_queue_I_T(&error_q, task->dev); + try_to_reset_cmd_device(shost, cmd); + sas_scsi_clear_queue_I_T(work_q, task->dev); goto Again; } /* Hammer time :-) */ @@ -506,8 +533,8 @@ Again: "succeeded\n", port->id); task->task_done(task); if (need_reset) - eh_reset_phy_helper(task_sas_phy); - sas_scsi_clear_queue_port(&error_q, + try_to_reset_cmd_device(shost, cmd); + sas_scsi_clear_queue_port(work_q, port); goto Again; } @@ -520,7 +547,7 @@ Again: "succeeded\n"); task->task_done(task); if (need_reset) - eh_reset_phy_helper(task_sas_phy); + try_to_reset_cmd_device(shost, cmd); goto out; } } @@ -535,21 +562,53 @@ Again: task->task_done(task); if (need_reset) - eh_reset_phy_helper(task_sas_phy); + try_to_reset_cmd_device(shost, cmd); goto clear_q; } } out: - scsi_eh_flush_done_q(&ha->eh_done_q); - SAS_DPRINTK("--- Exit %s\n", __FUNCTION__); - return; + return list_empty(work_q); clear_q: SAS_DPRINTK("--- Exit %s -- clear_q\n", __FUNCTION__); - list_for_each_entry_safe(cmd, n, &error_q, eh_entry) { + list_for_each_entry_safe(cmd, n, work_q, eh_entry) { struct sas_task *task = TO_SAS_TASK(cmd); list_del_init(&cmd->eh_entry); task->task_done(task); } + return list_empty(work_q); +} + +void sas_scsi_recover_host(struct Scsi_Host *shost) +{ + struct sas_ha_struct *ha = SHOST_TO_SAS_HA(shost); + unsigned long flags; + LIST_HEAD(eh_work_q); + + spin_lock_irqsave(shost->host_lock, flags); + list_splice_init(&shost->eh_cmd_q, &eh_work_q); + spin_unlock_irqrestore(shost->host_lock, flags); + + SAS_DPRINTK("Enter %s\n", __FUNCTION__); + /* + * Deal with commands that still have SAS tasks (i.e. they didn't + * complete via the normal sas_task completion mechanism) + */ + if (sas_eh_handle_sas_errors(shost, &eh_work_q, &ha->eh_done_q)) + goto out; + + /* + * Now deal with SCSI commands that completed ok but have a an error + * code (and hopefully sense data) attached. This is roughly what + * scsi_unjam_host does, but we skip scsi_eh_abort_cmds because any + * command we see here has no sas_task and is thus unknown to the HA. + */ + if (!scsi_eh_get_sense(&eh_work_q, &ha->eh_done_q)) + scsi_eh_ready_devs(shost, &eh_work_q, &ha->eh_done_q); + +out: + scsi_eh_flush_done_q(&ha->eh_done_q); + SAS_DPRINTK("--- Exit %s\n", __FUNCTION__); + return; } enum scsi_eh_timer_return sas_scsi_timed_out(struct scsi_cmnd *cmd) @@ -914,3 +973,4 @@ EXPORT_SYMBOL_GPL(__sas_task_abort); EXPORT_SYMBOL_GPL(sas_task_abort); EXPORT_SYMBOL_GPL(sas_phy_reset); EXPORT_SYMBOL_GPL(sas_phy_enable); +EXPORT_SYMBOL_GPL(sas_eh_device_reset_handler); diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index ca393929c1..b200233cc6 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -660,5 +660,6 @@ void sas_init_dev(struct domain_device *); void sas_task_abort(struct sas_task *); int __sas_task_abort(struct sas_task *); +int sas_eh_device_reset_handler(struct scsi_cmnd *cmd); #endif /* _SASLIB_H_ */ -- 2.39.5