From: NeilBrown Date: Tue, 4 Mar 2008 22:29:35 +0000 (-0800) Subject: md: fix possible raid1/raid10 deadlock on read error during resync X-Git-Tag: v2.6.25-rc4~14 X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1c830532f6b44d10a1743ccd00e990c6b83396f5;p=linux-2.6 md: fix possible raid1/raid10 deadlock on read error during resync Thanks to K.Tanaka and the scsi fault injection framework, here is a fix for another possible deadlock in raid1/raid10 error handing. If a read request returns an error while a resync is happening and a resync request is pending, the attempt to fix the error will block until the resync progresses, and the resync will block until the read request completes. Thus a deadlock. This patch fixes the problem. Cc: "K.Tanaka" Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 38f076a340..ff61b30912 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -704,13 +704,20 @@ static void freeze_array(conf_t *conf) /* stop syncio and normal IO and wait for everything to * go quite. * We increment barrier and nr_waiting, and then - * wait until barrier+nr_pending match nr_queued+2 + * wait until nr_pending match nr_queued+1 + * This is called in the context of one normal IO request + * that has failed. Thus any sync request that might be pending + * will be blocked by nr_pending, and we need to wait for + * pending IO requests to complete or be queued for re-try. + * Thus the number queued (nr_queued) plus this request (1) + * must match the number of pending IOs (nr_pending) before + * we continue. */ spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, - conf->barrier+conf->nr_pending == conf->nr_queued+2, + conf->nr_pending == conf->nr_queued+1, conf->resync_lock, ({ flush_pending_writes(conf); raid1_unplug(conf->mddev->queue); })); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6c486d839c..8e5671d2f3 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -747,13 +747,20 @@ static void freeze_array(conf_t *conf) /* stop syncio and normal IO and wait for everything to * go quiet. * We increment barrier and nr_waiting, and then - * wait until barrier+nr_pending match nr_queued+2 + * wait until nr_pending match nr_queued+1 + * This is called in the context of one normal IO request + * that has failed. Thus any sync request that might be pending + * will be blocked by nr_pending, and we need to wait for + * pending IO requests to complete or be queued for re-try. + * Thus the number queued (nr_queued) plus this request (1) + * must match the number of pending IOs (nr_pending) before + * we continue. */ spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, - conf->barrier+conf->nr_pending == conf->nr_queued+2, + conf->nr_pending == conf->nr_queued+1, conf->resync_lock, ({ flush_pending_writes(conf); raid10_unplug(conf->mddev->queue); }));