From: Dan Williams Date: Sat, 28 Jun 2008 04:44:04 +0000 (-0700) Subject: md: resolve external metadata handling deadlock in md_allow_write X-Git-Tag: v2.6.27-rc1~957^2~27^2 X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b5470dc5fc18a8ff6517c3bb538d1479e58ecb02;p=linux-2.6 md: resolve external metadata handling deadlock in md_allow_write md_allow_write() marks the metadata dirty while holding mddev->lock and then waits for the write to complete. For externally managed metadata this causes a deadlock as userspace needs to take the lock to communicate that the metadata update has completed. Change md_allow_write() in the 'external' case to start the 'mark active' operation and then return -EAGAIN. The expected side effects while waiting for userspace to write 'active' to 'array_state' are holding off reshape (code currently handles -ENOMEM), cause some 'stripe_cache_size' change requests to fail, cause some GET_BITMAP_FILE ioctl requests to fall back to GFP_NOIO, and cause updates to 'raid_disks' to fail. Except for 'stripe_cache_size' changes these failures can be mitigated by coordinating with mdmon. md_write_start() still prevents writes from occurring until the metadata handler has had a chance to take action as it unconditionally waits for MD_CHANGE_CLEAN to be cleared. [neilb@suse.de: return -EAGAIN, try GFP_NOIO] Signed-off-by: Dan Williams --- diff --git a/drivers/md/md.c b/drivers/md/md.c index df1230af02..43d033d9a0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4172,9 +4172,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) char *ptr, *buf = NULL; int err = -ENOMEM; - md_allow_write(mddev); + if (md_allow_write(mddev)) + file = kmalloc(sizeof(*file), GFP_NOIO); + else + file = kmalloc(sizeof(*file), GFP_KERNEL); - file = kmalloc(sizeof(*file), GFP_KERNEL); if (!file) goto out; @@ -5667,15 +5669,18 @@ void md_write_end(mddev_t *mddev) * may proceed without blocking. It is important to call this before * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. + * + * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock + * is dropped, so return -EAGAIN after notifying userspace. */ -void md_allow_write(mddev_t *mddev) +int md_allow_write(mddev_t *mddev) { if (!mddev->pers) - return; + return 0; if (mddev->ro) - return; + return 0; if (!mddev->pers->sync_request) - return; + return 0; spin_lock_irq(&mddev->write_lock); if (mddev->in_sync) { @@ -5686,14 +5691,14 @@ void md_allow_write(mddev_t *mddev) mddev->safemode = 1; spin_unlock_irq(&mddev->write_lock); md_update_sb(mddev, 0); - sysfs_notify(&mddev->kobj, NULL, "array_state"); - /* wait for the dirty state to be recorded in the metadata */ - wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && - !test_bit(MD_CHANGE_PENDING, &mddev->flags)); } else spin_unlock_irq(&mddev->write_lock); + + if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + return -EAGAIN; + else + return 0; } EXPORT_SYMBOL_GPL(md_allow_write); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f05d5983ef..491dc2d4ad 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2136,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev) conf_t *conf = mddev_to_conf(mddev); int cnt, raid_disks; unsigned long flags; - int d, d2; + int d, d2, err; /* Cannot change chunk_size, layout, or level */ if (mddev->chunk_size != mddev->new_chunk || @@ -2148,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev) return -EINVAL; } - md_allow_write(mddev); + err = md_allow_write(mddev); + if (err) + return err; raid_disks = mddev->raid_disks + mddev->delta_disks; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 442622067c..8f4c70a532 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -911,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; - int err = 0; + int err; struct kmem_cache *sc; int i; if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ - md_allow_write(conf->mddev); + err = md_allow_write(conf->mddev); + if (err) + return err; /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], @@ -3843,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) { raid5_conf_t *conf = mddev_to_conf(mddev); unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; if (!conf) @@ -3858,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) else break; } - md_allow_write(mddev); + err = md_allow_write(mddev); + if (err) + return err; while (new > conf->max_nr_stripes) { if (grow_one_stripe(conf)) conf->max_nr_stripes++; diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index b7386ae9d2..dc0e3fcb9f 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct page *page, int rw); extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); -extern void md_allow_write(mddev_t *mddev); +extern int md_allow_write(mddev_t *mddev); extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); #endif /* CONFIG_MD */