From: NeilBrown Date: Mon, 27 Mar 2006 09:18:10 +0000 (-0800) Subject: [PATCH] md: Final stages of raid5 expand code X-Git-Tag: v2.6.17-rc1~289 X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=292695531ae4019bb15deedc121b218d1908b648;p=linux-2.6 [PATCH] md: Final stages of raid5 expand code This patch adds raid5_reshape and end_reshape which will start and finish the reshape processes. raid5_reshape is only enabled if CONFIG_MD_RAID5_RESHAPE is set, to discourage accidental use. Read the 'help' for the CONFIG_MD_RAID5_RESHAPE entry, and make sure that you have backups, just in case. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index ac43f98062..fd2aae150c 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -127,6 +127,32 @@ config MD_RAID5 If unsure, say Y. +config MD_RAID5_RESHAPE + bool "Support adding drives to a raid-5 array (experimental)" + depends on MD_RAID5 && EXPERIMENTAL + ---help--- + A RAID-5 set can be expanded by adding extra drives. This + requires "restriping" the array which means (almost) every + block must be written to a different place. + + This option allows such restriping to be done while the array + is online. However it is still EXPERIMENTAL code. It should + work, but please be sure that you have backups. + + You will need a version of mdadm newer than 2.3.1. During the + early stage of reshape there is a critical section where live data + is being over-written. A crash during this time needs extra care + for recovery. The newer mdadm takes a copy of the data in the + critical section and will restore it, if necessary, after a crash. + + The mdadm usage is e.g. + mdadm --grow /dev/md1 --raid-disks=6 + to grow '/dev/md1' to having 6 disks. + + Note: The array can only be expanded, not contracted. + There should be enough spares already present to make the new + array workable. 
+ config MD_RAID6 tristate "RAID-6 mode" depends on BLK_DEV_MD diff --git a/drivers/md/md.c b/drivers/md/md.c index 8e65986bc6..d169bc9646 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -158,11 +158,12 @@ static int start_readonly; */ static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); static atomic_t md_event_count; -static void md_new_event(mddev_t *mddev) +void md_new_event(mddev_t *mddev) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); } +EXPORT_SYMBOL_GPL(md_new_event); /* * Enables to iterate over all existing md arrays @@ -4467,7 +4468,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) -static void md_do_sync(mddev_t *mddev) +void md_do_sync(mddev_t *mddev) { mddev_t *mddev2; unsigned int currspeed = 0, @@ -4704,6 +4705,7 @@ static void md_do_sync(mddev_t *mddev) set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); } +EXPORT_SYMBOL_GPL(md_do_sync); /* diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 56cba8d3e3..b29135acb1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -331,6 +331,8 @@ static int grow_stripes(raid5_conf_t *conf, int num) } return 0; } + +#ifdef CONFIG_MD_RAID5_RESHAPE static int resize_stripes(raid5_conf_t *conf, int newsize) { /* Make all the stripes able to hold 'newsize' devices. @@ -451,7 +453,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) conf->pool_size = newsize; return err; } - +#endif static int drop_one_stripe(raid5_conf_t *conf) { @@ -1034,6 +1036,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in return 0; } +static void end_reshape(raid5_conf_t *conf); + static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) { int sectors_per_chunk = conf->chunk_size >> 9; @@ -1844,6 +1848,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (sector_nr >= max_sector) { /* just being told to finish up .. 
nothing much to do */ unplug_slaves(mddev); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { + end_reshape(conf); + return 0; + } if (mddev->curr_resync < max_sector) /* aborted */ bitmap_end_sync(mddev->bitmap, mddev->curr_resync, @@ -2464,6 +2472,116 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return 0; } +#ifdef CONFIG_MD_RAID5_RESHAPE +static int raid5_reshape(mddev_t *mddev, int raid_disks) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + int err; + mdk_rdev_t *rdev; + struct list_head *rtmp; + int spares = 0; + int added_devices = 0; + + if (mddev->degraded || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + if (conf->raid_disks > raid_disks) + return -EINVAL; /* Cannot shrink array yet */ + if (conf->raid_disks == raid_disks) + return 0; /* nothing to do */ + + /* Can only proceed if there are plenty of stripe_heads. + * We need a minimum of one full stripe,, and for sensible progress + * it is best to have about 4 times that. + * If we require 4 times, then the default 256 4K stripe_heads will + * allow for chunk sizes up to 256K, which is probably OK. + * If the chunk size is greater, user-space should request more + * stripe_heads first. + */ + if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { + printk(KERN_WARNING "raid5: reshape: not enough stripes. 
Needed %lu\n", + (mddev->chunk_size / STRIPE_SIZE)*4); + return -ENOSPC; + } + + ITERATE_RDEV(mddev, rdev, rtmp) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) + spares++; + if (conf->raid_disks + spares < raid_disks-1) + /* Not enough devices even to make a degraded array + * of that size + */ + return -EINVAL; + + err = resize_stripes(conf, raid_disks); + if (err) + return err; + + spin_lock_irq(&conf->device_lock); + conf->previous_raid_disks = conf->raid_disks; + mddev->raid_disks = conf->raid_disks = raid_disks; + conf->expand_progress = 0; + spin_unlock_irq(&conf->device_lock); + + /* Add some new drives, as many as will fit. + * We know there are enough to make the newly sized array work. + */ + ITERATE_RDEV(mddev, rdev, rtmp) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) { + if (raid5_add_disk(mddev, rdev)) { + char nm[20]; + set_bit(In_sync, &rdev->flags); + conf->working_disks++; + added_devices++; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); + } else + break; + } + + mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices; + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "%s_reshape"); + if (!mddev->sync_thread) { + mddev->recovery = 0; + spin_lock_irq(&conf->device_lock); + mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + conf->expand_progress = MaxSector; + spin_unlock_irq(&conf->device_lock); + return -EAGAIN; + } + md_wakeup_thread(mddev->sync_thread); + md_new_event(mddev); + return 0; +} +#endif + +static void end_reshape(raid5_conf_t *conf) +{ + struct block_device *bdev; + + conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1); + set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); + 
conf->mddev->changed = 1; + + bdev = bdget_disk(conf->mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + spin_lock_irq(&conf->device_lock); + conf->expand_progress = MaxSector; + spin_unlock_irq(&conf->device_lock); +} + static void raid5_quiesce(mddev_t *mddev, int state) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -2502,6 +2620,9 @@ static struct mdk_personality raid5_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, +#ifdef CONFIG_MD_RAID5_RESHAPE + .reshape = raid5_reshape, +#endif .quiesce = raid5_quiesce, }; diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index b6e0bcad84..9c77cde5a7 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -92,7 +92,8 @@ extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, extern void md_super_wait(mddev_t *mddev); extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct page *page, int rw); - +extern void md_do_sync(mddev_t *mddev); +extern void md_new_event(mddev_t *mddev); #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }