* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+/*
+ * BITMAP UNPLUGGING:
+ *
+ * The sequencing for updating the bitmap reliably is a little
+ * subtle (and I got it wrong the first time) so it deserves some
+ * explanation.
+ *
+ * We group bitmap updates into batches. Each batch has a number.
+ * We may write out several batches at once, but that isn't very important.
+ * conf->seq_write is the number of the last batch successfully written.
+ * conf->seq_flush is the number of the last batch that was closed to
+ * new additions.
+ * When we discover that we will need to write to any block in a stripe
+ * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
+ * the number of the batch it will be in. This is seq_flush+1.
+ * When we are ready to do a write, if that batch hasn't been written yet,
+ * we plug the array and queue the stripe for later.
+ * When an unplug happens, we increment seq_flush, thus closing the current
+ * batch.
+ * When we notice that seq_flush > seq_write, we write out all pending updates
+ * to the bitmap, and advance seq_write to where seq_flush was.
+ * This may occasionally write a bit out twice, but is sure never to
+ * miss any bits.
+ */
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/highmem.h>
BUG_ON(!list_empty(&sh->lru));
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
- if (test_bit(STRIPE_DELAYED, &sh->state))
+ if (test_bit(STRIPE_DELAYED, &sh->state)) {
list_add_tail(&sh->lru, &conf->delayed_list);
- else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
- conf->seq_write == sh->bm_seq)
+ blk_plug_device(conf->mddev->queue);
+ } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ sh->bm_seq - conf->seq_write > 0) {
list_add_tail(&sh->lru, &conf->bitmap_list);
- else {
+ blk_plug_device(conf->mddev->queue);
+ } else {
clear_bit(STRIPE_BIT_DELAY, &sh->state);
list_add_tail(&sh->lru, &conf->handle_list);
}
< (conf->max_nr_stripes *3/4)
|| !conf->inactive_blocked),
conf->device_lock,
- unplug_slaves(conf->mddev)
+ raid5_unplug_device(conf->mddev->queue)
);
conf->inactive_blocked = 0;
} else
} else {
if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
- if (list_empty(&sh->lru))
+ if (list_empty(&sh->lru) &&
+ !test_bit(STRIPE_EXPANDING, &sh->state))
BUG();
list_del_init(&sh->lru);
}
raid5_conf_t *conf = sh->raid_conf;
int disks = sh->disks, i;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+ char b[BDEVNAME_SIZE];
+ mdk_rdev_t *rdev;
if (bi->bi_size)
return 1;
set_bit(R5_UPTODATE, &sh->dev[i].flags);
#endif
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
- printk(KERN_INFO "raid5: read error corrected!!\n");
+ rdev = conf->disks[i].rdev;
+ printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
+ mdname(conf->mddev), STRIPE_SECTORS,
+ (unsigned long long)sh->sector + rdev->data_offset,
+ bdevname(rdev->bdev, b));
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
}
if (atomic_read(&conf->disks[i].rdev->read_errors))
atomic_set(&conf->disks[i].rdev->read_errors, 0);
} else {
+ const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
int retry = 0;
+ rdev = conf->disks[i].rdev;
+
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
- atomic_inc(&conf->disks[i].rdev->read_errors);
+ atomic_inc(&rdev->read_errors);
if (conf->mddev->degraded)
- printk(KERN_WARNING "raid5: read error not correctable.\n");
+ printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)sh->sector + rdev->data_offset,
+ bdn);
else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
/* Oh, no!!! */
- printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
- else if (atomic_read(&conf->disks[i].rdev->read_errors)
+ printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)sh->sector + rdev->data_offset,
+ bdn);
+ else if (atomic_read(&rdev->read_errors)
> conf->max_nr_stripes)
printk(KERN_WARNING
- "raid5: Too many read errors, failing device.\n");
+ "raid5:%s: Too many read errors, failing device %s.\n",
+ mdname(conf->mddev), bdn);
else
retry = 1;
if (retry)
else {
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
- md_error(conf->mddev, conf->disks[i].rdev);
+ md_error(conf->mddev, rdev);
}
}
rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
struct stripe_head *sh = bi->bi_private;
raid5_conf_t *conf = sh->raid_conf;
int disks = sh->disks, i;
- unsigned long flags;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
if (bi->bi_size)
return 0;
}
- spin_lock_irqsave(&conf->device_lock, flags);
if (!uptodate)
md_error(conf->mddev, conf->disks[i].rdev);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
- __release_stripe(conf, sh);
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ release_stripe(sh);
return 0;
}
PRINTK("raid5: error called\n");
if (!test_bit(Faulty, &rdev->flags)) {
- mddev->sb_dirty = 1;
- if (test_bit(In_sync, &rdev->flags)) {
- conf->working_disks--;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
- conf->failed_disks++;
- clear_bit(In_sync, &rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery was running, make sure it aborts.
*/
printk (KERN_ALERT
"raid5: Disk failure on %s, disabling device."
" Operation continuing on %d devices\n",
- bdevname(rdev->bdev,b), conf->working_disks);
+ bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
}
}
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
- if (sh->dev[i].written) BUG();
+ BUG_ON(sh->dev[i].written);
sh->dev[i].written = chosen;
}
break;
(unsigned long long)sh->sector, dd_idx);
if (conf->mddev->bitmap && firstwrite) {
- sh->bm_seq = conf->seq_write;
bitmap_startwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS, 0);
+ sh->bm_seq = conf->seq_flush+1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
{
int sectors_per_chunk = conf->chunk_size >> 9;
- sector_t x = stripe;
int pd_idx, dd_idx;
- int chunk_offset = sector_div(x, sectors_per_chunk);
- stripe = x;
+ int chunk_offset = sector_div(stripe, sectors_per_chunk);
+
raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
+ chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
return pd_idx;
return ret;
}
-static inline void raid5_plug_device(raid5_conf_t *conf)
+static int raid5_congested(void *data, int bits)
{
- spin_lock_irq(&conf->device_lock);
- blk_plug_device(conf->mddev->queue);
- spin_unlock_irq(&conf->device_lock);
+ mddev_t *mddev = data;
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+
+ /* No difference between reads and writes. Just check
+ * how busy the stripe_cache is: report congested (return 1)
+ * when conf->inactive_blocked is set, when conf->quiesce is
+ * set, or when conf->inactive_list is empty; otherwise
+ * return 0. 'data' is the mddev_t pointer; 'bits' is not
+ * examined.
+ */
+ if (conf->inactive_blocked)
+ return 1;
+ if (conf->quiesce)
+ return 1;
+ if (list_empty_careful(&conf->inactive_list))
+ return 1;
+
+ return 0;
}
static int make_request(request_queue_t *q, struct bio * bi)
goto retry;
}
finish_wait(&conf->wait_for_overlap, &w);
- raid5_plug_device(conf);
handle_stripe(sh, NULL);
release_stripe(sh);
} else {
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes)==0);
mddev->reshape_position = conf->expand_progress;
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
- wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
+ wait_event(mddev->sb_wait, mddev->flags == 0 ||
kthread_should_stop());
spin_lock_irq(&conf->device_lock);
conf->expand_lo = mddev->reshape_position;
while (1) {
struct list_head *first;
- if (conf->seq_flush - conf->seq_write > 0) {
+ if (conf->seq_flush != conf->seq_write) {
int seq = conf->seq_flush;
spin_unlock_irq(&conf->device_lock);
bitmap_unplug(mddev->bitmap);
mdk_rdev_t *rdev;
struct disk_info *disk;
struct list_head *tmp;
+ int working_disks = 0;
if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
printk(KERN_INFO "raid5: device %s operational as raid"
" disk %d\n", bdevname(rdev->bdev,b),
raid_disk);
- conf->working_disks++;
+ working_disks++;
}
}
/*
* 0 for a fully functional array, 1 or 2 for a degraded array.
*/
- mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
+ mddev->degraded = conf->raid_disks - working_disks;
conf->mddev = mddev;
conf->chunk_size = mddev->chunk_size;
conf->level = mddev->level;
if (mddev->degraded > conf->max_degraded) {
printk(KERN_ERR "raid5: not enough operational devices for %s"
" (%d/%d failed)\n",
- mdname(mddev), conf->failed_disks, conf->raid_disks);
+ mdname(mddev), mddev->degraded, conf->raid_disks);
goto abort;
}
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"%s_reshape");
- /* FIXME if md_register_thread fails?? */
- md_wakeup_thread(mddev->sync_thread);
-
}
/* read-ahead size must cover two whole stripes, which is
mddev->queue->unplug_fn = raid5_unplug_device;
mddev->queue->issue_flush_fn = raid5_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = raid5_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
+
mddev->array_size = mddev->size * (conf->previous_raid_disks -
conf->max_degraded);
int i;
seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
- seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
+ seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++)
seq_printf (seq, "%s",
conf->disks[i].rdev &&
printk("(conf==NULL)\n");
return;
}
- printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
- conf->working_disks, conf->failed_disks);
+ printk(" --- rd:%d wd:%d\n", conf->raid_disks,
+ conf->raid_disks - conf->mddev->degraded);
for (i = 0; i < conf->raid_disks; i++) {
char b[BDEVNAME_SIZE];
tmp = conf->disks + i;
if (tmp->rdev
&& !test_bit(Faulty, &tmp->rdev->flags)
- && !test_bit(In_sync, &tmp->rdev->flags)) {
+ && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded--;
- conf->failed_disks--;
- conf->working_disks++;
- set_bit(In_sync, &tmp->rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
}
}
print_raid5_conf(conf);
struct list_head *rtmp;
int spares = 0;
int added_devices = 0;
+ unsigned long flags;
if (mddev->degraded ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
if (raid5_add_disk(mddev, rdev)) {
char nm[20];
set_bit(In_sync, &rdev->flags);
- conf->working_disks++;
added_devices++;
rdev->recovery_offset = 0;
sprintf(nm, "rd%d", rdev->raid_disk);
break;
}
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
mddev->raid_disks = conf->raid_disks;
mddev->reshape_position = 0;
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);