err.no Git - linux-2.6/blobdiff - drivers/md/raid10.c
[PATCH] drivers/md/md.c: make md_new_event() static
[linux-2.6] / drivers / md / raid10.c
index 1fa70c34b7d2bc6b4d3d11d19bf67b12925d6194..8e6f6dfddb2b830c11cb235841f0160bed232dc2 100644 (file)
@@ -59,10 +59,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
        int size = offsetof(struct r10bio_s, devs[conf->copies]);
 
        /* allocate a r10bio with room for raid_disks entries in the bios array */
-       r10_bio = kmalloc(size, gfp_flags);
-       if (r10_bio)
-               memset(r10_bio, 0, size);
-       else
+       r10_bio = kzalloc(size, gfp_flags);
+       if (!r10_bio)
                unplug_slaves(conf->mddev);
 
        return r10_bio;
@@ -134,10 +132,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 
 out_free_pages:
        for ( ; i > 0 ; i--)
-               __free_page(bio->bi_io_vec[i-1].bv_page);
+               safe_put_page(bio->bi_io_vec[i-1].bv_page);
        while (j--)
                for (i = 0; i < RESYNC_PAGES ; i++)
-                       __free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
+                       safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
        j = -1;
 out_free_bio:
        while ( ++j < nalloc )
@@ -157,7 +155,7 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
                struct bio *bio = r10bio->devs[j].bio;
                if (bio) {
                        for (i = 0; i < RESYNC_PAGES; i++) {
-                               __free_page(bio->bi_io_vec[i].bv_page);
+                               safe_put_page(bio->bi_io_vec[i].bv_page);
                                bio->bi_io_vec[i].bv_page = NULL;
                        }
                        bio_put(bio);
@@ -172,7 +170,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
 
        for (i = 0; i < conf->copies; i++) {
                struct bio **bio = & r10_bio->devs[i].bio;
-               if (*bio)
+               if (*bio && *bio != IO_BLOCKED)
                        bio_put(*bio);
                *bio = NULL;
        }
@@ -209,6 +207,7 @@ static void reschedule_retry(r10bio_t *r10_bio)
 
        spin_lock_irqsave(&conf->device_lock, flags);
        list_add(&r10_bio->retry_list, &conf->retry_list);
+       conf->nr_queued ++;
        spin_unlock_irqrestore(&conf->device_lock, flags);
 
        md_wakeup_thread(mddev->thread);
@@ -254,9 +253,9 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
        /*
         * this branch is our 'one mirror IO has finished' event handler:
         */
-       if (!uptodate)
-               md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-       else
+       update_head_pos(slot, r10_bio);
+
+       if (uptodate) {
                /*
                 * Set R10BIO_Uptodate in our master bio, so that
                 * we will return a good error code to the higher
@@ -267,15 +266,8 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
                 * wait for the 'master' bio.
                 */
                set_bit(R10BIO_Uptodate, &r10_bio->state);
-
-       update_head_pos(slot, r10_bio);
-
-       /*
-        * we have only one bio on the read side
-        */
-       if (uptodate)
                raid_end_bio_io(r10_bio);
-       else {
+       } else {
                /*
                 * oops, read error:
                 */
@@ -506,6 +498,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
                disk = r10_bio->devs[slot].devnum;
 
                while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
+                      r10_bio->devs[slot].bio == IO_BLOCKED ||
                       !test_bit(In_sync, &rdev->flags)) {
                        slot++;
                        if (slot == conf->copies) {
@@ -523,6 +516,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
        slot = 0;
        disk = r10_bio->devs[slot].devnum;
        while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
+              r10_bio->devs[slot].bio == IO_BLOCKED ||
               !test_bit(In_sync, &rdev->flags)) {
                slot ++;
                if (slot == conf->copies) {
@@ -543,6 +537,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 
 
                if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
+                   r10_bio->devs[nslot].bio == IO_BLOCKED ||
                    !test_bit(In_sync, &rdev->flags))
                        continue;
 
@@ -714,6 +709,33 @@ static void allow_barrier(conf_t *conf)
        wake_up(&conf->wait_barrier);
 }
 
+static void freeze_array(conf_t *conf)
+{
+       /* Stop sync IO and normal IO and wait for the array to go quiet:
+        * under resync_lock, bump barrier and nr_waiting, then sleep on
+        * wait_barrier (raid10_unplug() is passed as the wait command)
+        * until barrier+nr_pending == nr_queued+2.  See unfreeze_array().
+        */
+       spin_lock_irq(&conf->resync_lock);
+       conf->barrier++;
+       conf->nr_waiting++;
+       wait_event_lock_irq(conf->wait_barrier,
+                           conf->barrier+conf->nr_pending == conf->nr_queued+2,
+                           conf->resync_lock,
+                           raid10_unplug(conf->mddev->queue));
+       spin_unlock_irq(&conf->resync_lock);
+}
+
+static void unfreeze_array(conf_t *conf)
+{
+       /* undo freeze_array(): drop barrier/nr_waiting, wake wait_barrier */
+       spin_lock_irq(&conf->resync_lock);
+       conf->barrier--;
+       conf->nr_waiting--;
+       wake_up(&conf->wait_barrier);
+       spin_unlock_irq(&conf->resync_lock);
+}
+
 static int make_request(request_queue_t *q, struct bio * bio)
 {
        mddev_t *mddev = q->queuedata;
@@ -1083,7 +1105,6 @@ abort:
 
 static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 {
-       int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
        r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
        conf_t *conf = mddev_to_conf(r10_bio->mddev);
        int i,d;
@@ -1098,7 +1119,10 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
                BUG();
        update_head_pos(i, r10_bio);
        d = r10_bio->devs[i].devnum;
-       if (!uptodate)
+
+       if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+               set_bit(R10BIO_Uptodate, &r10_bio->state);
+       else if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
                md_error(r10_bio->mddev,
                         conf->mirrors[d].rdev);
 
@@ -1188,25 +1212,30 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
        fbio = r10_bio->devs[i].bio;
 
        /* now find blocks with errors */
-       for (i=first+1 ; i < conf->copies ; i++) {
-               int vcnt, j, d;
+       for (i=0 ; i < conf->copies ; i++) {
+               int  j, d;
+               int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
 
-               if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
-                       continue;
-               /* We know that the bi_io_vec layout is the same for
-                * both 'first' and 'i', so we just compare them.
-                * All vec entries are PAGE_SIZE;
-                */
                tbio = r10_bio->devs[i].bio;
-               vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
-               for (j = 0; j < vcnt; j++)
-                       if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
-                                  page_address(tbio->bi_io_vec[j].bv_page),
-                                  PAGE_SIZE))
-                               break;
-               if (j == vcnt)
+
+               if (tbio->bi_end_io != end_sync_read)
                        continue;
-               mddev->resync_mismatches += r10_bio->sectors;
+               if (i == first)
+                       continue;
+               if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
+                       /* We know that the bi_io_vec layout is the same for
+                        * both 'first' and 'i', so we just compare them.
+                        * All vec entries are PAGE_SIZE;
+                        */
+                       for (j = 0; j < vcnt; j++)
+                               if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
+                                          page_address(tbio->bi_io_vec[j].bv_page),
+                                          PAGE_SIZE))
+                                       break;
+                       if (j == vcnt)
+                               continue;
+                       mddev->resync_mismatches += r10_bio->sectors;
+               }
                if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
                        /* Don't fix anything. */
                        continue;
@@ -1287,7 +1316,10 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 
        atomic_inc(&conf->mirrors[d].rdev->nr_pending);
        md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
-       generic_make_request(wbio);
+       if (test_bit(R10BIO_Uptodate, &r10_bio->state))
+               generic_make_request(wbio);
+       else
+               bio_endio(wbio, wbio->bi_size, -EIO);
 }
 
 
@@ -1338,6 +1370,7 @@ static void raid10d(mddev_t *mddev)
                        break;
                r10_bio = list_entry(head->prev, r10bio_t, retry_list);
                list_del(head->prev);
+               conf->nr_queued--;
                spin_unlock_irqrestore(&conf->device_lock, flags);
 
                mddev = r10_bio->mddev;
@@ -1350,8 +1383,95 @@ static void raid10d(mddev_t *mddev)
                        unplug = 1;
                } else {
                        int mirror;
+                       /* we got a read error. Maybe the drive is bad.  Maybe just
+                        * the block and we can fix it.
+                        * We freeze all other IO, and try reading the block from
+                        * other devices.  When we find one, we re-write
+                        * and check if that fixes the read error.
+                        * This is all done synchronously while the array is
+                        * frozen.
+                        */
+                       int sect = 0; /* Offset from r10_bio->sector */
+                       int sectors = r10_bio->sectors;
+                       freeze_array(conf);
+                       if (mddev->ro == 0) while(sectors) {
+                               int s = sectors;
+                               int sl = r10_bio->read_slot;
+                               int success = 0;
+
+                               if (s > (PAGE_SIZE>>9))
+                                       s = PAGE_SIZE >> 9;
+
+                               do {
+                                       int d = r10_bio->devs[sl].devnum;
+                                       rdev = conf->mirrors[d].rdev;
+                                       if (rdev &&
+                                           test_bit(In_sync, &rdev->flags) &&
+                                           sync_page_io(rdev->bdev,
+                                                        r10_bio->devs[sl].addr +
+                                                        sect + rdev->data_offset,
+                                                        s<<9,
+                                                        conf->tmppage, READ))
+                                               success = 1;
+                                       else {
+                                               sl++;
+                                               if (sl == conf->copies)
+                                                       sl = 0;
+                                       }
+                               } while (!success && sl != r10_bio->read_slot);
+
+                               if (success) {
+                                       int start = sl;
+                                       /* write it back and re-read */
+                                       while (sl != r10_bio->read_slot) {
+                                               int d;
+                                               if (sl==0)
+                                                       sl = conf->copies;
+                                               sl--;
+                                               d = r10_bio->devs[sl].devnum;
+                                               rdev = conf->mirrors[d].rdev;
+                                               if (rdev &&
+                                                   test_bit(In_sync, &rdev->flags)) {
+                                                       if (sync_page_io(rdev->bdev,
+                                                                        r10_bio->devs[sl].addr +
+                                                                        sect + rdev->data_offset,
+                                                                        s<<9, conf->tmppage, WRITE) == 0)
+                                                               /* Well, this device is dead */
+                                                               md_error(mddev, rdev);
+                                               }
+                                       }
+                                       sl = start;
+                                       while (sl != r10_bio->read_slot) {
+                                               int d;
+                                               if (sl==0)
+                                                       sl = conf->copies;
+                                               sl--;
+                                               d = r10_bio->devs[sl].devnum;
+                                               rdev = conf->mirrors[d].rdev;
+                                               if (rdev &&
+                                                   test_bit(In_sync, &rdev->flags)) {
+                                                       if (sync_page_io(rdev->bdev,
+                                                                        r10_bio->devs[sl].addr +
+                                                                        sect + rdev->data_offset,
+                                                                        s<<9, conf->tmppage, READ) == 0)
+                                                               /* Well, this device is dead */
+                                                               md_error(mddev, rdev);
+                                               }
+                                       }
+                               } else {
+                                       /* Cannot read from anywhere -- bye bye array */
+                                       md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev);
+                                       break;
+                               }
+                               sectors -= s;
+                               sect += s;
+                       }
+
+                       unfreeze_array(conf);
+
                        bio = r10_bio->devs[r10_bio->read_slot].bio;
-                       r10_bio->devs[r10_bio->read_slot].bio = NULL;
+                       r10_bio->devs[r10_bio->read_slot].bio =
+                               mddev->ro ? IO_BLOCKED : NULL;
                        bio_put(bio);
                        mirror = read_balance(conf, r10_bio);
                        if (mirror == -1) {
@@ -1566,8 +1686,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                                for (j=0; j<conf->copies;j++) {
                                        int d = r10_bio->devs[j].devnum;
                                        if (conf->mirrors[d].rdev == NULL ||
-                                           test_bit(Faulty, &conf->mirrors[d].rdev->flags))
+                                           test_bit(Faulty, &conf->mirrors[d].rdev->flags)) {
                                                still_degraded = 1;
+                                               break;
+                                       }
                                }
                                must_sync = bitmap_start_sync(mddev->bitmap, sect,
                                                              &sync_blocks, still_degraded);
@@ -1775,11 +1897,11 @@ static int run(mddev_t *mddev)
        int nc, fc;
        sector_t stride, size;
 
-       if (mddev->level != 10) {
-               printk(KERN_ERR "raid10: %s: raid level not set correctly... (%d)\n",
-                      mdname(mddev), mddev->level);
-               goto out;
+       if (mddev->chunk_size == 0) {
+               printk(KERN_ERR "md/raid10: non-zero chunk size required.\n");
+               return -EINVAL;
        }
+
        nc = mddev->layout & 255;
        fc = (mddev->layout >> 8) & 255;
        if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
@@ -1793,22 +1915,24 @@ static int run(mddev_t *mddev)
         * bookkeeping area. [whatever we allocate in run(),
         * should be freed in stop()]
         */
-       conf = kmalloc(sizeof(conf_t), GFP_KERNEL);
+       conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
        mddev->private = conf;
        if (!conf) {
                printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
                        mdname(mddev));
                goto out;
        }
-       memset(conf, 0, sizeof(*conf));
-       conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
+       conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
                                 GFP_KERNEL);
        if (!conf->mirrors) {
                printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
                       mdname(mddev));
                goto out_free_conf;
        }
-       memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
+
+       conf->tmppage = alloc_page(GFP_KERNEL);
+       if (!conf->tmppage)
+               goto out_free_conf;
 
        conf->near_copies = nc;
        conf->far_copies = fc;
@@ -1905,7 +2029,7 @@ static int run(mddev_t *mddev)
         * maybe...
         */
        {
-               int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE;
+               int stripe = conf->raid_disks * mddev->chunk_size / PAGE_SIZE;
                stripe /= conf->near_copies;
                if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
                        mddev->queue->backing_dev_info.ra_pages = 2* stripe;
@@ -1918,6 +2042,7 @@ static int run(mddev_t *mddev)
 out_free_conf:
        if (conf->r10bio_pool)
                mempool_destroy(conf->r10bio_pool);
+       safe_put_page(conf->tmppage);
        kfree(conf->mirrors);
        kfree(conf);
        mddev->private = NULL;
@@ -1961,9 +2086,10 @@ static void raid10_quiesce(mddev_t *mddev, int state)
        }
 }
 
-static mdk_personality_t raid10_personality =
+static struct mdk_personality raid10_personality =
 {
        .name           = "raid10",
+       .level          = 10,
        .owner          = THIS_MODULE,
        .make_request   = make_request,
        .run            = run,
@@ -1979,15 +2105,16 @@ static mdk_personality_t raid10_personality =
 
 static int __init raid_init(void)
 {
-       return register_md_personality(RAID10, &raid10_personality);
+       return register_md_personality(&raid10_personality);
 }
 
 static void raid_exit(void)
 {
-       unregister_md_personality(RAID10);
+       unregister_md_personality(&raid10_personality);
 }
 
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-9"); /* RAID10 */
+MODULE_ALIAS("md-level-10");