md: replace STRIPE_OP_CHECK with 'check_states'

[linux-2.6] / drivers / md / raid5.c
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index 4efec467e2f178f13860c53ee83b0e2dd6d15d8f..544e1600f20828ed24532787ef06b14b69456803 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -63,6 +63,7 @@
  #define STRIPE_SHIFT           (PAGE_SHIFT - 9)
  #define STRIPE_SECTORS         (STRIPE_SIZE>>9)
  #define        IO_THRESHOLD            1
+#define BYPASS_THRESHOLD       1
  #define NR_HASH                        (PAGE_SIZE / sizeof(struct hlist_head))
  #define HASH_MASK              (NR_HASH - 1)
  
@@ -93,6 +94,8 @@
  #define __inline__
  #endif
  
+#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
+
  #if !RAID6_USE_EMPTY_ZERO_PAGE
  /* In .bss so it's zeroed */
  const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
@@ -112,9 +115,7 @@ static void return_io(struct bio *return_bi)
                 return_bi = bi->bi_next;
                 bi->bi_next = NULL;
                 bi->bi_size = 0;
-               bi->bi_end_io(bi,
-                             test_bit(BIO_UPTODATE, &bi->bi_flags)
-                               ? 0 : -EIO);
+               bio_endio(bi, 0);
                 bi = return_bi;
         }
  }
@@ -372,8 +373,6 @@ static unsigned long get_stripe_work(struct stripe_head *sh)
         test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
         test_and_ack_op(STRIPE_OP_POSTXOR, pending);
         test_and_ack_op(STRIPE_OP_CHECK, pending);
-       if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
-               ack++;
  
         sh->ops.count -= ack;
         if (unlikely(sh->ops.count < 0)) {
@@ -391,7 +390,7 @@ raid5_end_read_request(struct bio *bi, int error);
  static void
  raid5_end_write_request(struct bio *bi, int error);
  
-static void ops_run_io(struct stripe_head *sh)
+static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
  {
         raid5_conf_t *conf = sh->raid_conf;
         int i, disks = sh->disks;
@@ -426,11 +425,11 @@ static void ops_run_io(struct stripe_head *sh)
                 rcu_read_unlock();
  
                 if (rdev) {
-                       if (test_bit(STRIPE_SYNCING, &sh->state) ||
-                               test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
-                               test_bit(STRIPE_EXPAND_READY, &sh->state))
+                       if (s->syncing || s->expanding || s->expanded)
                                 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
  
+                       set_bit(STRIPE_IO_STARTED, &sh->state);
+
                         bi->bi_bdev = rdev->bdev;
                         pr_debug("%s: for %llu schedule op %ld on disc %d\n",
                                 __func__, (unsigned long long)sh->sector,
@@ -606,7 +605,11 @@ static void ops_complete_compute5(void *stripe_head_ref)
         set_bit(R5_UPTODATE, &tgt->flags);
         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
         clear_bit(R5_Wantcompute, &tgt->flags);
-       set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+       clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
+       if (sh->check_state == check_state_compute_run)
+               sh->check_state = check_state_compute_result;
+       else
+               set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
         set_bit(STRIPE_HANDLE, &sh->state);
         release_stripe(sh);
  }
@@ -835,16 +838,11 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
  static void ops_complete_check(void *stripe_head_ref)
  {
         struct stripe_head *sh = stripe_head_ref;
-       int pd_idx = sh->pd_idx;
  
         pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
  
-       if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
-               sh->ops.zero_sum_result == 0)
-               set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-
-       set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
+       sh->check_state = check_state_check_result;
         set_bit(STRIPE_HANDLE, &sh->state);
         release_stripe(sh);
  }
@@ -871,17 +869,13 @@ static void ops_run_check(struct stripe_head *sh)
         tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
                 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
  
-       if (tx)
-               set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
-       else
-               clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
-
         atomic_inc(&sh->count);
         tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
                 ops_complete_check, sh);
  }
  
-static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
+static void raid5_run_ops(struct stripe_head *sh, unsigned long pending,
+                         unsigned long ops_request)
  {
         int overlap_clear = 0, i, disks = sh->disks;
         struct dma_async_tx_descriptor *tx = NULL;
@@ -891,7 +885,8 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
                 overlap_clear++;
         }
  
-       if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
+       if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending) ||
+           test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request))
                 tx = ops_run_compute5(sh, pending);
  
         if (test_bit(STRIPE_OP_PREXOR, &pending))
@@ -905,12 +900,9 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
         if (test_bit(STRIPE_OP_POSTXOR, &pending))
                 ops_run_postxor(sh, tx, pending);
  
-       if (test_bit(STRIPE_OP_CHECK, &pending))
+       if (test_bit(STRIPE_OP_CHECK, &ops_request))
                 ops_run_check(sh);
  
-       if (test_bit(STRIPE_OP_IO, &pending))
-               ops_run_io(sh);
-
         if (overlap_clear)
                 for (i = disks; i--; ) {
                         struct r5dev *dev = &sh->dev[i];
@@ -1141,10 +1133,12 @@ static void raid5_end_read_request(struct bio * bi, int error)
                 set_bit(R5_UPTODATE, &sh->dev[i].flags);
                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
                         rdev = conf->disks[i].rdev;
-                       printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
-                              mdname(conf->mddev), STRIPE_SECTORS,
-                              (unsigned long long)(sh->sector + rdev->data_offset),
-                              bdevname(rdev->bdev, b));
+                       printk_rl(KERN_INFO "raid5:%s: read error corrected"
+                                 " (%lu sectors at %llu on %s)\n",
+                                 mdname(conf->mddev), STRIPE_SECTORS,
+                                 (unsigned long long)(sh->sector
+                                                      + rdev->data_offset),
+                                 bdevname(rdev->bdev, b));
                         clear_bit(R5_ReadError, &sh->dev[i].flags);
                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
                 }
@@ -1158,16 +1152,22 @@ static void raid5_end_read_request(struct bio * bi, int error)
                 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
                 atomic_inc(&rdev->read_errors);
                 if (conf->mddev->degraded)
-                       printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
-                              mdname(conf->mddev),
-                              (unsigned long long)(sh->sector + rdev->data_offset),
-                              bdn);
+                       printk_rl(KERN_WARNING
+                                 "raid5:%s: read error not correctable "
+                                 "(sector %llu on %s).\n",
+                                 mdname(conf->mddev),
+                                 (unsigned long long)(sh->sector
+                                                      + rdev->data_offset),
+                                 bdn);
                 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
                         /* Oh, no!!! */
-                       printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
-                              mdname(conf->mddev),
-                              (unsigned long long)(sh->sector + rdev->data_offset),
-                              bdn);
+                       printk_rl(KERN_WARNING
+                                 "raid5:%s: read error NOT corrected!! "
+                                 "(sector %llu on %s).\n",
+                                 mdname(conf->mddev),
+                                 (unsigned long long)(sh->sector
+                                                      + rdev->data_offset),
+                                 bdn);
                 else if (atomic_read(&rdev->read_errors)
                          > conf->max_nr_stripes)
                         printk(KERN_WARNING
@@ -1256,12 +1256,12 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
                         /*
                          * if recovery was running, make sure it aborts.
                          */
-                       set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+                       set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                 }
                 set_bit(Faulty, &rdev->flags);
                 printk (KERN_ALERT
-                       "raid5: Disk failure on %s, disabling device."
-                       " Operation continuing on %d devices\n",
+                       "raid5: Disk failure on %s, disabling device.\n"
+                       "raid5: Operation continuing on %d devices.\n",
                         bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
         }
  }
@@ -1720,6 +1720,9 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
                                 locked++;
                         }
                 }
+               if (locked + 1 == disks)
+                       if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
+                               atomic_inc(&sh->raid_conf->pending_full_writes);
         } else {
                 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
                         test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
@@ -1947,6 +1950,9 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
                                         STRIPE_SECTORS, 0, 0);
         }
  
+       if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+               if (atomic_dec_and_test(&conf->pending_full_writes))
+                       md_wakeup_thread(conf->mddev->thread);
  }
  
  /* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
@@ -1961,8 +1967,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
         /* don't schedule compute operations or reads on the parity block while
          * a check is in flight
          */
-       if ((disk_idx == sh->pd_idx) &&
-            test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
+       if (disk_idx == sh->pd_idx && sh->check_state)
                 return ~0;
  
         /* is the data in this block needed, and can we get it? */
@@ -1983,8 +1988,8 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
                  * 3/ We hold off parity block re-reads until check operations
                  * have quiesced.
                  */
-               if ((s->uptodate == disks - 1) &&
-                   !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
+               if ((s->uptodate == disks - 1) && !sh->check_state &&
+                   (s->failed && disk_idx == s->failed_num)) {
                         set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
                         set_bit(R5_Wantcompute, &dev->flags);
                         sh->ops.target = disk_idx;
@@ -2006,8 +2011,6 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
                          */
                         set_bit(R5_LOCKED, &dev->flags);
                         set_bit(R5_Wantread, &dev->flags);
-                       if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-                               sh->ops.count++;
                         s->locked++;
                         pr_debug("Reading block %d (sync=%d)\n", disk_idx,
                                 s->syncing);
@@ -2022,12 +2025,8 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh,
  {
         int i;
  
-       /* Clear completed compute operations.  Parity recovery
-        * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
-        * later on in this routine
-        */
-       if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
-               !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+       /* Clear completed compute operations */
+       if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete)) {
                 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
                 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
                 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
@@ -2069,7 +2068,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
                         /* we would like to get this block, possibly
                          * by computing it, but we might not be able to
                          */
-                       if (s->uptodate == disks-1) {
+                       if ((s->uptodate == disks - 1) &&
+                           (s->failed && (i == r6s->failed_num[0] ||
+                                          i == r6s->failed_num[1]))) {
                                 pr_debug("Computing stripe %llu block %d\n",
                                        (unsigned long long)sh->sector, i);
                                 compute_block_1(sh, i, 0);
@@ -2149,6 +2150,10 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
                                                         0);
                         }
                 }
+
+       if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+               if (atomic_dec_and_test(&conf->pending_full_writes))
+                       md_wakeup_thread(conf->mddev->thread);
  }
  
  static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
@@ -2195,9 +2200,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
                                                 "%d for r-m-w\n", i);
                                         set_bit(R5_LOCKED, &dev->flags);
                                         set_bit(R5_Wantread, &dev->flags);
-                                       if (!test_and_set_bit(
-                                               STRIPE_OP_IO, &sh->ops.pending))
-                                               sh->ops.count++;
                                         s->locked++;
                                 } else {
                                         set_bit(STRIPE_DELAYED, &sh->state);
@@ -2221,9 +2223,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
                                                 "%d for Reconstruct\n", i);
                                         set_bit(R5_LOCKED, &dev->flags);
                                         set_bit(R5_Wantread, &dev->flags);
-                                       if (!test_and_set_bit(
-                                               STRIPE_OP_IO, &sh->ops.pending))
-                                               sh->ops.count++;
                                         s->locked++;
                                 } else {
                                         set_bit(STRIPE_DELAYED, &sh->state);
@@ -2333,6 +2332,9 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
                                 s->locked++;
                                 set_bit(R5_Wantwrite, &sh->dev[i].flags);
                         }
+               if (s->locked == disks)
+                       if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
+                               atomic_inc(&conf->pending_full_writes);
                 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
                 set_bit(STRIPE_INSYNC, &sh->state);
  
@@ -2348,91 +2350,85 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
  static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
                                 struct stripe_head_state *s, int disks)
  {
-       int canceled_check = 0;
+       struct r5dev *dev = NULL;
  
         set_bit(STRIPE_HANDLE, &sh->state);
  
-       /* complete a check operation */
-       if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
-           clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
-           clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
+       switch (sh->check_state) {
+       case check_state_idle:
+               /* start a new check operation if there are no failures */
                 if (s->failed == 0) {
-                       if (sh->ops.zero_sum_result == 0)
-                               /* parity is correct (on disc,
-                                * not in buffer any more)
-                                */
-                               set_bit(STRIPE_INSYNC, &sh->state);
-                       else {
-                               conf->mddev->resync_mismatches +=
-                                       STRIPE_SECTORS;
-                               if (test_bit(
-                                    MD_RECOVERY_CHECK, &conf->mddev->recovery))
-                                       /* don't try to repair!! */
-                                       set_bit(STRIPE_INSYNC, &sh->state);
-                               else {
-                                       set_bit(STRIPE_OP_COMPUTE_BLK,
-                                               &sh->ops.pending);
-                                       set_bit(STRIPE_OP_MOD_REPAIR_PD,
-                                               &sh->ops.pending);
-                                       set_bit(R5_Wantcompute,
-                                               &sh->dev[sh->pd_idx].flags);
-                                       sh->ops.target = sh->pd_idx;
-                                       sh->ops.count++;
-                                       s->uptodate++;
-                               }
-                       }
-               } else
-                       canceled_check = 1; /* STRIPE_INSYNC is not set */
-       }
-
-       /* check if we can clear a parity disk reconstruct */
-       if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
-               test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
-
-               clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
-               clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
-               clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
-               clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
-       }
-
-       /* start a new check operation if there are no failures, the stripe is
-        * not insync, and a repair is not in flight
-        */
-       if (s->failed == 0 &&
-           !test_bit(STRIPE_INSYNC, &sh->state) &&
-           !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
-               if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
                         BUG_ON(s->uptodate != disks);
+                       sh->check_state = check_state_run;
+                       set_bit(STRIPE_OP_CHECK, &s->ops_request);
                         clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
-                       sh->ops.count++;
                         s->uptodate--;
+                       break;
                 }
-       }
+               dev = &sh->dev[s->failed_num];
+               /* fall through */
+       case check_state_compute_result:
+               sh->check_state = check_state_idle;
+               if (!dev)
+                       dev = &sh->dev[sh->pd_idx];
+
+               /* check that a write has not made the stripe insync */
+               if (test_bit(STRIPE_INSYNC, &sh->state))
+                       break;
  
-       /* Wait for check parity and compute block operations to complete
-        * before write-back.  If a failure occurred while the check operation
-        * was in flight we need to cycle this stripe through handle_stripe
-        * since the parity block may not be uptodate
-        */
-       if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
-           !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
-           !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
-               struct r5dev *dev;
                 /* either failed parity check, or recovery is happening */
-               if (s->failed == 0)
-                       s->failed_num = sh->pd_idx;
-               dev = &sh->dev[s->failed_num];
                 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
                 BUG_ON(s->uptodate != disks);
  
                 set_bit(R5_LOCKED, &dev->flags);
+               s->locked++;
                 set_bit(R5_Wantwrite, &dev->flags);
-               if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-                       sh->ops.count++;
  
                 clear_bit(STRIPE_DEGRADED, &sh->state);
-               s->locked++;
                 set_bit(STRIPE_INSYNC, &sh->state);
+               break;
+       case check_state_run:
+               break; /* we will be called again upon completion */
+       case check_state_check_result:
+               sh->check_state = check_state_idle;
+
+               /* if a failure occurred during the check operation, leave
+                * STRIPE_INSYNC not set and let the stripe be handled again
+                */
+               if (s->failed)
+                       break;
+
+               /* handle a successful check operation, if parity is correct
+                * we are done.  Otherwise update the mismatch count and repair
+                * parity if !MD_RECOVERY_CHECK
+                */
+               if (sh->ops.zero_sum_result == 0)
+                       /* parity is correct (on disc,
+                        * not in buffer any more)
+                        */
+                       set_bit(STRIPE_INSYNC, &sh->state);
+               else {
+                       conf->mddev->resync_mismatches += STRIPE_SECTORS;
+                       if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+                               /* don't try to repair!! */
+                               set_bit(STRIPE_INSYNC, &sh->state);
+                       else {
+                               sh->check_state = check_state_compute_run;
+                               set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+                               set_bit(R5_Wantcompute,
+                                       &sh->dev[sh->pd_idx].flags);
+                               sh->ops.target = sh->pd_idx;
+                               s->uptodate++;
+                       }
+               }
+               break;
+       case check_state_compute_run:
+               break;
+       default:
+               printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+                      __func__, sh->check_state,
+                      (unsigned long long) sh->sector);
+               BUG();
         }
  }
  
@@ -2592,6 +2588,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
         }
  }
  
+
  /*
   * handle_stripe - do things to a stripe.
   *
@@ -2617,6 +2614,8 @@ static void handle_stripe5(struct stripe_head *sh)
         struct stripe_head_state s;
         struct r5dev *dev;
         unsigned long pending = 0;
+       mdk_rdev_t *blocked_rdev = NULL;
+       int prexor;
  
         memset(&s, 0, sizeof(s));
         pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
@@ -2676,6 +2675,11 @@ static void handle_stripe5(struct stripe_head *sh)
                 if (dev->written)
                         s.written++;
                 rdev = rcu_dereference(conf->disks[i].rdev);
+               if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+                       blocked_rdev = rdev;
+                       atomic_inc(&rdev->nr_pending);
+                       break;
+               }
                 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
                         /* The ReadError flag will just be confusing now */
                         clear_bit(R5_ReadError, &dev->flags);
@@ -2690,6 +2694,11 @@ static void handle_stripe5(struct stripe_head *sh)
         }
         rcu_read_unlock();
  
+       if (unlikely(blocked_rdev)) {
+               set_bit(STRIPE_HANDLE, &sh->state);
+               goto unlock;
+       }
+
         if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
                 sh->ops.count++;
  
@@ -2736,9 +2745,11 @@ static void handle_stripe5(struct stripe_head *sh)
         /* leave prexor set until postxor is done, allows us to distinguish
          * a rmw from a rcw during biodrain
          */
+       prexor = 0;
         if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
                 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
  
+               prexor = 1;
                 clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
                 clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
                 clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
@@ -2769,9 +2780,8 @@ static void handle_stripe5(struct stripe_head *sh)
                                 (i == sh->pd_idx || dev->written)) {
                                 pr_debug("Writing block %d\n", i);
                                 set_bit(R5_Wantwrite, &dev->flags);
-                               if (!test_and_set_bit(
-                                   STRIPE_OP_IO, &sh->ops.pending))
-                                       sh->ops.count++;
+                               if (prexor)
+                                       continue;
                                 if (!test_bit(R5_Insync, &dev->flags) ||
                                     (i == sh->pd_idx && s.failed == 0))
                                         set_bit(STRIPE_INSYNC, &sh->state);
@@ -2792,7 +2802,7 @@ static void handle_stripe5(struct stripe_head *sh)
          *    block.
          */
         if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
-                         !test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
+           !sh->check_state)
                 handle_issuing_new_write_requests5(conf, sh, &s, disks);
  
         /* maybe we need to check and possibly fix the parity for this stripe
@@ -2800,11 +2810,10 @@ static void handle_stripe5(struct stripe_head *sh)
          * data is available.  The parity check is held off while parity
          * dependent operations are in flight.
          */
-       if ((s.syncing && s.locked == 0 &&
+       if (sh->check_state ||
+           (s.syncing && s.locked == 0 &&
              !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
-            !test_bit(STRIPE_INSYNC, &sh->state)) ||
-             test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
-             test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
+            !test_bit(STRIPE_INSYNC, &sh->state)))
                 handle_parity_checks5(conf, sh, &s, disks);
  
         if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -2823,16 +2832,12 @@ static void handle_stripe5(struct stripe_head *sh)
                 dev = &sh->dev[s.failed_num];
                 if (!test_bit(R5_ReWrite, &dev->flags)) {
                         set_bit(R5_Wantwrite, &dev->flags);
-                       if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-                               sh->ops.count++;
                         set_bit(R5_ReWrite, &dev->flags);
                         set_bit(R5_LOCKED, &dev->flags);
                         s.locked++;
                 } else {
                         /* let's read it back */
                         set_bit(R5_Wantread, &dev->flags);
-                       if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-                               sh->ops.count++;
                         set_bit(R5_LOCKED, &dev->flags);
                         s.locked++;
                 }
@@ -2850,11 +2855,12 @@ static void handle_stripe5(struct stripe_head *sh)
                 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
                 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
  
                 for (i = conf->raid_disks; i--; ) {
                         set_bit(R5_Wantwrite, &sh->dev[i].flags);
-                       if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-                               sh->ops.count++;
+                       /* lock and count every block queued for write */
+                       set_bit(R5_LOCKED, &sh->dev[i].flags);
+                       s.locked++;
                 }
         }
  
         if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
@@ -2865,6 +2869,7 @@ static void handle_stripe5(struct stripe_head *sh)
                         conf->raid_disks);
                 s.locked += handle_write_operations5(sh, 1, 1);
         } else if (s.expanded &&
+                  s.locked == 0 &&
                 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
                 clear_bit(STRIPE_EXPAND_READY, &sh->state);
                 atomic_dec(&conf->reshape_stripes);
@@ -2879,13 +2884,19 @@ static void handle_stripe5(struct stripe_head *sh)
         if (sh->ops.count)
                 pending = get_stripe_work(sh);
  
+ unlock:
         spin_unlock(&sh->lock);
  
-       if (pending)
-               raid5_run_ops(sh, pending);
+       /* wait for this device to become unblocked */
+       if (unlikely(blocked_rdev))
+               md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
  
-       return_io(return_bi);
+       if (pending || s.ops_request)
+               raid5_run_ops(sh, pending, s.ops_request);
+
+       ops_run_io(sh, &s);
  
+       return_io(return_bi);
  }
  
  static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
@@ -2897,6 +2908,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
         struct stripe_head_state s;
         struct r6_state r6s;
         struct r5dev *dev, *pdev, *qdev;
+       mdk_rdev_t *blocked_rdev = NULL;
  
         r6s.qd_idx = raid6_next_disk(pd_idx, disks);
         pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
@@ -2960,6 +2972,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                 if (dev->written)
                         s.written++;
                 rdev = rcu_dereference(conf->disks[i].rdev);
+               if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+                       blocked_rdev = rdev;
+                       atomic_inc(&rdev->nr_pending);
+                       break;
+               }
                 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
                         /* The ReadError flag will just be confusing now */
                         clear_bit(R5_ReadError, &dev->flags);
@@ -2974,6 +2991,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                         set_bit(R5_Insync, &dev->flags);
         }
         rcu_read_unlock();
+
+       if (unlikely(blocked_rdev)) {
+               set_bit(STRIPE_HANDLE, &sh->state);
+               goto unlock;
+       }
         pr_debug("locked=%d uptodate=%d to_read=%d"
                " to_write=%d failed=%d failed_num=%d,%d\n",
                s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -3079,68 +3101,16 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
             !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
                 handle_stripe_expansion(conf, sh, &r6s);
  
+ unlock:
         spin_unlock(&sh->lock);
  
-       return_io(return_bi);
+       /* wait for this device to become unblocked */
+       if (unlikely(blocked_rdev))
+               md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
  
-       for (i=disks; i-- ;) {
-               int rw;
-               struct bio *bi;
-               mdk_rdev_t *rdev;
-               if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-                       rw = WRITE;
-               else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-                       rw = READ;
-               else
-                       continue;
-
-               bi = &sh->dev[i].req;
+       ops_run_io(sh, &s);
  
-               bi->bi_rw = rw;
-               if (rw == WRITE)
-                       bi->bi_end_io = raid5_end_write_request;
-               else
-                       bi->bi_end_io = raid5_end_read_request;
-
-               rcu_read_lock();
-               rdev = rcu_dereference(conf->disks[i].rdev);
-               if (rdev && test_bit(Faulty, &rdev->flags))
-                       rdev = NULL;
-               if (rdev)
-                       atomic_inc(&rdev->nr_pending);
-               rcu_read_unlock();
-
-               if (rdev) {
-                       if (s.syncing || s.expanding || s.expanded)
-                               md_sync_acct(rdev->bdev, STRIPE_SECTORS);
-
-                       bi->bi_bdev = rdev->bdev;
-                       pr_debug("for %llu schedule op %ld on disc %d\n",
-                               (unsigned long long)sh->sector, bi->bi_rw, i);
-                       atomic_inc(&sh->count);
-                       bi->bi_sector = sh->sector + rdev->data_offset;
-                       bi->bi_flags = 1 << BIO_UPTODATE;
-                       bi->bi_vcnt = 1;
-                       bi->bi_max_vecs = 1;
-                       bi->bi_idx = 0;
-                       bi->bi_io_vec = &sh->dev[i].vec;
-                       bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
-                       bi->bi_io_vec[0].bv_offset = 0;
-                       bi->bi_size = STRIPE_SIZE;
-                       bi->bi_next = NULL;
-                       if (rw == WRITE &&
-                           test_bit(R5_ReWrite, &sh->dev[i].flags))
-                               atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
-                       generic_make_request(bi);
-               } else {
-                       if (rw == WRITE)
-                               set_bit(STRIPE_DEGRADED, &sh->state);
-                       pr_debug("skip op %ld on disc %d for sector %llu\n",
-                               bi->bi_rw, i, (unsigned long long)sh->sector);
-                       clear_bit(R5_LOCKED, &sh->dev[i].flags);
-                       set_bit(STRIPE_HANDLE, &sh->state);
-               }
-       }
+       return_io(return_bi);
  }
  
  static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
@@ -3164,7 +3134,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
                         clear_bit(STRIPE_DELAYED, &sh->state);
                         if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                 atomic_inc(&conf->preread_active_stripes);
-                       list_add_tail(&sh->lru, &conf->handle_list);
+                       list_add_tail(&sh->lru, &conf->hold_list);
                 }
         } else
                 blk_plug_device(conf->mddev->queue);
@@ -3442,6 +3412,58 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
         }
  }
  
+/* __get_priority_stripe - get the next stripe to process
+ *
+ * Full stripe writes are allowed to pass preread active stripes up until
+ * the bypass_threshold is exceeded.  In general the bypass_count
+ * increments when the handle_list is handled before the hold_list; however, it
+ * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
+ * stripe with in-flight i/o.  The bypass_count is zeroed when the hold_list
+ * is empty, and is reduced by bypass_threshold (never below zero) when the
+ * head of the hold_list has changed, i.e. the head was promoted to the
+ * handle_list, or when a stripe is taken from the hold_list.
+ *
+ * Selection order: the head of the handle_list is always preferred.  The
+ * head of the hold_list is taken only when the handle_list is empty and
+ * either bypass_count exceeds a non-zero bypass_threshold or
+ * pending_full_writes reads zero.
+ *
+ * Returns the chosen stripe, unlinked from its list and with sh->count
+ * incremented (it must then equal 1), or NULL when no stripe qualifies.
+ *
+ * No lock is taken here; the raid5d call site invokes this with
+ * conf->device_lock held.
+ */
+static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
+{
+       struct stripe_head *sh;
+
+       pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
+                 __func__,
+                 list_empty(&conf->handle_list) ? "empty" : "busy",
+                 list_empty(&conf->hold_list) ? "empty" : "busy",
+                 atomic_read(&conf->pending_full_writes), conf->bypass_count);
+
+       if (!list_empty(&conf->handle_list)) {
+               sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+
+               if (list_empty(&conf->hold_list))
+                       conf->bypass_count = 0; /* nothing on hold is being bypassed */
+               else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
+                       if (conf->hold_list.next == conf->last_hold) /* same hold head as last seen */
+                               conf->bypass_count++;
+                       else {
+                               conf->last_hold = conf->hold_list.next; /* remember the new head */
+                               conf->bypass_count -= conf->bypass_threshold;
+                               if (conf->bypass_count < 0) /* clamp at zero */
+                                       conf->bypass_count = 0;
+                       }
+               }
+       } else if (!list_empty(&conf->hold_list) &&
+                  ((conf->bypass_threshold &&
+                    conf->bypass_count > conf->bypass_threshold) ||
+                   atomic_read(&conf->pending_full_writes) == 0)) {
+               sh = list_entry(conf->hold_list.next,
+                               typeof(*sh), lru);
+               conf->bypass_count -= conf->bypass_threshold;
+               if (conf->bypass_count < 0) /* clamp at zero */
+                       conf->bypass_count = 0;
+       } else
+               return NULL; /* neither list can supply a stripe right now */
+
+       list_del_init(&sh->lru); /* unlink from whichever list it was on */
+       atomic_inc(&sh->count);
+       BUG_ON(atomic_read(&sh->count) != 1); /* a queued stripe must have had no references */
+       return sh;
+}
  
  static int make_request(struct request_queue *q, struct bio * bi)
  {
@@ -3576,9 +3598,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
                 if ( rw == WRITE )
                         md_write_end(mddev);
  
-               bi->bi_end_io(bi,
-                             test_bit(BIO_UPTODATE, &bi->bi_flags)
-                               ? 0 : -EIO);
+               bio_endio(bi, 0);
         }
         return 0;
  }
@@ -3881,12 +3901,8 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
         spin_lock_irq(&conf->device_lock);
         remaining = --raid_bio->bi_phys_segments;
         spin_unlock_irq(&conf->device_lock);
-       if (remaining == 0) {
-
-               raid_bio->bi_end_io(raid_bio,
-                             test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
-                               ? 0 : -EIO);
-       }
+       if (remaining == 0)
+               bio_endio(raid_bio, 0);
         if (atomic_dec_and_test(&conf->active_aligned_reads))
                 wake_up(&conf->wait_for_stripe);
         return handled;
@@ -3914,7 +3930,6 @@ static void raid5d(mddev_t *mddev)
         handled = 0;
         spin_lock_irq(&conf->device_lock);
         while (1) {
-               struct list_head *first;
                 struct bio *bio;
  
                 if (conf->seq_flush != conf->seq_write) {
@@ -3936,17 +3951,12 @@ static void raid5d(mddev_t *mddev)
                         handled++;
                 }
  
-               if (list_empty(&conf->handle_list)) {
+               sh = __get_priority_stripe(conf);
+
+               if (!sh) {
                         async_tx_issue_pending_all();
                         break;
                 }
-
-               first = conf->handle_list.next;
-               sh = list_entry(first, struct stripe_head, lru);
-
-               list_del_init(first);
-               atomic_inc(&sh->count);
-               BUG_ON(atomic_read(&sh->count)!= 1);
                 spin_unlock_irq(&conf->device_lock);
                 
                 handled++;
@@ -3978,15 +3988,13 @@ static ssize_t
  raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
  {
         raid5_conf_t *conf = mddev_to_conf(mddev);
-       char *end;
-       int new;
+       unsigned long new;
         if (len >= PAGE_SIZE)
                 return -EINVAL;
         if (!conf)
                 return -ENODEV;
  
-       new = simple_strtoul(page, &end, 10);
-       if (!*page || (*end && *end != '\n') )
+       if (strict_strtoul(page, 10, &new))
                 return -EINVAL;
         if (new <= 16 || new > 32768)
                 return -EINVAL;
@@ -4010,6 +4018,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
                                 raid5_show_stripe_cache_size,
                                 raid5_store_stripe_cache_size);
  
+static ssize_t
+raid5_show_preread_threshold(mddev_t *mddev, char *page)
+{      /* sysfs show handler: writes conf->bypass_threshold into page as a
+        * decimal number followed by a newline and returns the sprintf()
+        * byte count; returns 0 (empty output) when mddev has no conf.
+        */
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+       if (conf)
+               return sprintf(page, "%d\n", conf->bypass_threshold);
+       else
+               return 0;
+}
+
+static ssize_t
+raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
+{      /* sysfs store handler: parses page as a base-10 unsigned value and
+        * stores it in conf->bypass_threshold.  Returns len on success,
+        * -EINVAL for an over-long write, an unparsable value or a value
+        * above conf->max_nr_stripes, and -ENODEV when mddev has no conf.
+        */
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+       unsigned long new;
+       if (len >= PAGE_SIZE)
+               return -EINVAL;
+       if (!conf)
+               return -ENODEV;
+
+       if (strict_strtoul(page, 10, &new)) /* non-zero means the parse failed */
+               return -EINVAL;
+       if (new > conf->max_nr_stripes) /* upper bound only; zero is accepted */
+               return -EINVAL;
+       conf->bypass_threshold = new;
+       return len; /* report the whole write as consumed */
+}
+
+static struct md_sysfs_entry /* sysfs attribute exposing conf->bypass_threshold */
+raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
+                                       S_IRUGO | S_IWUSR, /* readable by all, writable by owner */
+                                       raid5_show_preread_threshold,
+                                       raid5_store_preread_threshold);
+
  static ssize_t
  stripe_cache_active_show(mddev_t *mddev, char *page)
  {
@@ -4026,6 +4068,7 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
  static struct attribute *raid5_attrs[] =  {
         &raid5_stripecache_size.attr,
         &raid5_stripecache_active.attr,
+       &raid5_preread_bypass_threshold.attr,
         NULL,
  };
  static struct attribute_group raid5_attrs_group = {
@@ -4127,15 +4170,18 @@ static int run(mddev_t *mddev)
                         goto abort;
         }
         spin_lock_init(&conf->device_lock);
+       mddev->queue->queue_lock = &conf->device_lock;
         init_waitqueue_head(&conf->wait_for_stripe);
         init_waitqueue_head(&conf->wait_for_overlap);
         INIT_LIST_HEAD(&conf->handle_list);
+       INIT_LIST_HEAD(&conf->hold_list);
         INIT_LIST_HEAD(&conf->delayed_list);
         INIT_LIST_HEAD(&conf->bitmap_list);
         INIT_LIST_HEAD(&conf->inactive_list);
         atomic_set(&conf->active_stripes, 0);
         atomic_set(&conf->preread_active_stripes, 0);
         atomic_set(&conf->active_aligned_reads, 0);
+       conf->bypass_threshold = BYPASS_THRESHOLD;
  
         pr_debug("raid5: run(%s) called.\n", mdname(mddev));
  
@@ -4154,7 +4200,9 @@ static int run(mddev_t *mddev)
                                 " disk %d\n", bdevname(rdev->bdev,b),
                                 raid_disk);
                         working_disks++;
-               }
+               } else
+                       /* Cannot rely on bitmap to complete recovery */
+                       conf->fullsync = 1;
         }
  
         /*
@@ -4431,6 +4479,14 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
                         err = -EBUSY;
                         goto abort;
                 }
+               /* Only remove non-faulty devices if recovery
+                * isn't possible.
+                */
+               if (!test_bit(Faulty, &rdev->flags) &&
+                   mddev->degraded <= conf->max_degraded) {
+                       err = -EBUSY;
+                       goto abort;
+               }
                 p->rdev = NULL;
                 synchronize_rcu();
                 if (atomic_read(&rdev->nr_pending)) {
@@ -4448,35 +4504,41 @@ abort:
  static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
  {
         raid5_conf_t *conf = mddev->private;
-       int found = 0;
+       int err = -EEXIST;
         int disk;
         struct disk_info *p;
+       int first = 0;
+       int last = conf->raid_disks - 1;
  
         if (mddev->degraded > conf->max_degraded)
                 /* no point adding a device */
-               return 0;
+               return -EINVAL;
+
+       if (rdev->raid_disk >= 0)
+               first = last = rdev->raid_disk;
  
         /*
          * find the disk ... but prefer rdev->saved_raid_disk
          * if possible.
          */
         if (rdev->saved_raid_disk >= 0 &&
+           rdev->saved_raid_disk >= first &&
             conf->disks[rdev->saved_raid_disk].rdev == NULL)
                 disk = rdev->saved_raid_disk;
         else
-               disk = 0;
-       for ( ; disk < conf->raid_disks; disk++)
+               disk = first;
+       for ( ; disk <= last ; disk++)
                 if ((p=conf->disks + disk)->rdev == NULL) {
                         clear_bit(In_sync, &rdev->flags);
                         rdev->raid_disk = disk;
-                       found = 1;
+                       err = 0;
                         if (rdev->saved_raid_disk != disk)
                                 conf->fullsync = 1;
                         rcu_assign_pointer(p->rdev, rdev);
                         break;
                 }
         print_raid5_conf(conf);
-       return found;
+       return err;
  }
  
  static int raid5_resize(mddev_t *mddev, sector_t sectors)
@@ -4577,7 +4639,7 @@ static int raid5_start_reshape(mddev_t *mddev)
         rdev_for_each(rdev, rtmp, mddev)
                 if (rdev->raid_disk < 0 &&
                     !test_bit(Faulty, &rdev->flags)) {
-                       if (raid5_add_disk(mddev, rdev)) {
+                       if (raid5_add_disk(mddev, rdev) == 0) {
                                 char nm[20];
                                 set_bit(In_sync, &rdev->flags);
                                 added_devices++;