md: kill STRIPE_OP_IO flag

[safe/jmp/linux-2.6] / drivers / md / raid5.c
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index f96dea9..cac9708 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -63,6 +63,7 @@
  #define STRIPE_SHIFT           (PAGE_SHIFT - 9)
  #define STRIPE_SECTORS         (STRIPE_SIZE>>9)
  #define        IO_THRESHOLD            1
+#define BYPASS_THRESHOLD       1
  #define NR_HASH                        (PAGE_SIZE / sizeof(struct hlist_head))
  #define HASH_MASK              (NR_HASH - 1)
  
@@ -93,6 +94,8 @@
  #define __inline__
  #endif
  
+#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
+
  #if !RAID6_USE_EMPTY_ZERO_PAGE
  /* In .bss so it's zeroed */
  const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
@@ -108,14 +111,11 @@ static void return_io(struct bio *return_bi)
  {
         struct bio *bi = return_bi;
         while (bi) {
-               int bytes = bi->bi_size;
  
                 return_bi = bi->bi_next;
                 bi->bi_next = NULL;
                 bi->bi_size = 0;
-               bi->bi_end_io(bi, bytes,
-                             test_bit(BIO_UPTODATE, &bi->bi_flags)
-                               ? 0 : -EIO);
+               bio_endio(bi, 0);
                 bi = return_bi;
         }
  }
@@ -373,19 +373,22 @@ static unsigned long get_stripe_work(struct stripe_head *sh)
         test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
         test_and_ack_op(STRIPE_OP_POSTXOR, pending);
         test_and_ack_op(STRIPE_OP_CHECK, pending);
-       if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
-               ack++;
  
         sh->ops.count -= ack;
-       BUG_ON(sh->ops.count < 0);
+       if (unlikely(sh->ops.count < 0)) {
+               printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
+                       "ops.complete: %#lx\n", pending, sh->ops.pending,
+                       sh->ops.ack, sh->ops.complete);
+               BUG();
+       }
  
         return pending;
  }
  
-static int
-raid5_end_read_request(struct bio *bi, unsigned int bytes_done, int error);
-static int
-raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error);
+static void
+raid5_end_read_request(struct bio *bi, int error);
+static void
+raid5_end_write_request(struct bio *bi, int error);
  
  static void ops_run_io(struct stripe_head *sh)
  {
@@ -427,9 +430,11 @@ static void ops_run_io(struct stripe_head *sh)
                                 test_bit(STRIPE_EXPAND_READY, &sh->state))
                                 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
  
+                       set_bit(STRIPE_IO_STARTED, &sh->state);
+
                         bi->bi_bdev = rdev->bdev;
                         pr_debug("%s: for %llu schedule op %ld on disc %d\n",
-                               __FUNCTION__, (unsigned long long)sh->sector,
+                               __func__, (unsigned long long)sh->sector,
                                 bi->bi_rw, i);
                         atomic_inc(&sh->count);
                         bi->bi_sector = sh->sector + rdev->data_offset;
@@ -516,7 +521,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
         raid5_conf_t *conf = sh->raid_conf;
         int i;
  
-       pr_debug("%s: stripe %llu\n", __FUNCTION__,
+       pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
  
         /* clear completed biofills */
@@ -551,8 +556,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
                         }
                 }
         }
-       clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
-       clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
+       set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
  
         return_io(return_bi);
  
@@ -566,7 +570,7 @@ static void ops_run_biofill(struct stripe_head *sh)
         raid5_conf_t *conf = sh->raid_conf;
         int i;
  
-       pr_debug("%s: stripe %llu\n", __FUNCTION__,
+       pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
  
         for (i = sh->disks; i--; ) {
@@ -597,7 +601,7 @@ static void ops_complete_compute5(void *stripe_head_ref)
         int target = sh->ops.target;
         struct r5dev *tgt = &sh->dev[target];
  
-       pr_debug("%s: stripe %llu\n", __FUNCTION__,
+       pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
  
         set_bit(R5_UPTODATE, &tgt->flags);
@@ -622,7 +626,7 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending)
         int i;
  
         pr_debug("%s: stripe %llu block: %d\n",
-               __FUNCTION__, (unsigned long long)sh->sector, target);
+               __func__, (unsigned long long)sh->sector, target);
         BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
  
         for (i = disks; i--; )
@@ -650,7 +654,7 @@ static void ops_complete_prexor(void *stripe_head_ref)
  {
         struct stripe_head *sh = stripe_head_ref;
  
-       pr_debug("%s: stripe %llu\n", __FUNCTION__,
+       pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
  
         set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
@@ -667,7 +671,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
         /* existing parity data subtracted */
         struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
  
-       pr_debug("%s: stripe %llu\n", __FUNCTION__,
+       pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
  
         for (i = disks; i--; ) {
@@ -685,7 +689,8 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
  }
  
  static struct dma_async_tx_descriptor *
-ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
+                unsigned long pending)
  {
         int disks = sh->disks;
         int pd_idx = sh->pd_idx, i;
@@ -693,9 +698,9 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
         /* check if prexor is active which means only process blocks
          * that are part of a read-modify-write (Wantprexor)
          */
-       int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
+       int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
  
-       pr_debug("%s: stripe %llu\n", __FUNCTION__,
+       pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
  
         for (i = disks; i--; ) {
@@ -740,7 +745,7 @@ static void ops_complete_postxor(void *stripe_head_ref)
  {
         struct stripe_head *sh = stripe_head_ref;
  
-       pr_debug("%s: stripe %llu\n", __FUNCTION__,
+       pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
  
         set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
@@ -753,7 +758,7 @@ static void ops_complete_write(void *stripe_head_ref)
         struct stripe_head *sh = stripe_head_ref;
         int disks = sh->disks, i, pd_idx = sh->pd_idx;
  
-       pr_debug("%s: stripe %llu\n", __FUNCTION__,
+       pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
  
         for (i = disks; i--; ) {
@@ -770,7 +775,8 @@ static void ops_complete_write(void *stripe_head_ref)
  }
  
  static void
-ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
+               unsigned long pending)
  {
         /* kernel stack size limits the total number of disks */
         int disks = sh->disks;
@@ -778,11 +784,11 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
  
         int count = 0, pd_idx = sh->pd_idx, i;
         struct page *xor_dest;
-       int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
+       int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
         unsigned long flags;
         dma_async_tx_callback callback;
  
-       pr_debug("%s: stripe %llu\n", __FUNCTION__,
+       pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
  
         /* check if prexor is active which means only process blocks
@@ -805,7 +811,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
         }
  
         /* check whether this postxor is part of a write */
-       callback = test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) ?
+       callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ?
                 ops_complete_write : ops_complete_postxor;
  
         /* 1/ if we prexor'd then the dest is reused as a source
@@ -830,15 +836,10 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
  static void ops_complete_check(void *stripe_head_ref)
  {
         struct stripe_head *sh = stripe_head_ref;
-       int pd_idx = sh->pd_idx;
  
-       pr_debug("%s: stripe %llu\n", __FUNCTION__,
+       pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
  
-       if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
-               sh->ops.zero_sum_result == 0)
-               set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-
         set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
         set_bit(STRIPE_HANDLE, &sh->state);
         release_stripe(sh);
@@ -854,7 +855,7 @@ static void ops_run_check(struct stripe_head *sh)
         int count = 0, pd_idx = sh->pd_idx, i;
         struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
  
-       pr_debug("%s: stripe %llu\n", __FUNCTION__,
+       pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
  
         for (i = disks; i--; ) {
@@ -866,11 +867,6 @@ static void ops_run_check(struct stripe_head *sh)
         tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
                 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
  
-       if (tx)
-               set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
-       else
-               clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
-
         atomic_inc(&sh->count);
         tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
                 ops_complete_check, sh);
@@ -893,19 +889,16 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
                 tx = ops_run_prexor(sh, tx);
  
         if (test_bit(STRIPE_OP_BIODRAIN, &pending)) {
-               tx = ops_run_biodrain(sh, tx);
+               tx = ops_run_biodrain(sh, tx, pending);
                 overlap_clear++;
         }
  
         if (test_bit(STRIPE_OP_POSTXOR, &pending))
-               ops_run_postxor(sh, tx);
+               ops_run_postxor(sh, tx, pending);
  
         if (test_bit(STRIPE_OP_CHECK, &pending))
                 ops_run_check(sh);
  
-       if (test_bit(STRIPE_OP_IO, &pending))
-               ops_run_io(sh);
-
         if (overlap_clear)
                 for (i = disks; i--; ) {
                         struct r5dev *dev = &sh->dev[i];
@@ -1110,8 +1103,7 @@ static void shrink_stripes(raid5_conf_t *conf)
         conf->slab_cache = NULL;
  }
  
-static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
-                                  int error)
+static void raid5_end_read_request(struct bio * bi, int error)
  {
         struct stripe_head *sh = bi->bi_private;
         raid5_conf_t *conf = sh->raid_conf;
@@ -1120,8 +1112,6 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
         char b[BDEVNAME_SIZE];
         mdk_rdev_t *rdev;
  
-       if (bi->bi_size)
-               return 1;
  
         for (i=0 ; i<disks; i++)
                 if (bi == &sh->dev[i].req)
@@ -1132,17 +1122,19 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
                 uptodate);
         if (i == disks) {
                 BUG();
-               return 0;
+               return;
         }
  
         if (uptodate) {
                 set_bit(R5_UPTODATE, &sh->dev[i].flags);
                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
                         rdev = conf->disks[i].rdev;
-                       printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
-                              mdname(conf->mddev), STRIPE_SECTORS,
-                              (unsigned long long)sh->sector + rdev->data_offset,
-                              bdevname(rdev->bdev, b));
+                       printk_rl(KERN_INFO "raid5:%s: read error corrected"
+                                 " (%lu sectors at %llu on %s)\n",
+                                 mdname(conf->mddev), STRIPE_SECTORS,
+                                 (unsigned long long)(sh->sector
+                                                      + rdev->data_offset),
+                                 bdevname(rdev->bdev, b));
                         clear_bit(R5_ReadError, &sh->dev[i].flags);
                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
                 }
@@ -1156,16 +1148,22 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
                 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
                 atomic_inc(&rdev->read_errors);
                 if (conf->mddev->degraded)
-                       printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
-                              mdname(conf->mddev),
-                              (unsigned long long)sh->sector + rdev->data_offset,
-                              bdn);
+                       printk_rl(KERN_WARNING
+                                 "raid5:%s: read error not correctable "
+                                 "(sector %llu on %s).\n",
+                                 mdname(conf->mddev),
+                                 (unsigned long long)(sh->sector
+                                                      + rdev->data_offset),
+                                 bdn);
                 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
                         /* Oh, no!!! */
-                       printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
-                              mdname(conf->mddev),
-                              (unsigned long long)sh->sector + rdev->data_offset,
-                              bdn);
+                       printk_rl(KERN_WARNING
+                                 "raid5:%s: read error NOT corrected!! "
+                                 "(sector %llu on %s).\n",
+                                 mdname(conf->mddev),
+                                 (unsigned long long)(sh->sector
+                                                      + rdev->data_offset),
+                                 bdn);
                 else if (atomic_read(&rdev->read_errors)
                          > conf->max_nr_stripes)
                         printk(KERN_WARNING
@@ -1185,20 +1183,15 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
         clear_bit(R5_LOCKED, &sh->dev[i].flags);
         set_bit(STRIPE_HANDLE, &sh->state);
         release_stripe(sh);
-       return 0;
  }
  
-static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
-                                   int error)
+static void raid5_end_write_request (struct bio *bi, int error)
  {
         struct stripe_head *sh = bi->bi_private;
         raid5_conf_t *conf = sh->raid_conf;
         int disks = sh->disks, i;
         int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
  
-       if (bi->bi_size)
-               return 1;
-
         for (i=0 ; i<disks; i++)
                 if (bi == &sh->dev[i].req)
                         break;
@@ -1208,7 +1201,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
                 uptodate);
         if (i == disks) {
                 BUG();
-               return 0;
+               return;
         }
  
         if (!uptodate)
@@ -1219,7 +1212,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
         clear_bit(R5_LOCKED, &sh->dev[i].flags);
         set_bit(STRIPE_HANDLE, &sh->state);
         release_stripe(sh);
-       return 0;
  }
  
  
@@ -1260,12 +1252,12 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
                         /*
                          * if recovery was running, make sure it aborts.
                          */
-                       set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+                       set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                 }
                 set_bit(Faulty, &rdev->flags);
                 printk (KERN_ALERT
-                       "raid5: Disk failure on %s, disabling device."
-                       " Operation continuing on %d devices\n",
+                       "raid5: Disk failure on %s, disabling device.\n"
+                       "raid5: Operation continuing on %d devices.\n",
                         bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
         }
  }
@@ -1724,6 +1716,9 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
                                 locked++;
                         }
                 }
+               if (locked + 1 == disks)
+                       if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
+                               atomic_inc(&sh->raid_conf->pending_full_writes);
         } else {
                 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
                         test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
@@ -1763,7 +1758,7 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
         locked++;
  
         pr_debug("%s: stripe %llu locked: %d pending: %lx\n",
-               __FUNCTION__, (unsigned long long)sh->sector,
+               __func__, (unsigned long long)sh->sector,
                 locked, sh->ops.pending);
  
         return locked;
@@ -1951,6 +1946,9 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
                                         STRIPE_SECTORS, 0, 0);
         }
  
+       if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+               if (atomic_dec_and_test(&conf->pending_full_writes))
+                       md_wakeup_thread(conf->mddev->thread);
  }
  
  /* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
@@ -1988,6 +1986,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
                  * have quiesced.
                  */
                 if ((s->uptodate == disks - 1) &&
+                   (s->failed && disk_idx == s->failed_num) &&
                     !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
                         set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
                         set_bit(R5_Wantcompute, &dev->flags);
@@ -2010,8 +2009,6 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
                          */
                         set_bit(R5_LOCKED, &dev->flags);
                         set_bit(R5_Wantread, &dev->flags);
-                       if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-                               sh->ops.count++;
                         s->locked++;
                         pr_debug("Reading block %d (sync=%d)\n", disk_idx,
                                 s->syncing);
@@ -2073,7 +2070,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
                         /* we would like to get this block, possibly
                          * by computing it, but we might not be able to
                          */
-                       if (s->uptodate == disks-1) {
+                       if ((s->uptodate == disks - 1) &&
+                           (s->failed && (i == r6s->failed_num[0] ||
+                                          i == r6s->failed_num[1]))) {
                                 pr_debug("Computing stripe %llu block %d\n",
                                        (unsigned long long)sh->sector, i);
                                 compute_block_1(sh, i, 0);
@@ -2153,6 +2152,10 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
                                                         0);
                         }
                 }
+
+       if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+               if (atomic_dec_and_test(&conf->pending_full_writes))
+                       md_wakeup_thread(conf->mddev->thread);
  }
  
  static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
@@ -2199,9 +2202,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
                                                 "%d for r-m-w\n", i);
                                         set_bit(R5_LOCKED, &dev->flags);
                                         set_bit(R5_Wantread, &dev->flags);
-                                       if (!test_and_set_bit(
-                                               STRIPE_OP_IO, &sh->ops.pending))
-                                               sh->ops.count++;
                                         s->locked++;
                                 } else {
                                         set_bit(STRIPE_DELAYED, &sh->state);
@@ -2225,9 +2225,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
                                                 "%d for Reconstruct\n", i);
                                         set_bit(R5_LOCKED, &dev->flags);
                                         set_bit(R5_Wantread, &dev->flags);
-                                       if (!test_and_set_bit(
-                                               STRIPE_OP_IO, &sh->ops.pending))
-                                               sh->ops.count++;
                                         s->locked++;
                                 } else {
                                         set_bit(STRIPE_DELAYED, &sh->state);
@@ -2337,6 +2334,9 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
                                 s->locked++;
                                 set_bit(R5_Wantwrite, &sh->dev[i].flags);
                         }
+               if (s->locked == disks)
+                       if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
+                               atomic_inc(&conf->pending_full_writes);
                 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
                 set_bit(STRIPE_INSYNC, &sh->state);
  
@@ -2352,25 +2352,15 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
  static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
                                 struct stripe_head_state *s, int disks)
  {
+       int canceled_check = 0;
+
         set_bit(STRIPE_HANDLE, &sh->state);
-       /* Take one of the following actions:
-        * 1/ start a check parity operation if (uptodate == disks)
-        * 2/ finish a check parity operation and act on the result
-        * 3/ skip to the writeback section if we previously
-        *    initiated a recovery operation
-        */
-       if (s->failed == 0 &&
-           !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
-               if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
-                       BUG_ON(s->uptodate != disks);
-                       clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
-                       sh->ops.count++;
-                       s->uptodate--;
-               } else if (
-                      test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
-                       clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
-                       clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
  
+       /* complete a check operation */
+       if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
+               clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
+               clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
+               if (s->failed == 0) {
                         if (sh->ops.zero_sum_result == 0)
                                 /* parity is correct (on disc,
                                  * not in buffer any more)
@@ -2395,12 +2385,27 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
                                         s->uptodate++;
                                 }
                         }
+               } else
+                       canceled_check = 1; /* STRIPE_INSYNC is not set */
+       }
+
+       /* start a new check operation if there are no failures, the stripe is
+        * not insync, and a repair is not in flight
+        */
+       if (s->failed == 0 &&
+           !test_bit(STRIPE_INSYNC, &sh->state) &&
+           !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+               if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
+                       BUG_ON(s->uptodate != disks);
+                       clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+                       sh->ops.count++;
+                       s->uptodate--;
                 }
         }
  
         /* check if we can clear a parity disk reconstruct */
         if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
-               test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+           test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
  
                 clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
                 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
@@ -2408,12 +2413,15 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
                 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
         }
  
+
         /* Wait for check parity and compute block operations to complete
-        * before write-back
+        * before write-back.  If a failure occurred while the check operation
+        * was in flight we need to cycle this stripe through handle_stripe
+        * since the parity block may not be uptodate
          */
-       if (!test_bit(STRIPE_INSYNC, &sh->state) &&
-               !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
-               !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
+       if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
+           !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
+           !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
                 struct r5dev *dev;
                 /* either failed parity check, or recovery is happening */
                 if (s->failed == 0)
@@ -2424,8 +2432,6 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
  
                 set_bit(R5_LOCKED, &dev->flags);
                 set_bit(R5_Wantwrite, &dev->flags);
-               if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-                       sh->ops.count++;
  
                 clear_bit(STRIPE_DEGRADED, &sh->state);
                 s->locked++;
@@ -2589,6 +2595,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
         }
  }
  
+
  /*
   * handle_stripe - do things to a stripe.
   *
@@ -2614,6 +2621,8 @@ static void handle_stripe5(struct stripe_head *sh)
         struct stripe_head_state s;
         struct r5dev *dev;
         unsigned long pending = 0;
+       mdk_rdev_t *blocked_rdev = NULL;
+       int prexor;
  
         memset(&s, 0, sizeof(s));
         pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
@@ -2630,6 +2639,13 @@ static void handle_stripe5(struct stripe_head *sh)
         s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
         /* Now to look around and see what can be done */
  
+       /* clean-up completed biofill operations */
+       if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
+               clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
+               clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
+               clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
+       }
+
         rcu_read_lock();
         for (i=disks; i--; ) {
                 mdk_rdev_t *rdev;
@@ -2666,6 +2682,11 @@ static void handle_stripe5(struct stripe_head *sh)
                 if (dev->written)
                         s.written++;
                 rdev = rcu_dereference(conf->disks[i].rdev);
+               if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+                       blocked_rdev = rdev;
+                       atomic_inc(&rdev->nr_pending);
+                       break;
+               }
                 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
                         /* The ReadError flag will just be confusing now */
                         clear_bit(R5_ReadError, &dev->flags);
@@ -2680,6 +2701,11 @@ static void handle_stripe5(struct stripe_head *sh)
         }
         rcu_read_unlock();
  
+       if (unlikely(blocked_rdev)) {
+               set_bit(STRIPE_HANDLE, &sh->state);
+               goto unlock;
+       }
+
         if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
                 sh->ops.count++;
  
@@ -2726,9 +2752,11 @@ static void handle_stripe5(struct stripe_head *sh)
         /* leave prexor set until postxor is done, allows us to distinguish
          * a rmw from a rcw during biodrain
          */
+       prexor = 0;
         if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
                 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
  
+               prexor = 1;
                 clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
                 clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
                 clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
@@ -2759,9 +2787,8 @@ static void handle_stripe5(struct stripe_head *sh)
                                 (i == sh->pd_idx || dev->written)) {
                                 pr_debug("Writing block %d\n", i);
                                 set_bit(R5_Wantwrite, &dev->flags);
-                               if (!test_and_set_bit(
-                                   STRIPE_OP_IO, &sh->ops.pending))
-                                       sh->ops.count++;
+                               if (prexor)
+                                       continue;
                                 if (!test_bit(R5_Insync, &dev->flags) ||
                                     (i == sh->pd_idx && s.failed == 0))
                                         set_bit(STRIPE_INSYNC, &sh->state);
@@ -2813,16 +2840,12 @@ static void handle_stripe5(struct stripe_head *sh)
                 dev = &sh->dev[s.failed_num];
                 if (!test_bit(R5_ReWrite, &dev->flags)) {
                         set_bit(R5_Wantwrite, &dev->flags);
-                       if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-                               sh->ops.count++;
                         set_bit(R5_ReWrite, &dev->flags);
                         set_bit(R5_LOCKED, &dev->flags);
                         s.locked++;
                 } else {
                         /* let's read it back */
                         set_bit(R5_Wantread, &dev->flags);
-                       if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-                               sh->ops.count++;
                         set_bit(R5_LOCKED, &dev->flags);
                         s.locked++;
                 }
@@ -2840,11 +2863,12 @@ static void handle_stripe5(struct stripe_head *sh)
                 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
                 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
  
                 for (i = conf->raid_disks; i--; ) {
                         set_bit(R5_Wantwrite, &sh->dev[i].flags);
-                       if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-                               sh->ops.count++;
+                       /* every block queued for write is locked and counted */
+                       set_bit(R5_LOCKED, &sh->dev[i].flags);
+                       s.locked++;
                 }
         }
  
         if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
@@ -2855,6 +2877,7 @@ static void handle_stripe5(struct stripe_head *sh)
                         conf->raid_disks);
                 s.locked += handle_write_operations5(sh, 1, 1);
         } else if (s.expanded &&
+                  s.locked == 0 &&
                 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
                 clear_bit(STRIPE_EXPAND_READY, &sh->state);
                 atomic_dec(&conf->reshape_stripes);
@@ -2862,17 +2885,25 @@ static void handle_stripe5(struct stripe_head *sh)
                 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
         }
  
-       if (s.expanding && s.locked == 0)
+       if (s.expanding && s.locked == 0 &&
+           !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
                 handle_stripe_expansion(conf, sh, NULL);
  
         if (sh->ops.count)
                 pending = get_stripe_work(sh);
  
+ unlock:
         spin_unlock(&sh->lock);
  
+       /* wait for this device to become unblocked */
+       if (unlikely(blocked_rdev))
+               md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
+
         if (pending)
                 raid5_run_ops(sh, pending);
  
+       ops_run_io(sh);
+
         return_io(return_bi);
  
  }
@@ -2886,6 +2917,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
         struct stripe_head_state s;
         struct r6_state r6s;
         struct r5dev *dev, *pdev, *qdev;
+       mdk_rdev_t *blocked_rdev = NULL;
  
         r6s.qd_idx = raid6_next_disk(pd_idx, disks);
         pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
@@ -2949,6 +2981,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                 if (dev->written)
                         s.written++;
                 rdev = rcu_dereference(conf->disks[i].rdev);
+               if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+                       blocked_rdev = rdev;
+                       atomic_inc(&rdev->nr_pending);
+                       break;
+               }
                 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
                         /* The ReadError flag will just be confusing now */
                         clear_bit(R5_ReadError, &dev->flags);
@@ -2963,6 +3000,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                         set_bit(R5_Insync, &dev->flags);
         }
         rcu_read_unlock();
+
+       if (unlikely(blocked_rdev)) {
+               set_bit(STRIPE_HANDLE, &sh->state);
+               goto unlock;
+       }
         pr_debug("locked=%d uptodate=%d to_read=%d"
                " to_write=%d failed=%d failed_num=%d,%d\n",
                s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -3064,11 +3106,17 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
         }
  
-       if (s.expanding && s.locked == 0)
+       if (s.expanding && s.locked == 0 &&
+           !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
                 handle_stripe_expansion(conf, sh, &r6s);
  
+ unlock:
         spin_unlock(&sh->lock);
  
+       /* wait for this device to become unblocked */
+       if (unlikely(blocked_rdev))
+               md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
+
         return_io(return_bi);
  
         for (i=disks; i-- ;) {
@@ -3082,6 +3130,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                 else
                         continue;
  
+               set_bit(STRIPE_IO_STARTED, &sh->state);
+
                 bi = &sh->dev[i].req;
  
                 bi->bi_rw = rw;
@@ -3152,9 +3202,10 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
                         clear_bit(STRIPE_DELAYED, &sh->state);
                         if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                 atomic_inc(&conf->preread_active_stripes);
-                       list_add_tail(&sh->lru, &conf->handle_list);
+                       list_add_tail(&sh->lru, &conf->hold_list);
                 }
-       }
+       } else
+               blk_plug_device(conf->mddev->queue);
  }
  
  static void activate_bit_delay(raid5_conf_t *conf)
@@ -3185,8 +3236,7 @@ static void unplug_slaves(mddev_t *mddev)
                         atomic_inc(&rdev->nr_pending);
                         rcu_read_unlock();
  
-                       if (r_queue->unplug_fn)
-                               r_queue->unplug_fn(r_queue);
+                       blk_unplug(r_queue);
  
                         rdev_dec_pending(rdev, mddev);
                         rcu_read_lock();
@@ -3214,36 +3264,6 @@ static void raid5_unplug_device(struct request_queue *q)
         unplug_slaves(mddev);
  }
  
-static int raid5_issue_flush(struct request_queue *q, struct gendisk *disk,
-                            sector_t *error_sector)
-{
-       mddev_t *mddev = q->queuedata;
-       raid5_conf_t *conf = mddev_to_conf(mddev);
-       int i, ret = 0;
-
-       rcu_read_lock();
-       for (i=0; i<mddev->raid_disks && ret == 0; i++) {
-               mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
-               if (rdev && !test_bit(Faulty, &rdev->flags)) {
-                       struct block_device *bdev = rdev->bdev;
-                       struct request_queue *r_queue = bdev_get_queue(bdev);
-
-                       if (!r_queue->issue_flush_fn)
-                               ret = -EOPNOTSUPP;
-                       else {
-                               atomic_inc(&rdev->nr_pending);
-                               rcu_read_unlock();
-                               ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
-                                                             error_sector);
-                               rdev_dec_pending(rdev, mddev);
-                               rcu_read_lock();
-                       }
-               }
-       }
-       rcu_read_unlock();
-       return ret;
-}
-
  static int raid5_congested(void *data, int bits)
  {
         mddev_t *mddev = data;
@@ -3340,7 +3360,7 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
   *  first).
   *  If the read failed..
   */
-static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
+static void raid5_align_endio(struct bio *bi, int error)
  {
         struct bio* raid_bi  = bi->bi_private;
         mddev_t *mddev;
@@ -3348,8 +3368,6 @@ static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
         int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
         mdk_rdev_t *rdev;
  
-       if (bi->bi_size)
-               return 1;
         bio_put(bi);
  
         mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
@@ -3360,17 +3378,16 @@ static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
         rdev_dec_pending(rdev, conf->mddev);
  
         if (!error && uptodate) {
-               bio_endio(raid_bi, bytes, 0);
+               bio_endio(raid_bi, 0);
                 if (atomic_dec_and_test(&conf->active_aligned_reads))
                         wake_up(&conf->wait_for_stripe);
-               return 0;
+               return;
         }
  
  
         pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
  
         add_bio_to_retry(raid_bi, conf);
-       return 0;
  }
  
  static int bio_fits_rdev(struct bio *bi)
@@ -3463,6 +3480,58 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
         }
  }
  
+/* __get_priority_stripe - get the next stripe to process
+ *
+ * Full stripe writes are allowed to pass preread active stripes up until
+ * the bypass_threshold is exceeded.  In general the bypass_count
+ * increments when the handle_list is handled before the hold_list; however, it
+ * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
+ * stripe with in flight i/o.  The bypass_count will be reset when the
+ * head of the hold_list has changed, i.e. the head was promoted to the
+ * handle_list.
+ */
+static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
+{
+       struct stripe_head *sh;
+
+       pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
+                 __func__,
+                 list_empty(&conf->handle_list) ? "empty" : "busy",
+                 list_empty(&conf->hold_list) ? "empty" : "busy",
+                 atomic_read(&conf->pending_full_writes), conf->bypass_count);
+
+       if (!list_empty(&conf->handle_list)) {
+               sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+
+               if (list_empty(&conf->hold_list))
+                       conf->bypass_count = 0;
+               else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
+                       if (conf->hold_list.next == conf->last_hold)
+                               conf->bypass_count++;
+                       else {
+                               conf->last_hold = conf->hold_list.next;
+                               conf->bypass_count -= conf->bypass_threshold;
+                               if (conf->bypass_count < 0)
+                                       conf->bypass_count = 0;
+                       }
+               }
+       } else if (!list_empty(&conf->hold_list) &&
+                  ((conf->bypass_threshold &&
+                    conf->bypass_count > conf->bypass_threshold) ||
+                   atomic_read(&conf->pending_full_writes) == 0)) {
+               sh = list_entry(conf->hold_list.next,
+                               typeof(*sh), lru);
+               conf->bypass_count -= conf->bypass_threshold;
+               if (conf->bypass_count < 0)
+                       conf->bypass_count = 0;
+       } else
+               return NULL;
+
+       list_del_init(&sh->lru);
+       atomic_inc(&sh->count);
+       BUG_ON(atomic_read(&sh->count) != 1);
+       return sh;
+}
  
  static int make_request(struct request_queue *q, struct bio * bi)
  {
@@ -3476,7 +3545,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
         int remaining;
  
         if (unlikely(bio_barrier(bi))) {
-               bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
+               bio_endio(bi, -EOPNOTSUPP);
                 return 0;
         }
  
@@ -3578,7 +3647,8 @@ static int make_request(struct request_queue *q, struct bio * bi)
                                 goto retry;
                         }
                         finish_wait(&conf->wait_for_overlap, &w);
-                       handle_stripe(sh, NULL);
+                       set_bit(STRIPE_HANDLE, &sh->state);
+                       clear_bit(STRIPE_DELAYED, &sh->state);
                         release_stripe(sh);
                 } else {
                         /* cannot get stripe for read-ahead, just give-up */
@@ -3592,14 +3662,11 @@ static int make_request(struct request_queue *q, struct bio * bi)
         remaining = --bi->bi_phys_segments;
         spin_unlock_irq(&conf->device_lock);
         if (remaining == 0) {
-               int bytes = bi->bi_size;
  
                 if ( rw == WRITE )
                         md_write_end(mddev);
-               bi->bi_size = 0;
-               bi->bi_end_io(bi, bytes,
-                             test_bit(BIO_UPTODATE, &bi->bi_flags)
-                               ? 0 : -EIO);
+
+               bio_endio(bi, 0);
         }
         return 0;
  }
@@ -3728,6 +3795,25 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                 release_stripe(sh);
                 first_sector += STRIPE_SECTORS;
         }
+       /* If this takes us to the resync_max point where we have to pause,
+        * then we need to write out the superblock.
+        */
+       sector_nr += conf->chunk_size>>9;
+       if (sector_nr >= mddev->resync_max) {
+               /* Cannot proceed until we've updated the superblock... */
+               wait_event(conf->wait_for_overlap,
+                          atomic_read(&conf->reshape_stripes) == 0);
+               mddev->reshape_position = conf->expand_progress;
+               set_bit(MD_CHANGE_DEVS, &mddev->flags);
+               md_wakeup_thread(mddev->thread);
+               wait_event(mddev->sb_wait,
+                          !test_bit(MD_CHANGE_DEVS, &mddev->flags)
+                          || kthread_should_stop());
+               spin_lock_irq(&conf->device_lock);
+               conf->expand_lo = mddev->reshape_position;
+               spin_unlock_irq(&conf->device_lock);
+               wake_up(&conf->wait_for_overlap);
+       }
         return conf->chunk_size>>9;
  }
  
@@ -3764,6 +3850,12 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
                 return reshape_request(mddev, sector_nr, skipped);
  
+       /* No need to check resync_max as we never do more than one
+        * stripe, and as resync_max will always be on a chunk boundary,
+        * if the check in md_do_sync didn't fire, there is no chance
+        * of overstepping resync_max here
+        */
+
         /* if there is too many failed drives and we are trying
          * to resync, then assert that we are finished, because there is
          * nothing we can do.
@@ -3783,6 +3875,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
                 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
         }
  
+
+       bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+
         pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
         sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
         if (sh == NULL) {
@@ -3874,14 +3969,8 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
         spin_lock_irq(&conf->device_lock);
         remaining = --raid_bio->bi_phys_segments;
         spin_unlock_irq(&conf->device_lock);
-       if (remaining == 0) {
-               int bytes = raid_bio->bi_size;
-
-               raid_bio->bi_size = 0;
-               raid_bio->bi_end_io(raid_bio, bytes,
-                             test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
-                               ? 0 : -EIO);
-       }
+       if (remaining == 0)
+               bio_endio(raid_bio, 0);
         if (atomic_dec_and_test(&conf->active_aligned_reads))
                 wake_up(&conf->wait_for_stripe);
         return handled;
@@ -3896,7 +3985,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
   * During the scan, completed stripes are saved for us by the interrupt
   * handler, so that they will not have to wait for our next wakeup.
   */
-static void raid5d (mddev_t *mddev)
+static void raid5d(mddev_t *mddev)
  {
         struct stripe_head *sh;
         raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -3909,7 +3998,6 @@ static void raid5d (mddev_t *mddev)
         handled = 0;
         spin_lock_irq(&conf->device_lock);
         while (1) {
-               struct list_head *first;
                 struct bio *bio;
  
                 if (conf->seq_flush != conf->seq_write) {
@@ -3921,12 +4009,6 @@ static void raid5d (mddev_t *mddev)
                         activate_bit_delay(conf);
                 }
  
-               if (list_empty(&conf->handle_list) &&
-                   atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
-                   !blk_queue_plugged(mddev->queue) &&
-                   !list_empty(&conf->delayed_list))
-                       raid5_activate_delayed(conf);
-
                 while ((bio = remove_bio_from_retry(conf))) {
                         int ok;
                         spin_unlock_irq(&conf->device_lock);
@@ -3937,17 +4019,12 @@ static void raid5d (mddev_t *mddev)
                         handled++;
                 }
  
-               if (list_empty(&conf->handle_list)) {
+               sh = __get_priority_stripe(conf);
+
+               if (!sh) {
                         async_tx_issue_pending_all();
                         break;
                 }
-
-               first = conf->handle_list.next;
-               sh = list_entry(first, struct stripe_head, lru);
-
-               list_del_init(first);
-               atomic_inc(&sh->count);
-               BUG_ON(atomic_read(&sh->count)!= 1);
                 spin_unlock_irq(&conf->device_lock);
                 
                 handled++;
@@ -3979,15 +4056,13 @@ static ssize_t
  raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
  {
         raid5_conf_t *conf = mddev_to_conf(mddev);
-       char *end;
-       int new;
+       unsigned long new;
         if (len >= PAGE_SIZE)
                 return -EINVAL;
         if (!conf)
                 return -ENODEV;
  
-       new = simple_strtoul(page, &end, 10);
-       if (!*page || (*end && *end != '\n') )
+       if (strict_strtoul(page, 10, &new))
                 return -EINVAL;
         if (new <= 16 || new > 32768)
                 return -EINVAL;
@@ -4012,6 +4087,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
                                 raid5_store_stripe_cache_size);
  
  static ssize_t
+raid5_show_preread_threshold(mddev_t *mddev, char *page)
+{
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+       if (conf)
+               return sprintf(page, "%d\n", conf->bypass_threshold);
+       else
+               return 0;
+}
+
+static ssize_t
+raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
+{
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+       unsigned long new;
+       if (len >= PAGE_SIZE)
+               return -EINVAL;
+       if (!conf)
+               return -ENODEV;
+
+       if (strict_strtoul(page, 10, &new))
+               return -EINVAL;
+       if (new > conf->max_nr_stripes)
+               return -EINVAL;
+       conf->bypass_threshold = new;
+       return len;
+}
+
+static struct md_sysfs_entry
+raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
+                                       S_IRUGO | S_IWUSR,
+                                       raid5_show_preread_threshold,
+                                       raid5_store_preread_threshold);
+
+static ssize_t
  stripe_cache_active_show(mddev_t *mddev, char *page)
  {
         raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4027,6 +4136,7 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
  static struct attribute *raid5_attrs[] =  {
         &raid5_stripecache_size.attr,
         &raid5_stripecache_active.attr,
+       &raid5_preread_bypass_threshold.attr,
         NULL,
  };
  static struct attribute_group raid5_attrs_group = {
@@ -4128,19 +4238,22 @@ static int run(mddev_t *mddev)
                         goto abort;
         }
         spin_lock_init(&conf->device_lock);
+       mddev->queue->queue_lock = &conf->device_lock;
         init_waitqueue_head(&conf->wait_for_stripe);
         init_waitqueue_head(&conf->wait_for_overlap);
         INIT_LIST_HEAD(&conf->handle_list);
+       INIT_LIST_HEAD(&conf->hold_list);
         INIT_LIST_HEAD(&conf->delayed_list);
         INIT_LIST_HEAD(&conf->bitmap_list);
         INIT_LIST_HEAD(&conf->inactive_list);
         atomic_set(&conf->active_stripes, 0);
         atomic_set(&conf->preread_active_stripes, 0);
         atomic_set(&conf->active_aligned_reads, 0);
+       conf->bypass_threshold = BYPASS_THRESHOLD;
  
         pr_debug("raid5: run(%s) called.\n", mdname(mddev));
  
-       ITERATE_RDEV(mddev,rdev,tmp) {
+       rdev_for_each(rdev, tmp, mddev) {
                 raid_disk = rdev->raid_disk;
                 if (raid_disk >= conf->raid_disks
                     || raid_disk < 0)
@@ -4155,7 +4268,9 @@ static int run(mddev_t *mddev)
                                 " disk %d\n", bdevname(rdev->bdev,b),
                                 raid_disk);
                         working_disks++;
-               }
+               } else
+                       /* Cannot rely on bitmap to complete recovery */
+                       conf->fullsync = 1;
         }
  
         /*
@@ -4279,7 +4394,6 @@ static int run(mddev_t *mddev)
                        mdname(mddev));
  
         mddev->queue->unplug_fn = raid5_unplug_device;
-       mddev->queue->issue_flush_fn = raid5_issue_flush;
         mddev->queue->backing_dev_info.congested_data = mddev;
         mddev->queue->backing_dev_info.congested_fn = raid5_congested;
  
@@ -4433,6 +4547,14 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
                         err = -EBUSY;
                         goto abort;
                 }
+               /* Only remove non-faulty devices if recovery
+                * isn't possible.
+                */
+               if (!test_bit(Faulty, &rdev->flags) &&
+                   mddev->degraded <= conf->max_degraded) {
+                       err = -EBUSY;
+                       goto abort;
+               }
                 p->rdev = NULL;
                 synchronize_rcu();
                 if (atomic_read(&rdev->nr_pending)) {
@@ -4450,35 +4572,41 @@ abort:
  static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
  {
         raid5_conf_t *conf = mddev->private;
-       int found = 0;
+       int err = -EEXIST;
         int disk;
         struct disk_info *p;
+       int first = 0;
+       int last = conf->raid_disks - 1;
  
         if (mddev->degraded > conf->max_degraded)
                 /* no point adding a device */
-               return 0;
+               return -EINVAL;
+
+       if (rdev->raid_disk >= 0)
+               first = last = rdev->raid_disk;
  
         /*
          * find the disk ... but prefer rdev->saved_raid_disk
          * if possible.
          */
         if (rdev->saved_raid_disk >= 0 &&
+           rdev->saved_raid_disk >= first &&
             conf->disks[rdev->saved_raid_disk].rdev == NULL)
                 disk = rdev->saved_raid_disk;
         else
-               disk = 0;
-       for ( ; disk < conf->raid_disks; disk++)
+               disk = first;
+       for ( ; disk <= last ; disk++)
                 if ((p=conf->disks + disk)->rdev == NULL) {
                         clear_bit(In_sync, &rdev->flags);
                         rdev->raid_disk = disk;
-                       found = 1;
+                       err = 0;
                         if (rdev->saved_raid_disk != disk)
                                 conf->fullsync = 1;
                         rcu_assign_pointer(p->rdev, rdev);
                         break;
                 }
         print_raid5_conf(conf);
-       return found;
+       return err;
  }
  
  static int raid5_resize(mddev_t *mddev, sector_t sectors)
@@ -4554,7 +4682,7 @@ static int raid5_start_reshape(mddev_t *mddev)
         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
                 return -EBUSY;
  
-       ITERATE_RDEV(mddev, rdev, rtmp)
+       rdev_for_each(rdev, rtmp, mddev)
                 if (rdev->raid_disk < 0 &&
                     !test_bit(Faulty, &rdev->flags))
                         spares++;
@@ -4576,10 +4704,10 @@ static int raid5_start_reshape(mddev_t *mddev)
         /* Add some new drives, as many as will fit.
          * We know there are enough to make the newly sized array work.
          */
-       ITERATE_RDEV(mddev, rdev, rtmp)
+       rdev_for_each(rdev, rtmp, mddev)
                 if (rdev->raid_disk < 0 &&
                     !test_bit(Faulty, &rdev->flags)) {
-                       if (raid5_add_disk(mddev, rdev)) {
+                       if (raid5_add_disk(mddev, rdev) == 0) {
                                 char nm[20];
                                 set_bit(In_sync, &rdev->flags);
                                 added_devices++;