dm raid1: use list_split_init

[safe/jmp/linux-2.6] / drivers / md / raid5.c
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index c6e0e2b..b162b83 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -108,12 +108,11 @@ static void return_io(struct bio *return_bi)
  {
         struct bio *bi = return_bi;
         while (bi) {
-               int bytes = bi->bi_size;
  
                 return_bi = bi->bi_next;
                 bi->bi_next = NULL;
                 bi->bi_size = 0;
-               bi->bi_end_io(bi, bytes,
+               bi->bi_end_io(bi,
                               test_bit(BIO_UPTODATE, &bi->bi_flags)
                                 ? 0 : -EIO);
                 bi = return_bi;
@@ -289,7 +288,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
  }
  
  static void unplug_slaves(mddev_t *mddev);
-static void raid5_unplug_device(request_queue_t *q);
+static void raid5_unplug_device(struct request_queue *q);
  
  static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
                                              int pd_idx, int noblock)
@@ -377,15 +376,20 @@ static unsigned long get_stripe_work(struct stripe_head *sh)
                 ack++;
  
         sh->ops.count -= ack;
-       BUG_ON(sh->ops.count < 0);
+       if (unlikely(sh->ops.count < 0)) {
+               printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
+                       "ops.complete: %#lx\n", pending, sh->ops.pending,
+                       sh->ops.ack, sh->ops.complete);
+               BUG();
+       }
  
         return pending;
  }
  
-static int
-raid5_end_read_request(struct bio *bi, unsigned int bytes_done, int error);
-static int
-raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error);
+static void
+raid5_end_read_request(struct bio *bi, int error);
+static void
+raid5_end_write_request(struct bio *bi, int error);
  
  static void ops_run_io(struct stripe_head *sh)
  {
@@ -493,12 +497,12 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
                         if (frombio)
                                 tx = async_memcpy(page, bio_page, page_offset,
                                         b_offset, clen,
-                                       ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_SRC,
+                                       ASYNC_TX_DEP_ACK,
                                         tx, NULL, NULL);
                         else
                                 tx = async_memcpy(bio_page, page, b_offset,
                                         page_offset, clen,
-                                       ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_DST,
+                                       ASYNC_TX_DEP_ACK,
                                         tx, NULL, NULL);
                 }
                 if (clen < len) /* hit end of page */
@@ -514,7 +518,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
         struct stripe_head *sh = stripe_head_ref;
         struct bio *return_bi = NULL;
         raid5_conf_t *conf = sh->raid_conf;
-       int i, more_to_read = 0;
+       int i;
  
         pr_debug("%s: stripe %llu\n", __FUNCTION__,
                 (unsigned long long)sh->sector);
@@ -522,16 +526,14 @@ static void ops_complete_biofill(void *stripe_head_ref)
         /* clear completed biofills */
         for (i = sh->disks; i--; ) {
                 struct r5dev *dev = &sh->dev[i];
-               /* check if this stripe has new incoming reads */
-               if (dev->toread)
-                       more_to_read++;
  
                 /* acknowledge completion of a biofill operation */
-               /* and check if we need to reply to a read request
-               */
-               if (test_bit(R5_Wantfill, &dev->flags) && !dev->toread) {
+               /* and check if we need to reply to a read request;
+                * new R5_Wantfill requests are held off until
+                * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)
+                */
+               if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
                         struct bio *rbi, *rbi2;
-                       clear_bit(R5_Wantfill, &dev->flags);
  
                         /* The access to dev->read is outside of the
                          * spin_lock_irq(&conf->device_lock), but is protected
@@ -553,13 +555,11 @@ static void ops_complete_biofill(void *stripe_head_ref)
                         }
                 }
         }
-       clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
-       clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
+       set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
  
         return_io(return_bi);
  
-       if (more_to_read)
-               set_bit(STRIPE_HANDLE, &sh->state);
+       set_bit(STRIPE_HANDLE, &sh->state);
         release_stripe(sh);
  }
  
@@ -688,7 +688,8 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
  }
  
  static struct dma_async_tx_descriptor *
-ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
+                unsigned long pending)
  {
         int disks = sh->disks;
         int pd_idx = sh->pd_idx, i;
@@ -696,7 +697,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
         /* check if prexor is active which means only process blocks
          * that are part of a read-modify-write (Wantprexor)
          */
-       int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
+       int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
  
         pr_debug("%s: stripe %llu\n", __FUNCTION__,
                 (unsigned long long)sh->sector);
@@ -773,7 +774,8 @@ static void ops_complete_write(void *stripe_head_ref)
  }
  
  static void
-ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
+               unsigned long pending)
  {
         /* kernel stack size limits the total number of disks */
         int disks = sh->disks;
@@ -781,7 +783,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
  
         int count = 0, pd_idx = sh->pd_idx, i;
         struct page *xor_dest;
-       int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
+       int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
         unsigned long flags;
         dma_async_tx_callback callback;
  
@@ -808,7 +810,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
         }
  
         /* check whether this postxor is part of a write */
-       callback = test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) ?
+       callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ?
                 ops_complete_write : ops_complete_postxor;
  
         /* 1/ if we prexor'd then the dest is reused as a source
@@ -896,12 +898,12 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
                 tx = ops_run_prexor(sh, tx);
  
         if (test_bit(STRIPE_OP_BIODRAIN, &pending)) {
-               tx = ops_run_biodrain(sh, tx);
+               tx = ops_run_biodrain(sh, tx, pending);
                 overlap_clear++;
         }
  
         if (test_bit(STRIPE_OP_POSTXOR, &pending))
-               ops_run_postxor(sh, tx);
+               ops_run_postxor(sh, tx, pending);
  
         if (test_bit(STRIPE_OP_CHECK, &pending))
                 ops_run_check(sh);
@@ -951,7 +953,7 @@ static int grow_stripes(raid5_conf_t *conf, int num)
         conf->active_name = 0;
         sc = kmem_cache_create(conf->cache_name[conf->active_name],
                                sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
-                              0, 0, NULL, NULL);
+                              0, 0, NULL);
         if (!sc)
                 return 1;
         conf->slab_cache = sc;
@@ -1003,7 +1005,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
         /* Step 1 */
         sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
                                sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
-                              0, 0, NULL, NULL);
+                              0, 0, NULL);
         if (!sc)
                 return -ENOMEM;
  
@@ -1113,8 +1115,7 @@ static void shrink_stripes(raid5_conf_t *conf)
         conf->slab_cache = NULL;
  }
  
-static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
-                                  int error)
+static void raid5_end_read_request(struct bio * bi, int error)
  {
         struct stripe_head *sh = bi->bi_private;
         raid5_conf_t *conf = sh->raid_conf;
@@ -1123,8 +1124,6 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
         char b[BDEVNAME_SIZE];
         mdk_rdev_t *rdev;
  
-       if (bi->bi_size)
-               return 1;
  
         for (i=0 ; i<disks; i++)
                 if (bi == &sh->dev[i].req)
@@ -1135,7 +1134,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
                 uptodate);
         if (i == disks) {
                 BUG();
-               return 0;
+               return;
         }
  
         if (uptodate) {
@@ -1144,7 +1143,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
                         rdev = conf->disks[i].rdev;
                         printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
                                mdname(conf->mddev), STRIPE_SECTORS,
-                              (unsigned long long)sh->sector + rdev->data_offset,
+                              (unsigned long long)(sh->sector + rdev->data_offset),
                                bdevname(rdev->bdev, b));
                         clear_bit(R5_ReadError, &sh->dev[i].flags);
                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
@@ -1161,13 +1160,13 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
                 if (conf->mddev->degraded)
                         printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
                                mdname(conf->mddev),
-                              (unsigned long long)sh->sector + rdev->data_offset,
+                              (unsigned long long)(sh->sector + rdev->data_offset),
                                bdn);
                 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
                         /* Oh, no!!! */
                         printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
                                mdname(conf->mddev),
-                              (unsigned long long)sh->sector + rdev->data_offset,
+                              (unsigned long long)(sh->sector + rdev->data_offset),
                                bdn);
                 else if (atomic_read(&rdev->read_errors)
                          > conf->max_nr_stripes)
@@ -1188,20 +1187,15 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
         clear_bit(R5_LOCKED, &sh->dev[i].flags);
         set_bit(STRIPE_HANDLE, &sh->state);
         release_stripe(sh);
-       return 0;
  }
  
-static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
-                                   int error)
+static void raid5_end_write_request (struct bio *bi, int error)
  {
         struct stripe_head *sh = bi->bi_private;
         raid5_conf_t *conf = sh->raid_conf;
         int disks = sh->disks, i;
         int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
  
-       if (bi->bi_size)
-               return 1;
-
         for (i=0 ; i<disks; i++)
                 if (bi == &sh->dev[i].req)
                         break;
@@ -1211,7 +1205,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
                 uptodate);
         if (i == disks) {
                 BUG();
-               return 0;
+               return;
         }
  
         if (!uptodate)
@@ -1222,7 +1216,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
         clear_bit(R5_LOCKED, &sh->dev[i].flags);
         set_bit(STRIPE_HANDLE, &sh->state);
         release_stripe(sh);
-       return 0;
  }
  
  
@@ -1516,130 +1509,6 @@ static void copy_data(int frombio, struct bio *bio,
                            }                                              \
                         } while(0)
  
-
-static void compute_block(struct stripe_head *sh, int dd_idx)
-{
-       int i, count, disks = sh->disks;
-       void *ptr[MAX_XOR_BLOCKS], *dest, *p;
-
-       pr_debug("compute_block, stripe %llu, idx %d\n",
-               (unsigned long long)sh->sector, dd_idx);
-
-       dest = page_address(sh->dev[dd_idx].page);
-       memset(dest, 0, STRIPE_SIZE);
-       count = 0;
-       for (i = disks ; i--; ) {
-               if (i == dd_idx)
-                       continue;
-               p = page_address(sh->dev[i].page);
-               if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
-                       ptr[count++] = p;
-               else
-                       printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
-                               " not present\n", dd_idx,
-                               (unsigned long long)sh->sector, i);
-
-               check_xor();
-       }
-       if (count)
-               xor_blocks(count, STRIPE_SIZE, dest, ptr);
-       set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
-}
-
-static void compute_parity5(struct stripe_head *sh, int method)
-{
-       raid5_conf_t *conf = sh->raid_conf;
-       int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
-       void *ptr[MAX_XOR_BLOCKS], *dest;
-       struct bio *chosen;
-
-       pr_debug("compute_parity5, stripe %llu, method %d\n",
-               (unsigned long long)sh->sector, method);
-
-       count = 0;
-       dest = page_address(sh->dev[pd_idx].page);
-       switch(method) {
-       case READ_MODIFY_WRITE:
-               BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
-               for (i=disks ; i-- ;) {
-                       if (i==pd_idx)
-                               continue;
-                       if (sh->dev[i].towrite &&
-                           test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
-                               ptr[count++] = page_address(sh->dev[i].page);
-                               chosen = sh->dev[i].towrite;
-                               sh->dev[i].towrite = NULL;
-
-                               if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-                                       wake_up(&conf->wait_for_overlap);
-
-                               BUG_ON(sh->dev[i].written);
-                               sh->dev[i].written = chosen;
-                               check_xor();
-                       }
-               }
-               break;
-       case RECONSTRUCT_WRITE:
-               memset(dest, 0, STRIPE_SIZE);
-               for (i= disks; i-- ;)
-                       if (i!=pd_idx && sh->dev[i].towrite) {
-                               chosen = sh->dev[i].towrite;
-                               sh->dev[i].towrite = NULL;
-
-                               if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-                                       wake_up(&conf->wait_for_overlap);
-
-                               BUG_ON(sh->dev[i].written);
-                               sh->dev[i].written = chosen;
-                       }
-               break;
-       case CHECK_PARITY:
-               break;
-       }
-       if (count) {
-               xor_blocks(count, STRIPE_SIZE, dest, ptr);
-               count = 0;
-       }
-       
-       for (i = disks; i--;)
-               if (sh->dev[i].written) {
-                       sector_t sector = sh->dev[i].sector;
-                       struct bio *wbi = sh->dev[i].written;
-                       while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-                               copy_data(1, wbi, sh->dev[i].page, sector);
-                               wbi = r5_next_bio(wbi, sector);
-                       }
-
-                       set_bit(R5_LOCKED, &sh->dev[i].flags);
-                       set_bit(R5_UPTODATE, &sh->dev[i].flags);
-               }
-
-       switch(method) {
-       case RECONSTRUCT_WRITE:
-       case CHECK_PARITY:
-               for (i=disks; i--;)
-                       if (i != pd_idx) {
-                               ptr[count++] = page_address(sh->dev[i].page);
-                               check_xor();
-                       }
-               break;
-       case READ_MODIFY_WRITE:
-               for (i = disks; i--;)
-                       if (sh->dev[i].written) {
-                               ptr[count++] = page_address(sh->dev[i].page);
-                               check_xor();
-                       }
-       }
-       if (count)
-               xor_blocks(count, STRIPE_SIZE, dest, ptr);
-
-       if (method != CHECK_PARITY) {
-               set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-               set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
-       } else
-               clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-}
-
  static void compute_parity6(struct stripe_head *sh, int method)
  {
         raid6_conf_t *conf = sh->raid_conf;
@@ -2326,6 +2195,9 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
                                                 "%d for r-m-w\n", i);
                                         set_bit(R5_LOCKED, &dev->flags);
                                         set_bit(R5_Wantread, &dev->flags);
+                                       if (!test_and_set_bit(
+                                               STRIPE_OP_IO, &sh->ops.pending))
+                                               sh->ops.count++;
                                         s->locked++;
                                 } else {
                                         set_bit(STRIPE_DELAYED, &sh->state);
@@ -2349,6 +2221,9 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
                                                 "%d for Reconstruct\n", i);
                                         set_bit(R5_LOCKED, &dev->flags);
                                         set_bit(R5_Wantread, &dev->flags);
+                                       if (!test_and_set_bit(
+                                               STRIPE_OP_IO, &sh->ops.pending))
+                                               sh->ops.count++;
                                         s->locked++;
                                 } else {
                                         set_bit(STRIPE_DELAYED, &sh->state);
@@ -2473,25 +2348,15 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
  static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
                                 struct stripe_head_state *s, int disks)
  {
+       int canceled_check = 0;
+
         set_bit(STRIPE_HANDLE, &sh->state);
-       /* Take one of the following actions:
-        * 1/ start a check parity operation if (uptodate == disks)
-        * 2/ finish a check parity operation and act on the result
-        * 3/ skip to the writeback section if we previously
-        *    initiated a recovery operation
-        */
-       if (s->failed == 0 &&
-           !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
-               if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
-                       BUG_ON(s->uptodate != disks);
-                       clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
-                       sh->ops.count++;
-                       s->uptodate--;
-               } else if (
-                      test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
-                       clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
-                       clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
  
+       /* complete a check operation */
+       if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
+           clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
+           clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
+               if (s->failed == 0) {
                         if (sh->ops.zero_sum_result == 0)
                                 /* parity is correct (on disc,
                                  * not in buffer any more)
@@ -2516,7 +2381,8 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
                                         s->uptodate++;
                                 }
                         }
-               }
+               } else
+                       canceled_check = 1; /* STRIPE_INSYNC is not set */
         }
  
         /* check if we can clear a parity disk reconstruct */
@@ -2529,12 +2395,28 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
                 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
         }
  
+       /* start a new check operation if there are no failures, the stripe is
+        * not insync, and a repair is not in flight
+        */
+       if (s->failed == 0 &&
+           !test_bit(STRIPE_INSYNC, &sh->state) &&
+           !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+               if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
+                       BUG_ON(s->uptodate != disks);
+                       clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+                       sh->ops.count++;
+                       s->uptodate--;
+               }
+       }
+
         /* Wait for check parity and compute block operations to complete
-        * before write-back
+        * before write-back.  If a failure occurred while the check operation
+        * was in flight we need to cycle this stripe through handle_stripe
+        * since the parity block may not be uptodate
          */
-       if (!test_bit(STRIPE_INSYNC, &sh->state) &&
-               !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
-               !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
+       if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
+           !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
+           !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
                 struct r5dev *dev;
                 /* either failed parity check, or recovery is happening */
                 if (s->failed == 0)
@@ -2545,6 +2427,9 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
  
                 set_bit(R5_LOCKED, &dev->flags);
                 set_bit(R5_Wantwrite, &dev->flags);
+               if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+                       sh->ops.count++;
+
                 clear_bit(STRIPE_DEGRADED, &sh->state);
                 s->locked++;
                 set_bit(STRIPE_INSYNC, &sh->state);
@@ -2656,7 +2541,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
         struct dma_async_tx_descriptor *tx = NULL;
         clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
         for (i = 0; i < sh->disks; i++)
-               if (i != sh->pd_idx && (r6s && i != r6s->qd_idx)) {
+               if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) {
                         int dd_idx, pd_idx, j;
                         struct stripe_head *sh2;
  
@@ -2689,7 +2574,8 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
                         set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
                         for (j = 0; j < conf->raid_disks; j++)
                                 if (j != sh2->pd_idx &&
-                                   (r6s && j != r6s->qd_idx) &&
+                                   (!r6s || j != raid6_next_disk(sh2->pd_idx,
+                                                                sh2->disks)) &&
                                     !test_bit(R5_Expanded, &sh2->dev[j].flags))
                                         break;
                         if (j == conf->raid_disks) {
@@ -2698,12 +2584,12 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
                         }
                         release_stripe(sh2);
  
-                       /* done submitting copies, wait for them to complete */
-                       if (i + 1 >= sh->disks) {
-                               async_tx_ack(tx);
-                               dma_wait_for_async_tx(tx);
-                       }
                 }
+       /* done submitting copies, wait for them to complete */
+       if (tx) {
+               async_tx_ack(tx);
+               dma_wait_for_async_tx(tx);
+       }
  }
  
  /*
@@ -2747,6 +2633,13 @@ static void handle_stripe5(struct stripe_head *sh)
         s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
         /* Now to look around and see what can be done */
  
+       /* clean-up completed biofill operations */
+       if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
+               clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
+               clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
+               clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
+       }
+
         rcu_read_lock();
         for (i=disks; i--; ) {
                 mdk_rdev_t *rdev;
@@ -2930,12 +2823,16 @@ static void handle_stripe5(struct stripe_head *sh)
                 dev = &sh->dev[s.failed_num];
                 if (!test_bit(R5_ReWrite, &dev->flags)) {
                         set_bit(R5_Wantwrite, &dev->flags);
+                       if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+                               sh->ops.count++;
                         set_bit(R5_ReWrite, &dev->flags);
                         set_bit(R5_LOCKED, &dev->flags);
                         s.locked++;
                 } else {
                         /* let's read it back */
                         set_bit(R5_Wantread, &dev->flags);
+                       if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+                               sh->ops.count++;
                         set_bit(R5_LOCKED, &dev->flags);
                         s.locked++;
                 }
@@ -2966,7 +2863,7 @@ static void handle_stripe5(struct stripe_head *sh)
                 sh->disks = conf->raid_disks;
                 sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
                         conf->raid_disks);
-               s.locked += handle_write_operations5(sh, 0, 1);
+               s.locked += handle_write_operations5(sh, 1, 1);
         } else if (s.expanded &&
                 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
                 clear_bit(STRIPE_EXPAND_READY, &sh->state);
@@ -2975,7 +2872,8 @@ static void handle_stripe5(struct stripe_head *sh)
                 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
         }
  
-       if (s.expanding && s.locked == 0)
+       if (s.expanding && s.locked == 0 &&
+           !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
                 handle_stripe_expansion(conf, sh, NULL);
  
         if (sh->ops.count)
@@ -2988,64 +2886,6 @@ static void handle_stripe5(struct stripe_head *sh)
  
         return_io(return_bi);
  
-       for (i=disks; i-- ;) {
-               int rw;
-               struct bio *bi;
-               mdk_rdev_t *rdev;
-               if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-                       rw = WRITE;
-               else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-                       rw = READ;
-               else
-                       continue;
- 
-               bi = &sh->dev[i].req;
- 
-               bi->bi_rw = rw;
-               if (rw == WRITE)
-                       bi->bi_end_io = raid5_end_write_request;
-               else
-                       bi->bi_end_io = raid5_end_read_request;
- 
-               rcu_read_lock();
-               rdev = rcu_dereference(conf->disks[i].rdev);
-               if (rdev && test_bit(Faulty, &rdev->flags))
-                       rdev = NULL;
-               if (rdev)
-                       atomic_inc(&rdev->nr_pending);
-               rcu_read_unlock();
- 
-               if (rdev) {
-                       if (s.syncing || s.expanding || s.expanded)
-                               md_sync_acct(rdev->bdev, STRIPE_SECTORS);
-
-                       bi->bi_bdev = rdev->bdev;
-                       pr_debug("for %llu schedule op %ld on disc %d\n",
-                               (unsigned long long)sh->sector, bi->bi_rw, i);
-                       atomic_inc(&sh->count);
-                       bi->bi_sector = sh->sector + rdev->data_offset;
-                       bi->bi_flags = 1 << BIO_UPTODATE;
-                       bi->bi_vcnt = 1;        
-                       bi->bi_max_vecs = 1;
-                       bi->bi_idx = 0;
-                       bi->bi_io_vec = &sh->dev[i].vec;
-                       bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
-                       bi->bi_io_vec[0].bv_offset = 0;
-                       bi->bi_size = STRIPE_SIZE;
-                       bi->bi_next = NULL;
-                       if (rw == WRITE &&
-                           test_bit(R5_ReWrite, &sh->dev[i].flags))
-                               atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
-                       generic_make_request(bi);
-               } else {
-                       if (rw == WRITE)
-                               set_bit(STRIPE_DEGRADED, &sh->state);
-                       pr_debug("skip op %ld on disc %d for sector %llu\n",
-                               bi->bi_rw, i, (unsigned long long)sh->sector);
-                       clear_bit(R5_LOCKED, &sh->dev[i].flags);
-                       set_bit(STRIPE_HANDLE, &sh->state);
-               }
-       }
  }
  
  static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
@@ -3235,7 +3075,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
         }
  
-       if (s.expanding && s.locked == 0)
+       if (s.expanding && s.locked == 0 &&
+           !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
                 handle_stripe_expansion(conf, sh, &r6s);
  
         spin_unlock(&sh->lock);
@@ -3325,7 +3166,8 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
                                 atomic_inc(&conf->preread_active_stripes);
                         list_add_tail(&sh->lru, &conf->handle_list);
                 }
-       }
+       } else
+               blk_plug_device(conf->mddev->queue);
  }
  
  static void activate_bit_delay(raid5_conf_t *conf)
@@ -3351,13 +3193,12 @@ static void unplug_slaves(mddev_t *mddev)
         for (i=0; i<mddev->raid_disks; i++) {
                 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
                 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-                       request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+                       struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
  
                         atomic_inc(&rdev->nr_pending);
                         rcu_read_unlock();
  
-                       if (r_queue->unplug_fn)
-                               r_queue->unplug_fn(r_queue);
+                       blk_unplug(r_queue);
  
                         rdev_dec_pending(rdev, mddev);
                         rcu_read_lock();
@@ -3366,7 +3207,7 @@ static void unplug_slaves(mddev_t *mddev)
         rcu_read_unlock();
  }
  
-static void raid5_unplug_device(request_queue_t *q)
+static void raid5_unplug_device(struct request_queue *q)
  {
         mddev_t *mddev = q->queuedata;
         raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -3385,36 +3226,6 @@ static void raid5_unplug_device(request_queue_t *q)
         unplug_slaves(mddev);
  }
  
-static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
-                            sector_t *error_sector)
-{
-       mddev_t *mddev = q->queuedata;
-       raid5_conf_t *conf = mddev_to_conf(mddev);
-       int i, ret = 0;
-
-       rcu_read_lock();
-       for (i=0; i<mddev->raid_disks && ret == 0; i++) {
-               mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
-               if (rdev && !test_bit(Faulty, &rdev->flags)) {
-                       struct block_device *bdev = rdev->bdev;
-                       request_queue_t *r_queue = bdev_get_queue(bdev);
-
-                       if (!r_queue->issue_flush_fn)
-                               ret = -EOPNOTSUPP;
-                       else {
-                               atomic_inc(&rdev->nr_pending);
-                               rcu_read_unlock();
-                               ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
-                                                             error_sector);
-                               rdev_dec_pending(rdev, mddev);
-                               rcu_read_lock();
-                       }
-               }
-       }
-       rcu_read_unlock();
-       return ret;
-}
-
  static int raid5_congested(void *data, int bits)
  {
         mddev_t *mddev = data;
@@ -3436,7 +3247,7 @@ static int raid5_congested(void *data, int bits)
  /* We want read requests to align with chunks where possible,
   * but write requests don't need to.
   */
-static int raid5_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec)
+static int raid5_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec)
  {
         mddev_t *mddev = q->queuedata;
         sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
@@ -3511,7 +3322,7 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
   *  first).
   *  If the read failed..
   */
-static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
+static void raid5_align_endio(struct bio *bi, int error)
  {
         struct bio* raid_bi  = bi->bi_private;
         mddev_t *mddev;
@@ -3519,8 +3330,6 @@ static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
         int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
         mdk_rdev_t *rdev;
  
-       if (bi->bi_size)
-               return 1;
         bio_put(bi);
  
         mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
@@ -3531,22 +3340,21 @@ static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
         rdev_dec_pending(rdev, conf->mddev);
  
         if (!error && uptodate) {
-               bio_endio(raid_bi, bytes, 0);
+               bio_endio(raid_bi, 0);
                 if (atomic_dec_and_test(&conf->active_aligned_reads))
                         wake_up(&conf->wait_for_stripe);
-               return 0;
+               return;
         }
  
  
         pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
  
         add_bio_to_retry(raid_bi, conf);
-       return 0;
  }
  
  static int bio_fits_rdev(struct bio *bi)
  {
-       request_queue_t *q = bdev_get_queue(bi->bi_bdev);
+       struct request_queue *q = bdev_get_queue(bi->bi_bdev);
  
         if ((bi->bi_size>>9) > q->max_sectors)
                 return 0;
@@ -3565,7 +3373,7 @@ static int bio_fits_rdev(struct bio *bi)
  }
  
  
-static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
+static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
  {
         mddev_t *mddev = q->queuedata;
         raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -3635,7 +3443,7 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
  }
  
  
-static int make_request(request_queue_t *q, struct bio * bi)
+static int make_request(struct request_queue *q, struct bio * bi)
  {
         mddev_t *mddev = q->queuedata;
         raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -3647,7 +3455,7 @@ static int make_request(request_queue_t *q, struct bio * bi)
         int remaining;
  
         if (unlikely(bio_barrier(bi))) {
-               bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
+               bio_endio(bi, -EOPNOTSUPP);
                 return 0;
         }
  
@@ -3749,7 +3557,8 @@ static int make_request(request_queue_t *q, struct bio * bi)
                                 goto retry;
                         }
                         finish_wait(&conf->wait_for_overlap, &w);
-                       handle_stripe(sh, NULL);
+                       set_bit(STRIPE_HANDLE, &sh->state);
+                       clear_bit(STRIPE_DELAYED, &sh->state);
                         release_stripe(sh);
                 } else {
                         /* cannot get stripe for read-ahead, just give-up */
@@ -3763,12 +3572,11 @@ static int make_request(request_queue_t *q, struct bio * bi)
         remaining = --bi->bi_phys_segments;
         spin_unlock_irq(&conf->device_lock);
         if (remaining == 0) {
-               int bytes = bi->bi_size;
  
                 if ( rw == WRITE )
                         md_write_end(mddev);
-               bi->bi_size = 0;
-               bi->bi_end_io(bi, bytes,
+
+               bi->bi_end_io(bi,
                               test_bit(BIO_UPTODATE, &bi->bi_flags)
                                 ? 0 : -EIO);
         }
@@ -3899,6 +3707,25 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                 release_stripe(sh);
                 first_sector += STRIPE_SECTORS;
         }
+       /* If this takes us to the resync_max point where we have to pause,
+        * then we need to write out the superblock.
+        */
+       sector_nr += conf->chunk_size>>9;
+       if (sector_nr >= mddev->resync_max) {
+               /* Cannot proceed until we've updated the superblock... */
+               wait_event(conf->wait_for_overlap,
+                          atomic_read(&conf->reshape_stripes) == 0);
+               mddev->reshape_position = conf->expand_progress;
+               set_bit(MD_CHANGE_DEVS, &mddev->flags);
+               md_wakeup_thread(mddev->thread);
+               wait_event(mddev->sb_wait,
+                          !test_bit(MD_CHANGE_DEVS, &mddev->flags)
+                          || kthread_should_stop());
+               spin_lock_irq(&conf->device_lock);
+               conf->expand_lo = mddev->reshape_position;
+               spin_unlock_irq(&conf->device_lock);
+               wake_up(&conf->wait_for_overlap);
+       }
         return conf->chunk_size>>9;
  }
  
@@ -3935,6 +3762,12 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
                 return reshape_request(mddev, sector_nr, skipped);
  
+       /* No need to check resync_max here: we never do more than one
+        * stripe, and resync_max will always be on a chunk boundary.
+        * So if the check in md_do_sync didn't fire, there is no chance
+        * of overstepping resync_max here.
+        */
+
         /* if there is too many failed drives and we are trying
          * to resync, then assert that we are finished, because there is
          * nothing we can do.
@@ -3954,6 +3787,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
                 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
         }
  
+
+       bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+
         pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
         sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
         if (sh == NULL) {
@@ -4046,10 +3882,8 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
         remaining = --raid_bio->bi_phys_segments;
         spin_unlock_irq(&conf->device_lock);
         if (remaining == 0) {
-               int bytes = raid_bio->bi_size;
  
-               raid_bio->bi_size = 0;
-               raid_bio->bi_end_io(raid_bio, bytes,
+               raid_bio->bi_end_io(raid_bio,
                               test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
                                 ? 0 : -EIO);
         }
@@ -4067,7 +3901,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
   * During the scan, completed stripes are saved for us by the interrupt
   * handler, so that they will not have to wait for our next wakeup.
   */
-static void raid5d (mddev_t *mddev)
+static void raid5d(mddev_t *mddev)
  {
         struct stripe_head *sh;
         raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4092,12 +3926,6 @@ static void raid5d (mddev_t *mddev)
                         activate_bit_delay(conf);
                 }
  
-               if (list_empty(&conf->handle_list) &&
-                   atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
-                   !blk_queue_plugged(mddev->queue) &&
-                   !list_empty(&conf->delayed_list))
-                       raid5_activate_delayed(conf);
-
                 while ((bio = remove_bio_from_retry(conf))) {
                         int ok;
                         spin_unlock_irq(&conf->device_lock);
@@ -4311,7 +4139,7 @@ static int run(mddev_t *mddev)
  
         pr_debug("raid5: run(%s) called.\n", mdname(mddev));
  
-       ITERATE_RDEV(mddev,rdev,tmp) {
+       rdev_for_each(rdev, tmp, mddev) {
                 raid_disk = rdev->raid_disk;
                 if (raid_disk >= conf->raid_disks
                     || raid_disk < 0)
@@ -4450,7 +4278,6 @@ static int run(mddev_t *mddev)
                        mdname(mddev));
  
         mddev->queue->unplug_fn = raid5_unplug_device;
-       mddev->queue->issue_flush_fn = raid5_issue_flush;
         mddev->queue->backing_dev_info.congested_data = mddev;
         mddev->queue->backing_dev_info.congested_fn = raid5_congested;
  
@@ -4725,7 +4552,7 @@ static int raid5_start_reshape(mddev_t *mddev)
         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
                 return -EBUSY;
  
-       ITERATE_RDEV(mddev, rdev, rtmp)
+       rdev_for_each(rdev, rtmp, mddev)
                 if (rdev->raid_disk < 0 &&
                     !test_bit(Faulty, &rdev->flags))
                         spares++;
@@ -4747,7 +4574,7 @@ static int raid5_start_reshape(mddev_t *mddev)
         /* Add some new drives, as many as will fit.
          * We know there are enough to make the newly sized array work.
          */
-       ITERATE_RDEV(mddev, rdev, rtmp)
+       rdev_for_each(rdev, rtmp, mddev)
                 if (rdev->raid_disk < 0 &&
                     !test_bit(Faulty, &rdev->flags)) {
                         if (raid5_add_disk(mddev, rdev)) {