siimage: DRAC4 note

[safe/jmp/linux-2.6] / drivers / md / raid5.c
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index 6d3a2a0..d247429 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -18,8 +18,31 @@
   * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
   */
  
+/*
+ * BITMAP UNPLUGGING:
+ *
+ * The sequencing for updating the bitmap reliably is a little
+ * subtle (and I got it wrong the first time) so it deserves some
+ * explanation.
+ *
+ * We group bitmap updates into batches.  Each batch has a number.
+ * We may write out several batches at once, but that isn't very important.
+ * conf->seq_write is the number of the last batch successfully written.
+ * conf->seq_flush is the number of the last batch that was closed to
+ *    new additions.
+ * When we discover that we will need to write to any block in a stripe
+ * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
+ * the number of the batch it will be in. This is seq_flush+1.
+ * When we are ready to do a write, if that batch hasn't been written yet,
+ *   we plug the array and queue the stripe for later.
+ * When an unplug happens, we increment seq_flush, thus closing the current
+ *   batch.
+ * When we notice that seq_flush > seq_write, we write out all pending updates
+ * to the bitmap, and advance seq_write to where seq_flush was.
+ * This may occasionally write a bit out twice, but is sure never to
+ * miss any bits.
+ */
  
-#include <linux/config.h>
  #include <linux/module.h>
  #include <linux/slab.h>
  #include <linux/highmem.h>
@@ -89,12 +112,14 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
                 BUG_ON(!list_empty(&sh->lru));
                 BUG_ON(atomic_read(&conf->active_stripes)==0);
                 if (test_bit(STRIPE_HANDLE, &sh->state)) {
-                       if (test_bit(STRIPE_DELAYED, &sh->state))
+                       if (test_bit(STRIPE_DELAYED, &sh->state)) {
                                 list_add_tail(&sh->lru, &conf->delayed_list);
-                       else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-                                conf->seq_write == sh->bm_seq)
+                               blk_plug_device(conf->mddev->queue);
+                       } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+                                  sh->bm_seq - conf->seq_write > 0) {
                                 list_add_tail(&sh->lru, &conf->bitmap_list);
-                       else {
+                               blk_plug_device(conf->mddev->queue);
+                       } else {
                                 clear_bit(STRIPE_BIT_DELAY, &sh->state);
                                 list_add_tail(&sh->lru, &conf->handle_list);
                         }
@@ -109,6 +134,8 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
                         if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
                                 list_add_tail(&sh->lru, &conf->inactive_list);
                                 wake_up(&conf->wait_for_stripe);
+                               if (conf->retry_read_aligned)
+                                       md_wakeup_thread(conf->mddev->thread);
                         }
                 }
         }
@@ -271,7 +298,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
                                                      < (conf->max_nr_stripes *3/4)
                                                      || !conf->inactive_blocked),
                                                     conf->device_lock,
-                                                   unplug_slaves(conf->mddev)
+                                                   raid5_unplug_device(conf->mddev->queue)
                                         );
                                 conf->inactive_blocked = 0;
                         } else
@@ -282,7 +309,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
                         } else {
                                 if (!test_bit(STRIPE_HANDLE, &sh->state))
                                         atomic_inc(&conf->active_stripes);
-                               if (list_empty(&sh->lru))
+                               if (list_empty(&sh->lru) &&
+                                   !test_bit(STRIPE_EXPANDING, &sh->state))
                                         BUG();
                                 list_del_init(&sh->lru);
                         }
@@ -322,7 +350,7 @@ static int grow_one_stripe(raid5_conf_t *conf)
  
  static int grow_stripes(raid5_conf_t *conf, int num)
  {
-       kmem_cache_t *sc;
+       struct kmem_cache *sc;
         int devs = conf->raid_disks;
  
         sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
@@ -371,12 +399,14 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
         LIST_HEAD(newstripes);
         struct disk_info *ndisks;
         int err = 0;
-       kmem_cache_t *sc;
+       struct kmem_cache *sc;
         int i;
  
         if (newsize <= conf->pool_size)
                 return 0; /* never bother to shrink */
  
+       md_allow_write(conf->mddev);
+
         /* Step 1 */
         sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
                                sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
@@ -497,6 +527,8 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
         raid5_conf_t *conf = sh->raid_conf;
         int disks = sh->disks, i;
         int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+       char b[BDEVNAME_SIZE];
+       mdk_rdev_t *rdev;
  
         if (bi->bi_size)
                 return 1;
@@ -514,55 +546,41 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
         }
  
         if (uptodate) {
-#if 0
-               struct bio *bio;
-               unsigned long flags;
-               spin_lock_irqsave(&conf->device_lock, flags);
-               /* we can return a buffer if we bypassed the cache or
-                * if the top buffer is not in highmem.  If there are
-                * multiple buffers, leave the extra work to
-                * handle_stripe
-                */
-               buffer = sh->bh_read[i];
-               if (buffer &&
-                   (!PageHighMem(buffer->b_page)
-                    || buffer->b_page == bh->b_page )
-                       ) {
-                       sh->bh_read[i] = buffer->b_reqnext;
-                       buffer->b_reqnext = NULL;
-               } else
-                       buffer = NULL;
-               spin_unlock_irqrestore(&conf->device_lock, flags);
-               if (sh->bh_page[i]==bh->b_page)
-                       set_buffer_uptodate(bh);
-               if (buffer) {
-                       if (buffer->b_page != bh->b_page)
-                               memcpy(buffer->b_data, bh->b_data, bh->b_size);
-                       buffer->b_end_io(buffer, 1);
-               }
-#else
                 set_bit(R5_UPTODATE, &sh->dev[i].flags);
-#endif
                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-                       printk(KERN_INFO "raid5: read error corrected!!\n");
+                       rdev = conf->disks[i].rdev;
+                       printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
+                              mdname(conf->mddev), STRIPE_SECTORS,
+                              (unsigned long long)sh->sector + rdev->data_offset,
+                              bdevname(rdev->bdev, b));
                         clear_bit(R5_ReadError, &sh->dev[i].flags);
                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
                 }
                 if (atomic_read(&conf->disks[i].rdev->read_errors))
                         atomic_set(&conf->disks[i].rdev->read_errors, 0);
         } else {
+               const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
                 int retry = 0;
+               rdev = conf->disks[i].rdev;
+
                 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
-               atomic_inc(&conf->disks[i].rdev->read_errors);
+               atomic_inc(&rdev->read_errors);
                 if (conf->mddev->degraded)
-                       printk(KERN_WARNING "raid5: read error not correctable.\n");
+                       printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
+                              mdname(conf->mddev),
+                              (unsigned long long)sh->sector + rdev->data_offset,
+                              bdn);
                 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
                         /* Oh, no!!! */
-                       printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
-               else if (atomic_read(&conf->disks[i].rdev->read_errors)
+                       printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
+                              mdname(conf->mddev),
+                              (unsigned long long)sh->sector + rdev->data_offset,
+                              bdn);
+               else if (atomic_read(&rdev->read_errors)
                          > conf->max_nr_stripes)
                         printk(KERN_WARNING
-                              "raid5: Too many read errors, failing device.\n");
+                              "raid5:%s: Too many read errors, failing device %s.\n",
+                              mdname(conf->mddev), bdn);
                 else
                         retry = 1;
                 if (retry)
@@ -570,18 +588,10 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
                 else {
                         clear_bit(R5_ReadError, &sh->dev[i].flags);
                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
-                       md_error(conf->mddev, conf->disks[i].rdev);
+                       md_error(conf->mddev, rdev);
                 }
         }
         rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-#if 0
-       /* must restore b_page before unlocking buffer... */
-       if (sh->bh_page[i] != bh->b_page) {
-               bh->b_page = sh->bh_page[i];
-               bh->b_data = page_address(bh->b_page);
-               clear_buffer_uptodate(bh);
-       }
-#endif
         clear_bit(R5_LOCKED, &sh->dev[i].flags);
         set_bit(STRIPE_HANDLE, &sh->state);
         release_stripe(sh);
@@ -594,7 +604,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
         struct stripe_head *sh = bi->bi_private;
         raid5_conf_t *conf = sh->raid_conf;
         int disks = sh->disks, i;
-       unsigned long flags;
         int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
  
         if (bi->bi_size)
@@ -612,7 +621,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
                 return 0;
         }
  
-       spin_lock_irqsave(&conf->device_lock, flags);
         if (!uptodate)
                 md_error(conf->mddev, conf->disks[i].rdev);
  
@@ -620,8 +628,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
         
         clear_bit(R5_LOCKED, &sh->dev[i].flags);
         set_bit(STRIPE_HANDLE, &sh->state);
-       __release_stripe(conf, sh);
-       spin_unlock_irqrestore(&conf->device_lock, flags);
+       release_stripe(sh);
         return 0;
  }
  
@@ -654,12 +661,12 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
         PRINTK("raid5: error called\n");
  
         if (!test_bit(Faulty, &rdev->flags)) {
-               mddev->sb_dirty = 1;
-               if (test_bit(In_sync, &rdev->flags)) {
-                       conf->working_disks--;
+               set_bit(MD_CHANGE_DEVS, &mddev->flags);
+               if (test_and_clear_bit(In_sync, &rdev->flags)) {
+                       unsigned long flags;
+                       spin_lock_irqsave(&conf->device_lock, flags);
                         mddev->degraded++;
-                       conf->failed_disks++;
-                       clear_bit(In_sync, &rdev->flags);
+                       spin_unlock_irqrestore(&conf->device_lock, flags);
                         /*
                          * if recovery was running, make sure it aborts.
                          */
@@ -669,7 +676,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
                 printk (KERN_ALERT
                         "raid5: Disk failure on %s, disabling device."
                         " Operation continuing on %d devices\n",
-                       bdevname(rdev->bdev,b), conf->working_disks);
+                       bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
         }
  }
  
@@ -782,7 +789,8 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
  static sector_t compute_blocknr(struct stripe_head *sh, int i)
  {
         raid5_conf_t *conf = sh->raid_conf;
-       int raid_disks = sh->disks, data_disks = raid_disks - 1;
+       int raid_disks = sh->disks;
+       int data_disks = raid_disks - conf->max_degraded;
         sector_t new_sector = sh->sector, check;
         int sectors_per_chunk = conf->chunk_size >> 9;
         sector_t stripe;
@@ -818,7 +826,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
                 }
                 break;
         case 6:
-               data_disks = raid_disks - 2;
                 if (i == raid6_next_disk(sh->pd_idx, raid_disks))
                         return 0; /* It is the Q disk */
                 switch (conf->algorithm) {
@@ -1043,7 +1050,7 @@ static void compute_parity5(struct stripe_head *sh, int method)
  static void compute_parity6(struct stripe_head *sh, int method)
  {
         raid6_conf_t *conf = sh->raid_conf;
-       int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
+       int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
         struct bio *chosen;
         /**** FIX THIS: This could be very bad if disks is close to 256 ****/
         void *ptrs[disks];
@@ -1066,7 +1073,7 @@ static void compute_parity6(struct stripe_head *sh, int method)
                                 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
                                         wake_up(&conf->wait_for_overlap);
  
-                               if (sh->dev[i].written) BUG();
+                               BUG_ON(sh->dev[i].written);
                                 sh->dev[i].written = chosen;
                         }
                 break;
@@ -1124,8 +1131,7 @@ static void compute_parity6(struct stripe_head *sh, int method)
  /* Compute one missing block */
  static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
  {
-       raid6_conf_t *conf = sh->raid_conf;
-       int i, count, disks = conf->raid_disks;
+       int i, count, disks = sh->disks;
         void *ptr[MAX_XOR_BLOCKS], *p;
         int pd_idx = sh->pd_idx;
         int qd_idx = raid6_next_disk(pd_idx, disks);
@@ -1163,8 +1169,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
  /* Compute two missing blocks */
  static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
  {
-       raid6_conf_t *conf = sh->raid_conf;
-       int i, count, disks = conf->raid_disks;
+       int i, count, disks = sh->disks;
         int pd_idx = sh->pd_idx;
         int qd_idx = raid6_next_disk(pd_idx, disks);
         int d0_idx = raid6_next_disk(qd_idx, disks);
@@ -1271,9 +1276,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                 (unsigned long long)sh->sector, dd_idx);
  
         if (conf->mddev->bitmap && firstwrite) {
-               sh->bm_seq = conf->seq_write;
                 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
                                   STRIPE_SECTORS, 0);
+               sh->bm_seq = conf->seq_flush+1;
                 set_bit(STRIPE_BIT_DELAY, &sh->state);
         }
  
@@ -1311,12 +1316,13 @@ static int page_is_zero(struct page *p)
  static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
  {
         int sectors_per_chunk = conf->chunk_size >> 9;
-       sector_t x = stripe;
         int pd_idx, dd_idx;
-       int chunk_offset = sector_div(x, sectors_per_chunk);
-       stripe = x;
-       raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
-                            + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
+       int chunk_offset = sector_div(stripe, sectors_per_chunk);
+
+       raid5_compute_sector(stripe * (disks - conf->max_degraded)
+                            *sectors_per_chunk + chunk_offset,
+                            disks, disks - conf->max_degraded,
+                            &dd_idx, &pd_idx, conf);
         return pd_idx;
  }
  
@@ -1577,15 +1583,6 @@ static void handle_stripe5(struct stripe_head *sh)
                                 } else if (test_bit(R5_Insync, &dev->flags)) {
                                         set_bit(R5_LOCKED, &dev->flags);
                                         set_bit(R5_Wantread, &dev->flags);
-#if 0
-                                       /* if I am just reading this block and we don't have
-                                          a failed drive, or any pending writes then sidestep the cache */
-                                       if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
-                                           ! syncing && !failed && !to_write) {
-                                               sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
-                                               sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
-                                       }
-#endif
                                         locked++;
                                         PRINTK("Reading block %d (sync=%d)\n", 
                                                 i, syncing);
@@ -1603,9 +1600,6 @@ static void handle_stripe5(struct stripe_head *sh)
                         dev = &sh->dev[i];
                         if ((dev->towrite || i == sh->pd_idx) &&
                             (!test_bit(R5_LOCKED, &dev->flags) 
-#if 0
-|| sh->bh_page[i]!=bh->b_page
-#endif
                                     ) &&
                             !test_bit(R5_UPTODATE, &dev->flags)) {
                                 if (test_bit(R5_Insync, &dev->flags)
@@ -1617,9 +1611,6 @@ static void handle_stripe5(struct stripe_head *sh)
                         /* Would I have to read this buffer for reconstruct_write */
                         if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
                             (!test_bit(R5_LOCKED, &dev->flags) 
-#if 0
-|| sh->bh_page[i] != bh->b_page
-#endif
                                     ) &&
                             !test_bit(R5_UPTODATE, &dev->flags)) {
                                 if (test_bit(R5_Insync, &dev->flags)) rcw++;
@@ -1827,23 +1818,25 @@ static void handle_stripe5(struct stripe_head *sh)
                 return_bi = bi->bi_next;
                 bi->bi_next = NULL;
                 bi->bi_size = 0;
-               bi->bi_end_io(bi, bytes, 0);
+               bi->bi_end_io(bi, bytes,
+                             test_bit(BIO_UPTODATE, &bi->bi_flags)
+                               ? 0 : -EIO);
         }
         for (i=disks; i-- ;) {
                 int rw;
                 struct bio *bi;
                 mdk_rdev_t *rdev;
                 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-                       rw = 1;
+                       rw = WRITE;
                 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-                       rw = 0;
+                       rw = READ;
                 else
                         continue;
   
                 bi = &sh->dev[i].req;
   
                 bi->bi_rw = rw;
-               if (rw)
+               if (rw == WRITE)
                         bi->bi_end_io = raid5_end_write_request;
                 else
                         bi->bi_end_io = raid5_end_read_request;
@@ -1879,7 +1872,7 @@ static void handle_stripe5(struct stripe_head *sh)
                                 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
                         generic_make_request(bi);
                 } else {
-                       if (rw == 1)
+                       if (rw == WRITE)
                                 set_bit(STRIPE_DEGRADED, &sh->state);
                         PRINTK("skip op %ld on disc %d for sector %llu\n",
                                 bi->bi_rw, i, (unsigned long long)sh->sector);
@@ -1892,11 +1885,11 @@ static void handle_stripe5(struct stripe_head *sh)
  static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
  {
         raid6_conf_t *conf = sh->raid_conf;
-       int disks = conf->raid_disks;
+       int disks = sh->disks;
         struct bio *return_bi= NULL;
         struct bio *bi;
         int i;
-       int syncing;
+       int syncing, expanding, expanded;
         int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
         int non_overwrite = 0;
         int failed_num[2] = {0, 0};
@@ -1914,6 +1907,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
         clear_bit(STRIPE_DELAYED, &sh->state);
  
         syncing = test_bit(STRIPE_SYNCING, &sh->state);
+       expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+       expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
         /* Now to look around and see what can be done */
  
         rcu_read_lock();
@@ -2119,13 +2114,15 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
          * parity, or to satisfy requests
          * or to load a block that is being partially written.
          */
-       if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
+       if (to_read || non_overwrite || (to_write && failed) ||
+           (syncing && (uptodate < disks)) || expanding) {
                 for (i=disks; i--;) {
                         dev = &sh->dev[i];
                         if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
                             (dev->toread ||
                              (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
                              syncing ||
+                            expanding ||
                              (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
                              (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
                                     )
@@ -2155,15 +2152,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                                 } else if (test_bit(R5_Insync, &dev->flags)) {
                                         set_bit(R5_LOCKED, &dev->flags);
                                         set_bit(R5_Wantread, &dev->flags);
-#if 0
-                                       /* if I am just reading this block and we don't have
-                                          a failed drive, or any pending writes then sidestep the cache */
-                                       if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
-                                           ! syncing && !failed && !to_write) {
-                                               sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
-                                               sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
-                                       }
-#endif
                                         locked++;
                                         PRINTK("Reading block %d (sync=%d)\n",
                                                 i, syncing);
@@ -2182,9 +2170,6 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                         if (!test_bit(R5_OVERWRITE, &dev->flags)
                             && i != pd_idx && i != qd_idx
                             && (!test_bit(R5_LOCKED, &dev->flags)
-#if 0
-                               || sh->bh_page[i] != bh->b_page
-#endif
                                     ) &&
                             !test_bit(R5_UPTODATE, &dev->flags)) {
                                 if (test_bit(R5_Insync, &dev->flags)) rcw++;
@@ -2372,6 +2357,79 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                                 }
                         }
                 }
+
+       if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+               /* Need to write out all blocks after computing P&Q */
+               sh->disks = conf->raid_disks;
+               sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
+                                            conf->raid_disks);
+               compute_parity6(sh, RECONSTRUCT_WRITE);
+               for (i = conf->raid_disks ; i-- ;  ) {
+                       set_bit(R5_LOCKED, &sh->dev[i].flags);
+                       locked++;
+                       set_bit(R5_Wantwrite, &sh->dev[i].flags);
+               }
+               clear_bit(STRIPE_EXPANDING, &sh->state);
+       } else if (expanded) {
+               clear_bit(STRIPE_EXPAND_READY, &sh->state);
+               atomic_dec(&conf->reshape_stripes);
+               wake_up(&conf->wait_for_overlap);
+               md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+       }
+
+       if (expanding && locked == 0) {
+               /* We have read all the blocks in this stripe and now we need to
+                * copy some of them into a target stripe for expand.
+                */
+               clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+               for (i = 0; i < sh->disks ; i++)
+                       if (i != pd_idx && i != qd_idx) {
+                               int dd_idx2, pd_idx2, j;
+                               struct stripe_head *sh2;
+
+                               sector_t bn = compute_blocknr(sh, i);
+                               sector_t s = raid5_compute_sector(
+                                       bn, conf->raid_disks,
+                                       conf->raid_disks - conf->max_degraded,
+                                       &dd_idx2, &pd_idx2, conf);
+                               sh2 = get_active_stripe(conf, s,
+                                                       conf->raid_disks,
+                                                      pd_idx2, 1);
+                               if (sh2 == NULL)
+                                       /* so far only the early blocks of
+                                        * this stripe have been requested.
+                                        * When later blocks get requested, we
+                                        * will try again
+                                        */
+                                       continue;
+                               if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
+                                   test_bit(R5_Expanded,
+                                            &sh2->dev[dd_idx2].flags)) {
+                                       /* must have already done this block */
+                                       release_stripe(sh2);
+                                       continue;
+                               }
+                               memcpy(page_address(sh2->dev[dd_idx2].page),
+                                      page_address(sh->dev[i].page),
+                                      STRIPE_SIZE);
+                               set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags);
+                               set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags);
+                               for (j = 0 ; j < conf->raid_disks ; j++)
+                                       if (j != sh2->pd_idx &&
+                                           j != raid6_next_disk(sh2->pd_idx,
+                                                          sh2->disks) &&
+                                           !test_bit(R5_Expanded,
+                                                     &sh2->dev[j].flags))
+                                               break;
+                               if (j == conf->raid_disks) {
+                                       set_bit(STRIPE_EXPAND_READY,
+                                               &sh2->state);
+                                       set_bit(STRIPE_HANDLE, &sh2->state);
+                               }
+                               release_stripe(sh2);
+                       }
+       }
+
         spin_unlock(&sh->lock);
  
         while ((bi=return_bi)) {
@@ -2380,23 +2438,25 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                 return_bi = bi->bi_next;
                 bi->bi_next = NULL;
                 bi->bi_size = 0;
-               bi->bi_end_io(bi, bytes, 0);
+               bi->bi_end_io(bi, bytes,
+                             test_bit(BIO_UPTODATE, &bi->bi_flags)
+                               ? 0 : -EIO);
         }
         for (i=disks; i-- ;) {
                 int rw;
                 struct bio *bi;
                 mdk_rdev_t *rdev;
                 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-                       rw = 1;
+                       rw = WRITE;
                 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-                       rw = 0;
+                       rw = READ;
                 else
                         continue;
  
                 bi = &sh->dev[i].req;
  
                 bi->bi_rw = rw;
-               if (rw)
+               if (rw == WRITE)
                         bi->bi_end_io = raid5_end_write_request;
                 else
                         bi->bi_end_io = raid5_end_read_request;
@@ -2410,7 +2470,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                 rcu_read_unlock();
  
                 if (rdev) {
-                       if (syncing)
+                       if (syncing || expanding || expanded)
                                 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
  
                         bi->bi_bdev = rdev->bdev;
@@ -2432,7 +2492,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                                 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
                         generic_make_request(bi);
                 } else {
-                       if (rw == 1)
+                       if (rw == WRITE)
                                 set_bit(STRIPE_DEGRADED, &sh->state);
                         PRINTK("skip op %ld on disc %d for sector %llu\n",
                                 bi->bi_rw, i, (unsigned long long)sh->sector);
@@ -2555,13 +2615,226 @@ static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
         return ret;
  }
  
-static inline void raid5_plug_device(raid5_conf_t *conf)
+static int raid5_congested(void *data, int bits)
  {
-       spin_lock_irq(&conf->device_lock);
-       blk_plug_device(conf->mddev->queue);
-       spin_unlock_irq(&conf->device_lock);
+       mddev_t *mddev = data;
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+
+       /* No difference between reads and writes ('bits' is unused).
+        * Report congestion when the stripe_cache is busy: inactive_blocked
+        * or quiesce is set, or inactive_list is empty. */
+       if (conf->inactive_blocked)
+               return 1;
+       if (conf->quiesce)
+               return 1;
+       if (list_empty_careful(&conf->inactive_list))
+               return 1;
+
+       return 0;
+}
+
+/* Reads should stay within one chunk where possible: they are limited to
+ * the bytes left in the chunk (an empty bio still takes a whole bvec).
+ * Writes don't need to align and can always merge. */
+static int raid5_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec)
+{
+       mddev_t *mddev = q->queuedata;
+       sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+       int max;
+       unsigned int chunk_sectors = mddev->chunk_size >> 9;
+       unsigned int bio_sectors = bio->bi_size >> 9;
+
+       if (bio_data_dir(bio) == WRITE)
+               return biovec->bv_len; /* always allow writes to be mergeable */
+
+       max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
+       if (max < 0) max = 0;
+       if (max <= biovec->bv_len && bio_sectors == 0)
+               return biovec->bv_len;
+       else
+               return max;
+}
+
+/* Non-zero iff 'bio' fits within a single chunk of the array. */
+static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
+{
+       sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+       unsigned int chunk_sectors = mddev->chunk_size >> 9;
+       unsigned int bio_sectors = bio->bi_size >> 9;
+
+       return  chunk_sectors >=
+               ((sector & (chunk_sectors - 1)) + bio_sectors);
  }
  
+/*
+ *  Push 'bi' onto the head of the retry LIFO in O(1) (we are in interrupt)
+ *  and wake the md thread (raid5d), which picks the bios up later.
+ */
+static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&conf->device_lock, flags);
+
+       bi->bi_next = conf->retry_read_aligned_list;
+       conf->retry_read_aligned_list = bi;
+
+       spin_unlock_irqrestore(&conf->device_lock, flags);
+       md_wakeup_thread(conf->mddev->thread);
+}
+
+/* Next bio for raid5d to retry: a partly-done one first, else pop the LIFO */
+static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
+{
+       struct bio *bi;
+
+       bi = conf->retry_read_aligned;
+       if (bi) {
+               conf->retry_read_aligned = NULL;
+               return bi;
+       }
+       bi = conf->retry_read_aligned_list;
+       if(bi) {
+               conf->retry_read_aligned_list = bi->bi_next;
+               bi->bi_next = NULL;
+               bi->bi_phys_segments = 1; /* biased count of active stripes */
+               bi->bi_hw_segments = 0; /* count of processed stripes */
+       }
+
+       return bi;
+}
+
+
+/*
+ *  Completion handler for the cloned bio sent by chunk_aligned_read().
+ *  If the read succeeded, call bio_endio on the original bio (having
+ *  bio_put the clone first).
+ *  If the read failed, queue the original bio for a retry by raid5d.
+ */
+static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
+{
+       struct bio* raid_bi  = bi->bi_private;
+       mddev_t *mddev;
+       raid5_conf_t *conf;
+       int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+       mdk_rdev_t *rdev;
+
+       if (bi->bi_size)
+               return 1;
+       bio_put(bi);
+
+       mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
+       conf = mddev_to_conf(mddev);
+       rdev = (void*)raid_bi->bi_next; /* stashed by chunk_aligned_read() */
+       raid_bi->bi_next = NULL;
+
+       rdev_dec_pending(rdev, conf->mddev);
+
+       if (!error && uptodate) {
+               bio_endio(raid_bi, bytes, 0);
+               if (atomic_dec_and_test(&conf->active_aligned_reads))
+                       wake_up(&conf->wait_for_stripe);
+               return 0;
+       }
+
+
+       PRINTK("raid5_align_endio : io error...handing IO for a retry\n");
+
+       add_bio_to_retry(raid_bi, conf);
+       return 0;
+}
+
+static int bio_fits_rdev(struct bio *bi)
+{
+       request_queue_t *q = bdev_get_queue(bi->bi_bdev);
+
+       if ((bi->bi_size>>9) > q->max_sectors)
+               return 0;
+       blk_recount_segments(q, bi);
+       if (bi->bi_phys_segments > q->max_phys_segments ||
+           bi->bi_hw_segments > q->max_hw_segments)
+               return 0;
+
+       if (q->merge_bvec_fn)
+               /* it's too hard to apply the merge_bvec_fn at this stage,
+                * so just give up
+                */
+               return 0;
+
+       return 1;
+}
+
+/* Send an in-chunk read straight to its member disk; 1 if sent, else 0. */
+static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
+{
+       mddev_t *mddev = q->queuedata;
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+       const unsigned int raid_disks = conf->raid_disks;
+       const unsigned int data_disks = raid_disks - conf->max_degraded;
+       unsigned int dd_idx, pd_idx;
+       struct bio* align_bi;
+       mdk_rdev_t *rdev;
+
+       if (!in_chunk_boundary(mddev, raid_bio)) {
+               PRINTK("chunk_aligned_read : non aligned\n");
+               return 0;
+       }
+       /*
+        * use bio_clone to make a copy of the bio
+        */
+       align_bi = bio_clone(raid_bio, GFP_NOIO);
+       if (!align_bi)
+               return 0;
+       /*
+        *   set bi_end_io to a new function, and set bi_private to the
+        *     original bio.
+        */
+       align_bi->bi_end_io  = raid5_align_endio;
+       align_bi->bi_private = raid_bio;
+       /*
+        *      compute position: the target sector and the disk index dd_idx
+        */
+       align_bi->bi_sector =  raid5_compute_sector(raid_bio->bi_sector,
+                                       raid_disks,
+                                       data_disks,
+                                       &dd_idx,
+                                       &pd_idx,
+                                       conf);
+
+       rcu_read_lock();
+       rdev = rcu_dereference(conf->disks[dd_idx].rdev);
+       if (rdev && test_bit(In_sync, &rdev->flags)) {
+               atomic_inc(&rdev->nr_pending);
+               rcu_read_unlock();
+               raid_bio->bi_next = (void*)rdev; /* for raid5_align_endio */
+               align_bi->bi_bdev =  rdev->bdev;
+               align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
+               align_bi->bi_sector += rdev->data_offset;
+
+               if (!bio_fits_rdev(align_bi)) {
+                       /* too big in some way */
+                       bio_put(align_bi);
+                       rdev_dec_pending(rdev, mddev);
+                       return 0;
+               }
+
+               spin_lock_irq(&conf->device_lock);
+               wait_event_lock_irq(conf->wait_for_stripe,
+                                   conf->quiesce == 0,
+                                   conf->device_lock, /* nothing */);
+               atomic_inc(&conf->active_aligned_reads);
+               spin_unlock_irq(&conf->device_lock);
+
+               generic_make_request(align_bi);
+               return 1;
+       } else {
+               rcu_read_unlock();
+               bio_put(align_bi);
+               return 0;
+       }
+}
+
+
  static int make_request(request_queue_t *q, struct bio * bi)
  {
         mddev_t *mddev = q->queuedata;
@@ -2583,6 +2856,11 @@ static int make_request(request_queue_t *q, struct bio * bi)
         disk_stat_inc(mddev->gendisk, ios[rw]);
         disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
  
+       if (rw == READ &&
+            mddev->reshape_position == MaxSector &&
+            chunk_aligned_read(q,bi))
+               return 0;
+
         logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
         last_sector = bi->bi_sector + (bi->bi_size>>9);
         bi->bi_next = NULL;
@@ -2671,7 +2949,6 @@ static int make_request(request_queue_t *q, struct bio * bi)
                                 goto retry;
                         }
                         finish_wait(&conf->wait_for_overlap, &w);
-                       raid5_plug_device(conf);
                         handle_stripe(sh, NULL);
                         release_stripe(sh);
                 } else {
@@ -2691,7 +2968,9 @@ static int make_request(request_queue_t *q, struct bio * bi)
                 if ( rw == WRITE )
                         md_write_end(mddev);
                 bi->bi_size = 0;
-               bi->bi_end_io(bi, bytes, 0);
+               bi->bi_end_io(bi, bytes,
+                             test_bit(BIO_UPTODATE, &bi->bi_flags)
+                               ? 0 : -EIO);
         }
         return 0;
  }
@@ -2711,8 +2990,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
         struct stripe_head *sh;
         int pd_idx;
         sector_t first_sector, last_sector;
-       int raid_disks;
-       int data_disks;
+       int raid_disks = conf->previous_raid_disks;
+       int data_disks = raid_disks - conf->max_degraded;
+       int new_data_disks = conf->raid_disks - conf->max_degraded;
         int i;
         int dd_idx;
         sector_t writepos, safepos, gap;
@@ -2721,7 +3001,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
             conf->expand_progress != 0) {
                 /* restarting in the middle, skip the initial sectors */
                 sector_nr = conf->expand_progress;
-               sector_div(sector_nr, conf->raid_disks-1);
+               sector_div(sector_nr, new_data_disks);
                 *skipped = 1;
                 return sector_nr;
         }
@@ -2735,21 +3015,21 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
          * to after where expand_lo old_maps to
          */
         writepos = conf->expand_progress +
-               conf->chunk_size/512*(conf->raid_disks-1);
-       sector_div(writepos, conf->raid_disks-1);
+               conf->chunk_size/512*(new_data_disks);
+       sector_div(writepos, new_data_disks);
         safepos = conf->expand_lo;
-       sector_div(safepos, conf->previous_raid_disks-1);
+       sector_div(safepos, data_disks);
         gap = conf->expand_progress - conf->expand_lo;
  
         if (writepos >= safepos ||
-           gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
+           gap > (new_data_disks)*3000*2 /*3Meg*/) {
                 /* Cannot proceed until we've updated the superblock... */
                 wait_event(conf->wait_for_overlap,
                            atomic_read(&conf->reshape_stripes)==0);
                 mddev->reshape_position = conf->expand_progress;
-               mddev->sb_dirty = 1;
+               set_bit(MD_CHANGE_DEVS, &mddev->flags);
                 md_wakeup_thread(mddev->thread);
-               wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
+               wait_event(mddev->sb_wait, mddev->flags == 0 ||
                            kthread_should_stop());
                 spin_lock_irq(&conf->device_lock);
                 conf->expand_lo = mddev->reshape_position;
@@ -2772,6 +3052,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                         sector_t s;
                         if (j == sh->pd_idx)
                                 continue;
+                       if (conf->level == 6 &&
+                           j == raid6_next_disk(sh->pd_idx, sh->disks))
+                               continue;
                         s = compute_blocknr(sh, j);
                         if (s < (mddev->array_size<<1)) {
                                 skipped = 1;
@@ -2795,21 +3078,20 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
          * The source stripes are determined by mapping the first and last
          * block on the destination stripes.
          */
-       raid_disks = conf->previous_raid_disks;
-       data_disks = raid_disks - 1;
         first_sector =
-               raid5_compute_sector(sector_nr*(conf->raid_disks-1),
+               raid5_compute_sector(sector_nr*(new_data_disks),
                                      raid_disks, data_disks,
                                      &dd_idx, &pd_idx, conf);
         last_sector =
                 raid5_compute_sector((sector_nr+conf->chunk_size/512)
-                                    *(conf->raid_disks-1) -1,
+                                    *(new_data_disks) -1,
                                      raid_disks, data_disks,
                                      &dd_idx, &pd_idx, conf);
         if (last_sector >= (mddev->size<<1))
                 last_sector = (mddev->size<<1)-1;
         while (first_sector <= last_sector) {
-               pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
+               pd_idx = stripe_to_pdidx(first_sector, conf,
+                                        conf->previous_raid_disks);
                 sh = get_active_stripe(conf, first_sector,
                                        conf->previous_raid_disks, pd_idx, 0);
                 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
@@ -2827,7 +3109,6 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
         struct stripe_head *sh;
         int pd_idx;
         int raid_disks = conf->raid_disks;
-       int data_disks = raid_disks - conf->max_degraded;
         sector_t max_sector = mddev->size << 1;
         int sync_blocks;
         int still_degraded = 0;
@@ -2858,7 +3139,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
          * to resync, then assert that we are finished, because there is
          * nothing we can do.
          */
-       if (mddev->degraded >= (data_disks - raid_disks) &&
+       if (mddev->degraded >= conf->max_degraded &&
             test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
                 sector_t rv = (mddev->size << 1) - sector_nr;
                 *skipped = 1;
@@ -2903,6 +3184,82 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
         return STRIPE_SECTORS;
  }
  
+static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
+{
+       /* We may not be able to submit a whole bio at once as there
+        * may not be enough stripe_heads available.
+        * We cannot pre-allocate enough stripe_heads as we may need
+        * more than exist in the cache (if we ever allow large chunks).
+        * So we do one stripe head at a time and record in
+        * ->bi_hw_segments how many have been done.
+        * Returns the number of stripes handled by this call.
+        * We *know* that this entire raid_bio is in one chunk, so there is
+        * only one 'dd_idx' and one call to raid5_compute_sector suffices.
+        */
+       struct stripe_head *sh;
+       int dd_idx, pd_idx;
+       sector_t sector, logical_sector, last_sector;
+       int scnt = 0;
+       int remaining;
+       int handled = 0;
+
+       logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+       sector = raid5_compute_sector(  logical_sector,
+                                       conf->raid_disks,
+                                       conf->raid_disks - conf->max_degraded,
+                                       &dd_idx,
+                                       &pd_idx,
+                                       conf);
+       last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+
+       for (; logical_sector < last_sector;
+            logical_sector += STRIPE_SECTORS,
+                    sector += STRIPE_SECTORS,
+                    scnt++) {
+
+               if (scnt < raid_bio->bi_hw_segments)
+                       /* already done this stripe */
+                       continue;
+
+               sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
+
+               if (!sh) {
+                       /* failed to get a stripe - must wait */
+                       raid_bio->bi_hw_segments = scnt;
+                       conf->retry_read_aligned = raid_bio;
+                       return handled;
+               }
+
+               set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
+               if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
+                       release_stripe(sh);
+                       raid_bio->bi_hw_segments = scnt;
+                       conf->retry_read_aligned = raid_bio;
+                       return handled;
+               }
+
+               handle_stripe(sh, NULL);
+               release_stripe(sh);
+               handled++;
+       }
+       spin_lock_irq(&conf->device_lock);
+       remaining = --raid_bio->bi_phys_segments;
+       spin_unlock_irq(&conf->device_lock);
+       if (remaining == 0) {
+               int bytes = raid_bio->bi_size;
+
+               raid_bio->bi_size = 0;
+               raid_bio->bi_end_io(raid_bio, bytes,
+                             test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
+                               ? 0 : -EIO);
+       }
+       if (atomic_dec_and_test(&conf->active_aligned_reads))
+               wake_up(&conf->wait_for_stripe);
+       return handled;
+}
+
+
+
  /*
   * This is our raid5 kernel thread.
   *
@@ -2924,8 +3281,9 @@ static void raid5d (mddev_t *mddev)
         spin_lock_irq(&conf->device_lock);
         while (1) {
                 struct list_head *first;
+               struct bio *bio;
  
-               if (conf->seq_flush - conf->seq_write > 0) {
+               if (conf->seq_flush != conf->seq_write) {
                         int seq = conf->seq_flush;
                         spin_unlock_irq(&conf->device_lock);
                         bitmap_unplug(mddev->bitmap);
@@ -2940,6 +3298,16 @@ static void raid5d (mddev_t *mddev)
                     !list_empty(&conf->delayed_list))
                         raid5_activate_delayed(conf);
  
+               while ((bio = remove_bio_from_retry(conf))) {
+                       int ok;
+                       spin_unlock_irq(&conf->device_lock);
+                       ok = retry_aligned_read(conf, bio);
+                       spin_lock_irq(&conf->device_lock);
+                       if (!ok)
+                               break;
+                       handled++;
+               }
+
                 if (list_empty(&conf->handle_list))
                         break;
  
@@ -2998,6 +3366,7 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
                 else
                         break;
         }
+       md_allow_write(mddev);
         while (new > conf->max_nr_stripes) {
                 if (grow_one_stripe(conf))
                         conf->max_nr_stripes++;
@@ -3041,6 +3410,7 @@ static int run(mddev_t *mddev)
         mdk_rdev_t *rdev;
         struct disk_info *disk;
         struct list_head *tmp;
+       int working_disks = 0;
  
         if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
                 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
@@ -3056,35 +3426,44 @@ static int run(mddev_t *mddev)
                  */
                 sector_t here_new, here_old;
                 int old_disks;
+               int max_degraded = (mddev->level == 5 ? 1 : 2);
  
                 if (mddev->new_level != mddev->level ||
                     mddev->new_layout != mddev->layout ||
                     mddev->new_chunk != mddev->chunk_size) {
-                       printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n",
+                       printk(KERN_ERR "raid5: %s: unsupported reshape "
+                              "required - aborting.\n",
                                mdname(mddev));
                         return -EINVAL;
                 }
                 if (mddev->delta_disks <= 0) {
-                       printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n",
+                       printk(KERN_ERR "raid5: %s: unsupported reshape "
+                              "(reduce disks) required - aborting.\n",
                                mdname(mddev));
                         return -EINVAL;
                 }
                 old_disks = mddev->raid_disks - mddev->delta_disks;
                 /* reshape_position must be on a new-stripe boundary, and one
-                * further up in new geometry must map after here in old geometry.
+                * further up in new geometry must map after here in old
+                * geometry.
                  */
                 here_new = mddev->reshape_position;
-               if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) {
-                       printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n");
+               if (sector_div(here_new, (mddev->chunk_size>>9)*
+                              (mddev->raid_disks - max_degraded))) {
+                       printk(KERN_ERR "raid5: reshape_position not "
+                              "on a stripe boundary\n");
                         return -EINVAL;
                 }
                 /* here_new is the stripe we will write to */
                 here_old = mddev->reshape_position;
-               sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1));
-               /* here_old is the first stripe that we might need to read from */
+               sector_div(here_old, (mddev->chunk_size>>9)*
+                          (old_disks-max_degraded));
+               /* here_old is the first stripe that we might need to read
+                * from */
                 if (here_new >= here_old) {
                         /* Reading from the same stripe as writing to - bad */
-                       printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n");
+                       printk(KERN_ERR "raid5: reshape_position too early for "
+                              "auto-recovery - aborting.\n");
                         return -EINVAL;
                 }
                 printk(KERN_INFO "raid5: reshape will continue\n");
@@ -3126,6 +3505,7 @@ static int run(mddev_t *mddev)
         INIT_LIST_HEAD(&conf->inactive_list);
         atomic_set(&conf->active_stripes, 0);
         atomic_set(&conf->preread_active_stripes, 0);
+       atomic_set(&conf->active_aligned_reads, 0);
  
         PRINTK("raid5: run(%s) called.\n", mdname(mddev));
  
@@ -3143,14 +3523,14 @@ static int run(mddev_t *mddev)
                         printk(KERN_INFO "raid5: device %s operational as raid"
                                 " disk %d\n", bdevname(rdev->bdev,b),
                                 raid_disk);
-                       conf->working_disks++;
+                       working_disks++;
                 }
         }
  
         /*
          * 0 for a fully functional array, 1 or 2 for a degraded array.
          */
-       mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
+       mddev->degraded = conf->raid_disks - working_disks;
         conf->mddev = mddev;
         conf->chunk_size = mddev->chunk_size;
         conf->level = mddev->level;
@@ -3185,7 +3565,7 @@ static int run(mddev_t *mddev)
         if (mddev->degraded > conf->max_degraded) {
                 printk(KERN_ERR "raid5: not enough operational devices for %s"
                         " (%d/%d failed)\n",
-                       mdname(mddev), conf->failed_disks, conf->raid_disks);
+                       mdname(mddev), mddev->degraded, conf->raid_disks);
                 goto abort;
         }
  
@@ -3248,9 +3628,6 @@ static int run(mddev_t *mddev)
                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
                 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
                                                         "%s_reshape");
-               /* FIXME if md_register_thread fails?? */
-               md_wakeup_thread(mddev->sync_thread);
-
         }
  
         /* read-ahead size must cover two whole stripes, which is
@@ -3269,9 +3646,14 @@ static int run(mddev_t *mddev)
  
         mddev->queue->unplug_fn = raid5_unplug_device;
         mddev->queue->issue_flush_fn = raid5_issue_flush;
+       mddev->queue->backing_dev_info.congested_fn = raid5_congested;
+       mddev->queue->backing_dev_info.congested_data = mddev;
+
         mddev->array_size =  mddev->size * (conf->previous_raid_disks -
                                             conf->max_degraded);
  
+       blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
+
         return 0;
  abort:
         if (conf) {
@@ -3345,7 +3727,7 @@ static void status (struct seq_file *seq, mddev_t *mddev)
         int i;
  
         seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
-       seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
+       seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
         for (i = 0; i < conf->raid_disks; i++)
                 seq_printf (seq, "%s",
                                conf->disks[i].rdev &&
@@ -3367,8 +3749,8 @@ static void print_raid5_conf (raid5_conf_t *conf)
                 printk("(conf==NULL)\n");
                 return;
         }
-       printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
-                conf->working_disks, conf->failed_disks);
+       printk(" --- rd:%d wd:%d\n", conf->raid_disks,
+                conf->raid_disks - conf->mddev->degraded);
  
         for (i = 0; i < conf->raid_disks; i++) {
                 char b[BDEVNAME_SIZE];
@@ -3390,11 +3772,11 @@ static int raid5_spare_active(mddev_t *mddev)
                 tmp = conf->disks + i;
                 if (tmp->rdev
                     && !test_bit(Faulty, &tmp->rdev->flags)
-                   && !test_bit(In_sync, &tmp->rdev->flags)) {
+                   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+                       unsigned long flags;
+                       spin_lock_irqsave(&conf->device_lock, flags);
                         mddev->degraded--;
-                       conf->failed_disks--;
-                       conf->working_disks++;
-                       set_bit(In_sync, &tmp->rdev->flags);
+                       spin_unlock_irqrestore(&conf->device_lock, flags);
                 }
         }
         print_raid5_conf(conf);
@@ -3519,6 +3901,8 @@ static int raid5_check_reshape(mddev_t *mddev)
         if (err)
                 return err;
  
+       if (mddev->degraded > conf->max_degraded)
+               return -EINVAL;
         /* looks like we might be able to manage this */
         return 0;
  }
@@ -3530,9 +3914,9 @@ static int raid5_start_reshape(mddev_t *mddev)
         struct list_head *rtmp;
         int spares = 0;
         int added_devices = 0;
+       unsigned long flags;
  
-       if (mddev->degraded ||
-           test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+       if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
                 return -EBUSY;
  
         ITERATE_RDEV(mddev, rdev, rtmp)
@@ -3540,7 +3924,7 @@ static int raid5_start_reshape(mddev_t *mddev)
                     !test_bit(Faulty, &rdev->flags))
                         spares++;
  
-       if (spares < mddev->delta_disks-1)
+       if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
                 /* Not enough devices even to make a degraded array
                  * of that size
                  */
@@ -3563,7 +3947,6 @@ static int raid5_start_reshape(mddev_t *mddev)
                         if (raid5_add_disk(mddev, rdev)) {
                                 char nm[20];
                                 set_bit(In_sync, &rdev->flags);
-                               conf->working_disks++;
                                 added_devices++;
                                 rdev->recovery_offset = 0;
                                 sprintf(nm, "rd%d", rdev->raid_disk);
@@ -3572,10 +3955,12 @@ static int raid5_start_reshape(mddev_t *mddev)
                                 break;
                 }
  
+       spin_lock_irqsave(&conf->device_lock, flags);
         mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
+       spin_unlock_irqrestore(&conf->device_lock, flags);
         mddev->raid_disks = conf->raid_disks;
         mddev->reshape_position = 0;
-       mddev->sb_dirty = 1;
+       set_bit(MD_CHANGE_DEVS, &mddev->flags);
  
         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -3602,14 +3987,15 @@ static void end_reshape(raid5_conf_t *conf)
         struct block_device *bdev;
  
         if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
-               conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1);
+               conf->mddev->array_size = conf->mddev->size *
+                       (conf->raid_disks - conf->max_degraded);
                 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
                 conf->mddev->changed = 1;
  
                 bdev = bdget_disk(conf->mddev->gendisk, 0);
                 if (bdev) {
                         mutex_lock(&bdev->bd_inode->i_mutex);
-                       i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
+                       i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10);
                         mutex_unlock(&bdev->bd_inode->i_mutex);
                         bdput(bdev);
                 }
@@ -3644,7 +4030,8 @@ static void raid5_quiesce(mddev_t *mddev, int state)
                 spin_lock_irq(&conf->device_lock);
                 conf->quiesce = 1;
                 wait_event_lock_irq(conf->wait_for_stripe,
-                                   atomic_read(&conf->active_stripes) == 0,
+                                   atomic_read(&conf->active_stripes) == 0 &&
+                                   atomic_read(&conf->active_aligned_reads) == 0,
                                     conf->device_lock, /* nothing */);
                 spin_unlock_irq(&conf->device_lock);
                 break;
@@ -3674,6 +4061,10 @@ static struct mdk_personality raid6_personality =
         .spare_active   = raid5_spare_active,
         .sync_request   = sync_request,
         .resize         = raid5_resize,
+#ifdef CONFIG_MD_RAID5_RESHAPE
+       .check_reshape  = raid5_check_reshape,
+       .start_reshape  = raid5_start_reshape,
+#endif
         .quiesce        = raid5_quiesce,
  };
  static struct mdk_personality raid5_personality =