mtd: davinci nand: update clock naming
[safe/jmp/linux-2.6] / drivers / md / raid5.c
index 4fdc6d0..5d400ae 100644 (file)
@@ -395,7 +395,8 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
                                init_stripe(sh, sector, previous);
                } else {
                        if (atomic_read(&sh->count)) {
-                         BUG_ON(!list_empty(&sh->lru));
+                               BUG_ON(!list_empty(&sh->lru)
+                                   && !test_bit(STRIPE_EXPANDING, &sh->state));
                        } else {
                                if (!test_bit(STRIPE_HANDLE, &sh->state))
                                        atomic_inc(&conf->active_stripes);
@@ -947,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num)
        return 0;
 }
 
-#ifdef CONFIG_MD_RAID5_RESHAPE
 static int resize_stripes(raid5_conf_t *conf, int newsize)
 {
        /* Make all the stripes able to hold 'newsize' devices.
@@ -1072,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
        conf->pool_size = newsize;
        return err;
 }
-#endif
 
 static int drop_one_stripe(raid5_conf_t *conf)
 {
@@ -2944,6 +2943,23 @@ static bool handle_stripe5(struct stripe_head *sh)
 
        /* Finish reconstruct operations initiated by the expansion process */
        if (sh->reconstruct_state == reconstruct_state_result) {
+               struct stripe_head *sh2
+                       = get_active_stripe(conf, sh->sector, 1, 1);
+               if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+                       /* sh cannot be written until sh2 has been read,
+                        * so arrange for sh to be delayed a little.
+                        */
+                       set_bit(STRIPE_DELAYED, &sh->state);
+                       set_bit(STRIPE_HANDLE, &sh->state);
+                       if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+                                             &sh2->state))
+                               atomic_inc(&conf->preread_active_stripes);
+                       release_stripe(sh2);
+                       goto unlock;
+               }
+               if (sh2)
+                       release_stripe(sh2);
+
                sh->reconstruct_state = reconstruct_state_idle;
                clear_bit(STRIPE_EXPANDING, &sh->state);
                for (i = conf->raid_disks; i--; ) {
@@ -3172,6 +3188,23 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                }
 
        if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+               struct stripe_head *sh2
+                       = get_active_stripe(conf, sh->sector, 1, 1);
+               if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+                       /* sh cannot be written until sh2 has been read,
+                        * so arrange for sh to be delayed a little.
+                        */
+                       set_bit(STRIPE_DELAYED, &sh->state);
+                       set_bit(STRIPE_HANDLE, &sh->state);
+                       if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+                                             &sh2->state))
+                               atomic_inc(&conf->preread_active_stripes);
+                       release_stripe(sh2);
+                       goto unlock;
+               }
+               if (sh2)
+                       release_stripe(sh2);
+
                /* Need to write out all blocks after computing P&Q */
                sh->disks = conf->raid_disks;
                stripe_set_idx(sh->sector, conf, 0, sh);
@@ -3604,10 +3637,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
 
        retry:
                previous = 0;
+               disks = conf->raid_disks;
                prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
-               if (likely(conf->reshape_progress == MaxSector))
-                       disks = conf->raid_disks;
-               else {
+               if (unlikely(conf->reshape_progress != MaxSector)) {
                        /* spinlock is needed as reshape_progress may be
                         * 64bit on a 32bit platform, and so it might be
                         * possible to see a half-updated value
@@ -3617,7 +3649,6 @@ static int make_request(struct request_queue *q, struct bio * bi)
                         * to check again.
                         */
                        spin_lock_irq(&conf->device_lock);
-                       disks = conf->raid_disks;
                        if (mddev->delta_disks < 0
                            ? logical_sector < conf->reshape_progress
                            : logical_sector >= conf->reshape_progress) {
@@ -3646,7 +3677,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
                sh = get_active_stripe(conf, new_sector, previous,
                                       (bi->bi_rw&RWA_MASK));
                if (sh) {
-                       if (unlikely(conf->reshape_progress != MaxSector)) {
+                       if (unlikely(previous)) {
                                /* expansion might have moved on while waiting for a
                                 * stripe, so we must do the range check again.
                                 * Expansion could still move past after this
@@ -3657,10 +3688,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
                                 */
                                int must_retry = 0;
                                spin_lock_irq(&conf->device_lock);
-                               if ((mddev->delta_disks < 0
-                                    ? logical_sector >= conf->reshape_progress
-                                    : logical_sector < conf->reshape_progress)
-                                   && previous)
+                               if (mddev->delta_disks < 0
+                                   ? logical_sector >= conf->reshape_progress
+                                   : logical_sector < conf->reshape_progress)
                                        /* mismatch, need to try again */
                                        must_retry = 1;
                                spin_unlock_irq(&conf->device_lock);
@@ -3736,9 +3766,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
        int new_data_disks = conf->raid_disks - conf->max_degraded;
        int i;
        int dd_idx;
-       sector_t writepos, safepos, gap;
+       sector_t writepos, readpos, safepos;
        sector_t stripe_addr;
        int reshape_sectors;
+       struct list_head stripes;
 
        if (sector_nr == 0) {
                /* If restarting in the middle, skip the initial sectors */
@@ -3775,26 +3806,47 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
         */
        writepos = conf->reshape_progress;
        sector_div(writepos, new_data_disks);
+       readpos = conf->reshape_progress;
+       sector_div(readpos, data_disks);
        safepos = conf->reshape_safe;
        sector_div(safepos, data_disks);
        if (mddev->delta_disks < 0) {
-               writepos -= reshape_sectors;
+               writepos -= min_t(sector_t, reshape_sectors, writepos);
+               readpos += reshape_sectors;
                safepos += reshape_sectors;
-               gap = conf->reshape_safe - conf->reshape_progress;
        } else {
                writepos += reshape_sectors;
-               safepos -= reshape_sectors;
-               gap = conf->reshape_progress - conf->reshape_safe;
+               readpos -= min_t(sector_t, reshape_sectors, readpos);
+               safepos -= min_t(sector_t, reshape_sectors, safepos);
        }
 
+       /* 'writepos' is the most advanced device address we might write.
+        * 'readpos' is the least advanced device address we might read.
+        * 'safepos' is the least address recorded in the metadata as having
+        *     been reshaped.
+        * If 'readpos' is behind 'writepos', then there is no way that we can
+        * ensure safety in the face of a crash - that must be done by userspace
+        * making a backup of the data.  So in that case there is no particular
+        * rush to update metadata.
+        * Otherwise if 'safepos' is behind 'writepos', then we really need to
+        * update the metadata to advance 'safepos' to match 'readpos' so that
+        * we can be safe in the event of a crash.
+        * So we insist on updating metadata if safepos is behind writepos and
+        * readpos is beyond writepos.
+        * In any case, update the metadata every 10 seconds.
+        * Maybe that number should be configurable, but I'm not sure it is
+        * worth it.... maybe it could be a multiple of safemode_delay???
+        */
        if ((mddev->delta_disks < 0
-            ? writepos < safepos
-            : writepos > safepos) ||
-           gap > (new_data_disks)*3000*2 /*3Meg*/) {
+            ? (safepos > writepos && readpos < writepos)
+            : (safepos < writepos && readpos > writepos)) ||
+           time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
                /* Cannot proceed until we've updated the superblock... */
                wait_event(conf->wait_for_overlap,
                           atomic_read(&conf->reshape_stripes)==0);
                mddev->reshape_position = conf->reshape_progress;
+               mddev->curr_resync_completed = mddev->curr_resync;
+               conf->reshape_checkpoint = jiffies;
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
                md_wakeup_thread(mddev->thread);
                wait_event(mddev->sb_wait, mddev->flags == 0 ||
@@ -3803,6 +3855,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                conf->reshape_safe = mddev->reshape_position;
                spin_unlock_irq(&conf->device_lock);
                wake_up(&conf->wait_for_overlap);
+               sysfs_notify(&mddev->kobj, NULL, "sync_completed");
        }
 
        if (mddev->delta_disks < 0) {
@@ -3816,6 +3869,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                BUG_ON(writepos != sector_nr + reshape_sectors);
                stripe_addr = sector_nr;
        }
+       INIT_LIST_HEAD(&stripes);
        for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
                int j;
                int skipped = 0;
@@ -3845,7 +3899,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                        set_bit(STRIPE_EXPAND_READY, &sh->state);
                        set_bit(STRIPE_HANDLE, &sh->state);
                }
-               release_stripe(sh);
+               list_add(&sh->lru, &stripes);
        }
        spin_lock_irq(&conf->device_lock);
        if (mddev->delta_disks < 0)
@@ -3874,15 +3928,26 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                release_stripe(sh);
                first_sector += STRIPE_SECTORS;
        }
+       /* Now that the sources are clearly marked, we can release
+        * the destination stripes
+        */
+       while (!list_empty(&stripes)) {
+               sh = list_entry(stripes.next, struct stripe_head, lru);
+               list_del_init(&sh->lru);
+               release_stripe(sh);
+       }
        /* If this takes us to the resync_max point where we have to pause,
         * then we need to write out the superblock.
         */
        sector_nr += reshape_sectors;
-       if (sector_nr >= mddev->resync_max) {
+       if ((sector_nr - mddev->curr_resync_completed) * 2
+           >= mddev->resync_max - mddev->curr_resync_completed) {
                /* Cannot proceed until we've updated the superblock... */
                wait_event(conf->wait_for_overlap,
                           atomic_read(&conf->reshape_stripes) == 0);
                mddev->reshape_position = conf->reshape_progress;
+               mddev->curr_resync_completed = mddev->curr_resync;
+               conf->reshape_checkpoint = jiffies;
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
                md_wakeup_thread(mddev->thread);
                wait_event(mddev->sb_wait,
@@ -3892,6 +3957,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                conf->reshape_safe = mddev->reshape_position;
                spin_unlock_irq(&conf->device_lock);
                wake_up(&conf->wait_for_overlap);
+               sysfs_notify(&mddev->kobj, NULL, "sync_completed");
        }
        return reshape_sectors;
 }
@@ -4777,7 +4843,6 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
        return 0;
 }
 
-#ifdef CONFIG_MD_RAID5_RESHAPE
 static int raid5_check_reshape(mddev_t *mddev)
 {
        raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4918,11 +4983,11 @@ static int raid5_start_reshape(mddev_t *mddev)
                spin_unlock_irq(&conf->device_lock);
                return -EAGAIN;
        }
+       conf->reshape_checkpoint = jiffies;
        md_wakeup_thread(mddev->sync_thread);
        md_new_event(mddev);
        return 0;
 }
-#endif
 
 /* This is called from the reshape thread and should make any
  * changes needed in 'conf'
@@ -4936,6 +5001,7 @@ static void end_reshape(raid5_conf_t *conf)
                conf->previous_raid_disks = conf->raid_disks;
                conf->reshape_progress = MaxSector;
                spin_unlock_irq(&conf->device_lock);
+               wake_up(&conf->wait_for_overlap);
 
                /* read-ahead size must cover two whole stripes, which is
                 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
@@ -5244,11 +5310,9 @@ static struct mdk_personality raid6_personality =
        .sync_request   = sync_request,
        .resize         = raid5_resize,
        .size           = raid5_size,
-#ifdef CONFIG_MD_RAID5_RESHAPE
        .check_reshape  = raid5_check_reshape,
        .start_reshape  = raid5_start_reshape,
        .finish_reshape = raid5_finish_reshape,
-#endif
        .quiesce        = raid5_quiesce,
        .takeover       = raid6_takeover,
        .reconfig       = raid6_reconfig,
@@ -5269,11 +5333,9 @@ static struct mdk_personality raid5_personality =
        .sync_request   = sync_request,
        .resize         = raid5_resize,
        .size           = raid5_size,
-#ifdef CONFIG_MD_RAID5_RESHAPE
        .check_reshape  = raid5_check_reshape,
        .start_reshape  = raid5_start_reshape,
        .finish_reshape = raid5_finish_reshape,
-#endif
        .quiesce        = raid5_quiesce,
        .takeover       = raid5_takeover,
        .reconfig       = raid5_reconfig,
@@ -5295,11 +5357,9 @@ static struct mdk_personality raid4_personality =
        .sync_request   = sync_request,
        .resize         = raid5_resize,
        .size           = raid5_size,
-#ifdef CONFIG_MD_RAID5_RESHAPE
        .check_reshape  = raid5_check_reshape,
        .start_reshape  = raid5_start_reshape,
        .finish_reshape = raid5_finish_reshape,
-#endif
        .quiesce        = raid5_quiesce,
 };