X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=drivers%2Fmd%2Fraid1.c;h=e07ce2e033a95c48ba2c9a78c519640743220e2d;hb=929be8fcb4b4b65d038e73d3bb34715851a95ca2;hp=a06ff91f27e2e6bb3af330e170faebdd79e6a9ce;hpb=4dbcdc751cb25ffca3a8374cbc5ab6de961cc545;p=safe%2Fjmp%2Flinux-2.6 diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a06ff91..e07ce2e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -9,7 +9,7 @@ * * Better read-balancing code written by Mika Kuoppala , 2000 * - * Fixes to reconstruction by Jakob Østergaard" + * Fixes to reconstruction by Jakob Østergaard" * Various fixes by Neil Brown * * Changes by Peter T. Breuer 31/1/2003 to support @@ -31,9 +31,12 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" -#include -#include +#include +#include +#include +#include "md.h" +#include "raid1.h" +#include "bitmap.h" #define DEBUG 0 #if DEBUG @@ -61,7 +64,7 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) /* allocate a r1bio with room for raid_disks entries in the bios array */ r1_bio = kzalloc(size, gfp_flags); - if (!r1_bio) + if (!r1_bio && pi->mddev) unplug_slaves(pi->mddev); return r1_bio; @@ -119,6 +122,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) goto out_free_pages; bio->bi_io_vec[i].bv_page = page; + bio->bi_vcnt = i+1; } } /* If not user-requests, copy the page pointers to all bios */ @@ -134,9 +138,9 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) return r1_bio; out_free_pages: - for (i=0; i < RESYNC_PAGES ; i++) - for (j=0 ; j < pi->raid_disks; j++) - safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); + for (j=0 ; j < pi->raid_disks; j++) + for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++) + put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); j = -1; out_free_bio: while ( ++j < pi->raid_disks ) @@ -176,9 +180,9 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) } } -static inline void free_r1bio(r1bio_t *r1_bio) +static void free_r1bio(r1bio_t *r1_bio) { - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; /* * Wake up any possible resync thread that waits for the device @@ -190,9 +194,9 @@ static inline void free_r1bio(r1bio_t *r1_bio) mempool_free(r1_bio, conf->r1bio_pool); } -static inline void put_buf(r1bio_t *r1_bio) +static void put_buf(r1bio_t *r1_bio) { - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; int i; for (i=0; iraid_disks; i++) { @@ -210,7 +214,7 @@ static void reschedule_retry(r1bio_t *r1_bio) { unsigned long flags; mddev_t *mddev = r1_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; spin_lock_irqsave(&conf->device_lock, flags); list_add(&r1_bio->retry_list, &conf->retry_list); @@ -238,7 +242,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio) (unsigned long long) bio->bi_sector + (bio->bi_size >> 9) - 1); - bio_endio(bio, bio->bi_size, + bio_endio(bio, test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); } free_r1bio(r1_bio); @@ -249,43 +253,44 @@ static void raid_end_bio_io(r1bio_t *r1_bio) */ static inline void update_head_pos(int disk, r1bio_t *r1_bio) { - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; conf->mirrors[disk].head_position = r1_bio->sector + (r1_bio->sectors); } -static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error) +static void raid1_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); int mirror; - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; - if (bio->bi_size) - return 1; - mirror = r1_bio->read_disk; /* * this branch is our 'one mirror IO has finished' event handler: */ update_head_pos(mirror, r1_bio); - if (uptodate || conf->working_disks <= 1) { - /* - * Set R1BIO_Uptodate in our master bio, so that - * we will return a good error code for to the higher - * levels even if IO on some other mirrored buffer fails. - * - * The 'master' represents the composite IO operation to - * user-side. So if something waits for IO, then it will - * wait for the 'master' bio. + if (uptodate) + set_bit(R1BIO_Uptodate, &r1_bio->state); + else { + /* If all other devices have failed, we want to return + * the error upwards rather than fail the last device. + * Here we redefine "uptodate" to mean "Don't want to retry" */ - if (uptodate) - set_bit(R1BIO_Uptodate, &r1_bio->state); + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + if (r1_bio->mddev->degraded == conf->raid_disks || + (r1_bio->mddev->degraded == conf->raid_disks-1 && + !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))) + uptodate = 1; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + if (uptodate) raid_end_bio_io(r1_bio); - } else { + else { /* * oops, read error: */ @@ -297,32 +302,32 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int } rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); - return 0; } -static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error) +static void raid1_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); - conf_t *conf = mddev_to_conf(r1_bio->mddev); + conf_t *conf = r1_bio->mddev->private; + struct bio *to_put = NULL; - if (bio->bi_size) - return 1; for (mirror = 0; mirror < conf->raid_disks; mirror++) if (r1_bio->bios[mirror] == bio) break; - if (error == -ENOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { + if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); set_bit(R1BIO_BarrierRetry, &r1_bio->state); r1_bio->mddev->barriers_work = 0; + /* Don't rdev_dec_pending in this branch - keep it for the retry */ } else { /* * this branch is our 'one mirror IO has finished' event handler: */ r1_bio->bios[mirror] = NULL; + to_put = bio; if (!uptodate) { md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); /* an I/O failed, we can't clear the bitmap */ @@ -359,10 +364,11 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int (unsigned long long) mbio->bi_sector, (unsigned long long) mbio->bi_sector + (mbio->bi_size >> 9) - 1); - bio_endio(mbio, mbio->bi_size, 0); + bio_endio(mbio, 0); } } } + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } /* * @@ -370,33 +376,28 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int * already. */ if (atomic_dec_and_test(&r1_bio->remaining)) { - if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { + if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) reschedule_retry(r1_bio); - /* Don't dec_pending yet, we want to hold - * the reference over the retry - */ - return 0; - } - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - /* free extra copy of the data pages */ - int i = bio->bi_vcnt; - while (i--) - safe_put_page(bio->bi_io_vec[i].bv_page); + else { + /* it really is the end of this request */ + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = bio->bi_vcnt; + while (i--) + safe_put_page(bio->bi_io_vec[i].bv_page); + } + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + behind); + md_write_end(r1_bio->mddev); + raid_end_bio_io(r1_bio); } - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - behind); - md_write_end(r1_bio->mddev); - raid_end_bio_io(r1_bio); } - if (r1_bio->bios[mirror]==NULL) - bio_put(bio); - - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); - return 0; + if (to_put) + bio_put(to_put); } @@ -540,20 +541,19 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) static void unplug_slaves(mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; rcu_read_lock(); for (i=0; iraid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { - request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + struct request_queue *r_queue = bdev_get_queue(rdev->bdev); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - if (r_queue->unplug_fn) - r_queue->unplug_fn(r_queue); + blk_unplug(r_queue); rdev_dec_pending(rdev, mddev); rcu_read_lock(); @@ -562,7 +562,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_unlock(); } -static void raid1_unplug(request_queue_t *q) +static void raid1_unplug(struct request_queue *q) { mddev_t *mddev = q->queuedata; @@ -570,36 +570,66 @@ static void raid1_unplug(request_queue_t *q) md_wakeup_thread(mddev->thread); } -static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, - sector_t *error_sector) +static int raid1_congested(void *data, int bits) { - mddev_t *mddev = q->queuedata; - conf_t *conf = mddev_to_conf(mddev); + mddev_t *mddev = data; + conf_t *conf = mddev->private; int i, ret = 0; + if (mddev_congested(mddev, bits)) + return 1; + rcu_read_lock(); - for (i=0; iraid_disks && ret == 0; i++) { + for (i = 0; i < mddev->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct block_device *bdev = rdev->bdev; - request_queue_t *r_queue = bdev_get_queue(bdev); + struct request_queue *q = bdev_get_queue(rdev->bdev); - if (!r_queue->issue_flush_fn) - ret = -EOPNOTSUPP; - else { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, - error_sector); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } + /* Note the '|| 1' - when read_balance prefers + * non-congested targets, it can be removed + */ + if ((bits & (1<backing_dev_info, bits); + else + ret &= bdi_congested(&q->backing_dev_info, bits); } } rcu_read_unlock(); return ret; } + +static int flush_pending_writes(conf_t *conf) +{ + /* Any writes that have been queued but are awaiting + * bitmap updates get flushed here. + * We return 1 if any requests were actually submitted. + */ + int rv = 0; + + spin_lock_irq(&conf->device_lock); + + if (conf->pending_bio_list.head) { + struct bio *bio; + bio = bio_list_get(&conf->pending_bio_list); + blk_remove_plug(conf->mddev->queue); + spin_unlock_irq(&conf->device_lock); + /* flush any pending bitmap writes to + * disk before proceeding w/ I/O */ + bitmap_unplug(conf->mddev->bitmap); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + rv = 1; + } else + spin_unlock_irq(&conf->device_lock); + return rv; +} + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -681,15 +711,23 @@ static void freeze_array(conf_t *conf) /* stop syncio and normal IO and wait for everything to * go quite. * We increment barrier and nr_waiting, and then - * wait until barrier+nr_pending match nr_queued+2 + * wait until nr_pending match nr_queued+1 + * This is called in the context of one normal IO request + * that has failed. Thus any sync request that might be pending + * will be blocked by nr_pending, and we need to wait for + * pending IO requests to complete or be queued for re-try. + * Thus the number queued (nr_queued) plus this request (1) + * must match the number of pending IOs (nr_pending) before + * we continue. */ spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, - conf->barrier+conf->nr_pending == conf->nr_queued+2, + conf->nr_pending == conf->nr_queued+1, conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + ({ flush_pending_writes(conf); + raid1_unplug(conf->mddev->queue); })); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(conf_t *conf) @@ -734,38 +772,52 @@ do_sync_io: return NULL; } -static int make_request(request_queue_t *q, struct bio * bio) +static int make_request(struct request_queue *q, struct bio * bio) { mddev_t *mddev = q->queuedata; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; mirror_info_t *mirror; r1bio_t *r1_bio; struct bio *read_bio; int i, targets = 0, disks; - mdk_rdev_t *rdev; - struct bitmap *bitmap = mddev->bitmap; + struct bitmap *bitmap; unsigned long flags; struct bio_list bl; struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); - int do_barriers; - - if (unlikely(!mddev->barriers_work && bio_barrier(bio))) { - bio_endio(bio, bio->bi_size, -EOPNOTSUPP); - return 0; - } + const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); + int cpu; + bool do_barriers; + mdk_rdev_t *blocked_rdev; /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. + * We test barriers_work *after* md_write_start as md_write_start + * may cause the first superblock write, and that will check out + * if barriers work. */ + md_write_start(mddev, bio); /* wait on superblock update early */ + if (unlikely(!mddev->barriers_work && + bio_rw_flagged(bio, BIO_RW_BARRIER))) { + if (rw == WRITE) + md_write_end(mddev); + bio_endio(bio, -EOPNOTSUPP); + return 0; + } + wait_barrier(conf); - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); + bitmap = mddev->bitmap; + + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bio)); + part_stat_unlock(); /* * make_request() can abort the operation when READA is being @@ -802,7 +854,7 @@ static int make_request(request_queue_t *q, struct bio * bio) read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid1_end_read_request; - read_bio->bi_rw = READ; + read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); read_bio->bi_private = r1_bio; generic_make_request(read_bio); @@ -824,10 +876,17 @@ static int make_request(request_queue_t *q, struct bio * bio) first = 0; } #endif + retry_write: + blocked_rdev = NULL; rcu_read_lock(); for (i = 0; i < disks; i++) { - if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL && - !test_bit(Faulty, &rdev->flags)) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); if (test_bit(Faulty, &rdev->flags)) { rdev_dec_pending(rdev, mddev); @@ -840,6 +899,20 @@ static int make_request(request_queue_t *q, struct bio * bio) } rcu_read_unlock(); + if (unlikely(blocked_rdev)) { + /* Wait for this device to become unblocked */ + int j; + + for (j = 0; j < i; j++) + if (r1_bio->bios[j]) + rdev_dec_pending(conf->mirrors[j].rdev, mddev); + + allow_barrier(conf); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_write; + } + BUG_ON(targets == 0); /* we never fail the last device */ if (targets < conf->raid_disks) { @@ -857,7 +930,7 @@ static int make_request(request_queue_t *q, struct bio * bio) atomic_set(&r1_bio->remaining, 0); atomic_set(&r1_bio->behind_remaining, 0); - do_barriers = bio->bi_rw & BIO_RW_BARRIER; + do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER); if (do_barriers) set_bit(R1BIO_Barrier, &r1_bio->state); @@ -873,7 +946,8 @@ static int make_request(request_queue_t *q, struct bio * bio) mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_barriers; + mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) | + (do_sync << BIO_RW_SYNCIO); mbio->bi_private = r1_bio; if (behind_pages) { @@ -908,6 +982,11 @@ static int make_request(request_queue_t *q, struct bio * bio) blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); + /* In case raid1d snuck into freeze_array */ + wake_up(&conf->wait_barrier); + + if (do_sync) + md_wakeup_thread(mddev->thread); #if 0 while ((bio = bio_list_pop(&bl)) != NULL) generic_make_request(bio); @@ -918,15 +997,18 @@ static int make_request(request_queue_t *q, struct bio * bio) static void status(struct seq_file *seq, mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); - for (i = 0; i < conf->raid_disks; i++) + conf->raid_disks - mddev->degraded); + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); seq_printf(seq, "%s", - conf->mirrors[i].rdev && - test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); + rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); seq_printf(seq, "]"); } @@ -934,7 +1016,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; /* * If it is not operational, then we have already marked it as dead @@ -943,49 +1025,57 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && conf->working_disks == 1) + && (conf->raid_disks - mddev->degraded) == 1) { /* * Don't fail the drive, act as though we were just a - * normal single drive + * normal single drive. + * However don't try a recovery from this drive as + * it is very likely to fail. */ + mddev->recovery_disabled = 1; return; - if (test_bit(In_sync, &rdev->flags)) { + } + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->working_disks--; + set_bit(Faulty, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); - } - clear_bit(In_sync, &rdev->flags); - set_bit(Faulty, &rdev->flags); - mddev->sb_dirty = 1; - printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" - " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + } else + set_bit(Faulty, &rdev->flags); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" + "raid1: Operation continuing on %d devices.\n", + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) { int i; - mirror_info_t *tmp; printk("RAID1 conf printout:\n"); if (!conf) { printk("(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); + rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; - tmp = conf->mirrors + i; - if (tmp->rdev) + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev) printk(" disk %d, wo:%d, o:%d, dev:%s\n", - i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); + i, !test_bit(In_sync, &rdev->flags), + !test_bit(Faulty, &rdev->flags), + bdevname(rdev->bdev,b)); } + rcu_read_unlock(); } static void close_sync(conf_t *conf) @@ -1001,20 +1091,21 @@ static int raid1_spare_active(mddev_t *mddev) { int i; conf_t *conf = mddev->private; - mirror_info_t *tmp; /* * Find all failed disks within the RAID1 configuration - * and mark them readable + * and mark them readable. + * Called under mddev lock, so rcu protection not needed. */ for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->mirrors + i; - if (tmp->rdev - && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { - conf->working_disks++; + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_and_set_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - set_bit(In_sync, &tmp->rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } @@ -1026,26 +1117,31 @@ static int raid1_spare_active(mddev_t *mddev) static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int mirror = 0; mirror_info_t *p; + int first = 0; + int last = mddev->raid_disks - 1; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; - for (mirror=0; mirror < mddev->raid_disks; mirror++) + for (mirror = first; mirror <= last; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); p->head_position = 0; rdev->raid_disk = mirror; - found = 1; + err = 0; /* As all devices are equivalent, we don't need a full recovery * if this was recently any drive of the array */ @@ -1054,9 +1150,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - + md_integrity_add_rdev(rdev, mddev); print_conf(conf); - return found; + return err; } static int raid1_remove_disk(mddev_t *mddev, int number) @@ -1074,13 +1170,23 @@ static int raid1_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove non-faulty devices is recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->degraded < conf->raid_disks) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { /* lost the race, try later */ err = -EBUSY; p->rdev = rdev; + goto abort; } + md_integrity_register(mddev); } abort: @@ -1089,14 +1195,11 @@ abort: } -static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) +static void end_sync_read(struct bio *bio, int error) { r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); int i; - if (bio->bi_size) - return 1; - for (i=r1_bio->mddev->raid_disks; i--; ) if (r1_bio->bios[i] == bio) break; @@ -1112,41 +1215,48 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) if (atomic_dec_and_test(&r1_bio->remaining)) reschedule_retry(r1_bio); - return 0; } -static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) +static void end_sync_write(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); mddev_t *mddev = r1_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; int mirror=0; - if (bio->bi_size) - return 1; - for (i = 0; i < conf->raid_disks; i++) if (r1_bio->bios[i] == bio) { mirror = i; break; } - if (!uptodate) + if (!uptodate) { + int sync_blocks = 0; + sector_t s = r1_bio->sector; + long sectors_to_go = r1_bio->sectors; + /* make sure these bits doesn't get cleared. */ + do { + bitmap_end_sync(mddev->bitmap, s, + &sync_blocks, 1); + s += sync_blocks; + sectors_to_go -= sync_blocks; + } while (sectors_to_go > 0); md_error(mddev, conf->mirrors[mirror].rdev); + } update_head_pos(mirror, r1_bio); if (atomic_dec_and_test(&r1_bio->remaining)) { - md_done_sync(mddev, r1_bio->sectors, uptodate); + sector_t s = r1_bio->sectors; put_buf(r1_bio); + md_done_sync(mddev, s, uptodate); } - return 0; } static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; int disks = conf->raid_disks; struct bio *bio, *wbio; @@ -1181,37 +1291,58 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) } r1_bio->read_disk = primary; for (i=0; iraid_disks; i++) - if (r1_bio->bios[i]->bi_end_io == end_sync_read && - test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) { + if (r1_bio->bios[i]->bi_end_io == end_sync_read) { int j; int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); struct bio *pbio = r1_bio->bios[primary]; struct bio *sbio = r1_bio->bios[i]; - for (j = vcnt; j-- ; ) - if (memcmp(page_address(pbio->bi_io_vec[j].bv_page), - page_address(sbio->bi_io_vec[j].bv_page), - PAGE_SIZE)) - break; + + if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { + for (j = vcnt; j-- ; ) { + struct page *p, *s; + p = pbio->bi_io_vec[j].bv_page; + s = sbio->bi_io_vec[j].bv_page; + if (memcmp(page_address(p), + page_address(s), + PAGE_SIZE)) + break; + } + } else + j = 0; if (j >= 0) mddev->resync_mismatches += r1_bio->sectors; - if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { + if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) + && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { sbio->bi_end_io = NULL; rdev_dec_pending(conf->mirrors[i].rdev, mddev); } else { /* fixup the bio for reuse */ + int size; sbio->bi_vcnt = vcnt; sbio->bi_size = r1_bio->sectors << 9; sbio->bi_idx = 0; sbio->bi_phys_segments = 0; - sbio->bi_hw_segments = 0; - sbio->bi_hw_front_size = 0; - sbio->bi_hw_back_size = 0; sbio->bi_flags &= ~(BIO_POOL_MASK - 1); sbio->bi_flags |= 1 << BIO_UPTODATE; sbio->bi_next = NULL; sbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; sbio->bi_bdev = conf->mirrors[i].rdev->bdev; + size = sbio->bi_size; + for (j = 0; j < vcnt ; j++) { + struct bio_vec *bi; + bi = &sbio->bi_io_vec[j]; + bi->bv_offset = 0; + if (size > PAGE_SIZE) + bi->bv_len = PAGE_SIZE; + else + bi->bv_len = size; + size -= PAGE_SIZE; + memcpy(page_address(bi->bv_page), + page_address(pbio->bi_io_vec[j].bv_page), + PAGE_SIZE); + } + } } } @@ -1219,7 +1350,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) /* ouch - failed to read all of that. * Try some synchronous reads of other devices to get * good data, much like with normal read errors. Only - * read into the pages we already have so they we don't + * read into the pages we already have so we don't * need to re-issue the read request. * We don't need to freeze the array, because being in an * active sync request, there is no normal IO, and @@ -1239,6 +1370,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) s = PAGE_SIZE >> 9; do { if (r1_bio->bios[d]->bi_end_io == end_sync_read) { + /* No rcu protection needed here devices + * can only be removed when no resync is + * active, and resync is currently active + */ rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev->bdev, sect + rdev->data_offset, @@ -1341,12 +1476,101 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) * 3. Performs writes following reads for array syncronising. */ +static void fix_read_error(conf_t *conf, int read_disk, + sector_t sect, int sectors) +{ + mddev_t *mddev = conf->mddev; + while(sectors) { + int s = sectors; + int d = read_disk; + int success = 0; + int start; + mdk_rdev_t *rdev; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + do { + /* Note: no rcu protection needed here + * as this is synchronous in the raid1d thread + * which is the thread that might remove + * a device. If raid1d ever becomes multi-threaded.... + */ + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags) && + sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, + conf->tmppage, READ)) + success = 1; + else { + d++; + if (d == conf->raid_disks) + d = 0; + } + } while (!success && d != read_disk); + + if (!success) { + /* Cannot read from anywhere -- bye bye array */ + md_error(mddev, conf->mirrors[read_disk].rdev); + break; + } + /* write it back and re-read */ + start = d; + while (d != read_disk) { + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, WRITE) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + } + } + d = start; + while (d != read_disk) { + char b[BDEVNAME_SIZE]; + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, READ) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + else { + atomic_add(s, &rdev->corrected_errors); + printk(KERN_INFO + "raid1:%s: read error corrected " + "(%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)(sect + + rdev->data_offset), + bdevname(rdev->bdev, b)); + } + } + } + sectors -= s; + sect += s; + } +} + static void raid1d(mddev_t *mddev) { r1bio_t *r1_bio; struct bio *bio; unsigned long flags; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; int unplug=0; mdk_rdev_t *rdev; @@ -1355,50 +1579,40 @@ static void raid1d(mddev_t *mddev) for (;;) { char b[BDEVNAME_SIZE]; - spin_lock_irqsave(&conf->device_lock, flags); - - if (conf->pending_bio_list.head) { - bio = bio_list_get(&conf->pending_bio_list); - blk_remove_plug(mddev->queue); - spin_unlock_irqrestore(&conf->device_lock, flags); - /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - if (bitmap_unplug(mddev->bitmap) != 0) - printk("%s: bitmap file write failed!\n", mdname(mddev)); - - while (bio) { /* submit pending writes */ - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - generic_make_request(bio); - bio = next; - } - unplug = 1; - continue; - } + unplug += flush_pending_writes(conf); - if (list_empty(head)) + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&conf->device_lock, flags); break; + } r1_bio = list_entry(head->prev, r1bio_t, retry_list); list_del(head->prev); conf->nr_queued--; spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r1_bio->mddev; - conf = mddev_to_conf(mddev); + conf = mddev->private; if (test_bit(R1BIO_IsSync, &r1_bio->state)) { sync_request_write(mddev, r1_bio); unplug = 1; } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { /* some requests in the r1bio were BIO_RW_BARRIER - * requests which failed with -ENOTSUPP. Hohumm.. + * requests which failed with -EOPNOTSUPP. Hohumm.. * Better resubmit without the barrier. * We know which devices to resubmit for, because * all others have had their bios[] entry cleared. + * We already have a nr_pending reference on these rdevs. */ int i; + const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO); clear_bit(R1BIO_BarrierRetry, &r1_bio->state); clear_bit(R1BIO_Barrier, &r1_bio->state); for (i=0; i < conf->raid_disks; i++) + if (r1_bio->bios[i]) + atomic_inc(&r1_bio->remaining); + for (i=0; i < conf->raid_disks; i++) if (r1_bio->bios[i]) { struct bio_vec *bvec; int j; @@ -1413,7 +1627,8 @@ static void raid1d(mddev_t *mddev) conf->mirrors[i].rdev->data_offset; bio->bi_bdev = conf->mirrors[i].rdev->bdev; bio->bi_end_io = raid1_end_write_request; - bio->bi_rw = WRITE; + bio->bi_rw = WRITE | + (do_sync << BIO_RW_SYNCIO); bio->bi_private = r1_bio; r1_bio->bios[i] = bio; generic_make_request(bio); @@ -1429,76 +1644,15 @@ static void raid1d(mddev_t *mddev) * This is all done synchronously while the array is * frozen */ - sector_t sect = r1_bio->sector; - int sectors = r1_bio->sectors; - freeze_array(conf); - if (mddev->ro == 0) while(sectors) { - int s = sectors; - int d = r1_bio->read_disk; - int success = 0; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - do { - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) - success = 1; - else { - d++; - if (d == conf->raid_disks) - d = 0; - } - } while (!success && d != r1_bio->read_disk); - - if (success) { - /* write it back and re-read */ - int start = d; - while (d != r1_bio->read_disk) { - if (d==0) - d = conf->raid_disks; - d--; - rdev = conf->mirrors[d].rdev; - atomic_add(s, &rdev->corrected_errors); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - } - } - d = start; - while (d != r1_bio->read_disk) { - if (d==0) - d = conf->raid_disks; - d--; - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - } - } - } else { - /* Cannot read from anywhere -- bye bye array */ - md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - break; - } - sectors -= s; - sect += s; - } - - unfreeze_array(conf); + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, r1_bio->read_disk, + r1_bio->sector, + r1_bio->sectors); + unfreeze_array(conf); + } else + md_error(mddev, + conf->mirrors[r1_bio->read_disk].rdev); bio = r1_bio->bios[r1_bio->read_disk]; if ((disk=read_balance(conf, r1_bio)) == -1) { @@ -1508,6 +1662,7 @@ static void raid1d(mddev_t *mddev) (unsigned long long)r1_bio->sector); raid_end_bio_io(r1_bio); } else { + const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO); r1_bio->bios[r1_bio->read_disk] = mddev->ro ? IO_BLOCKED : NULL; r1_bio->read_disk = disk; @@ -1523,14 +1678,14 @@ static void raid1d(mddev_t *mddev) bio->bi_sector = r1_bio->sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_end_io = raid1_end_read_request; - bio->bi_rw = READ; + bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); bio->bi_private = r1_bio; unplug = 1; generic_make_request(bio); } } + cond_resched(); } - spin_unlock_irqrestore(&conf->device_lock, flags); if (unplug) unplug_slaves(mddev); } @@ -1541,8 +1696,7 @@ static int init_resync(conf_t *conf) int buffs; buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; - if (conf->r1buf_pool) - BUG(); + BUG_ON(conf->r1buf_pool); conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free, conf->poolinfo); if (!conf->r1buf_pool) @@ -1563,7 +1717,7 @@ static int init_resync(conf_t *conf) static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; r1bio_t *r1_bio; struct bio *bio; sector_t max_sector, nr_sectors; @@ -1583,7 +1737,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; } - max_sector = mddev->size << 1; + max_sector = mddev->dev_sectors; if (sector_nr >= max_sector) { /* If we aborted, we need to abort the * sync on the 'current' bitmap chunk (there will @@ -1601,6 +1755,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; } + if (mddev->bitmap == NULL && + mddev->recovery_cp == MaxSector && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + conf->fullsync == 0) { + *skipped = 1; + return max_sector - sector_nr; + } /* before building a request, check if we can skip these blocks.. * This call the bitmap_start_sync doesn't actually record anything */ @@ -1618,6 +1779,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (!go_faster && conf->nr_waiting) msleep_interruptible(1000); + bitmap_cond_end_sync(mddev->bitmap, sector_nr); raise_barrier(conf); conf->next_resync = sector_nr; @@ -1645,11 +1807,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* take from bio_init */ bio->bi_next = NULL; bio->bi_flags |= 1 << BIO_UPTODATE; - bio->bi_rw = 0; + bio->bi_rw = READ; bio->bi_vcnt = 0; bio->bi_idx = 0; bio->bi_phys_segments = 0; - bio->bi_hw_segments = 0; bio->bi_size = 0; bio->bi_end_io = NULL; bio->bi_private = NULL; @@ -1700,6 +1861,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return rv; } + if (max_sector > mddev->resync_max) + max_sector = mddev->resync_max; /* Don't do IO beyond here */ nr_sectors = 0; sync_blocks = 0; do { @@ -1715,8 +1878,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; - if (sync_blocks < (PAGE_SIZE>>9)) - BUG(); + BUG_ON(sync_blocks < (PAGE_SIZE>>9)); if (len > (sync_blocks<<9)) len = sync_blocks<<9; } @@ -1757,35 +1919,45 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i for (i=0; iraid_disks; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { - md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors); + md_sync_acct(bio->bi_bdev, nr_sectors); generic_make_request(bio); } } } else { atomic_set(&r1_bio->remaining, 1); bio = r1_bio->bios[r1_bio->read_disk]; - md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, - nr_sectors); + md_sync_acct(bio->bi_bdev, nr_sectors); generic_make_request(bio); } - return nr_sectors; } +static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + if (sectors) + return sectors; + + return mddev->dev_sectors; +} + static int run(mddev_t *mddev) { conf_t *conf; int i, j, disk_idx; mirror_info_t *disk; mdk_rdev_t *rdev; - struct list_head *tmp; if (mddev->level != 1) { printk("raid1: %s: raid level not set to mirroring (%d)\n", mdname(mddev), mddev->level); goto out; } + if (mddev->reshape_position != MaxSector) { + printk("raid1: %s: reshape_position set but not supported\n", + mdname(mddev)); + goto out; + } /* * copy the already verified devices into our private RAID1 * bookkeeping area. [whatever we allocate in run(), @@ -1808,15 +1980,19 @@ static int run(mddev_t *mddev) conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); if (!conf->poolinfo) goto out_no_mem; - conf->poolinfo->mddev = mddev; + conf->poolinfo->mddev = NULL; conf->poolinfo->raid_disks = mddev->raid_disks; conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, r1bio_pool_free, conf->poolinfo); if (!conf->r1bio_pool) goto out_no_mem; + conf->poolinfo->mddev = mddev; - ITERATE_RDEV(mddev, rdev, tmp) { + spin_lock_init(&conf->device_lock); + mddev->queue->queue_lock = &conf->device_lock; + + list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) @@ -1824,27 +2000,21 @@ static int run(mddev_t *mddev) disk = conf->mirrors + disk_idx; disk->rdev = rdev; - - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; - if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) - conf->working_disks++; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; - spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); - if (conf->working_disks == 1) - mddev->recovery_cp = MaxSector; spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); @@ -1852,22 +2022,27 @@ static int run(mddev_t *mddev) bio_list_init(&conf->pending_bio_list); bio_list_init(&conf->flushing_bio_list); - if (!conf->working_disks) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", - mdname(mddev)); - goto out_free_conf; - } mddev->degraded = 0; for (i = 0; i < conf->raid_disks; i++) { disk = conf->mirrors + i; - if (!disk->rdev) { + if (!disk->rdev || + !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; + if (disk->rdev) + conf->fullsync = 1; } } + if (mddev->degraded == conf->raid_disks) { + printk(KERN_ERR "raid1: no operational mirrors for %s\n", + mdname(mddev)); + goto out_free_conf; + } + if (conf->raid_disks - mddev->degraded == 1) + mddev->recovery_cp = MaxSector; /* * find the first working one and use it as a starting point @@ -1880,7 +2055,7 @@ static int run(mddev_t *mddev) conf->last_used = j; - mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); + mddev->thread = md_register_thread(raid1d, mddev, NULL); if (!mddev->thread) { printk(KERN_ERR "raid1: couldn't allocate thread for %s\n", @@ -1888,6 +2063,10 @@ static int run(mddev_t *mddev) goto out_free_conf; } + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "raid1: %s is not clean" + " -- starting background reconstruction\n", + mdname(mddev)); printk(KERN_INFO "raid1: raid set %s active with %d out of %d mirrors\n", mdname(mddev), mddev->raid_disks - mddev->degraded, @@ -1895,11 +2074,12 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_size = mddev->size; + md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); mddev->queue->unplug_fn = raid1_unplug; - mddev->queue->issue_flush_fn = raid1_issue_flush; - + mddev->queue->backing_dev_info.congested_fn = raid1_congested; + mddev->queue->backing_dev_info.congested_data = mddev; + md_integrity_register(mddev); return 0; out_no_mem: @@ -1922,7 +2102,7 @@ out: static int stop(mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; struct bitmap *bitmap = mddev->bitmap; int behind_wait = 0; @@ -1935,6 +2115,9 @@ static int stop(mddev_t *mddev) /* need to kick something here to make sure I/O goes? */ } + raise_barrier(conf); + lower_barrier(conf); + md_unregister_thread(mddev->thread); mddev->thread = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ @@ -1956,19 +2139,23 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ - mddev->array_size = sectors>>1; - set_capacity(mddev->gendisk, mddev->array_size << 1); + md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0)); + if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) + return -EINVAL; + set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; - if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) { - mddev->recovery_cp = mddev->size << 1; + revalidate_disk(mddev->gendisk); + if (sectors > mddev->dev_sectors && + mddev->recovery_cp == MaxSector) { + mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } - mddev->size = mddev->array_size; + mddev->dev_sectors = sectors; mddev->resync_max_sectors = sectors; return 0; } -static int raid1_reshape(mddev_t *mddev, int raid_disks) +static int raid1_reshape(mddev_t *mddev) { /* We need to: * 1/ resize the r1bio_pool @@ -1984,10 +2171,26 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) mempool_t *newpool, *oldpool; struct pool_info *newpoolinfo; mirror_info_t *newmirrors; - conf_t *conf = mddev_to_conf(mddev); - int cnt; + conf_t *conf = mddev->private; + int cnt, raid_disks; + unsigned long flags; + int d, d2, err; + + /* Cannot change chunk_size, layout, or level */ + if (mddev->chunk_sectors != mddev->new_chunk_sectors || + mddev->layout != mddev->new_layout || + mddev->level != mddev->new_level) { + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->new_layout = mddev->layout; + mddev->new_level = mddev->level; + return -EINVAL; + } - int d, d2; + err = md_allow_write(mddev); + if (err) + return err; + + raid_disks = mddev->raid_disks + mddev->delta_disks; if (raid_disks < conf->raid_disks) { cnt=0; @@ -2023,18 +2226,35 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) oldpool = conf->r1bio_pool; conf->r1bio_pool = newpool; - for (d=d2=0; d < conf->raid_disks; d++) - if (conf->mirrors[d].rdev) { - conf->mirrors[d].rdev->raid_disk = d2; - newmirrors[d2++].rdev = conf->mirrors[d].rdev; + for (d = d2 = 0; d < conf->raid_disks; d++) { + mdk_rdev_t *rdev = conf->mirrors[d].rdev; + if (rdev && rdev->raid_disk != d2) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + rdev->raid_disk = d2; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + if (sysfs_create_link(&mddev->kobj, + &rdev->kobj, nm)) + printk(KERN_WARNING + "md/raid1: cannot register " + "%s for %s\n", + nm, mdname(mddev)); } + if (rdev) + newmirrors[d2++].rdev = rdev; + } kfree(conf->mirrors); conf->mirrors = newmirrors; kfree(conf->poolinfo); conf->poolinfo = newpoolinfo; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded += (raid_disks - conf->raid_disks); + spin_unlock_irqrestore(&conf->device_lock, flags); conf->raid_disks = mddev->raid_disks = raid_disks; + mddev->delta_disks = 0; conf->last_used = 0; /* just make sure it is in-range */ lower_barrier(conf); @@ -2048,7 +2268,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) static void raid1_quiesce(mddev_t *mddev, int state) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; switch(state) { case 1: @@ -2076,7 +2296,8 @@ static struct mdk_personality raid1_personality = .spare_active = raid1_spare_active, .sync_request = sync_request, .resize = raid1_resize, - .reshape = raid1_reshape, + .size = raid1_size, + .check_reshape = raid1_reshape, .quiesce = raid1_quiesce, };