X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=drivers%2Fmd%2Fraid1.c;h=e59b10e66edb961c8a270b8ebd11830a62551ca3;hb=4434ade8c9334a3ab975d8993de456f06841899e;hp=5c7fef091cec800da3f3cb482f7e97f574cb43c6;hpb=d089c6af10c2be5988f03667d6d22fe6085fbe5e;p=safe%2Fjmp%2Flinux-2.6

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5c7fef0..e59b10e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -31,9 +31,13 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#include "dm-bio-list.h"
-#include <linux/raid/raid1.h>
-#include <linux/raid/bitmap.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid1.h"
+#include "bitmap.h"
 
 #define DEBUG 0
 #if DEBUG
@@ -61,7 +65,7 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 
 	/* allocate a r1bio with room for raid_disks entries in the bios array */
 	r1_bio = kzalloc(size, gfp_flags);
-	if (!r1_bio)
+	if (!r1_bio && pi->mddev)
 		unplug_slaves(pi->mddev);
 
 	return r1_bio;
@@ -119,6 +123,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 				goto out_free_pages;
 
 			bio->bi_io_vec[i].bv_page = page;
+			bio->bi_vcnt = i+1;
 		}
 	}
 	/* If not user-requested, copy the page pointers to all bios */
@@ -134,9 +139,9 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	return r1_bio;
 
 out_free_pages:
-	for (i=0; i < RESYNC_PAGES ; i++)
-		for (j=0 ; j < pi->raid_disks; j++)
-			safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
+	for (j=0 ; j < pi->raid_disks; j++)
+		for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
+			put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
 	j = -1;
 out_free_bio:
 	while ( ++j < pi->raid_disks )
@@ -178,7 +183,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
 
 static void free_r1bio(r1bio_t *r1_bio)
 {
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 
 	/*
 	 * Wake up any possible resync thread that waits for the device
@@ -192,7 +197,7 @@ static void free_r1bio(r1bio_t *r1_bio)
 
 static void put_buf(r1bio_t *r1_bio)
 {
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 	int i;
 
 	for (i=0; i<conf->raid_disks; i++) {
@@ -210,7 +215,7 @@ static void reschedule_retry(r1bio_t *r1_bio)
 {
 	unsigned long flags;
 	mddev_t *mddev = r1_bio->mddev;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r1_bio->retry_list, &conf->retry_list);
@@ -249,7 +254,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
  */
 static inline void update_head_pos(int disk, r1bio_t *r1_bio)
 {
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 
 	conf->mirrors[disk].head_position =
 		r1_bio->sector + (r1_bio->sectors);
@@ -260,7 +265,7 @@ static void raid1_end_read_request(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
 	int mirror;
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 
 	mirror = r1_bio->read_disk;
 	/*
@@ -305,7 +310,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
 	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 	struct bio *to_put = NULL;
 
 
@@ -537,7 +542,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 
 static void unplug_slaves(mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 
 	rcu_read_lock();
@@ -569,9 +574,12 @@ static void raid1_unplug(struct request_queue *q)
 static int raid1_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i, ret = 0;
 
+	if (mddev_congested(mddev, bits))
+		return 1;
+
 	rcu_read_lock();
 	for (i = 0; i < mddev->raid_disks; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
@@ -581,7 +589,7 @@ static int raid1_congested(void *data, int bits)
 			/* Note the '|| 1' - when read_balance prefers
 			 * non-congested targets, it can be removed
 			 */
-			if ((bits & (1<<BDI_write_congested)) || 1)
+			if ((bits & (1<<BDI_async_congested)) || 1)
 				ret |= bdi_congested(&q->backing_dev_info, bits);
 			else
 				ret &= bdi_congested(&q->backing_dev_info, bits);
@@ -592,6 +600,37 @@ static int raid1_congested(void *data, int bits)
 	return ret;
 }
 
+static int flush_pending_writes(conf_t *conf)
+{
+	/* Any writes that have been queued but are awaiting
+	 * bitmap updates get flushed here.
+	 * We return 1 if any requests were actually submitted.
+	 */
+	int rv = 0;
+
+	spin_lock_irq(&conf->device_lock);
+
+	if (conf->pending_bio_list.head) {
+		struct bio *bio;
+		bio = bio_list_get(&conf->pending_bio_list);
+		blk_remove_plug(conf->mddev->queue);
+		spin_unlock_irq(&conf->device_lock);
+		/* flush any pending bitmap writes to
+		 * disk before proceeding w/ I/O */
+		bitmap_unplug(conf->mddev->bitmap);
+
+		while (bio) { /* submit pending writes */
+			struct bio *next = bio->bi_next;
+			bio->bi_next = NULL;
+			generic_make_request(bio);
+			bio = next;
+		}
+		rv = 1;
+	} else
+		spin_unlock_irq(&conf->device_lock);
+	return rv;
+}
+
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
@@ -639,6 +678,7 @@ static void raise_barrier(conf_t *conf)
 static void lower_barrier(conf_t *conf)
 {
 	unsigned long flags;
+	BUG_ON(conf->barrier <= 0);
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->barrier--;
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
@@ -673,15 +713,23 @@ static void freeze_array(conf_t *conf)
 	/* stop syncio and normal IO and wait for everything to
 	 * go quite.
 	 * We increment barrier and nr_waiting, and then
-	 * wait until barrier+nr_pending match nr_queued+2
+	 * wait until nr_pending match nr_queued+1
+	 * This is called in the context of one normal IO request
+	 * that has failed. Thus any sync request that might be pending
+	 * will be blocked by nr_pending, and we need to wait for
+	 * pending IO requests to complete or be queued for re-try.
+	 * Thus the number queued (nr_queued) plus this request (1)
+	 * must match the number of pending IOs (nr_pending) before
+	 * we continue.
 	 */
 	spin_lock_irq(&conf->resync_lock);
 	conf->barrier++;
 	conf->nr_waiting++;
 	wait_event_lock_irq(conf->wait_barrier,
-			    conf->barrier+conf->nr_pending == conf->nr_queued+2,
+			    conf->nr_pending == conf->nr_queued+1,
 			    conf->resync_lock,
-			    raid1_unplug(conf->mddev->queue));
+			    ({ flush_pending_writes(conf);
+			       raid1_unplug(conf->mddev->queue); }));
 	spin_unlock_irq(&conf->resync_lock);
 }
 static void unfreeze_array(conf_t *conf)
@@ -729,19 +777,20 @@ do_sync_io:
 static int make_request(struct request_queue *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	mirror_info_t *mirror;
 	r1bio_t *r1_bio;
 	struct bio *read_bio;
 	int i, targets = 0, disks;
-	mdk_rdev_t *rdev;
-	struct bitmap *bitmap = mddev->bitmap;
+	struct bitmap *bitmap;
 	unsigned long flags;
 	struct bio_list bl;
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
-	const int do_sync = bio_sync(bio);
-	int do_barriers;
+	const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+	int cpu;
+	bool do_barriers;
+	mdk_rdev_t *blocked_rdev;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -754,7 +803,27 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 	md_write_start(mddev, bio); /* wait on superblock update early */
 
-	if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
+	if (bio_data_dir(bio) == WRITE &&
+	    bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
+	    bio->bi_sector < mddev->suspend_hi) {
+		/* As the suspend_* range is controlled by
+		 * userspace, we want an interruptible
+		 * wait.
+		 */
+		DEFINE_WAIT(w);
+		for (;;) {
+			flush_signals(current);
+			prepare_to_wait(&conf->wait_barrier,
+					&w, TASK_INTERRUPTIBLE);
+			if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
+			    bio->bi_sector >= mddev->suspend_hi)
+				break;
+			schedule();
+		}
+		finish_wait(&conf->wait_barrier, &w);
+	}
+	if (unlikely(!mddev->barriers_work &&
+		     bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		if (rw == WRITE)
 			md_write_end(mddev);
 		bio_endio(bio, -EOPNOTSUPP);
@@ -763,8 +832,13 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 	wait_barrier(conf);
 
-	disk_stat_inc(mddev->gendisk, ios[rw]);
-	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
+	bitmap = mddev->bitmap;
+
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
+		      bio_sectors(bio));
+	part_stat_unlock();
 
 	/*
 	 * make_request() can abort the operation when READA is being
@@ -801,7 +875,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
 		read_bio->bi_end_io = raid1_end_read_request;
-		read_bio->bi_rw = READ | do_sync;
+		read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
 		read_bio->bi_private = r1_bio;
 
 		generic_make_request(read_bio);
@@ -823,10 +897,17 @@ static int make_request(struct request_queue *q, struct bio * bio)
 		first = 0;
 	}
 #endif
+ retry_write:
+	blocked_rdev = NULL;
 	rcu_read_lock();
 	for (i = 0;  i < disks; i++) {
-		if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL &&
-		    !test_bit(Faulty, &rdev->flags)) {
+		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+			atomic_inc(&rdev->nr_pending);
+			blocked_rdev = rdev;
+			break;
+		}
+		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 			atomic_inc(&rdev->nr_pending);
 			if (test_bit(Faulty, &rdev->flags)) {
 				rdev_dec_pending(rdev, mddev);
@@ -839,6 +920,20 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	}
 	rcu_read_unlock();
 
+	if (unlikely(blocked_rdev)) {
+		/* Wait for this device to become unblocked */
+		int j;
+
+		for (j = 0; j < i; j++)
+			if (r1_bio->bios[j])
+				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
+
+		allow_barrier(conf);
+		md_wait_for_blocked_rdev(blocked_rdev, mddev);
+		wait_barrier(conf);
+		goto retry_write;
+	}
+
 	BUG_ON(targets == 0); /* we never fail the last device */
 
 	if (targets < conf->raid_disks) {
@@ -849,14 +944,15 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 	/* do behind I/O ? */
 	if (bitmap &&
-	    atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
+	    (atomic_read(&bitmap->behind_writes)
+	     < mddev->bitmap_info.max_write_behind) &&
 	    (behind_pages = alloc_behind_pages(bio)) != NULL)
 		set_bit(R1BIO_BehindIO, &r1_bio->state);
 
 	atomic_set(&r1_bio->remaining, 0);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
-	do_barriers = bio_barrier(bio);
+	do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER);
 	if (do_barriers)
 		set_bit(R1BIO_Barrier, &r1_bio->state);
 
@@ -872,7 +968,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
 		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = WRITE | do_barriers | do_sync;
+		mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) |
+			(do_sync << BIO_RW_SYNCIO);
 		mbio->bi_private = r1_bio;
 
 		if (behind_pages) {
@@ -907,6 +1004,9 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	blk_plug_device(mddev->queue);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
+	/* In case raid1d snuck into freeze_array */
+	wake_up(&conf->wait_barrier);
+
 	if (do_sync)
 		md_wakeup_thread(mddev->thread);
 #if 0
@@ -919,7 +1019,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 static void status(struct seq_file *seq, mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 
 	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
@@ -938,7 +1038,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	char b[BDEVNAME_SIZE];
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
@@ -947,12 +1047,16 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 	 * else mark the drive as failed
 	 */
 	if (test_bit(In_sync, &rdev->flags)
-	    && (conf->raid_disks - mddev->degraded) == 1)
+	    && (conf->raid_disks - mddev->degraded) == 1) {
 		/*
 		 * Don't fail the drive, act as though we were just a
-		 * normal single drive
+		 * normal single drive.
+		 * However don't try a recovery from this drive as
+		 * it is very likely to fail.
 		 */
+		mddev->recovery_disabled = 1;
 		return;
+	}
 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
 		unsigned long flags;
 		spin_lock_irqsave(&conf->device_lock, flags);
@@ -962,12 +1066,12 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 		/*
 		 * if recovery is running, make sure it aborts.
 		 */
-		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	} else
 		set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
-		"	Operation continuing on %d devices\n",
+	printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n"
+	       "raid1: Operation continuing on %d devices.\n",
 		bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
 }
 
@@ -1035,26 +1139,35 @@ static int raid1_spare_active(mddev_t *mddev)
 static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	conf_t *conf = mddev->private;
-	int found = 0;
+	int err = -EEXIST;
 	int mirror = 0;
 	mirror_info_t *p;
+	int first = 0;
+	int last = mddev->raid_disks - 1;
 
-	for (mirror=0; mirror < mddev->raid_disks; mirror++)
+	if (rdev->raid_disk >= 0)
+		first = last = rdev->raid_disk;
+
+	for (mirror = first; mirror <= last; mirror++)
 		if ( !(p=conf->mirrors+mirror)->rdev) {
 
-			blk_queue_stack_limits(mddev->queue,
-					       rdev->bdev->bd_disk->queue);
-			/* as we don't honour merge_bvec_fn, we must never risk
-			 * violating it, so limit ->max_sector to one PAGE, as
-			 * a one page request is never in violation.
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
+			/* as we don't honour merge_bvec_fn, we must
+			 * never risk violating it, so limit
+			 * ->max_segments to one lying with a single
+			 * page, as a one page request is never in
+			 * violation.
 			 */
-			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
+			if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+				blk_queue_max_segments(mddev->queue, 1);
+				blk_queue_segment_boundary(mddev->queue,
+							   PAGE_CACHE_SIZE - 1);
+			}
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
-			found = 1;
+			err = 0;
 			/* As all devices are equivalent, we don't need a full recovery
 			 * if this was recently any drive of the array
 			 */
@@ -1063,9 +1176,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			rcu_assign_pointer(p->rdev, rdev);
 			break;
 		}
-
+	md_integrity_add_rdev(rdev, mddev);
 	print_conf(conf);
-	return found;
+	return err;
 }
 
 static int raid1_remove_disk(mddev_t *mddev, int number)
@@ -1083,13 +1196,23 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
 			err = -EBUSY;
 			goto abort;
 		}
+		/* Only remove non-faulty devices if recovery
+		 * is not possible.
+		 */
+		if (!test_bit(Faulty, &rdev->flags) &&
+		    mddev->degraded < conf->raid_disks) {
+			err = -EBUSY;
+			goto abort;
+		}
 		p->rdev = NULL;
 		synchronize_rcu();
 		if (atomic_read(&rdev->nr_pending)) {
 			/* lost the race, try later */
 			err = -EBUSY;
 			p->rdev = rdev;
+			goto abort;
 		}
+		md_integrity_register(mddev);
 	}
 abort:
@@ -1125,7 +1248,7 @@ static void end_sync_write(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
 	mddev_t *mddev = r1_bio->mddev;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 	int mirror=0;
@@ -1151,14 +1274,15 @@ static void end_sync_write(struct bio *bio, int error)
 	update_head_pos(mirror, r1_bio);
 
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
-		md_done_sync(mddev, r1_bio->sectors, uptodate);
+		sector_t s = r1_bio->sectors;
 		put_buf(r1_bio);
+		md_done_sync(mddev, s, uptodate);
 	}
 }
 
 static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 	int disks = conf->raid_disks;
 	struct bio *bio, *wbio;
@@ -1219,23 +1343,31 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 			rdev_dec_pending(conf->mirrors[i].rdev, mddev);
 		} else {
 			/* fixup the bio for reuse */
+			int size;
 			sbio->bi_vcnt = vcnt;
 			sbio->bi_size = r1_bio->sectors << 9;
 			sbio->bi_idx = 0;
 			sbio->bi_phys_segments = 0;
-			sbio->bi_hw_segments = 0;
-			sbio->bi_hw_front_size = 0;
-			sbio->bi_hw_back_size = 0;
 			sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
 			sbio->bi_flags |= 1 << BIO_UPTODATE;
 			sbio->bi_next = NULL;
 			sbio->bi_sector = r1_bio->sector +
 				conf->mirrors[i].rdev->data_offset;
 			sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
-			for (j = 0; j < vcnt ; j++)
-				memcpy(page_address(sbio->bi_io_vec[j].bv_page),
+			size = sbio->bi_size;
+			for (j = 0; j < vcnt ; j++) {
+				struct bio_vec *bi;
+				bi = &sbio->bi_io_vec[j];
+				bi->bv_offset = 0;
+				if (size > PAGE_SIZE)
+					bi->bv_len = PAGE_SIZE;
+				else
+					bi->bv_len = size;
+				size -= PAGE_SIZE;
+				memcpy(page_address(bi->bv_page),
 				       page_address(pbio->bi_io_vec[j].bv_page),
 				       PAGE_SIZE);
+			}
 		}
 	}
@@ -1464,7 +1596,7 @@ static void raid1d(mddev_t *mddev)
 	r1bio_t *r1_bio;
 	struct bio *bio;
 	unsigned long flags;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 	int unplug=0;
 	mdk_rdev_t *rdev;
@@ -1473,35 +1605,21 @@ static void raid1d(mddev_t *mddev)
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 
-		spin_lock_irqsave(&conf->device_lock, flags);
-
-		if (conf->pending_bio_list.head) {
-			bio = bio_list_get(&conf->pending_bio_list);
-			blk_remove_plug(mddev->queue);
-			spin_unlock_irqrestore(&conf->device_lock, flags);
-			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
-			bitmap_unplug(mddev->bitmap);
-
-			while (bio) { /* submit pending writes */
-				struct bio *next = bio->bi_next;
-				bio->bi_next = NULL;
-				generic_make_request(bio);
-				bio = next;
-			}
-			unplug = 1;
-
-			continue;
-		}
+		unplug += flush_pending_writes(conf);
 
-		if (list_empty(head))
+		spin_lock_irqsave(&conf->device_lock, flags);
+		if (list_empty(head)) {
+			spin_unlock_irqrestore(&conf->device_lock, flags);
 			break;
+		}
 		r1_bio = list_entry(head->prev, r1bio_t, retry_list);
 		list_del(head->prev);
 		conf->nr_queued--;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r1_bio->mddev;
-		conf = mddev_to_conf(mddev);
+		conf = mddev->private;
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
 			unplug = 1;
@@ -1514,7 +1632,7 @@ static void raid1d(mddev_t *mddev)
 			 * We already have a nr_pending reference on these rdevs.
 			 */
 			int i;
-			const int do_sync = bio_sync(r1_bio->master_bio);
+			const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
 			clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
 			clear_bit(R1BIO_Barrier, &r1_bio->state);
 			for (i=0; i < conf->raid_disks; i++)
@@ -1535,7 +1653,8 @@ static void raid1d(mddev_t *mddev)
 					conf->mirrors[i].rdev->data_offset;
 				bio->bi_bdev = conf->mirrors[i].rdev->bdev;
 				bio->bi_end_io = raid1_end_write_request;
-				bio->bi_rw = WRITE | do_sync;
+				bio->bi_rw = WRITE |
+					(do_sync << BIO_RW_SYNCIO);
 				bio->bi_private = r1_bio;
 				r1_bio->bios[i] = bio;
 				generic_make_request(bio);
@@ -1557,7 +1676,9 @@ static void raid1d(mddev_t *mddev)
 					   r1_bio->sector,
 					   r1_bio->sectors);
 				unfreeze_array(conf);
-			}
+			} else
+				md_error(mddev,
+					 conf->mirrors[r1_bio->read_disk].rdev);
 
 			bio = r1_bio->bios[r1_bio->read_disk];
 			if ((disk=read_balance(conf, r1_bio)) == -1) {
@@ -1567,7 +1688,7 @@ static void raid1d(mddev_t *mddev)
 				       (unsigned long long)r1_bio->sector);
 				raid_end_bio_io(r1_bio);
 			} else {
-				const int do_sync = bio_sync(r1_bio->master_bio);
+				const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
 				r1_bio->bios[r1_bio->read_disk] =
 					mddev->ro ? IO_BLOCKED : NULL;
 				r1_bio->read_disk = disk;
@@ -1583,14 +1704,14 @@ static void raid1d(mddev_t *mddev)
 				bio->bi_sector = r1_bio->sector + rdev->data_offset;
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_end_io = raid1_end_read_request;
-				bio->bi_rw = READ | do_sync;
+				bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
 				bio->bi_private = r1_bio;
 				unplug = 1;
 				generic_make_request(bio);
 			}
 		}
+		cond_resched();
 	}
-	spin_unlock_irqrestore(&conf->device_lock, flags);
 	if (unplug)
 		unplug_slaves(mddev);
 }
@@ -1622,7 +1743,7 @@ static int init_resync(conf_t *conf)
 
 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	r1bio_t *r1_bio;
 	struct bio *bio;
 	sector_t max_sector, nr_sectors;
@@ -1642,7 +1763,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		return 0;
 	}
 
-	max_sector = mddev->size << 1;
+	max_sector = mddev->dev_sectors;
 	if (sector_nr >= max_sector) {
 		/* If we aborted, we need to abort the
 		 * sync on the 'current' bitmap chunk (there will
@@ -1716,7 +1837,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		bio->bi_vcnt = 0;
 		bio->bi_idx = 0;
 		bio->bi_phys_segments = 0;
-		bio->bi_hw_segments = 0;
 		bio->bi_size = 0;
 		bio->bi_end_io = NULL;
 		bio->bi_private = NULL;
@@ -1839,56 +1959,50 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	return nr_sectors;
 }
 
-static int run(mddev_t *mddev)
+static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	if (sectors)
+		return sectors;
+
+	return mddev->dev_sectors;
+}
+
+static conf_t *setup_conf(mddev_t *mddev)
 {
 	conf_t *conf;
-	int i, j, disk_idx;
+	int i;
 	mirror_info_t *disk;
 	mdk_rdev_t *rdev;
-	struct list_head *tmp;
+	int err = -ENOMEM;
 
-	if (mddev->level != 1) {
-		printk("raid1: %s: raid level not set to mirroring (%d)\n",
-		       mdname(mddev), mddev->level);
-		goto out;
-	}
-	if (mddev->reshape_position != MaxSector) {
-		printk("raid1: %s: reshape_position set but not supported\n",
-		       mdname(mddev));
-		goto out;
-	}
-	/*
-	 * copy the already verified devices into our private RAID1
-	 * bookkeeping area. [whatever we allocate in run(),
-	 * should be freed in stop()]
-	 */
 	conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
-	mddev->private = conf;
 	if (!conf)
-		goto out_no_mem;
+		goto abort;
 
 	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
 				 GFP_KERNEL);
 	if (!conf->mirrors)
-		goto out_no_mem;
+		goto abort;
 
 	conf->tmppage = alloc_page(GFP_KERNEL);
 	if (!conf->tmppage)
-		goto out_no_mem;
+		goto abort;
 
-	conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
+	conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
 	if (!conf->poolinfo)
-		goto out_no_mem;
-	conf->poolinfo->mddev = mddev;
+		goto abort;
 	conf->poolinfo->raid_disks = mddev->raid_disks;
 	conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
 					  r1bio_pool_free,
 					  conf->poolinfo);
 	if (!conf->r1bio_pool)
-		goto out_no_mem;
+		goto abort;
 
-	rdev_for_each(rdev, tmp, mddev) {
-		disk_idx = rdev->raid_disk;
+	conf->poolinfo->mddev = mddev;
+
+	spin_lock_init(&conf->device_lock);
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		int disk_idx = rdev->raid_disk;
 		if (disk_idx >= mddev->raid_disks
 		    || disk_idx < 0)
 			continue;
@@ -1896,21 +2010,10 @@ static int run(mddev_t *mddev)
 		disk = conf->mirrors + disk_idx;
 
 		disk->rdev = rdev;
-
-		blk_queue_stack_limits(mddev->queue,
-				       rdev->bdev->bd_disk->queue);
-		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit ->max_sector to one PAGE, as
-		 * a one page request is never in violation.
-		 */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->head_position = 0;
 	}
 	conf->raid_disks = mddev->raid_disks;
 	conf->mddev = mddev;
-	spin_lock_init(&conf->device_lock);
 	INIT_LIST_HEAD(&conf->retry_list);
 
 	spin_lock_init(&conf->resync_lock);
@@ -1919,8 +2022,7 @@ static int run(mddev_t *mddev)
 
 	bio_list_init(&conf->pending_bio_list);
 	bio_list_init(&conf->flushing_bio_list);
-
-	mddev->degraded = 0;
+	conf->last_used = -1;
 	for (i = 0; i < conf->raid_disks; i++) {
 
 		disk = conf->mirrors + i;
@@ -1928,74 +2030,127 @@ static int run(mddev_t *mddev)
 		if (!disk->rdev ||
 		    !test_bit(In_sync, &disk->rdev->flags)) {
 			disk->head_position = 0;
-			mddev->degraded++;
 			if (disk->rdev)
 				conf->fullsync = 1;
-		}
+		} else if (conf->last_used < 0)
+			/*
+			 * The first working device is used as a
+			 * starting point to read balancing.
+			 */
+			conf->last_used = i;
 	}
-	if (mddev->degraded == conf->raid_disks) {
+
+	err = -EIO;
+	if (conf->last_used < 0) {
 		printk(KERN_ERR "raid1: no operational mirrors for %s\n",
-			mdname(mddev));
-		goto out_free_conf;
+		       mdname(mddev));
+		goto abort;
 	}
-	if (conf->raid_disks - mddev->degraded == 1)
-		mddev->recovery_cp = MaxSector;
+	err = -ENOMEM;
+	conf->thread = md_register_thread(raid1d, mddev, NULL);
+	if (!conf->thread) {
+		printk(KERN_ERR
+		       "raid1: couldn't allocate thread for %s\n",
+		       mdname(mddev));
+		goto abort;
+	}
+
+	return conf;
+
+ abort:
+	if (conf) {
+		if (conf->r1bio_pool)
+			mempool_destroy(conf->r1bio_pool);
+		kfree(conf->mirrors);
+		safe_put_page(conf->tmppage);
+		kfree(conf->poolinfo);
+		kfree(conf);
+	}
+	return ERR_PTR(err);
+}
+
+static int run(mddev_t *mddev)
+{
+	conf_t *conf;
+	int i;
+	mdk_rdev_t *rdev;
+
+	if (mddev->level != 1) {
+		printk("raid1: %s: raid level not set to mirroring (%d)\n",
+		       mdname(mddev), mddev->level);
+		return -EIO;
+	}
+	if (mddev->reshape_position != MaxSector) {
+		printk("raid1: %s: reshape_position set but not supported\n",
+		       mdname(mddev));
+		return -EIO;
+	}
 	/*
-	 * find the first working one and use it as a starting point
-	 * to read balancing.
+	 * copy the already verified devices into our private RAID1
+	 * bookkeeping area. [whatever we allocate in run(),
+	 * should be freed in stop()]
 	 */
-	for (j = 0; j < conf->raid_disks &&
-		     (!conf->mirrors[j].rdev ||
-		      !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++)
-		/* nothing */;
-	conf->last_used = j;
+	if (mddev->private == NULL)
+		conf = setup_conf(mddev);
+	else
+		conf = mddev->private;
 
-	mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
-	if (!mddev->thread) {
-		printk(KERN_ERR
-		       "raid1: couldn't allocate thread for %s\n",
-		       mdname(mddev));
-		goto out_free_conf;
+	if (IS_ERR(conf))
+		return PTR_ERR(conf);
+
+	mddev->queue->queue_lock = &conf->device_lock;
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		disk_stack_limits(mddev->gendisk, rdev->bdev,
+				  rdev->data_offset << 9);
+		/* as we don't honour merge_bvec_fn, we must never risk
+		 * violating it, so limit ->max_segments to 1 lying within
+		 * a single page, as a one page request is never in violation.
+		 */
+		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+			blk_queue_max_segments(mddev->queue, 1);
+			blk_queue_segment_boundary(mddev->queue,
+						   PAGE_CACHE_SIZE - 1);
+		}
 	}
+
+	mddev->degraded = 0;
+	for (i=0; i < conf->raid_disks; i++)
+		if (conf->mirrors[i].rdev == NULL ||
+		    !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
+		    test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+			mddev->degraded++;
+
+	if (conf->raid_disks - mddev->degraded == 1)
+		mddev->recovery_cp = MaxSector;
+
+	if (mddev->recovery_cp != MaxSector)
+		printk(KERN_NOTICE "raid1: %s is not clean"
+		       " -- starting background reconstruction\n",
+		       mdname(mddev));
 	printk(KERN_INFO
 		"raid1: raid set %s active with %d out of %d mirrors\n",
 		mdname(mddev), mddev->raid_disks - mddev->degraded,
 		mddev->raid_disks);
+
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_size = mddev->size;
+	mddev->thread = conf->thread;
+	conf->thread = NULL;
+	mddev->private = conf;
+
+	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
 
 	mddev->queue->unplug_fn = raid1_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-
+	md_integrity_register(mddev);
 	return 0;
-
-out_no_mem:
-	printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
-	       mdname(mddev));
-
-out_free_conf:
-	if (conf) {
-		if (conf->r1bio_pool)
-			mempool_destroy(conf->r1bio_pool);
-		kfree(conf->mirrors);
-		safe_put_page(conf->tmppage);
-		kfree(conf->poolinfo);
-		kfree(conf);
-		mddev->private = NULL;
-	}
-out:
-	return -EIO;
 }
 
 static int stop(mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	struct bitmap *bitmap = mddev->bitmap;
 	int behind_wait = 0;
@@ -2008,6 +2163,9 @@ static int stop(mddev_t *mddev)
 		/* need to kick something here to make sure I/O goes? */
 	}
 
+	raise_barrier(conf);
+	lower_barrier(conf);
+
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2029,14 +2187,18 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
-	mddev->array_size = sectors>>1;
-	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
+	if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
+		return -EINVAL;
+	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev->changed = 1;
-	if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
-		mddev->recovery_cp = mddev->size << 1;
+	revalidate_disk(mddev->gendisk);
+	if (sectors > mddev->dev_sectors &&
+	    mddev->recovery_cp == MaxSector) {
+		mddev->recovery_cp = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
-	mddev->size = mddev->array_size;
+	mddev->dev_sectors = sectors;
 	mddev->resync_max_sectors = sectors;
 	return 0;
 }
@@ -2057,22 +2219,24 @@ static int raid1_reshape(mddev_t *mddev)
 	mempool_t *newpool, *oldpool;
 	struct pool_info *newpoolinfo;
 	mirror_info_t *newmirrors;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
-	int d, d2;
+	int d, d2, err;
 
 	/* Cannot change chunk_size, layout, or level */
-	if (mddev->chunk_size != mddev->new_chunk ||
+	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
 	    mddev->layout != mddev->new_layout ||
 	    mddev->level != mddev->new_level) {
-		mddev->new_chunk = mddev->chunk_size;
+		mddev->new_chunk_sectors = mddev->chunk_sectors;
 		mddev->new_layout = mddev->layout;
 		mddev->new_level = mddev->level;
 		return -EINVAL;
 	}
 
-	md_allow_write(mddev);
+	err = md_allow_write(mddev);
+	if (err)
+		return err;
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
 
@@ -2152,9 +2316,12 @@ static int raid1_reshape(mddev_t *mddev)
 
 static void raid1_quiesce(mddev_t *mddev, int state)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	switch(state) {
+	case 2: /* wake for suspend */
+		wake_up(&conf->wait_barrier);
+		break;
 	case 1:
 		raise_barrier(conf);
 		break;
@@ -2164,6 +2331,23 @@ static void raid1_quiesce(mddev_t *mddev, int state)
 	}
 }
 
+static void *raid1_takeover(mddev_t *mddev)
+{
+	/* raid1 can take over:
+	 *  raid5 with 2 devices, any layout or chunk size
+	 */
+	if (mddev->level == 5 && mddev->raid_disks == 2) {
+		conf_t *conf;
+		mddev->new_level = 1;
+		mddev->new_layout = 0;
+		mddev->new_chunk_sectors = 0;
+		conf = setup_conf(mddev);
+		if (!IS_ERR(conf))
+			conf->barrier = 1;
+		return conf;
+	}
+	return ERR_PTR(-EINVAL);
+}
 
 static struct mdk_personality raid1_personality =
 {
@@ -2180,8 +2364,10 @@ static struct mdk_personality raid1_personality =
 	.spare_active	= raid1_spare_active,
 	.sync_request	= sync_request,
 	.resize		= raid1_resize,
+	.size		= raid1_size,
 	.check_reshape	= raid1_reshape,
 	.quiesce	= raid1_quiesce,
+	.takeover	= raid1_takeover,
 };
 
 static int __init raid_init(void)
@@ -2197,6 +2383,7 @@ static void raid_exit(void)
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
 MODULE_ALIAS("md-personality-3"); /* RAID1 */
 MODULE_ALIAS("md-raid1");
 MODULE_ALIAS("md-level-1");