X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=drivers%2Fmd%2Fraid1.c;h=7b4f5f7155d8726705c1f42757f68dd1de869624;hb=97a39896816489fe9a67c223e782e8dda06f25c9;hp=b3cfae41f7690d0cc8e09ba25896ebffc3b4e5ef;hpb=5fd6c1dce06ec24ef3de20fe0c7ecf2ba9fe5ef9;p=safe%2Fjmp%2Flinux-2.6 diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b3cfae4..7b4f5f7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -9,7 +9,7 @@ * * Better read-balancing code written by Mika Kuoppala , 2000 * - * Fixes to reconstruction by Jakob Østergaard" + * Fixes to reconstruction by Jakob Østergaard" * Various fixes by Neil Brown * * Changes by Peter T. Breuer 31/1/2003 to support @@ -32,6 +32,7 @@ */ #include "dm-bio-list.h" +#include #include #include @@ -238,7 +239,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio) (unsigned long long) bio->bi_sector + (bio->bi_size >> 9) - 1); - bio_endio(bio, bio->bi_size, + bio_endio(bio, test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); } free_r1bio(r1_bio); @@ -255,37 +256,38 @@ static inline void update_head_pos(int disk, r1bio_t *r1_bio) r1_bio->sector + (r1_bio->sectors); } -static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error) +static void raid1_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); int mirror; conf_t *conf = mddev_to_conf(r1_bio->mddev); - if (bio->bi_size) - return 1; - mirror = r1_bio->read_disk; /* * this branch is our 'one mirror IO has finished' event handler: */ update_head_pos(mirror, r1_bio); - if (uptodate || conf->working_disks <= 1) { - /* - * Set R1BIO_Uptodate in our master bio, so that - * we will return a good error code for to the higher - * levels even if IO on some other mirrored buffer fails. - * - * The 'master' represents the composite IO operation to - * user-side. So if something waits for IO, then it will - * wait for the 'master' bio. + if (uptodate) + set_bit(R1BIO_Uptodate, &r1_bio->state); + else { + /* If all other devices have failed, we want to return + * the error upwards rather than fail the last device. + * Here we redefine "uptodate" to mean "Don't want to retry" */ - if (uptodate) - set_bit(R1BIO_Uptodate, &r1_bio->state); + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + if (r1_bio->mddev->degraded == conf->raid_disks || + (r1_bio->mddev->degraded == conf->raid_disks-1 && + !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))) + uptodate = 1; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + if (uptodate) raid_end_bio_io(r1_bio); - } else { + else { /* * oops, read error: */ @@ -297,10 +299,9 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int } rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); - return 0; } -static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error) +static void raid1_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); @@ -308,8 +309,6 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int conf_t *conf = mddev_to_conf(r1_bio->mddev); struct bio *to_put = NULL; - if (bio->bi_size) - return 1; for (mirror = 0; mirror < conf->raid_disks; mirror++) if (r1_bio->bios[mirror] == bio) @@ -362,7 +361,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int (unsigned long long) mbio->bi_sector, (unsigned long long) mbio->bi_sector + (mbio->bi_size >> 9) - 1); - bio_endio(mbio, mbio->bi_size, 0); + bio_endio(mbio, 0); } } } @@ -396,8 +395,6 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int if (to_put) bio_put(to_put); - - return 0; } @@ -548,13 +545,12 @@ static void unplug_slaves(mddev_t *mddev) for (i=0; iraid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { - request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + struct request_queue *r_queue = bdev_get_queue(rdev->bdev); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - if (r_queue->unplug_fn) - r_queue->unplug_fn(r_queue); + blk_unplug(r_queue); rdev_dec_pending(rdev, mddev); rcu_read_lock(); @@ -563,7 +559,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_unlock(); } -static void raid1_unplug(request_queue_t *q) +static void raid1_unplug(struct request_queue *q) { mddev_t *mddev = q->queuedata; @@ -571,36 +567,63 @@ static void raid1_unplug(request_queue_t *q) md_wakeup_thread(mddev->thread); } -static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, - sector_t *error_sector) +static int raid1_congested(void *data, int bits) { - mddev_t *mddev = q->queuedata; + mddev_t *mddev = data; conf_t *conf = mddev_to_conf(mddev); int i, ret = 0; rcu_read_lock(); - for (i=0; iraid_disks && ret == 0; i++) { + for (i = 0; i < mddev->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct block_device *bdev = rdev->bdev; - request_queue_t *r_queue = bdev_get_queue(bdev); + struct request_queue *q = bdev_get_queue(rdev->bdev); - if (!r_queue->issue_flush_fn) - ret = -EOPNOTSUPP; - else { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, - error_sector); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } + /* Note the '|| 1' - when read_balance prefers + * non-congested targets, it can be removed + */ + if ((bits & (1<backing_dev_info, bits); + else + ret &= bdi_congested(&q->backing_dev_info, bits); } } rcu_read_unlock(); return ret; } + +static int flush_pending_writes(conf_t *conf) +{ + /* Any writes that have been queued but are awaiting + * bitmap updates get flushed here. + * We return 1 if any requests were actually submitted. + */ + int rv = 0; + + spin_lock_irq(&conf->device_lock); + + if (conf->pending_bio_list.head) { + struct bio *bio; + bio = bio_list_get(&conf->pending_bio_list); + blk_remove_plug(conf->mddev->queue); + spin_unlock_irq(&conf->device_lock); + /* flush any pending bitmap writes to + * disk before proceeding w/ I/O */ + bitmap_unplug(conf->mddev->bitmap); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + rv = 1; + } else + spin_unlock_irq(&conf->device_lock); + return rv; +} + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -682,15 +705,23 @@ static void freeze_array(conf_t *conf) /* stop syncio and normal IO and wait for everything to * go quite. * We increment barrier and nr_waiting, and then - * wait until barrier+nr_pending match nr_queued+2 + * wait until nr_pending match nr_queued+1 + * This is called in the context of one normal IO request + * that has failed. Thus any sync request that might be pending + * will be blocked by nr_pending, and we need to wait for + * pending IO requests to complete or be queued for re-try. + * Thus the number queued (nr_queued) plus this request (1) + * must match the number of pending IOs (nr_pending) before + * we continue. */ spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, - conf->barrier+conf->nr_pending == conf->nr_queued+2, + conf->nr_pending == conf->nr_queued+1, conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + ({ flush_pending_writes(conf); + raid1_unplug(conf->mddev->queue); })); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(conf_t *conf) @@ -735,7 +766,7 @@ do_sync_io: return NULL; } -static int make_request(request_queue_t *q, struct bio * bio) +static int make_request(struct request_queue *q, struct bio * bio) { mddev_t *mddev = q->queuedata; conf_t *conf = mddev_to_conf(mddev); @@ -743,13 +774,14 @@ static int make_request(request_queue_t *q, struct bio * bio) r1bio_t *r1_bio; struct bio *read_bio; int i, targets = 0, disks; - mdk_rdev_t *rdev; - struct bitmap *bitmap = mddev->bitmap; + struct bitmap *bitmap; unsigned long flags; struct bio_list bl; struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); - int do_barriers; + const int do_sync = bio_sync(bio); + int cpu, do_barriers; + mdk_rdev_t *blocked_rdev; /* * Register the new request and wait if the reconstruction @@ -765,14 +797,19 @@ static int make_request(request_queue_t *q, struct bio * bio) if (unlikely(!mddev->barriers_work && bio_barrier(bio))) { if (rw == WRITE) md_write_end(mddev); - bio_endio(bio, bio->bi_size, -EOPNOTSUPP); + bio_endio(bio, -EOPNOTSUPP); return 0; } wait_barrier(conf); - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); + bitmap = mddev->bitmap; + + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bio)); + part_stat_unlock(); /* * make_request() can abort the operation when READA is being @@ -809,7 +846,7 @@ static int make_request(request_queue_t *q, struct bio * bio) read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid1_end_read_request; - read_bio->bi_rw = READ; + read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r1_bio; generic_make_request(read_bio); @@ -831,10 +868,17 @@ static int make_request(request_queue_t *q, struct bio * bio) first = 0; } #endif + retry_write: + blocked_rdev = NULL; rcu_read_lock(); for (i = 0; i < disks; i++) { - if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL && - !test_bit(Faulty, &rdev->flags)) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); if (test_bit(Faulty, &rdev->flags)) { rdev_dec_pending(rdev, mddev); @@ -847,6 +891,20 @@ static int make_request(request_queue_t *q, struct bio * bio) } rcu_read_unlock(); + if (unlikely(blocked_rdev)) { + /* Wait for this device to become unblocked */ + int j; + + for (j = 0; j < i; j++) + if (r1_bio->bios[j]) + rdev_dec_pending(conf->mirrors[j].rdev, mddev); + + allow_barrier(conf); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_write; + } + BUG_ON(targets == 0); /* we never fail the last device */ if (targets < conf->raid_disks) { @@ -880,7 +938,7 @@ static int make_request(request_queue_t *q, struct bio * bio) mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_barriers; + mbio->bi_rw = WRITE | do_barriers | do_sync; mbio->bi_private = r1_bio; if (behind_pages) { @@ -915,6 +973,11 @@ static int make_request(request_queue_t *q, struct bio * bio) blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); + /* In case raid1d snuck into freeze_array */ + wake_up(&conf->wait_barrier); + + if (do_sync) + md_wakeup_thread(mddev->thread); #if 0 while ((bio = bio_list_pop(&bl)) != NULL) generic_make_request(bio); @@ -929,11 +992,14 @@ static void status(struct seq_file *seq, mddev_t *mddev) int i; seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); - for (i = 0; i < conf->raid_disks; i++) + conf->raid_disks - mddev->degraded); + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); seq_printf(seq, "%s", - conf->mirrors[i].rdev && - test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); + rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); seq_printf(seq, "]"); } @@ -950,49 +1016,57 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && conf->working_disks == 1) + && (conf->raid_disks - mddev->degraded) == 1) { /* * Don't fail the drive, act as though we were just a - * normal single drive + * normal single drive. + * However don't try a recovery from this drive as + * it is very likely to fail. */ + mddev->recovery_disabled = 1; return; - if (test_bit(In_sync, &rdev->flags)) { + } + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->working_disks--; + set_bit(Faulty, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); - } - clear_bit(In_sync, &rdev->flags); - set_bit(Faulty, &rdev->flags); - mddev->sb_dirty = 1; - printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" - " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + } else + set_bit(Faulty, &rdev->flags); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" + "raid1: Operation continuing on %d devices.\n", + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) { int i; - mirror_info_t *tmp; printk("RAID1 conf printout:\n"); if (!conf) { printk("(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); + rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; - tmp = conf->mirrors + i; - if (tmp->rdev) + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev) printk(" disk %d, wo:%d, o:%d, dev:%s\n", - i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); + i, !test_bit(In_sync, &rdev->flags), + !test_bit(Faulty, &rdev->flags), + bdevname(rdev->bdev,b)); } + rcu_read_unlock(); } static void close_sync(conf_t *conf) @@ -1008,20 +1082,21 @@ static int raid1_spare_active(mddev_t *mddev) { int i; conf_t *conf = mddev->private; - mirror_info_t *tmp; /* * Find all failed disks within the RAID1 configuration - * and mark them readable + * and mark them readable. + * Called under mddev lock, so rcu protection not needed. */ for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->mirrors + i; - if (tmp->rdev - && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { - conf->working_disks++; + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_and_set_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - set_bit(In_sync, &tmp->rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } @@ -1033,11 +1108,16 @@ static int raid1_spare_active(mddev_t *mddev) static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int mirror = 0; mirror_info_t *p; + int first = 0; + int last = mddev->raid_disks - 1; - for (mirror=0; mirror < mddev->raid_disks; mirror++) + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; + + for (mirror = first; mirror <= last; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { blk_queue_stack_limits(mddev->queue, @@ -1052,7 +1132,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) p->head_position = 0; rdev->raid_disk = mirror; - found = 1; + err = 0; /* As all devices are equivalent, we don't need a full recovery * if this was recently any drive of the array */ @@ -1063,7 +1143,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) } print_conf(conf); - return found; + return err; } static int raid1_remove_disk(mddev_t *mddev, int number) @@ -1081,6 +1161,14 @@ static int raid1_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove non-faulty devices is recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->degraded < conf->raid_disks) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { @@ -1096,14 +1184,11 @@ abort: } -static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) +static void end_sync_read(struct bio *bio, int error) { r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); int i; - if (bio->bi_size) - return 1; - for (i=r1_bio->mddev->raid_disks; i--; ) if (r1_bio->bios[i] == bio) break; @@ -1119,10 +1204,9 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) if (atomic_dec_and_test(&r1_bio->remaining)) reschedule_retry(r1_bio); - return 0; } -static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) +static void end_sync_write(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); @@ -1131,9 +1215,6 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) int i; int mirror=0; - if (bio->bi_size) - return 1; - for (i = 0; i < conf->raid_disks; i++) if (r1_bio->bios[i] == bio) { mirror = i; @@ -1145,7 +1226,7 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) long sectors_to_go = r1_bio->sectors; /* make sure these bits doesn't get cleared. */ do { - bitmap_end_sync(mddev->bitmap, r1_bio->sector, + bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1); s += sync_blocks; sectors_to_go -= sync_blocks; @@ -1159,7 +1240,6 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) md_done_sync(mddev, r1_bio->sectors, uptodate); put_buf(r1_bio); } - return 0; } static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) @@ -1199,37 +1279,58 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) } r1_bio->read_disk = primary; for (i=0; iraid_disks; i++) - if (r1_bio->bios[i]->bi_end_io == end_sync_read && - test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) { + if (r1_bio->bios[i]->bi_end_io == end_sync_read) { int j; int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); struct bio *pbio = r1_bio->bios[primary]; struct bio *sbio = r1_bio->bios[i]; - for (j = vcnt; j-- ; ) - if (memcmp(page_address(pbio->bi_io_vec[j].bv_page), - page_address(sbio->bi_io_vec[j].bv_page), - PAGE_SIZE)) - break; + + if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { + for (j = vcnt; j-- ; ) { + struct page *p, *s; + p = pbio->bi_io_vec[j].bv_page; + s = sbio->bi_io_vec[j].bv_page; + if (memcmp(page_address(p), + page_address(s), + PAGE_SIZE)) + break; + } + } else + j = 0; if (j >= 0) mddev->resync_mismatches += r1_bio->sectors; - if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { + if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) + && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { sbio->bi_end_io = NULL; rdev_dec_pending(conf->mirrors[i].rdev, mddev); } else { /* fixup the bio for reuse */ + int size; sbio->bi_vcnt = vcnt; sbio->bi_size = r1_bio->sectors << 9; sbio->bi_idx = 0; sbio->bi_phys_segments = 0; - sbio->bi_hw_segments = 0; - sbio->bi_hw_front_size = 0; - sbio->bi_hw_back_size = 0; sbio->bi_flags &= ~(BIO_POOL_MASK - 1); sbio->bi_flags |= 1 << BIO_UPTODATE; sbio->bi_next = NULL; sbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; sbio->bi_bdev = conf->mirrors[i].rdev->bdev; + size = sbio->bi_size; + for (j = 0; j < vcnt ; j++) { + struct bio_vec *bi; + bi = &sbio->bi_io_vec[j]; + bi->bv_offset = 0; + if (size > PAGE_SIZE) + bi->bv_len = PAGE_SIZE; + else + bi->bv_len = size; + size -= PAGE_SIZE; + memcpy(page_address(bi->bv_page), + page_address(pbio->bi_io_vec[j].bv_page), + PAGE_SIZE); + } + } } } @@ -1237,7 +1338,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) /* ouch - failed to read all of that. * Try some synchronous reads of other devices to get * good data, much like with normal read errors. Only - * read into the pages we already have so they we don't + * read into the pages we already have so we don't * need to re-issue the read request. * We don't need to freeze the array, because being in an * active sync request, there is no normal IO, and @@ -1257,6 +1358,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) s = PAGE_SIZE >> 9; do { if (r1_bio->bios[d]->bi_end_io == end_sync_read) { + /* No rcu protection needed here devices + * can only be removed when no resync is + * active, and resync is currently active + */ rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev->bdev, sect + rdev->data_offset, @@ -1359,6 +1464,95 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) * 3. Performs writes following reads for array syncronising. */ +static void fix_read_error(conf_t *conf, int read_disk, + sector_t sect, int sectors) +{ + mddev_t *mddev = conf->mddev; + while(sectors) { + int s = sectors; + int d = read_disk; + int success = 0; + int start; + mdk_rdev_t *rdev; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + do { + /* Note: no rcu protection needed here + * as this is synchronous in the raid1d thread + * which is the thread that might remove + * a device. If raid1d ever becomes multi-threaded.... + */ + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags) && + sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, + conf->tmppage, READ)) + success = 1; + else { + d++; + if (d == conf->raid_disks) + d = 0; + } + } while (!success && d != read_disk); + + if (!success) { + /* Cannot read from anywhere -- bye bye array */ + md_error(mddev, conf->mirrors[read_disk].rdev); + break; + } + /* write it back and re-read */ + start = d; + while (d != read_disk) { + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, WRITE) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + } + } + d = start; + while (d != read_disk) { + char b[BDEVNAME_SIZE]; + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, READ) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + else { + atomic_add(s, &rdev->corrected_errors); + printk(KERN_INFO + "raid1:%s: read error corrected " + "(%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)(sect + + rdev->data_offset), + bdevname(rdev->bdev, b)); + } + } + } + sectors -= s; + sect += s; + } +} + static void raid1d(mddev_t *mddev) { r1bio_t *r1_bio; @@ -1373,29 +1567,14 @@ static void raid1d(mddev_t *mddev) for (;;) { char b[BDEVNAME_SIZE]; - spin_lock_irqsave(&conf->device_lock, flags); - if (conf->pending_bio_list.head) { - bio = bio_list_get(&conf->pending_bio_list); - blk_remove_plug(mddev->queue); - spin_unlock_irqrestore(&conf->device_lock, flags); - /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - if (bitmap_unplug(mddev->bitmap) != 0) - printk("%s: bitmap file write failed!\n", mdname(mddev)); + unplug += flush_pending_writes(conf); - while (bio) { /* submit pending writes */ - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - generic_make_request(bio); - bio = next; - } - unplug = 1; - - continue; - } - - if (list_empty(head)) + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&conf->device_lock, flags); break; + } r1_bio = list_entry(head->prev, r1bio_t, retry_list); list_del(head->prev); conf->nr_queued--; @@ -1415,6 +1594,7 @@ static void raid1d(mddev_t *mddev) * We already have a nr_pending reference on these rdevs. */ int i; + const int do_sync = bio_sync(r1_bio->master_bio); clear_bit(R1BIO_BarrierRetry, &r1_bio->state); clear_bit(R1BIO_Barrier, &r1_bio->state); for (i=0; i < conf->raid_disks; i++) @@ -1435,7 +1615,7 @@ static void raid1d(mddev_t *mddev) conf->mirrors[i].rdev->data_offset; bio->bi_bdev = conf->mirrors[i].rdev->bdev; bio->bi_end_io = raid1_end_write_request; - bio->bi_rw = WRITE; + bio->bi_rw = WRITE | do_sync; bio->bi_private = r1_bio; r1_bio->bios[i] = bio; generic_make_request(bio); @@ -1451,77 +1631,14 @@ static void raid1d(mddev_t *mddev) * This is all done synchronously while the array is * frozen */ - sector_t sect = r1_bio->sector; - int sectors = r1_bio->sectors; - freeze_array(conf); - if (mddev->ro == 0) while(sectors) { - int s = sectors; - int d = r1_bio->read_disk; - int success = 0; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - do { - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) - success = 1; - else { - d++; - if (d == conf->raid_disks) - d = 0; - } - } while (!success && d != r1_bio->read_disk); - - if (success) { - /* write it back and re-read */ - int start = d; - while (d != r1_bio->read_disk) { - if (d==0) - d = conf->raid_disks; - d--; - rdev = conf->mirrors[d].rdev; - atomic_add(s, &rdev->corrected_errors); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - } - } - d = start; - while (d != r1_bio->read_disk) { - if (d==0) - d = conf->raid_disks; - d--; - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - } - } - } else { - /* Cannot read from anywhere -- bye bye array */ - md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - break; - } - sectors -= s; - sect += s; + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, r1_bio->read_disk, + r1_bio->sector, + r1_bio->sectors); + unfreeze_array(conf); } - unfreeze_array(conf); - bio = r1_bio->bios[r1_bio->read_disk]; if ((disk=read_balance(conf, r1_bio)) == -1) { printk(KERN_ALERT "raid1: %s: unrecoverable I/O" @@ -1530,6 +1647,7 @@ static void raid1d(mddev_t *mddev) (unsigned long long)r1_bio->sector); raid_end_bio_io(r1_bio); } else { + const int do_sync = bio_sync(r1_bio->master_bio); r1_bio->bios[r1_bio->read_disk] = mddev->ro ? IO_BLOCKED : NULL; r1_bio->read_disk = disk; @@ -1545,14 +1663,13 @@ static void raid1d(mddev_t *mddev) bio->bi_sector = r1_bio->sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_end_io = raid1_end_read_request; - bio->bi_rw = READ; + bio->bi_rw = READ | do_sync; bio->bi_private = r1_bio; unplug = 1; generic_make_request(bio); } } } - spin_unlock_irqrestore(&conf->device_lock, flags); if (unplug) unplug_slaves(mddev); } @@ -1622,6 +1739,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; } + if (mddev->bitmap == NULL && + mddev->recovery_cp == MaxSector && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + conf->fullsync == 0) { + *skipped = 1; + return max_sector - sector_nr; + } /* before building a request, check if we can skip these blocks.. * This call the bitmap_start_sync doesn't actually record anything */ @@ -1639,6 +1763,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (!go_faster && conf->nr_waiting) msleep_interruptible(1000); + bitmap_cond_end_sync(mddev->bitmap, sector_nr); raise_barrier(conf); conf->next_resync = sector_nr; @@ -1666,11 +1791,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* take from bio_init */ bio->bi_next = NULL; bio->bi_flags |= 1 << BIO_UPTODATE; - bio->bi_rw = 0; + bio->bi_rw = READ; bio->bi_vcnt = 0; bio->bi_idx = 0; bio->bi_phys_segments = 0; - bio->bi_hw_segments = 0; bio->bi_size = 0; bio->bi_end_io = NULL; bio->bi_private = NULL; @@ -1721,6 +1845,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return rv; } + if (max_sector > mddev->resync_max) + max_sector = mddev->resync_max; /* Don't do IO beyond here */ nr_sectors = 0; sync_blocks = 0; do { @@ -1777,19 +1903,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i for (i=0; iraid_disks; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { - md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors); + md_sync_acct(bio->bi_bdev, nr_sectors); generic_make_request(bio); } } } else { atomic_set(&r1_bio->remaining, 1); bio = r1_bio->bios[r1_bio->read_disk]; - md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, - nr_sectors); + md_sync_acct(bio->bi_bdev, nr_sectors); generic_make_request(bio); } - return nr_sectors; } @@ -1799,7 +1923,6 @@ static int run(mddev_t *mddev) int i, j, disk_idx; mirror_info_t *disk; mdk_rdev_t *rdev; - struct list_head *tmp; if (mddev->level != 1) { printk("raid1: %s: raid level not set to mirroring (%d)\n", @@ -1841,7 +1964,10 @@ static int run(mddev_t *mddev) if (!conf->r1bio_pool) goto out_no_mem; - ITERATE_RDEV(mddev, rdev, tmp) { + spin_lock_init(&conf->device_lock); + mddev->queue->queue_lock = &conf->device_lock; + + list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) @@ -1861,15 +1987,10 @@ static int run(mddev_t *mddev) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; - if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) - conf->working_disks++; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; - spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); - if (conf->working_disks == 1) - mddev->recovery_cp = MaxSector; spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); @@ -1877,11 +1998,6 @@ static int run(mddev_t *mddev) bio_list_init(&conf->pending_bio_list); bio_list_init(&conf->flushing_bio_list); - if (!conf->working_disks) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", - mdname(mddev)); - goto out_free_conf; - } mddev->degraded = 0; for (i = 0; i < conf->raid_disks; i++) { @@ -1892,8 +2008,17 @@ static int run(mddev_t *mddev) !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; + if (disk->rdev) + conf->fullsync = 1; } } + if (mddev->degraded == conf->raid_disks) { + printk(KERN_ERR "raid1: no operational mirrors for %s\n", + mdname(mddev)); + goto out_free_conf; + } + if (conf->raid_disks - mddev->degraded == 1) + mddev->recovery_cp = MaxSector; /* * find the first working one and use it as a starting point @@ -1921,10 +2046,11 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_size = mddev->size; + mddev->array_sectors = mddev->size * 2; mddev->queue->unplug_fn = raid1_unplug; - mddev->queue->issue_flush_fn = raid1_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid1_congested; + mddev->queue->backing_dev_info.congested_data = mddev; return 0; @@ -1982,14 +2108,15 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ - mddev->array_size = sectors>>1; - set_capacity(mddev->gendisk, mddev->array_size << 1); + mddev->array_sectors = sectors; + set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; - if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) { + if (mddev->array_sectors / 2 > mddev->size && + mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->size << 1; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } - mddev->size = mddev->array_size; + mddev->size = mddev->array_sectors / 2; mddev->resync_max_sectors = sectors; return 0; } @@ -2012,8 +2139,8 @@ static int raid1_reshape(mddev_t *mddev) mirror_info_t *newmirrors; conf_t *conf = mddev_to_conf(mddev); int cnt, raid_disks; - - int d, d2; + unsigned long flags; + int d, d2, err; /* Cannot change chunk_size, layout, or level */ if (mddev->chunk_size != mddev->new_chunk || @@ -2025,6 +2152,10 @@ static int raid1_reshape(mddev_t *mddev) return -EINVAL; } + err = md_allow_write(mddev); + if (err) + return err; + raid_disks = mddev->raid_disks + mddev->delta_disks; if (raid_disks < conf->raid_disks) { @@ -2061,17 +2192,33 @@ static int raid1_reshape(mddev_t *mddev) oldpool = conf->r1bio_pool; conf->r1bio_pool = newpool; - for (d=d2=0; d < conf->raid_disks; d++) - if (conf->mirrors[d].rdev) { - conf->mirrors[d].rdev->raid_disk = d2; - newmirrors[d2++].rdev = conf->mirrors[d].rdev; + for (d = d2 = 0; d < conf->raid_disks; d++) { + mdk_rdev_t *rdev = conf->mirrors[d].rdev; + if (rdev && rdev->raid_disk != d2) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + rdev->raid_disk = d2; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + if (sysfs_create_link(&mddev->kobj, + &rdev->kobj, nm)) + printk(KERN_WARNING + "md/raid1: cannot register " + "%s for %s\n", + nm, mdname(mddev)); } + if (rdev) + newmirrors[d2++].rdev = rdev; + } kfree(conf->mirrors); conf->mirrors = newmirrors; kfree(conf->poolinfo); conf->poolinfo = newpoolinfo; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded += (raid_disks - conf->raid_disks); + spin_unlock_irqrestore(&conf->device_lock, flags); conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0;