X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=drivers%2Fmd%2Fraid10.c;h=499620afb44b259184d3a04ba73fb5a1e7d2c724;hb=f94b61c2c9fdcc90773c49df9ccf9ede3ad0d7db;hp=f0141910bb8d8a0166df17903c977c404057514c;hpb=802ba064c49f655d20fed563f2a4924c8256ea10;p=safe%2Fjmp%2Flinux-2.6 diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f014191..499620a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -18,9 +18,12 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" -#include -#include +#include +#include +#include +#include "md.h" +#include "raid10.h" +#include "bitmap.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. @@ -76,11 +79,13 @@ static void r10bio_pool_free(void *r10_bio, void *data) kfree(r10_bio); } +/* Maximum size of each resync request */ #define RESYNC_BLOCK_SIZE (64*1024) -//#define RESYNC_BLOCK_SIZE PAGE_SIZE -#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) -#define RESYNC_WINDOW (2048*1024) +/* amount of memory to reserve for resync requests */ +#define RESYNC_WINDOW (1024*1024) +/* maximum number of concurrent requests, memory permitting */ +#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) /* * When performing a resync, we need to read and compare, so @@ -215,6 +220,9 @@ static void reschedule_retry(r10bio_t *r10_bio) conf->nr_queued ++; spin_unlock_irqrestore(&conf->device_lock, flags); + /* wake up frozen array... */ + wake_up(&conf->wait_barrier); + md_wakeup_thread(mddev->thread); } @@ -227,7 +235,7 @@ static void raid_end_bio_io(r10bio_t *r10_bio) { struct bio *bio = r10_bio->master_bio; - bio_endio(bio, bio->bi_size, + bio_endio(bio, test_bit(R10BIO_Uptodate, &r10_bio->state) ? 
0 : -EIO); free_r10bio(r10_bio); } @@ -243,15 +251,13 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio) r10_bio->devs[slot].addr + (r10_bio->sectors); } -static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error) +static void raid10_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); int slot, dev; conf_t *conf = mddev_to_conf(r10_bio->mddev); - if (bio->bi_size) - return 1; slot = r10_bio->read_slot; dev = r10_bio->devs[slot].devnum; @@ -284,19 +290,15 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int } rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); - return 0; } -static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error) +static void raid10_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); int slot, dev; conf_t *conf = mddev_to_conf(r10_bio->mddev); - if (bio->bi_size) - return 1; - for (slot = 0; slot < conf->copies; slot++) if (r10_bio->devs[slot].bio == bio) break; @@ -339,7 +341,6 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in } rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); - return 0; } @@ -429,7 +430,7 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) if (dev < 0) dev += conf->raid_disks; } else { - while (sector > conf->stride) { + while (sector >= conf->stride) { sector -= conf->stride; if (dev < conf->near_copies) dev += conf->raid_disks - conf->near_copies; @@ -446,26 +447,27 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) /** * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset * If near_copies == raid_disk, there are no striping issues, * but in that case, the function isn't called at all. 
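 * The limit returned is simply the space left in the current chunk; a bio
 * that is still empty (bio_sectors == 0) may take its first bvec even if it
 * crosses a chunk boundary, and make_request() will split that one-page
 * bio itself.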
*/ -static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio, - struct bio_vec *bio_vec) +static int raid10_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ - if (max <= bio_vec->bv_len && bio_sectors == 0) - return bio_vec->bv_len; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; else return max; } @@ -544,7 +546,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) current_distance = abs(r10_bio->devs[slot].addr - conf->mirrors[disk].head_position); - /* Find the disk whose head is closest */ + /* Find the disk whose head is closest, + * or - for far > 1 - find the closest to partition beginning */ for (nslot = slot; nslot < conf->copies; nslot++) { int ndisk = r10_bio->devs[nslot].devnum; @@ -564,8 +567,13 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) slot = nslot; break; } - new_distance = abs(r10_bio->devs[nslot].addr - - conf->mirrors[ndisk].head_position); + + /* for far > 1 always use the lowest address */ + if (conf->far_copies > 1) + new_distance = r10_bio->devs[nslot].addr; + else + new_distance = abs(r10_bio->devs[nslot].addr - + conf->mirrors[ndisk].head_position); if (new_distance < current_distance) { current_distance = new_distance; disk = ndisk; @@ -595,13 +603,12 @@ static void unplug_slaves(mddev_t *mddev) for (i=0; iraid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { - request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + struct request_queue *r_queue = bdev_get_queue(rdev->bdev); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - if (r_queue->unplug_fn) - r_queue->unplug_fn(r_queue); + blk_unplug(r_queue); rdev_dec_pending(rdev, mddev); rcu_read_lock(); @@ -610,7 +617,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_unlock(); } -static void raid10_unplug(request_queue_t *q) +static void raid10_unplug(struct request_queue *q) { mddev_t *mddev = q->queuedata; @@ -618,36 +625,6 @@ static void raid10_unplug(request_queue_t *q) md_wakeup_thread(mddev->thread); } -static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, - sector_t *error_sector) -{ - mddev_t *mddev = q->queuedata; - conf_t *conf = mddev_to_conf(mddev); - int i, ret = 0; - - rcu_read_lock(); - for (i=0; iraid_disks && ret == 0; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct block_device *bdev = rdev->bdev; - request_queue_t *r_queue = bdev_get_queue(bdev); - - if (!r_queue->issue_flush_fn) - ret = -EOPNOTSUPP; - else { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, - error_sector); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - } - rcu_read_unlock(); - return ret; -} - static int raid10_congested(void *data, int bits) { mddev_t *mddev = data; @@ -658,7 +635,7 @@ static int raid10_congested(void *data, int bits) for (i = 0; i < mddev->raid_disks && ret == 0; 
i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { - request_queue_t *q = bdev_get_queue(rdev->bdev); + struct request_queue *q = bdev_get_queue(rdev->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); } @@ -667,7 +644,36 @@ static int raid10_congested(void *data, int bits) return ret; } - +static int flush_pending_writes(conf_t *conf) +{ + /* Any writes that have been queued but are awaiting + * bitmap updates get flushed here. + * We return 1 if any requests were actually submitted. + */ + int rv = 0; + + spin_lock_irq(&conf->device_lock); + + if (conf->pending_bio_list.head) { + struct bio *bio; + bio = bio_list_get(&conf->pending_bio_list); + blk_remove_plug(conf->mddev->queue); + spin_unlock_irq(&conf->device_lock); + /* flush any pending bitmap writes to disk + * before proceeding w/ I/O */ + bitmap_unplug(conf->mddev->bitmap); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + rv = 1; + } else + spin_unlock_irq(&conf->device_lock); + return rv; +} /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -689,7 +695,6 @@ static int raid10_congested(void *data, int bits) * there is no normal IO happeing. It must arrange to call * lower_barrier when the particular background IO completes. */ -#define RESYNC_DEPTH 32 static void raise_barrier(conf_t *conf, int force) { @@ -750,15 +755,23 @@ static void freeze_array(conf_t *conf) /* stop syncio and normal IO and wait for everything to * go quiet. * We increment barrier and nr_waiting, and then - * wait until barrier+nr_pending match nr_queued+2 + * wait until nr_pending match nr_queued+1 + * This is called in the context of one normal IO request + * that has failed. Thus any sync request that might be pending + * will be blocked by nr_pending, and we need to wait for + * pending IO requests to complete or be queued for re-try. + * Thus the number queued (nr_queued) plus this request (1) + * must match the number of pending IOs (nr_pending) before + * we continue. */ spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, - conf->barrier+conf->nr_pending == conf->nr_queued+2, + conf->nr_pending == conf->nr_queued+1, conf->resync_lock, - raid10_unplug(conf->mddev->queue)); + ({ flush_pending_writes(conf); + raid10_unplug(conf->mddev->queue); })); spin_unlock_irq(&conf->resync_lock); } @@ -772,21 +785,24 @@ static void unfreeze_array(conf_t *conf) spin_unlock_irq(&conf->resync_lock); } -static int make_request(request_queue_t *q, struct bio * bio) +static int make_request(struct request_queue *q, struct bio * bio) { mddev_t *mddev = q->queuedata; conf_t *conf = mddev_to_conf(mddev); mirror_info_t *mirror; r10bio_t *r10_bio; struct bio *read_bio; + int cpu; int i; int chunk_sects = conf->chunk_mask + 1; const int rw = bio_data_dir(bio); + const int do_sync = bio_sync(bio); struct bio_list bl; unsigned long flags; + mdk_rdev_t *blocked_rdev; if (unlikely(bio_barrier(bio))) { - bio_endio(bio, bio->bi_size, -EOPNOTSUPP); + bio_endio(bio, -EOPNOTSUPP); return 0; } @@ -804,7 +820,7 @@ static int make_request(request_queue_t *q, struct bio * bio) /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. 
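 * The split point is the end of the current chunk, i.e. after
 * chunk_sects - (bi_sector & (chunk_sects - 1)) sectors, and each half
 * is then resubmitted through make_request().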
*/ - bp = bio_split(bio, bio_split_pool, + bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); if (make_request(q, &bp->bio1)) generic_make_request(&bp->bio1); @@ -818,7 +834,7 @@ static int make_request(request_queue_t *q, struct bio * bio) " or bigger than %dk %llu %d\n", chunk_sects/2, (unsigned long long)bio->bi_sector, bio->bi_size >> 10); - bio_io_error(bio, bio->bi_size); + bio_io_error(bio); return 0; } @@ -831,8 +847,11 @@ static int make_request(request_queue_t *q, struct bio * bio) */ wait_barrier(conf); - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bio)); + part_stat_unlock(); r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); @@ -863,7 +882,7 @@ static int make_request(request_queue_t *q, struct bio * bio) mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid10_end_read_request; - read_bio->bi_rw = READ; + read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r10_bio; generic_make_request(read_bio); @@ -873,17 +892,23 @@ static int make_request(request_queue_t *q, struct bio * bio) /* * WRITE: */ - /* first select target devices under spinlock and + /* first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio */ raid10_find_phys(conf, r10_bio); + retry_write: + blocked_rdev = NULL; rcu_read_lock(); for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - !test_bit(Faulty, &rdev->flags)) { + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); r10_bio->devs[i].bio = bio; } else { @@ -893,6 +918,22 @@ static int make_request(request_queue_t *q, struct bio * bio) } rcu_read_unlock(); + if (unlikely(blocked_rdev)) { + /* Have to wait for this device to get unblocked, then retry */ + int j; + int d; + + for (j = 0; j < i; j++) + if (r10_bio->devs[j].bio) { + d = r10_bio->devs[j].devnum; + rdev_dec_pending(conf->mirrors[d].rdev, mddev); + } + allow_barrier(conf); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_write; + } + atomic_set(&r10_bio->remaining, 0); bio_list_init(&bl); @@ -909,19 +950,32 @@ static int make_request(request_queue_t *q, struct bio * bio) conf->mirrors[d].rdev->data_offset; mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE; + mbio->bi_rw = WRITE | do_sync; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); bio_list_add(&bl, mbio); } + if (unlikely(!atomic_read(&r10_bio->remaining))) { + /* the array is dead */ + md_write_end(mddev); + raid_end_bio_io(r10_bio); + return 0; + } + bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); spin_lock_irqsave(&conf->device_lock, flags); bio_list_merge(&conf->pending_bio_list, &bl); blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); + /* In case raid10d snuck in to freeze_array */ + wake_up(&conf->wait_barrier); + + if (do_sync) + md_wakeup_thread(mddev->thread); + return 0; } @@ -978,12 +1032,12 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery is running, make sure it aborts. 
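 * (setting MD_RECOVERY_INTR makes md_do_sync() bail out early)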
*/ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" - " Operation continuing on %d devices\n", + printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n" + "raid10: Operation continuing on %d devices.\n", bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } @@ -1071,24 +1125,30 @@ static int raid10_spare_active(mddev_t *mddev) static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int mirror; mirror_info_t *p; + int first = 0; + int last = mddev->raid_disks - 1; if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ - return 0; + return -EBUSY; if (!enough(conf)) - return 0; + return -EINVAL; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; else - mirror = 0; - for ( ; mirror < mddev->raid_disks; mirror++) + mirror = first; + for ( ; mirror <= last ; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { blk_queue_stack_limits(mddev->queue, @@ -1103,7 +1163,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) p->head_position = 0; rdev->raid_disk = mirror; - found = 1; + err = 0; if (rdev->saved_raid_disk != mirror) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); @@ -1111,7 +1171,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) } print_conf(conf); - return found; + return err; } static int raid10_remove_disk(mddev_t *mddev, int number) @@ -1129,6 +1189,14 @@ static int raid10_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove faulty devices in recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + enough(conf)) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { @@ -1144,15 +1212,12 @@ abort: } -static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) +static void end_sync_read(struct bio *bio, int error) { r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); conf_t *conf = mddev_to_conf(r10_bio->mddev); int i,d; - if (bio->bi_size) - return 1; - for (i=0; icopies; i++) if (r10_bio->devs[i].bio == bio) break; @@ -1173,6 +1238,7 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) /* for reconstruct, we always reschedule after a read. 
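 * (a reconstruct r10_bio only ever issues one read, so this is safe);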
* for resync, only after all reads */ + rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); if (test_bit(R10BIO_IsRecover, &r10_bio->state) || atomic_dec_and_test(&r10_bio->remaining)) { /* we have read all the blocks, @@ -1180,11 +1246,9 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) */ reschedule_retry(r10_bio); } - rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); - return 0; } -static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) +static void end_sync_write(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); @@ -1192,9 +1256,6 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) conf_t *conf = mddev_to_conf(mddev); int i,d; - if (bio->bi_size) - return 1; - for (i = 0; i < conf->copies; i++) if (r10_bio->devs[i].bio == bio) break; @@ -1202,13 +1263,16 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) if (!uptodate) md_error(mddev, conf->mirrors[d].rdev); + update_head_pos(i, r10_bio); + rdev_dec_pending(conf->mirrors[d].rdev, mddev); while (atomic_dec_and_test(&r10_bio->remaining)) { if (r10_bio->master_bio == NULL) { /* the primary of several recovery bios */ - md_done_sync(mddev, r10_bio->sectors, 1); + sector_t s = r10_bio->sectors; put_buf(r10_bio); + md_done_sync(mddev, s, 1); break; } else { r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; @@ -1216,8 +1280,6 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) r10_bio = r10_bio2; } } - rdev_dec_pending(conf->mirrors[d].rdev, mddev); - return 0; } /* @@ -1291,9 +1353,6 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) tbio->bi_size = r10_bio->sectors << 9; tbio->bi_idx = 0; tbio->bi_phys_segments = 0; - tbio->bi_hw_segments = 0; - tbio->bi_hw_front_size = 0; - tbio->bi_hw_back_size = 0; tbio->bi_flags &= ~(BIO_POOL_MASK - 1); tbio->bi_flags |= 1 << BIO_UPTODATE; tbio->bi_next = NULL; @@ -1363,7 +1422,7 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) if (test_bit(R10BIO_Uptodate, &r10_bio->state)) generic_make_request(wbio); else - bio_endio(wbio, wbio->bi_size, -EIO); + bio_endio(wbio, -EIO); } @@ -1499,29 +1558,14 @@ static void raid10d(mddev_t *mddev) for (;;) { char b[BDEVNAME_SIZE]; - spin_lock_irqsave(&conf->device_lock, flags); - - if (conf->pending_bio_list.head) { - bio = bio_list_get(&conf->pending_bio_list); - blk_remove_plug(mddev->queue); - spin_unlock_irqrestore(&conf->device_lock, flags); - /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - if (bitmap_unplug(mddev->bitmap) != 0) - printk("%s: bitmap file write failed!\n", mdname(mddev)); - while (bio) { /* submit pending writes */ - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - generic_make_request(bio); - bio = next; - } - unplug = 1; + unplug += flush_pending_writes(conf); - continue; - } - - if (list_empty(head)) + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&conf->device_lock, flags); break; + } r10_bio = list_entry(head->prev, r10bio_t, retry_list); list_del(head->prev); conf->nr_queued--; @@ -1554,7 +1598,6 @@ static void raid10d(mddev_t *mddev) bio = r10_bio->devs[r10_bio->read_slot].bio; r10_bio->devs[r10_bio->read_slot].bio = mddev->ro ? 
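/* read-only array: the error cannot be fixed by re-writing, so never retry this slot */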
IO_BLOCKED : NULL; - bio_put(bio); mirror = read_balance(conf, r10_bio); if (mirror == -1) { printk(KERN_ALERT "raid10: %s: unrecoverable I/O" @@ -1562,7 +1605,10 @@ static void raid10d(mddev_t *mddev) bdevname(bio->bi_bdev,b), (unsigned long long)r10_bio->sector); raid_end_bio_io(r10_bio); + bio_put(bio); } else { + const int do_sync = bio_sync(r10_bio->master_bio); + bio_put(bio); rdev = conf->mirrors[mirror].rdev; if (printk_ratelimit()) printk(KERN_ERR "raid10: %s: redirecting sector %llu to" @@ -1574,7 +1620,7 @@ static void raid10d(mddev_t *mddev) bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset; bio->bi_bdev = rdev->bdev; - bio->bi_rw = READ; + bio->bi_rw = READ | do_sync; bio->bi_private = r10_bio; bio->bi_end_io = raid10_end_read_request; unplug = 1; @@ -1582,7 +1628,6 @@ static void raid10d(mddev_t *mddev) } } } - spin_unlock_irqrestore(&conf->device_lock, flags); if (unplug) unplug_slaves(mddev); } @@ -1652,7 +1697,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; skipped: - max_sector = mddev->size << 1; + max_sector = mddev->dev_sectors; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { @@ -1691,6 +1736,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return (max_sector - sector_nr) + sectors_skipped; } + if (max_sector > mddev->resync_max) + max_sector = mddev->resync_max; /* Don't do IO beyond here */ + /* make sure whole request will fit in a chunk - if chunks * are meaningful */ @@ -1761,17 +1809,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i r10_bio->sector = sect; raid10_find_phys(conf, r10_bio); - /* Need to check if this section will still be + + /* Need to check if the array will still be * degraded */ - for (j=0; jcopies;j++) { - int d = r10_bio->devs[j].devnum; - if (conf->mirrors[d].rdev == NULL || - test_bit(Faulty, &conf->mirrors[d].rdev->flags)) { + for (j=0; jraid_disks; j++) + if (conf->mirrors[j].rdev == NULL || + test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { still_degraded = 1; break; } - } + must_sync = bitmap_start_sync(mddev->bitmap, sect, &sync_blocks, still_degraded); @@ -1796,6 +1844,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i for (k=0; kcopies; k++) if (r10_bio->devs[k].devnum == i) break; + BUG_ON(k == conf->copies); bio = r10_bio->devs[1].bio; bio->bi_next = biolist; biolist = bio; @@ -1815,8 +1864,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (j == conf->copies) { /* Cannot recover, so abort the recovery */ put_buf(r10_bio); + if (rb2) + atomic_dec(&rb2->remaining); r10_bio = rb2; - if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery)) + if (!test_and_set_bit(MD_RECOVERY_INTR, + &mddev->recovery)) printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", mdname(mddev)); break; @@ -1835,6 +1887,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* resync. 
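 * Every in-sync copy is read and later compared (and, if need be,
 * re-written) by sync_request_write().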
Schedule a read for every block at this virt offset */ int count = 0; + bitmap_cond_end_sync(mddev->bitmap, sector_nr); + if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { @@ -1861,6 +1915,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i int d = r10_bio->devs[i].devnum; bio = r10_bio->devs[i].bio; bio->bi_end_io = NULL; + clear_bit(BIO_UPTODATE, &bio->bi_flags); if (conf->mirrors[d].rdev == NULL || test_bit(Faulty, &conf->mirrors[d].rdev->flags)) continue; @@ -1897,7 +1952,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i bio->bi_vcnt = 0; bio->bi_idx = 0; bio->bi_phys_segments = 0; - bio->bi_hw_segments = 0; bio->bi_size = 0; } @@ -1959,13 +2013,32 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* There is nowhere to write, so all non-sync * drives must be failed, so try the next chunk... */ - { - sector_t sec = max_sector - sector_nr; - sectors_skipped += sec; + if (sector_nr + max_sync < max_sector) + max_sector = sector_nr + max_sync; + + sectors_skipped += (max_sector - sector_nr); chunks_skipped ++; sector_nr = max_sector; goto skipped; - } +} + +static sector_t +raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + sector_t size; + conf_t *conf = mddev_to_conf(mddev); + + if (!raid_disks) + raid_disks = mddev->raid_disks; + if (!sectors) + sectors = mddev->dev_sectors; + + size = sectors >> conf->chunk_shift; + sector_div(size, conf->far_copies); + size = size * raid_disks; + sector_div(size, conf->near_copies); + + return size << conf->chunk_shift; } static int run(mddev_t *mddev) @@ -1974,12 +2047,12 @@ static int run(mddev_t *mddev) int i, disk_idx; mirror_info_t *disk; mdk_rdev_t *rdev; - struct list_head *tmp; int nc, fc, fo; sector_t stride, size; - if (mddev->chunk_size == 0) { - printk(KERN_ERR "md/raid10: non-zero chunk size required.\n"); + if (mddev->chunk_size < PAGE_SIZE) { + printk(KERN_ERR "md/raid10: chunk size must be " + "at least PAGE_SIZE(%ld).\n", PAGE_SIZE); return -EINVAL; } @@ -2016,19 +2089,35 @@ static int run(mddev_t *mddev) if (!conf->tmppage) goto out_free_conf; + conf->mddev = mddev; + conf->raid_disks = mddev->raid_disks; conf->near_copies = nc; conf->far_copies = fc; conf->copies = nc*fc; conf->far_offset = fo; conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; conf->chunk_shift = ffz(~mddev->chunk_size) - 9; + size = mddev->dev_sectors >> conf->chunk_shift; + sector_div(size, fc); + size = size * conf->raid_disks; + sector_div(size, nc); + /* 'size' is now the number of chunks in the array */ + /* calculate "used chunks per device" in 'stride' */ + stride = size * conf->copies; + + /* We need to round up when dividing by raid_disks to + * get the stride size. 
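+	 * e.g. 4 devices holding 10 chunks each with near_copies = 2 and
+	 * far_copies = 1: size = 10*4/2 = 20 array chunks, so
+	 * stride = (20*2 + 3)/4 = 10 chunks per device.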
+ */ + stride += conf->raid_disks - 1; + sector_div(stride, conf->raid_disks); + mddev->dev_sectors = stride << conf->chunk_shift; + if (fo) - conf->stride = 1 << conf->chunk_shift; - else { - stride = mddev->size >> (conf->chunk_shift-1); + stride = 1; + else sector_div(stride, fc); - conf->stride = stride << conf->chunk_shift; - } + conf->stride = stride << conf->chunk_shift; + conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, r10bio_pool_free, conf); if (!conf->r10bio_pool) { @@ -2037,7 +2126,10 @@ static int run(mddev_t *mddev) goto out_free_conf; } - ITERATE_RDEV(mddev, rdev, tmp) { + spin_lock_init(&conf->device_lock); + mddev->queue->queue_lock = &conf->device_lock; + + list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) @@ -2058,9 +2150,6 @@ static int run(mddev_t *mddev) disk->head_position = 0; } - conf->raid_disks = mddev->raid_disks; - conf->mddev = mddev; - spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); spin_lock_init(&conf->resync_lock); @@ -2082,6 +2171,8 @@ static int run(mddev_t *mddev) !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; + if (disk->rdev) + conf->fullsync = 1; } } @@ -2101,19 +2192,10 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - if (conf->far_offset) { - size = mddev->size >> (conf->chunk_shift-1); - size *= conf->raid_disks; - size <<= conf->chunk_shift; - sector_div(size, conf->far_copies); - } else - size = conf->stride * conf->raid_disks; - sector_div(size, conf->near_copies); - mddev->array_size = size/2; - mddev->resync_max_sectors = size; + md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); + mddev->resync_max_sectors = raid10_size(mddev, 0, 0); mddev->queue->unplug_fn = raid10_unplug; - mddev->queue->issue_flush_fn = raid10_issue_flush; mddev->queue->backing_dev_info.congested_fn = raid10_congested; mddev->queue->backing_dev_info.congested_data = mddev; @@ -2147,6 +2229,9 @@ static int stop(mddev_t *mddev) { conf_t *conf = mddev_to_conf(mddev); + raise_barrier(conf, 0); + lower_barrier(conf); + md_unregister_thread(mddev->thread); mddev->thread = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ @@ -2194,6 +2279,7 @@ static struct mdk_personality raid10_personality = .spare_active = raid10_spare_active, .sync_request = sync_request, .quiesce = raid10_quiesce, + .size = raid10_size, }; static int __init raid_init(void)
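
The size arithmetic introduced above (raid10_size() and the stride calculation in run()) can be sanity-checked outside the kernel. Below is a minimal user-space sketch, assuming the chunk size is a power of two (as the chunk_shift code requires); the helper name and parameters are invented for the example and are not part of this patch.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-alone version of the raid10_size() arithmetic:
 * chunks per device are divided among the far copies, multiplied up
 * across the member disks, then divided by the near copies. */
static uint64_t raid10_usable_sectors(uint64_t dev_sectors, int raid_disks,
				      int chunk_sectors, int near_copies,
				      int far_copies)
{
	int chunk_shift = 0;

	while ((1 << chunk_shift) < chunk_sectors)
		chunk_shift++;			/* chunk size is a power of two */

	uint64_t size = dev_sectors >> chunk_shift;	/* chunks per device */
	size /= far_copies;
	size *= raid_disks;
	size /= near_copies;			/* chunks in the whole array */
	return size << chunk_shift;
}

int main(void)
{
	/* Four 1 GiB devices (2097152 sectors), 64 KiB chunks (128 sectors),
	 * near_copies = 2, far_copies = 1: expect half the raw capacity. */
	printf("%llu usable sectors\n",
	       (unsigned long long)raid10_usable_sectors(2097152, 4, 128, 2, 1));
	return 0;
}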