X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=drivers%2Fmd%2Fraid10.c;h=e2766d8251a184a6c28a67430ca34c7e68d0df67;hb=4434ade8c9334a3ab975d8993de456f06841899e;hp=1440935414e63825adfb5278d2aad8d0be6e1822;hpb=e0a33270ed0e8e00cbb882a33d21e1f92aac0ceb;p=safe%2Fjmp%2Flinux-2.6 diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1440935..e2766d8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -18,9 +18,13 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" -#include -#include +#include +#include +#include +#include +#include "md.h" +#include "raid10.h" +#include "bitmap.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. @@ -29,6 +33,7 @@ * raid_disks * near_copies (stored in low byte of layout) * far_copies (stored in second byte of layout) + * far_offset (stored in bit 16 of layout ) * * The data to be stored is divided into chunks using chunksize. * Each device is divided into far_copies sections. @@ -36,10 +41,14 @@ * near_copies copies of each chunk is stored (each on a different drive). * The starting device for each section is offset near_copies from the starting * device of the previous section. - * Thus there are (near_copies*far_copies) of each chunk, and each is on a different + * Thus they are (near_copies*far_copies) of each chunk, and each is on a different * drive. * near_copies and far_copies must be at least one, and their product is at most * raid_disks. + * + * If far_offset is true, then the far_copies are handled a bit differently. + * The copies are still in different stripes, but instead of be very far apart + * on disk, there are adjacent stripes. */ /* @@ -60,7 +69,7 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) /* allocate a r10bio with room for raid_disks entries in the bios array */ r10_bio = kzalloc(size, gfp_flags); - if (!r10_bio) + if (!r10_bio && conf->mddev) unplug_slaves(conf->mddev); return r10_bio; @@ -71,11 +80,13 @@ static void r10bio_pool_free(void *r10_bio, void *data) kfree(r10_bio); } +/* Maximum size of each resync request */ #define RESYNC_BLOCK_SIZE (64*1024) -//#define RESYNC_BLOCK_SIZE PAGE_SIZE -#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) -#define RESYNC_WINDOW (2048*1024) +/* amount of memory to reserve for resync requests */ +#define RESYNC_WINDOW (1024*1024) +/* maximum number of concurrent requests, memory permitting */ +#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) /* * When performing a resync, we need to read and compare, so @@ -178,7 +189,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) static void free_r10bio(r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; /* * Wake up any possible resync thread that waits for the device @@ -192,7 +203,7 @@ static void free_r10bio(r10bio_t *r10_bio) static void put_buf(r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; mempool_free(r10_bio, conf->r10buf_pool); @@ -203,13 +214,16 @@ static void reschedule_retry(r10bio_t *r10_bio) { unsigned long flags; mddev_t *mddev = r10_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; spin_lock_irqsave(&conf->device_lock, flags); list_add(&r10_bio->retry_list, &conf->retry_list); conf->nr_queued ++; spin_unlock_irqrestore(&conf->device_lock, flags); + /* wake up frozen array... 
*/ + wake_up(&conf->wait_barrier); + md_wakeup_thread(mddev->thread); } @@ -222,7 +236,7 @@ static void raid_end_bio_io(r10bio_t *r10_bio) { struct bio *bio = r10_bio->master_bio; - bio_endio(bio, bio->bi_size, + bio_endio(bio, test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); free_r10bio(r10_bio); } @@ -232,21 +246,19 @@ static void raid_end_bio_io(r10bio_t *r10_bio) */ static inline void update_head_pos(int slot, r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; conf->mirrors[r10_bio->devs[slot].devnum].head_position = r10_bio->devs[slot].addr + (r10_bio->sectors); } -static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error) +static void raid10_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); int slot, dev; - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; - if (bio->bi_size) - return 1; slot = r10_bio->read_slot; dev = r10_bio->devs[slot].devnum; @@ -279,18 +291,14 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int } rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); - return 0; } -static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error) +static void raid10_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); int slot, dev; - conf_t *conf = mddev_to_conf(r10_bio->mddev); - - if (bio->bi_size) - return 1; + conf_t *conf = r10_bio->mddev->private; for (slot = 0; slot < conf->copies; slot++) if (r10_bio->devs[slot].bio == bio) @@ -334,7 +342,6 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in } rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); - return 0; } @@ -357,8 +364,7 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in * With this layout, and block is never stored twice on the one device. * * raid10_find_phys finds the sector offset of a given virtual sector - * on each device that it is on. If a block isn't on a device, - * that entry in the array is set to MaxSector. + * on each device that it is on. 
* * raid10_find_virt does the reverse mapping, from a device and a * sector offset to a virtual address @@ -381,6 +387,8 @@ static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio) chunk *= conf->near_copies; stripe = chunk; dev = sector_div(stripe, conf->raid_disks); + if (conf->far_offset) + stripe *= conf->far_copies; sector += stripe << conf->chunk_shift; @@ -414,16 +422,24 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) { sector_t offset, chunk, vchunk; - while (sector > conf->stride) { - sector -= conf->stride; - if (dev < conf->near_copies) - dev += conf->raid_disks - conf->near_copies; - else - dev -= conf->near_copies; - } - offset = sector & conf->chunk_mask; - chunk = sector >> conf->chunk_shift; + if (conf->far_offset) { + int fc; + chunk = sector >> conf->chunk_shift; + fc = sector_div(chunk, conf->far_copies); + dev -= fc * conf->near_copies; + if (dev < 0) + dev += conf->raid_disks; + } else { + while (sector >= conf->stride) { + sector -= conf->stride; + if (dev < conf->near_copies) + dev += conf->raid_disks - conf->near_copies; + else + dev -= conf->near_copies; + } + chunk = sector >> conf->chunk_shift; + } vchunk = chunk * conf->raid_disks + dev; sector_div(vchunk, conf->near_copies); return (vchunk << conf->chunk_shift) + offset; @@ -432,26 +448,27 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) /** * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset * If near_copies == raid_disk, there are no striping issues, * but in that case, the function isn't called at all. 
*/ -static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio, - struct bio_vec *bio_vec) +static int raid10_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; - unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int chunk_sectors = mddev->chunk_sectors; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ - if (max <= bio_vec->bv_len && bio_sectors == 0) - return bio_vec->bv_len; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; else return max; } @@ -530,7 +547,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) current_distance = abs(r10_bio->devs[slot].addr - conf->mirrors[disk].head_position); - /* Find the disk whose head is closest */ + /* Find the disk whose head is closest, + * or - for far > 1 - find the closest to partition beginning */ for (nslot = slot; nslot < conf->copies; nslot++) { int ndisk = r10_bio->devs[nslot].devnum; @@ -550,8 +568,13 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) slot = nslot; break; } - new_distance = abs(r10_bio->devs[nslot].addr - - conf->mirrors[ndisk].head_position); + + /* for far > 1 always use the lowest address */ + if (conf->far_copies > 1) + new_distance = r10_bio->devs[nslot].addr; + else + new_distance = abs(r10_bio->devs[nslot].addr - + conf->mirrors[ndisk].head_position); if (new_distance < current_distance) { current_distance = new_distance; disk = ndisk; @@ -574,20 +597,19 @@ rb_out: static void unplug_slaves(mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; rcu_read_lock(); for (i=0; iraid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { - request_queue_t *r_queue = bdev_get_queue(rdev->bdev); + struct request_queue *r_queue = bdev_get_queue(rdev->bdev); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - if (r_queue->unplug_fn) - r_queue->unplug_fn(r_queue); + blk_unplug(r_queue); rdev_dec_pending(rdev, mddev); rcu_read_lock(); @@ -596,7 +618,7 @@ static void unplug_slaves(mddev_t *mddev) rcu_read_unlock(); } -static void raid10_unplug(request_queue_t *q) +static void raid10_unplug(struct request_queue *q) { mddev_t *mddev = q->queuedata; @@ -604,36 +626,57 @@ static void raid10_unplug(request_queue_t *q) md_wakeup_thread(mddev->thread); } -static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, - sector_t *error_sector) +static int raid10_congested(void *data, int bits) { - mddev_t *mddev = q->queuedata; - conf_t *conf = mddev_to_conf(mddev); + mddev_t *mddev = data; + conf_t *conf = mddev->private; int i, ret = 0; + if (mddev_congested(mddev, bits)) + return 1; rcu_read_lock(); - for (i=0; iraid_disks && ret == 0; i++) { + for (i = 0; i < mddev->raid_disks && ret == 0; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct block_device *bdev = rdev->bdev; - request_queue_t *r_queue = bdev_get_queue(bdev); + struct request_queue *q = bdev_get_queue(rdev->bdev); - if (!r_queue->issue_flush_fn) - ret = -EOPNOTSUPP; - else { - 
atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, - error_sector); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } + ret |= bdi_congested(&q->backing_dev_info, bits); } } rcu_read_unlock(); return ret; } +static int flush_pending_writes(conf_t *conf) +{ + /* Any writes that have been queued but are awaiting + * bitmap updates get flushed here. + * We return 1 if any requests were actually submitted. + */ + int rv = 0; + + spin_lock_irq(&conf->device_lock); + + if (conf->pending_bio_list.head) { + struct bio *bio; + bio = bio_list_get(&conf->pending_bio_list); + blk_remove_plug(conf->mddev->queue); + spin_unlock_irq(&conf->device_lock); + /* flush any pending bitmap writes to disk + * before proceeding w/ I/O */ + bitmap_unplug(conf->mddev->bitmap); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + rv = 1; + } else + spin_unlock_irq(&conf->device_lock); + return rv; +} /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -655,7 +698,6 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, * there is no normal IO happeing. It must arrange to call * lower_barrier when the particular background IO completes. */ -#define RESYNC_DEPTH 32 static void raise_barrier(conf_t *conf, int force) { @@ -716,15 +758,23 @@ static void freeze_array(conf_t *conf) /* stop syncio and normal IO and wait for everything to * go quiet. * We increment barrier and nr_waiting, and then - * wait until barrier+nr_pending match nr_queued+2 + * wait until nr_pending match nr_queued+1 + * This is called in the context of one normal IO request + * that has failed. Thus any sync request that might be pending + * will be blocked by nr_pending, and we need to wait for + * pending IO requests to complete or be queued for re-try. + * Thus the number queued (nr_queued) plus this request (1) + * must match the number of pending IOs (nr_pending) before + * we continue. */ spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, - conf->barrier+conf->nr_pending == conf->nr_queued+2, + conf->nr_pending == conf->nr_queued+1, conf->resync_lock, - raid10_unplug(conf->mddev->queue)); + ({ flush_pending_writes(conf); + raid10_unplug(conf->mddev->queue); })); spin_unlock_irq(&conf->resync_lock); } @@ -738,21 +788,24 @@ static void unfreeze_array(conf_t *conf) spin_unlock_irq(&conf->resync_lock); } -static int make_request(request_queue_t *q, struct bio * bio) +static int make_request(struct request_queue *q, struct bio * bio) { mddev_t *mddev = q->queuedata; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; mirror_info_t *mirror; r10bio_t *r10_bio; struct bio *read_bio; + int cpu; int i; int chunk_sects = conf->chunk_mask + 1; const int rw = bio_data_dir(bio); + const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); struct bio_list bl; unsigned long flags; + mdk_rdev_t *blocked_rdev; - if (unlikely(bio_barrier(bio))) { - bio_endio(bio, bio->bi_size, -EOPNOTSUPP); + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { + md_barrier_request(mddev, bio); return 0; } @@ -770,7 +823,7 @@ static int make_request(request_queue_t *q, struct bio * bio) /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. 
*/ - bp = bio_split(bio, bio_split_pool, + bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); if (make_request(q, &bp->bio1)) generic_make_request(&bp->bio1); @@ -784,7 +837,7 @@ static int make_request(request_queue_t *q, struct bio * bio) " or bigger than %dk %llu %d\n", chunk_sects/2, (unsigned long long)bio->bi_sector, bio->bi_size >> 10); - bio_io_error(bio, bio->bi_size); + bio_io_error(bio); return 0; } @@ -797,8 +850,11 @@ static int make_request(request_queue_t *q, struct bio * bio) */ wait_barrier(conf); - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bio)); + part_stat_unlock(); r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); @@ -829,7 +885,7 @@ static int make_request(request_queue_t *q, struct bio * bio) mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid10_end_read_request; - read_bio->bi_rw = READ; + read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); read_bio->bi_private = r10_bio; generic_make_request(read_bio); @@ -839,17 +895,23 @@ static int make_request(request_queue_t *q, struct bio * bio) /* * WRITE: */ - /* first select target devices under spinlock and + /* first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio */ raid10_find_phys(conf, r10_bio); + retry_write: + blocked_rdev = NULL; rcu_read_lock(); for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - !test_bit(Faulty, &rdev->flags)) { + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); r10_bio->devs[i].bio = bio; } else { @@ -859,6 +921,22 @@ static int make_request(request_queue_t *q, struct bio * bio) } rcu_read_unlock(); + if (unlikely(blocked_rdev)) { + /* Have to wait for this device to get unblocked, then retry */ + int j; + int d; + + for (j = 0; j < i; j++) + if (r10_bio->devs[j].bio) { + d = r10_bio->devs[j].devnum; + rdev_dec_pending(conf->mirrors[d].rdev, mddev); + } + allow_barrier(conf); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_write; + } + atomic_set(&r10_bio->remaining, 0); bio_list_init(&bl); @@ -875,36 +953,52 @@ static int make_request(request_queue_t *q, struct bio * bio) conf->mirrors[d].rdev->data_offset; mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE; + mbio->bi_rw = WRITE | (do_sync << BIO_RW_SYNCIO); mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); bio_list_add(&bl, mbio); } + if (unlikely(!atomic_read(&r10_bio->remaining))) { + /* the array is dead */ + md_write_end(mddev); + raid_end_bio_io(r10_bio); + return 0; + } + bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); spin_lock_irqsave(&conf->device_lock, flags); bio_list_merge(&conf->pending_bio_list, &bl); blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); + /* In case raid10d snuck in to freeze_array */ + wake_up(&conf->wait_barrier); + + if (do_sync) + md_wakeup_thread(mddev->thread); + return 0; } static void status(struct seq_file *seq, mddev_t *mddev) { - conf_t *conf = 
mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i; if (conf->near_copies < conf->raid_disks) - seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); + seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); if (conf->near_copies > 1) seq_printf(seq, " %d near-copies", conf->near_copies); - if (conf->far_copies > 1) - seq_printf(seq, " %d far-copies", conf->far_copies); - + if (conf->far_copies > 1) { + if (conf->far_offset) + seq_printf(seq, " %d offset-copies", conf->far_copies); + else + seq_printf(seq, " %d far-copies", conf->far_copies); + } seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); + conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf(seq, "%s", conf->mirrors[i].rdev && @@ -915,7 +1009,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; /* * If it is not operational, then we have already marked it as dead @@ -924,7 +1018,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && conf->working_disks == 1) + && conf->raid_disks-mddev->degraded == 1) /* * Don't fail the drive, just return an IO error. * The test should really be more sophisticated than @@ -933,20 +1027,21 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * really dead" tests... */ return; - if (test_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->working_disks--; + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } - clear_bit(In_sync, &rdev->flags); set_bit(Faulty, &rdev->flags); - mddev->sb_dirty = 1; - printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. 
\n" - " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n" + "raid10: Operation continuing on %d devices.\n", + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) @@ -959,7 +1054,7 @@ static void print_conf(conf_t *conf) printk("(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); for (i = 0; i < conf->raid_disks; i++) { @@ -1017,10 +1112,11 @@ static int raid10_spare_active(mddev_t *mddev) tmp = conf->mirrors + i; if (tmp->rdev && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { - conf->working_disks++; + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - set_bit(In_sync, &tmp->rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } @@ -1032,47 +1128,58 @@ static int raid10_spare_active(mddev_t *mddev) static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int mirror; mirror_info_t *p; + int first = 0; + int last = mddev->raid_disks - 1; if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ - return 0; + return -EBUSY; if (!enough(conf)) - return 0; + return -EINVAL; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; else - mirror = 0; - for ( ; mirror < mddev->raid_disks; mirror++) + mirror = first; + for ( ; mirror <= last ; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + /* as we don't honour merge_bvec_fn, we must + * never risk violating it, so limit + * ->max_segments to one lying with a single + * page, as a one page request is never in + * violation. */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } p->head_position = 0; rdev->raid_disk = mirror; - found = 1; + err = 0; if (rdev->saved_raid_disk != mirror) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); break; } + md_integrity_add_rdev(rdev, mddev); print_conf(conf); - return found; + return err; } static int raid10_remove_disk(mddev_t *mddev, int number) @@ -1090,13 +1197,23 @@ static int raid10_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove faulty devices in recovery + * is not possible. 
+ */ + if (!test_bit(Faulty, &rdev->flags) && + enough(conf)) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { /* lost the race, try later */ err = -EBUSY; p->rdev = rdev; + goto abort; } + md_integrity_register(mddev); } abort: @@ -1105,15 +1222,12 @@ abort: } -static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) +static void end_sync_read(struct bio *bio, int error) { r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); - conf_t *conf = mddev_to_conf(r10_bio->mddev); + conf_t *conf = r10_bio->mddev->private; int i,d; - if (bio->bi_size) - return 1; - for (i=0; icopies; i++) if (r10_bio->devs[i].bio == bio) break; @@ -1134,6 +1248,7 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) /* for reconstruct, we always reschedule after a read. * for resync, only after all reads */ + rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); if (test_bit(R10BIO_IsRecover, &r10_bio->state) || atomic_dec_and_test(&r10_bio->remaining)) { /* we have read all the blocks, @@ -1141,21 +1256,16 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) */ reschedule_retry(r10_bio); } - rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); - return 0; } -static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) +static void end_sync_write(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); mddev_t *mddev = r10_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i,d; - if (bio->bi_size) - return 1; - for (i = 0; i < conf->copies; i++) if (r10_bio->devs[i].bio == bio) break; @@ -1163,13 +1273,16 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) if (!uptodate) md_error(mddev, conf->mirrors[d].rdev); + update_head_pos(i, r10_bio); + rdev_dec_pending(conf->mirrors[d].rdev, mddev); while (atomic_dec_and_test(&r10_bio->remaining)) { if (r10_bio->master_bio == NULL) { /* the primary of several recovery bios */ - md_done_sync(mddev, r10_bio->sectors, 1); + sector_t s = r10_bio->sectors; put_buf(r10_bio); + md_done_sync(mddev, s, 1); break; } else { r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; @@ -1177,8 +1290,6 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) r10_bio = r10_bio2; } } - rdev_dec_pending(conf->mirrors[d].rdev, mddev); - return 0; } /* @@ -1199,7 +1310,7 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) */ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i, first; struct bio *tbio, *fbio; @@ -1252,9 +1363,6 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) tbio->bi_size = r10_bio->sectors << 9; tbio->bi_idx = 0; tbio->bi_phys_segments = 0; - tbio->bi_hw_segments = 0; - tbio->bi_hw_front_size = 0; - tbio->bi_hw_back_size = 0; tbio->bi_flags &= ~(BIO_POOL_MASK - 1); tbio->bi_flags |= 1 << BIO_UPTODATE; tbio->bi_next = NULL; @@ -1302,7 +1410,7 @@ done: static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; int i, d; struct bio *bio, *wbio; @@ -1324,24 +1432,234 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) if (test_bit(R10BIO_Uptodate, &r10_bio->state)) generic_make_request(wbio); else - bio_endio(wbio, 
wbio->bi_size, -EIO); + bio_endio(wbio, -EIO); } /* + * Used by fix_read_error() to decay the per rdev read_errors. + * We halve the read error count for every hour that has elapsed + * since the last recorded read error. + * + */ +static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) +{ + struct timespec cur_time_mon; + unsigned long hours_since_last; + unsigned int read_errors = atomic_read(&rdev->read_errors); + + ktime_get_ts(&cur_time_mon); + + if (rdev->last_read_error.tv_sec == 0 && + rdev->last_read_error.tv_nsec == 0) { + /* first time we've seen a read error */ + rdev->last_read_error = cur_time_mon; + return; + } + + hours_since_last = (cur_time_mon.tv_sec - + rdev->last_read_error.tv_sec) / 3600; + + rdev->last_read_error = cur_time_mon; + + /* + * if hours_since_last is > the number of bits in read_errors + * just set read errors to 0. We do this to avoid + * overflowing the shift of read_errors by hours_since_last. + */ + if (hours_since_last >= 8 * sizeof(read_errors)) + atomic_set(&rdev->read_errors, 0); + else + atomic_set(&rdev->read_errors, read_errors >> hours_since_last); +} + +/* * This is a kernel thread which: * * 1. Retries failed read operations on working mirrors. * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array syncronising. + * 3. Performs writes following reads for array synchronising. */ +static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) +{ + int sect = 0; /* Offset from r10_bio->sector */ + int sectors = r10_bio->sectors; + mdk_rdev_t*rdev; + int max_read_errors = atomic_read(&mddev->max_corr_read_errors); + + rcu_read_lock(); + { + int d = r10_bio->devs[r10_bio->read_slot].devnum; + char b[BDEVNAME_SIZE]; + int cur_read_error_count = 0; + + rdev = rcu_dereference(conf->mirrors[d].rdev); + bdevname(rdev->bdev, b); + + if (test_bit(Faulty, &rdev->flags)) { + rcu_read_unlock(); + /* drive has already been failed, just ignore any + more fix_read_error() attempts */ + return; + } + + check_decay_read_errors(mddev, rdev); + atomic_inc(&rdev->read_errors); + cur_read_error_count = atomic_read(&rdev->read_errors); + if (cur_read_error_count > max_read_errors) { + rcu_read_unlock(); + printk(KERN_NOTICE + "raid10: %s: Raid device exceeded " + "read_error threshold " + "[cur %d:max %d]\n", + b, cur_read_error_count, max_read_errors); + printk(KERN_NOTICE + "raid10: %s: Failing raid " + "device\n", b); + md_error(mddev, conf->mirrors[d].rdev); + return; + } + } + rcu_read_unlock(); + + while(sectors) { + int s = sectors; + int sl = r10_bio->read_slot; + int success = 0; + int start; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + rcu_read_lock(); + do { + int d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + success = sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, + conf->tmppage, READ); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + if (success) + break; + } + sl++; + if (sl == conf->copies) + sl = 0; + } while (!success && sl != r10_bio->read_slot); + rcu_read_unlock(); + + if (!success) { + /* Cannot read from anywhere -- bye bye array */ + int dn = r10_bio->devs[r10_bio->read_slot].devnum; + md_error(mddev, conf->mirrors[dn].rdev); + break; + } + + start = sl; + /* write it back and re-read */ + rcu_read_lock(); + while (sl != r10_bio->read_slot) { + char b[BDEVNAME_SIZE]; + int 
d; + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + atomic_add(s, &rdev->corrected_errors); + if (sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, conf->tmppage, WRITE) + == 0) { + /* Well, this device is dead */ + printk(KERN_NOTICE + "raid10:%s: read correction " + "write failed" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)(sect+ + rdev->data_offset), + bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "raid10:%s: failing " + "drive\n", + bdevname(rdev->bdev, b)); + md_error(mddev, rdev); + } + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + } + sl = start; + while (sl != r10_bio->read_slot) { + int d; + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, conf->tmppage, + READ) == 0) { + /* Well, this device is dead */ + printk(KERN_NOTICE + "raid10:%s: unable to read back " + "corrected sectors" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)(sect+ + rdev->data_offset), + bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "raid10:%s: failing drive\n", + bdevname(rdev->bdev, b)); + + md_error(mddev, rdev); + } else { + printk(KERN_INFO + "raid10:%s: read error corrected" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)(sect+ + rdev->data_offset), + bdevname(rdev->bdev, b)); + } + + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + } + } + rcu_read_unlock(); + + sectors -= s; + sect += s; + } +} + static void raid10d(mddev_t *mddev) { r10bio_t *r10_bio; struct bio *bio; unsigned long flags; - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; int unplug=0; mdk_rdev_t *rdev; @@ -1350,36 +1668,21 @@ static void raid10d(mddev_t *mddev) for (;;) { char b[BDEVNAME_SIZE]; - spin_lock_irqsave(&conf->device_lock, flags); - if (conf->pending_bio_list.head) { - bio = bio_list_get(&conf->pending_bio_list); - blk_remove_plug(mddev->queue); - spin_unlock_irqrestore(&conf->device_lock, flags); - /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - if (bitmap_unplug(mddev->bitmap) != 0) - printk("%s: bitmap file write failed!\n", mdname(mddev)); + unplug += flush_pending_writes(conf); - while (bio) { /* submit pending writes */ - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - generic_make_request(bio); - bio = next; - } - unplug = 1; - - continue; - } - - if (list_empty(head)) + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&conf->device_lock, flags); break; + } r10_bio = list_entry(head->prev, r10bio_t, retry_list); list_del(head->prev); conf->nr_queued--; spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r10_bio->mddev; - conf = mddev_to_conf(mddev); + conf = mddev->private; if (test_bit(R10BIO_IsSync, &r10_bio->state)) { sync_request_write(mddev, r10_bio); unplug = 1; @@ -1396,105 +1699,15 @@ static void raid10d(mddev_t *mddev) * This is all done synchronously while the array is * frozen. 
*/ - int sect = 0; /* Offset from r10_bio->sector */ - int sectors = r10_bio->sectors; - freeze_array(conf); - if (mddev->ro == 0) while(sectors) { - int s = sectors; - int sl = r10_bio->read_slot; - int success = 0; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - rcu_read_lock(); - do { - int d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - success = sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - if (success) - break; - } - sl++; - if (sl == conf->copies) - sl = 0; - } while (!success && sl != r10_bio->read_slot); - rcu_read_unlock(); - - if (success) { - int start = sl; - /* write it back and re-read */ - rcu_read_lock(); - while (sl != r10_bio->read_slot) { - int d; - if (sl==0) - sl = conf->copies; - sl--; - d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - atomic_add(s, &rdev->corrected_errors); - if (sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - sl = start; - while (sl != r10_bio->read_slot) { - int d; - if (sl==0) - sl = conf->copies; - sl--; - d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - if (sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - rcu_read_unlock(); - } else { - /* Cannot read from anywhere -- bye bye array */ - md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev); - break; - } - sectors -= s; - sect += s; + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, mddev, r10_bio); + unfreeze_array(conf); } - unfreeze_array(conf); - bio = r10_bio->devs[r10_bio->read_slot].bio; r10_bio->devs[r10_bio->read_slot].bio = mddev->ro ? 
IO_BLOCKED : NULL; - bio_put(bio); mirror = read_balance(conf, r10_bio); if (mirror == -1) { printk(KERN_ALERT "raid10: %s: unrecoverable I/O" @@ -1502,7 +1715,10 @@ static void raid10d(mddev_t *mddev) bdevname(bio->bi_bdev,b), (unsigned long long)r10_bio->sector); raid_end_bio_io(r10_bio); + bio_put(bio); } else { + const bool do_sync = bio_rw_flagged(r10_bio->master_bio, BIO_RW_SYNCIO); + bio_put(bio); rdev = conf->mirrors[mirror].rdev; if (printk_ratelimit()) printk(KERN_ERR "raid10: %s: redirecting sector %llu to" @@ -1514,15 +1730,15 @@ static void raid10d(mddev_t *mddev) bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset; bio->bi_bdev = rdev->bdev; - bio->bi_rw = READ; + bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO); bio->bi_private = r10_bio; bio->bi_end_io = raid10_end_read_request; unplug = 1; generic_make_request(bio); } } + cond_resched(); } - spin_unlock_irqrestore(&conf->device_lock, flags); if (unplug) unplug_slaves(mddev); } @@ -1575,7 +1791,7 @@ static int init_resync(conf_t *conf) static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; r10bio_t *r10_bio; struct bio *biolist = NULL, *bio; sector_t max_sector, nr_sectors; @@ -1592,7 +1808,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; skipped: - max_sector = mddev->size << 1; + max_sector = mddev->dev_sectors; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { @@ -1631,6 +1847,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return (max_sector - sector_nr) + sectors_skipped; } + if (max_sector > mddev->resync_max) + max_sector = mddev->resync_max; /* Don't do IO beyond here */ + /* make sure whole request will fit in a chunk - if chunks * are meaningful */ @@ -1662,7 +1881,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* recovery... 
the complicated one */ - int i, j, k; + int j, k; r10_bio = NULL; for (i=0 ; iraid_disks; i++) @@ -1701,17 +1920,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i r10_bio->sector = sect; raid10_find_phys(conf, r10_bio); - /* Need to check if this section will still be + + /* Need to check if the array will still be * degraded */ - for (j=0; jcopies;j++) { - int d = r10_bio->devs[j].devnum; - if (conf->mirrors[d].rdev == NULL || - test_bit(Faulty, &conf->mirrors[d].rdev->flags)) { + for (j=0; jraid_disks; j++) + if (conf->mirrors[j].rdev == NULL || + test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { still_degraded = 1; break; } - } + must_sync = bitmap_start_sync(mddev->bitmap, sect, &sync_blocks, still_degraded); @@ -1725,7 +1944,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i biolist = bio; bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; - bio->bi_rw = 0; + bio->bi_rw = READ; bio->bi_sector = r10_bio->devs[j].addr + conf->mirrors[d].rdev->data_offset; bio->bi_bdev = conf->mirrors[d].rdev->bdev; @@ -1736,12 +1955,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i for (k=0; kcopies; k++) if (r10_bio->devs[k].devnum == i) break; + BUG_ON(k == conf->copies); bio = r10_bio->devs[1].bio; bio->bi_next = biolist; biolist = bio; bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; - bio->bi_rw = 1; + bio->bi_rw = WRITE; bio->bi_sector = r10_bio->devs[k].addr + conf->mirrors[i].rdev->data_offset; bio->bi_bdev = conf->mirrors[i].rdev->bdev; @@ -1755,8 +1975,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (j == conf->copies) { /* Cannot recover, so abort the recovery */ put_buf(r10_bio); + if (rb2) + atomic_dec(&rb2->remaining); r10_bio = rb2; - if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery)) + if (!test_and_set_bit(MD_RECOVERY_INTR, + &mddev->recovery)) printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", mdname(mddev)); break; @@ -1775,6 +1998,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* resync. Schedule a read for every block at this virt offset */ int count = 0; + bitmap_cond_end_sync(mddev->bitmap, sector_nr); + if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { @@ -1801,6 +2026,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i int d = r10_bio->devs[i].devnum; bio = r10_bio->devs[i].bio; bio->bi_end_io = NULL; + clear_bit(BIO_UPTODATE, &bio->bi_flags); if (conf->mirrors[d].rdev == NULL || test_bit(Faulty, &conf->mirrors[d].rdev->flags)) continue; @@ -1810,7 +2036,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i biolist = bio; bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; - bio->bi_rw = 0; + bio->bi_rw = READ; bio->bi_sector = r10_bio->devs[i].addr + conf->mirrors[d].rdev->data_offset; bio->bi_bdev = conf->mirrors[d].rdev->bdev; @@ -1837,7 +2063,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i bio->bi_vcnt = 0; bio->bi_idx = 0; bio->bi_phys_segments = 0; - bio->bi_hw_segments = 0; bio->bi_size = 0; } @@ -1899,34 +2124,55 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* There is nowhere to write, so all non-sync * drives must be failed, so try the next chunk... 
*/ - { - sector_t sec = max_sector - sector_nr; - sectors_skipped += sec; + if (sector_nr + max_sync < max_sector) + max_sector = sector_nr + max_sync; + + sectors_skipped += (max_sector - sector_nr); chunks_skipped ++; sector_nr = max_sector; goto skipped; - } +} + +static sector_t +raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) +{ + sector_t size; + conf_t *conf = mddev->private; + + if (!raid_disks) + raid_disks = mddev->raid_disks; + if (!sectors) + sectors = mddev->dev_sectors; + + size = sectors >> conf->chunk_shift; + sector_div(size, conf->far_copies); + size = size * raid_disks; + sector_div(size, conf->near_copies); + + return size << conf->chunk_shift; } static int run(mddev_t *mddev) { conf_t *conf; - int i, disk_idx; + int i, disk_idx, chunk_size; mirror_info_t *disk; mdk_rdev_t *rdev; - struct list_head *tmp; - int nc, fc; + int nc, fc, fo; sector_t stride, size; - if (mddev->chunk_size == 0) { - printk(KERN_ERR "md/raid10: non-zero chunk size required.\n"); + if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || + !is_power_of_2(mddev->chunk_sectors)) { + printk(KERN_ERR "md/raid10: chunk size must be " + "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); return -EINVAL; } nc = mddev->layout & 255; fc = (mddev->layout >> 8) & 255; + fo = mddev->layout & (1<<16); if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || - (mddev->layout >> 16)) { + (mddev->layout >> 17)) { printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", mdname(mddev), mddev->layout); goto out; @@ -1955,13 +2201,32 @@ static int run(mddev_t *mddev) if (!conf->tmppage) goto out_free_conf; + conf->raid_disks = mddev->raid_disks; conf->near_copies = nc; conf->far_copies = fc; conf->copies = nc*fc; - conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; - conf->chunk_shift = ffz(~mddev->chunk_size) - 9; - stride = mddev->size >> (conf->chunk_shift-1); - sector_div(stride, fc); + conf->far_offset = fo; + conf->chunk_mask = mddev->chunk_sectors - 1; + conf->chunk_shift = ffz(~mddev->chunk_sectors); + size = mddev->dev_sectors >> conf->chunk_shift; + sector_div(size, fc); + size = size * conf->raid_disks; + sector_div(size, nc); + /* 'size' is now the number of chunks in the array */ + /* calculate "used chunks per device" in 'stride' */ + stride = size * conf->copies; + + /* We need to round up when dividing by raid_disks to + * get the stride size. 
+ */ + stride += conf->raid_disks - 1; + sector_div(stride, conf->raid_disks); + mddev->dev_sectors = stride << conf->chunk_shift; + + if (fo) + stride = 1; + else + sector_div(stride, fc); conf->stride = stride << conf->chunk_shift; conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, @@ -1972,7 +2237,19 @@ static int run(mddev_t *mddev) goto out_free_conf; } - ITERATE_RDEV(mddev, rdev, tmp) { + conf->mddev = mddev; + spin_lock_init(&conf->device_lock); + mddev->queue->queue_lock = &conf->device_lock; + + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + if (conf->raid_disks % conf->near_copies) + blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); + else + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->raid_disks / conf->near_copies)); + + list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) @@ -1980,24 +2257,20 @@ static int run(mddev_t *mddev) disk = conf->mirrors + disk_idx; disk->rdev = rdev; - - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. + * violating it, so limit max_segments to 1 lying + * within a single page. */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } disk->head_position = 0; - if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) - conf->working_disks++; } - conf->raid_disks = mddev->raid_disks; - conf->mddev = mddev; - spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); spin_lock_init(&conf->resync_lock); @@ -2015,14 +2288,17 @@ static int run(mddev_t *mddev) disk = conf->mirrors + i; - if (!disk->rdev) { + if (!disk->rdev || + !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; + if (disk->rdev) + conf->fullsync = 1; } } - mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10"); + mddev->thread = md_register_thread(raid10d, mddev, NULL); if (!mddev->thread) { printk(KERN_ERR "raid10: couldn't allocate thread for %s\n", @@ -2030,6 +2306,10 @@ static int run(mddev_t *mddev) goto out_free_conf; } + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "raid10: %s is not clean" + " -- starting background reconstruction\n", + mdname(mddev)); printk(KERN_INFO "raid10: raid set %s active with %d out of %d devices\n", mdname(mddev), mddev->raid_disks - mddev->degraded, @@ -2037,20 +2317,20 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - size = conf->stride * conf->raid_disks; - sector_div(size, conf->near_copies); - mddev->array_size = size/2; - mddev->resync_max_sectors = size; + md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); + mddev->resync_max_sectors = raid10_size(mddev, 0, 0); mddev->queue->unplug_fn = raid10_unplug; - mddev->queue->issue_flush_fn = raid10_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid10_congested; + mddev->queue->backing_dev_info.congested_data = mddev; /* Calculate max read-ahead size. * We need to readahead at least twice a whole stripe.... * maybe... 
*/ { - int stripe = conf->raid_disks * mddev->chunk_size / PAGE_SIZE; + int stripe = conf->raid_disks * + ((mddev->chunk_sectors << 9) / PAGE_SIZE); stripe /= conf->near_copies; if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) mddev->queue->backing_dev_info.ra_pages = 2* stripe; @@ -2058,6 +2338,7 @@ static int run(mddev_t *mddev) if (conf->near_copies < mddev->raid_disks) blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); + md_integrity_register(mddev); return 0; out_free_conf: @@ -2073,7 +2354,10 @@ out: static int stop(mddev_t *mddev) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; + + raise_barrier(conf, 0); + lower_barrier(conf); md_unregister_thread(mddev->thread); mddev->thread = NULL; @@ -2088,7 +2372,7 @@ static int stop(mddev_t *mddev) static void raid10_quiesce(mddev_t *mddev, int state) { - conf_t *conf = mddev_to_conf(mddev); + conf_t *conf = mddev->private; switch(state) { case 1: @@ -2098,13 +2382,6 @@ static void raid10_quiesce(mddev_t *mddev, int state) lower_barrier(conf); break; } - if (mddev->thread) { - if (mddev->bitmap) - mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; - else - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - md_wakeup_thread(mddev->thread); - } } static struct mdk_personality raid10_personality = @@ -2122,6 +2399,7 @@ static struct mdk_personality raid10_personality = .spare_active = raid10_spare_active, .sync_request = sync_request, .quiesce = raid10_quiesce, + .size = raid10_size, }; static int __init raid_init(void) @@ -2137,6 +2415,7 @@ static void raid_exit(void) module_init(raid_init); module_exit(raid_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); MODULE_ALIAS("md-personality-9"); /* RAID10 */ MODULE_ALIAS("md-raid10"); MODULE_ALIAS("md-level-10");
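
Notes on the patch follow; none of this is part of the diff itself.

The heart of this patch is the new far_offset geometry, and the mapping arithmetic in raid10_find_phys()/raid10_find_virt() is easiest to convince yourself of outside the kernel. Below is a minimal userspace model of that arithmetic — a sketch, not driver code: struct geo, find_phys(), find_virt() and chunks_per_dev are made-up names, everything is in whole chunks rather than sectors, and the copy loop of raid10_find_phys() is flattened into explicit (near, far) indices. It exercises both branches of the reverse map, including the new `stripe *= far_copies` forward step and the `>=` stride loop, and asserts that every copy of every chunk round-trips:

#include <assert.h>
#include <stdio.h>

/*
 * Userspace model of the raid10 chunk layout, including the new
 * far_offset mode (bit 16 of the layout word).  Illustrative only:
 * units are whole chunks, not sectors, and the names are not the
 * driver's.
 */
struct geo {
	int raid_disks;
	int nc;			/* near_copies */
	int fc;			/* far_copies  */
	int fo;			/* far_offset  */
	long stride;		/* chunks between successive far copies */
	long chunks_per_dev;
};

/* Forward map, after raid10_find_phys(): virtual chunk 'v', near copy
 * 'n' (0..nc-1), far copy 'f' (0..fc-1) -> (device, chunk-on-device). */
static void find_phys(const struct geo *g, long v, int n, int f,
		      int *dev, long *dchunk)
{
	long chunk = v * g->nc + n;
	long stripe = chunk / g->raid_disks;
	int d = (int)(chunk % g->raid_disks);

	if (g->fo)
		stripe *= g->fc;	/* the new far_offset step */

	*dev = (d + f * g->nc) % g->raid_disks;
	*dchunk = stripe + f * g->stride;
}

/* Reverse map, after the two branches of the new raid10_find_virt(). */
static long find_virt(const struct geo *g, long dchunk, int dev)
{
	long chunk;

	if (g->fo) {
		int f = (int)(dchunk % g->fc);
		chunk = dchunk / g->fc;
		dev -= f * g->nc;
		if (dev < 0)
			dev += g->raid_disks;
	} else {
		while (dchunk >= g->stride) {	/* note '>=', as in the patch */
			dchunk -= g->stride;
			if (dev < g->nc)
				dev += g->raid_disks - g->nc;
			else
				dev -= g->nc;
		}
		chunk = dchunk;
	}
	return (chunk * g->raid_disks + dev) / g->nc;
}

static void check(struct geo *g)
{
	/* chunks in the array, mirroring run()/raid10_size() */
	long size = g->chunks_per_dev * g->raid_disks / g->fc / g->nc;

	/* offset mode: the far copy is one stripe away; classic far
	 * mode: it is a whole section (chunks_per_dev/fc) away */
	g->stride = g->fo ? 1 : g->chunks_per_dev / g->fc;

	for (long v = 0; v < size; v++)
		for (int n = 0; n < g->nc; n++)
			for (int f = 0; f < g->fc; f++) {
				int dev;
				long dchunk;

				find_phys(g, v, n, f, &dev, &dchunk);
				assert(find_virt(g, dchunk, dev) == v);
			}
	printf("rd=%d nc=%d fc=%d fo=%d: %ld chunks round-trip ok\n",
	       g->raid_disks, g->nc, g->fc, g->fo, size);
}

int main(void)
{
	struct geo layouts[] = {
		{ .raid_disks = 4, .nc = 2, .fc = 1, .fo = 0, .chunks_per_dev = 16 },
		{ .raid_disks = 4, .nc = 1, .fc = 2, .fo = 0, .chunks_per_dev = 16 },
		{ .raid_disks = 4, .nc = 2, .fc = 2, .fo = 0, .chunks_per_dev = 16 },
		{ .raid_disks = 4, .nc = 1, .fc = 2, .fo = 1, .chunks_per_dev = 16 },
	};

	for (int i = 0; i < 4; i++)
		check(&layouts[i]);
	return 0;
}

The point of the offset mode is visible in check(): with far_offset the stride between copies is a single chunk, so the duplicate of each stripe lands in the next stripe, rotated by near_copies devices, instead of in a distant section of the disk — exactly what the updated layout comment describes.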
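
Another small but easy-to-misread addition is check_decay_read_errors(). Its rule, extracted into a pure function (decay_read_errors is a hypothetical name; the driver operates on an atomic_t in the rdev rather than a plain integer):

/* Halve the persistent read-error count for every full hour since the
 * last recorded error; once the shift would be as wide as the counter,
 * clear it outright to avoid shifting by >= the type width. */
static unsigned int decay_read_errors(unsigned int errors,
				      unsigned long hours_since_last)
{
	if (hours_since_last >= 8 * sizeof(errors))
		return 0;
	return errors >> hours_since_last;
}

So a device that logged 20 read errors and then ran cleanly for two hours is charged only 5 when the next error arrives; a device is failed by fix_read_error() only if it keeps erroring faster than the decay, pushing the count past max_corr_read_errors.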
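
The capacity calculation that used to live inline in run() is now the exported raid10_size() method; in plain units it is just two divisions. A sketch with a hypothetical helper name (the driver does the same steps with sector_div() on sector_t):

#include <stdio.h>

/* Usable size of a raid10 array: every chunk is stored
 * near_copies * far_copies times, so divide the raw capacity by both
 * copy counts, truncating to whole chunks first, as raid10_size() does. */
static long long raid10_capacity(long long dev_sectors, int raid_disks,
				 int near_copies, int far_copies,
				 int chunk_sectors)
{
	long long chunks = dev_sectors / chunk_sectors;

	chunks /= far_copies;
	chunks *= raid_disks;
	chunks /= near_copies;
	return chunks * chunk_sectors;
}

int main(void)
{
	/* 4 x 1 TiB components, 64 KiB chunks, near=2 far=2: 1 TiB usable */
	long long s = raid10_capacity(2147483648LL, 4, 2, 2, 128);

	printf("%lld sectors = %lld GiB usable\n", s, s / 2097152);
	return 0;
}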
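
One last arithmetic note on the reworked resync constants: the old hard-coded `#define RESYNC_DEPTH 32` is removed, and the depth is now derived as 32*1024*1024/RESYNC_BLOCK_SIZE = 32 MiB / 64 KiB = 512 concurrent requests, memory permitting, while RESYNC_WINDOW (now 1 MiB) still describes the memory reserved up front for resync requests.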