X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=drivers%2Fmd%2Fraid1.c;h=c165b1eed8bb37745d3d1284dc855eeb3b771c7a;hb=159ec1fc060ab22b157a62364045f5e98749c4d3;hp=ae7c15207df5034d2da1e33935f64893a17a1ab7;hpb=c620727779f7cc8ea96efb71f0651a26349e59c1;p=safe%2Fjmp%2Flinux-2.6 diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ae7c152..c165b1e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -32,6 +32,7 @@ */ #include "dm-bio-list.h" +#include #include #include @@ -592,6 +593,37 @@ static int raid1_congested(void *data, int bits) } +static int flush_pending_writes(conf_t *conf) +{ + /* Any writes that have been queued but are awaiting + * bitmap updates get flushed here. + * We return 1 if any requests were actually submitted. + */ + int rv = 0; + + spin_lock_irq(&conf->device_lock); + + if (conf->pending_bio_list.head) { + struct bio *bio; + bio = bio_list_get(&conf->pending_bio_list); + blk_remove_plug(conf->mddev->queue); + spin_unlock_irq(&conf->device_lock); + /* flush any pending bitmap writes to + * disk before proceeding w/ I/O */ + bitmap_unplug(conf->mddev->bitmap); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + rv = 1; + } else + spin_unlock_irq(&conf->device_lock); + return rv; +} + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -673,15 +705,23 @@ static void freeze_array(conf_t *conf) /* stop syncio and normal IO and wait for everything to * go quite. * We increment barrier and nr_waiting, and then - * wait until barrier+nr_pending match nr_queued+2 + * wait until nr_pending match nr_queued+1 + * This is called in the context of one normal IO request + * that has failed. Thus any sync request that might be pending + * will be blocked by nr_pending, and we need to wait for + * pending IO requests to complete or be queued for re-try. + * Thus the number queued (nr_queued) plus this request (1) + * must match the number of pending IOs (nr_pending) before + * we continue. */ spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, - conf->barrier+conf->nr_pending == conf->nr_queued+2, + conf->nr_pending == conf->nr_queued+1, conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + ({ flush_pending_writes(conf); + raid1_unplug(conf->mddev->queue); })); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(conf_t *conf) @@ -734,14 +774,14 @@ static int make_request(struct request_queue *q, struct bio * bio) r1bio_t *r1_bio; struct bio *read_bio; int i, targets = 0, disks; - mdk_rdev_t *rdev; - struct bitmap *bitmap = mddev->bitmap; + struct bitmap *bitmap; unsigned long flags; struct bio_list bl; struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); const int do_sync = bio_sync(bio); - int do_barriers; + int cpu, do_barriers; + mdk_rdev_t *blocked_rdev; /* * Register the new request and wait if the reconstruction @@ -763,8 +803,13 @@ static int make_request(struct request_queue *q, struct bio * bio) wait_barrier(conf); - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); + bitmap = mddev->bitmap; + + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bio)); + part_stat_unlock(); /* * make_request() can abort the operation when READA is being @@ -823,10 +868,17 @@ static int make_request(struct request_queue *q, struct bio * bio) first = 0; } #endif + retry_write: + blocked_rdev = NULL; rcu_read_lock(); for (i = 0; i < disks; i++) { - if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL && - !test_bit(Faulty, &rdev->flags)) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); if (test_bit(Faulty, &rdev->flags)) { rdev_dec_pending(rdev, mddev); @@ -839,6 +891,20 @@ static int make_request(struct request_queue *q, struct bio * bio) } rcu_read_unlock(); + if (unlikely(blocked_rdev)) { + /* Wait for this device to become unblocked */ + int j; + + for (j = 0; j < i; j++) + if (r1_bio->bios[j]) + rdev_dec_pending(conf->mirrors[j].rdev, mddev); + + allow_barrier(conf); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_write; + } + BUG_ON(targets == 0); /* we never fail the last device */ if (targets < conf->raid_disks) { @@ -907,6 +973,9 @@ static int make_request(struct request_queue *q, struct bio * bio) blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); + /* In case raid1d snuck into freeze_array */ + wake_up(&conf->wait_barrier); + if (do_sync) md_wakeup_thread(mddev->thread); #if 0 @@ -962,12 +1031,12 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery is running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" - " Operation continuing on %d devices\n", + printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" + "raid1: Operation continuing on %d devices.\n", bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } @@ -1035,11 +1104,16 @@ static int raid1_spare_active(mddev_t *mddev) static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int mirror = 0; mirror_info_t *p; + int first = 0; + int last = mddev->raid_disks - 1; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; - for (mirror=0; mirror < mddev->raid_disks; mirror++) + for (mirror = first; mirror <= last; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { blk_queue_stack_limits(mddev->queue, @@ -1054,7 +1128,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) p->head_position = 0; rdev->raid_disk = mirror; - found = 1; + err = 0; /* As all devices are equivalent, we don't need a full recovery * if this was recently any drive of the array */ @@ -1065,7 +1139,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) } print_conf(conf); - return found; + return err; } static int raid1_remove_disk(mddev_t *mddev, int number) @@ -1083,6 +1157,14 @@ static int raid1_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove non-faulty devices is recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->degraded < conf->raid_disks) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { @@ -1219,23 +1301,31 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) rdev_dec_pending(conf->mirrors[i].rdev, mddev); } else { /* fixup the bio for reuse */ + int size; sbio->bi_vcnt = vcnt; sbio->bi_size = r1_bio->sectors << 9; sbio->bi_idx = 0; sbio->bi_phys_segments = 0; - sbio->bi_hw_segments = 0; - sbio->bi_hw_front_size = 0; - sbio->bi_hw_back_size = 0; sbio->bi_flags &= ~(BIO_POOL_MASK - 1); sbio->bi_flags |= 1 << BIO_UPTODATE; sbio->bi_next = NULL; sbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; sbio->bi_bdev = conf->mirrors[i].rdev->bdev; - for (j = 0; j < vcnt ; j++) - memcpy(page_address(sbio->bi_io_vec[j].bv_page), + size = sbio->bi_size; + for (j = 0; j < vcnt ; j++) { + struct bio_vec *bi; + bi = &sbio->bi_io_vec[j]; + bi->bv_offset = 0; + if (size > PAGE_SIZE) + bi->bv_len = PAGE_SIZE; + else + bi->bv_len = size; + size -= PAGE_SIZE; + memcpy(page_address(bi->bv_page), page_address(pbio->bi_io_vec[j].bv_page), PAGE_SIZE); + } } } @@ -1473,28 +1563,14 @@ static void raid1d(mddev_t *mddev) for (;;) { char b[BDEVNAME_SIZE]; - spin_lock_irqsave(&conf->device_lock, flags); - - if (conf->pending_bio_list.head) { - bio = bio_list_get(&conf->pending_bio_list); - blk_remove_plug(mddev->queue); - spin_unlock_irqrestore(&conf->device_lock, flags); - /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - bitmap_unplug(mddev->bitmap); - - while (bio) { /* submit pending writes */ - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - generic_make_request(bio); - bio = next; - } - unplug = 1; - continue; - } + unplug += flush_pending_writes(conf); - if (list_empty(head)) + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&conf->device_lock, flags); break; + } r1_bio = list_entry(head->prev, r1bio_t, retry_list); list_del(head->prev); conf->nr_queued--; @@ -1590,7 +1666,6 @@ static void raid1d(mddev_t *mddev) } } } - spin_unlock_irqrestore(&conf->device_lock, flags); if (unplug) unplug_slaves(mddev); } @@ -1716,7 +1791,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i bio->bi_vcnt = 0; bio->bi_idx = 0; bio->bi_phys_segments = 0; - bio->bi_hw_segments = 0; bio->bi_size = 0; bio->bi_end_io = NULL; bio->bi_private = NULL; @@ -1845,7 +1919,6 @@ static int run(mddev_t *mddev) int i, j, disk_idx; mirror_info_t *disk; mdk_rdev_t *rdev; - struct list_head *tmp; if (mddev->level != 1) { printk("raid1: %s: raid level not set to mirroring (%d)\n", @@ -1887,7 +1960,10 @@ static int run(mddev_t *mddev) if (!conf->r1bio_pool) goto out_no_mem; - ITERATE_RDEV(mddev, rdev, tmp) { + spin_lock_init(&conf->device_lock); + mddev->queue->queue_lock = &conf->device_lock; + + list_for_each_entry(rdev, &mddev->disks, same_set) { disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) @@ -1910,7 +1986,6 @@ static int run(mddev_t *mddev) } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; - spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); spin_lock_init(&conf->resync_lock); @@ -1967,7 +2042,7 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - mddev->array_size = mddev->size; + mddev->array_sectors = mddev->size * 2; mddev->queue->unplug_fn = raid1_unplug; mddev->queue->backing_dev_info.congested_fn = raid1_congested; @@ -2029,14 +2104,15 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ - mddev->array_size = sectors>>1; - set_capacity(mddev->gendisk, mddev->array_size << 1); + mddev->array_sectors = sectors; + set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; - if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) { + if (mddev->array_sectors / 2 > mddev->size && + mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->size << 1; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } - mddev->size = mddev->array_size; + mddev->size = mddev->array_sectors / 2; mddev->resync_max_sectors = sectors; return 0; } @@ -2060,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev) conf_t *conf = mddev_to_conf(mddev); int cnt, raid_disks; unsigned long flags; - int d, d2; + int d, d2, err; /* Cannot change chunk_size, layout, or level */ if (mddev->chunk_size != mddev->new_chunk || @@ -2072,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev) return -EINVAL; } - md_allow_write(mddev); + err = md_allow_write(mddev); + if (err) + return err; raid_disks = mddev->raid_disks + mddev->delta_disks;