X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=drivers%2Fmd%2Fraid5.c;h=cac97080b2780dc2bb064ec8df7109518489e088;hb=2b7497f0e0a0b9cf21d822e427d5399b2056501a;hp=f96dea975fa50d94e805c40c46be82df7cfe80bc;hpb=e4d84909dd48b5e5806a5d18b881e1ca1610ba9b;p=safe%2Fjmp%2Flinux-2.6 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f96dea9..cac9708 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -63,6 +63,7 @@ #define STRIPE_SHIFT (PAGE_SHIFT - 9) #define STRIPE_SECTORS (STRIPE_SIZE>>9) #define IO_THRESHOLD 1 +#define BYPASS_THRESHOLD 1 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) #define HASH_MASK (NR_HASH - 1) @@ -93,6 +94,8 @@ #define __inline__ #endif +#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) + #if !RAID6_USE_EMPTY_ZERO_PAGE /* In .bss so it's zeroed */ const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); @@ -108,14 +111,11 @@ static void return_io(struct bio *return_bi) { struct bio *bi = return_bi; while (bi) { - int bytes = bi->bi_size; return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - bi->bi_end_io(bi, bytes, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); + bio_endio(bi, 0); bi = return_bi; } } @@ -373,19 +373,22 @@ static unsigned long get_stripe_work(struct stripe_head *sh) test_and_ack_op(STRIPE_OP_BIODRAIN, pending); test_and_ack_op(STRIPE_OP_POSTXOR, pending); test_and_ack_op(STRIPE_OP_CHECK, pending); - if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) - ack++; sh->ops.count -= ack; - BUG_ON(sh->ops.count < 0); + if (unlikely(sh->ops.count < 0)) { + printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx " + "ops.complete: %#lx\n", pending, sh->ops.pending, + sh->ops.ack, sh->ops.complete); + BUG(); + } return pending; } -static int -raid5_end_read_request(struct bio *bi, unsigned int bytes_done, int error); -static int -raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error); +static void +raid5_end_read_request(struct bio *bi, int error); +static void +raid5_end_write_request(struct bio *bi, int error); static void ops_run_io(struct stripe_head *sh) { @@ -427,9 +430,11 @@ static void ops_run_io(struct stripe_head *sh) test_bit(STRIPE_EXPAND_READY, &sh->state)) md_sync_acct(rdev->bdev, STRIPE_SECTORS); + set_bit(STRIPE_IO_STARTED, &sh->state); + bi->bi_bdev = rdev->bdev; pr_debug("%s: for %llu schedule op %ld on disc %d\n", - __FUNCTION__, (unsigned long long)sh->sector, + __func__, (unsigned long long)sh->sector, bi->bi_rw, i); atomic_inc(&sh->count); bi->bi_sector = sh->sector + rdev->data_offset; @@ -516,7 +521,7 @@ static void ops_complete_biofill(void *stripe_head_ref) raid5_conf_t *conf = sh->raid_conf; int i; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); /* clear completed biofills */ @@ -551,8 +556,7 @@ static void ops_complete_biofill(void *stripe_head_ref) } } } - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); + set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); return_io(return_bi); @@ -566,7 +570,7 @@ static void ops_run_biofill(struct stripe_head *sh) raid5_conf_t *conf = sh->raid_conf; int i; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = sh->disks; i--; ) { @@ -597,7 +601,7 @@ static void ops_complete_compute5(void *stripe_head_ref) int target = sh->ops.target; struct r5dev *tgt = &sh->dev[target]; - pr_debug("%s: stripe %llu\n", 
__FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); set_bit(R5_UPTODATE, &tgt->flags); @@ -622,7 +626,7 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending) int i; pr_debug("%s: stripe %llu block: %d\n", - __FUNCTION__, (unsigned long long)sh->sector, target); + __func__, (unsigned long long)sh->sector, target); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); for (i = disks; i--; ) @@ -650,7 +654,7 @@ static void ops_complete_prexor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); @@ -667,7 +671,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) /* existing parity data subtracted */ struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { @@ -685,7 +689,8 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) } static struct dma_async_tx_descriptor * -ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) +ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, + unsigned long pending) { int disks = sh->disks; int pd_idx = sh->pd_idx, i; @@ -693,9 +698,9 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) /* check if prexor is active which means only process blocks * that are part of a read-modify-write (Wantprexor) */ - int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending); + int prexor = test_bit(STRIPE_OP_PREXOR, &pending); - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { @@ -740,7 +745,7 @@ static void ops_complete_postxor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); @@ -753,7 +758,7 @@ static void ops_complete_write(void *stripe_head_ref) struct stripe_head *sh = stripe_head_ref; int disks = sh->disks, i, pd_idx = sh->pd_idx; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { @@ -770,7 +775,8 @@ static void ops_complete_write(void *stripe_head_ref) } static void -ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) +ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, + unsigned long pending) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -778,11 +784,11 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; - int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending); + int prexor = test_bit(STRIPE_OP_PREXOR, &pending); unsigned long flags; dma_async_tx_callback callback; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); /* check if prexor is active which means only process blocks @@ -805,7 +811,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) } /* check whether this postxor is part of a write */ - callback = 
test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) ? + callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ? ops_complete_write : ops_complete_postxor; /* 1/ if we prexor'd then the dest is reused as a source @@ -830,15 +836,10 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) static void ops_complete_check(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - int pd_idx = sh->pd_idx; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && - sh->ops.zero_sum_result == 0) - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - set_bit(STRIPE_OP_CHECK, &sh->ops.complete); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -854,7 +855,7 @@ static void ops_run_check(struct stripe_head *sh) int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { @@ -866,11 +867,6 @@ static void ops_run_check(struct stripe_head *sh) tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); - if (tx) - set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - else - clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - atomic_inc(&sh->count); tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, ops_complete_check, sh); @@ -893,19 +889,16 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) tx = ops_run_prexor(sh, tx); if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { - tx = ops_run_biodrain(sh, tx); + tx = ops_run_biodrain(sh, tx, pending); overlap_clear++; } if (test_bit(STRIPE_OP_POSTXOR, &pending)) - ops_run_postxor(sh, tx); + ops_run_postxor(sh, tx, pending); if (test_bit(STRIPE_OP_CHECK, &pending)) ops_run_check(sh); - if (test_bit(STRIPE_OP_IO, &pending)) - ops_run_io(sh); - if (overlap_clear) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1110,8 +1103,7 @@ static void shrink_stripes(raid5_conf_t *conf) conf->slab_cache = NULL; } -static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, - int error) +static void raid5_end_read_request(struct bio * bi, int error) { struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; @@ -1120,8 +1112,6 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, char b[BDEVNAME_SIZE]; mdk_rdev_t *rdev; - if (bi->bi_size) - return 1; for (i=0 ; idev[i].req) @@ -1132,17 +1122,19 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, uptodate); if (i == disks) { BUG(); - return 0; + return; } if (uptodate) { set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; - printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)sh->sector + rdev->data_offset, - bdevname(rdev->bdev, b)); + printk_rl(KERN_INFO "raid5:%s: read error corrected" + " (%lu sectors at %llu on %s)\n", + mdname(conf->mddev), STRIPE_SECTORS, + (unsigned long long)(sh->sector + + rdev->data_offset), + bdevname(rdev->bdev, b)); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } @@ -1156,16 +1148,22 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, 
clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); if (conf->mddev->degraded) - printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)sh->sector + rdev->data_offset, - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error not correctable " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ - printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)sh->sector + rdev->data_offset, - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error NOT corrected!! " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING @@ -1185,20 +1183,15 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); - return 0; } -static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, - int error) +static void raid5_end_write_request (struct bio *bi, int error) { struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); - if (bi->bi_size) - return 1; - for (i=0 ; idev[i].req) break; @@ -1208,7 +1201,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, uptodate); if (i == disks) { BUG(); - return 0; + return; } if (!uptodate) @@ -1219,7 +1212,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); - return 0; } @@ -1260,12 +1252,12 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery was running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } set_bit(Faulty, &rdev->flags); printk (KERN_ALERT - "raid5: Disk failure on %s, disabling device." 
- " Operation continuing on %d devices\n", + "raid5: Disk failure on %s, disabling device.\n" + "raid5: Operation continuing on %d devices.\n", bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } } @@ -1724,6 +1716,9 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) locked++; } } + if (locked + 1 == disks) + if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) + atomic_inc(&sh->raid_conf->pending_full_writes); } else { BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); @@ -1763,7 +1758,7 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) locked++; pr_debug("%s: stripe %llu locked: %d pending: %lx\n", - __FUNCTION__, (unsigned long long)sh->sector, + __func__, (unsigned long long)sh->sector, locked, sh->ops.pending); return locked; @@ -1951,6 +1946,9 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, STRIPE_SECTORS, 0, 0); } + if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) + if (atomic_dec_and_test(&conf->pending_full_writes)) + md_wakeup_thread(conf->mddev->thread); } /* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks @@ -1988,6 +1986,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * have quiesced. */ if ((s->uptodate == disks - 1) && + (s->failed && disk_idx == s->failed_num) && !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); set_bit(R5_Wantcompute, &dev->flags); @@ -2010,8 +2009,6 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, */ set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; pr_debug("Reading block %d (sync=%d)\n", disk_idx, s->syncing); @@ -2073,7 +2070,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, /* we would like to get this block, possibly * by computing it, but we might not be able to */ - if (s->uptodate == disks-1) { + if ((s->uptodate == disks - 1) && + (s->failed && (i == r6s->failed_num[0] || + i == r6s->failed_num[1]))) { pr_debug("Computing stripe %llu block %d\n", (unsigned long long)sh->sector, i); compute_block_1(sh, i, 0); @@ -2153,6 +2152,10 @@ static void handle_completed_write_requests(raid5_conf_t *conf, 0); } } + + if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) + if (atomic_dec_and_test(&conf->pending_full_writes)) + md_wakeup_thread(conf->mddev->thread); } static void handle_issuing_new_write_requests5(raid5_conf_t *conf, @@ -2199,9 +2202,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2225,9 +2225,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2337,6 +2334,9 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, s->locked++; set_bit(R5_Wantwrite, &sh->dev[i].flags); } + if (s->locked == disks) + if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) + atomic_inc(&conf->pending_full_writes); /* after a 
RECONSTRUCT_WRITE, the stripe MUST be in-sync */ set_bit(STRIPE_INSYNC, &sh->state); @@ -2352,25 +2352,15 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { + int canceled_check = 0; + set_bit(STRIPE_HANDLE, &sh->state); - /* Take one of the following actions: - * 1/ start a check parity operation if (uptodate == disks) - * 2/ finish a check parity operation and act on the result - * 3/ skip to the writeback section if we previously - * initiated a recovery operation - */ - if (s->failed == 0 && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { - BUG_ON(s->uptodate != disks); - clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); - sh->ops.count++; - s->uptodate--; - } else if ( - test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { - clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); - clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + /* complete a check operation */ + if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { + clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); + clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + if (s->failed == 0) { if (sh->ops.zero_sum_result == 0) /* parity is correct (on disc, * not in buffer any more) @@ -2395,12 +2385,27 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, s->uptodate++; } } + } else + canceled_check = 1; /* STRIPE_INSYNC is not set */ + } + + /* start a new check operation if there are no failures, the stripe is + * not insync, and a repair is not in flight + */ + if (s->failed == 0 && + !test_bit(STRIPE_INSYNC, &sh->state) && + !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + BUG_ON(s->uptodate != disks); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + sh->ops.count++; + s->uptodate--; } } /* check if we can clear a parity disk reconstruct */ if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); @@ -2408,12 +2413,15 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); } + /* Wait for check parity and compute block operations to complete - * before write-back + * before write-back. 
If a failure occurred while the check operation + * was in flight we need to cycle this stripe through handle_stripe + * since the parity block may not be uptodate */ - if (!test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { + if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) && + !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && + !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { struct r5dev *dev; /* either failed parity check, or recovery is happening */ if (s->failed == 0) @@ -2424,8 +2432,6 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; clear_bit(STRIPE_DEGRADED, &sh->state); s->locked++; @@ -2589,6 +2595,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, } } + /* * handle_stripe - do things to a stripe. * @@ -2614,6 +2621,8 @@ static void handle_stripe5(struct stripe_head *sh) struct stripe_head_state s; struct r5dev *dev; unsigned long pending = 0; + mdk_rdev_t *blocked_rdev = NULL; + int prexor; memset(&s, 0, sizeof(s)); pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " @@ -2630,6 +2639,13 @@ static void handle_stripe5(struct stripe_head *sh) s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); /* Now to look around and see what can be done */ + /* clean-up completed biofill operations */ + if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) { + clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); + clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); + clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); + } + rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; @@ -2666,6 +2682,11 @@ static void handle_stripe5(struct stripe_head *sh) if (dev->written) s.written++; rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); + break; + } if (!rdev || !test_bit(In_sync, &rdev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); @@ -2680,6 +2701,11 @@ static void handle_stripe5(struct stripe_head *sh) } rcu_read_unlock(); + if (unlikely(blocked_rdev)) { + set_bit(STRIPE_HANDLE, &sh->state); + goto unlock; + } + if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) sh->ops.count++; @@ -2726,9 +2752,11 @@ static void handle_stripe5(struct stripe_head *sh) /* leave prexor set until postxor is done, allows us to distinguish * a rmw from a rcw during biodrain */ + prexor = 0; if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + prexor = 1; clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); @@ -2759,9 +2787,8 @@ static void handle_stripe5(struct stripe_head *sh) (i == sh->pd_idx || dev->written)) { pr_debug("Writing block %d\n", i); set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; + if (prexor) + continue; if (!test_bit(R5_Insync, &dev->flags) || (i == sh->pd_idx && s.failed == 0)) set_bit(STRIPE_INSYNC, &sh->state); @@ -2813,16 +2840,12 @@ static void handle_stripe5(struct stripe_head *sh) dev = &sh->dev[s.failed_num]; if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); - if 
(!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); s.locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; set_bit(R5_LOCKED, &dev->flags); s.locked++; } @@ -2840,11 +2863,10 @@ static void handle_stripe5(struct stripe_head *sh) clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - for (i = conf->raid_disks; i--; ) { + for (i = conf->raid_disks; i--; ) set_bit(R5_Wantwrite, &sh->dev[i].flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; - } + set_bit(R5_LOCKED, &dev->flags); + s.locked++; } if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && @@ -2855,6 +2877,7 @@ static void handle_stripe5(struct stripe_head *sh) conf->raid_disks); s.locked += handle_write_operations5(sh, 1, 1); } else if (s.expanded && + s.locked == 0 && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); @@ -2862,17 +2885,25 @@ static void handle_stripe5(struct stripe_head *sh) md_done_sync(conf->mddev, STRIPE_SECTORS, 1); } - if (s.expanding && s.locked == 0) + if (s.expanding && s.locked == 0 && + !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) handle_stripe_expansion(conf, sh, NULL); if (sh->ops.count) pending = get_stripe_work(sh); + unlock: spin_unlock(&sh->lock); + /* wait for this device to become unblocked */ + if (unlikely(blocked_rdev)) + md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); + if (pending) raid5_run_ops(sh, pending); + ops_run_io(sh); + return_io(return_bi); } @@ -2886,6 +2917,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) struct stripe_head_state s; struct r6_state r6s; struct r5dev *dev, *pdev, *qdev; + mdk_rdev_t *blocked_rdev = NULL; r6s.qd_idx = raid6_next_disk(pd_idx, disks); pr_debug("handling stripe %llu, state=%#lx cnt=%d, " @@ -2949,6 +2981,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (dev->written) s.written++; rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); + break; + } if (!rdev || !test_bit(In_sync, &rdev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); @@ -2963,6 +3000,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) set_bit(R5_Insync, &dev->flags); } rcu_read_unlock(); + + if (unlikely(blocked_rdev)) { + set_bit(STRIPE_HANDLE, &sh->state); + goto unlock; + } pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d,%d\n", s.locked, s.uptodate, s.to_read, s.to_write, s.failed, @@ -3064,11 +3106,17 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) md_done_sync(conf->mddev, STRIPE_SECTORS, 1); } - if (s.expanding && s.locked == 0) + if (s.expanding && s.locked == 0 && + !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) handle_stripe_expansion(conf, sh, &r6s); + unlock: spin_unlock(&sh->lock); + /* wait for this device to become unblocked */ + if (unlikely(blocked_rdev)) + md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); + return_io(return_bi); for (i=disks; i-- ;) { @@ -3082,6 +3130,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) else continue; + set_bit(STRIPE_IO_STARTED, &sh->state); + bi = 
&sh->dev[i].req; bi->bi_rw = rw; @@ -3152,9 +3202,10 @@ static void raid5_activate_delayed(raid5_conf_t *conf) clear_bit(STRIPE_DELAYED, &sh->state); if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); - list_add_tail(&sh->lru, &conf->handle_list); + list_add_tail(&sh->lru, &conf->hold_list); } - } + } else + blk_plug_device(conf->mddev->queue); } static void activate_bit_delay(raid5_conf_t *conf) @@ -3185,8 +3236,7 @@ static void unplug_slaves(mddev_t *mddev) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - if (r_queue->unplug_fn) - r_queue->unplug_fn(r_queue); + blk_unplug(r_queue); rdev_dec_pending(rdev, mddev); rcu_read_lock(); @@ -3214,36 +3264,6 @@ static void raid5_unplug_device(struct request_queue *q) unplug_slaves(mddev); } -static int raid5_issue_flush(struct request_queue *q, struct gendisk *disk, - sector_t *error_sector) -{ - mddev_t *mddev = q->queuedata; - raid5_conf_t *conf = mddev_to_conf(mddev); - int i, ret = 0; - - rcu_read_lock(); - for (i=0; iraid_disks && ret == 0; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct block_device *bdev = rdev->bdev; - struct request_queue *r_queue = bdev_get_queue(bdev); - - if (!r_queue->issue_flush_fn) - ret = -EOPNOTSUPP; - else { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, - error_sector); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - } - rcu_read_unlock(); - return ret; -} - static int raid5_congested(void *data, int bits) { mddev_t *mddev = data; @@ -3340,7 +3360,7 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf) * first). * If the read failed.. */ -static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error) +static void raid5_align_endio(struct bio *bi, int error) { struct bio* raid_bi = bi->bi_private; mddev_t *mddev; @@ -3348,8 +3368,6 @@ static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error) int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); mdk_rdev_t *rdev; - if (bi->bi_size) - return 1; bio_put(bi); mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; @@ -3360,17 +3378,16 @@ static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error) rdev_dec_pending(rdev, conf->mddev); if (!error && uptodate) { - bio_endio(raid_bi, bytes, 0); + bio_endio(raid_bi, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); - return 0; + return; } pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); add_bio_to_retry(raid_bi, conf); - return 0; } static int bio_fits_rdev(struct bio *bi) @@ -3463,6 +3480,58 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) } } +/* __get_priority_stripe - get the next stripe to process + * + * Full stripe writes are allowed to pass preread active stripes up until + * the bypass_threshold is exceeded. In general the bypass_count + * increments when the handle_list is handled before the hold_list; however, it + * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a + * stripe with in flight i/o. The bypass_count will be reset when the + * head of the hold_list has changed, i.e. the head was promoted to the + * handle_list. 
+ */ +static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) +{ + struct stripe_head *sh; + + pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", + __func__, + list_empty(&conf->handle_list) ? "empty" : "busy", + list_empty(&conf->hold_list) ? "empty" : "busy", + atomic_read(&conf->pending_full_writes), conf->bypass_count); + + if (!list_empty(&conf->handle_list)) { + sh = list_entry(conf->handle_list.next, typeof(*sh), lru); + + if (list_empty(&conf->hold_list)) + conf->bypass_count = 0; + else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { + if (conf->hold_list.next == conf->last_hold) + conf->bypass_count++; + else { + conf->last_hold = conf->hold_list.next; + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } + } + } else if (!list_empty(&conf->hold_list) && + ((conf->bypass_threshold && + conf->bypass_count > conf->bypass_threshold) || + atomic_read(&conf->pending_full_writes) == 0)) { + sh = list_entry(conf->hold_list.next, + typeof(*sh), lru); + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } else + return NULL; + + list_del_init(&sh->lru); + atomic_inc(&sh->count); + BUG_ON(atomic_read(&sh->count) != 1); + return sh; +} static int make_request(struct request_queue *q, struct bio * bi) { @@ -3476,7 +3545,7 @@ static int make_request(struct request_queue *q, struct bio * bi) int remaining; if (unlikely(bio_barrier(bi))) { - bio_endio(bi, bi->bi_size, -EOPNOTSUPP); + bio_endio(bi, -EOPNOTSUPP); return 0; } @@ -3578,7 +3647,8 @@ static int make_request(struct request_queue *q, struct bio * bi) goto retry; } finish_wait(&conf->wait_for_overlap, &w); - handle_stripe(sh, NULL); + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ @@ -3592,14 +3662,11 @@ static int make_request(struct request_queue *q, struct bio * bi) remaining = --bi->bi_phys_segments; spin_unlock_irq(&conf->device_lock); if (remaining == 0) { - int bytes = bi->bi_size; if ( rw == WRITE ) md_write_end(mddev); - bi->bi_size = 0; - bi->bi_end_io(bi, bytes, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); + + bio_endio(bi, 0); } return 0; } @@ -3728,6 +3795,25 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped release_stripe(sh); first_sector += STRIPE_SECTORS; } + /* If this takes us to the resync_max point where we have to pause, + * then we need to write out the superblock. + */ + sector_nr += conf->chunk_size>>9; + if (sector_nr >= mddev->resync_max) { + /* Cannot proceed until we've updated the superblock... 
*/ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes) == 0); + mddev->reshape_position = conf->expand_progress; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_DEVS, &mddev->flags) + || kthread_should_stop()); + spin_lock_irq(&conf->device_lock); + conf->expand_lo = mddev->reshape_position; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); + } return conf->chunk_size>>9; } @@ -3764,6 +3850,12 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return reshape_request(mddev, sector_nr, skipped); + /* No need to check resync_max as we never do more than one + * stripe, and as resync_max will always be on a chunk boundary, + * if the check in md_do_sync didn't fire, there is no chance + * of overstepping resync_max here + */ + /* if there is too many failed drives and we are trying * to resync, then assert that we are finished, because there is * nothing we can do. @@ -3783,6 +3875,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } + + bitmap_cond_end_sync(mddev->bitmap, sector_nr); + pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); if (sh == NULL) { @@ -3874,14 +3969,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) spin_lock_irq(&conf->device_lock); remaining = --raid_bio->bi_phys_segments; spin_unlock_irq(&conf->device_lock); - if (remaining == 0) { - int bytes = raid_bio->bi_size; - - raid_bio->bi_size = 0; - raid_bio->bi_end_io(raid_bio, bytes, - test_bit(BIO_UPTODATE, &raid_bio->bi_flags) - ? 0 : -EIO); - } + if (remaining == 0) + bio_endio(raid_bio, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; @@ -3896,7 +3985,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) * During the scan, completed stripes are saved for us by the interrupt * handler, so that they will not have to wait for our next wakeup. 
*/ -static void raid5d (mddev_t *mddev) +static void raid5d(mddev_t *mddev) { struct stripe_head *sh; raid5_conf_t *conf = mddev_to_conf(mddev); @@ -3909,7 +3998,6 @@ static void raid5d (mddev_t *mddev) handled = 0; spin_lock_irq(&conf->device_lock); while (1) { - struct list_head *first; struct bio *bio; if (conf->seq_flush != conf->seq_write) { @@ -3921,12 +4009,6 @@ static void raid5d (mddev_t *mddev) activate_bit_delay(conf); } - if (list_empty(&conf->handle_list) && - atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && - !blk_queue_plugged(mddev->queue) && - !list_empty(&conf->delayed_list)) - raid5_activate_delayed(conf); - while ((bio = remove_bio_from_retry(conf))) { int ok; spin_unlock_irq(&conf->device_lock); @@ -3937,17 +4019,12 @@ static void raid5d (mddev_t *mddev) handled++; } - if (list_empty(&conf->handle_list)) { + sh = __get_priority_stripe(conf); + + if (!sh) { async_tx_issue_pending_all(); break; } - - first = conf->handle_list.next; - sh = list_entry(first, struct stripe_head, lru); - - list_del_init(first); - atomic_inc(&sh->count); - BUG_ON(atomic_read(&sh->count)!= 1); spin_unlock_irq(&conf->device_lock); handled++; @@ -3979,15 +4056,13 @@ static ssize_t raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) { raid5_conf_t *conf = mddev_to_conf(mddev); - char *end; - int new; + unsigned long new; if (len >= PAGE_SIZE) return -EINVAL; if (!conf) return -ENODEV; - new = simple_strtoul(page, &end, 10); - if (!*page || (*end && *end != '\n') ) + if (strict_strtoul(page, 10, &new)) return -EINVAL; if (new <= 16 || new > 32768) return -EINVAL; @@ -4012,6 +4087,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, raid5_store_stripe_cache_size); static ssize_t +raid5_show_preread_threshold(mddev_t *mddev, char *page) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + if (conf) + return sprintf(page, "%d\n", conf->bypass_threshold); + else + return 0; +} + +static ssize_t +raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + unsigned long new; + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (strict_strtoul(page, 10, &new)) + return -EINVAL; + if (new > conf->max_nr_stripes) + return -EINVAL; + conf->bypass_threshold = new; + return len; +} + +static struct md_sysfs_entry +raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, + S_IRUGO | S_IWUSR, + raid5_show_preread_threshold, + raid5_store_preread_threshold); + +static ssize_t stripe_cache_active_show(mddev_t *mddev, char *page) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -4027,6 +4136,7 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active); static struct attribute *raid5_attrs[] = { &raid5_stripecache_size.attr, &raid5_stripecache_active.attr, + &raid5_preread_bypass_threshold.attr, NULL, }; static struct attribute_group raid5_attrs_group = { @@ -4128,19 +4238,22 @@ static int run(mddev_t *mddev) goto abort; } spin_lock_init(&conf->device_lock); + mddev->queue->queue_lock = &conf->device_lock; init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); + INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); INIT_LIST_HEAD(&conf->inactive_list); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); atomic_set(&conf->active_aligned_reads, 0); + conf->bypass_threshold = BYPASS_THRESHOLD; 
pr_debug("raid5: run(%s) called.\n", mdname(mddev)); - ITERATE_RDEV(mddev,rdev,tmp) { + rdev_for_each(rdev, tmp, mddev) { raid_disk = rdev->raid_disk; if (raid_disk >= conf->raid_disks || raid_disk < 0) @@ -4155,7 +4268,9 @@ static int run(mddev_t *mddev) " disk %d\n", bdevname(rdev->bdev,b), raid_disk); working_disks++; - } + } else + /* Cannot rely on bitmap to complete recovery */ + conf->fullsync = 1; } /* @@ -4279,7 +4394,6 @@ static int run(mddev_t *mddev) mdname(mddev)); mddev->queue->unplug_fn = raid5_unplug_device; - mddev->queue->issue_flush_fn = raid5_issue_flush; mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; @@ -4433,6 +4547,14 @@ static int raid5_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove non-faulty devices if recovery + * isn't possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->degraded <= conf->max_degraded) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { @@ -4450,35 +4572,41 @@ abort: static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { raid5_conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int disk; struct disk_info *p; + int first = 0; + int last = conf->raid_disks - 1; if (mddev->degraded > conf->max_degraded) /* no point adding a device */ - return 0; + return -EINVAL; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; /* * find the disk ... but prefer rdev->saved_raid_disk * if possible. */ if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && conf->disks[rdev->saved_raid_disk].rdev == NULL) disk = rdev->saved_raid_disk; else - disk = 0; - for ( ; disk < conf->raid_disks; disk++) + disk = first; + for ( ; disk <= last ; disk++) if ((p=conf->disks + disk)->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; - found = 1; + err = 0; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); break; } print_raid5_conf(conf); - return found; + return err; } static int raid5_resize(mddev_t *mddev, sector_t sectors) @@ -4554,7 +4682,7 @@ static int raid5_start_reshape(mddev_t *mddev) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; - ITERATE_RDEV(mddev, rdev, rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) spares++; @@ -4576,10 +4704,10 @@ static int raid5_start_reshape(mddev_t *mddev) /* Add some new drives, as many as will fit. * We know there are enough to make the newly sized array work. */ - ITERATE_RDEV(mddev, rdev, rtmp) + rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { - if (raid5_add_disk(mddev, rdev)) { + if (raid5_add_disk(mddev, rdev) == 0) { char nm[20]; set_bit(In_sync, &rdev->flags); added_devices++;