X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=drivers%2Fmd%2Fraid5.c;h=611ea7bbf47437804aa1cecb38337050ea652fc2;hb=d562b0c4313e3ddea402a400371afa47ddf679f9;hp=388a974d63ef9aed767329c989c1849c431403c8;hpb=b47490c9bc73d0b34e4c194db40de183e592e446;p=safe%2Fjmp%2Flinux-2.6

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 388a974..611ea7b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -43,16 +43,14 @@
  * miss any bits.
  */
 
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/bitops.h>
+#include <linux/blkdev.h>
 #include <linux/kthread.h>
-#include <asm/atomic.h>
-#include "raid6.h"
-
-#include <linux/raid/bitmap.h>
 #include <linux/async_tx.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid5.h"
+#include "raid6.h"
+#include "bitmap.h"
 
 /*
  * Stripe cache
@@ -63,6 +61,7 @@
 #define STRIPE_SHIFT		(PAGE_SHIFT - 9)
 #define STRIPE_SECTORS		(STRIPE_SIZE>>9)
 #define	IO_THRESHOLD		1
+#define BYPASS_THRESHOLD	1
 #define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
 #define HASH_MASK		(NR_HASH - 1)
 
@@ -93,17 +92,83 @@
 #define __inline__
 #endif
 
+#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
+
 #if !RAID6_USE_EMPTY_ZERO_PAGE
 /* In .bss so it's zeroed */
 const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
 #endif
 
+/*
+ * We maintain a biased count of active stripes in the bottom 16 bits of
+ * bi_phys_segments, and a count of processed stripes in the upper 16 bits
+ */
+static inline int raid5_bi_phys_segments(struct bio *bio)
+{
+	return bio->bi_phys_segments & 0xffff;
+}
+
+static inline int raid5_bi_hw_segments(struct bio *bio)
+{
+	return (bio->bi_phys_segments >> 16) & 0xffff;
+}
+
+static inline int raid5_dec_bi_phys_segments(struct bio *bio)
+{
+	--bio->bi_phys_segments;
+	return raid5_bi_phys_segments(bio);
+}
+
+static inline int raid5_dec_bi_hw_segments(struct bio *bio)
+{
+	unsigned short val = raid5_bi_hw_segments(bio);
+
+	--val;
+	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
+	return val;
+}
+
+static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
+{
+	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
+}
+
+/* Find first data disk in a raid6 stripe */
+static inline int raid6_d0(struct stripe_head *sh)
+{
+	if (sh->ddf_layout)
+		/* ddf always starts from first device */
+		return 0;
+	/* md starts just after Q block */
+	if (sh->qd_idx == sh->disks - 1)
+		return 0;
+	else
+		return sh->qd_idx + 1;
+}
 static inline int raid6_next_disk(int disk, int raid_disks)
 {
 	disk++;
 	return (disk < raid_disks) ? disk : 0;
 }
 
+/* When walking through the disks in a raid5, starting at raid6_d0,
+ * we need to map each disk to a 'slot', where the data disks are slot
+ * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
+ * is raid_disks-1. This helper does that mapping.
+ */
+static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
+			     int *count, int syndrome_disks)
+{
+	int slot;
+
+	if (idx == sh->pd_idx)
+		return syndrome_disks;
+	if (idx == sh->qd_idx)
+		return syndrome_disks + 1;
+	slot = (*count)++;
+	return slot;
+}
+
 static void return_io(struct bio *return_bi)
 {
 	struct bio *bi = return_bi;
@@ -112,15 +177,20 @@ static void return_io(struct bio *return_bi)
 		return_bi = bi->bi_next;
 		bi->bi_next = NULL;
 		bi->bi_size = 0;
-		bi->bi_end_io(bi,
-			      test_bit(BIO_UPTODATE, &bi->bi_flags)
-				?
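
The bi_phys_segments helpers above multiplex two 16-bit counters into one 32-bit field so a bio can be shared by several stripes without growing struct bio. A minimal user-space sketch of the scheme, with struct bio reduced to the single field involved; note that the packing in raid5_set_bi_hw_segments needs the bitwise '|', since a logical '||' would collapse the active count to 0 or 1:

/* Stand-alone model of the raid5_bi_*_segments() helpers: the low 16
 * bits count active stripes referencing the bio, the high 16 bits
 * count processed stripes.
 */
#include <assert.h>

struct bio { unsigned int bi_phys_segments; };

static int phys_segments(struct bio *bio)
{
	return bio->bi_phys_segments & 0xffff;
}

static int hw_segments(struct bio *bio)
{
	return (bio->bi_phys_segments >> 16) & 0xffff;
}

static int dec_phys_segments(struct bio *bio)
{
	--bio->bi_phys_segments;	/* low half only, as in the patch */
	return phys_segments(bio);
}

static void set_hw_segments(struct bio *bio, unsigned int cnt)
{
	/* bitwise '|' preserves the low half; '||' would destroy it */
	bio->bi_phys_segments = phys_segments(bio) | (cnt << 16);
}

int main(void)
{
	struct bio b = { .bi_phys_segments = 3 };

	set_hw_segments(&b, 5);
	assert(phys_segments(&b) == 3 && hw_segments(&b) == 5);
	assert(dec_phys_segments(&b) == 2);
	assert(hw_segments(&b) == 5);	/* high half untouched */
	return 0;
}
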
0 : -EIO); + bio_endio(bi, 0); bi = return_bi; } } static void print_raid5_conf (raid5_conf_t *conf); +static int stripe_operations_active(struct stripe_head *sh) +{ + return sh->check_state || sh->reconstruct_state || + test_bit(STRIPE_BIOFILL_RUN, &sh->state) || + test_bit(STRIPE_COMPUTE_RUN, &sh->state); +} + static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) { if (atomic_dec_and_test(&sh->count)) { @@ -140,7 +210,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) } md_wakeup_thread(conf->mddev->thread); } else { - BUG_ON(sh->ops.pending); + BUG_ON(stripe_operations_active(sh)); if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) @@ -156,6 +226,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) } } } + static void release_stripe(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; @@ -233,16 +304,18 @@ static int grow_buffers(struct stripe_head *sh, int num) return 0; } -static void raid5_build_block (struct stripe_head *sh, int i); +static void raid5_build_block(struct stripe_head *sh, int i); +static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, + struct stripe_head *sh); -static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks) +static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) { raid5_conf_t *conf = sh->raid_conf; int i; BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); - BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); + BUG_ON(stripe_operations_active(sh)); CHECK_DEVLOCK(); pr_debug("init_stripe called, stripe %llu\n", @@ -250,11 +323,11 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int remove_hash(sh); + sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; - sh->pd_idx = pd_idx; + stripe_set_idx(sector, conf, previous, sh); sh->state = 0; - sh->disks = disks; for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -290,10 +363,12 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in static void unplug_slaves(mddev_t *mddev); static void raid5_unplug_device(struct request_queue *q); -static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks, - int pd_idx, int noblock) +static struct stripe_head * +get_active_stripe(raid5_conf_t *conf, sector_t sector, + int previous, int noblock) { struct stripe_head *sh; + int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); @@ -321,7 +396,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector ); conf->inactive_blocked = 0; } else - init_stripe(sh, sector, pd_idx, disks); + init_stripe(sh, sector, previous); } else { if (atomic_read(&sh->count)) { BUG_ON(!list_empty(&sh->lru)); @@ -343,55 +418,12 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector return sh; } -/* test_and_ack_op() ensures that we only dequeue an operation once */ -#define test_and_ack_op(op, pend) \ -do { \ - if (test_bit(op, &sh->ops.pending) && \ - !test_bit(op, &sh->ops.complete)) { \ - if (test_and_set_bit(op, &sh->ops.ack)) \ - clear_bit(op, &pend); \ - else \ - ack++; \ - } else \ - clear_bit(op, &pend); \ -} while (0) - -/* find new work to run, do not resubmit work that is already - * in flight - */ -static unsigned long get_stripe_work(struct stripe_head *sh) -{ - unsigned long pending; - int ack = 0; - - pending = sh->ops.pending; - - test_and_ack_op(STRIPE_OP_BIOFILL, pending); - test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending); - test_and_ack_op(STRIPE_OP_PREXOR, pending); - test_and_ack_op(STRIPE_OP_BIODRAIN, pending); - test_and_ack_op(STRIPE_OP_POSTXOR, pending); - test_and_ack_op(STRIPE_OP_CHECK, pending); - if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) - ack++; - - sh->ops.count -= ack; - if (unlikely(sh->ops.count < 0)) { - printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx " - "ops.complete: %#lx\n", pending, sh->ops.pending, - sh->ops.ack, sh->ops.complete); - BUG(); - } - - return pending; -} - static void raid5_end_read_request(struct bio *bi, int error); static void raid5_end_write_request(struct bio *bi, int error); -static void ops_run_io(struct stripe_head *sh) +static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { raid5_conf_t *conf = sh->raid_conf; int i, disks = sh->disks; @@ -426,14 +458,14 @@ static void ops_run_io(struct stripe_head *sh) rcu_read_unlock(); if (rdev) { - if (test_bit(STRIPE_SYNCING, &sh->state) || - test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || - test_bit(STRIPE_EXPAND_READY, &sh->state)) + if (s->syncing || s->expanding || s->expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); + set_bit(STRIPE_IO_STARTED, &sh->state); + bi->bi_bdev = rdev->bdev; pr_debug("%s: for %llu schedule op %ld on disc %d\n", - __FUNCTION__, (unsigned long long)sh->sector, + __func__, (unsigned long long)sh->sector, bi->bi_rw, i); atomic_inc(&sh->count); bi->bi_sector = sh->sector + rdev->data_offset; @@ -520,42 +552,38 @@ static void ops_complete_biofill(void *stripe_head_ref) raid5_conf_t *conf = sh->raid_conf; int i; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); /* clear completed biofills */ + spin_lock_irq(&conf->device_lock); for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* acknowledge completion of a biofill operation */ /* and check if we need to reply to a read request, * new R5_Wantfill requests are held off until - * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) + * !STRIPE_BIOFILL_RUN */ if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { struct bio *rbi, *rbi2; - /* The access to dev->read is outside of the - * spin_lock_irq(&conf->device_lock), but is protected - * by the STRIPE_OP_BIOFILL pending bit - */ BUG_ON(!dev->read); rbi = dev->read; dev->read = NULL; while (rbi && rbi->bi_sector < 
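
The test_and_ack_op()/get_stripe_work() machinery deleted above tracked every operation in three bitmaps (pending/ack/complete). Its replacement, visible throughout the rest of this diff, is a single ops_request bitmap computed per handle_stripe() pass plus one RUN bit per long-lived phase. A condensed user-space sketch of the guard idiom, with names shortened and locking omitted:

/* RUN-flag idiom replacing pending/ack/complete: the requester sets a
 * *_RUN bit before queuing work, and nobody requests the same phase
 * again until the completion path clears it.
 */
enum { BIOFILL_RUN = 1 << 0 };

struct stripe { unsigned long state; unsigned long ops_request; };

static void request_biofill(struct stripe *sh)
{
	if (!(sh->state & BIOFILL_RUN)) {	/* not already in flight */
		sh->state |= BIOFILL_RUN;
		sh->ops_request |= BIOFILL_RUN;	/* consumed by run_ops() */
	}
}

static void complete_biofill(struct stripe *sh)
{
	sh->state &= ~BIOFILL_RUN;		/* new requests may start */
}

int main(void)
{
	struct stripe sh = { 0, 0 };

	request_biofill(&sh);
	complete_biofill(&sh);
	return 0;
}
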
dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); - if (--rbi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(rbi)) { rbi->bi_next = return_bi; return_bi = rbi; } - spin_unlock_irq(&conf->device_lock); rbi = rbi2; } } } - set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); + spin_unlock_irq(&conf->device_lock); + clear_bit(STRIPE_BIOFILL_RUN, &sh->state); return_io(return_bi); @@ -569,7 +597,7 @@ static void ops_run_biofill(struct stripe_head *sh) raid5_conf_t *conf = sh->raid_conf; int i; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = sh->disks; i--; ) { @@ -600,19 +628,20 @@ static void ops_complete_compute5(void *stripe_head_ref) int target = sh->ops.target; struct r5dev *tgt = &sh->dev[target]; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); set_bit(R5_UPTODATE, &tgt->flags); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); clear_bit(R5_Wantcompute, &tgt->flags); - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_COMPUTE_RUN, &sh->state); + if (sh->check_state == check_state_compute_run) + sh->check_state = check_state_compute_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } -static struct dma_async_tx_descriptor * -ops_run_compute5(struct stripe_head *sh, unsigned long pending) +static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -625,7 +654,7 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending) int i; pr_debug("%s: stripe %llu block: %d\n", - __FUNCTION__, (unsigned long long)sh->sector, target); + __func__, (unsigned long long)sh->sector, target); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); for (i = disks; i--; ) @@ -642,10 +671,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending) ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute5, sh); - /* ack now if postxor is not set to be run */ - if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending)) - async_tx_ack(tx); - return tx; } @@ -653,10 +678,8 @@ static void ops_complete_prexor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - - set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); } static struct dma_async_tx_descriptor * @@ -670,13 +693,13 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) /* existing parity data subtracted */ struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Only process blocks that are known to be uptodate */ - if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) + if (test_bit(R5_Wantdrain, &dev->flags)) xor_srcs[count++] = dev->page; } @@ -688,37 +711,19 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) } static struct dma_async_tx_descriptor * -ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) +ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { int disks = sh->disks; - int pd_idx = sh->pd_idx, i; - - /* check if prexor is active which means only process 
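
ops_run_prexor() above seeds the xor with the old parity and the old contents of every block about to be rewritten. That is the read-modify-write identity P_new = P_old ^ D_old ^ D_new, shown here on single bytes instead of STRIPE_SIZE pages:

/* Why prexor runs before biodrain and postxor for a read-modify-write:
 * subtracting the old data from P first means parity stays consistent
 * without reading the whole stripe.
 */
#include <assert.h>

int main(void)
{
	unsigned char d[3] = { 0x11, 0x22, 0x33 };	/* data blocks */
	unsigned char p = d[0] ^ d[1] ^ d[2];		/* parity      */
	unsigned char d1_new = 0x5a;

	p ^= d[1];		/* prexor: subtract old data  */
	d[1] = d1_new;		/* biodrain: install new data */
	p ^= d[1];		/* postxor: add new data      */

	assert(p == (d[0] ^ d[1] ^ d[2]));	/* full recompute agrees */
	return 0;
}
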
blocks - * that are part of a read-modify-write (Wantprexor) - */ - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int i; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; struct bio *chosen; - int towrite; - towrite = 0; - if (prexor) { /* rmw */ - if (dev->towrite && - test_bit(R5_Wantprexor, &dev->flags)) - towrite = 1; - } else { /* rcw */ - if (i != pd_idx && dev->towrite && - test_bit(R5_LOCKED, &dev->flags)) - towrite = 1; - } - - if (towrite) { + if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { struct bio *wbi; spin_lock(&sh->lock); @@ -743,21 +748,9 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, static void ops_complete_postxor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - - pr_debug("%s: stripe %llu\n", __FUNCTION__, - (unsigned long long)sh->sector); - - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -} - -static void ops_complete_write(void *stripe_head_ref) -{ - struct stripe_head *sh = stripe_head_ref; int disks = sh->disks, i, pd_idx = sh->pd_idx; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { @@ -766,16 +759,21 @@ static void ops_complete_write(void *stripe_head_ref) set_bit(R5_UPTODATE, &dev->flags); } - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + if (sh->reconstruct_state == reconstruct_state_drain_run) + sh->reconstruct_state = reconstruct_state_drain_result; + else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) + sh->reconstruct_state = reconstruct_state_prexor_drain_result; + else { + BUG_ON(sh->reconstruct_state != reconstruct_state_run); + sh->reconstruct_state = reconstruct_state_result; + } set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } static void -ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) +ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -783,17 +781,17 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int prexor = 0; unsigned long flags; - dma_async_tx_callback callback; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ - if (prexor) { + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + prexor = 1; xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -809,10 +807,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, } } - /* check whether this postxor is part of a write */ - callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ? 
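
ops_complete_postxor() now folds what used to be two completion callbacks into one by consulting the reconstruct_state the stripe was launched with. The three run-to-result transitions as a bare sketch; the enum names mirror the patch, the glue is illustrative:

/* run -> result transitions taken in ops_complete_postxor() */
enum reconstruct_states {
	reconstruct_state_idle,
	reconstruct_state_run,			/* expand: postxor only  */
	reconstruct_state_drain_run,		/* rcw: drain + postxor  */
	reconstruct_state_prexor_drain_run,	/* rmw: prexor + drain   */
	reconstruct_state_result,
	reconstruct_state_drain_result,
	reconstruct_state_prexor_drain_result,
};

static enum reconstruct_states postxor_done(enum reconstruct_states s)
{
	switch (s) {
	case reconstruct_state_drain_run:
		return reconstruct_state_drain_result;
	case reconstruct_state_prexor_drain_run:
		return reconstruct_state_prexor_drain_result;
	case reconstruct_state_run:
		return reconstruct_state_result;
	default:
		return s;	/* the kernel BUG()s here */
	}
}

int main(void)
{
	return postxor_done(reconstruct_state_drain_run)
		!= reconstruct_state_drain_result;
}
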
- ops_complete_write : ops_complete_postxor; - /* 1/ if we prexor'd then the dest is reused as a source * 2/ if we did not prexor then we are redoing the parity * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST @@ -826,25 +820,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, if (unlikely(count == 1)) { flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } else tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } static void ops_complete_check(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - int pd_idx = sh->pd_idx; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && - sh->ops.zero_sum_result == 0) - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - - set_bit(STRIPE_OP_CHECK, &sh->ops.complete); + sh->check_state = check_state_check_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -859,7 +848,7 @@ static void ops_run_check(struct stripe_head *sh) int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; - pr_debug("%s: stripe %llu\n", __FUNCTION__, + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { @@ -871,46 +860,42 @@ static void ops_run_check(struct stripe_head *sh) tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); - if (tx) - set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - else - clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - atomic_inc(&sh->count); tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, ops_complete_check, sh); } -static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) +static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; - if (test_bit(STRIPE_OP_BIOFILL, &pending)) { + if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; } - if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) - tx = ops_run_compute5(sh, pending); + if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { + tx = ops_run_compute5(sh); + /* terminate the chain if postxor is not set to be run */ + if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) + async_tx_ack(tx); + } - if (test_bit(STRIPE_OP_PREXOR, &pending)) + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) tx = ops_run_prexor(sh, tx); - if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { - tx = ops_run_biodrain(sh, tx, pending); + if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { + tx = ops_run_biodrain(sh, tx); overlap_clear++; } - if (test_bit(STRIPE_OP_POSTXOR, &pending)) - ops_run_postxor(sh, tx, pending); + if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) + ops_run_postxor(sh, tx); - if (test_bit(STRIPE_OP_CHECK, &pending)) + if (test_bit(STRIPE_OP_CHECK, &ops_request)) ops_run_check(sh); - if (test_bit(STRIPE_OP_IO, &pending)) - ops_run_io(sh); - if (overlap_clear) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -948,8 +933,10 @@ static int grow_stripes(raid5_conf_t *conf, int num) struct kmem_cache *sc; int devs = conf->raid_disks; - sprintf(conf->cache_name[0], "raid5-%s", 
mdname(conf->mddev)); - sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev)); + sprintf(conf->cache_name[0], + "raid%d-%s", conf->level, mdname(conf->mddev)); + sprintf(conf->cache_name[1], + "raid%d-%s-alt", conf->level, mdname(conf->mddev)); conf->active_name = 0; sc = kmem_cache_create(conf->cache_name[conf->active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), @@ -993,14 +980,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; - int err = 0; + int err; struct kmem_cache *sc; int i; if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ - md_allow_write(conf->mddev); + err = md_allow_write(conf->mddev); + if (err) + return err; /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], @@ -1117,7 +1106,7 @@ static void shrink_stripes(raid5_conf_t *conf) static void raid5_end_read_request(struct bio * bi, int error) { - struct stripe_head *sh = bi->bi_private; + struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1141,10 +1130,12 @@ static void raid5_end_read_request(struct bio * bi, int error) set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; - printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)sh->sector + rdev->data_offset, - bdevname(rdev->bdev, b)); + printk_rl(KERN_INFO "raid5:%s: read error corrected" + " (%lu sectors at %llu on %s)\n", + mdname(conf->mddev), STRIPE_SECTORS, + (unsigned long long)(sh->sector + + rdev->data_offset), + bdevname(rdev->bdev, b)); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } @@ -1158,16 +1149,22 @@ static void raid5_end_read_request(struct bio * bi, int error) clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); if (conf->mddev->degraded) - printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)sh->sector + rdev->data_offset, - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error not correctable " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ - printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)sh->sector + rdev->data_offset, - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error NOT corrected!! 
" + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING @@ -1189,9 +1186,9 @@ static void raid5_end_read_request(struct bio * bi, int error) release_stripe(sh); } -static void raid5_end_write_request (struct bio *bi, int error) +static void raid5_end_write_request(struct bio *bi, int error) { - struct stripe_head *sh = bi->bi_private; + struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1221,7 +1218,7 @@ static void raid5_end_write_request (struct bio *bi, int error) static sector_t compute_blocknr(struct stripe_head *sh, int i); -static void raid5_build_block (struct stripe_head *sh, int i) +static void raid5_build_block(struct stripe_head *sh, int i) { struct r5dev *dev = &sh->dev[i]; @@ -1256,13 +1253,13 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery was running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } set_bit(Faulty, &rdev->flags); - printk (KERN_ALERT - "raid5: Disk failure on %s, disabling device." - " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); + printk(KERN_ALERT + "raid5: Disk failure on %s, disabling device.\n" + "raid5: Operation continuing on %d devices.\n", + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } } @@ -1270,15 +1267,20 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * Input: a 'big' sector number, * Output: index of the data and parity disk, and the sector # in them. */ -static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, - unsigned int data_disks, unsigned int * dd_idx, - unsigned int * pd_idx, raid5_conf_t *conf) +static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, + int previous, int *dd_idx, + struct stripe_head *sh) { long stripe; unsigned long chunk_number; unsigned int chunk_offset; + int pd_idx, qd_idx; + int ddf_layout = 0; sector_t new_sector; int sectors_per_chunk = conf->chunk_size >> 9; + int raid_disks = previous ? conf->previous_raid_disks + : conf->raid_disks; + int data_disks = raid_disks - conf->max_degraded; /* First compute the information on this sector */ @@ -1302,68 +1304,170 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, /* * Select the parity disk based on the user selected algorithm. 
*/ + pd_idx = qd_idx = ~0; switch(conf->level) { case 4: - *pd_idx = data_disks; + pd_idx = data_disks; break; case 5: switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: - *pd_idx = data_disks - stripe % raid_disks; - if (*dd_idx >= *pd_idx) + pd_idx = data_disks - stripe % raid_disks; + if (*dd_idx >= pd_idx) (*dd_idx)++; break; case ALGORITHM_RIGHT_ASYMMETRIC: - *pd_idx = stripe % raid_disks; - if (*dd_idx >= *pd_idx) + pd_idx = stripe % raid_disks; + if (*dd_idx >= pd_idx) (*dd_idx)++; break; case ALGORITHM_LEFT_SYMMETRIC: - *pd_idx = data_disks - stripe % raid_disks; - *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + pd_idx = data_disks - stripe % raid_disks; + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; break; case ALGORITHM_RIGHT_SYMMETRIC: - *pd_idx = stripe % raid_disks; - *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + pd_idx = stripe % raid_disks; + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + break; + case ALGORITHM_PARITY_0: + pd_idx = 0; + (*dd_idx)++; + break; + case ALGORITHM_PARITY_N: + pd_idx = data_disks; break; default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", conf->algorithm); + BUG(); } break; case 6: - /**** FIX THIS ****/ switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: - *pd_idx = raid_disks - 1 - (stripe % raid_disks); - if (*pd_idx == raid_disks-1) - (*dd_idx)++; /* Q D D D P */ - else if (*dd_idx >= *pd_idx) + pd_idx = raid_disks - 1 - (stripe % raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ break; case ALGORITHM_RIGHT_ASYMMETRIC: - *pd_idx = stripe % raid_disks; - if (*pd_idx == raid_disks-1) - (*dd_idx)++; /* Q D D D P */ - else if (*dd_idx >= *pd_idx) + pd_idx = stripe % raid_disks; + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ break; case ALGORITHM_LEFT_SYMMETRIC: - *pd_idx = raid_disks - 1 - (stripe % raid_disks); - *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; + pd_idx = raid_disks - 1 - (stripe % raid_disks); + qd_idx = (pd_idx + 1) % raid_disks; + *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; break; case ALGORITHM_RIGHT_SYMMETRIC: - *pd_idx = stripe % raid_disks; - *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; + pd_idx = stripe % raid_disks; + qd_idx = (pd_idx + 1) % raid_disks; + *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; + break; + + case ALGORITHM_PARITY_0: + pd_idx = 0; + qd_idx = 1; + (*dd_idx) += 2; + break; + case ALGORITHM_PARITY_N: + pd_idx = data_disks; + qd_idx = data_disks + 1; + break; + + case ALGORITHM_ROTATING_ZERO_RESTART: + /* Exactly the same as RIGHT_ASYMMETRIC, but or + * of blocks for computing Q is different. 
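
The raid6 rotations above reuse the same four classic layouts as raid5, with Q trailing P. For reference, the raid5 versions of those placements as a stand-alone function: LEFT/RIGHT choose where parity lands, ASYMMETRIC/SYMMETRIC choose how data disks are numbered around it (5-disk array for the demo):

/* Parity and data placement for the four classic RAID5 layouts */
#include <stdio.h>

enum { LEFT_ASYM, RIGHT_ASYM, LEFT_SYM, RIGHT_SYM };

static int place(int alg, unsigned long stripe, int dd_idx,
		 int raid_disks, int *pd_idx)
{
	int data_disks = raid_disks - 1;

	switch (alg) {
	case LEFT_ASYM:
		*pd_idx = data_disks - stripe % raid_disks;
		return dd_idx + (dd_idx >= *pd_idx);	/* skip parity */
	case RIGHT_ASYM:
		*pd_idx = stripe % raid_disks;
		return dd_idx + (dd_idx >= *pd_idx);
	case LEFT_SYM:
		*pd_idx = data_disks - stripe % raid_disks;
		return (*pd_idx + 1 + dd_idx) % raid_disks;	/* rotate */
	default: /* RIGHT_SYM */
		*pd_idx = stripe % raid_disks;
		return (*pd_idx + 1 + dd_idx) % raid_disks;
	}
}

int main(void)
{
	int pd;

	for (unsigned long s = 0; s < 5; s++) {
		int d = place(LEFT_SYM, s, 0, 5, &pd);

		printf("stripe %lu: parity on disk %d, first data on %d\n",
		       s, pd, d);
	}
	return 0;
}
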
+ */ + pd_idx = stripe % raid_disks; + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; + break; + + case ALGORITHM_ROTATING_N_RESTART: + /* Same a left_asymmetric, by first stripe is + * D D D P Q rather than + * Q D D D P + */ + pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; break; + + case ALGORITHM_ROTATING_N_CONTINUE: + /* Same as left_symmetric but Q is before P */ + pd_idx = raid_disks - 1 - (stripe % raid_disks); + qd_idx = (pd_idx + raid_disks - 1) % raid_disks; + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + ddf_layout = 1; + break; + + case ALGORITHM_LEFT_ASYMMETRIC_6: + /* RAID5 left_asymmetric, with Q on last device */ + pd_idx = data_disks - stripe % (raid_disks-1); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_RIGHT_ASYMMETRIC_6: + pd_idx = stripe % (raid_disks-1); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_LEFT_SYMMETRIC_6: + pd_idx = data_disks - stripe % (raid_disks-1); + *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_RIGHT_SYMMETRIC_6: + pd_idx = stripe % (raid_disks-1); + *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_PARITY_0_6: + pd_idx = 0; + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + default: - printk (KERN_CRIT "raid6: unsupported algorithm %d\n", - conf->algorithm); + printk(KERN_CRIT "raid6: unsupported algorithm %d\n", + conf->algorithm); + BUG(); } break; } + if (sh) { + sh->pd_idx = pd_idx; + sh->qd_idx = qd_idx; + sh->ddf_layout = ddf_layout; + } /* * Finally, compute the new sector number */ @@ -1381,8 +1485,9 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) int sectors_per_chunk = conf->chunk_size >> 9; sector_t stripe; int chunk_offset; - int chunk_number, dummy1, dummy2, dd_idx = i; + int chunk_number, dummy1, dd_idx = i; sector_t r_sector; + struct stripe_head sh2; chunk_offset = sector_div(new_sector, sectors_per_chunk); @@ -1406,19 +1511,27 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) i += raid_disks; i -= (sh->pd_idx + 1); break; + case ALGORITHM_PARITY_0: + i -= 1; + break; + case ALGORITHM_PARITY_N: + break; default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", conf->algorithm); + BUG(); } break; case 6: - if (i == raid6_next_disk(sh->pd_idx, raid_disks)) + if (i == sh->qd_idx) return 0; /* It is the Q disk */ switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: - if (sh->pd_idx == raid_disks-1) - i--; /* Q D D D P */ + case ALGORITHM_ROTATING_ZERO_RESTART: + case ALGORITHM_ROTATING_N_RESTART: + if (sh->pd_idx == raid_disks-1) + i--; /* Q D D D P */ else if (i > sh->pd_idx) i -= 2; /* D D P Q D */ break; @@ -1433,9 +1546,35 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) i -= (sh->pd_idx + 2); } break; + case ALGORITHM_PARITY_0: + i -= 2; + break; + case ALGORITHM_PARITY_N: + break; + case ALGORITHM_ROTATING_N_CONTINUE: + if (sh->pd_idx == 0) + i--; /* P D D D Q */ + else if (i > sh->pd_idx) + i -= 2; /* D D Q P D */ + break; + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + if 
(i > sh->pd_idx) + i--; + break; + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_RIGHT_SYMMETRIC_6: + if (i < sh->pd_idx) + i += data_disks + 1; + i -= (sh->pd_idx + 1); + break; + case ALGORITHM_PARITY_0_6: + i -= 1; + break; default: - printk (KERN_CRIT "raid6: unsupported algorithm %d\n", - conf->algorithm); + printk(KERN_CRIT "raid6: unsupported algorithm %d\n", + conf->algorithm); + BUG(); } break; } @@ -1443,8 +1582,11 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) chunk_number = stripe * data_disks + i; r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; - check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); - if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { + check = raid5_compute_sector(conf, r_sector, + (raid_disks != conf->raid_disks), + &dummy1, &sh2); + if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx + || sh2.qd_idx != sh->qd_idx) { printk(KERN_ERR "compute_blocknr: map not correct\n"); return 0; } @@ -1511,14 +1653,16 @@ static void copy_data(int frombio, struct bio *bio, static void compute_parity6(struct stripe_head *sh, int method) { - raid6_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; + int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); struct bio *chosen; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; + void *ptrs[syndrome_disks+2]; - qd_idx = raid6_next_disk(pd_idx, disks); - d0_idx = raid6_next_disk(qd_idx, disks); + pd_idx = sh->pd_idx; + qd_idx = sh->qd_idx; + d0_idx = raid6_d0(sh); pr_debug("compute_parity, stripe %llu, method %d\n", (unsigned long long)sh->sector, method); @@ -1556,24 +1700,29 @@ static void compute_parity6(struct stripe_head *sh, int method) set_bit(R5_UPTODATE, &sh->dev[i].flags); } -// switch(method) { -// case RECONSTRUCT_WRITE: -// case CHECK_PARITY: -// case UPDATE_PARITY: - /* Note that unlike RAID-5, the ordering of the disks matters greatly. */ - /* FIX: Is this ordering of drives even remotely optimal? 
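
The syndrome ordering being worked out here can be seen in isolation: walk the devices from raid6_d0() and assign slots the way raid6_idx_to_slot() does. For the non-DDF md layout that yields data in slots 0..disks-3 with P and Q pinned to the last two slots; the pd/qd positions below are made up for the demonstration:

/* Map each device of a 6-disk md raid6 stripe to its syndrome slot */
#include <stdio.h>

int main(void)
{
	int disks = 6, syndrome_disks = disks - 2;
	int pd_idx = 3, qd_idx = 4;		/* hypothetical stripe */
	int d0 = (qd_idx == disks - 1) ? 0 : qd_idx + 1;
	int count = 0, i = d0;

	do {
		int slot;

		if (i == pd_idx)
			slot = syndrome_disks;		/* P slot */
		else if (i == qd_idx)
			slot = syndrome_disks + 1;	/* Q slot */
		else
			slot = count++;			/* next data slot */
		printf("disk %d -> slot %d\n", i, slot);
		i = (i + 1 < disks) ? i + 1 : 0;
	} while (i != d0);
	return 0;
}
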
*/ - count = 0; - i = d0_idx; - do { - ptrs[count++] = page_address(sh->dev[i].page); - if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) - printk("block %d/%d not uptodate on parity calc\n", i,count); - i = raid6_next_disk(i, disks); - } while ( i != d0_idx ); -// break; -// } - - raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); + /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ + + for (i = 0; i < disks; i++) + ptrs[i] = (void *)raid6_empty_zero_page; + + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + ptrs[slot] = page_address(sh->dev[i].page); + if (slot < syndrome_disks && + !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { + printk(KERN_ERR "block %d/%d not uptodate " + "on parity calc\n", i, count); + BUG(); + } + + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count != syndrome_disks); + + raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); switch(method) { case RECONSTRUCT_WRITE: @@ -1595,8 +1744,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) { int i, count, disks = sh->disks; void *ptr[MAX_XOR_BLOCKS], *dest, *p; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); + int qd_idx = sh->qd_idx; pr_debug("compute_block_1, stripe %llu, idx %d\n", (unsigned long long)sh->sector, dd_idx); @@ -1632,70 +1780,72 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) { int i, count, disks = sh->disks; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); - int d0_idx = raid6_next_disk(qd_idx, disks); - int faila, failb; + int syndrome_disks = sh->ddf_layout ? disks : disks-2; + int d0_idx = raid6_d0(sh); + int faila = -1, failb = -1; + /**** FIX THIS: This could be very bad if disks is close to 256 ****/ + void *ptrs[syndrome_disks+2]; + + for (i = 0; i < disks ; i++) + ptrs[i] = (void *)raid6_empty_zero_page; + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + ptrs[slot] = page_address(sh->dev[i].page); - /* faila and failb are disk numbers relative to d0_idx */ - /* pd_idx become disks-2 and qd_idx become disks-1 */ - faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; - failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; + if (i == dd_idx1) + faila = slot; + if (i == dd_idx2) + failb = slot; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + BUG_ON(count != syndrome_disks); BUG_ON(faila == failb); if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", - (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); + (unsigned long long)sh->sector, dd_idx1, dd_idx2, + faila, failb); - if ( failb == disks-1 ) { + if (failb == syndrome_disks+1) { /* Q disk is one of the missing disks */ - if ( faila == disks-2 ) { + if (faila == syndrome_disks) { /* Missing P+Q, just recompute */ compute_parity6(sh, UPDATE_PARITY); return; } else { /* We're missing D+Q; recompute D from P */ - compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); + compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ? + dd_idx2 : dd_idx1), + 0); compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? 
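
gen_syndrome() computes P as a plain xor and Q as the Reed-Solomon sum of g^i * D_i over GF(2^8) with generator g = 2. A byte-sized model of both, using Horner's rule like lib/raid6 does, with the usual reduction polynomial 0x11d:

/* One byte per "disk" instead of a STRIPE_SIZE page */
#include <assert.h>

static unsigned char gf_mul2(unsigned char v)
{
	/* multiply by x in GF(2^8), reducing by 0x11d when it overflows */
	return (v << 1) ^ (v & 0x80 ? 0x1d : 0);
}

int main(void)
{
	unsigned char d[4] = { 0x11, 0x22, 0x33, 0x44 };
	unsigned char p = 0, q = 0, q2;
	int i;

	/* Horner's rule: walk data disks from the highest index down */
	for (i = 3; i >= 0; i--) {
		p ^= d[i];
		q = gf_mul2(q) ^ d[i];
	}

	/* q equals the direct sum g^0*d0 ^ g^1*d1 ^ g^2*d2 ^ g^3*d3 */
	q2 = d[0] ^ gf_mul2(d[1])
		^ gf_mul2(gf_mul2(d[2]))
		^ gf_mul2(gf_mul2(gf_mul2(d[3])));
	assert(q == q2);

	/* losing any single data disk leaves plain P-based recovery */
	assert((p ^ d[1] ^ d[2] ^ d[3]) == d[0]);
	return 0;
}
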
*/ return; } } - /* We're missing D+P or D+D; build pointer table */ - { - /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; - - count = 0; - i = d0_idx; - do { - ptrs[count++] = page_address(sh->dev[i].page); - i = raid6_next_disk(i, disks); - if (i != dd_idx1 && i != dd_idx2 && - !test_bit(R5_UPTODATE, &sh->dev[i].flags)) - printk("compute_2 with missing block %d/%d\n", count, i); - } while ( i != d0_idx ); - - if ( failb == disks-2 ) { - /* We're missing D+P. */ - raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); - } else { - /* We're missing D+D. */ - raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); - } - - /* Both the above update both missing blocks */ - set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); - set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); + /* We're missing D+P or D+D; */ + if (failb == syndrome_disks) { + /* We're missing D+P. */ + raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); + } else { + /* We're missing D+D. */ + raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, + ptrs); } + + /* Both the above update both missing blocks */ + set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); + set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); } -static int -handle_write_operations5(struct stripe_head *sh, int rcw, int expand) +static void +schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, + int rcw, int expand) { int i, pd_idx = sh->pd_idx, disks = sh->disks; - int locked = 0; if (rcw) { /* if we are not expanding this is a proper write request, and @@ -1703,50 +1853,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) * stripe cache */ if (!expand) { - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - sh->ops.count++; - } + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + } else + sh->reconstruct_state = reconstruct_state_run; - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - sh->ops.count++; + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (dev->towrite) { set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantdrain, &dev->flags); if (!expand) clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } + if (s->locked + 1 == disks) + if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) + atomic_inc(&sh->raid_conf->pending_full_writes); } else { BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); - set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - - sh->ops.count += 3; + sh->reconstruct_state = reconstruct_state_prexor_drain_run; + set_bit(STRIPE_OP_PREXOR, &s->ops_request); + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (i == pd_idx) continue; - /* For a read-modify write there may be blocks that are - * locked for reading while others are ready to be - * written so we distinguish these blocks by the - * R5_Wantprexor bit - */ if (dev->towrite && (test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - set_bit(R5_Wantprexor, &dev->flags); + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantdrain, &dev->flags); set_bit(R5_LOCKED, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } } @@ -1756,13 +1904,11 @@ 
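
After mapping the failed devices to syndrome slots, compute_block_2() distinguishes four double-failure cases, handled by the calls above. The classification on its own; faila < failb is guaranteed by the swap earlier in the function:

/* Which recovery strategy applies to a given pair of failed slots
 * (data slots < P slot < Q slot). Pure classification; the real work
 * is done by the raid6 recovery routines.
 */
#include <stdio.h>

static const char *classify(int faila, int failb, int syndrome_disks)
{
	if (failb == syndrome_disks + 1)
		return (faila == syndrome_disks)
			? "P+Q lost: regenerate both from data"
			: "D+Q lost: rebuild D from P, then recompute Q";
	if (failb == syndrome_disks)
		return "D+P lost: raid6_datap_recov()";
	return "D+D lost: raid6_2data_recov()";
}

int main(void)
{
	printf("%s\n", classify(2, 4, 4));	/* D+P on a 6-disk array */
	return 0;
}
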
handle_write_operations5(struct stripe_head *sh, int rcw, int expand) */ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - locked++; - - pr_debug("%s: stripe %llu locked: %d pending: %lx\n", - __FUNCTION__, (unsigned long long)sh->sector, - locked, sh->ops.pending); + s->locked++; - return locked; + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", + __func__, (unsigned long long)sh->sector, + s->locked, s->ops_request); } /* @@ -1801,7 +1947,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (*bip) bi->bi_next = *bip; *bip = bi; - bi->bi_phys_segments ++; + bi->bi_phys_segments++; spin_unlock_irq(&conf->device_lock); spin_unlock(&sh->lock); @@ -1847,21 +1993,23 @@ static int page_is_zero(struct page *p) memcmp(a, a+4, STRIPE_SIZE-4)==0); } -static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) +static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, + struct stripe_head *sh) { int sectors_per_chunk = conf->chunk_size >> 9; - int pd_idx, dd_idx; + int dd_idx; int chunk_offset = sector_div(stripe, sectors_per_chunk); + int disks = previous ? conf->previous_raid_disks : conf->raid_disks; - raid5_compute_sector(stripe * (disks - conf->max_degraded) + raid5_compute_sector(conf, + stripe * (disks - conf->max_degraded) *sectors_per_chunk + chunk_offset, - disks, disks - conf->max_degraded, - &dd_idx, &pd_idx, conf); - return pd_idx; + previous, + &dd_idx, sh); } static void -handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, +handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, struct bio **return_bi) { @@ -1895,7 +2043,7 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(bi)) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; @@ -1910,7 +2058,7 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(bi)) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; @@ -1934,7 +2082,7 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(bi)) { bi->bi_next = *return_bi; *return_bi = bi; } @@ -1947,49 +2095,43 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, STRIPE_SECTORS, 0, 0); } + if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) + if (atomic_dec_and_test(&conf->pending_full_writes)) + md_wakeup_thread(conf->mddev->thread); } -/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks - * to process +/* fetch_block5 - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. 
+ * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill5 to continue */ -static int __handle_issuing_new_read_requests5(struct stripe_head *sh, - struct stripe_head_state *s, int disk_idx, int disks) +static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *failed_dev = &sh->dev[s->failed_num]; - /* don't schedule compute operations or reads on the parity block while - * a check is in flight - */ - if ((disk_idx == sh->pd_idx) && - test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) - return ~0; - /* is the data in this block needed, and can we get it? */ if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || (s->failed && - (failed_dev->toread || (failed_dev->towrite && - !test_bit(R5_OVERWRITE, &failed_dev->flags) - ))))) { - /* 1/ We would like to get this block, possibly by computing it, - * but we might not be able to. - * - * 2/ Since parity check operations potentially make the parity - * block !uptodate it will need to be refreshed before any - * compute operations on data disks are scheduled. - * - * 3/ We hold off parity block re-reads until check operations - * have quiesced. + !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->failed && + (failed_dev->toread || + (failed_dev->towrite && + !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { + /* We would like to get this block, possibly by computing it, + * otherwise read it if the backing disk is insync */ if ((s->uptodate == disks - 1) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + (s->failed && disk_idx == s->failed_num)) { + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; s->req_compute = 1; - sh->ops.count++; /* Careful: from this point on 'uptodate' is in the eye * of raid5_run_ops which services 'compute' operations * before writes. R5_Wantcompute flags a block that will @@ -1997,58 +2139,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * subsequent operation. */ s->uptodate++; - return 0; /* uptodate + compute == disks */ - } else if ((s->uptodate < disks - 1) && - test_bit(R5_Insync, &dev->flags)) { - /* Note: we hold off compute operations while checks are - * in flight, but we still prefer 'compute' over 'read' - * hence we only read if (uptodate < * disks-1) - */ + return 1; /* uptodate + compute == disks */ + } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; pr_debug("Reading block %d (sync=%d)\n", disk_idx, s->syncing); } } - return ~0; + return 0; } -static void handle_issuing_new_read_requests5(struct stripe_head *sh, +/** + * handle_stripe_fill5 - read or compute data to satisfy pending requests. + */ +static void handle_stripe_fill5(struct stripe_head *sh, struct stripe_head_state *s, int disks) { int i; - /* Clear completed compute operations. 
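
fetch_block5() boils down to two predicates: compute the block when it is the single missing piece of an otherwise complete stripe, otherwise read it if the backing device is in sync. Reduced to that decision, with stand-in types:

/* Per-device decision made by fetch_block5() */
#include <stdio.h>

struct dev_state { int uptodate, insync, wanted; };

static const char *fetch_action(struct dev_state *d, int stripe_uptodate,
				int disks, int is_failed_disk)
{
	if (!d->wanted || d->uptodate)
		return "nothing to do";
	if (stripe_uptodate == disks - 1 && is_failed_disk)
		return "compute";	/* all other blocks present */
	if (d->insync)
		return "read";
	return "wait";			/* cannot be satisfied yet */
}

int main(void)
{
	struct dev_state missing = { .uptodate = 0, .insync = 0, .wanted = 1 };

	/* 5 of 6 blocks present and this is the failed disk: rebuild it */
	printf("%s\n", fetch_action(&missing, 5, 6, 1));
	return 0;
}
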
Parity recovery - * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled - * later on in this routine - */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - /* look for blocks to read/compute, skip this if a compute * is already in flight, or if the stripe contents are in the * midst of changing due to a write */ - if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && - !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && + !sh->reconstruct_state) for (i = disks; i--; ) - if (__handle_issuing_new_read_requests5( - sh, s, i, disks) == 0) + if (fetch_block5(sh, s, i, disks)) break; - } set_bit(STRIPE_HANDLE, &sh->state); } -static void handle_issuing_new_read_requests6(struct stripe_head *sh, +static void handle_stripe_fill6(struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@ -2069,7 +2193,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, /* we would like to get this block, possibly * by computing it, but we might not be able to */ - if (s->uptodate == disks-1) { + if ((s->uptodate == disks - 1) && + (s->failed && (i == r6s->failed_num[0] || + i == r6s->failed_num[1]))) { pr_debug("Computing stripe %llu block %d\n", (unsigned long long)sh->sector, i); compute_block_1(sh, i, 0); @@ -2105,12 +2231,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, } -/* handle_completed_write_requests +/* handle_stripe_clean_event * any written block on an uptodate or failed drive can be returned. * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but * never LOCKED, so we don't need to test 'failed' directly. 
*/ -static void handle_completed_write_requests(raid5_conf_t *conf, +static void handle_stripe_clean_event(raid5_conf_t *conf, struct stripe_head *sh, int disks, struct bio **return_bi) { int i; @@ -2131,7 +2257,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf, while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); - if (--wbi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(wbi)) { md_write_end(conf->mddev); wbi->bi_next = *return_bi; *return_bi = wbi; @@ -2149,9 +2275,13 @@ static void handle_completed_write_requests(raid5_conf_t *conf, 0); } } + + if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) + if (atomic_dec_and_test(&conf->pending_full_writes)) + md_wakeup_thread(conf->mddev->thread); } -static void handle_issuing_new_write_requests5(raid5_conf_t *conf, +static void handle_stripe_dirtying5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { int rmw = 0, rcw = 0, i; @@ -2195,9 +2325,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2221,9 +2348,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2241,14 +2365,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, * simultaneously. If this is not the case then new writes need to be * held off until the compute completes. 
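
handle_stripe_dirtying5() picks between read-modify-write and reconstruct-write by counting the reads each plan would need and scheduling the cheaper one. A byte-sized model of that costing, simplified to assume nothing is already uptodate in the cache (the kernel skips cached blocks):

/* rmw: read old data of rewritten blocks + old parity
 * rcw: read every block that is not being overwritten
 */
#include <stdio.h>

int main(void)
{
	int disks = 6, pd_idx = 5;		/* 5 data + parity */
	int towrite[6] = { 1, 1, 0, 0, 0, 0 };	/* two blocks dirty */
	int rmw = 0, rcw = 0, i;

	for (i = 0; i < disks; i++) {
		if (towrite[i] || i == pd_idx)
			rmw++;		/* old contents needed for prexor */
		else
			rcw++;		/* remaining data needed for P */
	}
	printf("rmw needs %d reads, rcw needs %d -> do %s\n",
	       rmw, rcw, rmw <= rcw ? "read-modify-write" : "reconstruct");
	return 0;
}
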
*/ - if ((s->req_compute || - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && - (s->locked == 0 && (rcw == 0 || rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state))) - s->locked += handle_write_operations5(sh, rcw == 0, 0); + if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && + (s->locked == 0 && (rcw == 0 || rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state))) + schedule_reconstruction5(sh, s, rcw == 0, 0); } -static void handle_issuing_new_write_requests6(raid5_conf_t *conf, +static void handle_stripe_dirtying6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@ -2333,6 +2456,9 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, s->locked++; set_bit(R5_Wantwrite, &sh->dev[i].flags); } + if (s->locked == disks) + if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) + atomic_inc(&conf->pending_full_writes); /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ set_bit(STRIPE_INSYNC, &sh->state); @@ -2348,84 +2474,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { + struct r5dev *dev = NULL; + set_bit(STRIPE_HANDLE, &sh->state); - /* Take one of the following actions: - * 1/ start a check parity operation if (uptodate == disks) - * 2/ finish a check parity operation and act on the result - * 3/ skip to the writeback section if we previously - * initiated a recovery operation - */ - if (s->failed == 0 && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are no failures */ + if (s->failed == 0) { BUG_ON(s->uptodate != disks); + sh->check_state = check_state_run; + set_bit(STRIPE_OP_CHECK, &s->ops_request); clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); - sh->ops.count++; s->uptodate--; - } else if ( - test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { - clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); - clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); - - if (sh->ops.zero_sum_result == 0) - /* parity is correct (on disc, - * not in buffer any more) - */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - conf->mddev->resync_mismatches += - STRIPE_SECTORS; - if (test_bit( - MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! 
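
The check_state machine being introduced here runs one transition per handle_stripe() pass. Sketched for a scrub where the xor-zero-sum fails and repair is allowed (MD_RECOVERY_CHECK clear); the enum names mirror the patch, the driver loop is implied:

/* idle -> run -> check_result -> compute_run -> compute_result -> idle */
enum check_states {
	check_state_idle,
	check_state_run,		/* xor-zero-sum in flight  */
	check_state_check_result,	/* result ready to inspect */
	check_state_compute_run,	/* rebuilding bad parity   */
	check_state_compute_result,	/* parity ready to write   */
};

static enum check_states next(enum check_states s, int zero_sum_ok)
{
	switch (s) {
	case check_state_idle:		return check_state_run;
	case check_state_run:		return check_state_check_result;
	case check_state_check_result:	return zero_sum_ok
						? check_state_idle
						: check_state_compute_run;
	case check_state_compute_run:	return check_state_compute_result;
	default:			return check_state_idle;
	}
}

int main(void)
{
	return next(check_state_check_result, 0) != check_state_compute_run;
}
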
*/ - set_bit(STRIPE_INSYNC, &sh->state); - else { - set_bit(STRIPE_OP_COMPUTE_BLK, - &sh->ops.pending); - set_bit(STRIPE_OP_MOD_REPAIR_PD, - &sh->ops.pending); - set_bit(R5_Wantcompute, - &sh->dev[sh->pd_idx].flags); - sh->ops.target = sh->pd_idx; - sh->ops.count++; - s->uptodate++; - } - } + break; } - } - - /* check if we can clear a parity disk reconstruct */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - - clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } + dev = &sh->dev[s->failed_num]; + /* fall through */ + case check_state_compute_result: + sh->check_state = check_state_idle; + if (!dev) + dev = &sh->dev[sh->pd_idx]; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; - /* Wait for check parity and compute block operations to complete - * before write-back - */ - if (!test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { - struct r5dev *dev; /* either failed parity check, or recovery is happening */ - if (s->failed == 0) - s->failed_num = sh->pd_idx; - dev = &sh->dev[s->failed_num]; BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); BUG_ON(s->uptodate != disks); set_bit(R5_LOCKED, &dev->flags); + s->locked++; set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; clear_bit(STRIPE_DEGRADED, &sh->state); - s->locked++; set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* if a failure occurred during the check operation, leave + * STRIPE_INSYNC not set and let the stripe be handled again + */ + if (s->failed) + break; + + /* handle a successful check operation, if parity is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if (sh->ops.zero_sum_result == 0) + /* parity is correct (on disc, + * not in buffer any more) + */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! 
*/ + set_bit(STRIPE_INSYNC, &sh->state); + else { + sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, + &sh->dev[sh->pd_idx].flags); + sh->ops.target = sh->pd_idx; + s->uptodate++; + } + } + break; + case check_state_compute_run: + break; + default: + printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); } } @@ -2535,16 +2663,13 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); for (i = 0; i < sh->disks; i++) if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) { - int dd_idx, pd_idx, j; + int dd_idx, j; struct stripe_head *sh2; sector_t bn = compute_blocknr(sh, i); - sector_t s = raid5_compute_sector(bn, conf->raid_disks, - conf->raid_disks - - conf->max_degraded, &dd_idx, - &pd_idx, conf); - sh2 = get_active_stripe(conf, s, conf->raid_disks, - pd_idx, 1); + sector_t s = raid5_compute_sector(conf, bn, 0, + &dd_idx, NULL); + sh2 = get_active_stripe(conf, s, 0, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. When later blocks @@ -2567,8 +2692,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); for (j = 0; j < conf->raid_disks; j++) if (j != sh2->pd_idx && - (!r6s || j != raid6_next_disk(sh2->pd_idx, - sh2->disks)) && + (!r6s || j != sh2->qd_idx) && !test_bit(R5_Expanded, &sh2->dev[j].flags)) break; if (j == conf->raid_disks) { @@ -2585,6 +2709,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, } } + /* * handle_stripe - do things to a stripe. 
* @@ -2602,20 +2727,21 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, * */ -static void handle_stripe5(struct stripe_head *sh) +static bool handle_stripe5(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; struct bio *return_bi = NULL; struct stripe_head_state s; struct r5dev *dev; - unsigned long pending = 0; + mdk_rdev_t *blocked_rdev = NULL; + int prexor; memset(&s, 0, sizeof(s)); - pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " - "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), sh->pd_idx, - sh->ops.pending, sh->ops.ack, sh->ops.complete); + pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " + "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, + atomic_read(&sh->count), sh->pd_idx, sh->check_state, + sh->reconstruct_state); spin_lock(&sh->lock); clear_bit(STRIPE_HANDLE, &sh->state); @@ -2624,15 +2750,8 @@ static void handle_stripe5(struct stripe_head *sh) s.syncing = test_bit(STRIPE_SYNCING, &sh->state); s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - /* Now to look around and see what can be done */ - - /* clean-up completed biofill operations */ - if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) { - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); - } + /* Now to look around and see what can be done */ rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; @@ -2646,10 +2765,10 @@ static void handle_stripe5(struct stripe_head *sh) /* maybe we can request a biofill operation * * new wantfill requests are only permitted while - * STRIPE_OP_BIOFILL is clear + * ops_complete_biofill is guaranteed to be inactive */ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && - !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) set_bit(R5_Wantfill, &dev->flags); /* now count some things */ @@ -2669,6 +2788,11 @@ static void handle_stripe5(struct stripe_head *sh) if (dev->written) s.written++; rdev = rcu_dereference(conf->disks[i].rdev); + if (blocked_rdev == NULL && + rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); + } if (!rdev || !test_bit(In_sync, &rdev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); @@ -2683,8 +2807,21 @@ static void handle_stripe5(struct stripe_head *sh) } rcu_read_unlock(); - if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) - sh->ops.count++; + if (unlikely(blocked_rdev)) { + if (s.syncing || s.expanding || s.expanded || + s.to_write || s.written) { + set_bit(STRIPE_HANDLE, &sh->state); + goto unlock; + } + /* There is nothing for the blocked_rdev to block */ + rdev_dec_pending(blocked_rdev, conf->mddev); + blocked_rdev = NULL; + } + + if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { + set_bit(STRIPE_OP_BIOFILL, &s.ops_request); + set_bit(STRIPE_BIOFILL_RUN, &sh->state); + } pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d\n", @@ -2694,8 +2831,7 @@ static void handle_stripe5(struct stripe_head *sh) * need to be failed */ if (s.failed > 1 && s.to_read+s.to_write+s.written) - handle_requests_to_failed_array(conf, sh, &s, disks, - &return_bi); + handle_failed_stripe(conf, sh, &s, disks, &return_bi); if (s.failed > 1 && 
s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); @@ -2711,46 +2847,25 @@ static void handle_stripe5(struct stripe_head *sh) !test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) || (s.failed == 1 && s.failed_num == sh->pd_idx))) - handle_completed_write_requests(conf, sh, disks, &return_bi); + handle_stripe_clean_event(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests * or to load a block that is being partially written. */ if (s.to_read || s.non_overwrite || - (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || - test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) - handle_issuing_new_read_requests5(sh, &s, disks); + (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) + handle_stripe_fill5(sh, &s, disks); /* Now we check to see if any write operations have recently * completed */ - - /* leave prexor set until postxor is done, allows us to distinguish - * a rmw from a rcw during biodrain - */ - if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && - test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { - - clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); - clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); - - for (i = disks; i--; ) - clear_bit(R5_Wantprexor, &sh->dev[i].flags); - } - - /* if only POSTXOR is set then this is an 'expand' postxor */ - if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) && - test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { - - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack); - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + prexor = 0; + if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) + prexor = 1; + if (sh->reconstruct_state == reconstruct_state_drain_result || + sh->reconstruct_state == reconstruct_state_prexor_drain_result) { + sh->reconstruct_state = reconstruct_state_idle; /* All the 'written' buffers and the parity block are ready to * be written back to disk @@ -2762,9 +2877,8 @@ static void handle_stripe5(struct stripe_head *sh) (i == sh->pd_idx || dev->written)) { pr_debug("Writing block %d\n", i); set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; + if (prexor) + continue; if (!test_bit(R5_Insync, &dev->flags) || (i == sh->pd_idx && s.failed == 0)) set_bit(STRIPE_INSYNC, &sh->state); @@ -2784,20 +2898,18 @@ static void handle_stripe5(struct stripe_head *sh) * 2/ A 'check' operation is in flight, as it may clobber the parity * block. */ - if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) - handle_issuing_new_write_requests5(conf, sh, &s, disks); + if (s.to_write && !sh->reconstruct_state && !sh->check_state) + handle_stripe_dirtying5(conf, sh, &s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough * data is available. The parity check is held off while parity * dependent operations are in flight. 
*/ - if ((s.syncing && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && - !test_bit(STRIPE_INSYNC, &sh->state)) || - test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) + if (sh->check_state || + (s.syncing && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && + !test_bit(STRIPE_INSYNC, &sh->state))) handle_parity_checks5(conf, sh, &s, disks); if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { @@ -2816,49 +2928,35 @@ static void handle_stripe5(struct stripe_head *sh) dev = &sh->dev[s.failed_num]; if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); s.locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; set_bit(R5_LOCKED, &dev->flags); s.locked++; } } - /* Finish postxor operations initiated by the expansion - * process - */ - if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) && - !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) { - + /* Finish reconstruct operations initiated by the expansion process */ + if (sh->reconstruct_state == reconstruct_state_result) { + sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); - - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - for (i = conf->raid_disks; i--; ) { set_bit(R5_Wantwrite, &sh->dev[i].flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; + set_bit(R5_LOCKED, &sh->dev[i].flags); + s.locked++; } } if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + !sh->reconstruct_state) { /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, - conf->raid_disks); - s.locked += handle_write_operations5(sh, 1, 1); - } else if (s.expanded && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + stripe_set_idx(sh->sector, conf, 0, sh); + schedule_reconstruction5(sh, &s, 1, 1); + } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); @@ -2866,32 +2964,38 @@ static void handle_stripe5(struct stripe_head *sh) } if (s.expanding && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) + !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) handle_stripe_expansion(conf, sh, NULL); - if (sh->ops.count) - pending = get_stripe_work(sh); - + unlock: spin_unlock(&sh->lock); - if (pending) - raid5_run_ops(sh, pending); + /* wait for this device to become unblocked */ + if (unlikely(blocked_rdev)) + md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); + + if (s.ops_request) + raid5_run_ops(sh, s.ops_request); + + ops_run_io(sh, &s); return_io(return_bi); + return blocked_rdev == NULL; } -static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) +static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) { - raid6_conf_t *conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks; struct bio *return_bi = NULL; int i, pd_idx = sh->pd_idx; struct stripe_head_state s; struct r6_state r6s; struct r5dev *dev, *pdev, 
*qdev; + mdk_rdev_t *blocked_rdev = NULL; - r6s.qd_idx = raid6_next_disk(pd_idx, disks); + r6s.qd_idx = sh->qd_idx; pr_debug("handling stripe %llu, state=%#lx cnt=%d, " "pd_idx=%d, qd_idx=%d\n", (unsigned long long)sh->sector, sh->state, @@ -2929,7 +3033,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) copy_data(0, rbi, dev->page, dev->sector); rbi2 = r5_next_bio(rbi, dev->sector); spin_lock_irq(&conf->device_lock); - if (--rbi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(rbi)) { rbi->bi_next = return_bi; return_bi = rbi; } @@ -2953,6 +3057,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (dev->written) s.written++; rdev = rcu_dereference(conf->disks[i].rdev); + if (blocked_rdev == NULL && + rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); + } if (!rdev || !test_bit(In_sync, &rdev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); @@ -2967,16 +3076,27 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) set_bit(R5_Insync, &dev->flags); } rcu_read_unlock(); - pr_debug("locked=%d uptodate=%d to_read=%d" - " to_write=%d failed=%d failed_num=%d,%d\n", + + if (unlikely(blocked_rdev)) { + if (s.syncing || s.expanding || s.expanded || + s.to_write || s.written) { + set_bit(STRIPE_HANDLE, &sh->state); + goto unlock; + } + /* There is nothing for the blocked_rdev to block */ + rdev_dec_pending(blocked_rdev, conf->mddev); + blocked_rdev = NULL; + } + + pr_debug("locked=%d uptodate=%d to_read=%d" + " to_write=%d failed=%d failed_num=%d,%d\n", s.locked, s.uptodate, s.to_read, s.to_write, s.failed, r6s.failed_num[0], r6s.failed_num[1]); /* check if the array has lost >2 devices and, if so, some requests * might need to be failed */ if (s.failed > 2 && s.to_read+s.to_write+s.written) - handle_requests_to_failed_array(conf, sh, &s, disks, - &return_bi); + handle_failed_stripe(conf, sh, &s, disks, &return_bi); if (s.failed > 2 && s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); @@ -3001,7 +3121,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) && test_bit(R5_UPTODATE, &qdev->flags))))) - handle_completed_write_requests(conf, sh, disks, &return_bi); + handle_stripe_clean_event(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests @@ -3009,11 +3129,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) */ if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || (s.syncing && (s.uptodate < disks)) || s.expanding) - handle_issuing_new_read_requests6(sh, &s, &r6s, disks); + handle_stripe_fill6(sh, &s, &r6s, disks); /* now to consider writing and what else, if anything should be read */ if (s.to_write) - handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); + handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough @@ -3052,8 +3172,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { /* Need to write out all blocks after computing P&Q */ sh->disks = conf->raid_disks; - sh->pd_idx = 
stripe_to_pdidx(sh->sector, conf, - conf->raid_disks); + stripe_set_idx(sh->sector, conf, 0, sh); compute_parity6(sh, RECONSTRUCT_WRITE); for (i = conf->raid_disks ; i-- ; ) { set_bit(R5_LOCKED, &sh->dev[i].flags); @@ -3069,79 +3188,30 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } if (s.expanding && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) + !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) handle_stripe_expansion(conf, sh, &r6s); + unlock: spin_unlock(&sh->lock); - return_io(return_bi); + /* wait for this device to become unblocked */ + if (unlikely(blocked_rdev)) + md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - for (i=disks; i-- ;) { - int rw; - struct bio *bi; - mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = WRITE; - else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) - rw = READ; - else - continue; + ops_run_io(sh, &s); - bi = &sh->dev[i].req; - - bi->bi_rw = rw; - if (rw == WRITE) - bi->bi_end_io = raid5_end_write_request; - else - bi->bi_end_io = raid5_end_read_request; - - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = NULL; - if (rdev) - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - if (rdev) { - if (s.syncing || s.expanding || s.expanded) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); + return_io(return_bi); - bi->bi_bdev = rdev->bdev; - pr_debug("for %llu schedule op %ld on disc %d\n", - (unsigned long long)sh->sector, bi->bi_rw, i); - atomic_inc(&sh->count); - bi->bi_sector = sh->sector + rdev->data_offset; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_vcnt = 1; - bi->bi_max_vecs = 1; - bi->bi_idx = 0; - bi->bi_io_vec = &sh->dev[i].vec; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; - bi->bi_io_vec[0].bv_offset = 0; - bi->bi_size = STRIPE_SIZE; - bi->bi_next = NULL; - if (rw == WRITE && - test_bit(R5_ReWrite, &sh->dev[i].flags)) - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); - generic_make_request(bi); - } else { - if (rw == WRITE) - set_bit(STRIPE_DEGRADED, &sh->state); - pr_debug("skip op %ld on disc %d for sector %llu\n", - bi->bi_rw, i, (unsigned long long)sh->sector); - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - } - } + return blocked_rdev == NULL; } -static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) +/* returns true if the stripe was handled */ +static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) { if (sh->raid_conf->level == 6) - handle_stripe6(sh, tmp_page); + return handle_stripe6(sh, tmp_page); else - handle_stripe5(sh); + return handle_stripe5(sh); } @@ -3157,9 +3227,10 @@ static void raid5_activate_delayed(raid5_conf_t *conf) clear_bit(STRIPE_DELAYED, &sh->state); if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); - list_add_tail(&sh->lru, &conf->handle_list); + list_add_tail(&sh->lru, &conf->hold_list); } - } + } else + blk_plug_device(conf->mddev->queue); } static void activate_bit_delay(raid5_conf_t *conf) @@ -3239,15 +3310,17 @@ static int raid5_congested(void *data, int bits) /* We want read requests to align with chunks where possible, * but write requests don't need to. 
 */
-static int raid5_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec)
+static int raid5_mergeable_bvec(struct request_queue *q,
+				struct bvec_merge_data *bvm,
+				struct bio_vec *biovec)
 {
 	mddev_t *mddev = q->queuedata;
-	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
 	unsigned int chunk_sectors = mddev->chunk_size >> 9;
-	unsigned int bio_sectors = bio->bi_size >> 9;
+	unsigned int bio_sectors = bvm->bi_size >> 9;

-	if (bio_data_dir(bio) == WRITE)
+	if ((bvm->bi_rw & 1) == WRITE)
 		return biovec->bv_len; /* always allow writes to be mergeable */

 	max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
@@ -3300,8 +3373,11 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
 	if(bi) {
 		conf->retry_read_aligned_list = bi->bi_next;
 		bi->bi_next = NULL;
+		/*
+		 * this sets the active stripe count to 1 and the processed
+		 * stripe count to zero (upper 16 bits)
+		 */
 		bi->bi_phys_segments = 1; /* biased count of active stripes */
-		bi->bi_hw_segments = 0; /* count of processed stripes */
 	}

 	return bi;
@@ -3351,8 +3427,7 @@ static int bio_fits_rdev(struct bio *bi)
 	if ((bi->bi_size>>9) > q->max_sectors)
 		return 0;
 	blk_recount_segments(q, bi);
-	if (bi->bi_phys_segments > q->max_phys_segments ||
-	    bi->bi_hw_segments > q->max_hw_segments)
+	if (bi->bi_phys_segments > q->max_phys_segments)
 		return 0;

 	if (q->merge_bvec_fn)
@@ -3369,9 +3444,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
-	const unsigned int raid_disks = conf->raid_disks;
-	const unsigned int data_disks = raid_disks - conf->max_degraded;
-	unsigned int dd_idx, pd_idx;
+	unsigned int dd_idx;
 	struct bio* align_bi;
 	mdk_rdev_t *rdev;

@@ -3380,7 +3453,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 		return 0;
 	}
 	/*
-	 * use bio_clone to make a copy of the bio 
+	 * use bio_clone to make a copy of the bio
 	 */
 	align_bi = bio_clone(raid_bio, GFP_NOIO);
 	if (!align_bi)
@@ -3394,12 +3467,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 	/*
 	 * compute position
 	 */
-	align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector,
-						   raid_disks,
-						   data_disks,
-						   &dd_idx,
-						   &pd_idx,
-						   conf);
+	align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
+						   0,
+						   &dd_idx, NULL);

 	rcu_read_lock();
 	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
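
Both the merge callback and the aligned-read gate above come down to the same power-of-two chunk arithmetic: mask the sector with (chunk_sectors - 1) to find its offset inside the chunk, then see whether the bio fits in what remains. A minimal standalone sketch of that computation; the function name and the sample geometry are invented for illustration and are not part of the patch:

	#include <assert.h>

	/* How many bytes remain in the chunk that a bio ending at
	 * (sector + bio_sectors) falls into?  chunk_sectors must be a
	 * power of two, as raid5 chunk sizes are. */
	static unsigned int bytes_left_in_chunk(unsigned long long sector,
						unsigned int chunk_sectors,
						unsigned int bio_sectors)
	{
		/* offset of the bio's end inside its chunk, in sectors */
		unsigned int end = (sector & (chunk_sectors - 1)) + bio_sectors;

		return (chunk_sectors - end) << 9;	/* sectors to bytes */
	}

	int main(void)
	{
		/* 64KiB chunks are 128 sectors; a 4-sector bio starting at
		 * offset 120 leaves 4 sectors, i.e. 2048 bytes, of headroom. */
		assert(bytes_left_in_chunk(120, 128, 4) == 2048);
		return 0;
	}

@@ -3434,17 +3504,69 @@
 	}
 }

+/* __get_priority_stripe - get the next stripe to process
+ *
+ * Full stripe writes are allowed to pass preread active stripes up until
+ * the bypass_threshold is exceeded. In general the bypass_count
+ * increments when the handle_list is handled before the hold_list; however, it
+ * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
+ * stripe with in-flight i/o. The bypass_count will be reset when the
+ * head of the hold_list has changed, i.e. the head was promoted to the
+ * handle_list.
+ */
+static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
+{
+	struct stripe_head *sh;
+
+	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
+		  __func__,
+		  list_empty(&conf->handle_list) ? "empty" : "busy",
+		  list_empty(&conf->hold_list) ?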
"empty" : "busy", + atomic_read(&conf->pending_full_writes), conf->bypass_count); + + if (!list_empty(&conf->handle_list)) { + sh = list_entry(conf->handle_list.next, typeof(*sh), lru); + + if (list_empty(&conf->hold_list)) + conf->bypass_count = 0; + else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { + if (conf->hold_list.next == conf->last_hold) + conf->bypass_count++; + else { + conf->last_hold = conf->hold_list.next; + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } + } + } else if (!list_empty(&conf->hold_list) && + ((conf->bypass_threshold && + conf->bypass_count > conf->bypass_threshold) || + atomic_read(&conf->pending_full_writes) == 0)) { + sh = list_entry(conf->hold_list.next, + typeof(*sh), lru); + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } else + return NULL; + + list_del_init(&sh->lru); + atomic_inc(&sh->count); + BUG_ON(atomic_read(&sh->count) != 1); + return sh; +} static int make_request(struct request_queue *q, struct bio * bi) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev_to_conf(mddev); - unsigned int dd_idx, pd_idx; + int dd_idx; sector_t new_sector; sector_t logical_sector, last_sector; struct stripe_head *sh; const int rw = bio_data_dir(bi); - int remaining; + int cpu, remaining; if (unlikely(bio_barrier(bi))) { bio_endio(bi, -EOPNOTSUPP); @@ -3453,13 +3575,16 @@ static int make_request(struct request_queue *q, struct bio * bi) md_write_start(mddev, bi); - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi)); + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], + bio_sectors(bi)); + part_stat_unlock(); if (rw == READ && mddev->reshape_position == MaxSector && chunk_aligned_read(q,bi)) - return 0; + return 0; logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bi->bi_sector + (bi->bi_size>>9); @@ -3469,8 +3594,10 @@ static int make_request(struct request_queue *q, struct bio * bi) for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); int disks, data_disks; + int previous; retry: + previous = 0; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); if (likely(conf->expand_progress == MaxSector)) disks = conf->raid_disks; @@ -3485,9 +3612,10 @@ static int make_request(struct request_queue *q, struct bio * bi) */ spin_lock_irq(&conf->device_lock); disks = conf->raid_disks; - if (logical_sector >= conf->expand_progress) + if (logical_sector >= conf->expand_progress) { disks = conf->previous_raid_disks; - else { + previous = 1; + } else { if (logical_sector >= conf->expand_lo) { spin_unlock_irq(&conf->device_lock); schedule(); @@ -3498,13 +3626,15 @@ static int make_request(struct request_queue *q, struct bio * bi) } data_disks = disks - conf->max_degraded; - new_sector = raid5_compute_sector(logical_sector, disks, data_disks, - &dd_idx, &pd_idx, conf); + new_sector = raid5_compute_sector(conf, logical_sector, + previous, + &dd_idx, NULL); pr_debug("raid5: make_request, sector %llu logical %llu\n", (unsigned long long)new_sector, (unsigned long long)logical_sector); - sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); + sh = get_active_stripe(conf, new_sector, previous, + (bi->bi_rw&RWA_MASK)); if (sh) { if (unlikely(conf->expand_progress != MaxSector)) { /* expansion might have moved on while waiting for a 
@@ -3549,7 +3679,8 @@ static int make_request(struct request_queue *q, struct bio * bi) goto retry; } finish_wait(&conf->wait_for_overlap, &w); - handle_stripe(sh, NULL); + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ @@ -3560,16 +3691,14 @@ static int make_request(struct request_queue *q, struct bio * bi) } spin_lock_irq(&conf->device_lock); - remaining = --bi->bi_phys_segments; + remaining = raid5_dec_bi_phys_segments(bi); spin_unlock_irq(&conf->device_lock); if (remaining == 0) { if ( rw == WRITE ) md_write_end(mddev); - bi->bi_end_io(bi, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); + bio_endio(bi, 0); } return 0; } @@ -3587,7 +3716,6 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped */ raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; - int pd_idx; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; int data_disks = raid_disks - conf->max_degraded; @@ -3639,9 +3767,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { int j; int skipped = 0; - pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); - sh = get_active_stripe(conf, sector_nr+i, - conf->raid_disks, pd_idx, 0); + sh = get_active_stripe(conf, sector_nr+i, 0, 0); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -3652,10 +3778,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped if (j == sh->pd_idx) continue; if (conf->level == 6 && - j == raid6_next_disk(sh->pd_idx, sh->disks)) + j == sh->qd_idx) continue; s = compute_blocknr(sh, j); - if (s < (mddev->array_size<<1)) { + if (s < mddev->array_sectors) { skipped = 1; continue; } @@ -3678,26 +3804,40 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * block on the destination stripes. */ first_sector = - raid5_compute_sector(sector_nr*(new_data_disks), - raid_disks, data_disks, - &dd_idx, &pd_idx, conf); + raid5_compute_sector(conf, sector_nr*(new_data_disks), + 1, &dd_idx, NULL); last_sector = - raid5_compute_sector((sector_nr+conf->chunk_size/512) - *(new_data_disks) -1, - raid_disks, data_disks, - &dd_idx, &pd_idx, conf); - if (last_sector >= (mddev->size<<1)) - last_sector = (mddev->size<<1)-1; + raid5_compute_sector(conf, ((sector_nr+conf->chunk_size/512) + *(new_data_disks) - 1), + 1, &dd_idx, NULL); + if (last_sector >= mddev->dev_sectors) + last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - pd_idx = stripe_to_pdidx(first_sector, conf, - conf->previous_raid_disks); - sh = get_active_stripe(conf, first_sector, - conf->previous_raid_disks, pd_idx, 0); + sh = get_active_stripe(conf, first_sector, 1, 0); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); first_sector += STRIPE_SECTORS; } + /* If this takes us to the resync_max point where we have to pause, + * then we need to write out the superblock. + */ + sector_nr += conf->chunk_size>>9; + if (sector_nr >= mddev->resync_max) { + /* Cannot proceed until we've updated the superblock... 
+	 */
+		wait_event(conf->wait_for_overlap,
+			   atomic_read(&conf->reshape_stripes) == 0);
+		mddev->reshape_position = conf->expand_progress;
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		md_wakeup_thread(mddev->thread);
+		wait_event(mddev->sb_wait,
+			   !test_bit(MD_CHANGE_DEVS, &mddev->flags)
+			   || kthread_should_stop());
+		spin_lock_irq(&conf->device_lock);
+		conf->expand_lo = mddev->reshape_position;
+		spin_unlock_irq(&conf->device_lock);
+		wake_up(&conf->wait_for_overlap);
+	}
 	return conf->chunk_size>>9;
 }

@@ -3706,9 +3846,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 {
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
-	int pd_idx;
-	int raid_disks = conf->raid_disks;
-	sector_t max_sector = mddev->size << 1;
+	sector_t max_sector = mddev->dev_sectors;
 	int sync_blocks;
 	int still_degraded = 0;
 	int i;
@@ -3734,13 +3872,19 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		return reshape_request(mddev, sector_nr, skipped);

+	/* No need to check resync_max as we never do more than one
+	 * stripe, and as resync_max will always be on a chunk boundary,
+	 * if the check in md_do_sync didn't fire, there is no chance
+	 * of overstepping resync_max here
+	 */
+
 	/* if there are too many failed drives and we are trying
 	 * to resync, then assert that we are finished, because there is
 	 * nothing we can do.
 	 */
 	if (mddev->degraded >= conf->max_degraded &&
 	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-		sector_t rv = (mddev->size << 1) - sector_nr;
+		sector_t rv = mddev->dev_sectors - sector_nr;
 		*skipped = 1;
 		return rv;
 	}
@@ -3756,10 +3900,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski

 	bitmap_cond_end_sync(mddev->bitmap, sector_nr);

-	pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
-	sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
+	sh = get_active_stripe(conf, sector_nr, 0, 1);
 	if (sh == NULL) {
-		sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
+		sh = get_active_stripe(conf, sector_nr, 0, 0);
 		/* make sure we don't swamp the stripe cache if someone else
 		 * is trying to get access
 		 */
@@ -3780,7 +3923,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	clear_bit(STRIPE_INSYNC, &sh->state);
 	spin_unlock(&sh->lock);

-	handle_stripe(sh, NULL);
+	/* wait for any blocked device to be handled */
+	while(unlikely(!handle_stripe(sh, NULL)))
+		;
 	release_stripe(sh);

 	return STRIPE_SECTORS;
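
handle_stripe now returns a bool: false means it bailed out because a device was blocked, so sync_request above simply retries until the stripe really has been handled. A minimal model of that retry contract; try_handle, blocked_rounds and the counts are invented for illustration:

	#include <stdbool.h>
	#include <stdio.h>

	/* Model of a handler that reports false while an external
	 * condition (a blocked device) prevents it from completing,
	 * as handle_stripe does after this patch. */
	static int blocked_rounds = 2;	/* pretend the device unblocks after two tries */

	static bool try_handle(void)
	{
		if (blocked_rounds > 0) {
			--blocked_rounds;	/* handler waited for the device */
			return false;
		}
		return true;			/* stripe fully handled */
	}

	int main(void)
	{
		int tries = 0;

		while (!try_handle())	/* mirrors: while (!handle_stripe(...)) ; */
			tries++;
		printf("handled after %d retries\n", tries);
		return 0;
	}

@@ -3799,19 +3944,15 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 	 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.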
*/ struct stripe_head *sh; - int dd_idx, pd_idx; + int dd_idx; sector_t sector, logical_sector, last_sector; int scnt = 0; int remaining; int handled = 0; logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - sector = raid5_compute_sector( logical_sector, - conf->raid_disks, - conf->raid_disks - conf->max_degraded, - &dd_idx, - &pd_idx, - conf); + sector = raid5_compute_sector(conf, logical_sector, + 0, &dd_idx, NULL); last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); for (; logical_sector < last_sector; @@ -3819,15 +3960,15 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) sector += STRIPE_SECTORS, scnt++) { - if (scnt < raid_bio->bi_hw_segments) + if (scnt < raid5_bi_hw_segments(raid_bio)) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1); + sh = get_active_stripe(conf, sector, 0, 1); if (!sh) { /* failed to get a stripe - must wait */ - raid_bio->bi_hw_segments = scnt; + raid5_set_bi_hw_segments(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; } @@ -3835,7 +3976,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) set_bit(R5_ReadError, &sh->dev[dd_idx].flags); if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { release_stripe(sh); - raid_bio->bi_hw_segments = scnt; + raid5_set_bi_hw_segments(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; } @@ -3845,14 +3986,10 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) handled++; } spin_lock_irq(&conf->device_lock); - remaining = --raid_bio->bi_phys_segments; + remaining = raid5_dec_bi_phys_segments(raid_bio); spin_unlock_irq(&conf->device_lock); - if (remaining == 0) { - - raid_bio->bi_end_io(raid_bio, - test_bit(BIO_UPTODATE, &raid_bio->bi_flags) - ? 0 : -EIO); - } + if (remaining == 0) + bio_endio(raid_bio, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; @@ -3867,7 +4004,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) * During the scan, completed stripes are saved for us by the interrupt * handler, so that they will not have to wait for our next wakeup. 
*/ -static void raid5d (mddev_t *mddev) +static void raid5d(mddev_t *mddev) { struct stripe_head *sh; raid5_conf_t *conf = mddev_to_conf(mddev); @@ -3880,7 +4017,6 @@ static void raid5d (mddev_t *mddev) handled = 0; spin_lock_irq(&conf->device_lock); while (1) { - struct list_head *first; struct bio *bio; if (conf->seq_flush != conf->seq_write) { @@ -3892,12 +4028,6 @@ static void raid5d (mddev_t *mddev) activate_bit_delay(conf); } - if (list_empty(&conf->handle_list) && - atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && - !blk_queue_plugged(mddev->queue) && - !list_empty(&conf->delayed_list)) - raid5_activate_delayed(conf); - while ((bio = remove_bio_from_retry(conf))) { int ok; spin_unlock_irq(&conf->device_lock); @@ -3908,17 +4038,10 @@ static void raid5d (mddev_t *mddev) handled++; } - if (list_empty(&conf->handle_list)) { - async_tx_issue_pending_all(); - break; - } + sh = __get_priority_stripe(conf); - first = conf->handle_list.next; - sh = list_entry(first, struct stripe_head, lru); - - list_del_init(first); - atomic_inc(&sh->count); - BUG_ON(atomic_read(&sh->count)!= 1); + if (!sh) + break; spin_unlock_irq(&conf->device_lock); handled++; @@ -3931,6 +4054,7 @@ static void raid5d (mddev_t *mddev) spin_unlock_irq(&conf->device_lock); + async_tx_issue_pending_all(); unplug_slaves(mddev); pr_debug("--- raid5d inactive\n"); @@ -3950,15 +4074,15 @@ static ssize_t raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) { raid5_conf_t *conf = mddev_to_conf(mddev); - char *end; - int new; + unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; if (!conf) return -ENODEV; - new = simple_strtoul(page, &end, 10); - if (!*page || (*end && *end != '\n') ) + if (strict_strtoul(page, 10, &new)) return -EINVAL; if (new <= 16 || new > 32768) return -EINVAL; @@ -3968,7 +4092,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) else break; } - md_allow_write(mddev); + err = md_allow_write(mddev); + if (err) + return err; while (new > conf->max_nr_stripes) { if (grow_one_stripe(conf)) conf->max_nr_stripes++; @@ -3983,6 +4109,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, raid5_store_stripe_cache_size); static ssize_t +raid5_show_preread_threshold(mddev_t *mddev, char *page) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + if (conf) + return sprintf(page, "%d\n", conf->bypass_threshold); + else + return 0; +} + +static ssize_t +raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + unsigned long new; + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (strict_strtoul(page, 10, &new)) + return -EINVAL; + if (new > conf->max_nr_stripes) + return -EINVAL; + conf->bypass_threshold = new; + return len; +} + +static struct md_sysfs_entry +raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, + S_IRUGO | S_IWUSR, + raid5_show_preread_threshold, + raid5_store_preread_threshold); + +static ssize_t stripe_cache_active_show(mddev_t *mddev, char *page) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -3998,6 +4158,7 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active); static struct attribute *raid5_attrs[] = { &raid5_stripecache_size.attr, &raid5_stripecache_active.attr, + &raid5_preread_bypass_threshold.attr, NULL, }; static struct attribute_group raid5_attrs_group = { @@ -4005,21 +4166,147 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int run(mddev_t 
*mddev) +static raid5_conf_t *setup_conf(mddev_t *mddev) { raid5_conf_t *conf; int raid_disk, memory; mdk_rdev_t *rdev; struct disk_info *disk; - struct list_head *tmp; - int working_disks = 0; - if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { + if (mddev->new_level != 5 + && mddev->new_level != 4 + && mddev->new_level != 6) { printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", - mdname(mddev), mddev->level); - return -EIO; + mdname(mddev), mddev->new_level); + return ERR_PTR(-EIO); + } + if ((mddev->new_level == 5 + && !algorithm_valid_raid5(mddev->new_layout)) || + (mddev->new_level == 6 + && !algorithm_valid_raid6(mddev->new_layout))) { + printk(KERN_ERR "raid5: %s: layout %d not supported\n", + mdname(mddev), mddev->new_layout); + return ERR_PTR(-EIO); + } + if (mddev->new_level == 6 && mddev->raid_disks < 4) { + printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", + mdname(mddev), mddev->raid_disks); + return ERR_PTR(-EINVAL); + } + + if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) { + printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", + mddev->new_chunk, mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); + if (conf == NULL) + goto abort; + + conf->raid_disks = mddev->raid_disks; + if (mddev->reshape_position == MaxSector) + conf->previous_raid_disks = mddev->raid_disks; + else + conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; + + conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), + GFP_KERNEL); + if (!conf->disks) + goto abort; + + conf->mddev = mddev; + + if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) + goto abort; + + if (mddev->new_level == 6) { + conf->spare_page = alloc_page(GFP_KERNEL); + if (!conf->spare_page) + goto abort; + } + spin_lock_init(&conf->device_lock); + init_waitqueue_head(&conf->wait_for_stripe); + init_waitqueue_head(&conf->wait_for_overlap); + INIT_LIST_HEAD(&conf->handle_list); + INIT_LIST_HEAD(&conf->hold_list); + INIT_LIST_HEAD(&conf->delayed_list); + INIT_LIST_HEAD(&conf->bitmap_list); + INIT_LIST_HEAD(&conf->inactive_list); + atomic_set(&conf->active_stripes, 0); + atomic_set(&conf->preread_active_stripes, 0); + atomic_set(&conf->active_aligned_reads, 0); + conf->bypass_threshold = BYPASS_THRESHOLD; + + pr_debug("raid5: run(%s) called.\n", mdname(mddev)); + + list_for_each_entry(rdev, &mddev->disks, same_set) { + raid_disk = rdev->raid_disk; + if (raid_disk >= conf->raid_disks + || raid_disk < 0) + continue; + disk = conf->disks + raid_disk; + + disk->rdev = rdev; + + if (test_bit(In_sync, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + printk(KERN_INFO "raid5: device %s operational as raid" + " disk %d\n", bdevname(rdev->bdev,b), + raid_disk); + } else + /* Cannot rely on bitmap to complete recovery */ + conf->fullsync = 1; } + conf->chunk_size = mddev->new_chunk; + conf->level = mddev->new_level; + if (conf->level == 6) + conf->max_degraded = 2; + else + conf->max_degraded = 1; + conf->algorithm = mddev->new_layout; + conf->max_nr_stripes = NR_STRIPES; + conf->expand_progress = mddev->reshape_position; + + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; + if (grow_stripes(conf, conf->max_nr_stripes)) { + printk(KERN_ERR + "raid5: couldn't allocate %dkB for buffers\n", memory); + goto abort; + } else + printk(KERN_INFO "raid5: allocated %dkB for %s\n", + memory, mdname(mddev)); + + conf->thread = 
md_register_thread(raid5d, mddev, "%s_raid5"); + if (!conf->thread) { + printk(KERN_ERR + "raid5: couldn't allocate thread for %s\n", + mdname(mddev)); + goto abort; + } + + return conf; + + abort: + if (conf) { + shrink_stripes(conf); + safe_put_page(conf->spare_page); + kfree(conf->disks); + kfree(conf->stripe_hashtbl); + kfree(conf); + return ERR_PTR(-EIO); + } else + return ERR_PTR(-ENOMEM); +} + +static int run(mddev_t *mddev) +{ + raid5_conf_t *conf; + int working_disks = 0; + mdk_rdev_t *rdev; + if (mddev->reshape_position != MaxSector) { /* Check that we can continue the reshape. * Currently only disks can change, it must @@ -4070,100 +4357,35 @@ static int run(mddev_t *mddev) } printk(KERN_INFO "raid5: reshape will continue\n"); /* OK, we should be able to continue; */ - } - - - mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); - if ((conf = mddev->private) == NULL) - goto abort; - if (mddev->reshape_position == MaxSector) { - conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; } else { - conf->raid_disks = mddev->raid_disks; - conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; + BUG_ON(mddev->level != mddev->new_level); + BUG_ON(mddev->layout != mddev->new_layout); + BUG_ON(mddev->chunk_size != mddev->new_chunk); + BUG_ON(mddev->delta_disks != 0); } - conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), - GFP_KERNEL); - if (!conf->disks) - goto abort; - - conf->mddev = mddev; - - if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) - goto abort; - - if (mddev->level == 6) { - conf->spare_page = alloc_page(GFP_KERNEL); - if (!conf->spare_page) - goto abort; - } - spin_lock_init(&conf->device_lock); - init_waitqueue_head(&conf->wait_for_stripe); - init_waitqueue_head(&conf->wait_for_overlap); - INIT_LIST_HEAD(&conf->handle_list); - INIT_LIST_HEAD(&conf->delayed_list); - INIT_LIST_HEAD(&conf->bitmap_list); - INIT_LIST_HEAD(&conf->inactive_list); - atomic_set(&conf->active_stripes, 0); - atomic_set(&conf->preread_active_stripes, 0); - atomic_set(&conf->active_aligned_reads, 0); + if (mddev->private == NULL) + conf = setup_conf(mddev); + else + conf = mddev->private; - pr_debug("raid5: run(%s) called.\n", mdname(mddev)); + if (IS_ERR(conf)) + return PTR_ERR(conf); - ITERATE_RDEV(mddev,rdev,tmp) { - raid_disk = rdev->raid_disk; - if (raid_disk >= conf->raid_disks - || raid_disk < 0) - continue; - disk = conf->disks + raid_disk; - - disk->rdev = rdev; - - if (test_bit(In_sync, &rdev->flags)) { - char b[BDEVNAME_SIZE]; - printk(KERN_INFO "raid5: device %s operational as raid" - " disk %d\n", bdevname(rdev->bdev,b), - raid_disk); - working_disks++; - } - } + mddev->thread = conf->thread; + conf->thread = NULL; + mddev->private = conf; /* * 0 for a fully functional array, 1 or 2 for a degraded array. 
*/ - mddev->degraded = conf->raid_disks - working_disks; - conf->mddev = mddev; - conf->chunk_size = mddev->chunk_size; - conf->level = mddev->level; - if (conf->level == 6) - conf->max_degraded = 2; - else - conf->max_degraded = 1; - conf->algorithm = mddev->layout; - conf->max_nr_stripes = NR_STRIPES; - conf->expand_progress = mddev->reshape_position; + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0 && + test_bit(In_sync, &rdev->flags)) + working_disks++; - /* device size must be a multiple of chunk size */ - mddev->size &= ~(mddev->chunk_size/1024 -1); - mddev->resync_max_sectors = mddev->size << 1; + mddev->degraded = conf->raid_disks - working_disks; - if (conf->level == 6 && conf->raid_disks < 4) { - printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", - mdname(mddev), conf->raid_disks); - goto abort; - } - if (!conf->chunk_size || conf->chunk_size % 4) { - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", - conf->chunk_size, mdname(mddev)); - goto abort; - } - if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { - printk(KERN_ERR - "raid5: unsupported parity algorithm %d for %s\n", - conf->algorithm, mdname(mddev)); - goto abort; - } if (mddev->degraded > conf->max_degraded) { printk(KERN_ERR "raid5: not enough operational devices for %s" " (%d/%d failed)\n", @@ -4171,6 +4393,10 @@ static int run(mddev_t *mddev) goto abort; } + /* device size must be a multiple of chunk size */ + mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); + mddev->resync_max_sectors = mddev->dev_sectors; + if (mddev->degraded > 0 && mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) @@ -4186,27 +4412,6 @@ static int run(mddev_t *mddev) } } - { - mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5"); - if (!mddev->thread) { - printk(KERN_ERR - "raid5: couldn't allocate thread for %s\n", - mdname(mddev)); - goto abort; - } - } - memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + - conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; - if (grow_stripes(conf, conf->max_nr_stripes)) { - printk(KERN_ERR - "raid5: couldn't allocate %dkB for buffers\n", memory); - shrink_stripes(conf); - md_unregister_thread(mddev->thread); - goto abort; - } else - printk(KERN_INFO "raid5: allocated %dkB for %s\n", - memory, mdname(mddev)); - if (mddev->degraded == 0) printk("raid5: raid level %d set %s active with %d out of %d" " devices, algorithm %d\n", conf->level, mdname(mddev), @@ -4249,18 +4454,23 @@ static int run(mddev_t *mddev) "raid5: failed to create sysfs attributes for %s\n", mdname(mddev)); + mddev->queue->queue_lock = &conf->device_lock; + mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - mddev->array_size = mddev->size * (conf->previous_raid_disks - - conf->max_degraded); + mddev->array_sectors = mddev->dev_sectors * + (conf->previous_raid_disks - conf->max_degraded); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); return 0; abort: + md_unregister_thread(mddev->thread); + mddev->thread = NULL; if (conf) { + shrink_stripes(conf); print_raid5_conf(conf); safe_put_page(conf->spare_page); kfree(conf->disks); @@ -4292,7 +4502,7 @@ static int stop(mddev_t *mddev) } #ifdef DEBUG -static void print_sh (struct seq_file *seq, struct stripe_head *sh) +static void print_sh(struct seq_file *seq, struct stripe_head *sh) { int i; @@ -4308,7 +4518,7 @@ static void print_sh (struct seq_file *seq, 
struct stripe_head *sh) seq_printf(seq, "\n"); } -static void printall (struct seq_file *seq, raid5_conf_t *conf) +static void printall(struct seq_file *seq, raid5_conf_t *conf) { struct stripe_head *sh; struct hlist_node *hn; @@ -4326,7 +4536,7 @@ static void printall (struct seq_file *seq, raid5_conf_t *conf) } #endif -static void status (struct seq_file *seq, mddev_t *mddev) +static void status(struct seq_file *seq, mddev_t *mddev) { raid5_conf_t *conf = (raid5_conf_t *) mddev->private; int i; @@ -4403,6 +4613,14 @@ static int raid5_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove non-faulty devices if recovery + * isn't possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->degraded <= conf->max_degraded) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { @@ -4420,35 +4638,41 @@ abort: static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { raid5_conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int disk; struct disk_info *p; + int first = 0; + int last = conf->raid_disks - 1; if (mddev->degraded > conf->max_degraded) /* no point adding a device */ - return 0; + return -EINVAL; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; /* * find the disk ... but prefer rdev->saved_raid_disk * if possible. */ if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && conf->disks[rdev->saved_raid_disk].rdev == NULL) disk = rdev->saved_raid_disk; else - disk = 0; - for ( ; disk < conf->raid_disks; disk++) + disk = first; + for ( ; disk <= last ; disk++) if ((p=conf->disks + disk)->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; - found = 1; + err = 0; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); break; } print_raid5_conf(conf); - return found; + return err; } static int raid5_resize(mddev_t *mddev, sector_t sectors) @@ -4463,14 +4687,15 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) raid5_conf_t *conf = mddev_to_conf(mddev); sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1; - set_capacity(mddev->gendisk, mddev->array_size << 1); + mddev->array_sectors = sectors * (mddev->raid_disks + - conf->max_degraded); + set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; - if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { - mddev->recovery_cp = mddev->size << 1; + if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { + mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } - mddev->size = sectors /2; + mddev->dev_sectors = sectors; mddev->resync_max_sectors = sectors; return 0; } @@ -4486,6 +4711,9 @@ static int raid5_check_reshape(mddev_t *mddev) return -EINVAL; /* Cannot shrink array or change level yet */ if (mddev->delta_disks == 0) return 0; /* nothing to do */ + if (mddev->bitmap) + /* Cannot grow a bitmap yet */ + return -EBUSY; /* Can only proceed if there are plenty of stripe_heads. 
 * We need a minimum of one full stripe, and for sensible progress
@@ -4516,7 +4744,6 @@ static int raid5_start_reshape(mddev_t *mddev)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	mdk_rdev_t *rdev;
-	struct list_head *rtmp;
 	int spares = 0;
 	int added_devices = 0;
 	unsigned long flags;
@@ -4524,7 +4751,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		return -EBUSY;

-	ITERATE_RDEV(mddev, rdev, rtmp)
+	list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk < 0 &&
 		    !test_bit(Faulty, &rdev->flags))
 			spares++;
@@ -4546,10 +4773,10 @@ static int raid5_start_reshape(mddev_t *mddev)
 	/* Add some new drives, as many as will fit.
 	 * We know there are enough to make the newly sized array work.
 	 */
-	ITERATE_RDEV(mddev, rdev, rtmp)
+	list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk < 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
-			if (raid5_add_disk(mddev, rdev)) {
+			if (raid5_add_disk(mddev, rdev) == 0) {
 				char nm[20];
 				set_bit(In_sync, &rdev->flags);
 				added_devices++;
@@ -4597,15 +4824,16 @@ static void end_reshape(raid5_conf_t *conf)
 	struct block_device *bdev;

 	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
-		conf->mddev->array_size = conf->mddev->size *
+		conf->mddev->array_sectors = conf->mddev->dev_sectors *
 			(conf->raid_disks - conf->max_degraded);
-		set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
+		set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
 		conf->mddev->changed = 1;

 		bdev = bdget_disk(conf->mddev->gendisk, 0);
 		if (bdev) {
 			mutex_lock(&bdev->bd_inode->i_mutex);
-			i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10);
+			i_size_write(bdev->bd_inode,
+				     (loff_t)conf->mddev->array_sectors << 9);
 			mutex_unlock(&bdev->bd_inode->i_mutex);
 			bdput(bdev);
 		}
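
Note the unit change running through end_reshape above: the old array_size was kept in KiB, so its byte size was array_size << 10, while the new array_sectors is kept in 512-byte sectors, so the byte size becomes array_sectors << 9. A tiny worked example of the published capacity; the geometry here is hypothetical:

	#include <assert.h>

	int main(void)
	{
		unsigned long long dev_sectors = 1 << 21;	/* 1GiB per device, in 512B sectors */
		int raid_disks = 4, max_degraded = 1;		/* raid5 across four disks */

		/* usable capacity counts data disks only, still in sectors */
		unsigned long long array_sectors =
			dev_sectors * (raid_disks - max_degraded);

		/* byte size published via i_size_write(): sectors << 9 */
		assert((array_sectors << 9) == 3ULL << 30);	/* 3GiB */
		return 0;
	}

@@ -4656,6 +4884,102 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 	}
 }

+
+static void *raid5_takeover_raid1(mddev_t *mddev)
+{
+	int chunksect;
+
+	if (mddev->raid_disks != 2 ||
+	    mddev->degraded > 1)
+		return ERR_PTR(-EINVAL);
+
+	/* Should check if there are write-behind devices? */
+
+	chunksect = 64*2; /* 64K by default */
+
+	/* The array must be an exact multiple of chunksize */
+	while (chunksect && (mddev->array_sectors & (chunksect-1)))
+		chunksect >>= 1;
+
+	if ((chunksect<<9) < STRIPE_SIZE)
+		/* array size does not allow a suitable chunk size */
+		return ERR_PTR(-EINVAL);
+
+	mddev->new_level = 5;
+	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
+	mddev->new_chunk = chunksect << 9;
+
+	return setup_conf(mddev);
+}
+
+
+static void *raid5_takeover(mddev_t *mddev)
+{
+	/* raid5 can take over:
+	 *  raid0 - if all devices are the same - make it a raid4 layout
+	 *  raid1 - if there are two drives.  We need to know the chunk size
+	 *  raid4 - trivial - just use a raid4 layout.
+	 *  raid6 - Providing it is a *_6 layout
+	 *
+	 * For now, just do raid1
+	 */
+
+	if (mddev->level == 1)
+		return raid5_takeover_raid1(mddev);
+
+	return ERR_PTR(-EINVAL);
+}
+
+
+static struct mdk_personality raid5_personality;
+
+static void *raid6_takeover(mddev_t *mddev)
+{
+	/* Currently can only take over a raid5.  We map the
+	 * personality to an equivalent raid6 personality
+	 * with the Q block at the end.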
+ */ + int new_layout; + + if (mddev->pers != &raid5_personality) + return ERR_PTR(-EINVAL); + if (mddev->degraded > 1) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks > 253) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks < 3) + return ERR_PTR(-EINVAL); + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; + break; + case ALGORITHM_LEFT_SYMMETRIC: + new_layout = ALGORITHM_LEFT_SYMMETRIC_6; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; + break; + case ALGORITHM_PARITY_0: + new_layout = ALGORITHM_PARITY_0_6; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 6; + mddev->new_layout = new_layout; + mddev->delta_disks = 1; + mddev->raid_disks += 1; + return setup_conf(mddev); +} + + static struct mdk_personality raid6_personality = { .name = "raid6", @@ -4676,6 +5000,7 @@ static struct mdk_personality raid6_personality = .start_reshape = raid5_start_reshape, #endif .quiesce = raid5_quiesce, + .takeover = raid6_takeover, }; static struct mdk_personality raid5_personality = { @@ -4697,6 +5022,7 @@ static struct mdk_personality raid5_personality = .start_reshape = raid5_start_reshape, #endif .quiesce = raid5_quiesce, + .takeover = raid5_takeover, }; static struct mdk_personality raid4_personality =