* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+/*
+ * BITMAP UNPLUGGING:
+ *
+ * The sequencing for updating the bitmap reliably is a little
+ * subtle (and I got it wrong the first time) so it deserves some
+ * explanation.
+ *
+ * We group bitmap updates into batches. Each batch has a number.
+ * We may write out several batches at once, but that isn't very important.
+ * conf->bm_write is the number of the last batch successfully written.
+ * conf->bm_flush is the number of the last batch that was closed to
+ * new additions.
+ * When we discover that we will need to write to any block in a stripe
+ * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
+ * the number of the batch it will be in. This is bm_flush+1.
+ * When we are ready to do a write, if that batch hasn't been written yet,
+ * we plug the array and queue the stripe for later.
+ * When an unplug happens, we increment bm_flush, thus closing the current
+ * batch.
+ * When we notice that bm_flush > bm_write, we write out all pending updates
+ * to the bitmap, and advance bm_write to where bm_flush was.
+ * This may occasionally write a bit out twice, but is sure never to
+ * miss any bits.
+ */
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/highmem.h>
BUG_ON(!list_empty(&sh->lru));
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
- if (test_bit(STRIPE_DELAYED, &sh->state))
+ if (test_bit(STRIPE_DELAYED, &sh->state)) {
list_add_tail(&sh->lru, &conf->delayed_list);
- else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
- conf->seq_write == sh->bm_seq)
+ blk_plug_device(conf->mddev->queue);
+ } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ sh->bm_seq - conf->seq_write > 0) {
list_add_tail(&sh->lru, &conf->bitmap_list);
- else {
+ blk_plug_device(conf->mddev->queue);
+ } else {
clear_bit(STRIPE_BIT_DELAY, &sh->state);
list_add_tail(&sh->lru, &conf->handle_list);
}
if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
list_add_tail(&sh->lru, &conf->inactive_list);
wake_up(&conf->wait_for_stripe);
+ if (conf->retry_read_aligned)
+ md_wakeup_thread(conf->mddev->thread);
}
}
}
< (conf->max_nr_stripes *3/4)
|| !conf->inactive_blocked),
conf->device_lock,
- unplug_slaves(conf->mddev)
+ raid5_unplug_device(conf->mddev->queue)
);
conf->inactive_blocked = 0;
} else
} else {
if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
- if (list_empty(&sh->lru))
+ if (list_empty(&sh->lru) &&
+ !test_bit(STRIPE_EXPANDING, &sh->state))
BUG();
list_del_init(&sh->lru);
}
static int grow_stripes(raid5_conf_t *conf, int num)
{
- kmem_cache_t *sc;
+ struct kmem_cache *sc;
int devs = conf->raid_disks;
sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
LIST_HEAD(newstripes);
struct disk_info *ndisks;
int err = 0;
- kmem_cache_t *sc;
+ struct kmem_cache *sc;
int i;
if (newsize <= conf->pool_size)
return 0; /* never bother to shrink */
+ md_allow_write(conf->mddev);
+
/* Step 1 */
sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
raid5_conf_t *conf = sh->raid_conf;
int disks = sh->disks, i;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+ char b[BDEVNAME_SIZE];
+ mdk_rdev_t *rdev;
if (bi->bi_size)
return 1;
}
if (uptodate) {
-#if 0
- struct bio *bio;
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- /* we can return a buffer if we bypassed the cache or
- * if the top buffer is not in highmem. If there are
- * multiple buffers, leave the extra work to
- * handle_stripe
- */
- buffer = sh->bh_read[i];
- if (buffer &&
- (!PageHighMem(buffer->b_page)
- || buffer->b_page == bh->b_page )
- ) {
- sh->bh_read[i] = buffer->b_reqnext;
- buffer->b_reqnext = NULL;
- } else
- buffer = NULL;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (sh->bh_page[i]==bh->b_page)
- set_buffer_uptodate(bh);
- if (buffer) {
- if (buffer->b_page != bh->b_page)
- memcpy(buffer->b_data, bh->b_data, bh->b_size);
- buffer->b_end_io(buffer, 1);
- }
-#else
set_bit(R5_UPTODATE, &sh->dev[i].flags);
-#endif
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
- printk(KERN_INFO "raid5: read error corrected!!\n");
+ rdev = conf->disks[i].rdev;
+ printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
+ mdname(conf->mddev), STRIPE_SECTORS,
+ (unsigned long long)sh->sector + rdev->data_offset,
+ bdevname(rdev->bdev, b));
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
}
if (atomic_read(&conf->disks[i].rdev->read_errors))
atomic_set(&conf->disks[i].rdev->read_errors, 0);
} else {
+ const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
int retry = 0;
+ rdev = conf->disks[i].rdev;
+
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
- atomic_inc(&conf->disks[i].rdev->read_errors);
+ atomic_inc(&rdev->read_errors);
if (conf->mddev->degraded)
- printk(KERN_WARNING "raid5: read error not correctable.\n");
+ printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)sh->sector + rdev->data_offset,
+ bdn);
else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
/* Oh, no!!! */
- printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
- else if (atomic_read(&conf->disks[i].rdev->read_errors)
+ printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
+ mdname(conf->mddev),
+ (unsigned long long)sh->sector + rdev->data_offset,
+ bdn);
+ else if (atomic_read(&rdev->read_errors)
> conf->max_nr_stripes)
printk(KERN_WARNING
- "raid5: Too many read errors, failing device.\n");
+ "raid5:%s: Too many read errors, failing device %s.\n",
+ mdname(conf->mddev), bdn);
else
retry = 1;
if (retry)
else {
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
- md_error(conf->mddev, conf->disks[i].rdev);
+ md_error(conf->mddev, rdev);
}
}
rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-#if 0
- /* must restore b_page before unlocking buffer... */
- if (sh->bh_page[i] != bh->b_page) {
- bh->b_page = sh->bh_page[i];
- bh->b_data = page_address(bh->b_page);
- clear_buffer_uptodate(bh);
- }
-#endif
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
struct stripe_head *sh = bi->bi_private;
raid5_conf_t *conf = sh->raid_conf;
int disks = sh->disks, i;
- unsigned long flags;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
if (bi->bi_size)
return 0;
}
- spin_lock_irqsave(&conf->device_lock, flags);
if (!uptodate)
md_error(conf->mddev, conf->disks[i].rdev);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
- __release_stripe(conf, sh);
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ release_stripe(sh);
return 0;
}
PRINTK("raid5: error called\n");
if (!test_bit(Faulty, &rdev->flags)) {
- mddev->sb_dirty = 1;
- if (test_bit(In_sync, &rdev->flags)) {
- conf->working_disks--;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
- conf->failed_disks++;
- clear_bit(In_sync, &rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery was running, make sure it aborts.
*/
printk (KERN_ALERT
"raid5: Disk failure on %s, disabling device."
" Operation continuing on %d devices\n",
- bdevname(rdev->bdev,b), conf->working_disks);
+ bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
}
}
static sector_t compute_blocknr(struct stripe_head *sh, int i)
{
raid5_conf_t *conf = sh->raid_conf;
- int raid_disks = sh->disks, data_disks = raid_disks - 1;
+ int raid_disks = sh->disks;
+ int data_disks = raid_disks - conf->max_degraded;
sector_t new_sector = sh->sector, check;
int sectors_per_chunk = conf->chunk_size >> 9;
sector_t stripe;
}
break;
case 6:
- data_disks = raid_disks - 2;
if (i == raid6_next_disk(sh->pd_idx, raid_disks))
return 0; /* It is the Q disk */
switch (conf->algorithm) {
static void compute_parity6(struct stripe_head *sh, int method)
{
raid6_conf_t *conf = sh->raid_conf;
- int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
+ int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
struct bio *chosen;
/**** FIX THIS: This could be very bad if disks is close to 256 ****/
void *ptrs[disks];
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
- if (sh->dev[i].written) BUG();
+ BUG_ON(sh->dev[i].written);
sh->dev[i].written = chosen;
}
break;
/* Compute one missing block */
static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
{
- raid6_conf_t *conf = sh->raid_conf;
- int i, count, disks = conf->raid_disks;
+ int i, count, disks = sh->disks;
void *ptr[MAX_XOR_BLOCKS], *p;
int pd_idx = sh->pd_idx;
int qd_idx = raid6_next_disk(pd_idx, disks);
/* Compute two missing blocks */
static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
{
- raid6_conf_t *conf = sh->raid_conf;
- int i, count, disks = conf->raid_disks;
+ int i, count, disks = sh->disks;
int pd_idx = sh->pd_idx;
int qd_idx = raid6_next_disk(pd_idx, disks);
int d0_idx = raid6_next_disk(qd_idx, disks);
(unsigned long long)sh->sector, dd_idx);
if (conf->mddev->bitmap && firstwrite) {
- sh->bm_seq = conf->seq_write;
bitmap_startwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS, 0);
+ sh->bm_seq = conf->seq_flush+1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
{
int sectors_per_chunk = conf->chunk_size >> 9;
- sector_t x = stripe;
int pd_idx, dd_idx;
- int chunk_offset = sector_div(x, sectors_per_chunk);
- stripe = x;
- raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
- + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
+ int chunk_offset = sector_div(stripe, sectors_per_chunk);
+
+ raid5_compute_sector(stripe * (disks - conf->max_degraded)
+ *sectors_per_chunk + chunk_offset,
+ disks, disks - conf->max_degraded,
+ &dd_idx, &pd_idx, conf);
return pd_idx;
}
} else if (test_bit(R5_Insync, &dev->flags)) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
-#if 0
- /* if I am just reading this block and we don't have
- a failed drive, or any pending writes then sidestep the cache */
- if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
- ! syncing && !failed && !to_write) {
- sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
- sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
- }
-#endif
locked++;
PRINTK("Reading block %d (sync=%d)\n",
i, syncing);
dev = &sh->dev[i];
if ((dev->towrite || i == sh->pd_idx) &&
(!test_bit(R5_LOCKED, &dev->flags)
-#if 0
-|| sh->bh_page[i]!=bh->b_page
-#endif
) &&
!test_bit(R5_UPTODATE, &dev->flags)) {
if (test_bit(R5_Insync, &dev->flags)
/* Would I have to read this buffer for reconstruct_write */
if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
(!test_bit(R5_LOCKED, &dev->flags)
-#if 0
-|| sh->bh_page[i] != bh->b_page
-#endif
) &&
!test_bit(R5_UPTODATE, &dev->flags)) {
if (test_bit(R5_Insync, &dev->flags)) rcw++;
return_bi = bi->bi_next;
bi->bi_next = NULL;
bi->bi_size = 0;
- bi->bi_end_io(bi, bytes, 0);
+ bi->bi_end_io(bi, bytes,
+ test_bit(BIO_UPTODATE, &bi->bi_flags)
+ ? 0 : -EIO);
}
for (i=disks; i-- ;) {
int rw;
struct bio *bi;
mdk_rdev_t *rdev;
if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
- rw = 1;
+ rw = WRITE;
else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
- rw = 0;
+ rw = READ;
else
continue;
bi = &sh->dev[i].req;
bi->bi_rw = rw;
- if (rw)
+ if (rw == WRITE)
bi->bi_end_io = raid5_end_write_request;
else
bi->bi_end_io = raid5_end_read_request;
atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
generic_make_request(bi);
} else {
- if (rw == 1)
+ if (rw == WRITE)
set_bit(STRIPE_DEGRADED, &sh->state);
PRINTK("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
{
raid6_conf_t *conf = sh->raid_conf;
- int disks = conf->raid_disks;
+ int disks = sh->disks;
struct bio *return_bi= NULL;
struct bio *bi;
int i;
- int syncing;
+ int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
int non_overwrite = 0;
int failed_num[2] = {0, 0};
clear_bit(STRIPE_DELAYED, &sh->state);
syncing = test_bit(STRIPE_SYNCING, &sh->state);
+ expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
/* Now to look around and see what can be done */
rcu_read_lock();
* parity, or to satisfy requests
* or to load a block that is being partially written.
*/
- if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
+ if (to_read || non_overwrite || (to_write && failed) ||
+ (syncing && (uptodate < disks)) || expanding) {
for (i=disks; i--;) {
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
(dev->toread ||
(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
syncing ||
+ expanding ||
(failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
(failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
)
} else if (test_bit(R5_Insync, &dev->flags)) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
-#if 0
- /* if I am just reading this block and we don't have
- a failed drive, or any pending writes then sidestep the cache */
- if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
- ! syncing && !failed && !to_write) {
- sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
- sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
- }
-#endif
locked++;
PRINTK("Reading block %d (sync=%d)\n",
i, syncing);
if (!test_bit(R5_OVERWRITE, &dev->flags)
&& i != pd_idx && i != qd_idx
&& (!test_bit(R5_LOCKED, &dev->flags)
-#if 0
- || sh->bh_page[i] != bh->b_page
-#endif
) &&
!test_bit(R5_UPTODATE, &dev->flags)) {
if (test_bit(R5_Insync, &dev->flags)) rcw++;
}
}
}
+
+ if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+ /* Need to write out all blocks after computing P&Q */
+ sh->disks = conf->raid_disks;
+ sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
+ conf->raid_disks);
+ compute_parity6(sh, RECONSTRUCT_WRITE);
+ for (i = conf->raid_disks ; i-- ; ) {
+ set_bit(R5_LOCKED, &sh->dev[i].flags);
+ locked++;
+ set_bit(R5_Wantwrite, &sh->dev[i].flags);
+ }
+ clear_bit(STRIPE_EXPANDING, &sh->state);
+ } else if (expanded) {
+ clear_bit(STRIPE_EXPAND_READY, &sh->state);
+ atomic_dec(&conf->reshape_stripes);
+ wake_up(&conf->wait_for_overlap);
+ md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+ }
+
+ if (expanding && locked == 0) {
+ /* We have read all the blocks in this stripe and now we need to
+ * copy some of them into a target stripe for expand.
+ */
+ clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ for (i = 0; i < sh->disks ; i++)
+ if (i != pd_idx && i != qd_idx) {
+ int dd_idx2, pd_idx2, j;
+ struct stripe_head *sh2;
+
+ sector_t bn = compute_blocknr(sh, i);
+ sector_t s = raid5_compute_sector(
+ bn, conf->raid_disks,
+ conf->raid_disks - conf->max_degraded,
+ &dd_idx2, &pd_idx2, conf);
+ sh2 = get_active_stripe(conf, s,
+ conf->raid_disks,
+ pd_idx2, 1);
+ if (sh2 == NULL)
+ /* so for only the early blocks of
+ * this stripe have been requests.
+ * When later blocks get requests, we
+ * will try again
+ */
+ continue;
+ if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
+ test_bit(R5_Expanded,
+ &sh2->dev[dd_idx2].flags)) {
+ /* must have already done this block */
+ release_stripe(sh2);
+ continue;
+ }
+ memcpy(page_address(sh2->dev[dd_idx2].page),
+ page_address(sh->dev[i].page),
+ STRIPE_SIZE);
+ set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags);
+ set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags);
+ for (j = 0 ; j < conf->raid_disks ; j++)
+ if (j != sh2->pd_idx &&
+ j != raid6_next_disk(sh2->pd_idx,
+ sh2->disks) &&
+ !test_bit(R5_Expanded,
+ &sh2->dev[j].flags))
+ break;
+ if (j == conf->raid_disks) {
+ set_bit(STRIPE_EXPAND_READY,
+ &sh2->state);
+ set_bit(STRIPE_HANDLE, &sh2->state);
+ }
+ release_stripe(sh2);
+ }
+ }
+
spin_unlock(&sh->lock);
while ((bi=return_bi)) {
return_bi = bi->bi_next;
bi->bi_next = NULL;
bi->bi_size = 0;
- bi->bi_end_io(bi, bytes, 0);
+ bi->bi_end_io(bi, bytes,
+ test_bit(BIO_UPTODATE, &bi->bi_flags)
+ ? 0 : -EIO);
}
for (i=disks; i-- ;) {
int rw;
struct bio *bi;
mdk_rdev_t *rdev;
if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
- rw = 1;
+ rw = WRITE;
else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
- rw = 0;
+ rw = READ;
else
continue;
bi = &sh->dev[i].req;
bi->bi_rw = rw;
- if (rw)
+ if (rw == WRITE)
bi->bi_end_io = raid5_end_write_request;
else
bi->bi_end_io = raid5_end_read_request;
rcu_read_unlock();
if (rdev) {
- if (syncing)
+ if (syncing || expanding || expanded)
md_sync_acct(rdev->bdev, STRIPE_SECTORS);
bi->bi_bdev = rdev->bdev;
atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
generic_make_request(bi);
} else {
- if (rw == 1)
+ if (rw == WRITE)
set_bit(STRIPE_DEGRADED, &sh->state);
PRINTK("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
return ret;
}
-static inline void raid5_plug_device(raid5_conf_t *conf)
+static int raid5_congested(void *data, int bits)
{
- spin_lock_irq(&conf->device_lock);
- blk_plug_device(conf->mddev->queue);
- spin_unlock_irq(&conf->device_lock);
+ mddev_t *mddev = data;
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+
+ /* No difference between reads and writes. Just check
+ * how busy the stripe_cache is
+ */
+ if (conf->inactive_blocked)
+ return 1;
+ if (conf->quiesce)
+ return 1;
+ if (list_empty_careful(&conf->inactive_list))
+ return 1;
+
+ return 0;
+}
+
+/* We want read requests to align with chunks where possible,
+ * but write requests don't need to.
+ */
+static int raid5_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec)
+{
+ mddev_t *mddev = q->queuedata;
+ sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+ int max;
+ unsigned int chunk_sectors = mddev->chunk_size >> 9;
+ unsigned int bio_sectors = bio->bi_size >> 9;
+
+ if (bio_data_dir(bio) == WRITE)
+ return biovec->bv_len; /* always allow writes to be mergeable */
+
+ max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
+ if (max < 0) max = 0;
+ if (max <= biovec->bv_len && bio_sectors == 0)
+ return biovec->bv_len;
+ else
+ return max;
+}
+
+
+static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
+{
+ sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+ unsigned int chunk_sectors = mddev->chunk_size >> 9;
+ unsigned int bio_sectors = bio->bi_size >> 9;
+
+ return chunk_sectors >=
+ ((sector & (chunk_sectors - 1)) + bio_sectors);
}
+/*
+ * add bio to the retry LIFO ( in O(1) ... we are in interrupt )
+ * later sampled by raid5d.
+ */
+static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+
+ bi->bi_next = conf->retry_read_aligned_list;
+ conf->retry_read_aligned_list = bi;
+
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ md_wakeup_thread(conf->mddev->thread);
+}
+
+
+static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
+{
+ struct bio *bi;
+
+ bi = conf->retry_read_aligned;
+ if (bi) {
+ conf->retry_read_aligned = NULL;
+ return bi;
+ }
+ bi = conf->retry_read_aligned_list;
+ if(bi) {
+ conf->retry_read_aligned_list = bi->bi_next;
+ bi->bi_next = NULL;
+ bi->bi_phys_segments = 1; /* biased count of active stripes */
+ bi->bi_hw_segments = 0; /* count of processed stripes */
+ }
+
+ return bi;
+}
+
+
+/*
+ * The "raid5_align_endio" should check if the read succeeded and if it
+ * did, call bio_endio on the original bio (having bio_put the new bio
+ * first).
+ * If the read failed..
+ */
+static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
+{
+ struct bio* raid_bi = bi->bi_private;
+ mddev_t *mddev;
+ raid5_conf_t *conf;
+ int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+ mdk_rdev_t *rdev;
+
+ if (bi->bi_size)
+ return 1;
+ bio_put(bi);
+
+ mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
+ conf = mddev_to_conf(mddev);
+ rdev = (void*)raid_bi->bi_next;
+ raid_bi->bi_next = NULL;
+
+ rdev_dec_pending(rdev, conf->mddev);
+
+ if (!error && uptodate) {
+ bio_endio(raid_bi, bytes, 0);
+ if (atomic_dec_and_test(&conf->active_aligned_reads))
+ wake_up(&conf->wait_for_stripe);
+ return 0;
+ }
+
+
+ PRINTK("raid5_align_endio : io error...handing IO for a retry\n");
+
+ add_bio_to_retry(raid_bi, conf);
+ return 0;
+}
+
+static int bio_fits_rdev(struct bio *bi)
+{
+ request_queue_t *q = bdev_get_queue(bi->bi_bdev);
+
+ if ((bi->bi_size>>9) > q->max_sectors)
+ return 0;
+ blk_recount_segments(q, bi);
+ if (bi->bi_phys_segments > q->max_phys_segments ||
+ bi->bi_hw_segments > q->max_hw_segments)
+ return 0;
+
+ if (q->merge_bvec_fn)
+ /* it's too hard to apply the merge_bvec_fn at this stage,
+ * just just give up
+ */
+ return 0;
+
+ return 1;
+}
+
+
+static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
+{
+ mddev_t *mddev = q->queuedata;
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+ const unsigned int raid_disks = conf->raid_disks;
+ const unsigned int data_disks = raid_disks - conf->max_degraded;
+ unsigned int dd_idx, pd_idx;
+ struct bio* align_bi;
+ mdk_rdev_t *rdev;
+
+ if (!in_chunk_boundary(mddev, raid_bio)) {
+ PRINTK("chunk_aligned_read : non aligned\n");
+ return 0;
+ }
+ /*
+ * use bio_clone to make a copy of the bio
+ */
+ align_bi = bio_clone(raid_bio, GFP_NOIO);
+ if (!align_bi)
+ return 0;
+ /*
+ * set bi_end_io to a new function, and set bi_private to the
+ * original bio.
+ */
+ align_bi->bi_end_io = raid5_align_endio;
+ align_bi->bi_private = raid_bio;
+ /*
+ * compute position
+ */
+ align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector,
+ raid_disks,
+ data_disks,
+ &dd_idx,
+ &pd_idx,
+ conf);
+
+ rcu_read_lock();
+ rdev = rcu_dereference(conf->disks[dd_idx].rdev);
+ if (rdev && test_bit(In_sync, &rdev->flags)) {
+ atomic_inc(&rdev->nr_pending);
+ rcu_read_unlock();
+ raid_bio->bi_next = (void*)rdev;
+ align_bi->bi_bdev = rdev->bdev;
+ align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
+ align_bi->bi_sector += rdev->data_offset;
+
+ if (!bio_fits_rdev(align_bi)) {
+ /* too big in some way */
+ bio_put(align_bi);
+ rdev_dec_pending(rdev, mddev);
+ return 0;
+ }
+
+ spin_lock_irq(&conf->device_lock);
+ wait_event_lock_irq(conf->wait_for_stripe,
+ conf->quiesce == 0,
+ conf->device_lock, /* nothing */);
+ atomic_inc(&conf->active_aligned_reads);
+ spin_unlock_irq(&conf->device_lock);
+
+ generic_make_request(align_bi);
+ return 1;
+ } else {
+ rcu_read_unlock();
+ bio_put(align_bi);
+ return 0;
+ }
+}
+
+
static int make_request(request_queue_t *q, struct bio * bi)
{
mddev_t *mddev = q->queuedata;
disk_stat_inc(mddev->gendisk, ios[rw]);
disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
+ if (rw == READ &&
+ mddev->reshape_position == MaxSector &&
+ chunk_aligned_read(q,bi))
+ return 0;
+
logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bi->bi_sector + (bi->bi_size>>9);
bi->bi_next = NULL;
goto retry;
}
finish_wait(&conf->wait_for_overlap, &w);
- raid5_plug_device(conf);
handle_stripe(sh, NULL);
release_stripe(sh);
} else {
if ( rw == WRITE )
md_write_end(mddev);
bi->bi_size = 0;
- bi->bi_end_io(bi, bytes, 0);
+ bi->bi_end_io(bi, bytes,
+ test_bit(BIO_UPTODATE, &bi->bi_flags)
+ ? 0 : -EIO);
}
return 0;
}
struct stripe_head *sh;
int pd_idx;
sector_t first_sector, last_sector;
- int raid_disks;
- int data_disks;
+ int raid_disks = conf->previous_raid_disks;
+ int data_disks = raid_disks - conf->max_degraded;
+ int new_data_disks = conf->raid_disks - conf->max_degraded;
int i;
int dd_idx;
sector_t writepos, safepos, gap;
conf->expand_progress != 0) {
/* restarting in the middle, skip the initial sectors */
sector_nr = conf->expand_progress;
- sector_div(sector_nr, conf->raid_disks-1);
+ sector_div(sector_nr, new_data_disks);
*skipped = 1;
return sector_nr;
}
* to after where expand_lo old_maps to
*/
writepos = conf->expand_progress +
- conf->chunk_size/512*(conf->raid_disks-1);
- sector_div(writepos, conf->raid_disks-1);
+ conf->chunk_size/512*(new_data_disks);
+ sector_div(writepos, new_data_disks);
safepos = conf->expand_lo;
- sector_div(safepos, conf->previous_raid_disks-1);
+ sector_div(safepos, data_disks);
gap = conf->expand_progress - conf->expand_lo;
if (writepos >= safepos ||
- gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
+ gap > (new_data_disks)*3000*2 /*3Meg*/) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes)==0);
mddev->reshape_position = conf->expand_progress;
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
- wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
+ wait_event(mddev->sb_wait, mddev->flags == 0 ||
kthread_should_stop());
spin_lock_irq(&conf->device_lock);
conf->expand_lo = mddev->reshape_position;
sector_t s;
if (j == sh->pd_idx)
continue;
+ if (conf->level == 6 &&
+ j == raid6_next_disk(sh->pd_idx, sh->disks))
+ continue;
s = compute_blocknr(sh, j);
if (s < (mddev->array_size<<1)) {
skipped = 1;
* The source stripes are determined by mapping the first and last
* block on the destination stripes.
*/
- raid_disks = conf->previous_raid_disks;
- data_disks = raid_disks - 1;
first_sector =
- raid5_compute_sector(sector_nr*(conf->raid_disks-1),
+ raid5_compute_sector(sector_nr*(new_data_disks),
raid_disks, data_disks,
&dd_idx, &pd_idx, conf);
last_sector =
raid5_compute_sector((sector_nr+conf->chunk_size/512)
- *(conf->raid_disks-1) -1,
+ *(new_data_disks) -1,
raid_disks, data_disks,
&dd_idx, &pd_idx, conf);
if (last_sector >= (mddev->size<<1))
last_sector = (mddev->size<<1)-1;
while (first_sector <= last_sector) {
- pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
+ pd_idx = stripe_to_pdidx(first_sector, conf,
+ conf->previous_raid_disks);
sh = get_active_stripe(conf, first_sector,
conf->previous_raid_disks, pd_idx, 0);
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
struct stripe_head *sh;
int pd_idx;
int raid_disks = conf->raid_disks;
- int data_disks = raid_disks - conf->max_degraded;
sector_t max_sector = mddev->size << 1;
int sync_blocks;
int still_degraded = 0;
* to resync, then assert that we are finished, because there is
* nothing we can do.
*/
- if (mddev->degraded >= (data_disks - raid_disks) &&
+ if (mddev->degraded >= conf->max_degraded &&
test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
sector_t rv = (mddev->size << 1) - sector_nr;
*skipped = 1;
return STRIPE_SECTORS;
}
+static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
+{
+ /* We may not be able to submit a whole bio at once as there
+ * may not be enough stripe_heads available.
+ * We cannot pre-allocate enough stripe_heads as we may need
+ * more than exist in the cache (if we allow ever large chunks).
+ * So we do one stripe head at a time and record in
+ * ->bi_hw_segments how many have been done.
+ *
+ * We *know* that this entire raid_bio is in one chunk, so
+ * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
+ */
+ struct stripe_head *sh;
+ int dd_idx, pd_idx;
+ sector_t sector, logical_sector, last_sector;
+ int scnt = 0;
+ int remaining;
+ int handled = 0;
+
+ logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ sector = raid5_compute_sector( logical_sector,
+ conf->raid_disks,
+ conf->raid_disks - conf->max_degraded,
+ &dd_idx,
+ &pd_idx,
+ conf);
+ last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+
+ for (; logical_sector < last_sector;
+ logical_sector += STRIPE_SECTORS,
+ sector += STRIPE_SECTORS,
+ scnt++) {
+
+ if (scnt < raid_bio->bi_hw_segments)
+ /* already done this stripe */
+ continue;
+
+ sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
+
+ if (!sh) {
+ /* failed to get a stripe - must wait */
+ raid_bio->bi_hw_segments = scnt;
+ conf->retry_read_aligned = raid_bio;
+ return handled;
+ }
+
+ set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
+ if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
+ release_stripe(sh);
+ raid_bio->bi_hw_segments = scnt;
+ conf->retry_read_aligned = raid_bio;
+ return handled;
+ }
+
+ handle_stripe(sh, NULL);
+ release_stripe(sh);
+ handled++;
+ }
+ spin_lock_irq(&conf->device_lock);
+ remaining = --raid_bio->bi_phys_segments;
+ spin_unlock_irq(&conf->device_lock);
+ if (remaining == 0) {
+ int bytes = raid_bio->bi_size;
+
+ raid_bio->bi_size = 0;
+ raid_bio->bi_end_io(raid_bio, bytes,
+ test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
+ ? 0 : -EIO);
+ }
+ if (atomic_dec_and_test(&conf->active_aligned_reads))
+ wake_up(&conf->wait_for_stripe);
+ return handled;
+}
+
+
+
/*
* This is our raid5 kernel thread.
*
spin_lock_irq(&conf->device_lock);
while (1) {
struct list_head *first;
+ struct bio *bio;
- if (conf->seq_flush - conf->seq_write > 0) {
+ if (conf->seq_flush != conf->seq_write) {
int seq = conf->seq_flush;
spin_unlock_irq(&conf->device_lock);
bitmap_unplug(mddev->bitmap);
!list_empty(&conf->delayed_list))
raid5_activate_delayed(conf);
+ while ((bio = remove_bio_from_retry(conf))) {
+ int ok;
+ spin_unlock_irq(&conf->device_lock);
+ ok = retry_aligned_read(conf, bio);
+ spin_lock_irq(&conf->device_lock);
+ if (!ok)
+ break;
+ handled++;
+ }
+
if (list_empty(&conf->handle_list))
break;
else
break;
}
+ md_allow_write(mddev);
while (new > conf->max_nr_stripes) {
if (grow_one_stripe(conf))
conf->max_nr_stripes++;
mdk_rdev_t *rdev;
struct disk_info *disk;
struct list_head *tmp;
+ int working_disks = 0;
if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
*/
sector_t here_new, here_old;
int old_disks;
+ int max_degraded = (mddev->level == 5 ? 1 : 2);
if (mddev->new_level != mddev->level ||
mddev->new_layout != mddev->layout ||
mddev->new_chunk != mddev->chunk_size) {
- printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n",
+ printk(KERN_ERR "raid5: %s: unsupported reshape "
+ "required - aborting.\n",
mdname(mddev));
return -EINVAL;
}
if (mddev->delta_disks <= 0) {
- printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n",
+ printk(KERN_ERR "raid5: %s: unsupported reshape "
+ "(reduce disks) required - aborting.\n",
mdname(mddev));
return -EINVAL;
}
old_disks = mddev->raid_disks - mddev->delta_disks;
/* reshape_position must be on a new-stripe boundary, and one
- * further up in new geometry must map after here in old geometry.
+ * further up in new geometry must map after here in old
+ * geometry.
*/
here_new = mddev->reshape_position;
- if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) {
- printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n");
+ if (sector_div(here_new, (mddev->chunk_size>>9)*
+ (mddev->raid_disks - max_degraded))) {
+ printk(KERN_ERR "raid5: reshape_position not "
+ "on a stripe boundary\n");
return -EINVAL;
}
/* here_new is the stripe we will write to */
here_old = mddev->reshape_position;
- sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1));
- /* here_old is the first stripe that we might need to read from */
+ sector_div(here_old, (mddev->chunk_size>>9)*
+ (old_disks-max_degraded));
+ /* here_old is the first stripe that we might need to read
+ * from */
if (here_new >= here_old) {
/* Reading from the same stripe as writing to - bad */
- printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n");
+ printk(KERN_ERR "raid5: reshape_position too early for "
+ "auto-recovery - aborting.\n");
return -EINVAL;
}
printk(KERN_INFO "raid5: reshape will continue\n");
INIT_LIST_HEAD(&conf->inactive_list);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
+ atomic_set(&conf->active_aligned_reads, 0);
PRINTK("raid5: run(%s) called.\n", mdname(mddev));
printk(KERN_INFO "raid5: device %s operational as raid"
" disk %d\n", bdevname(rdev->bdev,b),
raid_disk);
- conf->working_disks++;
+ working_disks++;
}
}
/*
* 0 for a fully functional array, 1 or 2 for a degraded array.
*/
- mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
+ mddev->degraded = conf->raid_disks - working_disks;
conf->mddev = mddev;
conf->chunk_size = mddev->chunk_size;
conf->level = mddev->level;
if (mddev->degraded > conf->max_degraded) {
printk(KERN_ERR "raid5: not enough operational devices for %s"
" (%d/%d failed)\n",
- mdname(mddev), conf->failed_disks, conf->raid_disks);
+ mdname(mddev), mddev->degraded, conf->raid_disks);
goto abort;
}
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"%s_reshape");
- /* FIXME if md_register_thread fails?? */
- md_wakeup_thread(mddev->sync_thread);
-
}
/* read-ahead size must cover two whole stripes, which is
mddev->queue->unplug_fn = raid5_unplug_device;
mddev->queue->issue_flush_fn = raid5_issue_flush;
+ mddev->queue->backing_dev_info.congested_fn = raid5_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
+
mddev->array_size = mddev->size * (conf->previous_raid_disks -
conf->max_degraded);
+ blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
+
return 0;
abort:
if (conf) {
int i;
seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
- seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
+ seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++)
seq_printf (seq, "%s",
conf->disks[i].rdev &&
printk("(conf==NULL)\n");
return;
}
- printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
- conf->working_disks, conf->failed_disks);
+ printk(" --- rd:%d wd:%d\n", conf->raid_disks,
+ conf->raid_disks - conf->mddev->degraded);
for (i = 0; i < conf->raid_disks; i++) {
char b[BDEVNAME_SIZE];
tmp = conf->disks + i;
if (tmp->rdev
&& !test_bit(Faulty, &tmp->rdev->flags)
- && !test_bit(In_sync, &tmp->rdev->flags)) {
+ && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded--;
- conf->failed_disks--;
- conf->working_disks++;
- set_bit(In_sync, &tmp->rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
}
}
print_raid5_conf(conf);
if (err)
return err;
+ if (mddev->degraded > conf->max_degraded)
+ return -EINVAL;
/* looks like we might be able to manage this */
return 0;
}
struct list_head *rtmp;
int spares = 0;
int added_devices = 0;
+ unsigned long flags;
- if (mddev->degraded ||
- test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
ITERATE_RDEV(mddev, rdev, rtmp)
!test_bit(Faulty, &rdev->flags))
spares++;
- if (spares < mddev->delta_disks-1)
+ if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
/* Not enough devices even to make a degraded array
* of that size
*/
if (raid5_add_disk(mddev, rdev)) {
char nm[20];
set_bit(In_sync, &rdev->flags);
- conf->working_disks++;
added_devices++;
rdev->recovery_offset = 0;
sprintf(nm, "rd%d", rdev->raid_disk);
break;
}
+ spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
mddev->raid_disks = conf->raid_disks;
mddev->reshape_position = 0;
- mddev->sb_dirty = 1;
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
struct block_device *bdev;
if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
- conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1);
+ conf->mddev->array_size = conf->mddev->size *
+ (conf->raid_disks - conf->max_degraded);
set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
conf->mddev->changed = 1;
bdev = bdget_disk(conf->mddev->gendisk, 0);
if (bdev) {
mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
+ i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10);
mutex_unlock(&bdev->bd_inode->i_mutex);
bdput(bdev);
}
spin_lock_irq(&conf->device_lock);
conf->quiesce = 1;
wait_event_lock_irq(conf->wait_for_stripe,
- atomic_read(&conf->active_stripes) == 0,
+ atomic_read(&conf->active_stripes) == 0 &&
+ atomic_read(&conf->active_aligned_reads) == 0,
conf->device_lock, /* nothing */);
spin_unlock_irq(&conf->device_lock);
break;
.spare_active = raid5_spare_active,
.sync_request = sync_request,
.resize = raid5_resize,
+#ifdef CONFIG_MD_RAID5_RESHAPE
+ .check_reshape = raid5_check_reshape,
+ .start_reshape = raid5_start_reshape,
+#endif
.quiesce = raid5_quiesce,
};
static struct mdk_personality raid5_personality =