* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/seq_file.h>
/* allocate a r1bio with room for raid_disks entries in the bios array */
r1_bio = kzalloc(size, gfp_flags);
- if (!r1_bio)
+ if (!r1_bio && pi->mddev)
unplug_slaves(pi->mddev);
return r1_bio;
static void raid1_end_read_request(struct bio *bio, int error)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
+ r1bio_t *r1_bio = bio->bi_private;
int mirror;
conf_t *conf = r1_bio->mddev->private;
*/
char b[BDEVNAME_SIZE];
if (printk_ratelimit())
- printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
+ printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n",
+ mdname(conf->mddev),
bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
reschedule_retry(r1_bio);
}
static void raid1_end_write_request(struct bio *bio, int error)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
+ r1bio_t *r1_bio = bio->bi_private;
int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
conf_t *conf = r1_bio->mddev->private;
struct bio *to_put = NULL;
*/
static int read_balance(conf_t *conf, r1bio_t *r1_bio)
{
- const unsigned long this_sector = r1_bio->sector;
+ const sector_t this_sector = r1_bio->sector;
int new_disk = conf->last_used, disk = new_disk;
int wonly_disk = -1;
const int sectors = r1_bio->sectors;
retry:
if (conf->mddev->recovery_cp < MaxSector &&
(this_sector + sectors >= conf->next_resync)) {
- /* Choose the first operation device, for consistancy */
+ /* Choose the first operational device, for consistancy */
new_disk = 0;
for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
static void lower_barrier(conf_t *conf)
{
unsigned long flags;
+ BUG_ON(conf->barrier <= 0);
spin_lock_irqsave(&conf->resync_lock, flags);
conf->barrier--;
spin_unlock_irqrestore(&conf->resync_lock, flags);
return NULL;
}
-static int make_request(struct request_queue *q, struct bio * bio)
+static int make_request(mddev_t *mddev, struct bio * bio)
{
- mddev_t *mddev = q->queuedata;
conf_t *conf = mddev->private;
mirror_info_t *mirror;
r1bio_t *r1_bio;
struct page **behind_pages = NULL;
const int rw = bio_data_dir(bio);
const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
- int cpu;
bool do_barriers;
mdk_rdev_t *blocked_rdev;
md_write_start(mddev, bio); /* wait on superblock update early */
+ if (bio_data_dir(bio) == WRITE &&
+ bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
+ bio->bi_sector < mddev->suspend_hi) {
+ /* As the suspend_* range is controlled by
+ * userspace, we want an interruptible
+ * wait.
+ */
+ DEFINE_WAIT(w);
+ for (;;) {
+ flush_signals(current);
+ prepare_to_wait(&conf->wait_barrier,
+ &w, TASK_INTERRUPTIBLE);
+ if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
+ bio->bi_sector >= mddev->suspend_hi)
+ break;
+ schedule();
+ }
+ finish_wait(&conf->wait_barrier, &w);
+ }
if (unlikely(!mddev->barriers_work &&
bio_rw_flagged(bio, BIO_RW_BARRIER))) {
if (rw == WRITE)
bitmap = mddev->bitmap;
- cpu = part_stat_lock();
- part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
- part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
- bio_sectors(bio));
- part_stat_unlock();
-
/*
* make_request() can abort the operation when READA is being
* used and no empty request is available.
}
mirror = conf->mirrors + rdisk;
+ if (test_bit(WriteMostly, &mirror->rdev->flags) &&
+ bitmap) {
+ /* Reading from a write-mostly device must
+ * take care not to over-take any writes
+ * that are 'behind'
+ */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
r1_bio->read_disk = rdisk;
read_bio = bio_clone(bio, GFP_NOIO);
if (test_bit(Faulty, &rdev->flags)) {
rdev_dec_pending(rdev, mddev);
r1_bio->bios[i] = NULL;
- } else
+ } else {
r1_bio->bios[i] = bio;
- targets++;
+ targets++;
+ }
} else
r1_bio->bios[i] = NULL;
}
set_bit(R1BIO_Degraded, &r1_bio->state);
}
- /* do behind I/O ? */
+ /* do behind I/O ?
+ * Not if there are too many, or cannot allocate memory,
+ * or a reader on WriteMostly is waiting for behind writes
+ * to flush */
if (bitmap &&
- atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
+ (atomic_read(&bitmap->behind_writes)
+ < mddev->bitmap_info.max_write_behind) &&
+ !waitqueue_active(&bitmap->behind_wait) &&
(behind_pages = alloc_behind_pages(bio)) != NULL)
set_bit(R1BIO_BehindIO, &r1_bio->state);
} else
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
- printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n"
- "raid1: Operation continuing on %d devices.\n",
- bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
+ printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n"
+ KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n",
+ mdname(mddev), bdevname(rdev->bdev, b),
+ mdname(mddev), conf->raid_disks - mddev->degraded);
}
static void print_conf(conf_t *conf)
{
int i;
- printk("RAID1 conf printout:\n");
+ printk(KERN_DEBUG "RAID1 conf printout:\n");
if (!conf) {
- printk("(!conf)\n");
+ printk(KERN_DEBUG "(!conf)\n");
return;
}
- printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
+ printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
conf->raid_disks);
rcu_read_lock();
char b[BDEVNAME_SIZE];
mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev)
- printk(" disk %d, wo:%d, o:%d, dev:%s\n",
+ printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
i, !test_bit(In_sync, &rdev->flags),
!test_bit(Faulty, &rdev->flags),
bdevname(rdev->bdev,b));
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit ->max_sector to one PAGE, as
- * a one page request is never in violation.
+ /* as we don't honour merge_bvec_fn, we must
+ * never risk violating it, so limit
+ * ->max_segments to one lying with a single
+ * page, as a one page request is never in
+ * violation.
*/
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
- queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
- blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+ blk_queue_max_segments(mddev->queue, 1);
+ blk_queue_segment_boundary(mddev->queue,
+ PAGE_CACHE_SIZE - 1);
+ }
p->head_position = 0;
rdev->raid_disk = mirror;
static void end_sync_read(struct bio *bio, int error)
{
- r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
+ r1bio_t *r1_bio = bio->bi_private;
int i;
for (i=r1_bio->mddev->raid_disks; i--; )
static void end_sync_write(struct bio *bio, int error)
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
+ r1bio_t *r1_bio = bio->bi_private;
mddev_t *mddev = r1_bio->mddev;
conf_t *conf = mddev->private;
int i;
char b[BDEVNAME_SIZE];
/* Cannot read from anywhere, array is toast */
md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
- printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
+ printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
" for block %llu\n",
- bdevname(bio->bi_bdev,b),
+ mdname(mddev),
+ bdevname(bio->bi_bdev, b),
(unsigned long long)r1_bio->sector);
md_done_sync(mddev, r1_bio->sectors, 0);
put_buf(r1_bio);
else {
atomic_add(s, &rdev->corrected_errors);
printk(KERN_INFO
- "raid1:%s: read error corrected "
+ "md/raid1:%s: read error corrected "
"(%d sectors at %llu on %s)\n",
mdname(mddev), s,
(unsigned long long)(sect +
r1_bio->sector,
r1_bio->sectors);
unfreeze_array(conf);
- }
+ } else
+ md_error(mddev,
+ conf->mirrors[r1_bio->read_disk].rdev);
bio = r1_bio->bios[r1_bio->read_disk];
- if ((disk=read_balance(conf, r1_bio)) == -1 ||
- disk == r1_bio->read_disk) {
- printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
+ if ((disk=read_balance(conf, r1_bio)) == -1) {
+ printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
" read error for block %llu\n",
+ mdname(mddev),
bdevname(bio->bi_bdev,b),
(unsigned long long)r1_bio->sector);
raid_end_bio_io(r1_bio);
r1_bio->bios[r1_bio->read_disk] = bio;
rdev = conf->mirrors[disk].rdev;
if (printk_ratelimit())
- printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
- " another mirror\n",
- bdevname(rdev->bdev,b),
- (unsigned long long)r1_bio->sector);
+ printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to"
+ " other mirror: %s\n",
+ mdname(mddev),
+ (unsigned long long)r1_bio->sector,
+ bdevname(rdev->bdev,b));
bio->bi_sector = r1_bio->sector + rdev->data_offset;
bio->bi_bdev = rdev->bdev;
bio->bi_end_io = raid1_end_read_request;
int still_degraded = 0;
if (!conf->r1buf_pool)
- {
-/*
- printk("sync start - bitmap %p\n", mddev->bitmap);
-*/
if (init_resync(conf))
return 0;
- }
max_sector = mddev->dev_sectors;
if (sector_nr >= max_sector) {
return mddev->dev_sectors;
}
-static int run(mddev_t *mddev)
+static conf_t *setup_conf(mddev_t *mddev)
{
conf_t *conf;
- int i, j, disk_idx;
+ int i;
mirror_info_t *disk;
mdk_rdev_t *rdev;
+ int err = -ENOMEM;
- if (mddev->level != 1) {
- printk("raid1: %s: raid level not set to mirroring (%d)\n",
- mdname(mddev), mddev->level);
- goto out;
- }
- if (mddev->reshape_position != MaxSector) {
- printk("raid1: %s: reshape_position set but not supported\n",
- mdname(mddev));
- goto out;
- }
- /*
- * copy the already verified devices into our private RAID1
- * bookkeeping area. [whatever we allocate in run(),
- * should be freed in stop()]
- */
conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
- mddev->private = conf;
if (!conf)
- goto out_no_mem;
+ goto abort;
conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
GFP_KERNEL);
if (!conf->mirrors)
- goto out_no_mem;
+ goto abort;
conf->tmppage = alloc_page(GFP_KERNEL);
if (!conf->tmppage)
- goto out_no_mem;
+ goto abort;
- conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
+ conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
if (!conf->poolinfo)
- goto out_no_mem;
- conf->poolinfo->mddev = mddev;
+ goto abort;
conf->poolinfo->raid_disks = mddev->raid_disks;
conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
r1bio_pool_free,
conf->poolinfo);
if (!conf->r1bio_pool)
- goto out_no_mem;
+ goto abort;
- spin_lock_init(&conf->device_lock);
- mddev->queue->queue_lock = &conf->device_lock;
+ conf->poolinfo->mddev = mddev;
+ spin_lock_init(&conf->device_lock);
list_for_each_entry(rdev, &mddev->disks, same_set) {
- disk_idx = rdev->raid_disk;
+ int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
continue;
disk = conf->mirrors + disk_idx;
disk->rdev = rdev;
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
- /* as we don't honour merge_bvec_fn, we must never risk
- * violating it, so limit ->max_sector to one PAGE, as
- * a one page request is never in violation.
- */
- if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
- queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
- blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
disk->head_position = 0;
}
bio_list_init(&conf->pending_bio_list);
bio_list_init(&conf->flushing_bio_list);
-
- mddev->degraded = 0;
+ conf->last_used = -1;
for (i = 0; i < conf->raid_disks; i++) {
disk = conf->mirrors + i;
if (!disk->rdev ||
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
- mddev->degraded++;
if (disk->rdev)
conf->fullsync = 1;
- }
+ } else if (conf->last_used < 0)
+ /*
+ * The first working device is used as a
+ * starting point to read balancing.
+ */
+ conf->last_used = i;
}
- if (mddev->degraded == conf->raid_disks) {
- printk(KERN_ERR "raid1: no operational mirrors for %s\n",
- mdname(mddev));
- goto out_free_conf;
+
+ err = -EIO;
+ if (conf->last_used < 0) {
+ printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
+ mdname(mddev));
+ goto abort;
+ }
+ err = -ENOMEM;
+ conf->thread = md_register_thread(raid1d, mddev, NULL);
+ if (!conf->thread) {
+ printk(KERN_ERR
+ "md/raid1:%s: couldn't allocate thread\n",
+ mdname(mddev));
+ goto abort;
}
- if (conf->raid_disks - mddev->degraded == 1)
- mddev->recovery_cp = MaxSector;
+ return conf;
+
+ abort:
+ if (conf) {
+ if (conf->r1bio_pool)
+ mempool_destroy(conf->r1bio_pool);
+ kfree(conf->mirrors);
+ safe_put_page(conf->tmppage);
+ kfree(conf->poolinfo);
+ kfree(conf);
+ }
+ return ERR_PTR(err);
+}
+
+static int run(mddev_t *mddev)
+{
+ conf_t *conf;
+ int i;
+ mdk_rdev_t *rdev;
+
+ if (mddev->level != 1) {
+ printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
+ mdname(mddev), mddev->level);
+ return -EIO;
+ }
+ if (mddev->reshape_position != MaxSector) {
+ printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",
+ mdname(mddev));
+ return -EIO;
+ }
/*
- * find the first working one and use it as a starting point
- * to read balancing.
+ * copy the already verified devices into our private RAID1
+ * bookkeeping area. [whatever we allocate in run(),
+ * should be freed in stop()]
*/
- for (j = 0; j < conf->raid_disks &&
- (!conf->mirrors[j].rdev ||
- !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++)
- /* nothing */;
- conf->last_used = j;
+ if (mddev->private == NULL)
+ conf = setup_conf(mddev);
+ else
+ conf = mddev->private;
+ if (IS_ERR(conf))
+ return PTR_ERR(conf);
- mddev->thread = md_register_thread(raid1d, mddev, NULL);
- if (!mddev->thread) {
- printk(KERN_ERR
- "raid1: couldn't allocate thread for %s\n",
- mdname(mddev));
- goto out_free_conf;
+ mddev->queue->queue_lock = &conf->device_lock;
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ /* as we don't honour merge_bvec_fn, we must never risk
+ * violating it, so limit ->max_segments to 1 lying within
+ * a single page, as a one page request is never in violation.
+ */
+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+ blk_queue_max_segments(mddev->queue, 1);
+ blk_queue_segment_boundary(mddev->queue,
+ PAGE_CACHE_SIZE - 1);
+ }
}
+ mddev->degraded = 0;
+ for (i=0; i < conf->raid_disks; i++)
+ if (conf->mirrors[i].rdev == NULL ||
+ !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
+ test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+ mddev->degraded++;
+
+ if (conf->raid_disks - mddev->degraded == 1)
+ mddev->recovery_cp = MaxSector;
+
if (mddev->recovery_cp != MaxSector)
- printk(KERN_NOTICE "raid1: %s is not clean"
+ printk(KERN_NOTICE "md/raid1:%s: not clean"
" -- starting background reconstruction\n",
mdname(mddev));
printk(KERN_INFO
- "raid1: raid set %s active with %d out of %d mirrors\n",
+ "md/raid1:%s: active with %d out of %d mirrors\n",
mdname(mddev), mddev->raid_disks - mddev->degraded,
mddev->raid_disks);
+
/*
* Ok, everything is just fine now
*/
+ mddev->thread = conf->thread;
+ conf->thread = NULL;
+ mddev->private = conf;
+
md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->backing_dev_info.congested_data = mddev;
md_integrity_register(mddev);
return 0;
-
-out_no_mem:
- printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
- mdname(mddev));
-
-out_free_conf:
- if (conf) {
- if (conf->r1bio_pool)
- mempool_destroy(conf->r1bio_pool);
- kfree(conf->mirrors);
- safe_put_page(conf->tmppage);
- kfree(conf->poolinfo);
- kfree(conf);
- mddev->private = NULL;
- }
-out:
- return -EIO;
}
static int stop(mddev_t *mddev)
{
conf_t *conf = mddev->private;
struct bitmap *bitmap = mddev->bitmap;
- int behind_wait = 0;
/* wait for behind writes to complete */
- while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
- behind_wait++;
- printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(HZ); /* wait a second */
+ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+ printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n",
+ mdname(mddev));
/* need to kick something here to make sure I/O goes? */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
}
raise_barrier(conf);
if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
- mddev->changed = 1;
revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp == MaxSector) {
if (sysfs_create_link(&mddev->kobj,
&rdev->kobj, nm))
printk(KERN_WARNING
- "md/raid1: cannot register "
- "%s for %s\n",
- nm, mdname(mddev));
+ "md/raid1:%s: cannot register "
+ "%s\n",
+ mdname(mddev), nm);
}
if (rdev)
newmirrors[d2++].rdev = rdev;
conf_t *conf = mddev->private;
switch(state) {
+ case 2: /* wake for suspend */
+ wake_up(&conf->wait_barrier);
+ break;
case 1:
raise_barrier(conf);
break;
}
}
+static void *raid1_takeover(mddev_t *mddev)
+{
+ /* raid1 can take over:
+ * raid5 with 2 devices, any layout or chunk size
+ */
+ if (mddev->level == 5 && mddev->raid_disks == 2) {
+ conf_t *conf;
+ mddev->new_level = 1;
+ mddev->new_layout = 0;
+ mddev->new_chunk_sectors = 0;
+ conf = setup_conf(mddev);
+ if (!IS_ERR(conf))
+ conf->barrier = 1;
+ return conf;
+ }
+ return ERR_PTR(-EINVAL);
+}
static struct mdk_personality raid1_personality =
{
.size = raid1_size,
.check_reshape = raid1_reshape,
.quiesce = raid1_quiesce,
+ .takeover = raid1_takeover,
};
static int __init raid_init(void)
module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
MODULE_ALIAS("md-personality-3"); /* RAID1 */
MODULE_ALIAS("md-raid1");
MODULE_ALIAS("md-level-1");