X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=drivers%2Fmd%2Fmd.c;h=5f154ef1e4befeeef26358d9ae37e424c62cee47;hb=1557d33007f63dd96e5d15f33af389378e5f2e54;hp=8350bde60d1bdea8e44bba10b0e524e8e60efc7b;hpb=110518bccf076726cc93bf604527d8019aae50ba;p=safe%2Fjmp%2Flinux-2.6 diff --git a/drivers/md/md.c b/drivers/md/md.c index 8350bde..5f154ef 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -98,47 +98,43 @@ static struct ctl_table_header *raid_table_header; static ctl_table raid_table[] = { { - .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, .procname = "speed_limit_max", .data = &sysctl_speed_limit_max, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { .ctl_name = 0 } + { } }; static ctl_table raid_dir_table[] = { { - .ctl_name = DEV_RAID, .procname = "raid", .maxlen = 0, .mode = S_IRUGO|S_IXUGO, .child = raid_table, }, - { .ctl_name = 0 } + { } }; static ctl_table raid_root_table[] = { { - .ctl_name = CTL_DEV, .procname = "dev", .maxlen = 0, .mode = 0555, .child = raid_dir_table, }, - { .ctl_name = 0 } + { } }; -static struct block_device_operations md_fops; +static const struct block_device_operations md_fops; static int start_readonly; @@ -262,6 +258,12 @@ static void mddev_resume(mddev_t *mddev) mddev->pers->quiesce(mddev, 0); } +int mddev_congested(mddev_t *mddev, int bits) +{ + return mddev->suspended; +} +EXPORT_SYMBOL(mddev_congested); + static inline mddev_t *mddev_get(mddev_t *mddev) { @@ -359,6 +361,7 @@ static mddev_t * mddev_find(dev_t unit) else new->md_minor = MINOR(unit) >> MdpMinorShift; + mutex_init(&new->open_mutex); mutex_init(&new->reconfig_mutex); INIT_LIST_HEAD(&new->disks); INIT_LIST_HEAD(&new->all_mddevs); @@ -440,15 +443,6 @@ static inline sector_t calc_dev_sboffset(struct block_device *bdev) return MD_NEW_SIZE_SECTORS(num_sectors); } -static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) -{ - sector_t num_sectors = rdev->sb_start; - - if (chunk_size) - num_sectors &= ~((sector_t)chunk_size/512 - 1); - return num_sectors; -} - static int alloc_disk_sb(mdk_rdev_t * rdev) { if (rdev->sb_page) @@ -745,6 +739,24 @@ struct super_type { }; /* + * Check that the given mddev has no bitmap. + * + * This function is called from the run method of all personalities that do not + * support bitmaps. It prints an error message and returns non-zero if mddev + * has a bitmap. Otherwise, it returns 0. + * + */ +int md_check_no_bitmap(mddev_t *mddev) +{ + if (!mddev->bitmap_file && !mddev->bitmap_offset) + return 0; + printk(KERN_ERR "%s: bitmaps are not supported for %s\n", + mdname(mddev), mddev->pers->name); + return 1; +} +EXPORT_SYMBOL(md_check_no_bitmap); + +/* * load_super for 0.90.0 */ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) @@ -797,17 +809,6 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->data_offset = 0; rdev->sb_size = MD_SB_BYTES; - if (sb->state & (1<level != 1 && sb->level != 4 - && sb->level != 5 && sb->level != 6 - && sb->level != 10) { - /* FIXME use a better test */ - printk(KERN_WARNING - "md: bitmaps not supported for this level.\n"); - goto abort; - } - } - if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; else @@ -836,7 +837,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version else ret = 0; } - rdev->sectors = calc_num_sectors(rdev, sb->chunk_size); + rdev->sectors = rdev->sb_start; if (rdev->sectors < sb->size * 2 && sb->level > 1) /* "this cannot possibly happen" ... */ @@ -866,7 +867,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->minor_version = sb->minor_version; mddev->patch_version = sb->patch_version; mddev->external = 0; - mddev->chunk_size = sb->chunk_size; + mddev->chunk_sectors = sb->chunk_size >> 9; mddev->ctime = sb->ctime; mddev->utime = sb->utime; mddev->level = sb->level; @@ -883,13 +884,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->delta_disks = sb->delta_disks; mddev->new_level = sb->new_level; mddev->new_layout = sb->new_layout; - mddev->new_chunk = sb->new_chunk; + mddev->new_chunk_sectors = sb->new_chunk >> 9; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; } if (sb->state & (1<raid_disk < mddev->raid_disks */) { set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; + } else if (desc->state & (1<minor_version >= 91) { + rdev->recovery_offset = 0; + rdev->raid_disk = desc->raid_disk; + } } if (desc->state & (1<flags); @@ -1004,7 +1013,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->new_level = mddev->new_level; sb->delta_disks = mddev->delta_disks; sb->new_layout = mddev->new_layout; - sb->new_chunk = mddev->new_chunk; + sb->new_chunk = mddev->new_chunk_sectors << 9; } mddev->minor_version = sb->minor_version; if (mddev->in_sync) @@ -1018,7 +1027,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->recovery_cp = 0; sb->layout = mddev->layout; - sb->chunk_size = mddev->chunk_size; + sb->chunk_size = mddev->chunk_sectors << 9; if (mddev->bitmap && mddev->bitmap_file == NULL) sb->state |= (1<disks, same_set) { mdp_disk_t *d; int desc_nr; - if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) - && !test_bit(Faulty, &rdev2->flags)) + int is_active = test_bit(In_sync, &rdev2->flags); + + if (rdev2->raid_disk >= 0 && + sb->minor_version >= 91) + /* we have nowhere to store the recovery_offset, + * but if it is not below the reshape_position, + * we can piggy-back on that. + */ + is_active = 1; + if (rdev2->raid_disk < 0 || + test_bit(Faulty, &rdev2->flags)) + is_active = 0; + if (is_active) desc_nr = rdev2->raid_disk; else desc_nr = next_spare++; @@ -1038,16 +1058,16 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) d->number = rdev2->desc_nr; d->major = MAJOR(rdev2->bdev->bd_dev); d->minor = MINOR(rdev2->bdev->bd_dev); - if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) - && !test_bit(Faulty, &rdev2->flags)) + if (is_active) d->raid_disk = rdev2->raid_disk; else d->raid_disk = rdev2->desc_nr; /* compatibility */ if (test_bit(Faulty, &rdev2->flags)) d->state = (1<flags)) { + else if (is_active) { d->state = (1<state |= (1<flags)) + d->state |= (1<bdev,b)); return -EINVAL; } - if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { - if (sb->level != cpu_to_le32(1) && - sb->level != cpu_to_le32(4) && - sb->level != cpu_to_le32(5) && - sb->level != cpu_to_le32(6) && - sb->level != cpu_to_le32(10)) { - printk(KERN_WARNING - "md: bitmaps not supported for this level.\n"); - return -EINVAL; - } - } rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; - bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; @@ -1248,9 +1257,6 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (rdev->sectors < le64_to_cpu(sb->data_size)) return -EINVAL; rdev->sectors = le64_to_cpu(sb->data_size); - if (le32_to_cpu(sb->chunksize)) - rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1); - if (le64_to_cpu(sb->size) > rdev->sectors) return -EINVAL; return ret; @@ -1271,7 +1277,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->major_version = 1; mddev->patch_version = 0; mddev->external = 0; - mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; + mddev->chunk_sectors = le32_to_cpu(sb->chunksize); mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); mddev->level = le32_to_cpu(sb->level); @@ -1297,13 +1303,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->delta_disks = le32_to_cpu(sb->delta_disks); mddev->new_level = le32_to_cpu(sb->new_level); mddev->new_layout = le32_to_cpu(sb->new_layout); - mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; + mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; } } else if (mddev->pers == NULL) { @@ -1324,7 +1330,12 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } if (mddev->level != LEVEL_MULTIPATH) { int role; - role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + if (rdev->desc_nr < 0 || + rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { + role = 0xffff; + rdev->desc_nr = -1; + } else + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { case 0xffff: /* spare */ break; @@ -1375,6 +1386,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->raid_disks = cpu_to_le32(mddev->raid_disks); sb->size = cpu_to_le64(mddev->dev_sectors); + sb->chunksize = cpu_to_le32(mddev->chunk_sectors); + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); @@ -1383,8 +1397,6 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags)) { - if (mddev->curr_resync_completed > rdev->recovery_offset) - rdev->recovery_offset = mddev->curr_resync_completed; if (rdev->recovery_offset > 0) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); @@ -1399,7 +1411,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->new_layout = cpu_to_le32(mddev->new_layout); sb->delta_disks = cpu_to_le32(mddev->delta_disks); sb->new_level = cpu_to_le32(mddev->new_level); - sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); + sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); } max_dev = 0; @@ -1407,8 +1419,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; - if (max_dev > le32_to_cpu(sb->max_dev)) + if (max_dev > le32_to_cpu(sb->max_dev)) { + int bmask; sb->max_dev = cpu_to_le32(max_dev); + rdev->sb_size = max_dev * 2 + 256; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; + if (rdev->sb_size & bmask) + rdev->sb_size = (rdev->sb_size | bmask) + 1; + } for (i=0; idev_roles[i] = cpu_to_le16(0xfffe); @@ -1500,37 +1518,76 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) static LIST_HEAD(pending_raid_disks); -static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) +/* + * Try to register data integrity profile for an mddev + * + * This is called when an array is started and after a disk has been kicked + * from the array. It only succeeds if all working and active component devices + * are integrity capable with matching profiles. + */ +int md_integrity_register(mddev_t *mddev) +{ + mdk_rdev_t *rdev, *reference = NULL; + + if (list_empty(&mddev->disks)) + return 0; /* nothing to do */ + if (blk_get_integrity(mddev->gendisk)) + return 0; /* already registered */ + list_for_each_entry(rdev, &mddev->disks, same_set) { + /* skip spares and non-functional disks */ + if (test_bit(Faulty, &rdev->flags)) + continue; + if (rdev->raid_disk < 0) + continue; + /* + * If at least one rdev is not integrity capable, we can not + * enable data integrity for the md device. + */ + if (!bdev_get_integrity(rdev->bdev)) + return -EINVAL; + if (!reference) { + /* Use the first rdev as the reference */ + reference = rdev; + continue; + } + /* does this rdev's profile match the reference profile? */ + if (blk_integrity_compare(reference->bdev->bd_disk, + rdev->bdev->bd_disk) < 0) + return -EINVAL; + } + /* + * All component devices are integrity capable and have matching + * profiles, register the common profile for the md device. + */ + if (blk_integrity_register(mddev->gendisk, + bdev_get_integrity(reference->bdev)) != 0) { + printk(KERN_ERR "md: failed to register integrity for %s\n", + mdname(mddev)); + return -EINVAL; + } + printk(KERN_NOTICE "md: data integrity on %s enabled\n", + mdname(mddev)); + return 0; +} +EXPORT_SYMBOL(md_integrity_register); + +/* Disable data integrity if non-capable/non-matching disk is being added */ +void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev) { - struct mdk_personality *pers = mddev->pers; - struct gendisk *disk = mddev->gendisk; struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); - struct blk_integrity *bi_mddev = blk_get_integrity(disk); + struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); - /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ - if (pers && pers->level >= 4 && pers->level <= 6) + if (!bi_mddev) /* nothing to do */ return; - - /* If rdev is integrity capable, register profile for mddev */ - if (!bi_mddev && bi_rdev) { - if (blk_integrity_register(disk, bi_rdev)) - printk(KERN_ERR "%s: %s Could not register integrity!\n", - __func__, disk->disk_name); - else - printk(KERN_NOTICE "Enabling data integrity on %s\n", - disk->disk_name); + if (rdev->raid_disk < 0) /* skip spares */ return; - } - - /* Check that mddev and rdev have matching profiles */ - if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { - printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, - disk->disk_name, rdev->bdev->bd_disk->disk_name); - printk(KERN_NOTICE "Disabling data integrity on %s\n", - disk->disk_name); - blk_integrity_unregister(disk); - } + if (bi_rdev && blk_integrity_compare(mddev->gendisk, + rdev->bdev->bd_disk) >= 0) + return; + printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); + blk_integrity_unregister(mddev->gendisk); } +EXPORT_SYMBOL(md_integrity_add_rdev); static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) { @@ -1604,7 +1661,6 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) /* May as well allow recovery to be retried once */ mddev->recovery_disabled = 0; - md_integrity_check(rdev, mddev); return 0; fail: @@ -1769,9 +1825,10 @@ static void print_sb_1(struct mdp_superblock_1 *sb) __u8 *uuid; uuid = sb->set_uuid; - printk(KERN_INFO "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" - ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" - KERN_INFO "md: Name: \"%s\" CT:%llu\n", + printk(KERN_INFO + "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" + ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" + "md: Name: \"%s\" CT:%llu\n", le32_to_cpu(sb->major_version), le32_to_cpu(sb->feature_map), uuid[0], uuid[1], uuid[2], uuid[3], @@ -1783,12 +1840,13 @@ static void print_sb_1(struct mdp_superblock_1 *sb) & MD_SUPERBLOCK_1_TIME_SEC_MASK); uuid = sb->device_uuid; - printk(KERN_INFO "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" + printk(KERN_INFO + "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" " RO:%llu\n" - KERN_INFO "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" - ":%02x%02x%02x%02x%02x%02x\n" - KERN_INFO "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" - KERN_INFO "md: (MaxDev:%u) \n", + "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" + ":%02x%02x%02x%02x%02x%02x\n" + "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" + "md: (MaxDev:%u) \n", le32_to_cpu(sb->level), (unsigned long long)le64_to_cpu(sb->size), le32_to_cpu(sb->raid_disks), @@ -1872,6 +1930,14 @@ static void sync_sbs(mddev_t * mddev, int nospares) */ mdk_rdev_t *rdev; + /* First make sure individual recovery_offsets are correct */ + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + mddev->curr_resync_completed > rdev->recovery_offset) + rdev->recovery_offset = mddev->curr_resync_completed; + + } list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->sb_events == mddev->events || (nospares && @@ -1894,6 +1960,7 @@ static void md_update_sb(mddev_t * mddev, int force_change) int sync_req; int nospares = 0; + mddev->utime = get_seconds(); if (mddev->external) return; repeat: @@ -1923,7 +1990,6 @@ repeat: nospares = 0; sync_req = mddev->in_sync; - mddev->utime = get_seconds(); /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ @@ -1936,17 +2002,14 @@ repeat: /* otherwise we have to go forward and ... */ mddev->events ++; if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ - /* .. if the array isn't clean, insist on an odd 'events' */ - if ((mddev->events&1)==0) { - mddev->events++; + /* .. if the array isn't clean, an 'even' event must also go + * to spares. */ + if ((mddev->events&1)==0) nospares = 0; - } } else { - /* otherwise insist on an even 'events' (for clean states) */ - if ((mddev->events&1)) { - mddev->events++; + /* otherwise an 'odd' event must go to spares */ + if ((mddev->events&1)) nospares = 0; - } } } @@ -2589,20 +2652,11 @@ static void analyze_sbs(mddev_t * mddev) rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; set_bit(In_sync, &rdev->flags); - } else if (rdev->raid_disk >= mddev->raid_disks) { + } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } } - - - - if (mddev->recovery_cp != MaxSector && - mddev->level >= 1) - printk(KERN_ERR "md: %s: raid array is not clean" - " -- starting background reconstruction\n", - mdname(mddev)); - } static void md_safemode_timeout(unsigned long data); @@ -2677,6 +2731,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) ssize_t rv = len; struct mdk_personality *pers; void *priv; + mdk_rdev_t *rdev; if (mddev->pers == NULL) { if (len == 0) @@ -2743,7 +2798,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) if (IS_ERR(priv)) { mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; module_put(pers->owner); @@ -2756,12 +2811,18 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev_suspend(mddev); mddev->pers->stop(mddev); module_put(mddev->pers->owner); + /* Invalidate devices that are now superfluous */ + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= mddev->raid_disks) { + rdev->raid_disk = -1; + clear_bit(In_sync, &rdev->flags); + } mddev->pers = pers; mddev->private = priv; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; - mddev->chunk_size = mddev->new_chunk; + mddev->chunk_sectors = mddev->new_chunk_sectors; mddev->delta_disks = 0; pers->run(mddev); mddev_resume(mddev); @@ -2797,11 +2858,14 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) { int err; - if (mddev->pers->reconfig == NULL) + if (mddev->pers->check_reshape == NULL) return -EBUSY; - err = mddev->pers->reconfig(mddev, n, -1); - if (err) + mddev->new_layout = n; + err = mddev->pers->check_reshape(mddev); + if (err) { + mddev->new_layout = mddev->layout; return err; + } } else { mddev->new_layout = n; if (mddev->reshape_position == MaxSector) @@ -2854,10 +2918,11 @@ static ssize_t chunk_size_show(mddev_t *mddev, char *page) { if (mddev->reshape_position != MaxSector && - mddev->chunk_size != mddev->new_chunk) - return sprintf(page, "%d (%d)\n", mddev->new_chunk, - mddev->chunk_size); - return sprintf(page, "%d\n", mddev->chunk_size); + mddev->chunk_sectors != mddev->new_chunk_sectors) + return sprintf(page, "%d (%d)\n", + mddev->new_chunk_sectors << 9, + mddev->chunk_sectors << 9); + return sprintf(page, "%d\n", mddev->chunk_sectors << 9); } static ssize_t @@ -2871,15 +2936,18 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) { int err; - if (mddev->pers->reconfig == NULL) + if (mddev->pers->check_reshape == NULL) return -EBUSY; - err = mddev->pers->reconfig(mddev, -1, n); - if (err) + mddev->new_chunk_sectors = n >> 9; + err = mddev->pers->check_reshape(mddev); + if (err) { + mddev->new_chunk_sectors = mddev->chunk_sectors; return err; + } } else { - mddev->new_chunk = n; + mddev->new_chunk_sectors = n >> 9; if (mddev->reshape_position == MaxSector) - mddev->chunk_size = n; + mddev->chunk_sectors = n >> 9; } return len; } @@ -3066,11 +3134,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) } else err = -EBUSY; spin_unlock_irq(&mddev->write_lock); - } else { - mddev->ro = 0; - mddev->recovery_cp = MaxSector; - err = do_md_run(mddev); - } + } else + err = -EINVAL; break; case active: if (mddev->pers) { @@ -3306,7 +3371,9 @@ static ssize_t action_show(mddev_t *mddev, char *page) { char *type = "idle"; - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + type = "frozen"; + else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) type = "reshape"; @@ -3329,7 +3396,12 @@ action_store(mddev_t *mddev, const char *page, size_t len) if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; - if (cmd_match(page, "idle")) { + if (cmd_match(page, "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_unregister_thread(mddev->sync_thread); @@ -3520,8 +3592,9 @@ min_sync_store(mddev_t *mddev, const char *buf, size_t len) return -EBUSY; /* Must be a multiple of chunk_size */ - if (mddev->chunk_size) { - if (min & (sector_t)((mddev->chunk_size>>9)-1)) + if (mddev->chunk_sectors) { + sector_t temp = min; + if (sector_div(temp, mddev->chunk_sectors)) return -EINVAL; } mddev->resync_min = min; @@ -3553,12 +3626,14 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) if (max < mddev->resync_min) return -EINVAL; if (max < mddev->resync_max && + mddev->ro == 0 && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; /* Must be a multiple of chunk_size */ - if (mddev->chunk_size) { - if (max & (sector_t)((mddev->chunk_size>>9)-1)) + if (mddev->chunk_sectors) { + sector_t temp = max; + if (sector_div(temp, mddev->chunk_sectors)) return -EINVAL; } mddev->resync_max = max; @@ -3582,7 +3657,8 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) char *e; unsigned long long new = simple_strtoull(buf, &e, 10); - if (mddev->pers->quiesce == NULL) + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; @@ -3610,7 +3686,8 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) char *e; unsigned long long new = simple_strtoull(buf, &e, 10); - if (mddev->pers->quiesce == NULL) + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; @@ -3649,7 +3726,7 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len) mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; return len; } @@ -3683,24 +3760,15 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len) if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) - return -EINVAL; + return -E2BIG; mddev->external_size = 1; } mddev->array_sectors = sectors; set_capacity(mddev->gendisk, mddev->array_sectors); - if (mddev->pers) { - struct block_device *bdev = bdget_disk(mddev->gendisk, 0); - - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } - } + if (mddev->pers) + revalidate_disk(mddev->gendisk); return len; } @@ -3853,11 +3921,9 @@ static int md_alloc(dev_t dev, char *name) flush_scheduled_work(); mutex_lock(&disks_mutex); - if (mddev->gendisk) { - mutex_unlock(&disks_mutex); - mddev_put(mddev); - return -EEXIST; - } + error = -EEXIST; + if (mddev->gendisk) + goto abort; if (name) { /* Need to ensure that 'name' is not a duplicate. @@ -3869,17 +3935,15 @@ static int md_alloc(dev_t dev, char *name) if (mddev2->gendisk && strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); - return -EEXIST; + goto abort; } spin_unlock(&all_mddevs_lock); } + error = -ENOMEM; mddev->queue = blk_alloc_queue(GFP_KERNEL); - if (!mddev->queue) { - mutex_unlock(&disks_mutex); - mddev_put(mddev); - return -ENOMEM; - } + if (!mddev->queue) + goto abort; mddev->queue->queuedata = mddev; /* Can be unlocked because the queue is new: no concurrency */ @@ -3889,11 +3953,9 @@ static int md_alloc(dev_t dev, char *name) disk = alloc_disk(1 << shift); if (!disk) { - mutex_unlock(&disks_mutex); blk_cleanup_queue(mddev->queue); mddev->queue = NULL; - mddev_put(mddev); - return -ENOMEM; + goto abort; } disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; @@ -3915,16 +3977,22 @@ static int md_alloc(dev_t dev, char *name) mddev->gendisk = disk; error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk_to_dev(disk)->kobj, "%s", "md"); - mutex_unlock(&disks_mutex); - if (error) + if (error) { + /* This isn't possible, but as kobject_init_and_add is marked + * __must_check, we must do something with the result + */ printk(KERN_WARNING "md: cannot register %s/md - name in use\n", disk->disk_name); - else { + error = 0; + } + abort: + mutex_unlock(&disks_mutex); + if (!error) { kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); } mddev_put(mddev); - return 0; + return error; } static struct kobject *md_probe(dev_t dev, int *part, void *data) @@ -3969,11 +4037,9 @@ static int start_dirty_degraded; static int do_md_run(mddev_t * mddev) { int err; - int chunk_size; mdk_rdev_t *rdev; struct gendisk *disk; struct mdk_personality *pers; - char b[BDEVNAME_SIZE]; if (list_empty(&mddev->disks)) /* cannot run an array with no devices.. */ @@ -3991,38 +4057,6 @@ static int do_md_run(mddev_t * mddev) analyze_sbs(mddev); } - chunk_size = mddev->chunk_size; - - if (chunk_size) { - if (chunk_size > MAX_CHUNK_SIZE) { - printk(KERN_ERR "too big chunk_size: %d > %d\n", - chunk_size, MAX_CHUNK_SIZE); - return -EINVAL; - } - /* - * chunk-size has to be a power of 2 - */ - if ( (1 << ffz(~chunk_size)) != chunk_size) { - printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); - return -EINVAL; - } - - /* devices must have minimum size of one chunk */ - list_for_each_entry(rdev, &mddev->disks, same_set) { - if (test_bit(Faulty, &rdev->flags)) - continue; - if (rdev->sectors < chunk_size / 512) { - printk(KERN_WARNING - "md: Dev %s smaller than chunk_size:" - " %llu < %d\n", - bdevname(rdev->bdev,b), - (unsigned long long)rdev->sectors, - chunk_size / 512); - return -EINVAL; - } - } - } - if (mddev->level != LEVEL_NONE) request_module("md-level-%d", mddev->level); else if (mddev->clevel[0]) @@ -4087,10 +4121,6 @@ static int do_md_run(mddev_t * mddev) } strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); - if (pers->level >= 4 && pers->level <= 6) - /* Cannot support integrity (yet) */ - blk_integrity_unregister(mddev->gendisk); - if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ @@ -4215,7 +4245,7 @@ static int do_md_run(mddev_t * mddev) set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "%s_resync"); + "resync"); if (!mddev->sync_thread) { printk(KERN_ERR "%s: could not start resync" " thread...\n", @@ -4228,6 +4258,7 @@ static int do_md_run(mddev_t * mddev) md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + revalidate_disk(mddev->gendisk); mddev->changed = 1; md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); @@ -4297,13 +4328,13 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) { int err = 0; struct gendisk *disk = mddev->gendisk; + mdk_rdev_t *rdev; + mutex_lock(&mddev->open_mutex); if (atomic_read(&mddev->openers) > is_open) { printk("md: %s still in use.\n",mdname(mddev)); - return -EBUSY; - } - - if (mddev->pers) { + err = -EBUSY; + } else if (mddev->pers) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -4339,6 +4370,13 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) /* tell userspace to handle 'inactive' */ sysfs_notify_dirent(mddev->sysfs_state); + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } + set_capacity(disk, 0); mddev->changed = 1; @@ -4353,13 +4391,16 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) if (mode == 1) set_disk_ro(disk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + err = 0; } - +out: + mutex_unlock(&mddev->open_mutex); + if (err) + return err; /* * Free resources if final stop */ if (mode == 0) { - mdk_rdev_t *rdev; printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); @@ -4371,13 +4412,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) } mddev->bitmap_offset = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - } - /* make sure all md_delayed_delete calls have finished */ flush_scheduled_work(); @@ -4398,7 +4432,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->flags = 0; mddev->ro = 0; mddev->metadata_type[0] = 0; - mddev->chunk_size = 0; + mddev->chunk_sectors = 0; mddev->ctime = mddev->utime = 0; mddev->layout = 0; mddev->max_disks = 0; @@ -4406,7 +4440,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->delta_disks = 0; mddev->new_level = LEVEL_NONE; mddev->new_layout = 0; - mddev->new_chunk = 0; + mddev->new_chunk_sectors = 0; mddev->curr_resync = 0; mddev->resync_mismatches = 0; mddev->suspend_lo = mddev->suspend_hi = 0; @@ -4428,7 +4462,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) blk_integrity_unregister(disk); md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); -out: return err; } @@ -4569,10 +4602,10 @@ static int get_version(void __user * arg) static int get_array_info(mddev_t * mddev, void __user * arg) { mdu_array_info_t info; - int nr,working,active,failed,spare; + int nr,working,insync,failed,spare; mdk_rdev_t *rdev; - nr=working=active=failed=spare=0; + nr=working=insync=failed=spare=0; list_for_each_entry(rdev, &mddev->disks, same_set) { nr++; if (test_bit(Faulty, &rdev->flags)) @@ -4580,7 +4613,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) else { working++; if (test_bit(In_sync, &rdev->flags)) - active++; + insync++; else spare++; } @@ -4605,13 +4638,13 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.state = (1<bitmap && mddev->bitmap_offset) info.state = (1<layout; - info.chunk_size = mddev->chunk_size; + info.chunk_size = mddev->chunk_sectors << 9; if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; @@ -4715,7 +4748,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (!list_empty(&mddev->disks)) { mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, mdk_rdev_t, same_set); - int err = super_types[mddev->major_version] + err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) { printk(KERN_WARNING @@ -4836,7 +4869,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; } else rdev->sb_start = calc_dev_sboffset(rdev->bdev); - rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); + rdev->sectors = rdev->sb_start; err = bind_rdev_to_array(rdev, mddev); if (err) { @@ -4906,7 +4939,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) else rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; - rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); + rdev->sectors = rdev->sb_start; if (test_bit(Faulty, &rdev->flags)) { printk(KERN_WARNING @@ -5055,7 +5088,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->external = 0; mddev->layout = info->layout; - mddev->chunk_size = info->chunk_size; + mddev->chunk_sectors = info->chunk_size >> 9; mddev->max_disks = MD_SB_DISKS; @@ -5074,7 +5107,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) get_random_bytes(mddev->uuid, 16); mddev->new_level = mddev->level; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_layout = mddev->layout; mddev->delta_disks = 0; @@ -5126,18 +5159,8 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); - if (!rv) { - struct block_device *bdev; - - bdev = bdget_disk(mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } - } + if (!rv) + revalidate_disk(mddev->gendisk); return rv; } @@ -5184,7 +5207,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->level != info->level || /* mddev->layout != info->layout || */ !mddev->persistent != info->not_persistent|| - mddev->chunk_size != info->chunk_size || + mddev->chunk_sectors != info->chunk_size >> 9 || /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ ((state^info->state) & 0xfffffe00) ) @@ -5208,10 +5231,15 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) * we don't need to do anything at the md level, the * personality will take care of it all. */ - if (mddev->pers->reconfig == NULL) + if (mddev->pers->check_reshape == NULL) return -EINVAL; - else - return mddev->pers->reconfig(mddev, info->layout, -1); + else { + mddev->new_layout = info->layout; + rv = mddev->pers->check_reshape(mddev); + if (rv) + mddev->new_layout = mddev->layout; + return rv; + } } if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); @@ -5518,12 +5546,12 @@ static int md_open(struct block_device *bdev, fmode_t mode) } BUG_ON(mddev != bdev->bd_disk->private_data); - if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) + if ((err = mutex_lock_interruptible(&mddev->open_mutex))) goto out; err = 0; atomic_inc(&mddev->openers); - mddev_unlock(mddev); + mutex_unlock(&mddev->open_mutex); check_disk_change(bdev); out: @@ -5555,12 +5583,12 @@ static int md_revalidate(struct gendisk *disk) mddev->changed = 0; return 0; } -static struct block_device_operations md_fops = +static const struct block_device_operations md_fops = { .owner = THIS_MODULE, .open = md_open, .release = md_release, - .locked_ioctl = md_ioctl, + .ioctl = md_ioctl, .getgeo = md_getgeo, .media_changed = md_media_changed, .revalidate_disk= md_revalidate, @@ -5630,7 +5658,10 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, thread->run = run; thread->mddev = mddev; thread->timeout = MAX_SCHEDULE_TIMEOUT; - thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); + thread->tsk = kthread_run(md_thread, thread, + "%s_%s", + mdname(thread->mddev), + name ?: mddev->pers->name); if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; @@ -6355,12 +6386,13 @@ void md_do_sync(mddev_t *mddev) skipped = 0; - if ((mddev->curr_resync > mddev->curr_resync_completed && - (mddev->curr_resync - mddev->curr_resync_completed) - > (max_sectors >> 4)) || - (j - mddev->curr_resync_completed)*2 - >= mddev->resync_max - mddev->curr_resync_completed - ) { + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + ((mddev->curr_resync > mddev->curr_resync_completed && + (mddev->curr_resync - mddev->curr_resync_completed) + > (max_sectors >> 4)) || + (j - mddev->curr_resync_completed)*2 + >= mddev->resync_max - mddev->curr_resync_completed + )) { /* time to update curr_resync_completed */ blk_unplug(mddev->queue); wait_event(mddev->recovery_wait, @@ -6371,10 +6403,16 @@ void md_do_sync(mddev_t *mddev) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - if (j >= mddev->resync_max) - wait_event(mddev->recovery_wait, - mddev->resync_max > j - || kthread_should_stop()); + while (j >= mddev->resync_max && !kthread_should_stop()) { + /* As this condition is controlled by user-space, + * we can block indefinitely, so use '_interruptible' + * to avoid triggering warnings. + */ + flush_signals(current); /* just in case */ + wait_event_interruptible(mddev->recovery_wait, + mddev->resync_max > j + || kthread_should_stop()); + } if (kthread_should_stop()) goto interrupted; @@ -6487,8 +6525,9 @@ void md_do_sync(mddev_t *mddev) skip: mddev->curr_resync = 0; mddev->curr_resync_completed = 0; - mddev->resync_min = 0; - mddev->resync_max = MaxSector; + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + /* We completed so max setting can be forgotten. */ + mddev->resync_max = MaxSector; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); @@ -6709,7 +6748,8 @@ void md_check_recovery(mddev_t *mddev) */ if (mddev->reshape_position != MaxSector) { - if (mddev->pers->check_reshape(mddev) != 0) + if (mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev) != 0) /* Cannot proceed */ goto unlock; set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -6736,7 +6776,7 @@ void md_check_recovery(mddev_t *mddev) } mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "%s_resync"); + "resync"); if (!mddev->sync_thread) { printk(KERN_ERR "%s: could not start resync" " thread...\n",