#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/ctype.h>
+#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/file.h>
+#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
/*
+ * Default number of read corrections we'll attempt on an rdev
+ * before ejecting it from the array. We divide the read error
+ * count by 2 for every hour elapsed between read errors.
+ */
+#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
+/*
* Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
* is 1000 KB/sec, so the extra system load does not show up that much.
* Increase it if you want to have more _guaranteed_ speed. Note that
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
if (!mddev->raid_disks && list_empty(&mddev->disks) &&
- !mddev->hold_active) {
+ mddev->ctime == 0 && !mddev->hold_active) {
+ /* Array is not configured at all, and not held active,
+ * so destroy it */
list_del(&mddev->all_mddevs);
if (mddev->gendisk) {
/* we did a probe so need to clean up.
return mutex_trylock(&mddev->reconfig_mutex);
}
+static struct attribute_group md_redundancy_group;
+
static inline void mddev_unlock(mddev_t * mddev)
{
- mutex_unlock(&mddev->reconfig_mutex);
+ if (mddev->pers == NULL && mddev->private) {
+ /* These cannot be removed under reconfig_mutex as
+ * an access to the files will try to take reconfig_mutex
+ * while holding the file unremovable, which leads to
+ * a deadlock.
+ * So hold open_mutex instead - we are allowed to take
+ * it while holding reconfig_mutex, and md_run can
+ * use it to wait for the remove to complete.
+ */
+ mutex_lock(&mddev->open_mutex);
+ mutex_unlock(&mddev->reconfig_mutex);
+
+ sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+ if (mddev->private != (void*)1)
+ sysfs_remove_group(&mddev->kobj, mddev->private);
+ if (mddev->sysfs_action)
+ sysfs_put(mddev->sysfs_action);
+ mddev->sysfs_action = NULL;
+ mddev->private = NULL;
+ mutex_unlock(&mddev->open_mutex);
+ } else
+ mutex_unlock(&mddev->reconfig_mutex);
md_wakeup_thread(mddev->thread);
}
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags)) {
- if (rdev->recovery_offset > 0) {
- sb->feature_map |=
- cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
- sb->recovery_offset =
- cpu_to_le64(rdev->recovery_offset);
- }
+ sb->feature_map |=
+ cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+ sb->recovery_offset =
+ cpu_to_le64(rdev->recovery_offset);
}
if (mddev->reshape_position != MaxSector) {
sb->dev_roles[i] = cpu_to_le16(0xfffe);
else if (test_bit(In_sync, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
- else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
+ else if (rdev2->raid_disk >= 0)
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
else
sb->dev_roles[i] = cpu_to_le16(0xffff);
uuid = sb->set_uuid;
printk(KERN_INFO
- "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
- ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
+ "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n"
"md: Name: \"%s\" CT:%llu\n",
le32_to_cpu(sb->major_version),
le32_to_cpu(sb->feature_map),
- uuid[0], uuid[1], uuid[2], uuid[3],
- uuid[4], uuid[5], uuid[6], uuid[7],
- uuid[8], uuid[9], uuid[10], uuid[11],
- uuid[12], uuid[13], uuid[14], uuid[15],
+ uuid,
sb->set_name,
(unsigned long long)le64_to_cpu(sb->ctime)
& MD_SUPERBLOCK_1_TIME_SEC_MASK);
printk(KERN_INFO
"md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
" RO:%llu\n"
- "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
- ":%02x%02x%02x%02x%02x%02x\n"
+ "md: Dev:%08x UUID: %pU\n"
"md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
"md: (MaxDev:%u) \n",
le32_to_cpu(sb->level),
(unsigned long long)le64_to_cpu(sb->super_offset),
(unsigned long long)le64_to_cpu(sb->recovery_offset),
le32_to_cpu(sb->dev_number),
- uuid[0], uuid[1], uuid[2], uuid[3],
- uuid[4], uuid[5], uuid[6], uuid[7],
- uuid[8], uuid[9], uuid[10], uuid[11],
- uuid[12], uuid[13], uuid[14], uuid[15],
+ uuid,
sb->devflags,
(unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
(unsigned long long)le64_to_cpu(sb->events),
static struct rdev_sysfs_entry rdev_size =
__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
+
+static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page)
+{
+ unsigned long long recovery_start = rdev->recovery_offset;
+
+ if (test_bit(In_sync, &rdev->flags) ||
+ recovery_start == MaxSector)
+ return sprintf(page, "none\n");
+
+ return sprintf(page, "%llu\n", recovery_start);
+}
+
+static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+ unsigned long long recovery_start;
+
+ if (cmd_match(buf, "none"))
+ recovery_start = MaxSector;
+ else if (strict_strtoull(buf, 10, &recovery_start))
+ return -EINVAL;
+
+ if (rdev->mddev->pers &&
+ rdev->raid_disk >= 0)
+ return -EBUSY;
+
+ rdev->recovery_offset = recovery_start;
+ if (recovery_start == MaxSector)
+ set_bit(In_sync, &rdev->flags);
+ else
+ clear_bit(In_sync, &rdev->flags);
+ return len;
+}
+
+static struct rdev_sysfs_entry rdev_recovery_start =
+__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
+
static struct attribute *rdev_default_attrs[] = {
&rdev_state.attr,
&rdev_errors.attr,
&rdev_slot.attr,
&rdev_offset.attr,
&rdev_size.attr,
+ &rdev_recovery_start.attr,
NULL,
};
static ssize_t
rdev->flags = 0;
rdev->data_offset = 0;
rdev->sb_events = 0;
+ rdev->last_read_error.tv_sec = 0;
+ rdev->last_read_error.tv_nsec = 0;
atomic_set(&rdev->nr_pending, 0);
atomic_set(&rdev->read_errors, 0);
atomic_set(&rdev->corrected_errors, 0);
}
}
+/* Read a fixed-point number.
+ * Numbers in sysfs attributes should be in "standard" units where
+ * possible, so time should be in seconds.
+ * However we internally use a a much smaller unit such as
+ * milliseconds or jiffies.
+ * This function takes a decimal number with a possible fractional
+ * component, and produces an integer which is the result of
+ * multiplying that number by 10^'scale'.
+ * all without any floating-point arithmetic.
+ */
+int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
+{
+ unsigned long result = 0;
+ long decimals = -1;
+ while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
+ if (*cp == '.')
+ decimals = 0;
+ else if (decimals < scale) {
+ unsigned int value;
+ value = *cp - '0';
+ result = result * 10 + value;
+ if (decimals >= 0)
+ decimals++;
+ }
+ cp++;
+ }
+ if (*cp == '\n')
+ cp++;
+ if (*cp)
+ return -EINVAL;
+ if (decimals < 0)
+ decimals = 0;
+ while (decimals < scale) {
+ result *= 10;
+ decimals ++;
+ }
+ *res = result;
+ return 0;
+}
+
+
static void md_safemode_timeout(unsigned long data);
static ssize_t
static ssize_t
safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
{
- int scale=1;
- int dot=0;
- int i;
unsigned long msec;
- char buf[30];
- /* remove a period, and count digits after it */
- if (len >= sizeof(buf))
+ if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
return -EINVAL;
- strlcpy(buf, cbuf, sizeof(buf));
- for (i=0; i<len; i++) {
- if (dot) {
- if (isdigit(buf[i])) {
- buf[i-1] = buf[i];
- scale *= 10;
- }
- buf[i] = 0;
- } else if (buf[i] == '.') {
- dot=1;
- buf[i] = 0;
- }
- }
- if (strict_strtoul(buf, 10, &msec) < 0)
- return -EINVAL;
- msec = (msec * 1000) / scale;
if (msec == 0)
mddev->safemode_delay = 0;
else {
if (mddev->pers)
return -EBUSY;
- if (!*buf || (*e && *e != '\n'))
+ if (cmd_match(buf, "none"))
+ n = MaxSector;
+ else if (!*buf || (*e && *e != '\n'))
return -EINVAL;
mddev->recovery_cp = n;
__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
static ssize_t
+max_corrected_read_errors_show(mddev_t *mddev, char *page) {
+ return sprintf(page, "%d\n",
+ atomic_read(&mddev->max_corr_read_errors));
+}
+
+static ssize_t
+max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long n = simple_strtoul(buf, &e, 10);
+
+ if (*buf && (*e == 0 || *e == '\n')) {
+ atomic_set(&mddev->max_corr_read_errors, n);
+ return len;
+ }
+ return -EINVAL;
+}
+
+static struct md_sysfs_entry max_corr_read_errors =
+__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
+ max_corrected_read_errors_store);
+
+static ssize_t
null_show(mddev_t *mddev, char *page)
{
return -EINVAL;
}
if (*end && !isspace(*end)) break;
bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
- buf = end;
- while (isspace(*buf)) buf++;
+ buf = skip_spaces(end);
}
bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
out:
&md_array_state.attr,
&md_reshape_position.attr,
&md_array_size.attr,
+ &max_corr_read_errors.attr,
NULL,
};
{
mddev_t *mddev = container_of(ws, mddev_t, del_work);
- if (mddev->private == &md_redundancy_group) {
- sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
- if (mddev->sysfs_action)
- sysfs_put(mddev->sysfs_action);
- mddev->sysfs_action = NULL;
- mddev->private = NULL;
- }
+ sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
kobject_del(&mddev->kobj);
kobject_put(&mddev->kobj);
}
disk->disk_name);
error = 0;
}
+ if (sysfs_create_group(&mddev->kobj, &md_bitmap_group))
+ printk(KERN_DEBUG "pointless warning\n");
abort:
mutex_unlock(&disks_mutex);
if (!error) {
if (mddev->pers)
return -EBUSY;
+ /* These two calls synchronise us with the
+ * sysfs_remove_group calls in mddev_unlock,
+ * so they must have completed.
+ */
+ mutex_lock(&mddev->open_mutex);
+ mutex_unlock(&mddev->open_mutex);
+
/*
* Analyze all RAID superblock(s)
*/
sysfs_notify_dirent(rdev->sysfs_state);
}
- md_probe(mddev->unit, NULL, NULL);
disk = mddev->gendisk;
- if (!disk)
- return -ENOMEM;
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
mddev->barriers_work = 1;
mddev->ok_start_degraded = start_dirty_degraded;
- if (start_readonly)
+ if (start_readonly && mddev->ro == 0)
mddev->ro = 2; /* read-only, but switch on first write */
err = mddev->pers->run(mddev);
mddev->ro = 0;
atomic_set(&mddev->writes_pending,0);
+ atomic_set(&mddev->max_corr_read_errors,
+ MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0;
mddev->safemode_timer.function = md_safemode_timeout;
mddev->safemode_timer.data = (unsigned long) mddev;
set_capacity(disk, mddev->array_sectors);
- /* If there is a partially-recovered drive we need to
- * start recovery here. If we leave it to md_check_recovery,
- * it will remove the drives and not do the right thing
- */
- if (mddev->degraded && !mddev->sync_thread) {
- int spares = 0;
- list_for_each_entry(rdev, &mddev->disks, same_set)
- if (rdev->raid_disk >= 0 &&
- !test_bit(In_sync, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags))
- /* complete an interrupted recovery */
- spares++;
- if (spares && mddev->pers->sync_request) {
- mddev->recovery = 0;
- set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
- mddev->sync_thread = md_register_thread(md_do_sync,
- mddev,
- "resync");
- if (!mddev->sync_thread) {
- printk(KERN_ERR "%s: could not start resync"
- " thread...\n",
- mdname(mddev));
- /* leave the spares where they are, it shouldn't hurt */
- mddev->recovery = 0;
- }
- }
- }
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
return 0;
}
-static void restore_bitmap_write_access(struct file *file)
+void restore_bitmap_write_access(struct file *file)
{
struct inode *inode = file->f_mapping->host;
mddev->queue->unplug_fn = NULL;
mddev->queue->backing_dev_info.congested_fn = NULL;
module_put(mddev->pers->owner);
- if (mddev->pers->sync_request)
- mddev->private = &md_redundancy_group;
+ if (mddev->pers->sync_request && mddev->private == NULL)
+ mddev->private = (void*)1;
mddev->pers = NULL;
/* tell userspace to handle 'inactive' */
sysfs_notify_dirent(mddev->sysfs_state);
}
mddev->bitmap_info.offset = 0;
- /* make sure all md_delayed_delete calls have finished */
- flush_scheduled_work();
-
export_array(mddev);
mddev->array_sectors = 0;
mddev->safemode = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
+ mddev->bitmap_info.chunksize = 0;
+ mddev->bitmap_info.daemon_sleep = 0;
+ mddev->bitmap_info.max_write_behind = 0;
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
if (mddev->hold_active == UNTIL_STOP)
mddev->hold_active = 0;
mddev->minor_version = info->minor_version;
mddev->patch_version = info->patch_version;
mddev->persistent = !info->not_persistent;
+ /* ensure mddev_put doesn't delete this now that there
+ * is some minimal configuration.
+ */
+ mddev->ctime = get_seconds();
return 0;
}
mddev->major_version = MD_MAJOR_VERSION;
int err = 0;
void __user *argp = (void __user *)arg;
mddev_t *mddev = NULL;
+ int ro;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
err = do_md_stop(mddev, 1, 1);
goto done_unlock;
+ case BLKROSET:
+ if (get_user(ro, (int __user *)(arg))) {
+ err = -EFAULT;
+ goto done_unlock;
+ }
+ err = -EINVAL;
+
+ /* if the bdev is going readonly the value of mddev->ro
+ * does not matter, no writes are coming
+ */
+ if (ro)
+ goto done_unlock;
+
+ /* are we are already prepared for writes? */
+ if (mddev->ro != 1)
+ goto done_unlock;
+
+ /* transitioning to readauto need only happen for
+ * arrays that call md_write_start
+ */
+ if (mddev->pers) {
+ err = restart_array(mddev);
+ if (err == 0) {
+ mddev->ro = 2;
+ set_disk_ro(mddev->gendisk, 0);
+ }
+ }
+ goto done_unlock;
}
/*
abort:
return err;
}
+#ifdef CONFIG_COMPAT
+static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case HOT_REMOVE_DISK:
+ case HOT_ADD_DISK:
+ case SET_DISK_FAULTY:
+ case SET_BITMAP_FILE:
+ /* These take in integer arg, do not convert */
+ break;
+ default:
+ arg = (unsigned long)compat_ptr(arg);
+ break;
+ }
+
+ return md_ioctl(bdev, mode, cmd, arg);
+}
+#endif /* CONFIG_COMPAT */
static int md_open(struct block_device *bdev, fmode_t mode)
{
.open = md_open,
.release = md_release,
.ioctl = md_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = md_compat_ioctl,
+#endif
.getgeo = md_getgeo,
.media_changed = md_media_changed,
.revalidate_disk= md_revalidate,
unsigned long chunk_kb;
unsigned long flags;
spin_lock_irqsave(&bitmap->lock, flags);
- chunk_kb = bitmap->chunksize >> 10;
+ chunk_kb = mddev->bitmap_info.chunksize >> 10;
seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
"%lu%s chunk",
bitmap->pages - bitmap->missing_pages,
bitmap->pages,
(bitmap->pages - bitmap->missing_pages)
<< (PAGE_SHIFT - 10),
- chunk_kb ? chunk_kb : bitmap->chunksize,
+ chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
chunk_kb ? "KB" : "B");
if (bitmap->file) {
seq_printf(seq, ", file: ");
mddev->curr_resync = 2;
try_again:
- if (kthread_should_stop()) {
+ if (kthread_should_stop())
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
goto skip;
- }
for_each_mddev(mddev2, tmp) {
if (mddev2 == mddev)
continue;
/* recovery follows the physical size of devices */
max_sectors = mddev->dev_sectors;
j = MaxSector;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rcu_read_lock();
+ list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < j)
j = rdev->recovery_offset;
+ rcu_read_unlock();
}
printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
} else {
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
mddev->curr_resync = MaxSector;
- list_for_each_entry(rdev, &mddev->disks, same_set)
+ rcu_read_lock();
+ list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < mddev->curr_resync)
rdev->recovery_offset = mddev->curr_resync;
+ rcu_read_unlock();
}
}
set_bit(MD_CHANGE_DEVS, &mddev->flags);
nm, mdname(mddev));
spares++;
md_new_event(mddev);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
} else
break;
}
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_check_recovery);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);