X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=block%2Fgenhd.c;h=1a4916e01732b68ebb9af5ff2fd831c78057d910;hb=c1d4c41f2fdfe66dea957b76d005affba3e56b26;hp=5a8d3bf02f1715702fe94515339f1cdfbfd3d088;hpb=68eef3b4791572ecb70249c7fb145bb3742dd899;p=safe%2Fjmp%2Flinux-2.6 diff --git a/block/genhd.c b/block/genhd.c index 5a8d3bf..1a4916e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -2,24 +2,233 @@ * gendisk handling */ -#include #include #include #include +#include #include #include #include #include +#include #include #include #include #include #include #include +#include -static struct subsystem block_subsys; +#include "blk.h" -static DEFINE_MUTEX(block_subsys_lock); +static DEFINE_MUTEX(block_class_lock); +#ifndef CONFIG_SYSFS_DEPRECATED +struct kobject *block_depr; +#endif + +/* for extended dynamic devt allocation, currently only one major is used */ +#define MAX_EXT_DEVT (1 << MINORBITS) + +/* For extended devt allocation. ext_devt_mutex prevents look up + * results from going away underneath its user. + */ +static DEFINE_MUTEX(ext_devt_mutex); +static DEFINE_IDR(ext_devt_idr); + +static struct device_type disk_type; + +/** + * disk_get_part - get partition + * @disk: disk to look partition from + * @partno: partition number + * + * Look for partition @partno from @disk. If found, increment + * reference count and return it. + * + * CONTEXT: + * Don't care. + * + * RETURNS: + * Pointer to the found partition on success, NULL if not found. + */ +struct hd_struct *disk_get_part(struct gendisk *disk, int partno) +{ + struct hd_struct *part = NULL; + struct disk_part_tbl *ptbl; + + if (unlikely(partno < 0)) + return NULL; + + rcu_read_lock(); + + ptbl = rcu_dereference(disk->part_tbl); + if (likely(partno < ptbl->len)) { + part = rcu_dereference(ptbl->part[partno]); + if (part) + get_device(part_to_dev(part)); + } + + rcu_read_unlock(); + + return part; +} +EXPORT_SYMBOL_GPL(disk_get_part); + +/** + * disk_part_iter_init - initialize partition iterator + * @piter: iterator to initialize + * @disk: disk to iterate over + * @flags: DISK_PITER_* flags + * + * Initialize @piter so that it iterates over partitions of @disk. + * + * CONTEXT: + * Don't care. + */ +void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, + unsigned int flags) +{ + struct disk_part_tbl *ptbl; + + rcu_read_lock(); + ptbl = rcu_dereference(disk->part_tbl); + + piter->disk = disk; + piter->part = NULL; + + if (flags & DISK_PITER_REVERSE) + piter->idx = ptbl->len - 1; + else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) + piter->idx = 0; + else + piter->idx = 1; + + piter->flags = flags; + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(disk_part_iter_init); + +/** + * disk_part_iter_next - proceed iterator to the next partition and return it + * @piter: iterator of interest + * + * Proceed @piter to the next partition and return it. + * + * CONTEXT: + * Don't care. + */ +struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) +{ + struct disk_part_tbl *ptbl; + int inc, end; + + /* put the last partition */ + disk_put_part(piter->part); + piter->part = NULL; + + /* get part_tbl */ + rcu_read_lock(); + ptbl = rcu_dereference(piter->disk->part_tbl); + + /* determine iteration parameters */ + if (piter->flags & DISK_PITER_REVERSE) { + inc = -1; + if (piter->flags & (DISK_PITER_INCL_PART0 | + DISK_PITER_INCL_EMPTY_PART0)) + end = -1; + else + end = 0; + } else { + inc = 1; + end = ptbl->len; + } + + /* iterate to the next partition */ + for (; piter->idx != end; piter->idx += inc) { + struct hd_struct *part; + + part = rcu_dereference(ptbl->part[piter->idx]); + if (!part) + continue; + if (!part->nr_sects && + !(piter->flags & DISK_PITER_INCL_EMPTY) && + !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && + piter->idx == 0)) + continue; + + get_device(part_to_dev(part)); + piter->part = part; + piter->idx += inc; + break; + } + + rcu_read_unlock(); + + return piter->part; +} +EXPORT_SYMBOL_GPL(disk_part_iter_next); + +/** + * disk_part_iter_exit - finish up partition iteration + * @piter: iter of interest + * + * Called when iteration is over. Cleans up @piter. + * + * CONTEXT: + * Don't care. + */ +void disk_part_iter_exit(struct disk_part_iter *piter) +{ + disk_put_part(piter->part); + piter->part = NULL; +} +EXPORT_SYMBOL_GPL(disk_part_iter_exit); + +static inline int sector_in_part(struct hd_struct *part, sector_t sector) +{ + return part->start_sect <= sector && + sector < part->start_sect + part->nr_sects; +} + +/** + * disk_map_sector_rcu - map sector to partition + * @disk: gendisk of interest + * @sector: sector to map + * + * Find out which partition @sector maps to on @disk. This is + * primarily used for stats accounting. + * + * CONTEXT: + * RCU read locked. The returned partition pointer is valid only + * while preemption is disabled. + * + * RETURNS: + * Found partition on success, part0 is returned if no partition matches + */ +struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) +{ + struct disk_part_tbl *ptbl; + struct hd_struct *part; + int i; + + ptbl = rcu_dereference(disk->part_tbl); + + part = rcu_dereference(ptbl->last_lookup); + if (part && sector_in_part(part, sector)) + return part; + + for (i = 1; i < ptbl->len; i++) { + part = rcu_dereference(ptbl->part[i]); + + if (part && sector_in_part(part, sector)) { + rcu_assign_pointer(ptbl->last_lookup, part); + return part; + } + } + return &disk->part0; +} +EXPORT_SYMBOL_GPL(disk_map_sector_rcu); /* * Can be deleted altogether. Later. @@ -38,27 +247,41 @@ static inline int major_to_index(int major) } #ifdef CONFIG_PROC_FS - -void blkdev_show(struct seq_file *f, off_t offset) +void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; if (offset < BLKDEV_MAJOR_HASH_SIZE) { - mutex_lock(&block_subsys_lock); + mutex_lock(&block_class_lock); for (dp = major_names[offset]; dp; dp = dp->next) - seq_printf(f, "%3d %s\n", dp->major, dp->name); - mutex_unlock(&block_subsys_lock); + seq_printf(seqf, "%3d %s\n", dp->major, dp->name); + mutex_unlock(&block_class_lock); } } - #endif /* CONFIG_PROC_FS */ +/** + * register_blkdev - register a new block device + * + * @major: the requested major device number [1..255]. If @major=0, try to + * allocate any unused major number. + * @name: the name of the new block device as a zero terminated string + * + * The @name must be unique within the system. + * + * The return value depends on the @major input parameter. + * - if a major device number was requested in range [1..255] then the + * function returns zero on success, or a negative error code + * - if any unused major number was requested with @major=0 parameter + * then the return value is the allocated major number in range + * [1..255] or a negative error code otherwise + */ int register_blkdev(unsigned int major, const char *name) { struct blk_major_name **n, *p; int index, ret = 0; - mutex_lock(&block_subsys_lock); + mutex_lock(&block_class_lock); /* temporary */ if (major == 0) { @@ -103,68 +326,176 @@ int register_blkdev(unsigned int major, const char *name) kfree(p); } out: - mutex_unlock(&block_subsys_lock); + mutex_unlock(&block_class_lock); return ret; } EXPORT_SYMBOL(register_blkdev); -/* todo: make void - error printk here */ -int unregister_blkdev(unsigned int major, const char *name) +void unregister_blkdev(unsigned int major, const char *name) { struct blk_major_name **n; struct blk_major_name *p = NULL; int index = major_to_index(major); - int ret = 0; - mutex_lock(&block_subsys_lock); + mutex_lock(&block_class_lock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; - if (!*n || strcmp((*n)->name, name)) - ret = -EINVAL; - else { + if (!*n || strcmp((*n)->name, name)) { + WARN_ON(1); + } else { p = *n; *n = p->next; } - mutex_unlock(&block_subsys_lock); + mutex_unlock(&block_class_lock); kfree(p); - - return ret; } EXPORT_SYMBOL(unregister_blkdev); static struct kobj_map *bdev_map; +/** + * blk_mangle_minor - scatter minor numbers apart + * @minor: minor number to mangle + * + * Scatter consecutively allocated @minor number apart if MANGLE_DEVT + * is enabled. Mangling twice gives the original value. + * + * RETURNS: + * Mangled value. + * + * CONTEXT: + * Don't care. + */ +static int blk_mangle_minor(int minor) +{ +#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT + int i; + + for (i = 0; i < MINORBITS / 2; i++) { + int low = minor & (1 << i); + int high = minor & (1 << (MINORBITS - 1 - i)); + int distance = MINORBITS - 1 - 2 * i; + + minor ^= low | high; /* clear both bits */ + low <<= distance; /* swap the positions */ + high >>= distance; + minor |= low | high; /* and set */ + } +#endif + return minor; +} + +/** + * blk_alloc_devt - allocate a dev_t for a partition + * @part: partition to allocate dev_t for + * @devt: out parameter for resulting dev_t + * + * Allocate a dev_t for block device. + * + * RETURNS: + * 0 on success, allocated dev_t is returned in *@devt. -errno on + * failure. + * + * CONTEXT: + * Might sleep. + */ +int blk_alloc_devt(struct hd_struct *part, dev_t *devt) +{ + struct gendisk *disk = part_to_disk(part); + int idx, rc; + + /* in consecutive minor range? */ + if (part->partno < disk->minors) { + *devt = MKDEV(disk->major, disk->first_minor + part->partno); + return 0; + } + + /* allocate ext devt */ + do { + if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL)) + return -ENOMEM; + rc = idr_get_new(&ext_devt_idr, part, &idx); + } while (rc == -EAGAIN); + + if (rc) + return rc; + + if (idx > MAX_EXT_DEVT) { + idr_remove(&ext_devt_idr, idx); + return -EBUSY; + } + + *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); + return 0; +} + +/** + * blk_free_devt - free a dev_t + * @devt: dev_t to free + * + * Free @devt which was allocated using blk_alloc_devt(). + * + * CONTEXT: + * Might sleep. + */ +void blk_free_devt(dev_t devt) +{ + might_sleep(); + + if (devt == MKDEV(0, 0)) + return; + + if (MAJOR(devt) == BLOCK_EXT_MAJOR) { + mutex_lock(&ext_devt_mutex); + idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); + mutex_unlock(&ext_devt_mutex); + } +} + +static char *bdevt_str(dev_t devt, char *buf) +{ + if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) { + char tbuf[BDEVT_SIZE]; + snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt)); + snprintf(buf, BDEVT_SIZE, "%-9s", tbuf); + } else + snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt)); + + return buf; +} + /* * Register device numbers dev..(dev+range-1) * range must be nonzero * The hash chain is sorted on range, so that subranges can override. */ -void blk_register_region(dev_t dev, unsigned long range, struct module *module, +void blk_register_region(dev_t devt, unsigned long range, struct module *module, struct kobject *(*probe)(dev_t, int *, void *), int (*lock)(dev_t, void *), void *data) { - kobj_map(bdev_map, dev, range, module, probe, lock, data); + kobj_map(bdev_map, devt, range, module, probe, lock, data); } EXPORT_SYMBOL(blk_register_region); -void blk_unregister_region(dev_t dev, unsigned long range) +void blk_unregister_region(dev_t devt, unsigned long range) { - kobj_unmap(bdev_map, dev, range); + kobj_unmap(bdev_map, devt, range); } EXPORT_SYMBOL(blk_unregister_region); -static struct kobject *exact_match(dev_t dev, int *part, void *data) +static struct kobject *exact_match(dev_t devt, int *partno, void *data) { struct gendisk *p = data; - return &p->kobj; + + return &disk_to_dev(p)->kobj; } -static int exact_lock(dev_t dev, void *data) +static int exact_lock(dev_t devt, void *data) { struct gendisk *p = data; @@ -179,14 +510,47 @@ static int exact_lock(dev_t dev, void *data) * * This function registers the partitioning information in @disk * with the kernel. + * + * FIXME: error handling */ void add_disk(struct gendisk *disk) { + struct backing_dev_info *bdi; + dev_t devt; + int retval; + + /* minors == 0 indicates to use ext devt from part0 and should + * be accompanied with EXT_DEVT flag. Make sure all + * parameters make sense. + */ + WARN_ON(disk->minors && !(disk->major || disk->first_minor)); + WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT)); + disk->flags |= GENHD_FL_UP; - blk_register_region(MKDEV(disk->major, disk->first_minor), - disk->minors, NULL, exact_match, exact_lock, disk); + + retval = blk_alloc_devt(&disk->part0, &devt); + if (retval) { + WARN_ON(1); + return; + } + disk_to_dev(disk)->devt = devt; + + /* ->major and ->first_minor aren't supposed to be + * dereferenced from here on, but set them just in case. + */ + disk->major = MAJOR(devt); + disk->first_minor = MINOR(devt); + + blk_register_region(disk_devt(disk), disk->minors, NULL, + exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); + + bdi = &disk->queue->backing_dev_info; + bdi_register_dev(bdi, disk_devt(disk)); + retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, + "bdi"); + WARN_ON(retval); } EXPORT_SYMBOL(add_disk); @@ -194,448 +558,611 @@ EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ void unlink_gendisk(struct gendisk *disk) { + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); + bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); - blk_unregister_region(MKDEV(disk->major, disk->first_minor), - disk->minors); + blk_unregister_region(disk_devt(disk), disk->minors); } -#define to_disk(obj) container_of(obj,struct gendisk,kobj) - /** * get_gendisk - get partitioning information for a given device - * @dev: device to get partitioning information for + * @devt: device to get partitioning information for + * @partno: returned partition index * * This function gets the structure containing partitioning - * information for the given device @dev. + * information for the given device @devt. */ -struct gendisk *get_gendisk(dev_t dev, int *part) +struct gendisk *get_gendisk(dev_t devt, int *partno) { - struct kobject *kobj = kobj_lookup(bdev_map, dev, part); - return kobj ? to_disk(kobj) : NULL; + struct gendisk *disk = NULL; + + if (MAJOR(devt) != BLOCK_EXT_MAJOR) { + struct kobject *kobj; + + kobj = kobj_lookup(bdev_map, devt, partno); + if (kobj) + disk = dev_to_disk(kobj_to_dev(kobj)); + } else { + struct hd_struct *part; + + mutex_lock(&ext_devt_mutex); + part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); + if (part && get_disk(part_to_disk(part))) { + *partno = part->partno; + disk = part_to_disk(part); + } + mutex_unlock(&ext_devt_mutex); + } + + return disk; +} + +/** + * bdget_disk - do bdget() by gendisk and partition number + * @disk: gendisk of interest + * @partno: partition number + * + * Find partition @partno from @disk, do bdget() on it. + * + * CONTEXT: + * Don't care. + * + * RETURNS: + * Resulting block_device on success, NULL on failure. + */ +struct block_device *bdget_disk(struct gendisk *disk, int partno) +{ + struct hd_struct *part; + struct block_device *bdev = NULL; + + part = disk_get_part(disk, partno); + if (part) + bdev = bdget(part_devt(part)); + disk_put_part(part); + + return bdev; +} +EXPORT_SYMBOL(bdget_disk); + +/* + * print a full list of all partitions - intended for places where the root + * filesystem can't be mounted and thus to give the victim some idea of what + * went wrong + */ +void __init printk_all_partitions(void) +{ + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct disk_part_iter piter; + struct hd_struct *part; + char name_buf[BDEVNAME_SIZE]; + char devt_buf[BDEVT_SIZE]; + + /* + * Don't show empty devices or things that have been + * surpressed + */ + if (get_capacity(disk) == 0 || + (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) + continue; + + /* + * Note, unlike /proc/partitions, I am showing the + * numbers in hex - the same format as the root= + * option takes. + */ + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); + while ((part = disk_part_iter_next(&piter))) { + bool is_part0 = part == &disk->part0; + + printk("%s%s %10llu %s", is_part0 ? "" : " ", + bdevt_str(part_devt(part), devt_buf), + (unsigned long long)part->nr_sects >> 1, + disk_name(disk, part->partno, name_buf)); + if (is_part0) { + if (disk->driverfs_dev != NULL && + disk->driverfs_dev->driver != NULL) + printk(" driver: %s\n", + disk->driverfs_dev->driver->name); + else + printk(" (driver?)\n"); + } else + printk("\n"); + } + disk_part_iter_exit(&piter); + } + class_dev_iter_exit(&iter); } #ifdef CONFIG_PROC_FS /* iterator */ -static void *part_start(struct seq_file *part, loff_t *pos) +static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos) { - struct list_head *p; - loff_t l = *pos; + loff_t skip = *pos; + struct class_dev_iter *iter; + struct device *dev; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return ERR_PTR(-ENOMEM); + + seqf->private = iter; + class_dev_iter_init(iter, &block_class, NULL, &disk_type); + do { + dev = class_dev_iter_next(iter); + if (!dev) + return NULL; + } while (skip--); + + return dev_to_disk(dev); +} + +static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos) +{ + struct device *dev; + + (*pos)++; + dev = class_dev_iter_next(seqf->private); + if (dev) + return dev_to_disk(dev); - mutex_lock(&block_subsys_lock); - list_for_each(p, &block_subsys.kset.list) - if (!l--) - return list_entry(p, struct gendisk, kobj.entry); return NULL; } -static void *part_next(struct seq_file *part, void *v, loff_t *pos) +static void disk_seqf_stop(struct seq_file *seqf, void *v) { - struct list_head *p = ((struct gendisk *)v)->kobj.entry.next; - ++*pos; - return p==&block_subsys.kset.list ? NULL : - list_entry(p, struct gendisk, kobj.entry); + struct class_dev_iter *iter = seqf->private; + + /* stop is called even after start failed :-( */ + if (iter) { + class_dev_iter_exit(iter); + kfree(iter); + } } -static void part_stop(struct seq_file *part, void *v) +static void *show_partition_start(struct seq_file *seqf, loff_t *pos) { - mutex_unlock(&block_subsys_lock); + static void *p; + + p = disk_seqf_start(seqf, pos); + if (!IS_ERR(p) && p && !*pos) + seq_puts(seqf, "major minor #blocks name\n\n"); + return p; } -static int show_partition(struct seq_file *part, void *v) +static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; - int n; + struct disk_part_iter piter; + struct hd_struct *part; char buf[BDEVNAME_SIZE]; - if (&sgp->kobj.entry == block_subsys.kset.list.next) - seq_puts(part, "major minor #blocks name\n\n"); - /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || - (sgp->minors == 1 && (sgp->flags & GENHD_FL_REMOVABLE))) + if (!get_capacity(sgp) || (!disk_partitionable(sgp) && + (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) return 0; /* show the full disk and all non-0 size partitions of it */ - seq_printf(part, "%4d %4d %10llu %s\n", - sgp->major, sgp->first_minor, - (unsigned long long)get_capacity(sgp) >> 1, - disk_name(sgp, 0, buf)); - for (n = 0; n < sgp->minors - 1; n++) { - if (!sgp->part[n]) - continue; - if (sgp->part[n]->nr_sects == 0) - continue; - seq_printf(part, "%4d %4d %10llu %s\n", - sgp->major, n + 1 + sgp->first_minor, - (unsigned long long)sgp->part[n]->nr_sects >> 1 , - disk_name(sgp, n + 1, buf)); - } + disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0); + while ((part = disk_part_iter_next(&piter))) + seq_printf(seqf, "%4d %7d %10llu %s\n", + MAJOR(part_devt(part)), MINOR(part_devt(part)), + (unsigned long long)part->nr_sects >> 1, + disk_name(sgp, part->partno, buf)); + disk_part_iter_exit(&piter); return 0; } -struct seq_operations partitions_op = { - .start =part_start, - .next = part_next, - .stop = part_stop, - .show = show_partition +static const struct seq_operations partitions_op = { + .start = show_partition_start, + .next = disk_seqf_next, + .stop = disk_seqf_stop, + .show = show_partition }; -#endif +static int partitions_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &partitions_op); +} + +static const struct file_operations proc_partitions_operations = { + .open = partitions_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif -extern int blk_dev_init(void); -static struct kobject *base_probe(dev_t dev, int *part, void *data) +static struct kobject *base_probe(dev_t devt, int *partno, void *data) { - if (request_module("block-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) + if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) /* Make old-style 2.4 aliases work */ - request_module("block-major-%d", MAJOR(dev)); + request_module("block-major-%d", MAJOR(devt)); return NULL; } static int __init genhd_device_init(void) { - bdev_map = kobj_map_init(base_probe, &block_subsys_lock); + int error; + + block_class.dev_kobj = sysfs_dev_block_kobj; + error = class_register(&block_class); + if (unlikely(error)) + return error; + bdev_map = kobj_map_init(base_probe, &block_class_lock); blk_dev_init(); - subsystem_register(&block_subsys); + + register_blkdev(BLOCK_EXT_MAJOR, "blkext"); + +#ifndef CONFIG_SYSFS_DEPRECATED + /* create top-level block dir */ + block_depr = kobject_create_and_add("block", NULL); +#endif return 0; } subsys_initcall(genhd_device_init); - - -/* - * kobject & sysfs bindings for block devices - */ -static ssize_t disk_attr_show(struct kobject *kobj, struct attribute *attr, - char *page) +static ssize_t disk_range_show(struct device *dev, + struct device_attribute *attr, char *buf) { - struct gendisk *disk = to_disk(kobj); - struct disk_attribute *disk_attr = - container_of(attr,struct disk_attribute,attr); - ssize_t ret = -EIO; + struct gendisk *disk = dev_to_disk(dev); - if (disk_attr->show) - ret = disk_attr->show(disk,page); - return ret; + return sprintf(buf, "%d\n", disk->minors); } -static ssize_t disk_attr_store(struct kobject * kobj, struct attribute * attr, - const char *page, size_t count) +static ssize_t disk_ext_range_show(struct device *dev, + struct device_attribute *attr, char *buf) { - struct gendisk *disk = to_disk(kobj); - struct disk_attribute *disk_attr = - container_of(attr,struct disk_attribute,attr); - ssize_t ret = 0; + struct gendisk *disk = dev_to_disk(dev); - if (disk_attr->store) - ret = disk_attr->store(disk, page, count); - return ret; + return sprintf(buf, "%d\n", disk_max_parts(disk)); } -static struct sysfs_ops disk_sysfs_ops = { - .show = &disk_attr_show, - .store = &disk_attr_store, -}; - -static ssize_t disk_uevent_store(struct gendisk * disk, - const char *buf, size_t count) +static ssize_t disk_removable_show(struct device *dev, + struct device_attribute *attr, char *buf) { - kobject_uevent(&disk->kobj, KOBJ_ADD); - return count; -} -static ssize_t disk_dev_read(struct gendisk * disk, char *page) -{ - dev_t base = MKDEV(disk->major, disk->first_minor); - return print_dev_t(page, base); + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } -static ssize_t disk_range_read(struct gendisk * disk, char *page) + +static ssize_t disk_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) { - return sprintf(page, "%d\n", disk->minors); + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); } -static ssize_t disk_removable_read(struct gendisk * disk, char *page) + +static ssize_t disk_capability_show(struct device *dev, + struct device_attribute *attr, char *buf) { - return sprintf(page, "%d\n", - (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); + struct gendisk *disk = dev_to_disk(dev); + return sprintf(buf, "%x\n", disk->flags); } -static ssize_t disk_size_read(struct gendisk * disk, char *page) -{ - return sprintf(page, "%llu\n", (unsigned long long)get_capacity(disk)); -} - -static ssize_t disk_stats_read(struct gendisk * disk, char *page) -{ - preempt_disable(); - disk_round_stats(disk); - preempt_enable(); - return sprintf(page, - "%8lu %8lu %8llu %8u " - "%8lu %8lu %8llu %8u " - "%8u %8u %8u" - "\n", - disk_stat_read(disk, ios[READ]), - disk_stat_read(disk, merges[READ]), - (unsigned long long)disk_stat_read(disk, sectors[READ]), - jiffies_to_msecs(disk_stat_read(disk, ticks[READ])), - disk_stat_read(disk, ios[WRITE]), - disk_stat_read(disk, merges[WRITE]), - (unsigned long long)disk_stat_read(disk, sectors[WRITE]), - jiffies_to_msecs(disk_stat_read(disk, ticks[WRITE])), - disk->in_flight, - jiffies_to_msecs(disk_stat_read(disk, io_ticks)), - jiffies_to_msecs(disk_stat_read(disk, time_in_queue))); -} -static struct disk_attribute disk_attr_uevent = { - .attr = {.name = "uevent", .mode = S_IWUSR }, - .store = disk_uevent_store -}; -static struct disk_attribute disk_attr_dev = { - .attr = {.name = "dev", .mode = S_IRUGO }, - .show = disk_dev_read -}; -static struct disk_attribute disk_attr_range = { - .attr = {.name = "range", .mode = S_IRUGO }, - .show = disk_range_read -}; -static struct disk_attribute disk_attr_removable = { - .attr = {.name = "removable", .mode = S_IRUGO }, - .show = disk_removable_read -}; -static struct disk_attribute disk_attr_size = { - .attr = {.name = "size", .mode = S_IRUGO }, - .show = disk_size_read + +static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); +static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); +static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); +static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); +static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); +static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +#ifdef CONFIG_FAIL_MAKE_REQUEST +static struct device_attribute dev_attr_fail = + __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); +#endif +#ifdef CONFIG_FAIL_IO_TIMEOUT +static struct device_attribute dev_attr_fail_timeout = + __ATTR(io-timeout-fail, S_IRUGO|S_IWUSR, part_timeout_show, + part_timeout_store); +#endif + +static struct attribute *disk_attrs[] = { + &dev_attr_range.attr, + &dev_attr_ext_range.attr, + &dev_attr_removable.attr, + &dev_attr_ro.attr, + &dev_attr_size.attr, + &dev_attr_capability.attr, + &dev_attr_stat.attr, +#ifdef CONFIG_FAIL_MAKE_REQUEST + &dev_attr_fail.attr, +#endif +#ifdef CONFIG_FAIL_IO_TIMEOUT + &dev_attr_fail_timeout.attr, +#endif + NULL }; -static struct disk_attribute disk_attr_stat = { - .attr = {.name = "stat", .mode = S_IRUGO }, - .show = disk_stats_read + +static struct attribute_group disk_attr_group = { + .attrs = disk_attrs, }; -static struct attribute * default_attrs[] = { - &disk_attr_uevent.attr, - &disk_attr_dev.attr, - &disk_attr_range.attr, - &disk_attr_removable.attr, - &disk_attr_size.attr, - &disk_attr_stat.attr, - NULL, +static struct attribute_group *disk_attr_groups[] = { + &disk_attr_group, + NULL }; -static void disk_release(struct kobject * kobj) +static void disk_free_ptbl_rcu_cb(struct rcu_head *head) { - struct gendisk *disk = to_disk(kobj); - kfree(disk->random); - kfree(disk->part); - free_disk_stats(disk); - kfree(disk); -} + struct disk_part_tbl *ptbl = + container_of(head, struct disk_part_tbl, rcu_head); -static struct kobj_type ktype_block = { - .release = disk_release, - .sysfs_ops = &disk_sysfs_ops, - .default_attrs = default_attrs, -}; - -extern struct kobj_type ktype_part; + kfree(ptbl); +} -static int block_uevent_filter(struct kset *kset, struct kobject *kobj) +/** + * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way + * @disk: disk to replace part_tbl for + * @new_ptbl: new part_tbl to install + * + * Replace disk->part_tbl with @new_ptbl in RCU-safe way. The + * original ptbl is freed using RCU callback. + * + * LOCKING: + * Matching bd_mutx locked. + */ +static void disk_replace_part_tbl(struct gendisk *disk, + struct disk_part_tbl *new_ptbl) { - struct kobj_type *ktype = get_ktype(kobj); + struct disk_part_tbl *old_ptbl = disk->part_tbl; + + rcu_assign_pointer(disk->part_tbl, new_ptbl); - return ((ktype == &ktype_block) || (ktype == &ktype_part)); + if (old_ptbl) { + rcu_assign_pointer(old_ptbl->last_lookup, NULL); + call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); + } } -static int block_uevent(struct kset *kset, struct kobject *kobj, char **envp, - int num_envp, char *buffer, int buffer_size) +/** + * disk_expand_part_tbl - expand disk->part_tbl + * @disk: disk to expand part_tbl for + * @partno: expand such that this partno can fit in + * + * Expand disk->part_tbl such that @partno can fit in. disk->part_tbl + * uses RCU to allow unlocked dereferencing for stats and other stuff. + * + * LOCKING: + * Matching bd_mutex locked, might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int disk_expand_part_tbl(struct gendisk *disk, int partno) { - struct kobj_type *ktype = get_ktype(kobj); - struct device *physdev; - struct gendisk *disk; - struct hd_struct *part; - int length = 0; - int i = 0; - - if (ktype == &ktype_block) { - disk = container_of(kobj, struct gendisk, kobj); - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, - &length, "MINOR=%u", disk->first_minor); - } else if (ktype == &ktype_part) { - disk = container_of(kobj->parent, struct gendisk, kobj); - part = container_of(kobj, struct hd_struct, kobj); - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, - &length, "MINOR=%u", - disk->first_minor + part->partno); - } else + struct disk_part_tbl *old_ptbl = disk->part_tbl; + struct disk_part_tbl *new_ptbl; + int len = old_ptbl ? old_ptbl->len : 0; + int target = partno + 1; + size_t size; + int i; + + /* disk_max_parts() is zero during initialization, ignore if so */ + if (disk_max_parts(disk) && target > disk_max_parts(disk)) + return -EINVAL; + + if (target <= len) return 0; - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "MAJOR=%u", disk->major); - - /* add physical device, backing this device */ - physdev = disk->driverfs_dev; - if (physdev) { - char *path = kobject_get_path(&physdev->kobj, GFP_KERNEL); - - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, - &length, "PHYSDEVPATH=%s", path); - kfree(path); - - if (physdev->bus) - add_uevent_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "PHYSDEVBUS=%s", - physdev->bus->name); - - if (physdev->driver) - add_uevent_var(envp, num_envp, &i, - buffer, buffer_size, &length, - "PHYSDEVDRIVER=%s", - physdev->driver->name); - } + size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]); + new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id); + if (!new_ptbl) + return -ENOMEM; + + INIT_RCU_HEAD(&new_ptbl->rcu_head); + new_ptbl->len = target; - /* terminate, set to next free slot, shrink available space */ - envp[i] = NULL; - envp = &envp[i]; - num_envp -= i; - buffer = &buffer[length]; - buffer_size -= length; + for (i = 0; i < len; i++) + rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); + disk_replace_part_tbl(disk, new_ptbl); return 0; } -static struct kset_uevent_ops block_uevent_ops = { - .filter = block_uevent_filter, - .uevent = block_uevent, -}; +static void disk_release(struct device *dev) +{ + struct gendisk *disk = dev_to_disk(dev); -/* declare block_subsys. */ -static decl_subsys(block, &ktype_block, &block_uevent_ops); + kfree(disk->random); + disk_replace_part_tbl(disk, NULL); + free_part_stats(&disk->part0); + kfree(disk); +} +struct class block_class = { + .name = "block", +}; +static struct device_type disk_type = { + .name = "disk", + .groups = disk_attr_groups, + .release = disk_release, +}; +#ifdef CONFIG_PROC_FS /* * aggregate disk stat collector. Uses the same stats that the sysfs * entries do, above, but makes them available through one seq_file. - * Watching a few disks may be efficient through sysfs, but watching - * all of them will be more efficient through this interface. * * The output looks suspiciously like /proc/partitions with a bunch of * extra fields. */ - -/* iterator */ -static void *diskstats_start(struct seq_file *part, loff_t *pos) -{ - loff_t k = *pos; - struct list_head *p; - - mutex_lock(&block_subsys_lock); - list_for_each(p, &block_subsys.kset.list) - if (!k--) - return list_entry(p, struct gendisk, kobj.entry); - return NULL; -} - -static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos) -{ - struct list_head *p = ((struct gendisk *)v)->kobj.entry.next; - ++*pos; - return p==&block_subsys.kset.list ? NULL : - list_entry(p, struct gendisk, kobj.entry); -} - -static void diskstats_stop(struct seq_file *part, void *v) -{ - mutex_unlock(&block_subsys_lock); -} - -static int diskstats_show(struct seq_file *s, void *v) +static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; + struct disk_part_iter piter; + struct hd_struct *hd; char buf[BDEVNAME_SIZE]; - int n = 0; + int cpu; /* - if (&sgp->kobj.entry == block_subsys.kset.list.next) - seq_puts(s, "major minor name" + if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) + seq_puts(seqf, "major minor name" " rio rmerge rsect ruse wio wmerge " "wsect wuse running use aveq" "\n\n"); */ - preempt_disable(); - disk_round_stats(gp); - preempt_enable(); - seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", - gp->major, n + gp->first_minor, disk_name(gp, n, buf), - disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]), - (unsigned long long)disk_stat_read(gp, sectors[0]), - jiffies_to_msecs(disk_stat_read(gp, ticks[0])), - disk_stat_read(gp, ios[1]), disk_stat_read(gp, merges[1]), - (unsigned long long)disk_stat_read(gp, sectors[1]), - jiffies_to_msecs(disk_stat_read(gp, ticks[1])), - gp->in_flight, - jiffies_to_msecs(disk_stat_read(gp, io_ticks)), - jiffies_to_msecs(disk_stat_read(gp, time_in_queue))); - - /* now show all non-0 size partitions of it */ - for (n = 0; n < gp->minors - 1; n++) { - struct hd_struct *hd = gp->part[n]; - - if (hd && hd->nr_sects) - seq_printf(s, "%4d %4d %s %u %u %u %u\n", - gp->major, n + gp->first_minor + 1, - disk_name(gp, n + 1, buf), - hd->ios[0], hd->sectors[0], - hd->ios[1], hd->sectors[1]); + disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); + while ((hd = disk_part_iter_next(&piter))) { + cpu = part_stat_lock(); + part_round_stats(cpu, hd); + part_stat_unlock(); + seq_printf(seqf, "%4d %7d %s %lu %lu %llu " + "%u %lu %lu %llu %u %u %u %u\n", + MAJOR(part_devt(hd)), MINOR(part_devt(hd)), + disk_name(gp, hd->partno, buf), + part_stat_read(hd, ios[0]), + part_stat_read(hd, merges[0]), + (unsigned long long)part_stat_read(hd, sectors[0]), + jiffies_to_msecs(part_stat_read(hd, ticks[0])), + part_stat_read(hd, ios[1]), + part_stat_read(hd, merges[1]), + (unsigned long long)part_stat_read(hd, sectors[1]), + jiffies_to_msecs(part_stat_read(hd, ticks[1])), + hd->in_flight, + jiffies_to_msecs(part_stat_read(hd, io_ticks)), + jiffies_to_msecs(part_stat_read(hd, time_in_queue)) + ); } + disk_part_iter_exit(&piter); return 0; } -struct seq_operations diskstats_op = { - .start = diskstats_start, - .next = diskstats_next, - .stop = diskstats_stop, +static const struct seq_operations diskstats_op = { + .start = disk_seqf_start, + .next = disk_seqf_next, + .stop = disk_seqf_stop, .show = diskstats_show }; +static int diskstats_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &diskstats_op); +} + +static const struct file_operations proc_diskstats_operations = { + .open = diskstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_genhd_init(void) +{ + proc_create("diskstats", 0, NULL, &proc_diskstats_operations); + proc_create("partitions", 0, NULL, &proc_partitions_operations); + return 0; +} +module_init(proc_genhd_init); +#endif /* CONFIG_PROC_FS */ + +static void media_change_notify_thread(struct work_struct *work) +{ + struct gendisk *gd = container_of(work, struct gendisk, async_notify); + char event[] = "MEDIA_CHANGE=1"; + char *envp[] = { event, NULL }; + + /* + * set enviroment vars to indicate which event this is for + * so that user space will know to go check the media status. + */ + kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); + put_device(gd->driverfs_dev); +} + +#if 0 +void genhd_media_change_notify(struct gendisk *disk) +{ + get_device(disk->driverfs_dev); + schedule_work(&disk->async_notify); +} +EXPORT_SYMBOL_GPL(genhd_media_change_notify); +#endif /* 0 */ + +dev_t blk_lookup_devt(const char *name, int partno) +{ + dev_t devt = MKDEV(0, 0); + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct hd_struct *part; + + if (strcmp(dev_name(dev), name)) + continue; + + if (partno < disk->minors) { + /* We need to return the right devno, even + * if the partition doesn't exist yet. + */ + devt = MKDEV(MAJOR(dev->devt), + MINOR(dev->devt) + partno); + break; + } + part = disk_get_part(disk, partno); + if (part) { + devt = part_devt(part); + disk_put_part(part); + break; + } + disk_put_part(part); + } + class_dev_iter_exit(&iter); + return devt; +} +EXPORT_SYMBOL(blk_lookup_devt); + struct gendisk *alloc_disk(int minors) { return alloc_disk_node(minors, -1); } +EXPORT_SYMBOL(alloc_disk); struct gendisk *alloc_disk_node(int minors, int node_id) { struct gendisk *disk; - disk = kmalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); + disk = kmalloc_node(sizeof(struct gendisk), + GFP_KERNEL | __GFP_ZERO, node_id); if (disk) { - memset(disk, 0, sizeof(struct gendisk)); - if (!init_disk_stats(disk)) { + if (!init_part_stats(&disk->part0)) { kfree(disk); return NULL; } - if (minors > 1) { - int size = (minors - 1) * sizeof(struct hd_struct *); - disk->part = kmalloc_node(size, GFP_KERNEL, node_id); - if (!disk->part) { - kfree(disk); - return NULL; - } - memset(disk->part, 0, size); + disk->node_id = node_id; + if (disk_expand_part_tbl(disk, 0)) { + free_part_stats(&disk->part0); + kfree(disk); + return NULL; } + disk->part_tbl->part[0] = &disk->part0; + disk->minors = minors; - kobj_set_kset_s(disk,block_subsys); - kobject_init(&disk->kobj); rand_initialize_disk(disk); + disk_to_dev(disk)->class = &block_class; + disk_to_dev(disk)->type = &disk_type; + device_initialize(disk_to_dev(disk)); + INIT_WORK(&disk->async_notify, + media_change_notify_thread); } return disk; } - -EXPORT_SYMBOL(alloc_disk); EXPORT_SYMBOL(alloc_disk_node); struct kobject *get_disk(struct gendisk *disk) @@ -648,7 +1175,7 @@ struct kobject *get_disk(struct gendisk *disk) owner = disk->fops->owner; if (owner && !try_module_get(owner)) return NULL; - kobj = kobject_get(&disk->kobj); + kobj = kobject_get(&disk_to_dev(disk)->kobj); if (kobj == NULL) { module_put(owner); return NULL; @@ -662,27 +1189,28 @@ EXPORT_SYMBOL(get_disk); void put_disk(struct gendisk *disk) { if (disk) - kobject_put(&disk->kobj); + kobject_put(&disk_to_dev(disk)->kobj); } EXPORT_SYMBOL(put_disk); void set_device_ro(struct block_device *bdev, int flag) { - if (bdev->bd_contains != bdev) - bdev->bd_part->policy = flag; - else - bdev->bd_disk->policy = flag; + bdev->bd_part->policy = flag; } EXPORT_SYMBOL(set_device_ro); void set_disk_ro(struct gendisk *disk, int flag) { - int i; - disk->policy = flag; - for (i = 0; i < disk->minors - 1; i++) - if (disk->part[i]) disk->part[i]->policy = flag; + struct disk_part_iter piter; + struct hd_struct *part; + + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0); + while ((part = disk_part_iter_next(&piter))) + part->policy = flag; + disk_part_iter_exit(&piter); } EXPORT_SYMBOL(set_disk_ro); @@ -691,18 +1219,15 @@ int bdev_read_only(struct block_device *bdev) { if (!bdev) return 0; - else if (bdev->bd_contains != bdev) - return bdev->bd_part->policy; - else - return bdev->bd_disk->policy; + return bdev->bd_part->policy; } EXPORT_SYMBOL(bdev_read_only); -int invalidate_partition(struct gendisk *disk, int index) +int invalidate_partition(struct gendisk *disk, int partno) { int res = 0; - struct block_device *bdev = bdget_disk(disk, index); + struct block_device *bdev = bdget_disk(disk, partno); if (bdev) { fsync_bdev(bdev); res = __invalidate_device(bdev);