/*
* Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
#include "dm.h"
-#include "dm-bio-list.h"
+#include "dm-uevent.h"
#include <linux/init.h>
#include <linux/module.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/blktrace_api.h>
-#include <linux/smp_lock.h>
+#include <trace/block.h>
#define DM_MSG_PREFIX "core"
static DEFINE_SPINLOCK(_minor_lock);
/*
+ * For bio-based dm.
* One of these is allocated per bio.
*/
struct dm_io {
struct mapped_device *md;
int error;
- struct bio *bio;
atomic_t io_count;
+ struct bio *bio;
unsigned long start_time;
};
/*
+ * For bio-based dm.
* One of these is allocated per target within a bio. Hopefully
* this will be simplified out one day.
*/
union map_info info;
};
+DEFINE_TRACE(block_bio_complete);
+
+/*
+ * For request-based dm.
+ * One of these is allocated per request.
+ */
+struct dm_rq_target_io {
+ struct mapped_device *md;
+ struct dm_target *ti;
+ struct request *orig, clone;
+ int error;
+ union map_info info;
+};
+
+/*
+ * For request-based dm.
+ * One of these is allocated per bio.
+ */
+struct dm_rq_clone_bio_info {
+ struct bio *orig;
+ struct request *rq;
+};
+
union map_info *dm_get_mapinfo(struct bio *bio)
{
if (bio && bio->bi_private)
/*
* Bits for the md->flags field.
*/
-#define DMF_BLOCK_IO 0
+#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
+#define DMF_QUEUE_IO_TO_THREAD 6
+/*
+ * Work processed by per-device workqueue.
+ */
struct mapped_device {
struct rw_semaphore io_lock;
- struct semaphore suspend_lock;
- spinlock_t pushback_lock;
+ struct mutex suspend_lock;
rwlock_t map_lock;
atomic_t holders;
atomic_t open_count;
*/
atomic_t pending;
wait_queue_head_t wait;
+ struct work_struct work;
struct bio_list deferred;
- struct bio_list pushback;
+ spinlock_t deferred_lock;
+
+ /*
+ * An error from the barrier request currently being processed.
+ */
+ int barrier_error;
+
+ /*
+ * Processing queue (flush/barriers)
+ */
+ struct workqueue_struct *wq;
/*
* The current mapping.
*/
atomic_t event_nr;
wait_queue_head_t eventq;
+ atomic_t uevent_seq;
+ struct list_head uevent_list;
+ spinlock_t uevent_lock; /* Protect access to uevent_list */
/*
* freeze/thaw support require holding onto a super block
/* forced geometry settings */
struct hd_geometry geometry;
+
+ /* sysfs handle */
+ struct kobject kobj;
};
#define MIN_IOS 256
static struct kmem_cache *_io_cache;
static struct kmem_cache *_tio_cache;
+static struct kmem_cache *_rq_tio_cache;
+static struct kmem_cache *_rq_bio_info_cache;
static int __init local_init(void)
{
- int r;
+ int r = -ENOMEM;
/* allocate a slab for the dm_ios */
_io_cache = KMEM_CACHE(dm_io, 0);
if (!_io_cache)
- return -ENOMEM;
+ return r;
/* allocate a slab for the target ios */
_tio_cache = KMEM_CACHE(dm_target_io, 0);
- if (!_tio_cache) {
- kmem_cache_destroy(_io_cache);
- return -ENOMEM;
- }
+ if (!_tio_cache)
+ goto out_free_io_cache;
+
+ _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
+ if (!_rq_tio_cache)
+ goto out_free_tio_cache;
+
+ _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
+ if (!_rq_bio_info_cache)
+ goto out_free_rq_tio_cache;
+
+ r = dm_uevent_init();
+ if (r)
+ goto out_free_rq_bio_info_cache;
_major = major;
r = register_blkdev(_major, _name);
- if (r < 0) {
- kmem_cache_destroy(_tio_cache);
- kmem_cache_destroy(_io_cache);
- return r;
- }
+ if (r < 0)
+ goto out_uevent_exit;
if (!_major)
_major = r;
return 0;
+
+out_uevent_exit:
+ dm_uevent_exit();
+out_free_rq_bio_info_cache:
+ kmem_cache_destroy(_rq_bio_info_cache);
+out_free_rq_tio_cache:
+ kmem_cache_destroy(_rq_tio_cache);
+out_free_tio_cache:
+ kmem_cache_destroy(_tio_cache);
+out_free_io_cache:
+ kmem_cache_destroy(_io_cache);
+
+ return r;
}
static void local_exit(void)
{
+ kmem_cache_destroy(_rq_bio_info_cache);
+ kmem_cache_destroy(_rq_tio_cache);
kmem_cache_destroy(_tio_cache);
kmem_cache_destroy(_io_cache);
unregister_blkdev(_major, _name);
+ dm_uevent_exit();
_major = 0;
DMINFO("cleaned up");
}
-int (*_inits[])(void) __initdata = {
+static int (*_inits[])(void) __initdata = {
local_init,
dm_target_init,
dm_linear_init,
dm_stripe_init,
+ dm_kcopyd_init,
dm_interface_init,
};
-void (*_exits[])(void) = {
+static void (*_exits[])(void) = {
local_exit,
dm_target_exit,
dm_linear_exit,
dm_stripe_exit,
+ dm_kcopyd_exit,
dm_interface_exit,
};
/*
* Block device functions
*/
-static int dm_blk_open(struct inode *inode, struct file *file)
+static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
struct mapped_device *md;
spin_lock(&_minor_lock);
- md = inode->i_bdev->bd_disk->private_data;
+ md = bdev->bd_disk->private_data;
if (!md)
goto out;
return md ? 0 : -ENXIO;
}
-static int dm_blk_close(struct inode *inode, struct file *file)
+static int dm_blk_close(struct gendisk *disk, fmode_t mode)
{
- struct mapped_device *md;
-
- md = inode->i_bdev->bd_disk->private_data;
+ struct mapped_device *md = disk->private_data;
atomic_dec(&md->open_count);
dm_put(md);
return 0;
return dm_get_geometry(md, geo);
}
-static int dm_blk_ioctl(struct inode *inode, struct file *file,
+static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
- struct mapped_device *md;
- struct dm_table *map;
+ struct mapped_device *md = bdev->bd_disk->private_data;
+ struct dm_table *map = dm_get_table(md);
struct dm_target *tgt;
int r = -ENOTTY;
- /* We don't really need this lock, but we do need 'inode'. */
- unlock_kernel();
-
- md = inode->i_bdev->bd_disk->private_data;
-
- map = dm_get_table(md);
-
if (!map || !dm_table_get_size(map))
goto out;
}
if (tgt->type->ioctl)
- r = tgt->type->ioctl(tgt, inode, file, cmd, arg);
+ r = tgt->type->ioctl(tgt, cmd, arg);
out:
dm_table_put(map);
- lock_kernel();
return r;
}
static void start_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
+ int cpu;
io->start_time = jiffies;
- preempt_disable();
- disk_round_stats(dm_disk(md));
- preempt_enable();
- dm_disk(md)->in_flight = atomic_inc_return(&md->pending);
+ cpu = part_stat_lock();
+ part_round_stats(cpu, &dm_disk(md)->part0);
+ part_stat_unlock();
+ dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending);
}
-static int end_io_acct(struct dm_io *io)
+static void end_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
struct bio *bio = io->bio;
unsigned long duration = jiffies - io->start_time;
- int pending;
+ int pending, cpu;
int rw = bio_data_dir(bio);
- preempt_disable();
- disk_round_stats(dm_disk(md));
- preempt_enable();
- dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending);
+ cpu = part_stat_lock();
+ part_round_stats(cpu, &dm_disk(md)->part0);
+ part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
+ part_stat_unlock();
- disk_stat_add(dm_disk(md), ticks[rw], duration);
+ /*
+ * After this is decremented the bio must not be touched if it is
+ * a barrier.
+ */
+ dm_disk(md)->part0.in_flight = pending =
+ atomic_dec_return(&md->pending);
- return !pending;
+ /* nudge anyone waiting on suspend queue */
+ if (!pending)
+ wake_up(&md->wait);
}
/*
* Add the bio to the list of deferred io.
*/
-static int queue_io(struct mapped_device *md, struct bio *bio)
+static void queue_io(struct mapped_device *md, struct bio *bio)
{
down_write(&md->io_lock);
- if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
- up_write(&md->io_lock);
- return 1;
- }
-
+ spin_lock_irq(&md->deferred_lock);
bio_list_add(&md->deferred, bio);
+ spin_unlock_irq(&md->deferred_lock);
+
+ if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
+ queue_work(md->wq, &md->work);
up_write(&md->io_lock);
- return 0; /* deferred successfully */
}
/*
static void dec_pending(struct dm_io *io, int error)
{
unsigned long flags;
+ int io_error;
+ struct bio *bio;
+ struct mapped_device *md = io->md;
/* Push-back supersedes any I/O errors */
- if (error && !(io->error > 0 && __noflush_suspending(io->md)))
+ if (error && !(io->error > 0 && __noflush_suspending(md)))
io->error = error;
if (atomic_dec_and_test(&io->io_count)) {
if (io->error == DM_ENDIO_REQUEUE) {
/*
* Target requested pushing back the I/O.
- * This must be handled before the sleeper on
- * suspend queue merges the pushback list.
*/
- spin_lock_irqsave(&io->md->pushback_lock, flags);
- if (__noflush_suspending(io->md))
- bio_list_add(&io->md->pushback, io->bio);
+ spin_lock_irqsave(&md->deferred_lock, flags);
+ if (__noflush_suspending(md))
+ bio_list_add_head(&md->deferred, io->bio);
else
/* noflush suspend was interrupted. */
io->error = -EIO;
- spin_unlock_irqrestore(&io->md->pushback_lock, flags);
+ spin_unlock_irqrestore(&md->deferred_lock, flags);
}
- if (end_io_acct(io))
- /* nudge anyone waiting on suspend queue */
- wake_up(&io->md->wait);
+ io_error = io->error;
+ bio = io->bio;
+
+ if (bio_barrier(bio)) {
+ /*
+ * There can be just one barrier request so we use
+ * a per-device variable for error reporting.
+ * Note that you can't touch the bio after end_io_acct
+ */
+ md->barrier_error = io_error;
+ end_io_acct(io);
+ } else {
+ end_io_acct(io);
- if (io->error != DM_ENDIO_REQUEUE) {
- blk_add_trace_bio(io->md->queue, io->bio,
- BLK_TA_COMPLETE);
+ if (io_error != DM_ENDIO_REQUEUE) {
+ trace_block_bio_complete(md->queue, bio);
- bio_endio(io->bio, io->error);
+ bio_endio(bio, io_error);
+ }
}
- free_io(io->md, io);
+ free_io(md, io);
}
}
{
int r = 0;
struct dm_target_io *tio = bio->bi_private;
+ struct dm_io *io = tio->io;
struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
}
}
- dec_pending(tio->io, error);
-
/*
* Store md for cleanup instead of tio which is about to get freed.
*/
bio->bi_private = md->bs;
- bio_put(bio);
free_tio(md, tio);
+ bio_put(bio);
+ dec_pending(io, error);
}
static sector_t max_io_len(struct mapped_device *md,
if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
- blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
+ trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
tio->io->bio->bi_bdev->bd_dev,
clone->bi_sector, sector);
clone->bi_sector = sector;
clone->bi_bdev = bio->bi_bdev;
- clone->bi_rw = bio->bi_rw;
+ clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER);
clone->bi_vcnt = 1;
clone->bi_size = to_bytes(len);
clone->bi_io_vec->bv_offset = offset;
clone->bi_io_vec->bv_len = clone->bi_size;
+ clone->bi_flags |= 1 << BIO_CLONED;
+
+ if (bio_integrity(bio)) {
+ bio_integrity_clone(clone, bio, GFP_NOIO);
+ bio_integrity_trim(clone,
+ bio_sector_offset(bio, idx, offset), len);
+ }
return clone;
}
clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
__bio_clone(clone, bio);
+ clone->bi_rw &= ~(1 << BIO_RW_BARRIER);
clone->bi_destructor = dm_bio_destructor;
clone->bi_sector = sector;
clone->bi_idx = idx;
clone->bi_size = to_bytes(len);
clone->bi_flags &= ~(1 << BIO_SEG_VALID);
+ if (bio_integrity(bio)) {
+ bio_integrity_clone(clone, bio, GFP_NOIO);
+
+ if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
+ bio_integrity_trim(clone,
+ bio_sector_offset(bio, idx, 0), len);
+ }
+
return clone;
}
-static void __clone_and_map(struct clone_info *ci)
+static int __clone_and_map(struct clone_info *ci)
{
struct bio *clone, *bio = ci->bio;
- struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
- sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
+ struct dm_target *ti;
+ sector_t len = 0, max;
struct dm_target_io *tio;
+ ti = dm_table_find_target(ci->map, ci->sector);
+ if (!dm_target_is_valid(ti))
+ return -EIO;
+
+ max = max_io_len(ci->md, ci->sector, ti);
+
/*
* Allocate a target io object.
*/
do {
if (offset) {
ti = dm_table_find_target(ci->map, ci->sector);
+ if (!dm_target_is_valid(ti))
+ return -EIO;
+
max = max_io_len(ci->md, ci->sector, ti);
tio = alloc_tio(ci->md);
ci->idx++;
}
+
+ return 0;
}
/*
- * Split the bio into several clones.
+ * Split the bio into several clones and submit it to targets.
*/
-static void __split_bio(struct mapped_device *md, struct bio *bio)
+static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
struct clone_info ci;
+ int error = 0;
ci.map = dm_get_table(md);
- if (!ci.map) {
- bio_io_error(bio);
+ if (unlikely(!ci.map)) {
+ if (!bio_barrier(bio))
+ bio_io_error(bio);
+ else
+ md->barrier_error = -EIO;
return;
}
ci.idx = bio->bi_idx;
start_io_acct(ci.io);
- while (ci.sector_count)
- __clone_and_map(&ci);
+ while (ci.sector_count && !error)
+ error = __clone_and_map(&ci);
/* drop the extra reference count */
- dec_pending(ci.io, 0);
+ dec_pending(ci.io, error);
dm_table_put(ci.map);
}
/*-----------------------------------------------------------------
* CRUD END
*---------------------------------------------------------------*/
+static int dm_merge_bvec(struct request_queue *q,
+ struct bvec_merge_data *bvm,
+ struct bio_vec *biovec)
+{
+ struct mapped_device *md = q->queuedata;
+ struct dm_table *map = dm_get_table(md);
+ struct dm_target *ti;
+ sector_t max_sectors;
+ int max_size = 0;
+
+ if (unlikely(!map))
+ goto out;
+
+ ti = dm_table_find_target(map, bvm->bi_sector);
+ if (!dm_target_is_valid(ti))
+ goto out_table;
+
+ /*
+ * Find maximum amount of I/O that won't need splitting
+ */
+ max_sectors = min(max_io_len(md, bvm->bi_sector, ti),
+ (sector_t) BIO_MAX_SECTORS);
+ max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
+ if (max_size < 0)
+ max_size = 0;
+
+ /*
+ * merge_bvec_fn() returns number of bytes
+ * it can accept at this offset
+ * max is precomputed maximal io size
+ */
+ if (max_size && ti->type->merge)
+ max_size = ti->type->merge(ti, bvm, biovec, max_size);
+
+out_table:
+ dm_table_put(map);
+
+out:
+ /*
+ * Always allow an entire first page
+ */
+ if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
+ max_size = biovec->bv_len;
+
+ return max_size;
+}
+
/*
* The request function that just remaps the bio built up by
* dm_merge_bvec.
*/
static int dm_request(struct request_queue *q, struct bio *bio)
{
- int r;
int rw = bio_data_dir(bio);
struct mapped_device *md = q->queuedata;
-
- /*
- * There is no use in forwarding any barrier request since we can't
- * guarantee it is (or can be) handled by the targets correctly.
- */
- if (unlikely(bio_barrier(bio))) {
- bio_endio(bio, -EOPNOTSUPP);
- return 0;
- }
+ int cpu;
down_read(&md->io_lock);
- disk_stat_inc(dm_disk(md), ios[rw]);
- disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio));
+ cpu = part_stat_lock();
+ part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
+ part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
+ part_stat_unlock();
/*
- * If we're suspended we have to queue
- * this io for later.
+ * If we're suspended or the thread is processing barriers
+ * we have to queue this io for later.
*/
- while (test_bit(DMF_BLOCK_IO, &md->flags)) {
+ if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
+ unlikely(bio_barrier(bio))) {
up_read(&md->io_lock);
- if (bio_rw(bio) == READA) {
+ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
+ bio_rw(bio) == READA) {
bio_io_error(bio);
return 0;
}
- r = queue_io(md, bio);
- if (r < 0) {
- bio_io_error(bio);
- return 0;
-
- } else if (r == 0)
- return 0; /* deferred successfully */
+ queue_io(md, bio);
- /*
- * We're in a while loop, because someone could suspend
- * before we get to the following read lock.
- */
- down_read(&md->io_lock);
+ return 0;
}
- __split_bio(md, bio);
+ __split_and_process_bio(md, bio);
up_read(&md->io_lock);
return 0;
}
static int dm_any_congested(void *congested_data, int bdi_bits)
{
- int r;
- struct mapped_device *md = (struct mapped_device *) congested_data;
- struct dm_table *map = dm_get_table(md);
+ int r = bdi_bits;
+ struct mapped_device *md = congested_data;
+ struct dm_table *map;
- if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
- r = bdi_bits;
- else
- r = dm_table_any_congested(map, bdi_bits);
+ if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
+ map = dm_get_table(md);
+ if (map) {
+ r = dm_table_any_congested(map, bdi_bits);
+ dm_table_put(map);
+ }
+ }
- dm_table_put(map);
return r;
}
/*
* See if the device with a specific minor # is free.
*/
-static int specific_minor(struct mapped_device *md, int minor)
+static int specific_minor(int minor)
{
int r, m;
return r;
}
-static int next_free_minor(struct mapped_device *md, int *minor)
+static int next_free_minor(int *minor)
{
int r, m;
spin_lock(&_minor_lock);
r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
- if (r) {
+ if (r)
goto out;
- }
if (m >= (1 << MINORBITS)) {
idr_remove(&_minor_idr, m);
static struct block_device_operations dm_blk_dops;
+static void dm_wq_work(struct work_struct *work);
+
/*
* Allocate and initialise a blank device with a given minor.
*/
static struct mapped_device *alloc_dev(int minor)
{
int r;
- struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
+ struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
void *old_md;
if (!md) {
}
if (!try_module_get(THIS_MODULE))
- goto bad0;
+ goto bad_module_get;
/* get a minor number for the dev */
if (minor == DM_ANY_MINOR)
- r = next_free_minor(md, &minor);
+ r = next_free_minor(&minor);
else
- r = specific_minor(md, minor);
+ r = specific_minor(minor);
if (r < 0)
- goto bad1;
+ goto bad_minor;
- memset(md, 0, sizeof(*md));
init_rwsem(&md->io_lock);
- init_MUTEX(&md->suspend_lock);
- spin_lock_init(&md->pushback_lock);
+ mutex_init(&md->suspend_lock);
+ spin_lock_init(&md->deferred_lock);
rwlock_init(&md->map_lock);
atomic_set(&md->holders, 1);
atomic_set(&md->open_count, 0);
atomic_set(&md->event_nr, 0);
+ atomic_set(&md->uevent_seq, 0);
+ INIT_LIST_HEAD(&md->uevent_list);
+ spin_lock_init(&md->uevent_lock);
md->queue = blk_alloc_queue(GFP_KERNEL);
if (!md->queue)
- goto bad1_free_minor;
+ goto bad_queue;
md->queue->queuedata = md;
md->queue->backing_dev_info.congested_fn = dm_any_congested;
md->queue->backing_dev_info.congested_data = md;
blk_queue_make_request(md->queue, dm_request);
+ blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
md->queue->unplug_fn = dm_unplug_all;
+ blk_queue_merge_bvec(md->queue, dm_merge_bvec);
md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
if (!md->io_pool)
- goto bad2;
+ goto bad_io_pool;
md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
if (!md->tio_pool)
- goto bad3;
+ goto bad_tio_pool;
- md->bs = bioset_create(16, 16);
+ md->bs = bioset_create(16, 0);
if (!md->bs)
goto bad_no_bioset;
md->disk = alloc_disk(1);
if (!md->disk)
- goto bad4;
+ goto bad_disk;
atomic_set(&md->pending, 0);
init_waitqueue_head(&md->wait);
+ INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
md->disk->major = _major;
add_disk(md->disk);
format_dev_t(md->name, MKDEV(_major, minor));
+ md->wq = create_singlethread_workqueue("kdmflush");
+ if (!md->wq)
+ goto bad_thread;
+
/* Populate the mapping, nobody knows we exist yet */
spin_lock(&_minor_lock);
old_md = idr_replace(&_minor_idr, md, minor);
return md;
- bad4:
+bad_thread:
+ put_disk(md->disk);
+bad_disk:
bioset_free(md->bs);
- bad_no_bioset:
+bad_no_bioset:
mempool_destroy(md->tio_pool);
- bad3:
+bad_tio_pool:
mempool_destroy(md->io_pool);
- bad2:
+bad_io_pool:
blk_cleanup_queue(md->queue);
- bad1_free_minor:
+bad_queue:
free_minor(minor);
- bad1:
+bad_minor:
module_put(THIS_MODULE);
- bad0:
+bad_module_get:
kfree(md);
return NULL;
}
static void free_dev(struct mapped_device *md)
{
- int minor = md->disk->first_minor;
+ int minor = MINOR(disk_devt(md->disk));
if (md->suspended_bdev) {
unlock_fs(md);
bdput(md->suspended_bdev);
}
+ destroy_workqueue(md->wq);
mempool_destroy(md->tio_pool);
mempool_destroy(md->io_pool);
bioset_free(md->bs);
+ blk_integrity_unregister(md->disk);
del_gendisk(md->disk);
free_minor(minor);
*/
static void event_callback(void *context)
{
+ unsigned long flags;
+ LIST_HEAD(uevents);
struct mapped_device *md = (struct mapped_device *) context;
+ spin_lock_irqsave(&md->uevent_lock, flags);
+ list_splice_init(&md->uevent_list, &uevents);
+ spin_unlock_irqrestore(&md->uevent_lock, flags);
+
+ dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
+
atomic_inc(&md->event_nr);
wake_up(&md->eventq);
}
if (md->suspended_bdev)
__set_size(md, size);
- if (size == 0)
+
+ if (!size) {
+ dm_table_destroy(t);
return 0;
+ }
- dm_table_get(t);
dm_table_event_callback(t, event_callback, md);
write_lock(&md->map_lock);
write_lock(&md->map_lock);
md->map = NULL;
write_unlock(&md->map_lock);
- dm_table_put(map);
+ dm_table_destroy(map);
}
/*
if (!md)
return -ENXIO;
+ dm_sysfs_init(md);
+
*result = md;
return 0;
}
md = idr_find(&_minor_idr, minor);
if (md && (md == MINOR_ALLOCED ||
- (dm_disk(md)->first_minor != minor) ||
+ (MINOR(disk_devt(dm_disk(md))) != minor) ||
test_bit(DMF_FREEING, &md->flags))) {
md = NULL;
goto out;
if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
map = dm_get_table(md);
- idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor);
+ idr_replace(&_minor_idr, MINOR_ALLOCED,
+ MINOR(disk_devt(dm_disk(md))));
set_bit(DMF_FREEING, &md->flags);
spin_unlock(&_minor_lock);
if (!dm_suspended(md)) {
dm_table_presuspend_targets(map);
dm_table_postsuspend_targets(map);
}
- __unbind(md);
+ dm_sysfs_exit(md);
dm_table_put(map);
+ __unbind(md);
free_dev(md);
}
}
EXPORT_SYMBOL_GPL(dm_put);
+static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
+{
+ int r = 0;
+ DECLARE_WAITQUEUE(wait, current);
+
+ dm_unplug_all(md->queue);
+
+ add_wait_queue(&md->wait, &wait);
+
+ while (1) {
+ set_current_state(interruptible);
+
+ smp_mb();
+ if (!atomic_read(&md->pending))
+ break;
+
+ if (interruptible == TASK_INTERRUPTIBLE &&
+ signal_pending(current)) {
+ r = -EINTR;
+ break;
+ }
+
+ io_schedule();
+ }
+ set_current_state(TASK_RUNNING);
+
+ remove_wait_queue(&md->wait, &wait);
+
+ return r;
+}
+
+static int dm_flush(struct mapped_device *md)
+{
+ dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+ return 0;
+}
+
+static void process_barrier(struct mapped_device *md, struct bio *bio)
+{
+ int error = dm_flush(md);
+
+ if (unlikely(error)) {
+ bio_endio(bio, error);
+ return;
+ }
+ if (bio_empty_barrier(bio)) {
+ bio_endio(bio, 0);
+ return;
+ }
+
+ __split_and_process_bio(md, bio);
+
+ error = dm_flush(md);
+
+ if (!error && md->barrier_error)
+ error = md->barrier_error;
+
+ if (md->barrier_error != DM_ENDIO_REQUEUE)
+ bio_endio(bio, error);
+}
+
/*
* Process the deferred bios
*/
-static void __flush_deferred_io(struct mapped_device *md, struct bio *c)
+static void dm_wq_work(struct work_struct *work)
{
- struct bio *n;
+ struct mapped_device *md = container_of(work, struct mapped_device,
+ work);
+ struct bio *c;
+
+ down_write(&md->io_lock);
+
+ while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
+ spin_lock_irq(&md->deferred_lock);
+ c = bio_list_pop(&md->deferred);
+ spin_unlock_irq(&md->deferred_lock);
+
+ if (!c) {
+ clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
+ break;
+ }
+
+ up_write(&md->io_lock);
+
+ if (bio_barrier(c))
+ process_barrier(md, c);
+ else
+ __split_and_process_bio(md, c);
- while (c) {
- n = c->bi_next;
- c->bi_next = NULL;
- __split_bio(md, c);
- c = n;
+ down_write(&md->io_lock);
}
+
+ up_write(&md->io_lock);
+}
+
+static void dm_queue_flush(struct mapped_device *md)
+{
+ clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+ smp_mb__after_clear_bit();
+ queue_work(md->wq, &md->work);
}
/*
{
int r = -EINVAL;
- down(&md->suspend_lock);
+ mutex_lock(&md->suspend_lock);
/* device must be suspended */
if (!dm_suspended(md))
r = __bind(md, table);
out:
- up(&md->suspend_lock);
+ mutex_unlock(&md->suspend_lock);
return r;
}
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
struct dm_table *map = NULL;
- unsigned long flags;
- DECLARE_WAITQUEUE(wait, current);
- struct bio *def;
- int r = -EINVAL;
+ int r = 0;
int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
- down(&md->suspend_lock);
+ mutex_lock(&md->suspend_lock);
- if (dm_suspended(md))
+ if (dm_suspended(md)) {
+ r = -EINVAL;
goto out_unlock;
+ }
map = dm_get_table(md);
if (!md->suspended_bdev) {
DMWARN("bdget failed in dm_suspend");
r = -ENOMEM;
- goto flush_and_out;
+ goto out;
}
- }
- /*
- * Flush I/O to the device.
- * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os.
- */
- if (do_lockfs && !noflush) {
- r = lock_fs(md);
- if (r)
- goto out;
+ /*
+ * Flush I/O to the device. noflush supersedes do_lockfs,
+ * because lock_fs() needs to flush I/Os.
+ */
+ if (do_lockfs) {
+ r = lock_fs(md);
+ if (r)
+ goto out;
+ }
}
/*
- * First we set the BLOCK_IO flag so no more ios will be mapped.
+ * Here we must make sure that no processes are submitting requests
+ * to target drivers i.e. no one may be executing
+ * __split_and_process_bio. This is called from dm_request and
+ * dm_wq_work.
+ *
+ * To get all processes out of __split_and_process_bio in dm_request,
+ * we take the write lock. To prevent any process from reentering
+ * __split_and_process_bio from dm_request, we set
+ * DMF_QUEUE_IO_TO_THREAD.
+ *
+ * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
+ * and call flush_workqueue(md->wq). flush_workqueue will wait until
+ * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
+ * further calls to __split_and_process_bio from dm_wq_work.
*/
down_write(&md->io_lock);
- set_bit(DMF_BLOCK_IO, &md->flags);
-
- add_wait_queue(&md->wait, &wait);
+ set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+ set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
up_write(&md->io_lock);
- /* unplug */
- if (map)
- dm_table_unplug_all(map);
+ flush_workqueue(md->wq);
/*
- * Then we wait for the already mapped ios to
- * complete.
+ * At this point no more requests are entering target request routines.
+ * We call dm_wait_for_completion to wait for all existing requests
+ * to finish.
*/
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (!atomic_read(&md->pending) || signal_pending(current))
- break;
-
- io_schedule();
- }
- set_current_state(TASK_RUNNING);
+ r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
down_write(&md->io_lock);
- remove_wait_queue(&md->wait, &wait);
-
- if (noflush) {
- spin_lock_irqsave(&md->pushback_lock, flags);
+ if (noflush)
clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
- bio_list_merge_head(&md->deferred, &md->pushback);
- bio_list_init(&md->pushback);
- spin_unlock_irqrestore(&md->pushback_lock, flags);
- }
+ up_write(&md->io_lock);
/* were we interrupted ? */
- r = -EINTR;
- if (atomic_read(&md->pending)) {
- clear_bit(DMF_BLOCK_IO, &md->flags);
- def = bio_list_get(&md->deferred);
- __flush_deferred_io(md, def);
- up_write(&md->io_lock);
+ if (r < 0) {
+ dm_queue_flush(md);
+
unlock_fs(md);
goto out; /* pushback list is already flushed, so skip flush */
}
- up_write(&md->io_lock);
+
+ /*
+ * If dm_wait_for_completion returned 0, the device is completely
+ * quiescent now. There is no request-processing activity. All new
+ * requests are being added to md->deferred list.
+ */
dm_table_postsuspend_targets(map);
set_bit(DMF_SUSPENDED, &md->flags);
- r = 0;
-
-flush_and_out:
- if (r && noflush) {
- /*
- * Because there may be already I/Os in the pushback list,
- * flush them before return.
- */
- down_write(&md->io_lock);
-
- spin_lock_irqsave(&md->pushback_lock, flags);
- clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
- bio_list_merge_head(&md->deferred, &md->pushback);
- bio_list_init(&md->pushback);
- spin_unlock_irqrestore(&md->pushback_lock, flags);
-
- def = bio_list_get(&md->deferred);
- __flush_deferred_io(md, def);
- up_write(&md->io_lock);
- }
-
out:
if (r && md->suspended_bdev) {
bdput(md->suspended_bdev);
dm_table_put(map);
out_unlock:
- up(&md->suspend_lock);
+ mutex_unlock(&md->suspend_lock);
return r;
}
int dm_resume(struct mapped_device *md)
{
int r = -EINVAL;
- struct bio *def;
struct dm_table *map = NULL;
- down(&md->suspend_lock);
+ mutex_lock(&md->suspend_lock);
if (!dm_suspended(md))
goto out;
if (r)
goto out;
- down_write(&md->io_lock);
- clear_bit(DMF_BLOCK_IO, &md->flags);
-
- def = bio_list_get(&md->deferred);
- __flush_deferred_io(md, def);
- up_write(&md->io_lock);
+ dm_queue_flush(md);
unlock_fs(md);
dm_table_unplug_all(map);
- kobject_uevent(&md->disk->kobj, KOBJ_CHANGE);
+ dm_kobject_uevent(md);
r = 0;
out:
dm_table_put(map);
- up(&md->suspend_lock);
+ mutex_unlock(&md->suspend_lock);
return r;
}
/*-----------------------------------------------------------------
* Event notification.
*---------------------------------------------------------------*/
+void dm_kobject_uevent(struct mapped_device *md)
+{
+ kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE);
+}
+
+uint32_t dm_next_uevent_seq(struct mapped_device *md)
+{
+ return atomic_add_return(1, &md->uevent_seq);
+}
+
uint32_t dm_get_event_nr(struct mapped_device *md)
{
return atomic_read(&md->event_nr);
(event_nr != atomic_read(&md->event_nr)));
}
+void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&md->uevent_lock, flags);
+ list_add(elist, &md->uevent_list);
+ spin_unlock_irqrestore(&md->uevent_lock, flags);
+}
+
/*
* The gendisk is only valid as long as you have a reference
* count on 'md'.
return md->disk;
}
+struct kobject *dm_kobject(struct mapped_device *md)
+{
+ return &md->kobj;
+}
+
+/*
+ * struct mapped_device should not be exported outside of dm.c
+ * so use this check to verify that kobj is part of md structure
+ */
+struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
+{
+ struct mapped_device *md;
+
+ md = container_of(kobj, struct mapped_device, kobj);
+ if (&md->kobj != kobj)
+ return NULL;
+
+ dm_get(md);
+ return md;
+}
+
int dm_suspended(struct mapped_device *md)
{
return test_bit(DMF_SUSPENDED, &md->flags);