dm: export suspended state to targets
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index eee28fa..3167480 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -47,6 +47,7 @@ struct dm_io {
        atomic_t io_count;
        struct bio *bio;
        unsigned long start_time;
+       spinlock_t endio_lock;
 };
 
 /*
@@ -142,9 +143,19 @@ struct mapped_device {
        int barrier_error;
 
        /*
+        * Protect barrier_error from concurrent endio processing
+        * in request-based dm.
+        */
+       spinlock_t barrier_error_lock;
+
+       /*
         * Processing queue (flush/barriers)
         */
        struct workqueue_struct *wq;
+       struct work_struct barrier_work;
+
+       /* A pointer to the currently processing pre/post flush request */
+       struct request *flush_request;
 
        /*
         * The current mapping.
@@ -177,9 +188,6 @@ struct mapped_device {
        /* forced geometry settings */
        struct hd_geometry geometry;
 
-       /* marker of flush suspend for request-based dm */
-       struct request suspend_rq;
-
        /* For saving the address of __make_request for request based dm */
        make_request_fn *saved_make_request_fn;
 
@@ -274,6 +282,7 @@ static int (*_inits[])(void) __initdata = {
        dm_target_init,
        dm_linear_init,
        dm_stripe_init,
+       dm_io_init,
        dm_kcopyd_init,
        dm_interface_init,
 };
@@ -283,6 +292,7 @@ static void (*_exits[])(void) = {
        dm_target_exit,
        dm_linear_exit,
        dm_stripe_exit,
+       dm_io_exit,
        dm_kcopyd_exit,
        dm_interface_exit,
 };
@@ -319,6 +329,11 @@ static void __exit dm_exit(void)
 /*
  * Block device functions
  */
+int dm_deleting_md(struct mapped_device *md)
+{
+       return test_bit(DMF_DELETING, &md->flags);
+}
+
 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 {
        struct mapped_device *md;
@@ -330,7 +345,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
                goto out;
 
        if (test_bit(DMF_FREEING, &md->flags) ||
-           test_bit(DMF_DELETING, &md->flags)) {
+           dm_deleting_md(md)) {
                md = NULL;
                goto out;
        }
@@ -387,7 +402,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned int cmd, unsigned long arg)
 {
        struct mapped_device *md = bdev->bd_disk->private_data;
-       struct dm_table *map = dm_get_table(md);
+       struct dm_table *map = dm_get_live_table(md);
        struct dm_target *tgt;
        int r = -ENOTTY;
 
@@ -400,7 +415,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 
        tgt = dm_table_get_target(map, 0);
 
-       if (dm_suspended(md)) {
+       if (dm_suspended_md(md)) {
                r = -EAGAIN;
                goto out;
        }
@@ -429,9 +444,10 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
        mempool_free(tio, md->tio_pool);
 }
 
-static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
+static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
+                                           gfp_t gfp_mask)
 {
-       return mempool_alloc(md->tio_pool, GFP_ATOMIC);
+       return mempool_alloc(md->tio_pool, gfp_mask);
 }
 
 static void free_rq_tio(struct dm_rq_target_io *tio)
@@ -449,6 +465,12 @@ static void free_bio_info(struct dm_rq_clone_bio_info *info)
        mempool_free(info, info->tio->md->io_pool);
 }
 
+static int md_in_flight(struct mapped_device *md)
+{
+       return atomic_read(&md->pending[READ]) +
+              atomic_read(&md->pending[WRITE]);
+}
+
 static void start_io_acct(struct dm_io *io)
 {
        struct mapped_device *md = io->md;
@@ -511,7 +533,7 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
  * function to access the md->map field, and make sure they call
  * dm_table_put() when finished.
  */
-struct dm_table *dm_get_table(struct mapped_device *md)
+struct dm_table *dm_get_live_table(struct mapped_device *md)
 {
        struct dm_table *t;
        unsigned long flags;
@@ -578,8 +600,12 @@ static void dec_pending(struct dm_io *io, int error)
        struct mapped_device *md = io->md;
 
        /* Push-back supersedes any I/O errors */
-       if (error && !(io->error > 0 && __noflush_suspending(md)))
-               io->error = error;
+       if (unlikely(error)) {
+               spin_lock_irqsave(&io->endio_lock, flags);
+               if (!(io->error > 0 && __noflush_suspending(md)))
+                       io->error = error;
+               spin_unlock_irqrestore(&io->endio_lock, flags);
+       }
 
        if (atomic_dec_and_test(&io->io_count)) {
                if (io->error == DM_ENDIO_REQUEUE) {
@@ -711,28 +737,38 @@ static void end_clone_bio(struct bio *clone, int error)
        blk_update_request(tio->orig, 0, nr_bytes);
 }
 
+static void store_barrier_error(struct mapped_device *md, int error)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&md->barrier_error_lock, flags);
+       /*
+        * Basically, the first error is taken, but:
+        *   -EOPNOTSUPP supersedes any I/O error.
+        *   Requeue request supersedes any I/O error but -EOPNOTSUPP.
+        */
+       if (!md->barrier_error || error == -EOPNOTSUPP ||
+           (md->barrier_error != -EOPNOTSUPP &&
+            error == DM_ENDIO_REQUEUE))
+               md->barrier_error = error;
+       spin_unlock_irqrestore(&md->barrier_error_lock, flags);
+}
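
The precedence rule above is worth stating operationally: the first error is kept, -EOPNOTSUPP overrides anything, and DM_ENDIO_REQUEUE overrides plain I/O errors but not -EOPNOTSUPP. A small userspace mirror of the logic (not kernel code; DM_ENDIO_REQUEUE's value of 1 is taken from device-mapper.h) behaves as follows:

    #include <assert.h>
    #include <errno.h>

    #define DM_ENDIO_REQUEUE 1	/* same positive value as in device-mapper.h */

    /* Mirror of the update rule in store_barrier_error(). */
    static int merge_barrier_error(int cur, int next)
    {
    	if (!cur || next == -EOPNOTSUPP ||
    	    (cur != -EOPNOTSUPP && next == DM_ENDIO_REQUEUE))
    		return next;
    	return cur;
    }

    int main(void)
    {
    	assert(merge_barrier_error(0, -EIO) == -EIO);		/* first error sticks */
    	assert(merge_barrier_error(-EIO, -ENOSPC) == -EIO);	/* later I/O errors ignored */
    	assert(merge_barrier_error(-EIO, DM_ENDIO_REQUEUE) == DM_ENDIO_REQUEUE);
    	assert(merge_barrier_error(-EOPNOTSUPP, DM_ENDIO_REQUEUE) == -EOPNOTSUPP);
    	assert(merge_barrier_error(-EIO, -EOPNOTSUPP) == -EOPNOTSUPP);
    	return 0;
    }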
+
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
  * Or do dm_get() before calling this function and dm_put() later.
  */
-static void rq_completed(struct mapped_device *md, int run_queue)
+static void rq_completed(struct mapped_device *md, int rw, int run_queue)
 {
-       int wakeup_waiters = 0;
-       struct request_queue *q = md->queue;
-       unsigned long flags;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       if (!queue_in_flight(q))
-               wakeup_waiters = 1;
-       spin_unlock_irqrestore(q->queue_lock, flags);
+       atomic_dec(&md->pending[rw]);
 
        /* nudge anyone waiting on suspend queue */
-       if (wakeup_waiters)
+       if (!md_in_flight(md))
                wake_up(&md->wait);
 
        if (run_queue)
-               blk_run_queue(q);
+               blk_run_queue(md->queue);
 
        /*
         * dm_put() must be at the end of this function. See the comment above
@@ -748,6 +784,44 @@ static void free_rq_clone(struct request *clone)
        free_rq_tio(tio);
 }
 
+/*
+ * Complete the clone and the original request.
+ * Must be called without queue lock.
+ */
+static void dm_end_request(struct request *clone, int error)
+{
+       int rw = rq_data_dir(clone);
+       int run_queue = 1;
+       bool is_barrier = blk_barrier_rq(clone);
+       struct dm_rq_target_io *tio = clone->end_io_data;
+       struct mapped_device *md = tio->md;
+       struct request *rq = tio->orig;
+
+       if (blk_pc_request(rq) && !is_barrier) {
+               rq->errors = clone->errors;
+               rq->resid_len = clone->resid_len;
+
+               if (rq->sense)
+                       /*
+                        * We are using the sense buffer of the original
+                        * request.
+                        * So setting the length of the sense data is enough.
+                        */
+                       rq->sense_len = clone->sense_len;
+       }
+
+       free_rq_clone(clone);
+
+       if (unlikely(is_barrier)) {
+               if (unlikely(error))
+                       store_barrier_error(md, error);
+               run_queue = 0;
+       } else
+               blk_end_request_all(rq, error);
+
+       rq_completed(md, rw, run_queue);
+}
+
 static void dm_unprep_request(struct request *rq)
 {
        struct request *clone = rq->special;
@@ -763,12 +837,23 @@ static void dm_unprep_request(struct request *rq)
  */
 void dm_requeue_unmapped_request(struct request *clone)
 {
+       int rw = rq_data_dir(clone);
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct mapped_device *md = tio->md;
        struct request *rq = tio->orig;
        struct request_queue *q = rq->q;
        unsigned long flags;
 
+       if (unlikely(blk_barrier_rq(clone))) {
+               /*
+                * Barrier clones share an original request.
+                * Leave it to dm_end_request(), which handles this special
+                * case.
+                */
+               dm_end_request(clone, DM_ENDIO_REQUEUE);
+               return;
+       }
+
        dm_unprep_request(rq);
 
        spin_lock_irqsave(q->queue_lock, flags);
@@ -777,7 +862,7 @@ void dm_requeue_unmapped_request(struct request *clone)
        blk_requeue_request(q, rq);
        spin_unlock_irqrestore(q->queue_lock, flags);
 
-       rq_completed(md, 0);
+       rq_completed(md, rw, 0);
 }
 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
 
@@ -810,34 +895,28 @@ static void start_queue(struct request_queue *q)
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-/*
- * Complete the clone and the original request.
- * Must be called without queue lock.
- */
-static void dm_end_request(struct request *clone, int error)
+static void dm_done(struct request *clone, int error, bool mapped)
 {
+       int r = error;
        struct dm_rq_target_io *tio = clone->end_io_data;
-       struct mapped_device *md = tio->md;
-       struct request *rq = tio->orig;
+       dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
 
-       if (blk_pc_request(rq)) {
-               rq->errors = clone->errors;
-               rq->resid_len = clone->resid_len;
+       if (mapped && rq_end_io)
+               r = rq_end_io(tio->ti, clone, error, &tio->info);
 
-               if (rq->sense)
-                       /*
-                        * We are using the sense buffer of the original
-                        * request.
-                        * So setting the length of the sense data is enough.
-                        */
-                       rq->sense_len = clone->sense_len;
+       if (r <= 0)
+               /* The target wants to complete the I/O */
+               dm_end_request(clone, r);
+       else if (r == DM_ENDIO_INCOMPLETE)
+               /* The target will handle the I/O */
+               return;
+       else if (r == DM_ENDIO_REQUEUE)
+               /* The target wants to requeue the I/O */
+               dm_requeue_unmapped_request(clone);
+       else {
+               DMWARN("unimplemented target endio return value: %d", r);
+               BUG();
        }
-
-       free_rq_clone(clone);
-
-       blk_end_request_all(rq, error);
-
-       rq_completed(md, 1);
 }
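
dm_done() is now the single place that interprets a target's rq_end_io return value. For illustration, a hedged sketch of such a hook that matches the dm_request_endio_fn signature used above; example_rq_target, retries_left and the retry policy are invented:

    struct example_rq_target {			/* hypothetical target-private data */
    	unsigned retries_left;
    };

    /* <= 0 completes the original request with that status, DM_ENDIO_REQUEUE
     * asks dm to requeue it, DM_ENDIO_INCOMPLETE means the target will finish
     * the request itself later. */
    static int example_rq_end_io(struct dm_target *ti, struct request *clone,
    			     int error, union map_info *map_context)
    {
    	struct example_rq_target *et = ti->private;

    	if (error == -EIO && et->retries_left) {
    		et->retries_left--;
    		return DM_ENDIO_REQUEUE;	/* dm_done() -> dm_requeue_unmapped_request() */
    	}

    	return error;			/* dm_done() -> dm_end_request(clone, error) */
    }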
 
 /*
@@ -845,27 +924,14 @@ static void dm_end_request(struct request *clone, int error)
  */
 static void dm_softirq_done(struct request *rq)
 {
+       bool mapped = true;
        struct request *clone = rq->completion_data;
        struct dm_rq_target_io *tio = clone->end_io_data;
-       dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
-       int error = tio->error;
 
-       if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io)
-               error = rq_end_io(tio->ti, clone, error, &tio->info);
+       if (rq->cmd_flags & REQ_FAILED)
+               mapped = false;
 
-       if (error <= 0)
-               /* The target wants to complete the I/O */
-               dm_end_request(clone, error);
-       else if (error == DM_ENDIO_INCOMPLETE)
-               /* The target will handle the I/O */
-               return;
-       else if (error == DM_ENDIO_REQUEUE)
-               /* The target wants to requeue the I/O */
-               dm_requeue_unmapped_request(clone);
-       else {
-               DMWARN("unimplemented target endio return value: %d", error);
-               BUG();
-       }
+       dm_done(clone, tio->error, mapped);
 }
 
 /*
@@ -877,6 +943,19 @@ static void dm_complete_request(struct request *clone, int error)
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct request *rq = tio->orig;
 
+       if (unlikely(blk_barrier_rq(clone))) {
+               /*
+                * Barrier clones share an original request.  So can't use
+                * softirq_done with the original.
+                * Pass the clone to dm_done() directly in this special case.
+                * It is safe (even if clone->q->queue_lock is held here)
+                * because there is no I/O dispatching during the completion
+                * of barrier clone.
+                */
+               dm_done(clone, error, true);
+               return;
+       }
+
        tio->error = error;
        rq->completion_data = clone;
        blk_complete_request(rq);
@@ -893,6 +972,17 @@ void dm_kill_unmapped_request(struct request *clone, int error)
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct request *rq = tio->orig;
 
+       if (unlikely(blk_barrier_rq(clone))) {
+               /*
+                * Barrier clones share an original request.
+                * Leave it to dm_end_request(), which handles this special
+                * case.
+                */
+               BUG_ON(error > 0);
+               dm_end_request(clone, error);
+               return;
+       }
+
        rq->cmd_flags |= REQ_FAILED;
        dm_complete_request(clone, error);
 }
@@ -1209,7 +1299,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
        struct clone_info ci;
        int error = 0;
 
-       ci.map = dm_get_table(md);
+       ci.map = dm_get_live_table(md);
        if (unlikely(!ci.map)) {
                if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
                        bio_io_error(bio);
@@ -1226,6 +1316,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
        atomic_set(&ci.io->io_count, 1);
        ci.io->bio = bio;
        ci.io->md = md;
+       spin_lock_init(&ci.io->endio_lock);
        ci.sector = bio->bi_sector;
        ci.sector_count = bio_sectors(bio);
        if (unlikely(bio_empty_barrier(bio)))
@@ -1249,7 +1340,7 @@ static int dm_merge_bvec(struct request_queue *q,
                         struct bio_vec *biovec)
 {
        struct mapped_device *md = q->queuedata;
-       struct dm_table *map = dm_get_table(md);
+       struct dm_table *map = dm_get_live_table(md);
        struct dm_target *ti;
        sector_t max_sectors;
        int max_size = 0;
@@ -1346,11 +1437,6 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
 {
        struct mapped_device *md = q->queuedata;
 
-       if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
-               bio_endio(bio, -EOPNOTSUPP);
-               return 0;
-       }
-
        return md->saved_make_request_fn(q, bio); /* call __make_request() */
 }
 
@@ -1369,6 +1455,25 @@ static int dm_request(struct request_queue *q, struct bio *bio)
        return _dm_request(q, bio);
 }
 
+/*
+ * Mark this request as flush request, so that dm_request_fn() can
+ * recognize.
+ */
+static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq)
+{
+       rq->cmd_type = REQ_TYPE_LINUX_BLOCK;
+       rq->cmd[0] = REQ_LB_OP_FLUSH;
+}
+
+static bool dm_rq_is_flush_request(struct request *rq)
+{
+       if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+           rq->cmd[0] == REQ_LB_OP_FLUSH)
+               return true;
+       else
+               return false;
+}
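
Condensed from later hunks in this patch, the flush plumbing is a pair: registration of the hook at queue-init time, and detection in the request function, which defers the marked request to the new barrier work item instead of the normal clone-and-map path:

    /* in alloc_dev(): have the block layer emit pre/post flush requests and
     * run dm_rq_prepare_flush() on each of them */
    blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH, dm_rq_prepare_flush);

    /* in dm_request_fn(): recognize those requests and hand them off */
    if (unlikely(dm_rq_is_flush_request(rq))) {
    	BUG_ON(md->flush_request);
    	md->flush_request = rq;
    	blk_start_request(rq);
    	queue_work(md->wq, &md->barrier_work);
    }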
+
 void dm_dispatch_request(struct request *rq)
 {
        int r;
@@ -1414,25 +1519,54 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
 static int setup_clone(struct request *clone, struct request *rq,
                       struct dm_rq_target_io *tio)
 {
-       int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
-                                 dm_rq_bio_constructor, tio);
+       int r;
 
-       if (r)
-               return r;
+       if (dm_rq_is_flush_request(rq)) {
+               blk_rq_init(NULL, clone);
+               clone->cmd_type = REQ_TYPE_FS;
+               clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
+       } else {
+               r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+                                     dm_rq_bio_constructor, tio);
+               if (r)
+                       return r;
+
+               clone->cmd = rq->cmd;
+               clone->cmd_len = rq->cmd_len;
+               clone->sense = rq->sense;
+               clone->buffer = rq->buffer;
+       }
 
-       clone->cmd = rq->cmd;
-       clone->cmd_len = rq->cmd_len;
-       clone->sense = rq->sense;
-       clone->buffer = rq->buffer;
        clone->end_io = end_clone_request;
        clone->end_io_data = tio;
 
        return 0;
 }
 
-static int dm_rq_flush_suspending(struct mapped_device *md)
+static struct request *clone_rq(struct request *rq, struct mapped_device *md,
+                               gfp_t gfp_mask)
 {
-       return !md->suspend_rq.special;
+       struct request *clone;
+       struct dm_rq_target_io *tio;
+
+       tio = alloc_rq_tio(md, gfp_mask);
+       if (!tio)
+               return NULL;
+
+       tio->md = md;
+       tio->ti = NULL;
+       tio->orig = rq;
+       tio->error = 0;
+       memset(&tio->info, 0, sizeof(tio->info));
+
+       clone = &tio->clone;
+       if (setup_clone(clone, rq, tio)) {
+               /* -ENOMEM */
+               free_rq_tio(tio);
+               return NULL;
+       }
+
+       return clone;
 }
 
 /*
@@ -1441,51 +1575,30 @@ static int dm_rq_flush_suspending(struct mapped_device *md)
 static int dm_prep_fn(struct request_queue *q, struct request *rq)
 {
        struct mapped_device *md = q->queuedata;
-       struct dm_rq_target_io *tio;
        struct request *clone;
 
-       if (unlikely(rq == &md->suspend_rq)) {
-               if (dm_rq_flush_suspending(md))
-                       return BLKPREP_OK;
-               else
-                       /* The flush suspend was interrupted */
-                       return BLKPREP_KILL;
-       }
+       if (unlikely(dm_rq_is_flush_request(rq)))
+               return BLKPREP_OK;
 
        if (unlikely(rq->special)) {
                DMWARN("Already has something in rq->special.");
                return BLKPREP_KILL;
        }
 
-       tio = alloc_rq_tio(md); /* Only one for each original request */
-       if (!tio)
-               /* -ENOMEM */
+       clone = clone_rq(rq, md, GFP_ATOMIC);
+       if (!clone)
                return BLKPREP_DEFER;
 
-       tio->md = md;
-       tio->ti = NULL;
-       tio->orig = rq;
-       tio->error = 0;
-       memset(&tio->info, 0, sizeof(tio->info));
-
-       clone = &tio->clone;
-       if (setup_clone(clone, rq, tio)) {
-               /* -ENOMEM */
-               free_rq_tio(tio);
-               return BLKPREP_DEFER;
-       }
-
        rq->special = clone;
        rq->cmd_flags |= REQ_DONTPREP;
 
        return BLKPREP_OK;
 }
 
-static void map_request(struct dm_target *ti, struct request *rq,
+static void map_request(struct dm_target *ti, struct request *clone,
                        struct mapped_device *md)
 {
        int r;
-       struct request *clone = rq->special;
        struct dm_rq_target_io *tio = clone->end_io_data;
 
        /*
@@ -1505,6 +1618,8 @@ static void map_request(struct dm_target *ti, struct request *rq,
                break;
        case DM_MAPIO_REMAPPED:
                /* The target has remapped the I/O so dispatch it */
+               trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
+                                    blk_rq_pos(tio->orig));
                dm_dispatch_request(clone);
                break;
        case DM_MAPIO_REQUEUE:
@@ -1530,29 +1645,26 @@ static void map_request(struct dm_target *ti, struct request *rq,
 static void dm_request_fn(struct request_queue *q)
 {
        struct mapped_device *md = q->queuedata;
-       struct dm_table *map = dm_get_table(md);
+       struct dm_table *map = dm_get_live_table(md);
        struct dm_target *ti;
-       struct request *rq;
+       struct request *rq, *clone;
 
        /*
-        * For noflush suspend, check blk_queue_stopped() to immediately
-        * quit I/O dispatching.
+        * For suspend, check blk_queue_stopped() and increment
+        * ->pending within a single queue_lock not to increment the
+        * number of in-flight I/Os after the queue is stopped in
+        * dm_suspend().
         */
        while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
                rq = blk_peek_request(q);
                if (!rq)
                        goto plug_and_out;
 
-               if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */
-                       if (queue_in_flight(q))
-                               /* Not quiet yet.  Wait more */
-                               goto plug_and_out;
-
-                       /* This device should be quiet now */
-                       __stop_queue(q);
+               if (unlikely(dm_rq_is_flush_request(rq))) {
+                       BUG_ON(md->flush_request);
+                       md->flush_request = rq;
                        blk_start_request(rq);
-                       __blk_end_request_all(rq, 0);
-                       wake_up(&md->wait);
+                       queue_work(md->wq, &md->barrier_work);
                        goto out;
                }
 
@@ -1561,8 +1673,11 @@ static void dm_request_fn(struct request_queue *q)
                        goto plug_and_out;
 
                blk_start_request(rq);
+               clone = rq->special;
+               atomic_inc(&md->pending[rq_data_dir(clone)]);
+
                spin_unlock(q->queue_lock);
-               map_request(ti, rq, md);
+               map_request(ti, clone, md);
                spin_lock_irq(q->queue_lock);
        }
 
@@ -1589,7 +1704,7 @@ static int dm_lld_busy(struct request_queue *q)
 {
        int r;
        struct mapped_device *md = q->queuedata;
-       struct dm_table *map = dm_get_table(md);
+       struct dm_table *map = dm_get_live_table(md);
 
        if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
                r = 1;
@@ -1604,7 +1719,7 @@ static int dm_lld_busy(struct request_queue *q)
 static void dm_unplug_all(struct request_queue *q)
 {
        struct mapped_device *md = q->queuedata;
-       struct dm_table *map = dm_get_table(md);
+       struct dm_table *map = dm_get_live_table(md);
 
        if (map) {
                if (dm_request_based(md))
@@ -1622,7 +1737,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
        struct dm_table *map;
 
        if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
-               map = dm_get_table(md);
+               map = dm_get_live_table(md);
                if (map) {
                        /*
                         * Request-based dm cares about only own queue for
@@ -1716,9 +1831,10 @@ out:
        return r;
 }
 
-static struct block_device_operations dm_blk_dops;
+static const struct block_device_operations dm_blk_dops;
 
 static void dm_wq_work(struct work_struct *work);
+static void dm_rq_barrier_work(struct work_struct *work);
 
 /*
  * Allocate and initialise a blank device with a given minor.
@@ -1748,6 +1864,7 @@ static struct mapped_device *alloc_dev(int minor)
        init_rwsem(&md->io_lock);
        mutex_init(&md->suspend_lock);
        spin_lock_init(&md->deferred_lock);
+       spin_lock_init(&md->barrier_error_lock);
        rwlock_init(&md->map_lock);
        atomic_set(&md->holders, 1);
        atomic_set(&md->open_count, 0);
@@ -1782,6 +1899,8 @@ static struct mapped_device *alloc_dev(int minor)
        blk_queue_softirq_done(md->queue, dm_softirq_done);
        blk_queue_prep_rq(md->queue, dm_prep_fn);
        blk_queue_lld_busy(md->queue, dm_lld_busy);
+       blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH,
+                         dm_rq_prepare_flush);
 
        md->disk = alloc_disk(1);
        if (!md->disk)
@@ -1791,6 +1910,7 @@ static struct mapped_device *alloc_dev(int minor)
        atomic_set(&md->pending[1], 0);
        init_waitqueue_head(&md->wait);
        INIT_WORK(&md->work, dm_wq_work);
+       INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
        init_waitqueue_head(&md->eventq);
 
        md->disk->major = _major;
@@ -1822,6 +1942,7 @@ static struct mapped_device *alloc_dev(int minor)
 bad_bdev:
        destroy_workqueue(md->wq);
 bad_thread:
+       del_gendisk(md->disk);
        put_disk(md->disk);
 bad_disk:
        blk_cleanup_queue(md->queue);
@@ -1914,9 +2035,13 @@ static void __set_size(struct mapped_device *md, sector_t size)
        mutex_unlock(&md->bdev->bd_inode->i_mutex);
 }
 
-static int __bind(struct mapped_device *md, struct dm_table *t,
-                 struct queue_limits *limits)
+/*
+ * Returns old map, which caller must destroy.
+ */
+static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
+                              struct queue_limits *limits)
 {
+       struct dm_table *old_map;
        struct request_queue *q = md->queue;
        sector_t size;
        unsigned long flags;
@@ -1931,11 +2056,6 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
 
        __set_size(md, size);
 
-       if (!size) {
-               dm_table_destroy(t);
-               return 0;
-       }
-
        dm_table_event_callback(t, event_callback, md);
 
        /*
@@ -1951,26 +2071,31 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
        __bind_mempools(md, t);
 
        write_lock_irqsave(&md->map_lock, flags);
+       old_map = md->map;
        md->map = t;
        dm_table_set_restrictions(t, q, limits);
        write_unlock_irqrestore(&md->map_lock, flags);
 
-       return 0;
+       return old_map;
 }
 
-static void __unbind(struct mapped_device *md)
+/*
+ * Returns unbound table for the caller to free.
+ */
+static struct dm_table *__unbind(struct mapped_device *md)
 {
        struct dm_table *map = md->map;
        unsigned long flags;
 
        if (!map)
-               return;
+               return NULL;
 
        dm_table_event_callback(map, NULL, NULL);
        write_lock_irqsave(&md->map_lock, flags);
        md->map = NULL;
        write_unlock_irqrestore(&md->map_lock, flags);
-       dm_table_destroy(map);
+
+       return map;
 }
 
 /*
@@ -2052,18 +2177,18 @@ void dm_put(struct mapped_device *md)
        BUG_ON(test_bit(DMF_FREEING, &md->flags));
 
        if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
-               map = dm_get_table(md);
+               map = dm_get_live_table(md);
                idr_replace(&_minor_idr, MINOR_ALLOCED,
                            MINOR(disk_devt(dm_disk(md))));
                set_bit(DMF_FREEING, &md->flags);
                spin_unlock(&_minor_lock);
-               if (!dm_suspended(md)) {
+               if (!dm_suspended_md(md)) {
                        dm_table_presuspend_targets(map);
                        dm_table_postsuspend_targets(map);
                }
                dm_sysfs_exit(md);
                dm_table_put(map);
-               __unbind(md);
+               dm_table_destroy(__unbind(md));
                free_dev(md);
        }
 }
@@ -2073,8 +2198,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 {
        int r = 0;
        DECLARE_WAITQUEUE(wait, current);
-       struct request_queue *q = md->queue;
-       unsigned long flags;
 
        dm_unplug_all(md->queue);
 
@@ -2084,15 +2207,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
                set_current_state(interruptible);
 
                smp_mb();
-               if (dm_request_based(md)) {
-                       spin_lock_irqsave(q->queue_lock, flags);
-                       if (!queue_in_flight(q) && blk_queue_stopped(q)) {
-                               spin_unlock_irqrestore(q->queue_lock, flags);
-                               break;
-                       }
-                       spin_unlock_irqrestore(q->queue_lock, flags);
-               } else if (!atomic_read(&md->pending[0]) &&
-                                       !atomic_read(&md->pending[1]))
+               if (!md_in_flight(md))
                        break;
 
                if (interruptible == TASK_INTERRUPTIBLE &&
@@ -2187,98 +2302,106 @@ static void dm_queue_flush(struct mapped_device *md)
        queue_work(md->wq, &md->work);
 }
 
-/*
- * Swap in a new table (destroying old one).
- */
-int dm_swap_table(struct mapped_device *md, struct dm_table *table)
+static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
 {
-       struct queue_limits limits;
-       int r = -EINVAL;
+       struct dm_rq_target_io *tio = clone->end_io_data;
 
-       mutex_lock(&md->suspend_lock);
+       tio->info.flush_request = flush_nr;
+}
 
-       /* device must be suspended */
-       if (!dm_suspended(md))
-               goto out;
+/* Issue barrier requests to targets and wait for their completion. */
+static int dm_rq_barrier(struct mapped_device *md)
+{
+       int i, j;
+       struct dm_table *map = dm_get_live_table(md);
+       unsigned num_targets = dm_table_get_num_targets(map);
+       struct dm_target *ti;
+       struct request *clone;
 
-       r = dm_calculate_queue_limits(table, &limits);
-       if (r)
-               goto out;
+       md->barrier_error = 0;
 
-       /* cannot change the device type, once a table is bound */
-       if (md->map &&
-           (dm_table_get_type(md->map) != dm_table_get_type(table))) {
-               DMWARN("can't change the device type after a table is bound");
-               goto out;
+       for (i = 0; i < num_targets; i++) {
+               ti = dm_table_get_target(map, i);
+               for (j = 0; j < ti->num_flush_requests; j++) {
+                       clone = clone_rq(md->flush_request, md, GFP_NOIO);
+                       dm_rq_set_flush_nr(clone, j);
+                       atomic_inc(&md->pending[rq_data_dir(clone)]);
+                       map_request(ti, clone, md);
+               }
        }
 
-       __unbind(md);
-       r = __bind(md, table, &limits);
-
-out:
-       mutex_unlock(&md->suspend_lock);
-       return r;
-}
+       dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+       dm_table_put(map);
 
-static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
-{
-       md->suspend_rq.special = (void *)0x1;
+       return md->barrier_error;
 }
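
Each target therefore sees ti->num_flush_requests barrier clones, distinguished only by info.flush_request. A hedged sketch of how a request-based target might consume that number in its map_rq hook; the example_fanout target, its device array, and the one-device-per-flush policy are all hypothetical:

    #define EXAMPLE_NR_DEVS 2

    struct example_fanout {			/* hypothetical target-private data */
    	struct dm_dev *dev[EXAMPLE_NR_DEVS];	/* ctr sets num_flush_requests = EXAMPLE_NR_DEVS */
    };

    static int example_map_rq(struct dm_target *ti, struct request *clone,
    			  union map_info *map_context)
    {
    	struct example_fanout *ef = ti->private;
    	unsigned idx = 0;

    	if (blk_barrier_rq(clone))
    		/* dm_rq_set_flush_nr() recorded which flush copy this is */
    		idx = map_context->flush_request;

    	clone->q = bdev_get_queue(ef->dev[idx]->bdev);
    	clone->rq_disk = ef->dev[idx]->bdev->bd_disk;

    	return DM_MAPIO_REMAPPED;		/* map_request() dispatches the clone */
    }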
 
-static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
+static void dm_rq_barrier_work(struct work_struct *work)
 {
+       int error;
+       struct mapped_device *md = container_of(work, struct mapped_device,
+                                               barrier_work);
        struct request_queue *q = md->queue;
+       struct request *rq;
        unsigned long flags;
 
-       spin_lock_irqsave(q->queue_lock, flags);
-       if (!noflush)
-               dm_rq_invalidate_suspend_marker(md);
-       __start_queue(q);
-       spin_unlock_irqrestore(q->queue_lock, flags);
-}
+       /*
+        * Hold the md reference here and leave it at the last part so that
+        * the md can't be deleted by device opener when the barrier request
+        * completes.
+        */
+       dm_get(md);
 
-static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
-{
-       struct request *rq = &md->suspend_rq;
-       struct request_queue *q = md->queue;
+       error = dm_rq_barrier(md);
 
-       if (noflush)
-               stop_queue(q);
-       else {
-               blk_rq_init(q, rq);
-               blk_insert_request(q, rq, 0, NULL);
-       }
+       rq = md->flush_request;
+       md->flush_request = NULL;
+
+       if (error == DM_ENDIO_REQUEUE) {
+               spin_lock_irqsave(q->queue_lock, flags);
+               blk_requeue_request(q, rq);
+               spin_unlock_irqrestore(q->queue_lock, flags);
+       } else
+               blk_end_request_all(rq, error);
+
+       blk_run_queue(q);
+
+       dm_put(md);
 }
 
-static int dm_rq_suspend_available(struct mapped_device *md, int noflush)
+/*
+ * Swap in a new table, returning the old one for the caller to destroy.
+ */
+struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 {
-       int r = 1;
-       struct request *rq = &md->suspend_rq;
-       struct request_queue *q = md->queue;
-       unsigned long flags;
+       struct dm_table *map = ERR_PTR(-EINVAL);
+       struct queue_limits limits;
+       int r;
 
-       if (noflush)
-               return r;
+       mutex_lock(&md->suspend_lock);
 
-       /* The marker must be protected by queue lock if it is in use */
-       spin_lock_irqsave(q->queue_lock, flags);
-       if (unlikely(rq->ref_count)) {
-               /*
-                * This can happen, when the previous flush suspend was
-                * interrupted, the marker is still in the queue and
-                * this flush suspend has been invoked, because we don't
-                * remove the marker at the time of suspend interruption.
-                * We have only one marker per mapped_device, so we can't
-                * start another flush suspend while it is in use.
-                */
-               BUG_ON(!rq->special); /* The marker should be invalidated */
-               DMWARN("Invalidating the previous flush suspend is still in"
-                      " progress.  Please retry later.");
-               r = 0;
+       /* device must be suspended */
+       if (!dm_suspended_md(md))
+               goto out;
+
+       r = dm_calculate_queue_limits(table, &limits);
+       if (r) {
+               map = ERR_PTR(r);
+               goto out;
        }
-       spin_unlock_irqrestore(q->queue_lock, flags);
 
-       return r;
+       /* cannot change the device type, once a table is bound */
+       if (md->map &&
+           (dm_table_get_type(md->map) != dm_table_get_type(table))) {
+               DMWARN("can't change the device type after a table is bound");
+               goto out;
+       }
+
+       map = __bind(md, table, &limits);
+
+out:
+       mutex_unlock(&md->suspend_lock);
+       return map;
 }
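
With __bind()/__unbind() now returning tables instead of destroying them, dm_swap_table()'s caller owns whatever comes back. An illustrative calling pattern (not the actual dm-ioctl code):

    static int example_swap(struct mapped_device *md, struct dm_table *new_map)
    {
    	struct dm_table *old_map = dm_swap_table(md, new_map);

    	if (IS_ERR(old_map)) {
    		dm_table_destroy(new_map);	/* swap refused; new table is still ours */
    		return PTR_ERR(old_map);
    	}

    	if (old_map)
    		dm_table_destroy(old_map);	/* previous live table, if there was one */

    	return 0;
    }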
 
 /*
@@ -2323,49 +2446,11 @@ static void unlock_fs(struct mapped_device *md)
 /*
  * Suspend mechanism in request-based dm.
  *
- * After the suspend starts, further incoming requests are kept in
- * the request_queue and deferred.
- * Remaining requests in the request_queue at the start of suspend are flushed
- * if it is flush suspend.
- * The suspend completes when the following conditions have been satisfied,
- * so wait for it:
- *    1. q->in_flight is 0 (which means no in_flight request)
- *    2. queue has been stopped (which means no request dispatching)
- *
+ * 1. Flush all I/Os by lock_fs() if needed.
+ * 2. Stop dispatching any I/O by stopping the request_queue.
+ * 3. Wait for all in-flight I/Os to be completed or requeued.
  *
- * Noflush suspend
- * ---------------
- * Noflush suspend doesn't need to dispatch remaining requests.
- * So stop the queue immediately.  Then, wait for all in_flight requests
- * to be completed or requeued.
- *
- * To abort noflush suspend, start the queue.
- *
- *
- * Flush suspend
- * -------------
- * Flush suspend needs to dispatch remaining requests.  So stop the queue
- * after the remaining requests are completed. (Requeued request must be also
- * re-dispatched and completed.  Until then, we can't stop the queue.)
- *
- * During flushing the remaining requests, further incoming requests are also
- * inserted to the same queue.  To distinguish which requests are to be
- * flushed, we insert a marker request to the queue at the time of starting
- * flush suspend, like a barrier.
- * The dispatching is blocked when the marker is found on the top of the queue.
- * And the queue is stopped when all in_flight requests are completed, since
- * that means the remaining requests are completely flushed.
- * Then, the marker is removed from the queue.
- *
- * To abort flush suspend, we also need to take care of the marker, not only
- * starting the queue.
- * We don't remove the marker forcibly from the queue since it's against
- * the block-layer manner.  Instead, we put a invalidated mark on the marker.
- * When the invalidated marker is found on the top of the queue, it is
- * immediately removed from the queue, so it doesn't block dispatching.
- * Because we have only one marker per mapped_device, we can't start another
- * flush suspend until the invalidated marker is removed from the queue.
- * So fail and return with -EBUSY in such a case.
+ * To abort suspend, start the request_queue.
  */
 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 {
@@ -2376,17 +2461,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
        mutex_lock(&md->suspend_lock);
 
-       if (dm_suspended(md)) {
+       if (dm_suspended_md(md)) {
                r = -EINVAL;
                goto out_unlock;
        }
 
-       if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) {
-               r = -EBUSY;
-               goto out_unlock;
-       }
-
-       map = dm_get_table(md);
+       map = dm_get_live_table(md);
 
        /*
         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2399,8 +2479,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
        dm_table_presuspend_targets(map);
 
        /*
-        * Flush I/O to the device. noflush supersedes do_lockfs,
-        * because lock_fs() needs to flush I/Os.
+        * Flush I/O to the device.
+        * Any I/O submitted after lock_fs() may not be flushed.
+        * noflush takes precedence over do_lockfs.
+        * (lock_fs() flushes I/Os and waits for them to complete.)
         */
        if (!noflush && do_lockfs) {
                r = lock_fs(md);
@@ -2429,10 +2511,15 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
        set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
        up_write(&md->io_lock);
 
-       flush_workqueue(md->wq);
-
+       /*
+        * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
+        * can be kicked until md->queue is stopped.  So stop md->queue before
+        * flushing md->wq.
+        */
        if (dm_request_based(md))
-               dm_rq_start_suspend(md, noflush);
+               stop_queue(md->queue);
+
+       flush_workqueue(md->wq);
 
        /*
         * At this point no more requests are entering target request routines.
@@ -2451,7 +2538,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
                dm_queue_flush(md);
 
                if (dm_request_based(md))
-                       dm_rq_abort_suspend(md, noflush);
+                       start_queue(md->queue);
 
                unlock_fs(md);
                goto out; /* pushback list is already flushed, so skip flush */
@@ -2463,10 +2550,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
         * requests are being added to md->deferred list.
         */
 
-       dm_table_postsuspend_targets(map);
-
        set_bit(DMF_SUSPENDED, &md->flags);
 
+       dm_table_postsuspend_targets(map);
+
 out:
        dm_table_put(map);
 
@@ -2481,10 +2568,10 @@ int dm_resume(struct mapped_device *md)
        struct dm_table *map = NULL;
 
        mutex_lock(&md->suspend_lock);
-       if (!dm_suspended(md))
+       if (!dm_suspended_md(md))
                goto out;
 
-       map = dm_get_table(md);
+       map = dm_get_live_table(md);
        if (!map || !dm_table_get_size(map))
                goto out;
 
@@ -2585,18 +2672,29 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
                return NULL;
 
        if (test_bit(DMF_FREEING, &md->flags) ||
-           test_bit(DMF_DELETING, &md->flags))
+           dm_deleting_md(md))
                return NULL;
 
        dm_get(md);
        return md;
 }
 
-int dm_suspended(struct mapped_device *md)
+int dm_suspended_md(struct mapped_device *md)
 {
        return test_bit(DMF_SUSPENDED, &md->flags);
 }
 
+int dm_suspended(struct dm_target *ti)
+{
+       struct mapped_device *md = dm_table_get_md(ti->table);
+       int r = dm_suspended_md(md);
+
+       dm_put(md);
+
+       return r;
+}
+EXPORT_SYMBOL_GPL(dm_suspended);
+
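This new dm_suspended(ti) is what the patch subject refers to: a target can query the suspended state through its own dm_target rather than needing the mapped_device. A hedged sketch of a use in a hypothetical target's deferred worker (struct example_target and the park-until-resume policy are invented):

    struct example_target {		/* hypothetical target-private data */
    	struct dm_target *ti;
    	struct work_struct worker;
    };

    static void example_do_work(struct work_struct *work)
    {
    	struct example_target *et = container_of(work, struct example_target,
    						 worker);

    	if (dm_suspended(et->ti))
    		return;		/* suspended: leave deferred I/O for the resume path */

    	/* issue deferred I/O here */
    }
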
 int dm_noflush_suspending(struct dm_target *ti)
 {
        struct mapped_device *md = dm_table_get_md(ti->table);
@@ -2663,7 +2761,7 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
        kfree(pools);
 }
 
-static struct block_device_operations dm_blk_dops = {
+static const struct block_device_operations dm_blk_dops = {
        .open = dm_blk_open,
        .release = dm_blk_close,
        .ioctl = dm_blk_ioctl,