diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ca198e6..664ebfd 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -11,6 +11,7 @@
 #include <linux/elevator.h>
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
+#include <linux/blktrace_api.h>
 
 /*
  * tunables
@@ -38,16 +39,18 @@ static int cfq_slice_idle = HZ / 125;
 #define CFQ_MIN_TT             (2)
 
 #define CFQ_SLICE_SCALE                (5)
+#define CFQ_HW_QUEUE_MIN       (5)
 
 #define RQ_CIC(rq)             \
        ((struct cfq_io_context *) (rq)->elevator_private)
-#define RQ_CFQQ(rq)            ((rq)->elevator_private2)
+#define RQ_CFQQ(rq)            (struct cfq_queue *) ((rq)->elevator_private2)
 
 static struct kmem_cache *cfq_pool;
 static struct kmem_cache *cfq_ioc_pool;
 
 static DEFINE_PER_CPU(unsigned long, ioc_count);
 static struct completion *ioc_gone;
+static DEFINE_SPINLOCK(ioc_gone_lock);
 
 #define CFQ_PRIO_LISTS         IOPRIO_BE_NR
 #define cfq_class_idle(cfqq)   ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
@@ -81,10 +84,22 @@ struct cfq_data {
         */
        struct cfq_rb_root service_tree;
        unsigned int busy_queues;
+       /*
+        * Used to track any pending rt requests so we can pre-empt current
+        * non-RT cfqq in service when this value is non-zero.
+        */
+       unsigned int busy_rt_queues;
 
        int rq_in_driver;
        int sync_flight;
+
+       /*
+        * queue-depth detection
+        */
+       int rq_queued;
        int hw_tag;
+       int hw_tag_samples;
+       int rq_in_driver_peak;
 
        /*
         * idle window management
@@ -124,6 +139,8 @@ struct cfq_data {
 struct cfq_queue {
        /* reference count */
        atomic_t ref;
+       /* various state flags, see below */
+       unsigned int flags;
        /* parent cfq_data */
        struct cfq_data *cfqd;
        /* service_tree member */
@@ -138,14 +155,14 @@ struct cfq_queue {
        int queued[2];
        /* currently allocated requests */
        int allocated[2];
-       /* pending metadata requests */
-       int meta_pending;
        /* fifo list of requests in sort_list */
        struct list_head fifo;
 
        unsigned long slice_end;
        long slice_resid;
 
+       /* pending metadata requests */
+       int meta_pending;
        /* number of requests that are on the dispatch list or inside driver */
        int dispatched;
 
@@ -153,8 +170,7 @@ struct cfq_queue {
        unsigned short ioprio, org_ioprio;
        unsigned short ioprio_class, org_ioprio_class;
 
-       /* various state flags, see below */
-       unsigned int flags;
+       pid_t pid;
 };
 
 enum cfqq_state_flags {
@@ -198,6 +214,11 @@ CFQ_CFQQ_FNS(slice_new);
 CFQ_CFQQ_FNS(sync);
 #undef CFQ_CFQQ_FNS
 
+#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
+       blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
+#define cfq_log(cfqd, fmt, args...)    \
+       blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
+
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
                                       struct io_context *, gfp_t);
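
The two macros above route per-queue and per-device scheduler messages into the blktrace stream via blk_add_trace_msg(), so with tracing enabled (e.g. blktrace -d /dev/sda -o - | blkparse -i -) they show up inline with the ordinary block-layer events. A rough standalone sketch of the macro shape, with printf() standing in for blk_add_trace_msg() (demo only, not the kernel API):

/* Minimal model of the cfq_log()/cfq_log_cfqq() macros; printf() stands
 * in for blk_add_trace_msg(). GNU named variadic macros, as in the kernel. */
#include <stdio.h>

struct cfq_queue { int pid; };

#define cfq_log_cfqq(cfqq, fmt, args...) \
	printf("cfq%d " fmt "\n", (cfqq)->pid, ##args)
#define cfq_log(fmt, args...) \
	printf("cfq " fmt "\n", ##args)

int main(void)
{
	struct cfq_queue q = { .pid = 4242 };

	cfq_log_cfqq(&q, "set_slice=%lu", 100UL);	/* "cfq4242 set_slice=100" */
	cfq_log("schedule dispatch");		/* ## swallows the trailing comma */
	return 0;
}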
@@ -234,8 +255,10 @@ static inline int cfq_bio_sync(struct bio *bio)
  */
 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 {
-       if (cfqd->busy_queues)
-               kblockd_schedule_work(&cfqd->unplug_work);
+       if (cfqd->busy_queues) {
+               cfq_log(cfqd, "schedule dispatch");
+               kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
+       }
 }
 
 static int cfq_queue_empty(struct request_queue *q)
@@ -270,6 +293,7 @@ static inline void
 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
        cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
+       cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
 }
 
 /*
@@ -539,9 +563,12 @@ static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  */
 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
+       cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
        BUG_ON(cfq_cfqq_on_rr(cfqq));
        cfq_mark_cfqq_on_rr(cfqq);
        cfqd->busy_queues++;
+       if (cfq_class_rt(cfqq))
+               cfqd->busy_rt_queues++;
 
        cfq_resort_rr_list(cfqd, cfqq);
 }
@@ -552,6 +579,7 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  */
 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
+       cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
        BUG_ON(!cfq_cfqq_on_rr(cfqq));
        cfq_clear_cfqq_on_rr(cfqq);
 
@@ -560,6 +588,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
        BUG_ON(!cfqd->busy_queues);
        cfqd->busy_queues--;
+       if (cfq_class_rt(cfqq))
+               cfqd->busy_rt_queues--;
 }
 
 /*
@@ -638,15 +668,8 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
        struct cfq_data *cfqd = q->elevator->elevator_data;
 
        cfqd->rq_in_driver++;
-
-       /*
-        * If the depth is larger 1, it really could be queueing. But lets
-        * make the mark a little higher - idling could still be good for
-        * low queueing, and a low queueing number could also just indicate
-        * a SCSI mid layer like behaviour where limit+1 is often seen.
-        */
-       if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
-               cfqd->hw_tag = 1;
+       cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
+                                               cfqd->rq_in_driver);
 
        cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
 }
@@ -657,6 +680,8 @@ static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
 
        WARN_ON(!cfqd->rq_in_driver);
        cfqd->rq_in_driver--;
+       cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
+                                               cfqd->rq_in_driver);
 }
 
 static void cfq_remove_request(struct request *rq)
@@ -669,6 +694,7 @@ static void cfq_remove_request(struct request *rq)
        list_del_init(&rq->queuelist);
        cfq_del_rq_rb(rq);
 
+       cfqq->cfqd->rq_queued--;
        if (rq_is_meta(rq)) {
                WARN_ON(!cfqq->meta_pending);
                cfqq->meta_pending--;
@@ -746,6 +772,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
                                   struct cfq_queue *cfqq)
 {
        if (cfqq) {
+               cfq_log_cfqq(cfqd, cfqq, "set_active");
                cfqq->slice_end = 0;
                cfq_clear_cfqq_must_alloc_slice(cfqq);
                cfq_clear_cfqq_fifo_expire(cfqq);
@@ -763,6 +790,8 @@ static void
 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                    int timed_out)
 {
+       cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
+
        if (cfq_cfqq_wait_request(cfqq))
                del_timer(&cfqd->idle_slice_timer);
 
@@ -772,8 +801,10 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
        /*
         * store what was left of this slice, if the queue idled/timed out
         */
-       if (timed_out && !cfq_cfqq_slice_new(cfqq))
+       if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
                cfqq->slice_resid = cfqq->slice_end - jiffies;
+               cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
+       }
 
        cfq_resort_rr_list(cfqd, cfqq);
 
@@ -856,6 +887,14 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
        struct cfq_io_context *cic;
        unsigned long sl;
 
+       /*
+        * SSD device without seek penalty, disable idling. But only do so
+        * for devices that support queuing, otherwise we still have a problem
+        * with sync vs async workloads.
+        */
+       if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
+               return;
+
        WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
        WARN_ON(cfq_cfqq_slice_new(cfqq));
 
@@ -866,6 +905,12 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
                return;
 
        /*
+        * still requests with the driver, don't idle
+        */
+       if (cfqd->rq_in_driver)
+               return;
+
+       /*
         * task has exited, don't wait
         */
        cic = cfqd->active_cic;
@@ -892,6 +937,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
                sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
 
        mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
+       cfq_log(cfqd, "arm_idle: %lu", sl);
 }
 
 /*
@@ -902,6 +948,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
        struct cfq_data *cfqd = q->elevator->elevator_data;
        struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
+       cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
+
        cfq_remove_request(rq);
        cfqq->dispatched++;
        elv_dispatch_sort(q, rq);
@@ -931,8 +979,9 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
        rq = rq_entry_fifo(cfqq->fifo.next);
 
        if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
-               return NULL;
+               rq = NULL;
 
+       cfq_log_cfqq(cfqd, cfqq, "fifo=%p", rq);
        return rq;
 }
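
The fifo expiry test above relies on the kernel's wraparound-safe jiffies comparison. A minimal sketch of that idiom, shrunk to 32-bit counters so the wrap is easy to demonstrate (the real time_before()/time_after() macros live in include/linux/jiffies.h):

/* Wraparound-safe time comparison: subtract first, then look at the sign. */
#include <assert.h>

#define time_after(a, b)	((int)((b) - (a)) < 0)
#define time_before(a, b)	time_after(b, a)

int main(void)
{
	unsigned int now = 0xfffffff0u;	/* counter about to wrap */
	unsigned int expire = now + 32;	/* wraps around to 0x10 */

	assert(expire < now);		/* a naive comparison gets this wrong */
	assert(time_before(now, expire)); /* subtraction-based test stays correct */
	return 0;
}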
 
@@ -965,6 +1014,20 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
                goto expire;
 
        /*
+        * If we have a RT cfqq waiting, then we pre-empt the current non-rt
+        * cfqq.
+        */
+       if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) {
+               /*
+                * We simulate this as cfqq timed out so that it gets to bank
+                * the remainder of its time slice.
+                */
+               cfq_log_cfqq(cfqd, cfqq, "preempt");
+               cfq_slice_expired(cfqd, 1);
+               goto new_queue;
+       }
+
+       /*
         * The active queue has requests and isn't expired, allow it to
         * dispatch.
         */
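
busy_rt_queues gives the selection path a cheap signal that a runnable RT queue exists, without rescanning the service tree. A toy model of the predicate used here and again in __cfq_dispatch_requests() below (types and names are illustrative only):

/* Expire the active non-RT queue as soon as any RT queue has work queued. */
#include <stdbool.h>
#include <stdio.h>

enum ioprio_class { CLASS_RT, CLASS_BE, CLASS_IDLE };

struct queue { enum ioprio_class ioprio_class; };
struct sched_data { unsigned int busy_rt_queues; };

static bool should_preempt_for_rt(const struct sched_data *sd,
				  const struct queue *active)
{
	return active->ioprio_class != CLASS_RT && sd->busy_rt_queues > 0;
}

int main(void)
{
	struct sched_data sd = { .busy_rt_queues = 1 };
	struct queue be = { .ioprio_class = CLASS_BE };
	struct queue rt = { .ioprio_class = CLASS_RT };

	printf("BE active, RT waiting -> preempt: %d\n",
	       should_preempt_for_rt(&sd, &be));	/* 1 */
	printf("RT active -> no preempt: %d\n",
	       should_preempt_for_rt(&sd, &rt));	/* 0 */
	return 0;
}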
@@ -1027,6 +1090,13 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                if (RB_EMPTY_ROOT(&cfqq->sort_list))
                        break;
 
+               /*
+                * If there is a non-empty RT cfqq waiting for current
+                * cfqq's timeslice to complete, pre-empt this cfqq
+                */
+               if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues)
+                       break;
+
        } while (dispatched < max_dispatch);
 
        /*
@@ -1072,6 +1142,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 
        BUG_ON(cfqd->busy_queues);
 
+       cfq_log(cfqd, "forced_dispatch=%d\n", dispatched);
        return dispatched;
 }
 
@@ -1095,12 +1166,8 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
                if (cfq_class_idle(cfqq))
                        max_dispatch = 1;
 
-               if (cfqq->dispatched >= max_dispatch) {
-                       if (cfqd->busy_queues > 1)
-                               break;
-                       if (cfqq->dispatched >= 4 * max_dispatch)
-                               break;
-               }
+               if (cfqq->dispatched >= max_dispatch && cfqd->busy_queues > 1)
+                       break;
 
                if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
                        break;
@@ -1112,6 +1179,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
                dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
        }
 
+       cfq_log(cfqd, "dispatched=%d", dispatched);
        return dispatched;
 }
 
@@ -1130,6 +1198,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
        if (!atomic_dec_and_test(&cfqq->ref))
                return;
 
+       cfq_log_cfqq(cfqd, cfqq, "put_queue");
        BUG_ON(rb_first(&cfqq->sort_list));
        BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
        BUG_ON(cfq_cfqq_on_rr(cfqq));
@@ -1143,43 +1212,58 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 }
 
 /*
- * Call func for each cic attached to this ioc. Returns number of cic's seen.
+ * Must always be called with the rcu_read_lock() held
  */
-#define CIC_GANG_NR    16
-static unsigned int
+static void
+__call_for_each_cic(struct io_context *ioc,
+                   void (*func)(struct io_context *, struct cfq_io_context *))
+{
+       struct cfq_io_context *cic;
+       struct hlist_node *n;
+
+       hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
+               func(ioc, cic);
+}
+
+/*
+ * Call func for each cic attached to this ioc.
+ */
+static void
 call_for_each_cic(struct io_context *ioc,
                  void (*func)(struct io_context *, struct cfq_io_context *))
 {
-       struct cfq_io_context *cics[CIC_GANG_NR];
-       unsigned long index = 0;
-       unsigned int called = 0;
-       int nr;
-
        rcu_read_lock();
+       __call_for_each_cic(ioc, func);
+       rcu_read_unlock();
+}
 
-       do {
-               int i;
-
-               /*
-                * Perhaps there's a better way - this just gang lookups from
-                * 0 to the end, restarting after each CIC_GANG_NR from the
-                * last key + 1.
-                */
-               nr = radix_tree_gang_lookup(&ioc->radix_root, (void **) cics,
-                                               index, CIC_GANG_NR);
-               if (!nr)
-                       break;
+static void cfq_cic_free_rcu(struct rcu_head *head)
+{
+       struct cfq_io_context *cic;
 
-               called += nr;
-               index = 1 + (unsigned long) cics[nr - 1]->key;
+       cic = container_of(head, struct cfq_io_context, rcu_head);
 
-               for (i = 0; i < nr; i++)
-                       func(ioc, cics[i]);
-       } while (nr == CIC_GANG_NR);
+       kmem_cache_free(cfq_ioc_pool, cic);
+       elv_ioc_count_dec(ioc_count);
 
-       rcu_read_unlock();
+       if (ioc_gone) {
+               /*
+                * CFQ scheduler is exiting, grab exit lock and check
+                * the pending io context count. If it hits zero,
+                * complete ioc_gone and set it back to NULL
+                */
+               spin_lock(&ioc_gone_lock);
+               if (ioc_gone && !elv_ioc_count_read(ioc_count)) {
+                       complete(ioc_gone);
+                       ioc_gone = NULL;
+               }
+               spin_unlock(&ioc_gone_lock);
+       }
+}
 
-       return called;
+static void cfq_cic_free(struct cfq_io_context *cic)
+{
+       call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
 }
 
 static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
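
cfq_cic_free_rcu() runs from RCU callback context, and the bare "complete if the count hit zero" test it replaces could race with cfq_exit() installing ioc_gone: each side observes a non-zero count and the exit path sleeps forever. Taking ioc_gone_lock makes the check-and-complete atomic. A userspace model of the same shutdown handshake, with a pthread condvar standing in for the completion (simplified: the kernel decrements a per-cpu counter outside the lock and only takes ioc_gone_lock for the final test):

/* Workers drop a reference count; the exiting thread announces shutdown
 * and waits for the count to drain. Illustrative model, not kernel code. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t gone_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t all_gone = PTHREAD_COND_INITIALIZER;
static int ioc_count = 8;	/* outstanding io contexts */
static int exiting;		/* plays the role of ioc_gone */

static void *cic_free(void *arg)	/* RCU callback analogue */
{
	pthread_mutex_lock(&gone_lock);
	if (--ioc_count == 0 && exiting)
		pthread_cond_signal(&all_gone);	/* complete(ioc_gone) */
	pthread_mutex_unlock(&gone_lock);
	return NULL;
}

int main(void)
{
	pthread_t tid[8];

	for (int i = 0; i < 8; i++)
		pthread_create(&tid[i], NULL, cic_free, NULL);

	/* cfq_exit(): announce shutdown, then wait out the stragglers */
	pthread_mutex_lock(&gone_lock);
	exiting = 1;
	while (ioc_count)
		pthread_cond_wait(&all_gone, &gone_lock);
	pthread_mutex_unlock(&gone_lock);

	for (int i = 0; i < 8; i++)
		pthread_join(tid[i], NULL);
	puts("all contexts freed; safe to destroy the slab caches");
	return 0;
}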
@@ -1190,26 +1274,26 @@ static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
 
        spin_lock_irqsave(&ioc->lock, flags);
        radix_tree_delete(&ioc->radix_root, cic->dead_key);
+       hlist_del_rcu(&cic->cic_list);
        spin_unlock_irqrestore(&ioc->lock, flags);
 
-       kmem_cache_free(cfq_ioc_pool, cic);
+       cfq_cic_free(cic);
 }
 
+/*
+ * Must be called with rcu_read_lock() held or preemption otherwise disabled.
+ * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
+ * and ->trim() which is called with the task lock held
+ */
 static void cfq_free_io_context(struct io_context *ioc)
 {
-       int freed;
-
        /*
-        * ioc->refcount is zero here, so no more cic's are allowed to be
-        * linked into this ioc. So it should be ok to iterate over the known
-        * list, we will see all cic's since no new ones are added.
+        * ioc->refcount is zero here, or we are called from elv_unregister(),
+        * so no more cic's are allowed to be linked into this ioc.  So it
+        * should be ok to iterate over the known list, we will see all cic's
+        * since no new ones are added.
         */
-       freed = call_for_each_cic(ioc, cic_free_func);
-
-       elv_ioc_count_mod(ioc_count, -freed);
-
-       if (ioc_gone && !elv_ioc_count_read(ioc_count))
-               complete(ioc_gone);
+       __call_for_each_cic(ioc, cic_free_func);
 }
 
 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
@@ -1225,6 +1309,8 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
                                         struct cfq_io_context *cic)
 {
+       struct io_context *ioc = cic->ioc;
+
        list_del_init(&cic->queue_list);
 
        /*
@@ -1234,6 +1320,9 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
        cic->dead_key = (unsigned long) cic->key;
        cic->key = NULL;
 
+       if (ioc->ioc_data == cic)
+               rcu_assign_pointer(ioc->ioc_data, NULL);
+
        if (cic->cfqq[ASYNC]) {
                cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]);
                cic->cfqq[ASYNC] = NULL;
@@ -1255,7 +1344,15 @@ static void cfq_exit_single_io_context(struct io_context *ioc,
                unsigned long flags;
 
                spin_lock_irqsave(q->queue_lock, flags);
-               __cfq_exit_single_io_context(cfqd, cic);
+
+               /*
+                * Ensure we get a fresh copy of the ->key to prevent
+                * race between exiting task and queue
+                */
+               smp_read_barrier_depends();
+               if (cic->key)
+                       __cfq_exit_single_io_context(cfqd, cic);
+
                spin_unlock_irqrestore(q->queue_lock, flags);
        }
 }
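
The barrier pairs with the store that NULLs cic->key in __cfq_exit_single_io_context(): a task exiting concurrently with the queue must see the retired key before it touches cfqd. In C11 terms this is release publication read back with (at least) consume ordering; a small sketch using ordinary C11 atomics rather than the kernel primitives:

/* Release/consume publication of a "still alive" key. Illustrative only. */
#include <stdatomic.h>
#include <stddef.h>

struct cfqd;				/* opaque scheduler data */

struct cic_model {
	_Atomic(struct cfqd *) key;	/* NULL once the queue has gone away */
};

static void retire(struct cic_model *cic)	/* queue-exit side */
{
	atomic_store_explicit(&cic->key, NULL, memory_order_release);
}

static int still_live(struct cic_model *cic)	/* task-exit side */
{
	/* ordered re-read, analogous to smp_read_barrier_depends() followed
	 * by the if (cic->key) test in the hunk above */
	return atomic_load_explicit(&cic->key, memory_order_consume) != NULL;
}

int main(void)
{
	static struct cfqd *token;
	struct cic_model c;

	atomic_init(&c.key, (struct cfqd *)&token);	/* any non-NULL value */
	retire(&c);
	return still_live(&c);	/* 0: the queue is already gone */
}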
@@ -1266,7 +1363,6 @@ static void cfq_exit_single_io_context(struct io_context *ioc,
  */
 static void cfq_exit_io_context(struct io_context *ioc)
 {
-       rcu_assign_pointer(ioc->ioc_data, NULL);
        call_for_each_cic(ioc, cfq_exit_single_io_context);
 }
 
@@ -1280,6 +1376,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
        if (cic) {
                cic->last_end_request = jiffies;
                INIT_LIST_HEAD(&cic->queue_list);
+               INIT_HLIST_NODE(&cic->cic_list);
                cic->dtor = cfq_free_io_context;
                cic->exit = cfq_exit_io_context;
                elv_ioc_count_inc(ioc_count);
@@ -1302,10 +1399,10 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
                printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
        case IOPRIO_CLASS_NONE:
                /*
-                * no prio set, place us in the middle of the BE classes
+                * no prio set, inherit CPU scheduling settings
                 */
                cfqq->ioprio = task_nice_ioprio(tsk);
-               cfqq->ioprio_class = IOPRIO_CLASS_BE;
+               cfqq->ioprio_class = task_nice_ioclass(tsk);
                break;
        case IOPRIO_CLASS_RT:
                cfqq->ioprio = task_ioprio(ioc);
@@ -1418,6 +1515,8 @@ retry:
                                cfq_mark_cfqq_idle_window(cfqq);
                        cfq_mark_cfqq_sync(cfqq);
                }
+               cfqq->pid = current->pid;
+               cfq_log_cfqq(cfqd, cfqq, "alloced");
        }
 
        if (new_cfqq)
@@ -1475,15 +1574,6 @@ cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc,
        return cfqq;
 }
 
-static void cfq_cic_free(struct cfq_io_context *cic)
-{
-       kmem_cache_free(cfq_ioc_pool, cic);
-       elv_ioc_count_dec(ioc_count);
-
-       if (ioc_gone && !elv_ioc_count_read(ioc_count))
-               complete(ioc_gone);
-}
-
 /*
  * We drop cfq io contexts lazily, so we may find a dead one.
  */
@@ -1497,10 +1587,10 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
 
        spin_lock_irqsave(&ioc->lock, flags);
 
-       if (ioc->ioc_data == cic)
-               rcu_assign_pointer(ioc->ioc_data, NULL);
+       BUG_ON(ioc->ioc_data == cic);
 
        radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd);
+       hlist_del_rcu(&cic->cic_list);
        spin_unlock_irqrestore(&ioc->lock, flags);
 
        cfq_cic_free(cic);
@@ -1510,20 +1600,24 @@ static struct cfq_io_context *
 cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 {
        struct cfq_io_context *cic;
+       unsigned long flags;
        void *k;
 
        if (unlikely(!ioc))
                return NULL;
 
+       rcu_read_lock();
+
        /*
         * we maintain a last-hit cache, to avoid browsing over the tree
         */
        cic = rcu_dereference(ioc->ioc_data);
-       if (cic && cic->key == cfqd)
+       if (cic && cic->key == cfqd) {
+               rcu_read_unlock();
                return cic;
+       }
 
        do {
-               rcu_read_lock();
                cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
                rcu_read_unlock();
                if (!cic)
@@ -1532,10 +1626,13 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
                k = cic->key;
                if (unlikely(!k)) {
                        cfq_drop_dead_cic(cfqd, ioc, cic);
+                       rcu_read_lock();
                        continue;
                }
 
+               spin_lock_irqsave(&ioc->lock, flags);
                rcu_assign_pointer(ioc->ioc_data, cic);
+               spin_unlock_irqrestore(&ioc->lock, flags);
                break;
        } while (1);
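
ioc->ioc_data is a one-entry cache of the most recently hit cic: readers stay lockless under RCU, and the lookup now takes ioc->lock only when refreshing the hint (the hunk above also widens the rcu_read_lock() to cover the dead-cic retry loop). A simplified C11 model of the hint pattern (names hypothetical, a flat array standing in for the radix tree):

/* One-entry lookup hint: lockless fast path, locked hint refresh. */
#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct cic { const void *key; };	/* which scheduler this cic belongs to */

struct ioc {
	pthread_mutex_t lock;
	_Atomic(struct cic *) hint;	/* plays the role of ioc->ioc_data */
	struct cic slow[4];		/* stand-in for ioc->radix_root */
	int nr;
};

static struct cic *cic_lookup(struct ioc *ioc, const void *key)
{
	/* fast path: last-hit cache, no lock (rcu_dereference() analogue) */
	struct cic *cic = atomic_load_explicit(&ioc->hint, memory_order_acquire);
	if (cic && cic->key == key)
		return cic;

	/* slow path: full lookup, then refresh the hint under the lock */
	for (int i = 0; i < ioc->nr; i++) {
		if (ioc->slow[i].key == key) {
			pthread_mutex_lock(&ioc->lock);
			atomic_store_explicit(&ioc->hint, &ioc->slow[i],
					      memory_order_release);
			pthread_mutex_unlock(&ioc->lock);
			return &ioc->slow[i];
		}
	}
	return NULL;
}

int main(void)
{
	static int cfqd0;		/* stands in for a cfq_data instance */
	struct ioc ioc = { .lock = PTHREAD_MUTEX_INITIALIZER, .nr = 1 };

	ioc.slow[0].key = &cfqd0;
	return cic_lookup(&ioc, &cfqd0) ? 0 : 1;	/* hit, hint refreshed */
}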
 
@@ -1561,6 +1658,8 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
                spin_lock_irqsave(&ioc->lock, flags);
                ret = radix_tree_insert(&ioc->radix_root,
                                                (unsigned long) cfqd, cic);
+               if (!ret)
+                       hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
                spin_unlock_irqrestore(&ioc->lock, flags);
 
                radix_tree_preload_end();
@@ -1666,7 +1765,7 @@ static void
 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                       struct cfq_io_context *cic)
 {
-       int enable_idle;
+       int old_idle, enable_idle;
 
        /*
         * Don't idle for async or idle io prio class
@@ -1674,7 +1773,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
        if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
                return;
 
-       enable_idle = cfq_cfqq_idle_window(cfqq);
+       enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
 
        if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
            (cfqd->hw_tag && CIC_SEEKY(cic)))
@@ -1686,10 +1785,13 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                        enable_idle = 1;
        }
 
-       if (enable_idle)
-               cfq_mark_cfqq_idle_window(cfqq);
-       else
-               cfq_clear_cfqq_idle_window(cfqq);
+       if (old_idle != enable_idle) {
+               cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
+               if (enable_idle)
+                       cfq_mark_cfqq_idle_window(cfqq);
+               else
+                       cfq_clear_cfqq_idle_window(cfqq);
+       }
 }
 
 /*
@@ -1729,6 +1831,12 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
        if (rq_is_meta(rq) && !cfqq->meta_pending)
                return 1;
 
+       /*
+        * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
+        */
+       if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
+               return 1;
+
        if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
                return 0;
 
@@ -1748,6 +1856,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
  */
 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
+       cfq_log_cfqq(cfqd, cfqq, "preempt");
        cfq_slice_expired(cfqd, 1);
 
        /*
@@ -1772,6 +1881,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 {
        struct cfq_io_context *cic = RQ_CIC(rq);
 
+       cfqd->rq_queued++;
        if (rq_is_meta(rq))
                cfqq->meta_pending++;
 
@@ -1796,7 +1906,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                /*
                 * not the active queue - expire current slice if it is
                 * idle and has expired its mean thinktime or this new queue
-                * has some old slice time left and is of higher priority
+                * has some old slice time left and is of higher priority or
+                * this new queue is RT and the current one is BE
                 */
                cfq_preempt_queue(cfqd, cfqq);
                cfq_mark_cfqq_must_dispatch(cfqq);
@@ -1809,6 +1920,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
        struct cfq_data *cfqd = q->elevator->elevator_data;
        struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
+       cfq_log_cfqq(cfqd, cfqq, "insert_request");
        cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
 
        cfq_add_rq_rb(rq);
@@ -1818,6 +1930,31 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
        cfq_rq_enqueued(cfqd, cfqq, rq);
 }
 
+/*
+ * Update hw_tag based on peak queue depth over 50 samples under
+ * sufficient load.
+ */
+static void cfq_update_hw_tag(struct cfq_data *cfqd)
+{
+       if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak)
+               cfqd->rq_in_driver_peak = cfqd->rq_in_driver;
+
+       if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
+           cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
+               return;
+
+       if (cfqd->hw_tag_samples++ < 50)
+               return;
+
+       if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN)
+               cfqd->hw_tag = 1;
+       else
+               cfqd->hw_tag = 0;
+
+       cfqd->hw_tag_samples = 0;
+       cfqd->rq_in_driver_peak = 0;
+}
+
 static void cfq_completed_request(struct request_queue *q, struct request *rq)
 {
        struct cfq_queue *cfqq = RQ_CFQQ(rq);
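
Compared with the one-shot "more than 4 in driver" test removed from cfq_activate_request(), this samples on every completion, tracks the peak in-flight depth, and re-evaluates every 50 qualified samples, so hw_tag can also drop back to 0 on hardware that never queues deeply. A standalone replay of the heuristic (CFQ_HW_QUEUE_MIN is 5, as defined above):

/* Replay of the queue-depth heuristic: feed (rq_queued, rq_in_driver)
 * samples and watch hw_tag settle. Mirrors cfq_update_hw_tag() above. */
#include <stdio.h>

#define CFQ_HW_QUEUE_MIN 5

struct cfqd_model {
	int rq_queued, rq_in_driver;
	int hw_tag, hw_tag_samples, rq_in_driver_peak;
};

static void update_hw_tag(struct cfqd_model *d)
{
	if (d->rq_in_driver > d->rq_in_driver_peak)
		d->rq_in_driver_peak = d->rq_in_driver;

	/* ignore samples taken under too little load to mean anything */
	if (d->rq_queued <= CFQ_HW_QUEUE_MIN &&
	    d->rq_in_driver <= CFQ_HW_QUEUE_MIN)
		return;

	if (d->hw_tag_samples++ < 50)
		return;

	d->hw_tag = d->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN;
	d->hw_tag_samples = 0;
	d->rq_in_driver_peak = 0;
}

int main(void)
{
	/* starts optimistic, as cfq_init_queue() now sets hw_tag = 1 */
	struct cfqd_model d = { .hw_tag = 1 };

	/* a busy device that never has more than 2 requests in flight */
	for (int i = 0; i < 100; i++) {
		d.rq_queued = 10;
		d.rq_in_driver = 2;
		update_hw_tag(&d);
	}
	printf("hw_tag=%d\n", d.hw_tag);	/* 0: no real command queuing */
	return 0;
}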
@@ -1826,6 +1963,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
        unsigned long now;
 
        now = jiffies;
+       cfq_log_cfqq(cfqd, cfqq, "complete");
+
+       cfq_update_hw_tag(cfqd);
 
        WARN_ON(!cfqd->rq_in_driver);
        WARN_ON(!cfqq->dispatched);
@@ -1995,6 +2135,7 @@ queue_fail:
 
        cfq_schedule_dispatch(cfqd);
        spin_unlock_irqrestore(q->queue_lock, flags);
+       cfq_log(cfqd, "set_request fail");
        return 1;
 }
 
@@ -2020,6 +2161,8 @@ static void cfq_idle_slice_timer(unsigned long data)
        unsigned long flags;
        int timed_out = 1;
 
+       cfq_log(cfqd, "idle timer fired");
+
        spin_lock_irqsave(cfqd->queue->queue_lock, flags);
 
        cfqq = cfqd->active_queue;
@@ -2058,7 +2201,7 @@ out_cont:
 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
 {
        del_timer_sync(&cfqd->idle_slice_timer);
-       kblockd_flush_work(&cfqd->unplug_work);
+       cancel_work_sync(&cfqd->unplug_work);
 }
 
 static void cfq_put_async_queues(struct cfq_data *cfqd)
@@ -2076,7 +2219,7 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
                cfq_put_queue(cfqd->async_idle_cfqq);
 }
 
-static void cfq_exit_queue(elevator_t *e)
+static void cfq_exit_queue(struct elevator_queue *e)
 {
        struct cfq_data *cfqd = e->elevator_data;
        struct request_queue *q = cfqd->queue;
@@ -2134,12 +2277,17 @@ static void *cfq_init_queue(struct request_queue *q)
        cfqd->cfq_slice[1] = cfq_slice_sync;
        cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
        cfqd->cfq_slice_idle = cfq_slice_idle;
+       cfqd->hw_tag = 1;
 
        return cfqd;
 }
 
 static void cfq_slab_kill(void)
 {
+       /*
+        * Caller already ensured that pending RCU callbacks are completed,
+        * so we should have no busy allocations at this point.
+        */
        if (cfq_pool)
                kmem_cache_destroy(cfq_pool);
        if (cfq_ioc_pool)
@@ -2152,7 +2300,7 @@ static int __init cfq_slab_setup(void)
        if (!cfq_pool)
                goto fail;
 
-       cfq_ioc_pool = KMEM_CACHE(cfq_io_context, SLAB_DESTROY_BY_RCU);
+       cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
        if (!cfq_ioc_pool)
                goto fail;
 
@@ -2181,7 +2329,7 @@ cfq_var_store(unsigned int *var, const char *page, size_t count)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                           \
-static ssize_t __FUNC(elevator_t *e, char *page)                       \
+static ssize_t __FUNC(struct elevator_queue *e, char *page)            \
 {                                                                      \
        struct cfq_data *cfqd = e->elevator_data;                       \
        unsigned int __data = __VAR;                                    \
@@ -2201,7 +2349,7 @@ SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                        \
-static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)   \
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)        \
 {                                                                      \
        struct cfq_data *cfqd = e->elevator_data;                       \
        unsigned int __data;                                            \
@@ -2298,9 +2446,13 @@ static void __exit cfq_exit(void)
        ioc_gone = &all_gone;
        /* ioc_gone's update must be visible before reading ioc_count */
        smp_wmb();
+
+       /*
+        * this also protects us from entering cfq_slab_kill() with
+        * pending RCU callbacks
+        */
        if (elv_ioc_count_read(ioc_count))
-               wait_for_completion(ioc_gone);
-       synchronize_rcu();
+               wait_for_completion(&all_gone);
        cfq_slab_kill();
 }