Btrfs: add a priority queue to the async thread helpers
author Chris Mason <chris.mason@oracle.com>
Mon, 20 Apr 2009 19:50:09 +0000 (15:50 -0400)
committer Chris Mason <chris.mason@oracle.com>
Mon, 20 Apr 2009 19:53:08 +0000 (15:53 -0400)
Btrfs is using WRITE_SYNC_PLUG to send down synchronous IOs with a
higher priority.  But, the checksumming helper threads prevent it
from being fully effective.

There are two problems.  First, a big queue of pending checksumming
will delay the synchronous IO behind other lower priority writes.  Second,
the checksumming uses an ordered async work queue.  The ordering makes sure
that IOs are sent to the block layer in the same order they are sent
to the checksumming threads.  Usually this gives us less seeky IO.

But, when we start mixing IO priorities, the lower priority IO can delay
the higher priority IO.

This patch solves both problems by adding a high priority list to the async
helper threads, and a new btrfs_set_work_high_prio(), which is used
to put a new async work item onto the higher priority list.

The ordering is still done on high priority IO, but all of the high
priority bios are ordered separately from the low priority bios.  This
ordering is purely an IO optimization; it is not involved in data
or metadata integrity.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/async-thread.c
fs/btrfs/async-thread.h
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/file.c

index 51bfdfc..502c3d6 100644 (file)
@@ -25,6 +25,7 @@
 #define WORK_QUEUED_BIT 0
 #define WORK_DONE_BIT 1
 #define WORK_ORDER_DONE_BIT 2
+#define WORK_HIGH_PRIO_BIT 3
 
 /*
  * container for the kthread task pointer and the list of pending work
@@ -36,6 +37,7 @@ struct btrfs_worker_thread {
 
        /* list of struct btrfs_work that are waiting for service */
        struct list_head pending;
+       struct list_head prio_pending;
 
        /* list of worker threads from struct btrfs_workers */
        struct list_head worker_list;
@@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 
        spin_lock_irqsave(&workers->lock, flags);
 
-       while (!list_empty(&workers->order_list)) {
-               work = list_entry(workers->order_list.next,
-                                 struct btrfs_work, order_list);
-
+       while (1) {
+               if (!list_empty(&workers->prio_order_list)) {
+                       work = list_entry(workers->prio_order_list.next,
+                                         struct btrfs_work, order_list);
+               } else if (!list_empty(&workers->order_list)) {
+                       work = list_entry(workers->order_list.next,
+                                         struct btrfs_work, order_list);
+               } else {
+                       break;
+               }
                if (!test_bit(WORK_DONE_BIT, &work->flags))
                        break;
 
@@ -143,8 +151,14 @@ static int worker_loop(void *arg)
        do {
                spin_lock_irq(&worker->lock);
 again_locked:
-               while (!list_empty(&worker->pending)) {
-                       cur = worker->pending.next;
+               while (1) {
+                       if (!list_empty(&worker->prio_pending))
+                               cur = worker->prio_pending.next;
+                       else if (!list_empty(&worker->pending))
+                               cur = worker->pending.next;
+                       else
+                               break;
+
                        work = list_entry(cur, struct btrfs_work, list);
                        list_del(&work->list);
                        clear_bit(WORK_QUEUED_BIT, &work->flags);
@@ -163,7 +177,6 @@ again_locked:
 
                        spin_lock_irq(&worker->lock);
                        check_idle_worker(worker);
-
                }
                if (freezing(current)) {
                        worker->working = 0;
@@ -178,7 +191,8 @@ again_locked:
                                 * jump_in?
                                 */
                                smp_mb();
-                               if (!list_empty(&worker->pending))
+                               if (!list_empty(&worker->pending) ||
+                                   !list_empty(&worker->prio_pending))
                                        continue;
 
                                /*
@@ -191,7 +205,8 @@ again_locked:
                                 */
                                schedule_timeout(1);
                                smp_mb();
-                               if (!list_empty(&worker->pending))
+                               if (!list_empty(&worker->pending) ||
+                                   !list_empty(&worker->prio_pending))
                                        continue;
 
                                if (kthread_should_stop())
@@ -200,7 +215,8 @@ again_locked:
                                /* still no more work?, sleep for real */
                                spin_lock_irq(&worker->lock);
                                set_current_state(TASK_INTERRUPTIBLE);
-                               if (!list_empty(&worker->pending))
+                               if (!list_empty(&worker->pending) ||
+                                   !list_empty(&worker->prio_pending))
                                        goto again_locked;
 
                                /*
@@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
        INIT_LIST_HEAD(&workers->worker_list);
        INIT_LIST_HEAD(&workers->idle_list);
        INIT_LIST_HEAD(&workers->order_list);
+       INIT_LIST_HEAD(&workers->prio_order_list);
        spin_lock_init(&workers->lock);
        workers->max_workers = max;
        workers->idle_thresh = 32;
@@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
                }
 
                INIT_LIST_HEAD(&worker->pending);
+               INIT_LIST_HEAD(&worker->prio_pending);
                INIT_LIST_HEAD(&worker->worker_list);
                spin_lock_init(&worker->lock);
                atomic_set(&worker->num_pending, 0);
@@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work)
                goto out;
 
        spin_lock_irqsave(&worker->lock, flags);
-       list_add_tail(&work->list, &worker->pending);
+       if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+               list_add_tail(&work->list, &worker->prio_pending);
+       else
+               list_add_tail(&work->list, &worker->pending);
        atomic_inc(&worker->num_pending);
 
        /* by definition we're busy, take ourselves off the idle
@@ -422,6 +443,11 @@ out:
        return 0;
 }
 
+void btrfs_set_work_high_prio(struct btrfs_work *work)
+{
+       set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
+
 /*
  * places a struct btrfs_work into the pending queue of one of the kthreads
  */
@@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
        worker = find_worker(workers);
        if (workers->ordered) {
                spin_lock_irqsave(&workers->lock, flags);
-               list_add_tail(&work->order_list, &workers->order_list);
+               if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
+                       list_add_tail(&work->order_list,
+                                     &workers->prio_order_list);
+               } else {
+                       list_add_tail(&work->order_list, &workers->order_list);
+               }
                spin_unlock_irqrestore(&workers->lock, flags);
        } else {
                INIT_LIST_HEAD(&work->order_list);
@@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
        spin_lock_irqsave(&worker->lock, flags);
 
-       list_add_tail(&work->list, &worker->pending);
+       if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+               list_add_tail(&work->list, &worker->prio_pending);
+       else
+               list_add_tail(&work->list, &worker->pending);
        atomic_inc(&worker->num_pending);
        check_busy_worker(worker);
 
index 31be4ed..1b511c1 100644 (file)
@@ -85,6 +85,7 @@ struct btrfs_workers {
         * of work items waiting for completion
         */
        struct list_head order_list;
+       struct list_head prio_order_list;
 
        /* lock for finding the next worker thread to queue on */
        spinlock_t lock;
@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
 int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
index fec18b4..a6b8374 100644 (file)
@@ -579,6 +579,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
        async->bio_flags = bio_flags;
 
        atomic_inc(&fs_info->nr_async_submits);
+
+       if (rw & (1 << BIO_RW_SYNCIO))
+               btrfs_set_work_high_prio(&async->work);
+
        btrfs_queue_worker(&fs_info->workers, &async->work);
 #if 0
        int limit = btrfs_async_submit_limit(fs_info);
@@ -656,6 +660,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
                                     mirror_num, 0);
        }
+
        /*
         * kthread helpers are used to submit writes so that checksumming
         * can happen in parallel across all CPUs
index 483b672..5d66cb2 100644 (file)
@@ -2501,7 +2501,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
        };
        struct writeback_control wbc_writepages = {
                .bdi            = wbc->bdi,
-               .sync_mode      = WB_SYNC_NONE,
+               .sync_mode      = wbc->sync_mode,
                .older_than_this = NULL,
                .nr_to_write    = 64,
                .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
index 9c9fb46..e21c006 100644 (file)
@@ -1131,7 +1131,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                if (will_write) {
                        btrfs_fdatawrite_range(inode->i_mapping, pos,
                                               pos + write_bytes - 1,
-                                              WB_SYNC_NONE);
+                                              WB_SYNC_ALL);
                } else {
                        balance_dirty_pages_ratelimited_nr(inode->i_mapping,
                                                           num_pages);