Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 21 Apr 2009 21:12:58 +0000 (14:12 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 21 Apr 2009 21:12:58 +0000 (14:12 -0700)
* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
  Btrfs: fix btrfs fallocate oops and deadlock
  Btrfs: use the right node in reada_for_balance
  Btrfs: fix oops on page->mapping->host during writepage
  Btrfs: add a priority queue to the async thread helpers
  Btrfs: use WRITE_SYNC for synchronous writes

fs/btrfs/async-thread.c
fs/btrfs/async-thread.h
fs/btrfs/ctree.c
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/ordered-data.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 51bfdfc..502c3d6 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -25,6 +25,7 @@
 #define WORK_QUEUED_BIT 0
 #define WORK_DONE_BIT 1
 #define WORK_ORDER_DONE_BIT 2
+#define WORK_HIGH_PRIO_BIT 3
 
 /*
  * container for the kthread task pointer and the list of pending work
@@ -36,6 +37,7 @@ struct btrfs_worker_thread {
 
        /* list of struct btrfs_work that are waiting for service */
        struct list_head pending;
+       struct list_head prio_pending;
 
        /* list of worker threads from struct btrfs_workers */
        struct list_head worker_list;
@@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 
        spin_lock_irqsave(&workers->lock, flags);
 
-       while (!list_empty(&workers->order_list)) {
-               work = list_entry(workers->order_list.next,
-                                 struct btrfs_work, order_list);
-
+       while (1) {
+               if (!list_empty(&workers->prio_order_list)) {
+                       work = list_entry(workers->prio_order_list.next,
+                                         struct btrfs_work, order_list);
+               } else if (!list_empty(&workers->order_list)) {
+                       work = list_entry(workers->order_list.next,
+                                         struct btrfs_work, order_list);
+               } else {
+                       break;
+               }
                if (!test_bit(WORK_DONE_BIT, &work->flags))
                        break;
 
@@ -143,8 +151,14 @@ static int worker_loop(void *arg)
        do {
                spin_lock_irq(&worker->lock);
 again_locked:
-               while (!list_empty(&worker->pending)) {
-                       cur = worker->pending.next;
+               while (1) {
+                       if (!list_empty(&worker->prio_pending))
+                               cur = worker->prio_pending.next;
+                       else if (!list_empty(&worker->pending))
+                               cur = worker->pending.next;
+                       else
+                               break;
+
                        work = list_entry(cur, struct btrfs_work, list);
                        list_del(&work->list);
                        clear_bit(WORK_QUEUED_BIT, &work->flags);
@@ -163,7 +177,6 @@ again_locked:
 
                        spin_lock_irq(&worker->lock);
                        check_idle_worker(worker);
-
                }
                if (freezing(current)) {
                        worker->working = 0;
@@ -178,7 +191,8 @@ again_locked:
                                 * jump_in?
                                 */
                                smp_mb();
-                               if (!list_empty(&worker->pending))
+                               if (!list_empty(&worker->pending) ||
+                                   !list_empty(&worker->prio_pending))
                                        continue;
 
                                /*
@@ -191,7 +205,8 @@ again_locked:
                                 */
                                schedule_timeout(1);
                                smp_mb();
-                               if (!list_empty(&worker->pending))
+                               if (!list_empty(&worker->pending) ||
+                                   !list_empty(&worker->prio_pending))
                                        continue;
 
                                if (kthread_should_stop())
@@ -200,7 +215,8 @@ again_locked:
                                /* still no more work?, sleep for real */
                                spin_lock_irq(&worker->lock);
                                set_current_state(TASK_INTERRUPTIBLE);
-                               if (!list_empty(&worker->pending))
+                               if (!list_empty(&worker->pending) ||
+                                   !list_empty(&worker->prio_pending))
                                        goto again_locked;
 
                                /*
@@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
        INIT_LIST_HEAD(&workers->worker_list);
        INIT_LIST_HEAD(&workers->idle_list);
        INIT_LIST_HEAD(&workers->order_list);
+       INIT_LIST_HEAD(&workers->prio_order_list);
        spin_lock_init(&workers->lock);
        workers->max_workers = max;
        workers->idle_thresh = 32;
@@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
                }
 
                INIT_LIST_HEAD(&worker->pending);
+               INIT_LIST_HEAD(&worker->prio_pending);
                INIT_LIST_HEAD(&worker->worker_list);
                spin_lock_init(&worker->lock);
                atomic_set(&worker->num_pending, 0);
@@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work)
                goto out;
 
        spin_lock_irqsave(&worker->lock, flags);
-       list_add_tail(&work->list, &worker->pending);
+       if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+               list_add_tail(&work->list, &worker->prio_pending);
+       else
+               list_add_tail(&work->list, &worker->pending);
        atomic_inc(&worker->num_pending);
 
        /* by definition we're busy, take ourselves off the idle
@@ -422,6 +443,11 @@ out:
        return 0;
 }
 
+void btrfs_set_work_high_prio(struct btrfs_work *work)
+{
+       set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
+
 /*
  * places a struct btrfs_work into the pending queue of one of the kthreads
  */
@@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
        worker = find_worker(workers);
        if (workers->ordered) {
                spin_lock_irqsave(&workers->lock, flags);
-               list_add_tail(&work->order_list, &workers->order_list);
+               if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
+                       list_add_tail(&work->order_list,
+                                     &workers->prio_order_list);
+               } else {
+                       list_add_tail(&work->order_list, &workers->order_list);
+               }
                spin_unlock_irqrestore(&workers->lock, flags);
        } else {
                INIT_LIST_HEAD(&work->order_list);
@@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
        spin_lock_irqsave(&worker->lock, flags);
 
-       list_add_tail(&work->list, &worker->pending);
+       if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+               list_add_tail(&work->list, &worker->prio_pending);
+       else
+               list_add_tail(&work->list, &worker->pending);
        atomic_inc(&worker->num_pending);
        check_busy_worker(worker);
 
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 31be4ed..1b511c1 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -85,6 +85,7 @@ struct btrfs_workers {
         * of work items waiting for completion
         */
        struct list_head order_list;
+       struct list_head prio_order_list;
 
        /* lock for finding the next worker thread to queue on */
        spinlock_t lock;
@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
 int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
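
The new export is meant to be called on a work item before it is queued; the fs/btrfs/disk-io.c hunk below does exactly that for WRITE_SYNC bios:

if (rw & (1 << BIO_RW_SYNCIO))
        btrfs_set_work_high_prio(&async->work);

btrfs_queue_worker(&fs_info->workers, &async->work);
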
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e5b2533..a99f1c2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1325,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root,
        int ret = 0;
        int blocksize;
 
-       parent = path->nodes[level - 1];
+       parent = path->nodes[level + 1];
        if (!parent)
                return 0;
 
        nritems = btrfs_header_nritems(parent);
-       slot = path->slots[level];
+       slot = path->slots[level + 1];
        blocksize = btrfs_level_size(root, level);
 
        if (slot > 0) {
@@ -1341,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
                        block1 = 0;
                free_extent_buffer(eb);
        }
-       if (slot < nritems) {
+       if (slot + 1 < nritems) {
                block2 = btrfs_node_blockptr(parent, slot + 1);
                gen = btrfs_node_ptr_generation(parent, slot + 1);
                eb = btrfs_find_tree_block(root, block2, blocksize);
@@ -1351,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
        }
        if (block1 || block2) {
                ret = -EAGAIN;
+
+               /* release the whole path */
                btrfs_release_path(root, path);
+
+               /* read the blocks */
                if (block1)
                        readahead_tree_block(root, block1, blocksize, 0);
                if (block2)
@@ -1361,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
                        eb = read_tree_block(root, block1, blocksize, 0);
                        free_extent_buffer(eb);
                }
-               if (block1) {
+               if (block2) {
                        eb = read_tree_block(root, block2, blocksize, 0);
                        free_extent_buffer(eb);
                }
@@ -1481,12 +1485,15 @@ read_block_for_search(struct btrfs_trans_handle *trans,
         * of the btree by dropping locks before
         * we read.
         */
-       btrfs_release_path(NULL, p);
+       btrfs_unlock_up_safe(p, level + 1);
+       btrfs_set_path_blocking(p);
+
        if (tmp)
                free_extent_buffer(tmp);
        if (p->reada)
                reada_for_search(root, p, level, slot, key->objectid);
 
+       btrfs_release_path(NULL, p);
        tmp = read_tree_block(root, blocknr, blocksize, gen);
        if (tmp)
                free_extent_buffer(tmp);
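
The reada_for_balance() hunks fix the parent lookup: in a btrfs_path, nodes[0] is the leaf and nodes[i + 1] is the parent of nodes[i], so the parent of the node being balanced at `level` is nodes[level + 1], not nodes[level - 1] (the slot index moves the same way, and a block1/block2 copy-paste slip gets fixed as well). A reduced sketch of that convention, trimmed to what the example needs:

#include <stddef.h>

#define BTRFS_MAX_LEVEL 8

struct extent_buffer;

struct btrfs_path_sketch {
        /* [0] is the leaf; higher indices are higher tree levels */
        struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
        int slots[BTRFS_MAX_LEVEL];     /* slot within each node */
};

static struct extent_buffer *parent_of(struct btrfs_path_sketch *p, int level)
{
        if (level + 1 >= BTRFS_MAX_LEVEL)
                return NULL;            /* `level` is already the root */
        return p->nodes[level + 1];
}
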
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92caa80..a6b8374 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -579,6 +579,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
        async->bio_flags = bio_flags;
 
        atomic_inc(&fs_info->nr_async_submits);
+
+       if (rw & (1 << BIO_RW_SYNCIO))
+               btrfs_set_work_high_prio(&async->work);
+
        btrfs_queue_worker(&fs_info->workers, &async->work);
 #if 0
        int limit = btrfs_async_submit_limit(fs_info);
@@ -656,6 +660,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
                                     mirror_num, 0);
        }
+
        /*
         * kthread helpers are used to submit writes so that checksumming
         * can happen in parallel across all CPUs
@@ -2095,10 +2100,10 @@ static int write_dev_supers(struct btrfs_device *device,
                                device->barriers = 0;
                                get_bh(bh);
                                lock_buffer(bh);
-                               ret = submit_bh(WRITE, bh);
+                               ret = submit_bh(WRITE_SYNC, bh);
                        }
                } else {
-                       ret = submit_bh(WRITE, bh);
+                       ret = submit_bh(WRITE_SYNC, bh);
                }
 
                if (!ret && wait) {
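
WRITE_SYNC-class requests carry the BIO_RW_SYNCIO bit in this kernel generation, so the btrfs_wq_submit_bio() test above promotes exactly the bios a caller is about to wait on. A sketch of the check with illustrative bit positions (the real values live in linux/bio.h and linux/fs.h):

/* stand-in bit positions, not the kernel's */
enum { BIO_RW = 0, BIO_RW_SYNCIO = 4, BIO_RW_UNPLUG = 5 };

#define WRITE           (1 << BIO_RW)
#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO))
#define WRITE_SYNC      (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))

static int is_sync_write(int rw)
{
        /* mirrors the btrfs_wq_submit_bio() check above: true for
         * WRITE_SYNC and WRITE_SYNC_PLUG, false for plain WRITE */
        return (rw & (1 << BIO_RW_SYNCIO)) != 0;
}
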
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eb2bee8..05a1c42 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -50,7 +50,10 @@ struct extent_page_data {
        /* tells writepage not to lock the state bits for this range
         * it still does the unlocking
         */
-       int extent_locked;
+       unsigned int extent_locked:1;
+
+       /* tells the submit_bio code to use a WRITE_SYNC */
+       unsigned int sync_io:1;
 };
 
 int __init extent_io_init(void)
@@ -2101,6 +2104,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
        return ret;
 }
 
+static noinline void update_nr_written(struct page *page,
+                                     struct writeback_control *wbc,
+                                     unsigned long nr_written)
+{
+       wbc->nr_to_write -= nr_written;
+       if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+           wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+               page->mapping->writeback_index = page->index + nr_written;
+}
+
 /*
  * the writepage semantics are similar to regular writepage.  extent
  * records are inserted to lock ranges in the tree, and as dirty areas
@@ -2136,8 +2149,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        u64 delalloc_end;
        int page_started;
        int compressed;
+       int write_flags;
        unsigned long nr_written = 0;
 
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               write_flags = WRITE_SYNC_PLUG;
+       else
+               write_flags = WRITE;
+
        WARN_ON(!PageLocked(page));
        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
        if (page->index > end_index ||
@@ -2164,6 +2183,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        delalloc_end = 0;
        page_started = 0;
        if (!epd->extent_locked) {
+               /*
+                * make sure the wbc mapping index is at least updated
+                * to this page.
+                */
+               update_nr_written(page, wbc, 0);
+
                while (delalloc_end < page_end) {
                        nr_delalloc = find_lock_delalloc_range(inode, tree,
                                                       page,
@@ -2185,7 +2210,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                 */
                if (page_started) {
                        ret = 0;
-                       goto update_nr_written;
+                       /*
+                        * we've unlocked the page, so we can't update
+                        * the mapping's writeback index, just update
+                        * nr_to_write.
+                        */
+                       wbc->nr_to_write -= nr_written;
+                       goto done_unlocked;
                }
        }
        lock_extent(tree, start, page_end, GFP_NOFS);
@@ -2198,13 +2229,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                if (ret == -EAGAIN) {
                        unlock_extent(tree, start, page_end, GFP_NOFS);
                        redirty_page_for_writepage(wbc, page);
+                       update_nr_written(page, wbc, nr_written);
                        unlock_page(page);
                        ret = 0;
-                       goto update_nr_written;
+                       goto done_unlocked;
                }
        }
 
-       nr_written++;
+       /*
+        * we don't want to touch the inode after unlocking the page,
+        * so we update the mapping writeback index now
+        */
+       update_nr_written(page, wbc, nr_written + 1);
 
        end = page_end;
        if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
@@ -2314,9 +2350,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                                       (unsigned long long)end);
                        }
 
-                       ret = submit_extent_page(WRITE, tree, page, sector,
-                                                iosize, pg_offset, bdev,
-                                                &epd->bio, max_nr,
+                       ret = submit_extent_page(write_flags, tree, page,
+                                                sector, iosize, pg_offset,
+                                                bdev, &epd->bio, max_nr,
                                                 end_bio_extent_writepage,
                                                 0, 0, 0);
                        if (ret)
@@ -2336,11 +2372,8 @@ done:
                unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
        unlock_page(page);
 
-update_nr_written:
-       wbc->nr_to_write -= nr_written;
-       if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
-           wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
-               page->mapping->writeback_index = page->index + nr_written;
+done_unlocked:
+
        return 0;
 }
 
@@ -2460,15 +2493,23 @@ retry:
        return ret;
 }
 
-static noinline void flush_write_bio(void *data)
+static void flush_epd_write_bio(struct extent_page_data *epd)
 {
-       struct extent_page_data *epd = data;
        if (epd->bio) {
-               submit_one_bio(WRITE, epd->bio, 0, 0);
+               if (epd->sync_io)
+                       submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
+               else
+                       submit_one_bio(WRITE, epd->bio, 0, 0);
                epd->bio = NULL;
        }
 }
 
+static noinline void flush_write_bio(void *data)
+{
+       struct extent_page_data *epd = data;
+       flush_epd_write_bio(epd);
+}
+
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                          get_extent_t *get_extent,
                          struct writeback_control *wbc)
@@ -2480,23 +2521,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                .tree = tree,
                .get_extent = get_extent,
                .extent_locked = 0,
+               .sync_io = wbc->sync_mode == WB_SYNC_ALL,
        };
        struct writeback_control wbc_writepages = {
                .bdi            = wbc->bdi,
-               .sync_mode      = WB_SYNC_NONE,
+               .sync_mode      = wbc->sync_mode,
                .older_than_this = NULL,
                .nr_to_write    = 64,
                .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
                .range_end      = (loff_t)-1,
        };
 
-
        ret = __extent_writepage(page, wbc, &epd);
 
        extent_write_cache_pages(tree, mapping, &wbc_writepages,
                                 __extent_writepage, &epd, flush_write_bio);
-       if (epd.bio)
-               submit_one_bio(WRITE, epd.bio, 0, 0);
+       flush_epd_write_bio(&epd);
        return ret;
 }
 
@@ -2515,6 +2555,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
                .tree = tree,
                .get_extent = get_extent,
                .extent_locked = 1,
+               .sync_io = mode == WB_SYNC_ALL,
        };
        struct writeback_control wbc_writepages = {
                .bdi            = inode->i_mapping->backing_dev_info,
@@ -2540,8 +2581,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
                start += PAGE_CACHE_SIZE;
        }
 
-       if (epd.bio)
-               submit_one_bio(WRITE, epd.bio, 0, 0);
+       flush_epd_write_bio(&epd);
        return ret;
 }
 
@@ -2556,13 +2596,13 @@ int extent_writepages(struct extent_io_tree *tree,
                .tree = tree,
                .get_extent = get_extent,
                .extent_locked = 0,
+               .sync_io = wbc->sync_mode == WB_SYNC_ALL,
        };
 
        ret = extent_write_cache_pages(tree, mapping, wbc,
                                       __extent_writepage, &epd,
                                       flush_write_bio);
-       if (epd.bio)
-               submit_one_bio(WRITE, epd.bio, 0, 0);
+       flush_epd_write_bio(&epd);
        return ret;
 }
 
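
extent_page_data gains a sync_io bit so wbc->sync_mode is sampled once and every bio submission point picks its write flags from the same flag. A reduced sketch of that dispatch, with stand-in types and values:

/* illustrative values and types, not the kernel's */
enum { WRITE = 1, WRITE_SYNC = 3 };

struct bio_sketch;
void submit_one_bio_sketch(int rw, struct bio_sketch *bio);

struct epd_sketch {
        struct bio_sketch *bio;         /* bio being batched across pages */
        unsigned int extent_locked:1;
        unsigned int sync_io:1;         /* wbc->sync_mode == WB_SYNC_ALL */
};

static void flush_sketch(struct epd_sketch *epd)
{
        if (!epd->bio)
                return;
        /* WRITE_SYNC tells the block layer a waiter sits behind this IO */
        submit_one_bio_sketch(epd->sync_io ? WRITE_SYNC : WRITE, epd->bio);
        epd->bio = NULL;
}
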
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9c9fb46..482f8db 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -830,7 +830,7 @@ again:
 
                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
                BUG_ON(ret);
-               goto done;
+               goto release;
        } else if (split == start) {
                if (locked_end < extent_end) {
                        ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
@@ -926,6 +926,8 @@ again:
        }
 done:
        btrfs_mark_buffer_dirty(leaf);
+
+release:
        btrfs_release_path(root, path);
        if (split_end && split == start) {
                split = end;
@@ -1131,7 +1133,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                if (will_write) {
                        btrfs_fdatawrite_range(inode->i_mapping, pos,
                                               pos + write_bytes - 1,
-                                              WB_SYNC_NONE);
+                                              WB_SYNC_ALL);
                } else {
                        balance_dirty_pages_ratelimited_nr(inode->i_mapping,
                                                           num_pages);
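
The new release: label lets the item-deletion path drop the btrfs_path without re-dirtying a leaf it is finished with, while the normal path still falls from done: through release:. The layered-label cleanup idiom in miniature, with hypothetical helpers standing in for the btrfs calls:

int del_items(void);
void rewrite_item(void);
void mark_buffer_dirty_sketch(void);
void release_path_sketch(void);

static int update_item_sketch(int delete_whole_item)
{
        int ret = 0;

        if (delete_whole_item) {
                ret = del_items();
                goto release;                   /* skip the dirty step */
        }
        rewrite_item();
        mark_buffer_dirty_sketch();             /* "done:" in the real code */
release:
        release_path_sketch();                  /* every exit drops the path */
        return ret;
}
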
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0d1dd4..65219f6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4970,10 +4970,10 @@ out_fail:
        return err;
 }
 
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+static int prealloc_file_range(struct btrfs_trans_handle *trans,
+                              struct inode *inode, u64 start, u64 end,
                               u64 alloc_hint, int mode)
 {
-       struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 alloc_size;
@@ -4981,10 +4981,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
        u64 num_bytes = end - start;
        int ret = 0;
 
-       trans = btrfs_join_transaction(root, 1);
-       BUG_ON(!trans);
-       btrfs_set_trans_block_group(trans, inode);
-
        while (num_bytes > 0) {
                alloc_size = min(num_bytes, root->fs_info->max_extent);
                ret = btrfs_reserve_extent(trans, root, alloc_size,
@@ -5015,7 +5011,6 @@ out:
                BUG_ON(ret);
        }
 
-       btrfs_end_transaction(trans, root);
        return ret;
 }
 
@@ -5029,11 +5024,18 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        u64 alloc_hint = 0;
        u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
        struct extent_map *em;
+       struct btrfs_trans_handle *trans;
        int ret;
 
        alloc_start = offset & ~mask;
        alloc_end =  (offset + len + mask) & ~mask;
 
+       /*
+        * wait for ordered IO before we have any locks.  We'll loop again
+        * below with the locks held.
+        */
+       btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
        mutex_lock(&inode->i_mutex);
        if (alloc_start > inode->i_size) {
                ret = btrfs_cont_expand(inode, alloc_start);
@@ -5043,6 +5045,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 
        while (1) {
                struct btrfs_ordered_extent *ordered;
+
+               trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
+               if (!trans) {
+                       ret = -EIO;
+                       goto out;
+               }
+
+               /* the extent lock is ordered inside the running
+                * transaction
+                */
                lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
                            alloc_end - 1, GFP_NOFS);
                ordered = btrfs_lookup_first_ordered_extent(inode,
@@ -5053,6 +5065,12 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                        btrfs_put_ordered_extent(ordered);
                        unlock_extent(&BTRFS_I(inode)->io_tree,
                                      alloc_start, alloc_end - 1, GFP_NOFS);
+                       btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+
+                       /*
+                        * we can't wait on the range with the transaction
+                        * running or with the extent lock held
+                        */
                        btrfs_wait_ordered_range(inode, alloc_start,
                                                 alloc_end - alloc_start);
                } else {
@@ -5070,7 +5088,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                last_byte = min(extent_map_end(em), alloc_end);
                last_byte = (last_byte + mask) & ~mask;
                if (em->block_start == EXTENT_MAP_HOLE) {
-                       ret = prealloc_file_range(inode, cur_offset,
+                       ret = prealloc_file_range(trans, inode, cur_offset,
                                        last_byte, alloc_hint, mode);
                        if (ret < 0) {
                                free_extent_map(em);
@@ -5089,6 +5107,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        }
        unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
                      GFP_NOFS);
+
+       btrfs_end_transaction(trans, BTRFS_I(inode)->root);
 out:
        mutex_unlock(&inode->i_mutex);
        return ret;
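
The fallocate rework pins down an ordering: ordered IO is waited on with nothing held, the transaction is started before the extent lock is taken, and both are backed out in reverse before waiting again. A reduced sketch of that loop, with stub helpers rather than the kernel API:

void wait_ordered_range(void);
void start_transaction(void);
void end_transaction(void);
void lock_extent_range(void);
void unlock_extent_range(void);
void prealloc_ranges(void);
int ordered_io_pending(void);

void fallocate_sketch(void)
{
        wait_ordered_range();           /* 1: wait with nothing held */

        for (;;) {
                start_transaction();    /* 2: transaction first ... */
                lock_extent_range();    /* 3: ... extent lock inside it */

                if (!ordered_io_pending())
                        break;

                /* can't wait while holding either: back out in reverse */
                unlock_extent_range();
                end_transaction();
                wait_ordered_range();
        }

        prealloc_ranges();              /* uses the running transaction */
        unlock_extent_range();
        end_transaction();
}
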
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 53c87b1..d6f0806 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -489,7 +489,7 @@ again:
        /* start IO across the range first to instantiate any delalloc
         * extents
         */
-       btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+       btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
 
        /* The compression code will leave pages locked but return from
         * writepage without setting the page writeback.  Starting again
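
The WB_SYNC_ALL switch matters because with WB_SYNC_NONE the writeback loop may skip pages it cannot immediately lock, while WB_SYNC_ALL must issue and wait on every dirty page in the range, so btrfs_wait_ordered_range() really has IO to wait on. Roughly the writeback_control this one-liner changes (illustrative fragment; names as in the hunk above):

struct writeback_control wbc = {
        .sync_mode   = WB_SYNC_ALL,     /* was WB_SYNC_NONE */
        .range_start = start,
        .range_end   = orig_end,
};
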
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e0913e4..e53835b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
        return NULL;
 }
 
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+                       struct bio *head, struct bio *tail)
+{
+
+       struct bio *old_head;
+
+       old_head = pending_bios->head;
+       pending_bios->head = head;
+       if (pending_bios->tail)
+               tail->bi_next = old_head;
+       else
+               pending_bios->tail = tail;
+}
+
 /*
  * we try to collect pending bios for a device so we don't get a large
  * number of procs sending bios down to the same device.  This greatly
@@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
+       struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
-       unsigned long num_run = 0;
+       unsigned long num_run;
+       unsigned long num_sync_run;
        unsigned long limit;
        unsigned long last_waited = 0;
 
@@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;
 
+       /* we want to make sure that every time we switch from the sync
+        * list to the normal list, we unplug
+        */
+       num_sync_run = 0;
+
 loop:
        spin_lock(&device->io_lock);
+       num_run = 0;
 
 loop_lock:
+
        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
-       pending = device->pending_bios;
-       tail = device->pending_bio_tail;
+       if (device->pending_sync_bios.head)
+               pending_bios = &device->pending_sync_bios;
+       else
+               pending_bios = &device->pending_bios;
+
+       pending = pending_bios->head;
+       tail = pending_bios->tail;
        WARN_ON(pending && !tail);
-       device->pending_bios = NULL;
-       device->pending_bio_tail = NULL;
 
        /*
         * if pending was null this time around, no bios need processing
@@ -176,16 +202,41 @@ loop_lock:
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
-       if (pending) {
-               again = 1;
-               device->running_pending = 1;
-       } else {
+       if (device->pending_sync_bios.head == NULL &&
+           device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
+       } else {
+               again = 1;
+               device->running_pending = 1;
        }
+
+       pending_bios->head = NULL;
+       pending_bios->tail = NULL;
+
        spin_unlock(&device->io_lock);
 
+       /*
+        * if we're doing the regular priority list, make sure we unplug
+        * for any high prio bios we've sent down
+        */
+       if (pending_bios == &device->pending_bios && num_sync_run > 0) {
+               num_sync_run = 0;
+               blk_run_backing_dev(bdi, NULL);
+       }
+
        while (pending) {
+
+               rmb();
+               if (pending_bios != &device->pending_sync_bios &&
+                   device->pending_sync_bios.head &&
+                   num_run > 16) {
+                       cond_resched();
+                       spin_lock(&device->io_lock);
+                       requeue_list(pending_bios, pending, tail);
+                       goto loop_lock;
+               }
+
                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;
@@ -196,10 +247,18 @@ loop_lock:
                        wake_up(&fs_info->async_submit_wait);
 
                BUG_ON(atomic_read(&cur->bi_cnt) == 0);
-               bio_get(cur);
                submit_bio(cur->bi_rw, cur);
-               bio_put(cur);
                num_run++;
+               if (bio_sync(cur))
+                       num_sync_run++;
+
+               if (need_resched()) {
+                       if (num_sync_run) {
+                               blk_run_backing_dev(bdi, NULL);
+                               num_sync_run = 0;
+                       }
+                       cond_resched();
+               }
 
                /*
                 * we made progress, there is more work to do and the bdi
@@ -208,7 +267,6 @@ loop_lock:
                 */
                if (pending && bdi_write_congested(bdi) && num_run > 16 &&
                    fs_info->fs_devices->open_devices > 1) {
-                       struct bio *old_head;
                        struct io_context *ioc;
 
                        ioc = current->io_context;
@@ -233,17 +291,17 @@ loop_lock:
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
+                               if (need_resched()) {
+                                       if (num_sync_run) {
+                                               blk_run_backing_dev(bdi, NULL);
+                                               num_sync_run = 0;
+                                       }
+                                       cond_resched();
+                               }
                                continue;
                        }
                        spin_lock(&device->io_lock);
-
-                       old_head = device->pending_bios;
-                       device->pending_bios = pending;
-                       if (device->pending_bio_tail)
-                               tail->bi_next = old_head;
-                       else
-                               device->pending_bio_tail = tail;
-
+                       requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;
 
                        spin_unlock(&device->io_lock);
@@ -251,11 +309,18 @@ loop_lock:
                        goto done;
                }
        }
+
+       if (num_sync_run) {
+               num_sync_run = 0;
+               blk_run_backing_dev(bdi, NULL);
+       }
+
+       cond_resched();
        if (again)
                goto loop;
 
        spin_lock(&device->io_lock);
-       if (device->pending_bios)
+       if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);
 
@@ -2497,7 +2562,7 @@ again:
                        max_errors = 1;
                }
        }
-       if (multi_ret && rw == WRITE &&
+       if (multi_ret && (rw & (1 << BIO_RW)) &&
            stripes_allocated < stripes_required) {
                stripes_allocated = map->num_stripes;
                free_extent_map(em);
@@ -2762,6 +2827,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
                                 int rw, struct bio *bio)
 {
        int should_queue = 1;
+       struct btrfs_pending_bios *pending_bios;
 
        /* don't bother with additional async steps for reads, right now */
        if (!(rw & (1 << BIO_RW))) {
@@ -2783,13 +2849,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
        bio->bi_rw |= rw;
 
        spin_lock(&device->io_lock);
+       if (bio_sync(bio))
+               pending_bios = &device->pending_sync_bios;
+       else
+               pending_bios = &device->pending_bios;
 
-       if (device->pending_bio_tail)
-               device->pending_bio_tail->bi_next = bio;
+       if (pending_bios->tail)
+               pending_bios->tail->bi_next = bio;
 
-       device->pending_bio_tail = bio;
-       if (!device->pending_bios)
-               device->pending_bios = bio;
+       pending_bios->tail = bio;
+       if (!pending_bios->head)
+               pending_bios->head = bio;
        if (device->running_pending)
                should_queue = 0;
 
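
requeue_list() and schedule_bio() above operate on a singly linked queue kept as a head/tail pair; the kernel chains through bio->bi_next and holds device->io_lock around both operations. A minimal sketch of the pair on a stand-in node type:

#include <stddef.h>

struct node {
        struct node *next;
};

struct pending_queue {
        struct node *head;
        struct node *tail;
};

/* append one element, as schedule_bio() does */
static void enqueue(struct pending_queue *q, struct node *n)
{
        n->next = NULL;
        if (q->tail)
                q->tail->next = n;
        q->tail = n;
        if (!q->head)
                q->head = n;
}

/* splice a detached chain [head..tail] back onto the front, as
 * requeue_list() does when the device is congested */
static void requeue(struct pending_queue *q, struct node *head,
                    struct node *tail)
{
        struct node *old_head = q->head;

        q->head = head;
        if (q->tail)
                tail->next = old_head;  /* old work runs before new arrivals */
        else
                q->tail = tail;         /* queue was empty: chain is all of it */
}
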
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2185de7..5836327 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
 #include "async-thread.h"
 
 struct buffer_head;
+struct btrfs_pending_bios {
+       struct bio *head;
+       struct bio *tail;
+};
+
 struct btrfs_device {
        struct list_head dev_list;
        struct list_head dev_alloc_list;
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_root *dev_root;
-       struct bio *pending_bios;
-       struct bio *pending_bio_tail;
+
+       /* regular prio bios */
+       struct btrfs_pending_bios pending_bios;
+       /* WRITE_SYNC bios */
+       struct btrfs_pending_bios pending_sync_bios;
+
        int running_pending;
        u64 generation;
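
With the head/tail pair factored into struct btrfs_pending_bios, each device carries two queues and schedule_bio() (in the fs/btrfs/volumes.c hunk above) picks one by the bio's sync flag, mirroring the worker-side priority split:

if (bio_sync(bio))
        pending_bios = &device->pending_sync_bios;
else
        pending_bios = &device->pending_bios;

run_scheduled_bios() then always drains pending_sync_bios first, unplugging the backing device whenever it switches back to the regular list.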