Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 21 Apr 2009 21:12:58 +0000 (14:12 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 21 Apr 2009 21:12:58 +0000 (14:12 -0700)
* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
  Btrfs: fix btrfs fallocate oops and deadlock
  Btrfs: use the right node in reada_for_balance
  Btrfs: fix oops on page->mapping->host during writepage
  Btrfs: add a priority queue to the async thread helpers
  Btrfs: use WRITE_SYNC for synchronous writes

fs/btrfs/async-thread.c
fs/btrfs/async-thread.h
fs/btrfs/ctree.c
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/ordered-data.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 51bfdfc..502c3d6 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -25,6 +25,7 @@
 #define WORK_QUEUED_BIT 0
 #define WORK_DONE_BIT 1
 #define WORK_ORDER_DONE_BIT 2
+#define WORK_HIGH_PRIO_BIT 3
 
 /*
  * container for the kthread task pointer and the list of pending work
@@ -36,6 +37,7 @@ struct btrfs_worker_thread {
 
        /* list of struct btrfs_work that are waiting for service */
        struct list_head pending;
+       struct list_head prio_pending;
 
        /* list of worker threads from struct btrfs_workers */
        struct list_head worker_list;
@@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 
        spin_lock_irqsave(&workers->lock, flags);
 
-       while (!list_empty(&workers->order_list)) {
-               work = list_entry(workers->order_list.next,
-                                 struct btrfs_work, order_list);
-
+       while (1) {
+               if (!list_empty(&workers->prio_order_list)) {
+                       work = list_entry(workers->prio_order_list.next,
+                                         struct btrfs_work, order_list);
+               } else if (!list_empty(&workers->order_list)) {
+                       work = list_entry(workers->order_list.next,
+                                         struct btrfs_work, order_list);
+               } else {
+                       break;
+               }
                if (!test_bit(WORK_DONE_BIT, &work->flags))
                        break;
 
@@ -143,8 +151,14 @@ static int worker_loop(void *arg)
        do {
                spin_lock_irq(&worker->lock);
 again_locked:
-               while (!list_empty(&worker->pending)) {
-                       cur = worker->pending.next;
+               while (1) {
+                       if (!list_empty(&worker->prio_pending))
+                               cur = worker->prio_pending.next;
+                       else if (!list_empty(&worker->pending))
+                               cur = worker->pending.next;
+                       else
+                               break;
+
                        work = list_entry(cur, struct btrfs_work, list);
                        list_del(&work->list);
                        clear_bit(WORK_QUEUED_BIT, &work->flags);
@@ -163,7 +177,6 @@ again_locked:
 
                        spin_lock_irq(&worker->lock);
                        check_idle_worker(worker);
-
                }
                if (freezing(current)) {
                        worker->working = 0;
@@ -178,7 +191,8 @@ again_locked:
                                 * jump_in?
                                 */
                                smp_mb();
-                               if (!list_empty(&worker->pending))
+                               if (!list_empty(&worker->pending) ||
+                                   !list_empty(&worker->prio_pending))
                                        continue;
 
                                /*
@@ -191,7 +205,8 @@ again_locked:
                                 */
                                schedule_timeout(1);
                                smp_mb();
-                               if (!list_empty(&worker->pending))
+                               if (!list_empty(&worker->pending) ||
+                                   !list_empty(&worker->prio_pending))
                                        continue;
 
                                if (kthread_should_stop())
@@ -200,7 +215,8 @@ again_locked:
                                /* still no more work?, sleep for real */
                                spin_lock_irq(&worker->lock);
                                set_current_state(TASK_INTERRUPTIBLE);
-                               if (!list_empty(&worker->pending))
+                               if (!list_empty(&worker->pending) ||
+                                   !list_empty(&worker->prio_pending))
                                        goto again_locked;
 
                                /*
@@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
        INIT_LIST_HEAD(&workers->worker_list);
        INIT_LIST_HEAD(&workers->idle_list);
        INIT_LIST_HEAD(&workers->order_list);
+       INIT_LIST_HEAD(&workers->prio_order_list);
        spin_lock_init(&workers->lock);
        workers->max_workers = max;
        workers->idle_thresh = 32;
@@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
                }
 
                INIT_LIST_HEAD(&worker->pending);
+               INIT_LIST_HEAD(&worker->prio_pending);
                INIT_LIST_HEAD(&worker->worker_list);
                spin_lock_init(&worker->lock);
                atomic_set(&worker->num_pending, 0);
@@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work)
                goto out;
 
        spin_lock_irqsave(&worker->lock, flags);
-       list_add_tail(&work->list, &worker->pending);
+       if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+               list_add_tail(&work->list, &worker->prio_pending);
+       else
+               list_add_tail(&work->list, &worker->pending);
        atomic_inc(&worker->num_pending);
 
        /* by definition we're busy, take ourselves off the idle
@@ -422,6 +443,11 @@ out:
        return 0;
 }
 
+void btrfs_set_work_high_prio(struct btrfs_work *work)
+{
+       set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
+
 /*
  * places a struct btrfs_work into the pending queue of one of the kthreads
  */
@@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
        worker = find_worker(workers);
        if (workers->ordered) {
                spin_lock_irqsave(&workers->lock, flags);
-               list_add_tail(&work->order_list, &workers->order_list);
+               if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
+                       list_add_tail(&work->order_list,
+                                     &workers->prio_order_list);
+               } else {
+                       list_add_tail(&work->order_list, &workers->order_list);
+               }
                spin_unlock_irqrestore(&workers->lock, flags);
        } else {
                INIT_LIST_HEAD(&work->order_list);
@@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
        spin_lock_irqsave(&worker->lock, flags);
 
-       list_add_tail(&work->list, &worker->pending);
+       if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+               list_add_tail(&work->list, &worker->prio_pending);
+       else
+               list_add_tail(&work->list, &worker->pending);
        atomic_inc(&worker->num_pending);
        check_busy_worker(worker);
 
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 31be4ed..1b511c1 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -85,6 +85,7 @@ struct btrfs_workers {
         * of work items waiting for completion
         */
        struct list_head order_list;
+       struct list_head prio_order_list;
 
        /* lock for finding the next worker thread to queue on */
        spinlock_t lock;
@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
 int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
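
The new export is meant to be called on a work item before it is queued; the fs/btrfs/disk-io.c hunk below does exactly that for WRITE_SYNC bios:

if (rw & (1 << BIO_RW_SYNCIO))
        btrfs_set_work_high_prio(&async->work);

btrfs_queue_worker(&fs_info->workers, &async->work);
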
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e5b2533..a99f1c2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1325,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root,
        int ret = 0;
        int blocksize;
 
-       parent = path->nodes[level - 1];
+       parent = path->nodes[level + 1];
        if (!parent)
                return 0;
 
        nritems = btrfs_header_nritems(parent);
-       slot = path->slots[level];
+       slot = path->slots[level + 1];
        blocksize = btrfs_level_size(root, level);
 
        if (slot > 0) {
@@ -1341,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
                        block1 = 0;
                free_extent_buffer(eb);
        }
-       if (slot < nritems) {
+       if (slot + 1 < nritems) {
                block2 = btrfs_node_blockptr(parent, slot + 1);
                gen = btrfs_node_ptr_generation(parent, slot + 1);
                eb = btrfs_find_tree_block(root, block2, blocksize);
@@ -1351,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
        }
        if (block1 || block2) {
                ret = -EAGAIN;
+
+               /* release the whole path */
                btrfs_release_path(root, path);
+
+               /* read the blocks */
                if (block1)
                        readahead_tree_block(root, block1, blocksize, 0);
                if (block2)
@@ -1361,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
                        eb = read_tree_block(root, block1, blocksize, 0);
                        free_extent_buffer(eb);
                }
-               if (block1) {
+               if (block2) {
                        eb = read_tree_block(root, block2, blocksize, 0);
                        free_extent_buffer(eb);
                }
@@ -1481,12 +1485,15 @@ read_block_for_search(struct btrfs_trans_handle *trans,
         * of the btree by dropping locks before
         * we read.
         */
-       btrfs_release_path(NULL, p);
+       btrfs_unlock_up_safe(p, level + 1);
+       btrfs_set_path_blocking(p);
+
        if (tmp)
                free_extent_buffer(tmp);
        if (p->reada)
                reada_for_search(root, p, level, slot, key->objectid);
 
+       btrfs_release_path(NULL, p);
        tmp = read_tree_block(root, blocknr, blocksize, gen);
        if (tmp)
                free_extent_buffer(tmp);
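
The reada_for_balance() hunks fix the parent lookup: in a btrfs_path, nodes[0] is the leaf and nodes[i + 1] is the parent of nodes[i], so the parent of the node being balanced at `level` is nodes[level + 1], not nodes[level - 1] (the slot index moves the same way, and a block1/block2 copy-paste slip gets fixed as well). A reduced sketch of that convention, trimmed to what the example needs:

#include <stddef.h>

#define BTRFS_MAX_LEVEL 8

struct extent_buffer;

struct btrfs_path_sketch {
        /* [0] is the leaf; higher indices are higher tree levels */
        struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
        int slots[BTRFS_MAX_LEVEL];     /* slot within each node */
};

static struct extent_buffer *parent_of(struct btrfs_path_sketch *p, int level)
{
        if (level + 1 >= BTRFS_MAX_LEVEL)
                return NULL;            /* `level` is already the root */
        return p->nodes[level + 1];
}
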
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92caa80..a6b8374 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -579,6 +579,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
        async->bio_flags = bio_flags;
 
        atomic_inc(&fs_info->nr_async_submits);
+
+       if (rw & (1 << BIO_RW_SYNCIO))
+               btrfs_set_work_high_prio(&async->work);
+
        btrfs_queue_worker(&fs_info->workers, &async->work);
 #if 0
        int limit = btrfs_async_submit_limit(fs_info);
@@ -656,6 +660,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
                                     mirror_num, 0);
        }
+
        /*
         * kthread helpers are used to submit writes so that checksumming
         * can happen in parallel across all CPUs
@@ -2095,10 +2100,10 @@ static int write_dev_supers(struct btrfs_device *device,
                                device->barriers = 0;
                                get_bh(bh);
                                lock_buffer(bh);
-                               ret = submit_bh(WRITE, bh);
+                               ret = submit_bh(WRITE_SYNC, bh);
                        }
                } else {
-                       ret = submit_bh(WRITE, bh);
+                       ret = submit_bh(WRITE_SYNC, bh);
                }
 
                if (!ret && wait) {
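
WRITE_SYNC-class requests carry the BIO_RW_SYNCIO bit in this kernel generation, so the btrfs_wq_submit_bio() test above promotes exactly the bios a caller is about to wait on. A sketch of the check with illustrative bit positions (the real values live in linux/bio.h and linux/fs.h):

/* stand-in bit positions, not the kernel's */
enum { BIO_RW = 0, BIO_RW_SYNCIO = 4, BIO_RW_UNPLUG = 5 };

#define WRITE           (1 << BIO_RW)
#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO))
#define WRITE_SYNC      (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))

static int is_sync_write(int rw)
{
        /* mirrors the btrfs_wq_submit_bio() check above: true for
         * WRITE_SYNC and WRITE_SYNC_PLUG, false for plain WRITE */
        return (rw & (1 << BIO_RW_SYNCIO)) != 0;
}
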
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eb2bee8..05a1c42 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -50,7 +50,10 @@ struct extent_page_data {
        /* tells writepage not to lock the state bits for this range
         * it still does the unlocking
         */
-       int extent_locked;
+       unsigned int extent_locked:1;
+
+       /* tells the submit_bio code to use a WRITE_SYNC */
+       unsigned int sync_io:1;
 };
 
 int __init extent_io_init(void)
@@ -2101,6 +2104,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
        return ret;
 }
 
+static noinline void update_nr_written(struct page *page,
+                                     struct writeback_control *wbc,
+                                     unsigned long nr_written)
+{
+       wbc->nr_to_write -= nr_written;
+       if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+           wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+               page->mapping->writeback_index = page->index + nr_written;
+}
+
 /*
  * the writepage semantics are similar to regular writepage.  extent
  * records are inserted to lock ranges in the tree, and as dirty areas
@@ -2136,8 +2149,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        u64 delalloc_end;
        int page_started;
        int compressed;
+       int write_flags;
        unsigned long nr_written = 0;
 
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               write_flags = WRITE_SYNC_PLUG;
+       else
+               write_flags = WRITE;
+
        WARN_ON(!PageLocked(page));
        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
        if (page->index > end_index ||
@@ -2164,6 +2183,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        delalloc_end = 0;
        page_started = 0;
        if (!epd->extent_locked) {
+               /*
+                * make sure the wbc mapping index is at least updated
+                * to this page.
+                */
+               update_nr_written(page, wbc, 0);
+
                while (delalloc_end < page_end) {
                        nr_delalloc = find_lock_delalloc_range(inode, tree,
                                                       page,
@@ -2185,7 +2210,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                 */
                if (page_started) {
                        ret = 0;
-                       goto update_nr_written;
+                       /*
+                        * we've unlocked the page, so we can't update
+                        * the mapping's writeback index, just update
+                        * nr_to_write.
+                        */
+                       wbc->nr_to_write -= nr_written;
+                       goto done_unlocked;
                }
        }
        lock_extent(tree, start, page_end, GFP_NOFS);
@@ -2198,13 +2229,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                if (ret == -EAGAIN) {
                        unlock_extent(tree, start, page_end, GFP_NOFS);
                        redirty_page_for_writepage(wbc, page);
+                       update_nr_written(page, wbc, nr_written);
                        unlock_page(page);
                        ret = 0;
-                       goto update_nr_written;
+                       goto done_unlocked;
                }
        }
 
-       nr_written++;
+       /*
+        * we don't want to touch the inode after unlocking the page,
+        * so we update the mapping writeback index now
+        */
+       update_nr_written(page, wbc, nr_written + 1);
 
        end = page_end;
        if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
@@ -2314,9 +2350,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                                       (unsigned long long)end);
                        }
 
-                       ret = submit_extent_page(WRITE, tree, page, sector,
-                                                iosize, pg_offset, bdev,
-                                                &epd->bio, max_nr,
+                       ret = submit_extent_page(write_flags, tree, page,
+                                                sector, iosize, pg_offset,
+                                                bdev, &epd->bio, max_nr,
                                                 end_bio_extent_writepage,
                                                 0, 0, 0);
                        if (ret)
@@ -2336,11 +2372,8 @@ done:
                unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
        unlock_page(page);
 
-update_nr_written:
-       wbc->nr_to_write -= nr_written;
-       if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
-           wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
-               page->mapping->writeback_index = page->index + nr_written;
+done_unlocked:
+
        return 0;
 }
 
@@ -2460,15 +2493,23 @@ retry:
        return ret;
 }
 
-static noinline void flush_write_bio(void *data)
+static void flush_epd_write_bio(struct extent_page_data *epd)
 {
-       struct extent_page_data *epd = data;
        if (epd->bio) {
-               submit_one_bio(WRITE, epd->bio, 0, 0);
+               if (epd->sync_io)
+                       submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
+               else
+                       submit_one_bio(WRITE, epd->bio, 0, 0);
                epd->bio = NULL;
        }
 }
 
+static noinline void flush_write_bio(void *data)
+{
+       struct extent_page_data *epd = data;
+       flush_epd_write_bio(epd);
+}
+
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                          get_extent_t *get_extent,
                          struct writeback_control *wbc)
@@ -2480,23 +2521,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                .tree = tree,
                .get_extent = get_extent,
                .extent_locked = 0,
+               .sync_io = wbc->sync_mode == WB_SYNC_ALL,
        };
        struct writeback_control wbc_writepages = {
                .bdi            = wbc->bdi,
-               .sync_mode      = WB_SYNC_NONE,
+               .sync_mode      = wbc->sync_mode,
                .older_than_this = NULL,
                .nr_to_write    = 64,
                .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
                .range_end      = (loff_t)-1,
        };
 
-
        ret = __extent_writepage(page, wbc, &epd);
 
        extent_write_cache_pages(tree, mapping, &wbc_writepages,
                                 __extent_writepage, &epd, flush_write_bio);
-       if (epd.bio)
-               submit_one_bio(WRITE, epd.bio, 0, 0);
+       flush_epd_write_bio(&epd);
        return ret;
 }
 
@@ -2515,6 +2555,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
                .tree = tree,
                .get_extent = get_extent,
                .extent_locked = 1,
+               .sync_io = mode == WB_SYNC_ALL,
        };
        struct writeback_control wbc_writepages = {
                .bdi            = inode->i_mapping->backing_dev_info,
@@ -2540,8 +2581,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
                start += PAGE_CACHE_SIZE;
        }
 
-       if (epd.bio)
-               submit_one_bio(WRITE, epd.bio, 0, 0);
+       flush_epd_write_bio(&epd);
        return ret;
 }
 
@@ -2556,13 +2596,13 @@ int extent_writepages(struct extent_io_tree *tree,
                .tree = tree,
                .get_extent = get_extent,
                .extent_locked = 0,
+               .sync_io = wbc->sync_mode == WB_SYNC_ALL,
        };
 
        ret = extent_write_cache_pages(tree, mapping, wbc,
                                       __extent_writepage, &epd,
                                       flush_write_bio);
-       if (epd.bio)
-               submit_one_bio(WRITE, epd.bio, 0, 0);
+       flush_epd_write_bio(&epd);
        return ret;
 }
 
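
extent_page_data gains a sync_io bit so wbc->sync_mode is sampled once and every bio submission point picks its write flags from the same flag. A reduced sketch of that dispatch, with stand-in types and values:

/* illustrative values and types, not the kernel's */
enum { WRITE = 1, WRITE_SYNC = 3 };

struct bio_sketch;
void submit_one_bio_sketch(int rw, struct bio_sketch *bio);

struct epd_sketch {
        struct bio_sketch *bio;         /* bio being batched across pages */
        unsigned int extent_locked:1;
        unsigned int sync_io:1;         /* wbc->sync_mode == WB_SYNC_ALL */
};

static void flush_sketch(struct epd_sketch *epd)
{
        if (!epd->bio)
                return;
        /* WRITE_SYNC tells the block layer a waiter sits behind this IO */
        submit_one_bio_sketch(epd->sync_io ? WRITE_SYNC : WRITE, epd->bio);
        epd->bio = NULL;
}
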
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9c9fb46..482f8db 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -830,7 +830,7 @@ again:
 
                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
                BUG_ON(ret);
-               goto done;
+               goto release;
        } else if (split == start) {
                if (locked_end < extent_end) {
                        ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
@@ -926,6 +926,8 @@ again:
        }
 done:
        btrfs_mark_buffer_dirty(leaf);
+
+release:
        btrfs_release_path(root, path);
        if (split_end && split == start) {
                split = end;
@@ -1131,7 +1133,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                if (will_write) {
                        btrfs_fdatawrite_range(inode->i_mapping, pos,
                                               pos + write_bytes - 1,
-                                              WB_SYNC_NONE);
+                                              WB_SYNC_ALL);
                } else {
                        balance_dirty_pages_ratelimited_nr(inode->i_mapping,
                                                           num_pages);
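
The new release: label lets the item-deletion path drop the btrfs_path without re-dirtying a leaf it is finished with, while the normal path still falls from done: through release:. The layered-label cleanup idiom in miniature, with hypothetical helpers standing in for the btrfs calls:

int del_items(void);
void rewrite_item(void);
void mark_buffer_dirty_sketch(void);
void release_path_sketch(void);

static int update_item_sketch(int delete_whole_item)
{
        int ret = 0;

        if (delete_whole_item) {
                ret = del_items();
                goto release;                   /* skip the dirty step */
        }
        rewrite_item();
        mark_buffer_dirty_sketch();             /* "done:" in the real code */
release:
        release_path_sketch();                  /* every exit drops the path */
        return ret;
}
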
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0d1dd4..65219f6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4970,10 +4970,10 @@ out_fail:
        return err;
 }
 
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+static int prealloc_file_range(struct btrfs_trans_handle *trans,
+                              struct inode *inode, u64 start, u64 end,
                               u64 alloc_hint, int mode)
 {
-       struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 alloc_size;
@@ -4981,10 +4981,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
        u64 num_bytes = end - start;
        int ret = 0;
 
-       trans = btrfs_join_transaction(root, 1);
-       BUG_ON(!trans);
-       btrfs_set_trans_block_group(trans, inode);
-
        while (num_bytes > 0) {
                alloc_size = min(num_bytes, root->fs_info->max_extent);
                ret = btrfs_reserve_extent(trans, root, alloc_size,
@@ -5015,7 +5011,6 @@ out:
                BUG_ON(ret);
        }
 
-       btrfs_end_transaction(trans, root);
        return ret;
 }
 
@@ -5029,11 +5024,18 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        u64 alloc_hint = 0;
        u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
        struct extent_map *em;
+       struct btrfs_trans_handle *trans;
        int ret;
 
        alloc_start = offset & ~mask;
        alloc_end =  (offset + len + mask) & ~mask;
 
+       /*
+        * wait for ordered IO before we have any locks.  We'll loop again
+        * below with the locks held.
+        */
+       btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
        mutex_lock(&inode->i_mutex);
        if (alloc_start > inode->i_size) {
                ret = btrfs_cont_expand(inode, alloc_start);
@@ -5043,6 +5045,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 
        while (1) {
                struct btrfs_ordered_extent *ordered;
+
+               trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
+               if (!trans) {
+                       ret = -EIO;
+                       goto out;
+               }
+
+               /* the extent lock is ordered inside the running
+                * transaction
+                */
                lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
                            alloc_end - 1, GFP_NOFS);
                ordered = btrfs_lookup_first_ordered_extent(inode,
@@ -5053,6 +5065,12 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                        btrfs_put_ordered_extent(ordered);
                        unlock_extent(&BTRFS_I(inode)->io_tree,
                                      alloc_start, alloc_end - 1, GFP_NOFS);
+                       btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+
+                       /*
+                        * we can't wait on the range with the transaction
+                        * running or with the extent lock held
+                        */
                        btrfs_wait_ordered_range(inode, alloc_start,
                                                 alloc_end - alloc_start);
                } else {
@@ -5070,7 +5088,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                last_byte = min(extent_map_end(em), alloc_end);
                last_byte = (last_byte + mask) & ~mask;
                if (em->block_start == EXTENT_MAP_HOLE) {
-                       ret = prealloc_file_range(inode, cur_offset,
+                       ret = prealloc_file_range(trans, inode, cur_offset,
                                        last_byte, alloc_hint, mode);
                        if (ret < 0) {
                                free_extent_map(em);
@@ -5089,6 +5107,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        }
        unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
                      GFP_NOFS);
+
+       btrfs_end_transaction(trans, BTRFS_I(inode)->root);
 out:
        mutex_unlock(&inode->i_mutex);
        return ret;
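
The fallocate rework pins down an ordering: ordered IO is waited on with nothing held, the transaction is started before the extent lock is taken, and both are backed out in reverse before waiting again. A reduced sketch of that loop, with stub helpers rather than the kernel API:

void wait_ordered_range(void);
void start_transaction(void);
void end_transaction(void);
void lock_extent_range(void);
void unlock_extent_range(void);
void prealloc_ranges(void);
int ordered_io_pending(void);

void fallocate_sketch(void)
{
        wait_ordered_range();           /* 1: wait with nothing held */

        for (;;) {
                start_transaction();    /* 2: transaction first ... */
                lock_extent_range();    /* 3: ... extent lock inside it */

                if (!ordered_io_pending())
                        break;

                /* can't wait while holding either: back out in reverse */
                unlock_extent_range();
                end_transaction();
                wait_ordered_range();
        }

        prealloc_ranges();              /* uses the running transaction */
        unlock_extent_range();
        end_transaction();
}
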
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 53c87b1..d6f0806 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -489,7 +489,7 @@ again:
        /* start IO across the range first to instantiate any delalloc
         * extents
         */
-       btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+       btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
 
        /* The compression code will leave pages locked but return from
         * writepage without setting the page writeback.  Starting again
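
The WB_SYNC_ALL switch matters because with WB_SYNC_NONE the writeback loop may skip pages it cannot immediately lock, while WB_SYNC_ALL must issue and wait on every dirty page in the range, so btrfs_wait_ordered_range() really has IO to wait on. Roughly the writeback_control this one-liner changes (illustrative fragment; names as in the hunk above):

struct writeback_control wbc = {
        .sync_mode   = WB_SYNC_ALL,     /* was WB_SYNC_NONE */
        .range_start = start,
        .range_end   = orig_end,
};
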
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e0913e4..e53835b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
        return NULL;
 }
 
+static void requeue_list(struct btrfs_pending_bios *pending_bios,
+                       struct bio *head, struct bio *tail)
+{
+
+       struct bio *old_head;
+
+       old_head = pending_bios->head;
+       pending_bios->head = head;
+       if (pending_bios->tail)
+               tail->bi_next = old_head;
+       else
+               pending_bios->tail = tail;
+}
+
 /*
  * we try to collect pending bios for a device so we don't get a large
  * number of procs sending bios down to the same device.  This greatly
@@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
+       struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
-       unsigned long num_run = 0;
+       unsigned long num_run;
+       unsigned long num_sync_run;
        unsigned long limit;
        unsigned long last_waited = 0;
 
@@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;
 
+       /* we want to make sure that every time we switch from the sync
+        * list to the normal list, we unplug
+        */
+       num_sync_run = 0;
+
 loop:
        spin_lock(&device->io_lock);
+       num_run = 0;
 
 loop_lock:
+
        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
-       pending = device->pending_bios;
-       tail = device->pending_bio_tail;
+       if (device->pending_sync_bios.head)
+               pending_bios = &device->pending_sync_bios;
+       else
+               pending_bios = &device->pending_bios;
+
+       pending = pending_bios->head;
+       tail = pending_bios->tail;
        WARN_ON(pending && !tail);
-       device->pending_bios = NULL;
-       device->pending_bio_tail = NULL;
 
        /*
         * if pending was null this time around, no bios need processing
@@ -176,16 +202,41 @@ loop_lock:
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
-       if (pending) {
-               again = 1;
-               device->running_pending = 1;
-       } else {
+       if (device->pending_sync_bios.head == NULL &&
+           device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
+       } else {
+               again = 1;
+               device->running_pending = 1;
        }
+
+       pending_bios->head = NULL;
+       pending_bios->tail = NULL;
+
        spin_unlock(&device->io_lock);
 
+       /*
+        * if we're doing the regular priority list, make sure we unplug
+        * for any high prio bios we've sent down
+        */
+       if (pending_bios == &device->pending_bios && num_sync_run > 0) {
+               num_sync_run = 0;
+               blk_run_backing_dev(bdi, NULL);
+       }
+
        while (pending) {
+
+               rmb();
+               if (pending_bios != &device->pending_sync_bios &&
+                   device->pending_sync_bios.head &&
+                   num_run > 16) {
+                       cond_resched();
+                       spin_lock(&device->io_lock);
+                       requeue_list(pending_bios, pending, tail);
+                       goto loop_lock;
+               }
+
                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;
@@ -196,10 +247,18 @@ loop_lock:
                        wake_up(&fs_info->async_submit_wait);
 
                BUG_ON(atomic_read(&cur->bi_cnt) == 0);
-               bio_get(cur);
                submit_bio(cur->bi_rw, cur);
-               bio_put(cur);
                num_run++;
+               if (bio_sync(cur))
+                       num_sync_run++;
+
+               if (need_resched()) {
+                       if (num_sync_run) {
+                               blk_run_backing_dev(bdi, NULL);
+                               num_sync_run = 0;
+                       }
+                       cond_resched();
+               }
 
                /*
                 * we made progress, there is more work to do and the bdi
@@ -208,7 +267,6 @@ loop_lock:
                 */
                if (pending && bdi_write_congested(bdi) && num_run > 16 &&
                    fs_info->fs_devices->open_devices > 1) {
-                       struct bio *old_head;
                        struct io_context *ioc;
 
                        ioc = current->io_context;
@@ -233,17 +291,17 @@ loop_lock:
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
+                               if (need_resched()) {
+                                       if (num_sync_run) {
+                                               blk_run_backing_dev(bdi, NULL);
+                                               num_sync_run = 0;
+                                       }
+                                       cond_resched();
+                               }
                                continue;
                        }
                        spin_lock(&device->io_lock);
-
-                       old_head = device->pending_bios;
-                       device->pending_bios = pending;
-                       if (device->pending_bio_tail)
-                               tail->bi_next = old_head;
-                       else
-                               device->pending_bio_tail = tail;
-
+                       requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;
 
                        spin_unlock(&device->io_lock);
@@ -251,11 +309,18 @@ loop_lock:
                        goto done;
                }
        }
+
+       if (num_sync_run) {
+               num_sync_run = 0;
+               blk_run_backing_dev(bdi, NULL);
+       }
+
+       cond_resched();
        if (again)
                goto loop;
 
        spin_lock(&device->io_lock);
-       if (device->pending_bios)
+       if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);
 
@@ -2497,7 +2562,7 @@ again:
                        max_errors = 1;
                }
        }
-       if (multi_ret && rw == WRITE &&
+       if (multi_ret && (rw & (1 << BIO_RW)) &&
            stripes_allocated < stripes_required) {
                stripes_allocated = map->num_stripes;
                free_extent_map(em);
@@ -2762,6 +2827,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
                                 int rw, struct bio *bio)
 {
        int should_queue = 1;
+       struct btrfs_pending_bios *pending_bios;
 
        /* don't bother with additional async steps for reads, right now */
        if (!(rw & (1 << BIO_RW))) {
@@ -2783,13 +2849,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
        bio->bi_rw |= rw;
 
        spin_lock(&device->io_lock);
+       if (bio_sync(bio))
+               pending_bios = &device->pending_sync_bios;
+       else
+               pending_bios = &device->pending_bios;
 
-       if (device->pending_bio_tail)
-               device->pending_bio_tail->bi_next = bio;
+       if (pending_bios->tail)
+               pending_bios->tail->bi_next = bio;
 
-       device->pending_bio_tail = bio;
-       if (!device->pending_bios)
-               device->pending_bios = bio;
+       pending_bios->tail = bio;
+       if (!pending_bios->head)
+               pending_bios->head = bio;
        if (device->running_pending)
                should_queue = 0;
 
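
requeue_list() and schedule_bio() above operate on a singly linked queue kept as a head/tail pair; the kernel chains through bio->bi_next and holds device->io_lock around both operations. A minimal sketch of the pair on a stand-in node type:

#include <stddef.h>

struct node {
        struct node *next;
};

struct pending_queue {
        struct node *head;
        struct node *tail;
};

/* append one element, as schedule_bio() does */
static void enqueue(struct pending_queue *q, struct node *n)
{
        n->next = NULL;
        if (q->tail)
                q->tail->next = n;
        q->tail = n;
        if (!q->head)
                q->head = n;
}

/* splice a detached chain [head..tail] back onto the front, as
 * requeue_list() does when the device is congested */
static void requeue(struct pending_queue *q, struct node *head,
                    struct node *tail)
{
        struct node *old_head = q->head;

        q->head = head;
        if (q->tail)
                tail->next = old_head;  /* old work runs before new arrivals */
        else
                q->tail = tail;         /* queue was empty: chain is all of it */
}
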
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2185de7..5836327 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
 #include "async-thread.h"
 
 struct buffer_head;
+struct btrfs_pending_bios {
+       struct bio *head;
+       struct bio *tail;
+};
+
 struct btrfs_device {
        struct list_head dev_list;
        struct list_head dev_alloc_list;
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_root *dev_root;
-       struct bio *pending_bios;
-       struct bio *pending_bio_tail;
+
+       /* regular prio bios */
+       struct btrfs_pending_bios pending_bios;
+       /* WRITE_SYNC bios */
+       struct btrfs_pending_bios pending_sync_bios;
+
        int running_pending;
        u64 generation;
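
With the head/tail pair factored into struct btrfs_pending_bios, each device carries two queues and schedule_bio() (in the fs/btrfs/volumes.c hunk above) picks one by the bio's sync flag, mirroring the worker-side priority split:

if (bio_sync(bio))
        pending_bios = &device->pending_sync_bios;
else
        pending_bios = &device->pending_bios;

run_scheduled_bios() then always drains pending_sync_bios first, unplugging the backing device whenever it switches back to the regular list.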