nilfs2: fix irregular checkpoint creation due to data flush
[safe/jmp/linux-2.6] / fs / btrfs / extent-tree.c
index 2f82fab..e238a0c 100644 (file)
@@ -1568,22 +1568,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-#ifdef BIO_RW_DISCARD
 static void btrfs_issue_discard(struct block_device *bdev,
                                u64 start, u64 len)
 {
-       blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+       blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
+                            DISCARD_FL_BARRIER);
 }
-#endif
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
                                u64 num_bytes)
 {
-#ifdef BIO_RW_DISCARD
        int ret;
        u64 map_length = num_bytes;
        struct btrfs_multi_bio *multi = NULL;
 
+       if (!btrfs_test_opt(root, DISCARD))
+               return 0;
+
        /* Tell the block device(s) that the sectors can be discarded */
        ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
                              bytenr, &map_length, &multi, 0);
@@ -1603,9 +1604,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
        }
 
        return ret;
-#else
-       return 0;
-#endif
 }
 
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -2823,14 +2821,17 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
                                           num_items);
 
        spin_lock(&meta_sinfo->lock);
-       if (BTRFS_I(inode)->delalloc_reserved_extents <=
-           BTRFS_I(inode)->delalloc_extents) {
+       spin_lock(&BTRFS_I(inode)->accounting_lock);
+       if (BTRFS_I(inode)->reserved_extents <=
+           BTRFS_I(inode)->outstanding_extents) {
+               spin_unlock(&BTRFS_I(inode)->accounting_lock);
                spin_unlock(&meta_sinfo->lock);
                return 0;
        }
+       spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
-       BTRFS_I(inode)->delalloc_reserved_extents--;
-       BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
+       BTRFS_I(inode)->reserved_extents--;
+       BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
 
        if (meta_sinfo->bytes_delalloc < num_bytes) {
                bug = true;
@@ -2863,6 +2864,107 @@ static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
                meta_sinfo->force_delalloc = 0;
 }
 
+struct async_flush {
+       struct btrfs_root *root;
+       struct btrfs_space_info *info;
+       struct btrfs_work work;
+};
+
+static noinline void flush_delalloc_async(struct btrfs_work *work)
+{
+       struct async_flush *async;
+       struct btrfs_root *root;
+       struct btrfs_space_info *info;
+
+       async = container_of(work, struct async_flush, work);
+       root = async->root;
+       info = async->info;
+
+       btrfs_start_delalloc_inodes(root);
+       wake_up(&info->flush_wait);
+       btrfs_wait_ordered_extents(root, 0);
+
+       spin_lock(&info->lock);
+       info->flushing = 0;
+       spin_unlock(&info->lock);
+       wake_up(&info->flush_wait);
+
+       kfree(async);
+}
+
+static void wait_on_flush(struct btrfs_space_info *info)
+{
+       DEFINE_WAIT(wait);
+       u64 used;
+
+       while (1) {
+               prepare_to_wait(&info->flush_wait, &wait,
+                               TASK_UNINTERRUPTIBLE);
+               spin_lock(&info->lock);
+               if (!info->flushing) {
+                       spin_unlock(&info->lock);
+                       break;
+               }
+
+               used = info->bytes_used + info->bytes_reserved +
+                       info->bytes_pinned + info->bytes_readonly +
+                       info->bytes_super + info->bytes_root +
+                       info->bytes_may_use + info->bytes_delalloc;
+               if (used < info->total_bytes) {
+                       spin_unlock(&info->lock);
+                       break;
+               }
+               spin_unlock(&info->lock);
+               schedule();
+       }
+       finish_wait(&info->flush_wait, &wait);
+}
+
+static void flush_delalloc(struct btrfs_root *root,
+                                struct btrfs_space_info *info)
+{
+       struct async_flush *async;
+       bool wait = false;
+
+       spin_lock(&info->lock);
+
+       if (!info->flushing) {
+               info->flushing = 1;
+               init_waitqueue_head(&info->flush_wait);
+       } else {
+               wait = true;
+       }
+
+       spin_unlock(&info->lock);
+
+       if (wait) {
+               wait_on_flush(info);
+               return;
+       }
+
+       async = kzalloc(sizeof(*async), GFP_NOFS);
+       if (!async)
+               goto flush;
+
+       async->root = root;
+       async->info = info;
+       async->work.func = flush_delalloc_async;
+
+       btrfs_queue_worker(&root->fs_info->enospc_workers,
+                          &async->work);
+       wait_on_flush(info);
+       return;
+
+flush:
+       btrfs_start_delalloc_inodes(root);
+       btrfs_wait_ordered_extents(root, 0);
+
+       spin_lock(&info->lock);
+       info->flushing = 0;
+       spin_unlock(&info->lock);
+       wake_up(&info->flush_wait);
+}
+
 static int maybe_allocate_chunk(struct btrfs_root *root,
                                 struct btrfs_space_info *info)
 {
@@ -2893,7 +2995,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
        if (!info->allocating_chunk) {
                info->force_alloc = 1;
                info->allocating_chunk = 1;
-               init_waitqueue_head(&info->wait);
+               init_waitqueue_head(&info->allocate_wait);
        } else {
                wait = true;
        }
@@ -2901,7 +3003,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
        spin_unlock(&info->lock);
 
        if (wait) {
-               wait_event(info->wait,
+               wait_event(info->allocate_wait,
                           !info->allocating_chunk);
                return 1;
        }
@@ -2922,7 +3024,7 @@ out:
        spin_lock(&info->lock);
        info->allocating_chunk = 0;
        spin_unlock(&info->lock);
-       wake_up(&info->wait);
+       wake_up(&info->allocate_wait);
 
        if (ret)
                return 0;
@@ -2980,21 +3082,20 @@ again:
                        filemap_flush(inode->i_mapping);
                        goto again;
                } else if (flushed == 3) {
-                       btrfs_start_delalloc_inodes(root);
-                       btrfs_wait_ordered_extents(root, 0);
+                       flush_delalloc(root, meta_sinfo);
                        goto again;
                }
                spin_lock(&meta_sinfo->lock);
                meta_sinfo->bytes_delalloc -= num_bytes;
                spin_unlock(&meta_sinfo->lock);
                printk(KERN_ERR "enospc, has %d, reserved %d\n",
-                      BTRFS_I(inode)->delalloc_extents,
-                      BTRFS_I(inode)->delalloc_reserved_extents);
+                      BTRFS_I(inode)->outstanding_extents,
+                      BTRFS_I(inode)->reserved_extents);
                dump_space_info(meta_sinfo, 0, 0);
                return -ENOSPC;
        }
 
-       BTRFS_I(inode)->delalloc_reserved_extents++;
+       BTRFS_I(inode)->reserved_extents++;
        check_force_delalloc(meta_sinfo);
        spin_unlock(&meta_sinfo->lock);
 
@@ -3093,8 +3194,7 @@ again:
                }
 
                if (retries == 2) {
-                       btrfs_start_delalloc_inodes(root);
-                       btrfs_wait_ordered_extents(root, 0);
+                       flush_delalloc(root, meta_sinfo);
                        goto again;
                }
                spin_lock(&meta_sinfo->lock);
@@ -3587,6 +3687,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
        if (is_data)
                goto pinit;
 
+       /*
+        * discard is sloooow, and so triggering discards on
+        * individual btree blocks isn't a good plan.  Just
+        * pin everything in discard mode.
+        */
+       if (btrfs_test_opt(root, DISCARD))
+               goto pinit;
+
        buf = btrfs_find_tree_block(root, bytenr, num_bytes);
        if (!buf)
                goto pinit;
@@ -4809,6 +4917,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
        u64 bytenr;
        u64 generation;
        u64 refs;
+       u64 flags;
        u64 last = 0;
        u32 nritems;
        u32 blocksize;
@@ -4846,15 +4955,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
                    generation <= root->root_key.offset)
                        continue;
 
+               /* We don't lock the tree block, it's OK to be racy here */
+               ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+                                              &refs, &flags);
+               BUG_ON(ret);
+               BUG_ON(refs == 0);
+
                if (wc->stage == DROP_REFERENCE) {
-                       ret = btrfs_lookup_extent_info(trans, root,
-                                               bytenr, blocksize,
-                                               &refs, NULL);
-                       BUG_ON(ret);
-                       BUG_ON(refs == 0);
                        if (refs == 1)
                                goto reada;
 
+                       if (wc->level == 1 &&
+                           (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+                               continue;
                        if (!wc->update_ref ||
                            generation <= root->root_key.offset)
                                continue;
@@ -4863,6 +4976,10 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
                                                  &wc->update_progress);
                        if (ret < 0)
                                continue;
+               } else {
+                       if (wc->level == 1 &&
+                           (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+                               continue;
                }
 reada:
                ret = readahead_tree_block(root, bytenr, blocksize,
@@ -4886,7 +5003,7 @@ reada:
 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct btrfs_path *path,
-                                  struct walk_control *wc)
+                                  struct walk_control *wc, int lookup_info)
 {
        int level = wc->level;
        struct extent_buffer *eb = path->nodes[level];
@@ -4901,8 +5018,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
         * when reference count of tree block is 1, it won't increase
         * again. once full backref flag is set, we never clear it.
         */
-       if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
-           (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
+       if (lookup_info &&
+           ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+            (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
                BUG_ON(!path->locks[level]);
                ret = btrfs_lookup_extent_info(trans, root,
                                               eb->start, eb->len,
@@ -4963,7 +5081,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
-                                struct walk_control *wc)
+                                struct walk_control *wc, int *lookup_info)
 {
        u64 bytenr;
        u64 generation;
@@ -4983,8 +5101,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
         * for the subtree
         */
        if (wc->stage == UPDATE_BACKREF &&
-           generation <= root->root_key.offset)
+           generation <= root->root_key.offset) {
+               *lookup_info = 1;
                return 1;
+       }
 
        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
        blocksize = btrfs_level_size(root, level - 1);
@@ -4997,14 +5117,19 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        btrfs_tree_lock(next);
        btrfs_set_lock_blocking(next);
 
-       if (wc->stage == DROP_REFERENCE) {
-               ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
-                                              &wc->refs[level - 1],
-                                              &wc->flags[level - 1]);
-               BUG_ON(ret);
-               BUG_ON(wc->refs[level - 1] == 0);
+       ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+                                      &wc->refs[level - 1],
+                                      &wc->flags[level - 1]);
+       BUG_ON(ret);
+       BUG_ON(wc->refs[level - 1] == 0);
+       *lookup_info = 0;
 
+       if (wc->stage == DROP_REFERENCE) {
                if (wc->refs[level - 1] > 1) {
+                       if (level == 1 &&
+                           (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+                               goto skip;
+
                        if (!wc->update_ref ||
                            generation <= root->root_key.offset)
                                goto skip;
@@ -5018,12 +5143,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                        wc->stage = UPDATE_BACKREF;
                        wc->shared_level = level - 1;
                }
+       } else {
+               if (level == 1 &&
+                   (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+                       goto skip;
        }
 
        if (!btrfs_buffer_uptodate(next, generation)) {
                btrfs_tree_unlock(next);
                free_extent_buffer(next);
                next = NULL;
+               *lookup_info = 1;
        }
 
        if (!next) {
@@ -5046,21 +5176,22 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 skip:
        wc->refs[level - 1] = 0;
        wc->flags[level - 1] = 0;
+       if (wc->stage == DROP_REFERENCE) {
+               if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+                       parent = path->nodes[level]->start;
+               } else {
+                       BUG_ON(root->root_key.objectid !=
+                              btrfs_header_owner(path->nodes[level]));
+                       parent = 0;
+               }
 
-       if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
-               parent = path->nodes[level]->start;
-       } else {
-               BUG_ON(root->root_key.objectid !=
-                      btrfs_header_owner(path->nodes[level]));
-               parent = 0;
+               ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
+                                       root->root_key.objectid, level - 1, 0);
+               BUG_ON(ret);
        }
-
-       ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-                               root->root_key.objectid, level - 1, 0);
-       BUG_ON(ret);
-
        btrfs_tree_unlock(next);
        free_extent_buffer(next);
+       *lookup_info = 1;
        return 1;
 }
 
@@ -5174,6 +5305,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
                                   struct walk_control *wc)
 {
        int level = wc->level;
+       int lookup_info = 1;
        int ret;
 
        while (level >= 0) {
@@ -5181,14 +5313,14 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
                    btrfs_header_nritems(path->nodes[level]))
                        break;
 
-               ret = walk_down_proc(trans, root, path, wc);
+               ret = walk_down_proc(trans, root, path, wc, lookup_info);
                if (ret > 0)
                        break;
 
                if (level == 0)
                        break;
 
-               ret = do_walk_down(trans, root, path, wc);
+               ret = do_walk_down(trans, root, path, wc, &lookup_info);
                if (ret > 0) {
                        path->slots[level]++;
                        continue;