do_add_mount() should sanitize mnt_flags
[safe/jmp/linux-2.6] / fs / btrfs / extent-tree.c
index 993f93f..56e5013 100644 (file)
@@ -68,6 +68,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
                          struct extent_buffer **must_clean);
 static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+                           int dump_block_groups);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -193,6 +195,14 @@ static int exclude_super_stripes(struct btrfs_root *root,
        int stripe_len;
        int i, nr, ret;
 
+       if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
+               stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
+               cache->bytes_super += stripe_len;
+               ret = add_excluded_extent(root, cache->key.objectid,
+                                         stripe_len);
+               BUG_ON(ret);
+       }
+
        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
@@ -253,7 +263,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                if (ret)
                        break;
 
-               if (extent_start == start) {
+               if (extent_start <= start) {
                        start = extent_end + 1;
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
@@ -1566,23 +1576,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-#ifdef BIO_RW_DISCARD
 static void btrfs_issue_discard(struct block_device *bdev,
                                u64 start, u64 len)
 {
        blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
                             DISCARD_FL_BARRIER);
 }
-#endif
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
                                u64 num_bytes)
 {
-#ifdef BIO_RW_DISCARD
        int ret;
        u64 map_length = num_bytes;
        struct btrfs_multi_bio *multi = NULL;
 
+       if (!btrfs_test_opt(root, DISCARD))
+               return 0;
+
        /* Tell the block device(s) that the sectors can be discarded */
        ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
                              bytenr, &map_length, &multi, 0);
@@ -1602,9 +1612,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
        }
 
        return ret;
-#else
-       return 0;
-#endif
 }
 
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -2765,67 +2772,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
                                                       alloc_target);
 }
 
+static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
+{
+       u64 num_bytes;
+       int level;
+
+       level = BTRFS_MAX_LEVEL - 2;
+       /*
+        * NOTE: these calculations are absolutely the worst possible case.
+        * This assumes that _every_ item we insert will require a new leaf, and
+        * that the tree has grown to its maximum level size.
+        */
+
+       /*
+        * for every item we insert we could insert both an extent item and a
+        * extent ref item.  Then for ever item we insert, we will need to cow
+        * both the original leaf, plus the leaf to the left and right of it.
+        *
+        * Unless we are talking about the extent root, then we just want the
+        * number of items * 2, since we just need the extent item plus its ref.
+        */
+       if (root == root->fs_info->extent_root)
+               num_bytes = num_items * 2;
+       else
+               num_bytes = (num_items + (2 * num_items)) * 3;
+
+       /*
+        * num_bytes is total number of leaves we could need times the leaf
+        * size, and then for every leaf we could end up cow'ing 2 nodes per
+        * level, down to the leaf level.
+        */
+       num_bytes = (num_bytes * root->leafsize) +
+               (num_bytes * (level * 2)) * root->nodesize;
+
+       return num_bytes;
+}
+
 /*
- * for now this just makes sure we have at least 5% of our metadata space free
- * for use.
+ * Unreserve metadata space for delalloc.  If we have less reserved credits than
+ * we have extents, this function does nothing.
  */
-int btrfs_check_metadata_free_space(struct btrfs_root *root)
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+                                         struct inode *inode, int num_items)
 {
        struct btrfs_fs_info *info = root->fs_info;
        struct btrfs_space_info *meta_sinfo;
-       u64 alloc_target, thresh;
-       int committed = 0, ret;
+       u64 num_bytes;
+       u64 alloc_target;
+       bool bug = false;
 
        /* get the space info for where the metadata will live */
        alloc_target = btrfs_get_alloc_profile(root, 0);
        meta_sinfo = __find_space_info(info, alloc_target);
-       if (!meta_sinfo)
-               goto alloc;
 
-again:
+       num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+                                          num_items);
+
        spin_lock(&meta_sinfo->lock);
-       if (!meta_sinfo->full)
-               thresh = meta_sinfo->total_bytes * 80;
-       else
-               thresh = meta_sinfo->total_bytes * 95;
+       spin_lock(&BTRFS_I(inode)->accounting_lock);
+       if (BTRFS_I(inode)->reserved_extents <=
+           BTRFS_I(inode)->outstanding_extents) {
+               spin_unlock(&BTRFS_I(inode)->accounting_lock);
+               spin_unlock(&meta_sinfo->lock);
+               return 0;
+       }
+       spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+       BTRFS_I(inode)->reserved_extents--;
+       BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
+
+       if (meta_sinfo->bytes_delalloc < num_bytes) {
+               bug = true;
+               meta_sinfo->bytes_delalloc = 0;
+       } else {
+               meta_sinfo->bytes_delalloc -= num_bytes;
+       }
+       spin_unlock(&meta_sinfo->lock);
+
+       BUG_ON(bug);
 
+       return 0;
+}
+
+static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
+{
+       u64 thresh;
+
+       thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+               meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+               meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+               meta_sinfo->bytes_may_use;
+
+       thresh = meta_sinfo->total_bytes - thresh;
+       thresh *= 80;
        do_div(thresh, 100);
+       if (thresh <= meta_sinfo->bytes_delalloc)
+               meta_sinfo->force_delalloc = 1;
+       else
+               meta_sinfo->force_delalloc = 0;
+}
 
-       if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-           meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-           meta_sinfo->bytes_super > thresh) {
-               struct btrfs_trans_handle *trans;
-               if (!meta_sinfo->full) {
-                       meta_sinfo->force_alloc = 1;
+struct async_flush {
+       struct btrfs_root *root;
+       struct btrfs_space_info *info;
+       struct btrfs_work work;
+};
+
+static noinline void flush_delalloc_async(struct btrfs_work *work)
+{
+       struct async_flush *async;
+       struct btrfs_root *root;
+       struct btrfs_space_info *info;
+
+       async = container_of(work, struct async_flush, work);
+       root = async->root;
+       info = async->info;
+
+       btrfs_start_delalloc_inodes(root, 0);
+       wake_up(&info->flush_wait);
+       btrfs_wait_ordered_extents(root, 0, 0);
+
+       spin_lock(&info->lock);
+       info->flushing = 0;
+       spin_unlock(&info->lock);
+       wake_up(&info->flush_wait);
+
+       kfree(async);
+}
+
+static void wait_on_flush(struct btrfs_space_info *info)
+{
+       DEFINE_WAIT(wait);
+       u64 used;
+
+       while (1) {
+               prepare_to_wait(&info->flush_wait, &wait,
+                               TASK_UNINTERRUPTIBLE);
+               spin_lock(&info->lock);
+               if (!info->flushing) {
+                       spin_unlock(&info->lock);
+                       break;
+               }
+
+               used = info->bytes_used + info->bytes_reserved +
+                       info->bytes_pinned + info->bytes_readonly +
+                       info->bytes_super + info->bytes_root +
+                       info->bytes_may_use + info->bytes_delalloc;
+               if (used < info->total_bytes) {
+                       spin_unlock(&info->lock);
+                       break;
+               }
+               spin_unlock(&info->lock);
+               schedule();
+       }
+       finish_wait(&info->flush_wait, &wait);
+}
+
+static void flush_delalloc(struct btrfs_root *root,
+                                struct btrfs_space_info *info)
+{
+       struct async_flush *async;
+       bool wait = false;
+
+       spin_lock(&info->lock);
+
+       if (!info->flushing) {
+               info->flushing = 1;
+               init_waitqueue_head(&info->flush_wait);
+       } else {
+               wait = true;
+       }
+
+       spin_unlock(&info->lock);
+
+       if (wait) {
+               wait_on_flush(info);
+               return;
+       }
+
+       async = kzalloc(sizeof(*async), GFP_NOFS);
+       if (!async)
+               goto flush;
+
+       async->root = root;
+       async->info = info;
+       async->work.func = flush_delalloc_async;
+
+       btrfs_queue_worker(&root->fs_info->enospc_workers,
+                          &async->work);
+       wait_on_flush(info);
+       return;
+
+flush:
+       btrfs_start_delalloc_inodes(root, 0);
+       btrfs_wait_ordered_extents(root, 0, 0);
+
+       spin_lock(&info->lock);
+       info->flushing = 0;
+       spin_unlock(&info->lock);
+       wake_up(&info->flush_wait);
+}
+
+static int maybe_allocate_chunk(struct btrfs_root *root,
+                                struct btrfs_space_info *info)
+{
+       struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+       struct btrfs_trans_handle *trans;
+       bool wait = false;
+       int ret = 0;
+       u64 min_metadata;
+       u64 free_space;
+
+       free_space = btrfs_super_total_bytes(disk_super);
+       /*
+        * we allow the metadata to grow to a max of either 10gb or 5% of the
+        * space in the volume.
+        */
+       min_metadata = min((u64)10 * 1024 * 1024 * 1024,
+                            div64_u64(free_space * 5, 100));
+       if (info->total_bytes >= min_metadata) {
+               spin_unlock(&info->lock);
+               return 0;
+       }
+
+       if (info->full) {
+               spin_unlock(&info->lock);
+               return 0;
+       }
+
+       if (!info->allocating_chunk) {
+               info->force_alloc = 1;
+               info->allocating_chunk = 1;
+               init_waitqueue_head(&info->allocate_wait);
+       } else {
+               wait = true;
+       }
+
+       spin_unlock(&info->lock);
+
+       if (wait) {
+               wait_event(info->allocate_wait,
+                          !info->allocating_chunk);
+               return 1;
+       }
+
+       trans = btrfs_start_transaction(root, 1);
+       if (!trans) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+                            4096 + 2 * 1024 * 1024,
+                            info->flags, 0);
+       btrfs_end_transaction(trans, root);
+       if (ret)
+               goto out;
+out:
+       spin_lock(&info->lock);
+       info->allocating_chunk = 0;
+       spin_unlock(&info->lock);
+       wake_up(&info->allocate_wait);
+
+       if (ret)
+               return 0;
+       return 1;
+}
+
+/*
+ * Reserve metadata space for delalloc.
+ */
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+                                       struct inode *inode, int num_items)
+{
+       struct btrfs_fs_info *info = root->fs_info;
+       struct btrfs_space_info *meta_sinfo;
+       u64 num_bytes;
+       u64 used;
+       u64 alloc_target;
+       int flushed = 0;
+       int force_delalloc;
+
+       /* get the space info for where the metadata will live */
+       alloc_target = btrfs_get_alloc_profile(root, 0);
+       meta_sinfo = __find_space_info(info, alloc_target);
+
+       num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+                                          num_items);
+again:
+       spin_lock(&meta_sinfo->lock);
+
+       force_delalloc = meta_sinfo->force_delalloc;
+
+       if (unlikely(!meta_sinfo->bytes_root))
+               meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+       if (!flushed)
+               meta_sinfo->bytes_delalloc += num_bytes;
+
+       used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+               meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+               meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+               meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+       if (used > meta_sinfo->total_bytes) {
+               flushed++;
+
+               if (flushed == 1) {
+                       if (maybe_allocate_chunk(root, meta_sinfo))
+                               goto again;
+                       flushed++;
+               } else {
                        spin_unlock(&meta_sinfo->lock);
-alloc:
-                       trans = btrfs_start_transaction(root, 1);
-                       if (!trans)
-                               return -ENOMEM;
+               }
 
-                       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                            2 * 1024 * 1024, alloc_target, 0);
-                       btrfs_end_transaction(trans, root);
-                       if (!meta_sinfo) {
-                               meta_sinfo = __find_space_info(info,
-                                                              alloc_target);
-                       }
+               if (flushed == 2) {
+                       filemap_flush(inode->i_mapping);
+                       goto again;
+               } else if (flushed == 3) {
+                       flush_delalloc(root, meta_sinfo);
                        goto again;
                }
+               spin_lock(&meta_sinfo->lock);
+               meta_sinfo->bytes_delalloc -= num_bytes;
                spin_unlock(&meta_sinfo->lock);
+               printk(KERN_ERR "enospc, has %d, reserved %d\n",
+                      BTRFS_I(inode)->outstanding_extents,
+                      BTRFS_I(inode)->reserved_extents);
+               dump_space_info(meta_sinfo, 0, 0);
+               return -ENOSPC;
+       }
 
-               if (!committed) {
-                       committed = 1;
-                       trans = btrfs_join_transaction(root, 1);
-                       if (!trans)
-                               return -ENOMEM;
-                       ret = btrfs_commit_transaction(trans, root);
-                       if (ret)
-                               return ret;
+       BTRFS_I(inode)->reserved_extents++;
+       check_force_delalloc(meta_sinfo);
+       spin_unlock(&meta_sinfo->lock);
+
+       if (!flushed && force_delalloc)
+               filemap_flush(inode->i_mapping);
+
+       return 0;
+}
+
+/*
+ * unreserve num_items number of items worth of metadata space.  This needs to
+ * be paired with btrfs_reserve_metadata_space.
+ *
+ * NOTE: if you have the option, run this _AFTER_ you do a
+ * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
+ * oprations which will result in more used metadata, so we want to make sure we
+ * can do that without issue.
+ */
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+       struct btrfs_fs_info *info = root->fs_info;
+       struct btrfs_space_info *meta_sinfo;
+       u64 num_bytes;
+       u64 alloc_target;
+       bool bug = false;
+
+       /* get the space info for where the metadata will live */
+       alloc_target = btrfs_get_alloc_profile(root, 0);
+       meta_sinfo = __find_space_info(info, alloc_target);
+
+       num_bytes = calculate_bytes_needed(root, num_items);
+
+       spin_lock(&meta_sinfo->lock);
+       if (meta_sinfo->bytes_may_use < num_bytes) {
+               bug = true;
+               meta_sinfo->bytes_may_use = 0;
+       } else {
+               meta_sinfo->bytes_may_use -= num_bytes;
+       }
+       spin_unlock(&meta_sinfo->lock);
+
+       BUG_ON(bug);
+
+       return 0;
+}
+
+/*
+ * Reserve some metadata space for use.  We'll calculate the worste case number
+ * of bytes that would be needed to modify num_items number of items.  If we
+ * have space, fantastic, if not, you get -ENOSPC.  Please call
+ * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
+ * items you reserved, since whatever metadata you needed should have already
+ * been allocated.
+ *
+ * This will commit the transaction to make more space if we don't have enough
+ * metadata space.  THe only time we don't do this is if we're reserving space
+ * inside of a transaction, then we will just return -ENOSPC and it is the
+ * callers responsibility to handle it properly.
+ */
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+       struct btrfs_fs_info *info = root->fs_info;
+       struct btrfs_space_info *meta_sinfo;
+       u64 num_bytes;
+       u64 used;
+       u64 alloc_target;
+       int retries = 0;
+
+       /* get the space info for where the metadata will live */
+       alloc_target = btrfs_get_alloc_profile(root, 0);
+       meta_sinfo = __find_space_info(info, alloc_target);
+
+       num_bytes = calculate_bytes_needed(root, num_items);
+again:
+       spin_lock(&meta_sinfo->lock);
+
+       if (unlikely(!meta_sinfo->bytes_root))
+               meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+       if (!retries)
+               meta_sinfo->bytes_may_use += num_bytes;
+
+       used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+               meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+               meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+               meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+       if (used > meta_sinfo->total_bytes) {
+               retries++;
+               if (retries == 1) {
+                       if (maybe_allocate_chunk(root, meta_sinfo))
+                               goto again;
+                       retries++;
+               } else {
+                       spin_unlock(&meta_sinfo->lock);
+               }
+
+               if (retries == 2) {
+                       flush_delalloc(root, meta_sinfo);
                        goto again;
                }
+               spin_lock(&meta_sinfo->lock);
+               meta_sinfo->bytes_may_use -= num_bytes;
+               spin_unlock(&meta_sinfo->lock);
+
+               dump_space_info(meta_sinfo, 0, 0);
                return -ENOSPC;
        }
+
+       check_force_delalloc(meta_sinfo);
        spin_unlock(&meta_sinfo->lock);
 
        return 0;
@@ -2888,7 +3276,7 @@ alloc:
                spin_unlock(&data_sinfo->lock);
 
                /* commit the current transaction and try again */
-               if (!committed) {
+               if (!committed && !root->fs_info->open_ioctl_trans) {
                        committed = 1;
                        trans = btrfs_join_transaction(root, 1);
                        if (!trans)
@@ -2916,7 +3304,7 @@ alloc:
        BTRFS_I(inode)->reserved_bytes += bytes;
        spin_unlock(&data_sinfo->lock);
 
-       return btrfs_check_metadata_free_space(root);
+       return 0;
 }
 
 /*
@@ -3015,17 +3403,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
        BUG_ON(!space_info);
 
        spin_lock(&space_info->lock);
-       if (space_info->force_alloc) {
+       if (space_info->force_alloc)
                force = 1;
-               space_info->force_alloc = 0;
-       }
        if (space_info->full) {
                spin_unlock(&space_info->lock);
                goto out;
        }
 
        thresh = space_info->total_bytes - space_info->bytes_readonly;
-       thresh = div_factor(thresh, 6);
+       thresh = div_factor(thresh, 8);
        if (!force &&
           (space_info->bytes_used + space_info->bytes_pinned +
            space_info->bytes_reserved + alloc_bytes) < thresh) {
@@ -3039,7 +3425,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
         * we keep a reasonable number of metadata chunks allocated in the
         * FS as well.
         */
-       if (flags & BTRFS_BLOCK_GROUP_DATA) {
+       if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
                fs_info->data_chunk_allocations++;
                if (!(fs_info->data_chunk_allocations %
                      fs_info->metadata_ratio))
@@ -3047,8 +3433,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
        }
 
        ret = btrfs_alloc_chunk(trans, extent_root, flags);
+       spin_lock(&space_info->lock);
        if (ret)
                space_info->full = 1;
+       space_info->force_alloc = 0;
+       spin_unlock(&space_info->lock);
 out:
        mutex_unlock(&extent_root->fs_info->chunk_mutex);
        return ret;
@@ -3073,14 +3462,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
        else
                old_val -= num_bytes;
        btrfs_set_super_bytes_used(&info->super_copy, old_val);
-
-       /* block accounting for root item */
-       old_val = btrfs_root_used(&root->root_item);
-       if (alloc)
-               old_val += num_bytes;
-       else
-               old_val -= num_bytes;
-       btrfs_set_root_used(&root->root_item, old_val);
        spin_unlock(&info->delalloc_lock);
 
        while (total) {
@@ -3306,6 +3687,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
        if (is_data)
                goto pinit;
 
+       /*
+        * discard is sloooow, and so triggering discards on
+        * individual btree blocks isn't a good plan.  Just
+        * pin everything in discard mode.
+        */
+       if (btrfs_test_opt(root, DISCARD))
+               goto pinit;
+
        buf = btrfs_find_tree_block(root, bytenr, num_bytes);
        if (!buf)
                goto pinit;
@@ -3660,6 +4049,21 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         u64 bytenr, u32 blocksize,
+                         u64 parent, u64 root_objectid, int level)
+{
+       u64 used;
+       spin_lock(&root->node_lock);
+       used = btrfs_root_used(&root->root_item) - blocksize;
+       btrfs_set_root_used(&root->root_item, used);
+       spin_unlock(&root->node_lock);
+
+       return btrfs_free_extent(trans, root, bytenr, blocksize,
+                                parent, root_objectid, level, 0);
+}
+
 static u64 stripe_align(struct btrfs_root *root, u64 val)
 {
        u64 mask = ((u64)root->stripesize - 1);
@@ -3713,7 +4117,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 }
 
 enum btrfs_loop_type {
-       LOOP_CACHED_ONLY = 0,
+       LOOP_FIND_IDEAL = 0,
        LOOP_CACHING_NOWAIT = 1,
        LOOP_CACHING_WAIT = 2,
        LOOP_ALLOC_CHUNK = 3,
@@ -3742,11 +4146,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        struct btrfs_block_group_cache *block_group = NULL;
        int empty_cluster = 2 * 1024 * 1024;
        int allowed_chunk_alloc = 0;
+       int done_chunk_alloc = 0;
        struct btrfs_space_info *space_info;
        int last_ptr_loop = 0;
        int loop = 0;
        bool found_uncached_bg = false;
        bool failed_cluster_refill = false;
+       bool failed_alloc = false;
+       u64 ideal_cache_percent = 0;
+       u64 ideal_cache_offset = 0;
 
        WARN_ON(num_bytes < root->sectorsize);
        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3782,14 +4190,19 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
                empty_cluster = 0;
 
        if (search_start == hint_byte) {
+ideal_cache:
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
                /*
                 * we don't want to use the block group if it doesn't match our
                 * allocation bits, or if its not cached.
+                *
+                * However if we are re-searching with an ideal block group
+                * picked out then we don't care that the block group is cached.
                 */
                if (block_group && block_group_bits(block_group, data) &&
-                   block_group_cache_done(block_group)) {
+                   (block_group->cached != BTRFS_CACHE_NO ||
+                    search_start == ideal_cache_offset)) {
                        down_read(&space_info->groups_sem);
                        if (list_empty(&block_group->list) ||
                            block_group->ro) {
@@ -3801,13 +4214,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
                                 */
                                btrfs_put_block_group(block_group);
                                up_read(&space_info->groups_sem);
-                       } else
+                       } else {
                                goto have_block_group;
+                       }
                } else if (block_group) {
                        btrfs_put_block_group(block_group);
                }
        }
-
 search:
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups, list) {
@@ -3819,28 +4232,45 @@ search:
 
 have_block_group:
                if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+                       u64 free_percent;
+
+                       free_percent = btrfs_block_group_used(&block_group->item);
+                       free_percent *= 100;
+                       free_percent = div64_u64(free_percent,
+                                                block_group->key.offset);
+                       free_percent = 100 - free_percent;
+                       if (free_percent > ideal_cache_percent &&
+                           likely(!block_group->ro)) {
+                               ideal_cache_offset = block_group->key.objectid;
+                               ideal_cache_percent = free_percent;
+                       }
+
                        /*
-                        * we want to start caching kthreads, but not too many
-                        * right off the bat so we don't overwhelm the system,
-                        * so only start them if there are less than 2 and we're
-                        * in the initial allocation phase.
+                        * We only want to start kthread caching if we are at
+                        * the point where we will wait for caching to make
+                        * progress, or if our ideal search is over and we've
+                        * found somebody to start caching.
                         */
                        if (loop > LOOP_CACHING_NOWAIT ||
-                           atomic_read(&space_info->caching_threads) < 2) {
+                           (loop > LOOP_FIND_IDEAL &&
+                            atomic_read(&space_info->caching_threads) < 2)) {
                                ret = cache_block_group(block_group);
                                BUG_ON(ret);
                        }
-               }
-
-               cached = block_group_cache_done(block_group);
-               if (unlikely(!cached)) {
                        found_uncached_bg = true;
 
-                       /* if we only want cached bgs, loop */
-                       if (loop == LOOP_CACHED_ONLY)
+                       /*
+                        * If loop is set for cached only, try the next block
+                        * group.
+                        */
+                       if (loop == LOOP_FIND_IDEAL)
                                goto loop;
                }
 
+               cached = block_group_cache_done(block_group);
+               if (unlikely(!cached))
+                       found_uncached_bg = true;
+
                if (unlikely(block_group->ro))
                        goto loop;
 
@@ -3951,14 +4381,23 @@ refill_cluster:
 
                offset = btrfs_find_space_for_alloc(block_group, search_start,
                                                    num_bytes, empty_size);
-               if (!offset && (cached || (!cached &&
-                                          loop == LOOP_CACHING_NOWAIT))) {
-                       goto loop;
-               } else if (!offset && (!cached &&
-                                      loop > LOOP_CACHING_NOWAIT)) {
+               /*
+                * If we didn't find a chunk, and we haven't failed on this
+                * block group before, and this block group is in the middle of
+                * caching and we are ok with waiting, then go ahead and wait
+                * for progress to be made, and set failed_alloc to true.
+                *
+                * If failed_alloc is true then we've already waited on this
+                * block group once and should move on to the next block group.
+                */
+               if (!offset && !failed_alloc && !cached &&
+                   loop > LOOP_CACHING_NOWAIT) {
                        wait_block_group_cache_progress(block_group,
-                                       num_bytes + empty_size);
+                                               num_bytes + empty_size);
+                       failed_alloc = true;
                        goto have_block_group;
+               } else if (!offset) {
+                       goto loop;
                }
 checks:
                search_start = stripe_align(root, offset);
@@ -4006,13 +4445,16 @@ checks:
                break;
 loop:
                failed_cluster_refill = false;
+               failed_alloc = false;
                btrfs_put_block_group(block_group);
        }
        up_read(&space_info->groups_sem);
 
-       /* LOOP_CACHED_ONLY, only search fully cached block groups
-        * LOOP_CACHING_NOWAIT, search partially cached block groups, but
-        *                      dont wait foR them to finish caching
+       /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
+        *                      for them to make caching progress.  Also
+        *                      determine the best possible bg to cache
+        * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
+        *                      caching kthreads as we move along
         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
         * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
         * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
@@ -4021,12 +4463,47 @@ loop:
        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
            (found_uncached_bg || empty_size || empty_cluster ||
             allowed_chunk_alloc)) {
-               if (found_uncached_bg) {
+               if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
                        found_uncached_bg = false;
-                       if (loop < LOOP_CACHING_WAIT) {
-                               loop++;
+                       loop++;
+                       if (!ideal_cache_percent &&
+                           atomic_read(&space_info->caching_threads))
                                goto search;
-                       }
+
+                       /*
+                        * 1 of the following 2 things have happened so far
+                        *
+                        * 1) We found an ideal block group for caching that
+                        * is mostly full and will cache quickly, so we might
+                        * as well wait for it.
+                        *
+                        * 2) We searched for cached only and we didn't find
+                        * anything, and we didn't start any caching kthreads
+                        * either, so chances are we will loop through and
+                        * start a couple caching kthreads, and then come back
+                        * around and just wait for them.  This will be slower
+                        * because we will have 2 caching kthreads reading at
+                        * the same time when we could have just started one
+                        * and waited for it to get far enough to give us an
+                        * allocation, so go ahead and go to the wait caching
+                        * loop.
+                        */
+                       loop = LOOP_CACHING_WAIT;
+                       search_start = ideal_cache_offset;
+                       ideal_cache_percent = 0;
+                       goto ideal_cache;
+               } else if (loop == LOOP_FIND_IDEAL) {
+                       /*
+                        * Didn't find a uncached bg, wait on anything we find
+                        * next.
+                        */
+                       loop = LOOP_CACHING_WAIT;
+                       goto search;
+               }
+
+               if (loop < LOOP_CACHING_WAIT) {
+                       loop++;
+                       goto search;
                }
 
                if (loop == LOOP_ALLOC_CHUNK) {
@@ -4038,7 +4515,8 @@ loop:
                        ret = do_chunk_alloc(trans, root, num_bytes +
                                             2 * 1024 * 1024, data, 1);
                        allowed_chunk_alloc = 0;
-               } else {
+                       done_chunk_alloc = 1;
+               } else if (!done_chunk_alloc) {
                        space_info->force_alloc = 1;
                }
 
@@ -4063,21 +4541,32 @@ loop:
        return ret;
 }
 
-static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+                           int dump_block_groups)
 {
        struct btrfs_block_group_cache *cache;
 
+       spin_lock(&info->lock);
        printk(KERN_INFO "space_info has %llu free, is %sfull\n",
               (unsigned long long)(info->total_bytes - info->bytes_used -
-                                   info->bytes_pinned - info->bytes_reserved),
+                                   info->bytes_pinned - info->bytes_reserved -
+                                   info->bytes_super),
               (info->full) ? "" : "not ");
        printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
-              " may_use=%llu, used=%llu\n",
+              " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
+              "\n",
               (unsigned long long)info->total_bytes,
               (unsigned long long)info->bytes_pinned,
               (unsigned long long)info->bytes_delalloc,
               (unsigned long long)info->bytes_may_use,
-              (unsigned long long)info->bytes_used);
+              (unsigned long long)info->bytes_used,
+              (unsigned long long)info->bytes_root,
+              (unsigned long long)info->bytes_super,
+              (unsigned long long)info->bytes_reserved);
+       spin_unlock(&info->lock);
+
+       if (!dump_block_groups)
+               return;
 
        down_read(&info->groups_sem);
        list_for_each_entry(cache, &info->block_groups, list) {
@@ -4104,7 +4593,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 {
        int ret;
        u64 search_start = 0;
-       struct btrfs_fs_info *info = root->fs_info;
 
        data = btrfs_get_alloc_profile(root, data);
 again:
@@ -4112,17 +4600,9 @@ again:
         * the only place that sets empty_size is btrfs_realloc_node, which
         * is not called recursively on allocations
         */
-       if (empty_size || root->ref_cows) {
-               if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
-                       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                    2 * 1024 * 1024,
-                                    BTRFS_BLOCK_GROUP_METADATA |
-                                    (info->metadata_alloc_profile &
-                                     info->avail_metadata_alloc_bits), 0);
-               }
+       if (empty_size || root->ref_cows)
                ret = do_chunk_alloc(trans, root->fs_info->extent_root,
                                     num_bytes + 2 * 1024 * 1024, data, 0);
-       }
 
        WARN_ON(num_bytes < root->sectorsize);
        ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -4145,7 +4625,7 @@ again:
                printk(KERN_ERR "btrfs allocation failed flags %llu, "
                       "wanted %llu\n", (unsigned long long)data,
                       (unsigned long long)num_bytes);
-               dump_space_info(sinfo, num_bytes);
+               dump_space_info(sinfo, num_bytes, 1);
        }
 
        return ret;
@@ -4423,6 +4903,14 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
                                        extent_op);
                BUG_ON(ret);
        }
+
+       if (root_objectid == root->root_key.objectid) {
+               u64 used;
+               spin_lock(&root->node_lock);
+               used = btrfs_root_used(&root->root_item) + num_bytes;
+               btrfs_set_root_used(&root->root_item, used);
+               spin_unlock(&root->node_lock);
+       }
        return ret;
 }
 
@@ -4445,8 +4933,16 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
        btrfs_set_buffer_uptodate(buf);
 
        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-               set_extent_dirty(&root->dirty_log_pages, buf->start,
-                        buf->start + buf->len - 1, GFP_NOFS);
+               /*
+                * we allow two log transactions at a time, use different
+                * EXENT bit to differentiate dirty pages.
+                */
+               if (root->log_transid % 2 == 0)
+                       set_extent_dirty(&root->dirty_log_pages, buf->start,
+                                       buf->start + buf->len - 1, GFP_NOFS);
+               else
+                       set_extent_new(&root->dirty_log_pages, buf->start,
+                                       buf->start + buf->len - 1, GFP_NOFS);
        } else {
                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
                         buf->start + buf->len - 1, GFP_NOFS);
@@ -4506,6 +5002,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
        u64 bytenr;
        u64 generation;
        u64 refs;
+       u64 flags;
        u64 last = 0;
        u32 nritems;
        u32 blocksize;
@@ -4543,15 +5040,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
                    generation <= root->root_key.offset)
                        continue;
 
+               /* We don't lock the tree block, it's OK to be racy here */
+               ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+                                              &refs, &flags);
+               BUG_ON(ret);
+               BUG_ON(refs == 0);
+
                if (wc->stage == DROP_REFERENCE) {
-                       ret = btrfs_lookup_extent_info(trans, root,
-                                               bytenr, blocksize,
-                                               &refs, NULL);
-                       BUG_ON(ret);
-                       BUG_ON(refs == 0);
                        if (refs == 1)
                                goto reada;
 
+                       if (wc->level == 1 &&
+                           (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+                               continue;
                        if (!wc->update_ref ||
                            generation <= root->root_key.offset)
                                continue;
@@ -4560,6 +5061,10 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
                                                  &wc->update_progress);
                        if (ret < 0)
                                continue;
+               } else {
+                       if (wc->level == 1 &&
+                           (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+                               continue;
                }
 reada:
                ret = readahead_tree_block(root, bytenr, blocksize,
@@ -4583,7 +5088,7 @@ reada:
 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct btrfs_path *path,
-                                  struct walk_control *wc)
+                                  struct walk_control *wc, int lookup_info)
 {
        int level = wc->level;
        struct extent_buffer *eb = path->nodes[level];
@@ -4598,8 +5103,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
         * when reference count of tree block is 1, it won't increase
         * again. once full backref flag is set, we never clear it.
         */
-       if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
-           (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
+       if (lookup_info &&
+           ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+            (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
                BUG_ON(!path->locks[level]);
                ret = btrfs_lookup_extent_info(trans, root,
                                               eb->start, eb->len,
@@ -4660,7 +5166,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
-                                struct walk_control *wc)
+                                struct walk_control *wc, int *lookup_info)
 {
        u64 bytenr;
        u64 generation;
@@ -4680,8 +5186,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
         * for the subtree
         */
        if (wc->stage == UPDATE_BACKREF &&
-           generation <= root->root_key.offset)
+           generation <= root->root_key.offset) {
+               *lookup_info = 1;
                return 1;
+       }
 
        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
        blocksize = btrfs_level_size(root, level - 1);
@@ -4694,14 +5202,19 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        btrfs_tree_lock(next);
        btrfs_set_lock_blocking(next);
 
-       if (wc->stage == DROP_REFERENCE) {
-               ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
-                                              &wc->refs[level - 1],
-                                              &wc->flags[level - 1]);
-               BUG_ON(ret);
-               BUG_ON(wc->refs[level - 1] == 0);
+       ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+                                      &wc->refs[level - 1],
+                                      &wc->flags[level - 1]);
+       BUG_ON(ret);
+       BUG_ON(wc->refs[level - 1] == 0);
+       *lookup_info = 0;
 
+       if (wc->stage == DROP_REFERENCE) {
                if (wc->refs[level - 1] > 1) {
+                       if (level == 1 &&
+                           (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+                               goto skip;
+
                        if (!wc->update_ref ||
                            generation <= root->root_key.offset)
                                goto skip;
@@ -4715,12 +5228,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                        wc->stage = UPDATE_BACKREF;
                        wc->shared_level = level - 1;
                }
+       } else {
+               if (level == 1 &&
+                   (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+                       goto skip;
        }
 
        if (!btrfs_buffer_uptodate(next, generation)) {
                btrfs_tree_unlock(next);
                free_extent_buffer(next);
                next = NULL;
+               *lookup_info = 1;
        }
 
        if (!next) {
@@ -4743,21 +5261,22 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 skip:
        wc->refs[level - 1] = 0;
        wc->flags[level - 1] = 0;
+       if (wc->stage == DROP_REFERENCE) {
+               if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+                       parent = path->nodes[level]->start;
+               } else {
+                       BUG_ON(root->root_key.objectid !=
+                              btrfs_header_owner(path->nodes[level]));
+                       parent = 0;
+               }
 
-       if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
-               parent = path->nodes[level]->start;
-       } else {
-               BUG_ON(root->root_key.objectid !=
-                      btrfs_header_owner(path->nodes[level]));
-               parent = 0;
+               ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
+                                       root->root_key.objectid, level - 1, 0);
+               BUG_ON(ret);
        }
-
-       ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-                               root->root_key.objectid, level - 1, 0);
-       BUG_ON(ret);
-
        btrfs_tree_unlock(next);
        free_extent_buffer(next);
+       *lookup_info = 1;
        return 1;
 }
 
@@ -4871,6 +5390,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
                                   struct walk_control *wc)
 {
        int level = wc->level;
+       int lookup_info = 1;
        int ret;
 
        while (level >= 0) {
@@ -4878,14 +5398,14 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
                    btrfs_header_nritems(path->nodes[level]))
                        break;
 
-               ret = walk_down_proc(trans, root, path, wc);
+               ret = walk_down_proc(trans, root, path, wc, lookup_info);
                if (ret > 0)
                        break;
 
                if (level == 0)
                        break;
 
-               ret = do_walk_down(trans, root, path, wc);
+               ret = do_walk_down(trans, root, path, wc, &lookup_info);
                if (ret > 0) {
                        path->slots[level]++;
                        continue;