Btrfs: don't use spin_is_contended
[safe/jmp/linux-2.6] / fs / btrfs / extent-tree.c
index 3b26f09..7527523 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
+#include <linux/sort.h>
 #include "compat.h"
 #include "hash.h"
 #include "crc32c.h"
@@ -1521,15 +1522,55 @@ out:
        return ret;
 }
 
-int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *orig_buf, struct extent_buffer *buf,
-                 u32 *nr_extents)
+/* when a block goes through cow, we update the reference counts of
+ * everything that block points to.  The internal pointers of the block
+ * can be in just about any order, and it is likely to have clusters of
+ * things that are close together and clusters of things that are not.
+ *
+ * To help reduce the seeks that come with updating all of these reference
+ * counts, sort them by byte number before actual updates are done.
+ *
+ * struct refsort is used to match byte number to slot in the btree block.
+ * we sort based on the byte number and then use the slot to actually
+ * find the item.
+ *
+ * struct refsort is smaller than strcut btrfs_item and smaller than
+ * struct btrfs_key_ptr.  Since we're currently limited to the page size
+ * for a btree block, there's no way for a kmalloc of refsorts for a
+ * single node to be bigger than a page.
+ */
+struct refsort {
+       u64 bytenr;
+       u32 slot;
+};
+
+/*
+ * for passing into sort()
+ */
+static int refsort_cmp(const void *a_void, const void *b_void)
+{
+       const struct refsort *a = a_void;
+       const struct refsort *b = b_void;
+
+       if (a->bytenr < b->bytenr)
+               return -1;
+       if (a->bytenr > b->bytenr)
+               return 1;
+       return 0;
+}
+
+
+noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root,
+                          struct extent_buffer *orig_buf,
+                          struct extent_buffer *buf, u32 *nr_extents)
 {
        u64 bytenr;
        u64 ref_root;
        u64 orig_root;
        u64 ref_generation;
        u64 orig_generation;
+       struct refsort *sorted;
        u32 nritems;
        u32 nr_file_extents = 0;
        struct btrfs_key key;
@@ -1538,6 +1579,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        int level;
        int ret = 0;
        int faili = 0;
+       int refi = 0;
+       int slot;
        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
                            u64, u64, u64, u64, u64, u64, u64, u64);
 
@@ -1549,6 +1592,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        nritems = btrfs_header_nritems(buf);
        level = btrfs_header_level(buf);
 
+       sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
+       BUG_ON(!sorted);
+
        if (root->ref_cows) {
                process_func = __btrfs_inc_extent_ref;
        } else {
@@ -1561,6 +1607,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                process_func = __btrfs_update_extent_ref;
        }
 
+       /*
+        * we make two passes through the items.  In the first pass we
+        * only record the byte number and slot.  Then we sort based on
+        * byte number and do the actual work based on the sorted results
+        */
        for (i = 0; i < nritems; i++) {
                cond_resched();
                if (level == 0) {
@@ -1577,6 +1628,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                                continue;
 
                        nr_file_extents++;
+                       sorted[refi].bytenr = bytenr;
+                       sorted[refi].slot = i;
+                       refi++;
+               } else {
+                       bytenr = btrfs_node_blockptr(buf, i);
+                       sorted[refi].bytenr = bytenr;
+                       sorted[refi].slot = i;
+                       refi++;
+               }
+       }
+       /*
+        * if refi == 0, we didn't actually put anything into the sorted
+        * array and we're done
+        */
+       if (refi == 0)
+               goto out;
+
+       sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+       for (i = 0; i < refi; i++) {
+               cond_resched();
+               slot = sorted[i].slot;
+               bytenr = sorted[i].bytenr;
+
+               if (level == 0) {
+                       btrfs_item_key_to_cpu(buf, &key, slot);
 
                        ret = process_func(trans, root, bytenr,
                                           orig_buf->start, buf->start,
@@ -1585,25 +1662,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                                           key.objectid);
 
                        if (ret) {
-                               faili = i;
+                               faili = slot;
                                WARN_ON(1);
                                goto fail;
                        }
                } else {
-                       bytenr = btrfs_node_blockptr(buf, i);
                        ret = process_func(trans, root, bytenr,
                                           orig_buf->start, buf->start,
                                           orig_root, ref_root,
                                           orig_generation, ref_generation,
                                           level - 1);
                        if (ret) {
-                               faili = i;
+                               faili = slot;
                                WARN_ON(1);
                                goto fail;
                        }
                }
        }
 out:
+       kfree(sorted);
        if (nr_extents) {
                if (level == 0)
                        *nr_extents = nr_file_extents;
@@ -1612,6 +1689,7 @@ out:
        }
        return 0;
 fail:
+       kfree(sorted);
        WARN_ON(1);
        return ret;
 }
@@ -3334,7 +3412,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
        btrfs_set_header_generation(buf, trans->transid);
        btrfs_tree_lock(buf);
        clean_tree_block(trans, root, buf);
+
+       btrfs_set_lock_blocking(buf);
        btrfs_set_buffer_uptodate(buf);
+
        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
                set_extent_dirty(&root->dirty_log_pages, buf->start,
                         buf->start + buf->len - 1, GFP_NOFS);
@@ -3343,6 +3424,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                         buf->start + buf->len - 1, GFP_NOFS);
        }
        trans->blocks_used++;
+       /* this returns a buffer locked for blocking */
        return buf;
 }
 
@@ -3380,36 +3462,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
        u64 leaf_owner;
        u64 leaf_generation;
+       struct refsort *sorted;
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
        int i;
        int nritems;
        int ret;
+       int refi = 0;
+       int slot;
 
        BUG_ON(!btrfs_is_leaf(leaf));
        nritems = btrfs_header_nritems(leaf);
        leaf_owner = btrfs_header_owner(leaf);
        leaf_generation = btrfs_header_generation(leaf);
 
+       sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+       /* we do this loop twice.  The first time we build a list
+        * of the extents we have a reference on, then we sort the list
+        * by bytenr.  The second time around we actually do the
+        * extent freeing.
+        */
        for (i = 0; i < nritems; i++) {
                u64 disk_bytenr;
                cond_resched();
 
                btrfs_item_key_to_cpu(leaf, &key, i);
+
+               /* only extents have references, skip everything else */
                if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
                        continue;
+
                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+
+               /* inline extents live in the btree, they don't have refs */
                if (btrfs_file_extent_type(leaf, fi) ==
                    BTRFS_FILE_EXTENT_INLINE)
                        continue;
-               /*
-                * FIXME make sure to insert a trans record that
-                * repeats the snapshot del on crash
-                */
+
                disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
+               /* holes don't have refs */
                if (disk_bytenr == 0)
                        continue;
 
+               sorted[refi].bytenr = disk_bytenr;
+               sorted[refi].slot = i;
+               refi++;
+       }
+
+       if (refi == 0)
+               goto out;
+
+       sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+       for (i = 0; i < refi; i++) {
+               u64 disk_bytenr;
+
+               disk_bytenr = sorted[i].bytenr;
+               slot = sorted[i].slot;
+
+               cond_resched();
+
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+                       continue;
+
+               fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
                ret = __btrfs_free_extent(trans, root, disk_bytenr,
                                btrfs_file_extent_disk_num_bytes(leaf, fi),
                                leaf->start, leaf_owner, leaf_generation,
@@ -3420,6 +3539,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
                wake_up(&root->fs_info->transaction_throttle);
                cond_resched();
        }
+out:
+       kfree(sorted);
        return 0;
 }
 
@@ -3429,9 +3550,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
        int i;
        int ret;
-       struct btrfs_extent_info *info = ref->extents;
+       struct btrfs_extent_info *info;
+       struct refsort *sorted;
+
+       if (ref->nritems == 0)
+               return 0;
 
+       sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
+       for (i = 0; i < ref->nritems; i++) {
+               sorted[i].bytenr = ref->extents[i].bytenr;
+               sorted[i].slot = i;
+       }
+       sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+
+       /*
+        * the items in the ref were sorted when the ref was inserted
+        * into the ref cache, so this is already in order
+        */
        for (i = 0; i < ref->nritems; i++) {
+               info = ref->extents + sorted[i].slot;
                ret = __btrfs_free_extent(trans, root, info->bytenr,
                                          info->num_bytes, ref->bytenr,
                                          ref->owner, ref->generation,
@@ -3445,6 +3582,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
                info++;
        }
 
+       kfree(sorted);
        return 0;
 }
 
@@ -3489,6 +3627,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
 }
 
 /*
+ * this is used while deleting old snapshots, and it drops the refs
+ * on a whole subtree starting from a level 1 node.
+ *
+ * The idea is to sort all the leaf pointers, and then drop the
+ * ref on all the leaves in order.  Most of the time the leaves
+ * will have ref cache entries, so no leaf IOs will be required to
+ * find the extents they have references on.
+ *
+ * For each leaf, any references it has are also dropped in order
+ *
+ * This ends up dropping the references in something close to optimal
+ * order for reading and modifying the extent allocation tree.
+ */
+static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       struct btrfs_path *path)
+{
+       u64 bytenr;
+       u64 root_owner;
+       u64 root_gen;
+       struct extent_buffer *eb = path->nodes[1];
+       struct extent_buffer *leaf;
+       struct btrfs_leaf_ref *ref;
+       struct refsort *sorted = NULL;
+       int nritems = btrfs_header_nritems(eb);
+       int ret;
+       int i;
+       int refi = 0;
+       int slot = path->slots[1];
+       u32 blocksize = btrfs_level_size(root, 0);
+       u32 refs;
+
+       if (nritems == 0)
+               goto out;
+
+       root_owner = btrfs_header_owner(eb);
+       root_gen = btrfs_header_generation(eb);
+       sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+
+       /*
+        * step one, sort all the leaf pointers so we don't scribble
+        * randomly into the extent allocation tree
+        */
+       for (i = slot; i < nritems; i++) {
+               sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
+               sorted[refi].slot = i;
+               refi++;
+       }
+
+       /*
+        * nritems won't be zero, but if we're picking up drop_snapshot
+        * after a crash, slot might be > 0, so double check things
+        * just in case.
+        */
+       if (refi == 0)
+               goto out;
+
+       sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+       /*
+        * the first loop frees everything the leaves point to
+        */
+       for (i = 0; i < refi; i++) {
+               u64 ptr_gen;
+
+               bytenr = sorted[i].bytenr;
+
+               /*
+                * check the reference count on this leaf.  If it is > 1
+                * we just decrement it below and don't update any
+                * of the refs the leaf points to.
+                */
+               ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+               BUG_ON(ret);
+               if (refs != 1)
+                       continue;
+
+               ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
+
+               /*
+                * the leaf only had one reference, which means the
+                * only thing pointing to this leaf is the snapshot
+                * we're deleting.  It isn't possible for the reference
+                * count to increase again later
+                *
+                * The reference cache is checked for the leaf,
+                * and if found we'll be able to drop any refs held by
+                * the leaf without needing to read it in.
+                */
+               ref = btrfs_lookup_leaf_ref(root, bytenr);
+               if (ref && ref->generation != ptr_gen) {
+                       btrfs_free_leaf_ref(root, ref);
+                       ref = NULL;
+               }
+               if (ref) {
+                       ret = cache_drop_leaf_ref(trans, root, ref);
+                       BUG_ON(ret);
+                       btrfs_remove_leaf_ref(root, ref);
+                       btrfs_free_leaf_ref(root, ref);
+               } else {
+                       /*
+                        * the leaf wasn't in the reference cache, so
+                        * we have to read it.
+                        */
+                       leaf = read_tree_block(root, bytenr, blocksize,
+                                              ptr_gen);
+                       ret = btrfs_drop_leaf_ref(trans, root, leaf);
+                       BUG_ON(ret);
+                       free_extent_buffer(leaf);
+               }
+               atomic_inc(&root->fs_info->throttle_gen);
+               wake_up(&root->fs_info->transaction_throttle);
+               cond_resched();
+       }
+
+       /*
+        * run through the loop again to free the refs on the leaves.
+        * This is faster than doing it in the loop above because
+        * the leaves are likely to be clustered together.  We end up
+        * working in nice chunks on the extent allocation tree.
+        */
+       for (i = 0; i < refi; i++) {
+               bytenr = sorted[i].bytenr;
+               ret = __btrfs_free_extent(trans, root, bytenr,
+                                       blocksize, eb->start,
+                                       root_owner, root_gen, 0, 1);
+               BUG_ON(ret);
+
+               atomic_inc(&root->fs_info->throttle_gen);
+               wake_up(&root->fs_info->transaction_throttle);
+               cond_resched();
+       }
+out:
+       kfree(sorted);
+
+       /*
+        * update the path to show we've processed the entire level 1
+        * node.  This will get saved into the root's drop_snapshot_progress
+        * field so these drops are not repeated again if this transaction
+        * commits.
+        */
+       path->slots[1] = nritems;
+       return 0;
+}
+
+/*
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
  */
@@ -3503,7 +3787,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
        struct extent_buffer *next;
        struct extent_buffer *cur;
        struct extent_buffer *parent;
-       struct btrfs_leaf_ref *ref;
        u32 blocksize;
        int ret;
        u32 refs;
@@ -3530,17 +3813,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
                if (path->slots[*level] >=
                    btrfs_header_nritems(cur))
                        break;
+
+               /* the new code goes down to level 1 and does all the
+                * leaves pointed to that node in bulk.  So, this check
+                * for level 0 will always be false.
+                *
+                * But, the disk format allows the drop_snapshot_progress
+                * field in the root to leave things in a state where
+                * a leaf will need cleaning up here.  If someone crashes
+                * with the old code and then boots with the new code,
+                * we might find a leaf here.
+                */
                if (*level == 0) {
                        ret = btrfs_drop_leaf_ref(trans, root, cur);
                        BUG_ON(ret);
                        break;
                }
+
+               /*
+                * once we get to level one, process the whole node
+                * at once, including everything below it.
+                */
+               if (*level == 1) {
+                       ret = drop_level_one_refs(trans, root, path);
+                       BUG_ON(ret);
+                       break;
+               }
+
                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
                blocksize = btrfs_level_size(root, *level - 1);
 
                ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
                BUG_ON(ret);
+
+               /*
+                * if there is more than one reference, we don't need
+                * to read that node to drop any references it has.  We
+                * just drop the ref we hold on that node and move on to the
+                * next slot in this level.
+                */
                if (refs != 1) {
                        parent = path->nodes[*level];
                        root_owner = btrfs_header_owner(parent);
@@ -3559,46 +3871,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 
                        continue;
                }
+
                /*
-                * at this point, we have a single ref, and since the
-                * only place referencing this extent is a dead root
-                * the reference count should never go higher.
-                * So, we don't need to check it again
+                * we need to keep freeing things in the next level down.
+                * read the block and loop around to process it
                 */
-               if (*level == 1) {
-                       ref = btrfs_lookup_leaf_ref(root, bytenr);
-                       if (ref && ref->generation != ptr_gen) {
-                               btrfs_free_leaf_ref(root, ref);
-                               ref = NULL;
-                       }
-                       if (ref) {
-                               ret = cache_drop_leaf_ref(trans, root, ref);
-                               BUG_ON(ret);
-                               btrfs_remove_leaf_ref(root, ref);
-                               btrfs_free_leaf_ref(root, ref);
-                               *level = 0;
-                               break;
-                       }
-               }
-               next = btrfs_find_tree_block(root, bytenr, blocksize);
-               if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
-                       free_extent_buffer(next);
-
-                       next = read_tree_block(root, bytenr, blocksize,
-                                              ptr_gen);
-                       cond_resched();
-#if 0
-                       /*
-                        * this is a debugging check and can go away
-                        * the ref should never go all the way down to 1
-                        * at this point
-                        */
-                       ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
-                                               &refs);
-                       BUG_ON(ret);
-                       WARN_ON(refs != 1);
-#endif
-               }
+               next = read_tree_block(root, bytenr, blocksize, ptr_gen);
                WARN_ON(*level <= 0);
                if (path->nodes[*level-1])
                        free_extent_buffer(path->nodes[*level-1]);
@@ -3623,11 +3901,16 @@ out:
        root_owner = btrfs_header_owner(parent);
        root_gen = btrfs_header_generation(parent);
 
+       /*
+        * cleanup and free the reference on the last node
+        * we processed
+        */
        ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
                                  parent->start, root_owner, root_gen,
                                  *level, 1);
        free_extent_buffer(path->nodes[*level]);
        path->nodes[*level] = NULL;
+
        *level += 1;
        BUG_ON(ret);
 
@@ -3679,6 +3962,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 
                next = read_tree_block(root, bytenr, blocksize, ptr_gen);
                btrfs_tree_lock(next);
+               btrfs_set_lock_blocking(next);
 
                ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
                                              &refs);
@@ -3746,6 +4030,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
                if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
                        struct extent_buffer *node;
                        struct btrfs_disk_key disk_key;
+
+                       /*
+                        * there is more work to do in this level.
+                        * Update the drop_progress marker to reflect
+                        * the work we've done so far, and then bump
+                        * the slot number
+                        */
                        node = path->nodes[i];
                        path->slots[i]++;
                        *level = i;
@@ -3757,6 +4048,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
                        return 0;
                } else {
                        struct extent_buffer *parent;
+
+                       /*
+                        * this whole node is done, free our reference
+                        * on it and go up one level
+                        */
                        if (path->nodes[*level] == root->node)
                                parent = path->nodes[*level];
                        else
@@ -4771,6 +5067,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
                ref->bytenr = buf->start;
                ref->owner = btrfs_header_owner(buf);
                ref->generation = btrfs_header_generation(buf);
+
                ret = btrfs_add_leaf_ref(root, ref, 0);
                WARN_ON(ret);
                btrfs_free_leaf_ref(root, ref);