Merge branch 'for-2.6.32' of git://git.kernel.dk/linux-2.6-block
[safe/jmp/linux-2.6] / fs / btrfs / disk-io.c
index 5aebddd..15831d5 100644 (file)
@@ -26,8 +26,8 @@
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/crc32c.h>
 #include "compat.h"
-#include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "print-tree.h"
 #include "async-thread.h"
 #include "locking.h"
-#include "ref-cache.h"
 #include "tree-log.h"
+#include "free-space-cache.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 
+static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
+
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
  * complete.  This is used during reads to verify checksums, and it is used
@@ -75,6 +77,40 @@ struct async_submit_bio {
        struct btrfs_work work;
 };
 
+/* These are used to set the lockdep class on the extent buffer locks.
+ * The class is set by the readpage_end_io_hook after the buffer has
+ * passed csum validation but before the pages are unlocked.
+ *
+ * The lockdep class is also set by btrfs_init_new_buffer on freshly
+ * allocated blocks.
+ *
+ * The class is based on the level in the tree block, which allows lockdep
+ * to know that lower nodes nest inside the locks of higher nodes.
+ *
+ * We also add a check to make sure the highest level of the tree is
+ * the same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this
+ * code needs update as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# if BTRFS_MAX_LEVEL != 8
+#  error
+# endif
+static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
+static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
+       /* leaf */
+       "btrfs-extent-00",
+       "btrfs-extent-01",
+       "btrfs-extent-02",
+       "btrfs-extent-03",
+       "btrfs-extent-04",
+       "btrfs-extent-05",
+       "btrfs-extent-06",
+       "btrfs-extent-07",
+       /* highest possible level */
+       "btrfs-extent-08",
+};
+#endif
+
 /*
  * extents on the btree inode are pretty simple, there's one extent
  * that covers the entire device
@@ -137,7 +173,7 @@ out:
 
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
 {
-       return btrfs_crc32c(seed, data, len);
+       return crc32c(seed, data, len);
 }
 
 void btrfs_csum_final(u32 crc, char *result)
@@ -197,10 +233,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
                        memcpy(&found, result, csum_size);
 
                        read_extent_buffer(buf, &val, 0, csum_size);
-                       printk(KERN_INFO "btrfs: %s checksum verify failed "
-                              "on %llu wanted %X found %X level %d\n",
-                              root->fs_info->sb->s_id,
-                              buf->start, val, found, btrfs_header_level(buf));
+                       if (printk_ratelimit()) {
+                               printk(KERN_INFO "btrfs: %s checksum verify "
+                                      "failed on %llu wanted %X found %X "
+                                      "level %d\n",
+                                      root->fs_info->sb->s_id,
+                                      (unsigned long long)buf->start, val, found,
+                                      btrfs_header_level(buf));
+                       }
                        if (result != (char *)&inline_result)
                                kfree(result);
                        return 1;
@@ -233,10 +273,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
                ret = 0;
                goto out;
        }
-       printk("parent transid verify failed on %llu wanted %llu found %llu\n",
-              (unsigned long long)eb->start,
-              (unsigned long long)parent_transid,
-              (unsigned long long)btrfs_header_generation(eb));
+       if (printk_ratelimit()) {
+               printk("parent transid verify failed on %llu wanted %llu "
+                      "found %llu\n",
+                      (unsigned long long)eb->start,
+                      (unsigned long long)parent_transid,
+                      (unsigned long long)btrfs_header_generation(eb));
+       }
        ret = 1;
        clear_extent_buffer_uptodate(io_tree, eb);
 out:
@@ -347,6 +390,15 @@ static int check_tree_block_fsid(struct btrfs_root *root,
        return ret;
 }
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
+{
+       lockdep_set_class_and_name(&eb->lock,
+                          &btrfs_eb_class[level],
+                          btrfs_eb_name[level]);
+}
+#endif
+
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                               struct extent_state *state)
 {
@@ -371,9 +423,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 
        found_start = btrfs_header_bytenr(eb);
        if (found_start != start) {
-               printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
-                      (unsigned long long)found_start,
-                      (unsigned long long)eb->start);
+               if (printk_ratelimit()) {
+                       printk(KERN_INFO "btrfs bad tree block start "
+                              "%llu %llu\n",
+                              (unsigned long long)found_start,
+                              (unsigned long long)eb->start);
+               }
                ret = -EIO;
                goto err;
        }
@@ -385,13 +440,17 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                goto err;
        }
        if (check_tree_block_fsid(root, eb)) {
-               printk(KERN_INFO "btrfs bad fsid on block %llu\n",
-                      (unsigned long long)eb->start);
+               if (printk_ratelimit()) {
+                       printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+                              (unsigned long long)eb->start);
+               }
                ret = -EIO;
                goto err;
        }
        found_level = btrfs_header_level(eb);
 
+       btrfs_set_buffer_lockdep_class(eb, found_level);
+
        ret = csum_tree_block(root, eb, 1);
        if (ret)
                ret = -EIO;
@@ -533,19 +592,12 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
        async->bio_flags = bio_flags;
 
        atomic_inc(&fs_info->nr_async_submits);
+
+       if (rw & (1 << BIO_RW_SYNCIO))
+               btrfs_set_work_high_prio(&async->work);
+
        btrfs_queue_worker(&fs_info->workers, &async->work);
-#if 0
-       int limit = btrfs_async_submit_limit(fs_info);
-       if (atomic_read(&fs_info->nr_async_submits) > limit) {
-               wait_event_timeout(fs_info->async_submit_wait,
-                          (atomic_read(&fs_info->nr_async_submits) < limit),
-                          HZ/10);
 
-               wait_event_timeout(fs_info->async_submit_wait,
-                          (atomic_read(&fs_info->nr_async_bios) < limit),
-                          HZ/10);
-       }
-#endif
        while (atomic_read(&fs_info->async_submit_draining) &&
              atomic_read(&fs_info->nr_async_submits)) {
                wait_event(fs_info->async_submit_wait,
@@ -610,6 +662,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
                                     mirror_num, 0);
        }
+
        /*
         * kthread helpers are used to submit writes so that checksumming
         * can happen in parallel across all CPUs
@@ -623,14 +676,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
        struct extent_io_tree *tree;
+       struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+       struct extent_buffer *eb;
+       int was_dirty;
+
        tree = &BTRFS_I(page->mapping->host)->io_tree;
+       if (!(current->flags & PF_MEMALLOC)) {
+               return extent_write_full_page(tree, page,
+                                             btree_get_extent, wbc);
+       }
 
-       if (current->flags & PF_MEMALLOC) {
-               redirty_page_for_writepage(wbc, page);
-               unlock_page(page);
-               return 0;
+       redirty_page_for_writepage(wbc, page);
+       eb = btrfs_find_tree_block(root, page_offset(page),
+                                     PAGE_CACHE_SIZE);
+       WARN_ON(!eb);
+
+       was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+       if (!was_dirty) {
+               spin_lock(&root->fs_info->delalloc_lock);
+               root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
+               spin_unlock(&root->fs_info->delalloc_lock);
        }
-       return extent_write_full_page(tree, page, btree_get_extent, wbc);
+       free_extent_buffer(eb);
+
+       unlock_page(page);
+       return 0;
 }
 
 static int btree_writepages(struct address_space *mapping,
@@ -639,15 +709,15 @@ static int btree_writepages(struct address_space *mapping,
        struct extent_io_tree *tree;
        tree = &BTRFS_I(mapping->host)->io_tree;
        if (wbc->sync_mode == WB_SYNC_NONE) {
+               struct btrfs_root *root = BTRFS_I(mapping->host)->root;
                u64 num_dirty;
-               u64 start = 0;
                unsigned long thresh = 32 * 1024 * 1024;
 
                if (wbc->for_kupdate)
                        return 0;
 
-               num_dirty = count_range_bits(tree, &start, (u64)-1,
-                                            thresh, EXTENT_DIRTY);
+               /* this is a bit racy, but that's ok */
+               num_dirty = root->fs_info->dirty_metadata_bytes;
                if (num_dirty < thresh)
                        return 0;
        }
@@ -702,27 +772,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
        }
 }
 
-#if 0
-static int btree_writepage(struct page *page, struct writeback_control *wbc)
-{
-       struct buffer_head *bh;
-       struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-       struct buffer_head *head;
-       if (!page_has_buffers(page)) {
-               create_empty_buffers(page, root->fs_info->sb->s_blocksize,
-                                       (1 << BH_Dirty)|(1 << BH_Uptodate));
-       }
-       head = page_buffers(page);
-       bh = head;
-       do {
-               if (buffer_dirty(bh))
-                       csum_tree_block(root, bh, 0);
-               bh = bh->b_this_page;
-       } while (bh != head);
-       return block_write_full_page(page, btree_get_block, wbc);
-}
-#endif
-
 static struct address_space_operations btree_aops = {
        .readpage       = btree_readpage,
        .writepage      = btree_writepage,
@@ -800,8 +849,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 
        if (ret == 0)
                set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
-       else
-               WARN_ON(1);
        return buf;
 
 }
@@ -812,11 +859,19 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        struct inode *btree_inode = root->fs_info->btree_inode;
        if (btrfs_header_generation(buf) ==
            root->fs_info->running_transaction->transid) {
-               WARN_ON(!btrfs_tree_locked(buf));
+               btrfs_assert_tree_locked(buf);
+
+               if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+                       spin_lock(&root->fs_info->delalloc_lock);
+                       if (root->fs_info->dirty_metadata_bytes >= buf->len)
+                               root->fs_info->dirty_metadata_bytes -= buf->len;
+                       else
+                               WARN_ON(1);
+                       spin_unlock(&root->fs_info->delalloc_lock);
+               }
 
-               /* ugh, clear_extent_buffer_dirty can be expensive */
+               /* ugh, clear_extent_buffer_dirty needs to lock the page */
                btrfs_set_lock_blocking(buf);
-
                clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
                                          buf);
        }
@@ -830,7 +885,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 {
        root->node = NULL;
        root->commit_root = NULL;
-       root->ref_tree = NULL;
        root->sectorsize = sectorsize;
        root->nodesize = nodesize;
        root->leafsize = leafsize;
@@ -845,12 +899,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->last_inode_alloc = 0;
        root->name = NULL;
        root->in_sysfs = 0;
+       root->inode_tree.rb_node = NULL;
 
        INIT_LIST_HEAD(&root->dirty_list);
        INIT_LIST_HEAD(&root->orphan_list);
-       INIT_LIST_HEAD(&root->dead_list);
+       INIT_LIST_HEAD(&root->root_list);
        spin_lock_init(&root->node_lock);
        spin_lock_init(&root->list_lock);
+       spin_lock_init(&root->inode_lock);
        mutex_init(&root->objectid_mutex);
        mutex_init(&root->log_mutex);
        init_waitqueue_head(&root->log_writer_wait);
@@ -864,9 +920,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        extent_io_tree_init(&root->dirty_log_pages,
                             fs_info->btree_inode->i_mapping, GFP_NOFS);
 
-       btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
-       root->ref_tree = &root->ref_tree_struct;
-
        memset(&root->root_key, 0, sizeof(root->root_key));
        memset(&root->root_item, 0, sizeof(root->root_item));
        memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -905,6 +958,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                                     blocksize, generation);
+       root->commit_root = btrfs_root_node(root);
        BUG_ON(!root->node);
        return 0;
 }
@@ -971,20 +1025,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
         */
        root->ref_cows = 0;
 
-       leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-                                     0, BTRFS_TREE_LOG_OBJECTID,
-                                     trans->transid, 0, 0, 0);
+       leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+                                     BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
        if (IS_ERR(leaf)) {
                kfree(root);
                return ERR_CAST(leaf);
        }
 
+       memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+       btrfs_set_header_bytenr(leaf, leaf->start);
+       btrfs_set_header_generation(leaf, trans->transid);
+       btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+       btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
        root->node = leaf;
-       btrfs_set_header_nritems(root->node, 0);
-       btrfs_set_header_level(root->node, 0);
-       btrfs_set_header_bytenr(root->node, root->node->start);
-       btrfs_set_header_generation(root->node, trans->transid);
-       btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
 
        write_extent_buffer(root->node, root->fs_info->fsid,
                            (unsigned long)btrfs_header_fsid(root->node),
@@ -1027,8 +1080,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
        inode_item->nbytes = cpu_to_le64(root->leafsize);
        inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
-       btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
-       btrfs_set_root_generation(&log_root->root_item, trans->transid);
+       btrfs_set_root_node(&log_root->root_item, log_root->node);
 
        WARN_ON(root->log_root);
        root->log_root = log_root;
@@ -1090,6 +1142,7 @@ out:
        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                                     blocksize, generation);
+       root->commit_root = btrfs_root_node(root);
        BUG_ON(!root->node);
 insert:
        if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -1156,7 +1209,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
        }
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
                ret = btrfs_find_dead_roots(fs_info->tree_root,
-                                           root->root_key.objectid, root);
+                                           root->root_key.objectid);
                BUG_ON(ret);
                btrfs_orphan_cleanup(root);
        }
@@ -1202,11 +1255,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
        int ret = 0;
        struct btrfs_device *device;
        struct backing_dev_info *bdi;
-#if 0
-       if ((bdi_bits & (1 << BDI_write_congested)) &&
-           btrfs_congested_async(info, 0))
-               return 1;
-#endif
+
        list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
                if (!device->bdev)
                        continue;
@@ -1295,12 +1344,26 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
        free_extent_map(em);
 }
 
+/*
+ * If this fails, caller must call bdi_destroy() to get rid of the
+ * bdi again.
+ */
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
-       bdi_init(bdi);
+       int err;
+
+       bdi->name = "btrfs";
+       bdi->capabilities = BDI_CAP_MAP_COPY;
+       err = bdi_init(bdi);
+       if (err)
+               return err;
+
+       err = bdi_register(bdi, NULL, "btrfs-%d",
+                               atomic_inc_return(&btrfs_bdi_num));
+       if (err)
+               return err;
+
        bdi->ra_pages   = default_backing_dev_info.ra_pages;
-       bdi->state              = 0;
-       bdi->capabilities       = default_backing_dev_info.capabilities;
        bdi->unplug_io_fn       = btrfs_unplug_io_fn;
        bdi->unplug_io_data     = info;
        bdi->congested_fn       = btrfs_congested_fn;
@@ -1342,8 +1405,6 @@ static int bio_ready_for_csum(struct bio *bio)
 
        ret = extent_range_uptodate(io_tree, start + length,
                                    start + buf_len - 1);
-       if (ret == 1)
-               return ret;
        return ret;
 }
 
@@ -1426,12 +1487,6 @@ static int transaction_kthread(void *arg)
                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
                mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-               if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
-                       printk(KERN_INFO "btrfs: total reference cache "
-                              "size %llu\n",
-                              root->fs_info->total_ref_cache_size);
-               }
-
                mutex_lock(&root->fs_info->trans_mutex);
                cur = root->fs_info->running_transaction;
                if (!cur) {
@@ -1448,6 +1503,7 @@ static int transaction_kthread(void *arg)
                mutex_unlock(&root->fs_info->trans_mutex);
                trans = btrfs_start_transaction(root, 1);
                ret = btrfs_commit_transaction(trans, root);
+
 sleep:
                wake_up_process(root->fs_info->cleaner_kthread);
                mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1507,6 +1563,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->hashers);
        INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+       INIT_LIST_HEAD(&fs_info->ordered_operations);
        spin_lock_init(&fs_info->delalloc_lock);
        spin_lock_init(&fs_info->new_trans_lock);
        spin_lock_init(&fs_info->ref_cache_lock);
@@ -1525,15 +1582,15 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        atomic_set(&fs_info->async_delalloc_pages, 0);
        atomic_set(&fs_info->async_submit_draining, 0);
        atomic_set(&fs_info->nr_async_bios, 0);
-       atomic_set(&fs_info->throttles, 0);
-       atomic_set(&fs_info->throttle_gen, 0);
        fs_info->sb = sb;
        fs_info->max_extent = (u64)-1;
        fs_info->max_inline = 8192 * 1024;
-       setup_bdi(fs_info, &fs_info->bdi);
+       if (setup_bdi(fs_info, &fs_info->bdi))
+               goto fail_bdi;
        fs_info->btree_inode = new_inode(sb);
        fs_info->btree_inode->i_ino = 1;
        fs_info->btree_inode->i_nlink = 1;
+       fs_info->metadata_ratio = 8;
 
        fs_info->thread_pool_size = min_t(unsigned long,
                                          num_online_cpus() + 2, 8);
@@ -1553,6 +1610,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
        fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
 
+       RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
        extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
                             fs_info->btree_inode->i_mapping,
                             GFP_NOFS);
@@ -1566,31 +1624,27 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
        extent_io_tree_init(&fs_info->pinned_extents,
                             fs_info->btree_inode->i_mapping, GFP_NOFS);
-       extent_io_tree_init(&fs_info->pending_del,
-                            fs_info->btree_inode->i_mapping, GFP_NOFS);
-       extent_io_tree_init(&fs_info->extent_ins,
-                            fs_info->btree_inode->i_mapping, GFP_NOFS);
        fs_info->do_barriers = 1;
 
-       INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
-       btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
-       btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
-
        BTRFS_I(fs_info->btree_inode)->root = tree_root;
        memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
               sizeof(struct btrfs_key));
        insert_inode_hash(fs_info->btree_inode);
 
        mutex_init(&fs_info->trans_mutex);
+       mutex_init(&fs_info->ordered_operations_mutex);
        mutex_init(&fs_info->tree_log_mutex);
        mutex_init(&fs_info->drop_mutex);
-       mutex_init(&fs_info->extent_ins_mutex);
-       mutex_init(&fs_info->pinned_mutex);
        mutex_init(&fs_info->chunk_mutex);
        mutex_init(&fs_info->transaction_kthread_mutex);
        mutex_init(&fs_info->cleaner_mutex);
        mutex_init(&fs_info->volume_mutex);
        mutex_init(&fs_info->tree_reloc_mutex);
+       init_rwsem(&fs_info->extent_commit_sem);
+
+       btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+       btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
+
        init_waitqueue_head(&fs_info->transaction_throttle);
        init_waitqueue_head(&fs_info->transaction_wait);
        init_waitqueue_head(&fs_info->async_submit_wait);
@@ -1625,17 +1679,23 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        if (features) {
                printk(KERN_ERR "BTRFS: couldn't mount because of "
                       "unsupported optional features (%Lx).\n",
-                      features);
+                      (unsigned long long)features);
                err = -EINVAL;
                goto fail_iput;
        }
 
+       features = btrfs_super_incompat_flags(disk_super);
+       if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
+               features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+               btrfs_set_super_incompat_flags(disk_super, features);
+       }
+
        features = btrfs_super_compat_ro_flags(disk_super) &
                ~BTRFS_FEATURE_COMPAT_RO_SUPP;
        if (!(sb->s_flags & MS_RDONLY) && features) {
                printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
                       "unsupported option features (%Lx).\n",
-                      features);
+                      (unsigned long long)features);
                err = -EINVAL;
                goto fail_iput;
        }
@@ -1727,7 +1787,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        if (ret) {
                printk(KERN_WARNING "btrfs: failed to read the system "
                       "array on %s\n", sb->s_id);
-               goto fail_sys_array;
+               goto fail_sb_buffer;
        }
 
        blocksize = btrfs_level_size(tree_root,
@@ -1741,6 +1801,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                                           btrfs_super_chunk_root(disk_super),
                                           blocksize, generation);
        BUG_ON(!chunk_root->node);
+       if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+               printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+                      sb->s_id);
+               goto fail_chunk_root;
+       }
+       btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
+       chunk_root->commit_root = btrfs_root_node(chunk_root);
 
        read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
           (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
@@ -1766,7 +1833,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                                          blocksize, generation);
        if (!tree_root->node)
                goto fail_chunk_root;
-
+       if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+               printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+                      sb->s_id);
+               goto fail_tree_root;
+       }
+       btrfs_set_root_node(&tree_root->root_item, tree_root->node);
+       tree_root->commit_root = btrfs_root_node(tree_root);
 
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
@@ -1776,15 +1849,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_DEV_TREE_OBJECTID, dev_root);
-       dev_root->track_dirty = 1;
-
        if (ret)
                goto fail_extent_root;
+       dev_root->track_dirty = 1;
 
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_CSUM_TREE_OBJECTID, csum_root);
        if (ret)
-               goto fail_extent_root;
+               goto fail_dev_root;
 
        csum_root->track_dirty = 1;
 
@@ -1806,6 +1878,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        if (IS_ERR(fs_info->transaction_kthread))
                goto fail_cleaner;
 
+       if (!btrfs_test_opt(tree_root, SSD) &&
+           !btrfs_test_opt(tree_root, NOSSD) &&
+           !fs_info->fs_devices->rotating) {
+               printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
+                      "mode\n");
+               btrfs_set_opt(fs_info->mount_opt, SSD);
+       }
+
        if (btrfs_super_log_root(disk_super) != 0) {
                u64 bytenr = btrfs_super_log_root(disk_super);
 
@@ -1838,7 +1918,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        }
 
        if (!(sb->s_flags & MS_RDONLY)) {
-               ret = btrfs_cleanup_reloc_trees(tree_root);
+               ret = btrfs_recover_relocation(tree_root);
                BUG_ON(ret);
        }
 
@@ -1849,6 +1929,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
        if (!fs_info->fs_root)
                goto fail_trans_kthread;
+
        return tree_root;
 
 fail_trans_kthread:
@@ -1865,14 +1946,19 @@ fail_cleaner:
 
 fail_csum_root:
        free_extent_buffer(csum_root->node);
+       free_extent_buffer(csum_root->commit_root);
+fail_dev_root:
+       free_extent_buffer(dev_root->node);
+       free_extent_buffer(dev_root->commit_root);
 fail_extent_root:
        free_extent_buffer(extent_root->node);
+       free_extent_buffer(extent_root->commit_root);
 fail_tree_root:
        free_extent_buffer(tree_root->node);
+       free_extent_buffer(tree_root->commit_root);
 fail_chunk_root:
        free_extent_buffer(chunk_root->node);
-fail_sys_array:
-       free_extent_buffer(dev_root->node);
+       free_extent_buffer(chunk_root->commit_root);
 fail_sb_buffer:
        btrfs_stop_workers(&fs_info->fixup_workers);
        btrfs_stop_workers(&fs_info->delalloc_workers);
@@ -1888,8 +1974,8 @@ fail_iput:
 
        btrfs_close_devices(fs_info->fs_devices);
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
+fail_bdi:
        bdi_destroy(&fs_info->bdi);
-
 fail:
        kfree(extent_root);
        kfree(tree_root);
@@ -1962,6 +2048,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
        return latest;
 }
 
+/*
+ * this should be called twice, once with wait == 0 and
+ * once with wait == 1.  When wait == 0 is done, all the buffer heads
+ * we write are pinned.
+ *
+ * They are released when wait == 1 is done.
+ * max_mirrors must be the same for both runs, and it indicates how
+ * many supers on this one device should be written.
+ *
+ * max_mirrors == 0 means to write them all.
+ */
 static int write_dev_supers(struct btrfs_device *device,
                            struct btrfs_super_block *sb,
                            int do_barriers, int wait, int max_mirrors)
@@ -1997,12 +2094,16 @@ static int write_dev_supers(struct btrfs_device *device,
                        bh = __find_get_block(device->bdev, bytenr / 4096,
                                              BTRFS_SUPER_INFO_SIZE);
                        BUG_ON(!bh);
-                       brelse(bh);
                        wait_on_buffer(bh);
-                       if (buffer_uptodate(bh)) {
-                               brelse(bh);
-                               continue;
-                       }
+                       if (!buffer_uptodate(bh))
+                               errors++;
+
+                       /* drop our reference */
+                       brelse(bh);
+
+                       /* drop the reference from the wait == 0 run */
+                       brelse(bh);
+                       continue;
                } else {
                        btrfs_set_super_bytenr(sb, bytenr);
 
@@ -2013,12 +2114,18 @@ static int write_dev_supers(struct btrfs_device *device,
                                              BTRFS_CSUM_SIZE);
                        btrfs_csum_final(crc, sb->csum);
 
+                       /*
+                        * one reference for us, and we leave it for the
+                        * caller
+                        */
                        bh = __getblk(device->bdev, bytenr / 4096,
                                      BTRFS_SUPER_INFO_SIZE);
                        memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
 
-                       set_buffer_uptodate(bh);
+                       /* one reference for submit_bh */
                        get_bh(bh);
+
+                       set_buffer_uptodate(bh);
                        lock_buffer(bh);
                        bh->b_end_io = btrfs_end_buffer_write_sync;
                }
@@ -2030,30 +2137,24 @@ static int write_dev_supers(struct btrfs_device *device,
                                       device->name);
                                set_buffer_uptodate(bh);
                                device->barriers = 0;
+                               /* one reference for submit_bh */
                                get_bh(bh);
                                lock_buffer(bh);
-                               ret = submit_bh(WRITE, bh);
+                               ret = submit_bh(WRITE_SYNC, bh);
                        }
                } else {
-                       ret = submit_bh(WRITE, bh);
+                       ret = submit_bh(WRITE_SYNC, bh);
                }
 
-               if (!ret && wait) {
-                       wait_on_buffer(bh);
-                       if (!buffer_uptodate(bh))
-                               errors++;
-               } else if (ret) {
+               if (ret)
                        errors++;
-               }
-               if (wait)
-                       brelse(bh);
        }
        return errors < i ? 0 : -1;
 }
 
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
-       struct list_head *head = &root->fs_info->fs_devices->devices;
+       struct list_head *head;
        struct btrfs_device *dev;
        struct btrfs_super_block *sb;
        struct btrfs_dev_item *dev_item;
@@ -2068,6 +2169,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
        sb = &root->fs_info->super_for_commit;
        dev_item = &sb->dev_item;
+
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+       head = &root->fs_info->fs_devices->devices;
        list_for_each_entry(dev, head, dev_list) {
                if (!dev->bdev) {
                        total_errors++;
@@ -2111,6 +2215,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
                if (ret)
                        total_errors++;
        }
+       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
        if (total_errors > max_errors) {
                printk(KERN_ERR "btrfs: %d errors while writing supers\n",
                       total_errors);
@@ -2130,6 +2235,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
+       WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
        radix_tree_delete(&fs_info->fs_roots_radix,
                          (unsigned long)root->root_key.objectid);
        if (root->anon_super.s_dev) {
@@ -2176,10 +2282,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
                                             ARRAY_SIZE(gang));
                if (!ret)
                        break;
+
+               root_objectid = gang[ret - 1]->root_key.objectid + 1;
                for (i = 0; i < ret; i++) {
                        root_objectid = gang[i]->root_key.objectid;
                        ret = btrfs_find_dead_roots(fs_info->tree_root,
-                                                   root_objectid, gang[i]);
+                                                   root_objectid);
                        BUG_ON(ret);
                        btrfs_orphan_cleanup(gang[i]);
                }
@@ -2226,31 +2334,31 @@ int close_ctree(struct btrfs_root *root)
                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
        }
 
+       fs_info->closing = 2;
+       smp_mb();
+
        if (fs_info->delalloc_bytes) {
                printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
-                      fs_info->delalloc_bytes);
+                      (unsigned long long)fs_info->delalloc_bytes);
        }
        if (fs_info->total_ref_cache_size) {
                printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
                       (unsigned long long)fs_info->total_ref_cache_size);
        }
 
-       if (fs_info->extent_root->node)
-               free_extent_buffer(fs_info->extent_root->node);
-
-       if (fs_info->tree_root->node)
-               free_extent_buffer(fs_info->tree_root->node);
-
-       if (root->fs_info->chunk_root->node)
-               free_extent_buffer(root->fs_info->chunk_root->node);
-
-       if (root->fs_info->dev_root->node)
-               free_extent_buffer(root->fs_info->dev_root->node);
-
-       if (root->fs_info->csum_root->node)
-               free_extent_buffer(root->fs_info->csum_root->node);
+       free_extent_buffer(fs_info->extent_root->node);
+       free_extent_buffer(fs_info->extent_root->commit_root);
+       free_extent_buffer(fs_info->tree_root->node);
+       free_extent_buffer(fs_info->tree_root->commit_root);
+       free_extent_buffer(root->fs_info->chunk_root->node);
+       free_extent_buffer(root->fs_info->chunk_root->commit_root);
+       free_extent_buffer(root->fs_info->dev_root->node);
+       free_extent_buffer(root->fs_info->dev_root->commit_root);
+       free_extent_buffer(root->fs_info->csum_root->node);
+       free_extent_buffer(root->fs_info->csum_root->commit_root);
 
        btrfs_free_block_groups(root->fs_info);
+       btrfs_free_pinned_extents(root->fs_info);
 
        del_fs_roots(fs_info);
 
@@ -2265,16 +2373,6 @@ int close_ctree(struct btrfs_root *root)
        btrfs_stop_workers(&fs_info->endio_write_workers);
        btrfs_stop_workers(&fs_info->submit_workers);
 
-#if 0
-       while (!list_empty(&fs_info->hashers)) {
-               struct btrfs_hasher *hasher;
-               hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
-                                   hashers);
-               list_del(&hasher->hashers);
-               crypto_free_hash(&fs_info->hash_tfm);
-               kfree(hasher);
-       }
-#endif
        btrfs_close_devices(fs_info->fs_devices);
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
@@ -2314,10 +2412,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
        struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
        u64 transid = btrfs_header_generation(buf);
        struct inode *btree_inode = root->fs_info->btree_inode;
+       int was_dirty;
 
-       btrfs_set_lock_blocking(buf);
-
-       WARN_ON(!btrfs_tree_locked(buf));
+       btrfs_assert_tree_locked(buf);
        if (transid != root->fs_info->generation) {
                printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
                       "found %llu running %llu\n",
@@ -2326,7 +2423,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
                        (unsigned long long)root->fs_info->generation);
                WARN_ON(1);
        }
-       set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
+       was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
+                                           buf);
+       if (!was_dirty) {
+               spin_lock(&root->fs_info->delalloc_lock);
+               root->fs_info->dirty_metadata_bytes += buf->len;
+               spin_unlock(&root->fs_info->delalloc_lock);
+       }
 }
 
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2335,17 +2438,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
         * looks as though older kernels can get into trouble with
         * this code, they end up stuck in balance_dirty_pages forever
         */
-       struct extent_io_tree *tree;
        u64 num_dirty;
-       u64 start = 0;
        unsigned long thresh = 32 * 1024 * 1024;
-       tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 
-       if (current_is_pdflush() || current->flags & PF_MEMALLOC)
+       if (current->flags & PF_MEMALLOC)
                return;
 
-       num_dirty = count_range_bits(tree, &start, (u64)-1,
-                                    thresh, EXTENT_DIRTY);
+       num_dirty = root->fs_info->dirty_metadata_bytes;
+
        if (num_dirty > thresh) {
                balance_dirty_pages_ratelimited_nr(
                                   root->fs_info->btree_inode->i_mapping, 1);
@@ -2366,6 +2466,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 int btree_lock_page_hook(struct page *page)
 {
        struct inode *inode = page->mapping->host;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_buffer *eb;
        unsigned long len;
@@ -2381,6 +2482,16 @@ int btree_lock_page_hook(struct page *page)
 
        btrfs_tree_lock(eb);
        btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+
+       if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+               spin_lock(&root->fs_info->delalloc_lock);
+               if (root->fs_info->dirty_metadata_bytes >= eb->len)
+                       root->fs_info->dirty_metadata_bytes -= eb->len;
+               else
+                       WARN_ON(1);
+               spin_unlock(&root->fs_info->delalloc_lock);
+       }
+
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);
 out: