Merge branch 'for-linus' of git://git390.marist.edu/pub/scm/linux-2.6
[safe/jmp/linux-2.6] / fs / btrfs / inode.c
index b9fe06d..fa6ccc1 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/falloc.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -88,13 +89,14 @@ static noinline int cow_file_range(struct inode *inode,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written, int unlock);
 
-static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
+static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
+                                    struct inode *inode,  struct inode *dir)
 {
        int err;
 
-       err = btrfs_init_acl(inode, dir);
+       err = btrfs_init_acl(trans, inode, dir);
        if (!err)
-               err = btrfs_xattr_security_init(inode, dir);
+               err = btrfs_xattr_security_init(trans, inode, dir);
        return err;
 }
 
@@ -188,8 +190,18 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
 
+       /*
+        * we're an inline extent, so nobody can
+        * extend the file past i_size without locking
+        * a page we already have locked.
+        *
+        * We must do any isize and inode updates
+        * before we unlock the pages.  Otherwise we
+        * could end up racing with unlink.
+        */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        btrfs_update_inode(trans, root, inode);
+
        return 0;
 fail:
        btrfs_free_path(path);
@@ -230,8 +242,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
                return 1;
        }
 
-       ret = btrfs_drop_extents(trans, root, inode, start,
-                                aligned_end, aligned_end, start,
+       ret = btrfs_drop_extents(trans, inode, start, aligned_end,
                                 &hint_byte, 1);
        BUG_ON(ret);
 
@@ -241,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
                                   inline_len, compressed_size,
                                   compressed_pages);
        BUG_ON(ret);
+       btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
        return 0;
 }
@@ -369,7 +381,8 @@ again:
         * change at any time if we discover bad compression ratios.
         */
        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
-           btrfs_test_opt(root, COMPRESS)) {
+           (btrfs_test_opt(root, COMPRESS) ||
+            (BTRFS_I(inode)->force_compress))) {
                WARN_ON(pages);
                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 
@@ -402,6 +415,7 @@ again:
                trans = btrfs_join_transaction(root, 1);
                BUG_ON(!trans);
                btrfs_set_trans_block_group(trans, inode);
+               trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
                /* lets try to make an inline extent */
                if (ret || total_in < (actual_end - start)) {
@@ -416,7 +430,6 @@ again:
                                                    start, end,
                                                    total_compressed, pages);
                }
-               btrfs_end_transaction(trans, root);
                if (ret == 0) {
                        /*
                         * inline extent creation worked, we don't need
@@ -424,12 +437,16 @@ again:
                         * and free up our temp pages.
                         */
                        extent_clear_unlock_delalloc(inode,
-                                                    &BTRFS_I(inode)->io_tree,
-                                                    start, end, NULL, 1, 0,
-                                                    0, 1, 1, 1, 0);
-                       ret = 0;
+                            &BTRFS_I(inode)->io_tree,
+                            start, end, NULL,
+                            EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+                            EXTENT_CLEAR_DELALLOC |
+                            EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
+
+                       btrfs_end_transaction(trans, root);
                        goto free_pages_out;
                }
+               btrfs_end_transaction(trans, root);
        }
 
        if (will_compress) {
@@ -469,7 +486,10 @@ again:
                nr_pages_ret = 0;
 
                /* flag the file so we don't compress in the future */
-               BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+               if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
+                   !(BTRFS_I(inode)->force_compress)) {
+                       BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+               }
        }
        if (will_compress) {
                *num_added += 1;
@@ -535,12 +555,11 @@ static noinline int submit_compressed_extents(struct inode *inode,
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_io_tree *io_tree;
-       int ret;
+       int ret = 0;
 
        if (list_empty(&async_cow->extents))
                return 0;
 
-       trans = btrfs_join_transaction(root, 1);
 
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
@@ -549,21 +568,22 @@ static noinline int submit_compressed_extents(struct inode *inode,
 
                io_tree = &BTRFS_I(inode)->io_tree;
 
+retry:
                /* did the compression code fall back to uncompressed IO? */
                if (!async_extent->pages) {
                        int page_started = 0;
                        unsigned long nr_written = 0;
 
                        lock_extent(io_tree, async_extent->start,
-                                   async_extent->start +
-                                   async_extent->ram_size - 1, GFP_NOFS);
+                                        async_extent->start +
+                                        async_extent->ram_size - 1, GFP_NOFS);
 
                        /* allocate blocks */
-                       cow_file_range(inode, async_cow->locked_page,
-                                      async_extent->start,
-                                      async_extent->start +
-                                      async_extent->ram_size - 1,
-                                      &page_started, &nr_written, 0);
+                       ret = cow_file_range(inode, async_cow->locked_page,
+                                            async_extent->start,
+                                            async_extent->start +
+                                            async_extent->ram_size - 1,
+                                            &page_started, &nr_written, 0);
 
                        /*
                         * if page_started, cow_file_range inserted an
@@ -571,7 +591,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
-                       if (!page_started)
+                       if (!page_started && !ret)
                                extent_write_locked_range(io_tree,
                                                  inode, async_extent->start,
                                                  async_extent->start +
@@ -586,6 +606,30 @@ static noinline int submit_compressed_extents(struct inode *inode,
                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1,
                            GFP_NOFS);
+
+               trans = btrfs_join_transaction(root, 1);
+               ret = btrfs_reserve_extent(trans, root,
+                                          async_extent->compressed_size,
+                                          async_extent->compressed_size,
+                                          0, alloc_hint,
+                                          (u64)-1, &ins, 1);
+               btrfs_end_transaction(trans, root);
+
+               if (ret) {
+                       int i;
+                       for (i = 0; i < async_extent->nr_pages; i++) {
+                               WARN_ON(async_extent->pages[i]->mapping);
+                               page_cache_release(async_extent->pages[i]);
+                       }
+                       kfree(async_extent->pages);
+                       async_extent->nr_pages = 0;
+                       async_extent->pages = NULL;
+                       unlock_extent(io_tree, async_extent->start,
+                                     async_extent->start +
+                                     async_extent->ram_size - 1, GFP_NOFS);
+                       goto retry;
+               }
+
                /*
                 * here we're doing allocation and writeback of the
                 * compressed pages
@@ -594,12 +638,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
                                        async_extent->start +
                                        async_extent->ram_size - 1, 0);
 
-               ret = btrfs_reserve_extent(trans, root,
-                                          async_extent->compressed_size,
-                                          async_extent->compressed_size,
-                                          0, alloc_hint,
-                                          (u64)-1, &ins, 1);
-               BUG_ON(ret);
                em = alloc_extent_map(GFP_NOFS);
                em->start = async_extent->start;
                em->len = async_extent->ram_size;
@@ -631,17 +669,18 @@ static noinline int submit_compressed_extents(struct inode *inode,
                                               BTRFS_ORDERED_COMPRESSED);
                BUG_ON(ret);
 
-               btrfs_end_transaction(trans, root);
-
                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
                extent_clear_unlock_delalloc(inode,
-                                            &BTRFS_I(inode)->io_tree,
-                                            async_extent->start,
-                                            async_extent->start +
-                                            async_extent->ram_size - 1,
-                                            NULL, 1, 1, 0, 1, 1, 0, 0);
+                               &BTRFS_I(inode)->io_tree,
+                               async_extent->start,
+                               async_extent->start +
+                               async_extent->ram_size - 1,
+                               NULL, EXTENT_CLEAR_UNLOCK_PAGE |
+                               EXTENT_CLEAR_UNLOCK |
+                               EXTENT_CLEAR_DELALLOC |
+                               EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
 
                ret = btrfs_submit_compressed_write(inode,
                                    async_extent->start,
@@ -651,16 +690,46 @@ static noinline int submit_compressed_extents(struct inode *inode,
                                    async_extent->nr_pages);
 
                BUG_ON(ret);
-               trans = btrfs_join_transaction(root, 1);
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                cond_resched();
        }
 
-       btrfs_end_transaction(trans, root);
        return 0;
 }
 
+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+                                     u64 num_bytes)
+{
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct extent_map *em;
+       u64 alloc_hint = 0;
+
+       read_lock(&em_tree->lock);
+       em = search_extent_mapping(em_tree, start, num_bytes);
+       if (em) {
+               /*
+                * if block start isn't an actual block number then find the
+                * first block in this inode and use that as a hint.  If that
+                * block is also bogus then just don't worry about it.
+                */
+               if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+                       free_extent_map(em);
+                       em = search_extent_mapping(em_tree, 0, 0);
+                       if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+                               alloc_hint = em->block_start;
+                       if (em)
+                               free_extent_map(em);
+               } else {
+                       alloc_hint = em->block_start;
+                       free_extent_map(em);
+               }
+       }
+       read_unlock(&em_tree->lock);
+
+       return alloc_hint;
+}
+
 /*
  * when extent_io.c finds a delayed allocation range in the file,
  * the call backs end up in this code.  The basic idea is to
@@ -698,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
        trans = btrfs_join_transaction(root, 1);
        BUG_ON(!trans);
        btrfs_set_trans_block_group(trans, inode);
+       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
        actual_end = min_t(u64, isize, end + 1);
 
@@ -712,9 +782,15 @@ static noinline int cow_file_range(struct inode *inode,
                                            start, end, 0, NULL);
                if (ret == 0) {
                        extent_clear_unlock_delalloc(inode,
-                                                    &BTRFS_I(inode)->io_tree,
-                                                    start, end, NULL, 1, 1,
-                                                    1, 1, 1, 1, 0);
+                                    &BTRFS_I(inode)->io_tree,
+                                    start, end, NULL,
+                                    EXTENT_CLEAR_UNLOCK_PAGE |
+                                    EXTENT_CLEAR_UNLOCK |
+                                    EXTENT_CLEAR_DELALLOC |
+                                    EXTENT_CLEAR_DIRTY |
+                                    EXTENT_SET_WRITEBACK |
+                                    EXTENT_END_WRITEBACK);
+
                        *nr_written = *nr_written +
                             (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
                        *page_started = 1;
@@ -726,19 +802,13 @@ static noinline int cow_file_range(struct inode *inode,
        BUG_ON(disk_num_bytes >
               btrfs_super_total_bytes(&root->fs_info->super_copy));
 
-
-       read_lock(&BTRFS_I(inode)->extent_tree.lock);
-       em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
-                                  start, num_bytes);
-       if (em) {
-               alloc_hint = em->block_start;
-               free_extent_map(em);
-       }
-       read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+       alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
        while (disk_num_bytes > 0) {
-               cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
+               unsigned long op;
+
+               cur_alloc_size = disk_num_bytes;
                ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
                                           root->sectorsize, 0, alloc_hint,
                                           (u64)-1, &ins, 1);
@@ -789,10 +859,13 @@ static noinline int cow_file_range(struct inode *inode,
                 * Do set the Private2 bit so we know this page was properly
                 * setup for writepage
                 */
+               op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
+               op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
+                       EXTENT_SET_PRIVATE2;
+
                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                             start, start + ram_size - 1,
-                                            locked_page, unlock, 1,
-                                            1, 0, 0, 0, 1);
+                                            locked_page, op);
                disk_num_bytes -= cur_alloc_size;
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
@@ -864,8 +937,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
        u64 cur_end;
        int limit = 10 * 1024 * 1042;
 
-       clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
-                        EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS);
+       clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
+                        1, 0, NULL, GFP_NOFS);
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
                async_cow->inode = inode;
@@ -1006,6 +1079,7 @@ next_slot:
 
                if (found_key.offset > cur_offset) {
                        extent_end = found_key.offset;
+                       extent_type = 0;
                        goto out_check;
                }
 
@@ -1111,9 +1185,18 @@ out_check:
                                               num_bytes, num_bytes, type);
                BUG_ON(ret);
 
+               if (root->root_key.objectid ==
+                   BTRFS_DATA_RELOC_TREE_OBJECTID) {
+                       ret = btrfs_reloc_clone_csums(inode, cur_offset,
+                                                     num_bytes);
+                       BUG_ON(ret);
+               }
+
                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
-                                       cur_offset, cur_offset + num_bytes - 1,
-                                       locked_page, 1, 1, 1, 0, 0, 0, 1);
+                               cur_offset, cur_offset + num_bytes - 1,
+                               locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
+                               EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
+                               EXTENT_SET_PRIVATE2);
                cur_offset = extent_end;
                if (cur_offset > end)
                        break;
@@ -1150,7 +1233,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
        else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
                ret = run_delalloc_nocow(inode, locked_page, start, end,
                                         page_started, 0, nr_written);
-       else if (!btrfs_test_opt(root, COMPRESS))
+       else if (!btrfs_test_opt(root, COMPRESS) &&
+                !(BTRFS_I(inode)->force_compress))
                ret = cow_file_range(inode, locked_page, start, end,
                                      page_started, nr_written, 1);
        else
@@ -1159,25 +1243,61 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
        return ret;
 }
 
+static int btrfs_split_extent_hook(struct inode *inode,
+                                  struct extent_state *orig, u64 split)
+{
+       /* not delalloc, ignore it */
+       if (!(orig->state & EXTENT_DELALLOC))
+               return 0;
+
+       atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+       return 0;
+}
+
+/*
+ * extent_io.c merge_extent_hook, used to track merged delayed allocation
+ * extents so we can keep track of new extents that are just merged onto old
+ * extents, such as when we are doing sequential writes, so we can properly
+ * account for the metadata space we'll need.
+ */
+static int btrfs_merge_extent_hook(struct inode *inode,
+                                  struct extent_state *new,
+                                  struct extent_state *other)
+{
+       /* not delalloc, ignore it */
+       if (!(other->state & EXTENT_DELALLOC))
+               return 0;
+
+       atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+       return 0;
+}
+
 /*
  * extent_io.c set_bit_hook, used to track delayed allocation
  * bytes in this file, and to maintain the list of inodes that
  * have pending delalloc work to be done.
  */
-static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
-                      unsigned long old, unsigned long bits)
+static int btrfs_set_bit_hook(struct inode *inode,
+                             struct extent_state *state, int *bits)
 {
+
        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
         * but in this case, we are only testeing for the DELALLOC
         * bit, which is only set or cleared with irqs on
         */
-       if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+       if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
-               btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+               u64 len = state->end + 1 - state->start;
+
+               if (*bits & EXTENT_FIRST_DELALLOC)
+                       *bits &= ~EXTENT_FIRST_DELALLOC;
+               else
+                       atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+
                spin_lock(&root->fs_info->delalloc_lock);
-               BTRFS_I(inode)->delalloc_bytes += end - start + 1;
-               root->fs_info->delalloc_bytes += end - start + 1;
+               BTRFS_I(inode)->delalloc_bytes += len;
+               root->fs_info->delalloc_bytes += len;
                if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
                        list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
                                      &root->fs_info->delalloc_inodes);
@@ -1190,33 +1310,33 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 /*
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
-static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
-                        unsigned long old, unsigned long bits)
+static int btrfs_clear_bit_hook(struct inode *inode,
+                               struct extent_state *state, int *bits)
 {
        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
         * but in this case, we are only testeing for the DELALLOC
         * bit, which is only set or cleared with irqs on
         */
-       if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+       if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
+               u64 len = state->end + 1 - state->start;
+
+               if (*bits & EXTENT_FIRST_DELALLOC)
+                       *bits &= ~EXTENT_FIRST_DELALLOC;
+               else if (!(*bits & EXTENT_DO_ACCOUNTING))
+                       atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+
+               if (*bits & EXTENT_DO_ACCOUNTING)
+                       btrfs_delalloc_release_metadata(inode, len);
+
+               if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+                       btrfs_free_reserved_data_space(inode, len);
 
                spin_lock(&root->fs_info->delalloc_lock);
-               if (end - start + 1 > root->fs_info->delalloc_bytes) {
-                       printk(KERN_INFO "btrfs warning: delalloc account "
-                              "%llu %llu\n",
-                              (unsigned long long)end - start + 1,
-                              (unsigned long long)
-                              root->fs_info->delalloc_bytes);
-                       btrfs_delalloc_free_space(root, inode, (u64)-1);
-                       root->fs_info->delalloc_bytes = 0;
-                       BTRFS_I(inode)->delalloc_bytes = 0;
-               } else {
-                       btrfs_delalloc_free_space(root, inode,
-                                                 end - start + 1);
-                       root->fs_info->delalloc_bytes -= end - start + 1;
-                       BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
-               }
+               root->fs_info->delalloc_bytes -= len;
+               BTRFS_I(inode)->delalloc_bytes -= len;
+
                if (BTRFS_I(inode)->delalloc_bytes == 0 &&
                    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
                        list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@ -1265,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  */
 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
                                    struct bio *bio, int mirror_num,
-                                   unsigned long bio_flags)
+                                   unsigned long bio_flags,
+                                   u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
@@ -1284,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
  * are inserted into the btree
  */
 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-                         int mirror_num, unsigned long bio_flags)
+                         int mirror_num, unsigned long bio_flags,
+                         u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1295,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
  * on write, or reading the csums from the tree before a read
  */
 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                         int mirror_num, unsigned long bio_flags)
+                         int mirror_num, unsigned long bio_flags,
+                         u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
@@ -1320,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                /* we're doing a write, do the async checksumming */
                return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
                                   inode, rw, bio, mirror_num,
-                                  bio_flags, __btrfs_submit_bio_start,
+                                  bio_flags, bio_offset,
+                                  __btrfs_submit_bio_start,
                                   __btrfs_submit_bio_done);
        }
 
@@ -1347,12 +1471,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
        return 0;
 }
 
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+                             struct extent_state **cached_state)
 {
        if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
                WARN_ON(1);
        return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
-                                  GFP_NOFS);
+                                  cached_state, GFP_NOFS);
 }
 
 /* see btrfs_writepage_start_hook for details on why this is required */
@@ -1365,6 +1490,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 {
        struct btrfs_writepage_fixup *fixup;
        struct btrfs_ordered_extent *ordered;
+       struct extent_state *cached_state = NULL;
        struct page *page;
        struct inode *inode;
        u64 page_start;
@@ -1383,7 +1509,8 @@ again:
        page_start = page_offset(page);
        page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
 
-       lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+       lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+                        &cached_state, GFP_NOFS);
 
        /* already ordered? We're done */
        if (PagePrivate2(page))
@@ -1391,17 +1518,19 @@ again:
 
        ordered = btrfs_lookup_ordered_extent(inode, page_start);
        if (ordered) {
-               unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
-                             page_end, GFP_NOFS);
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
+                                    page_end, &cached_state, GFP_NOFS);
                unlock_page(page);
                btrfs_start_ordered_extent(inode, ordered, 1);
                goto again;
        }
 
-       btrfs_set_extent_delalloc(inode, page_start, page_end);
+       BUG();
+       btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
        ClearPageChecked(page);
 out:
-       unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+       unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+                            &cached_state, GFP_NOFS);
 out_page:
        unlock_page(page);
        page_cache_release(page);
@@ -1447,7 +1576,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
                                       struct inode *inode, u64 file_pos,
                                       u64 disk_bytenr, u64 disk_num_bytes,
                                       u64 num_bytes, u64 ram_bytes,
-                                      u64 locked_end,
                                       u8 compression, u8 encryption,
                                       u16 other_encoding, int extent_type)
 {
@@ -1473,9 +1601,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
         * the caller is expected to unpin it and allow it to be merged
         * with the others.
         */
-       ret = btrfs_drop_extents(trans, root, inode, file_pos,
-                                file_pos + num_bytes, locked_end,
-                                file_pos, &hint, 0);
+       ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
+                                &hint, 0);
        BUG_ON(ret);
 
        ins.objectid = inode->i_ino;
@@ -1522,24 +1649,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
  * before we start the transaction.  It limits the amount of btree
  * reads required while inside the transaction.
  */
-static noinline void reada_csum(struct btrfs_root *root,
-                               struct btrfs_path *path,
-                               struct btrfs_ordered_extent *ordered_extent)
-{
-       struct btrfs_ordered_sum *sum;
-       u64 bytenr;
-
-       sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
-                        list);
-       bytenr = sum->sums[0].bytenr;
-
-       /*
-        * we don't care about the results, the point of this search is
-        * just to get the btree leaves into ram
-        */
-       btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
-}
-
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
@@ -1547,57 +1656,45 @@ static noinline void reada_csum(struct btrfs_root *root,
 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_trans_handle *trans;
+       struct btrfs_trans_handle *trans = NULL;
        struct btrfs_ordered_extent *ordered_extent = NULL;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       struct btrfs_path *path;
+       struct extent_state *cached_state = NULL;
        int compressed = 0;
        int ret;
 
-       ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+       ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+                                            end - start + 1);
        if (!ret)
                return 0;
+       BUG_ON(!ordered_extent);
 
-       /*
-        * before we join the transaction, try to do some of our IO.
-        * This will limit the amount of IO that we have to do with
-        * the transaction running.  We're unlikely to need to do any
-        * IO if the file extents are new, the disk_i_size checks
-        * covers the most common case.
-        */
-       if (start < BTRFS_I(inode)->disk_i_size) {
-               path = btrfs_alloc_path();
-               if (path) {
-                       ret = btrfs_lookup_file_extent(NULL, root, path,
-                                                      inode->i_ino,
-                                                      start, 0);
-                       ordered_extent = btrfs_lookup_ordered_extent(inode,
-                                                                    start);
-                       if (!list_empty(&ordered_extent->list)) {
-                               btrfs_release_path(root, path);
-                               reada_csum(root, path, ordered_extent);
-                       }
-                       btrfs_free_path(path);
+       if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+               BUG_ON(!list_empty(&ordered_extent->list));
+               ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+               if (!ret) {
+                       trans = btrfs_join_transaction(root, 1);
+                       btrfs_set_trans_block_group(trans, inode);
+                       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+                       ret = btrfs_update_inode(trans, root, inode);
+                       BUG_ON(ret);
                }
+               goto out;
        }
 
-       trans = btrfs_join_transaction(root, 1);
-
-       if (!ordered_extent)
-               ordered_extent = btrfs_lookup_ordered_extent(inode, start);
-       BUG_ON(!ordered_extent);
-       if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
-               goto nocow;
+       lock_extent_bits(io_tree, ordered_extent->file_offset,
+                        ordered_extent->file_offset + ordered_extent->len - 1,
+                        0, &cached_state, GFP_NOFS);
 
-       lock_extent(io_tree, ordered_extent->file_offset,
-                   ordered_extent->file_offset + ordered_extent->len - 1,
-                   GFP_NOFS);
+       trans = btrfs_join_transaction(root, 1);
+       btrfs_set_trans_block_group(trans, inode);
+       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
                compressed = 1;
        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
                BUG_ON(compressed);
-               ret = btrfs_mark_extent_written(trans, root, inode,
+               ret = btrfs_mark_extent_written(trans, inode,
                                                ordered_extent->file_offset,
                                                ordered_extent->file_offset +
                                                ordered_extent->len);
@@ -1609,8 +1706,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                                                ordered_extent->disk_len,
                                                ordered_extent->len,
                                                ordered_extent->len,
-                                               ordered_extent->file_offset +
-                                               ordered_extent->len,
                                                compressed, 0, 0,
                                                BTRFS_FILE_EXTENT_REG);
                unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
@@ -1618,25 +1713,25 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                                   ordered_extent->len);
                BUG_ON(ret);
        }
-       unlock_extent(io_tree, ordered_extent->file_offset,
-                   ordered_extent->file_offset + ordered_extent->len - 1,
-                   GFP_NOFS);
-nocow:
+       unlock_extent_cached(io_tree, ordered_extent->file_offset,
+                            ordered_extent->file_offset +
+                            ordered_extent->len - 1, &cached_state, GFP_NOFS);
+
        add_pending_csums(trans, inode, ordered_extent->file_offset,
                          &ordered_extent->list);
 
-       mutex_lock(&BTRFS_I(inode)->extent_mutex);
-       btrfs_ordered_update_i_size(inode, ordered_extent);
-       btrfs_update_inode(trans, root, inode);
-       btrfs_remove_ordered_extent(inode, ordered_extent);
-       mutex_unlock(&BTRFS_I(inode)->extent_mutex);
-
+       btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+       ret = btrfs_update_inode(trans, root, inode);
+       BUG_ON(ret);
+out:
+       btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+       if (trans)
+               btrfs_end_transaction(trans, root);
        /* once for us */
        btrfs_put_ordered_extent(ordered_extent);
        /* once for the tree */
        btrfs_put_ordered_extent(ordered_extent);
 
-       btrfs_end_transaction(trans, root);
        return 0;
 }
 
@@ -1753,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 
        BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
                                                      failrec->last_mirror,
-                                                     failrec->bio_flags);
+                                                     failrec->bio_flags, 0);
        return 0;
 }
 
@@ -1859,33 +1954,245 @@ zeroit:
        return -EIO;
 }
 
+struct delayed_iput {
+       struct list_head list;
+       struct inode *inode;
+};
+
+void btrfs_add_delayed_iput(struct inode *inode)
+{
+       struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+       struct delayed_iput *delayed;
+
+       if (atomic_add_unless(&inode->i_count, -1, 1))
+               return;
+
+       delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
+       delayed->inode = inode;
+
+       spin_lock(&fs_info->delayed_iput_lock);
+       list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+       spin_unlock(&fs_info->delayed_iput_lock);
+}
+
+void btrfs_run_delayed_iputs(struct btrfs_root *root)
+{
+       LIST_HEAD(list);
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct delayed_iput *delayed;
+       int empty;
+
+       spin_lock(&fs_info->delayed_iput_lock);
+       empty = list_empty(&fs_info->delayed_iputs);
+       spin_unlock(&fs_info->delayed_iput_lock);
+       if (empty)
+               return;
+
+       down_read(&root->fs_info->cleanup_work_sem);
+       spin_lock(&fs_info->delayed_iput_lock);
+       list_splice_init(&fs_info->delayed_iputs, &list);
+       spin_unlock(&fs_info->delayed_iput_lock);
+
+       while (!list_empty(&list)) {
+               delayed = list_entry(list.next, struct delayed_iput, list);
+               list_del(&delayed->list);
+               iput(delayed->inode);
+               kfree(delayed);
+       }
+       up_read(&root->fs_info->cleanup_work_sem);
+}
+
+/*
+ * calculate extra metadata reservation when snapshotting a subvolume
+ * contains orphan files.
+ */
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+                               struct btrfs_pending_snapshot *pending,
+                               u64 *bytes_to_reserve)
+{
+       struct btrfs_root *root;
+       struct btrfs_block_rsv *block_rsv;
+       u64 num_bytes;
+       int index;
+
+       root = pending->root;
+       if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+               return;
+
+       block_rsv = root->orphan_block_rsv;
+
+       /* orphan block reservation for the snapshot */
+       num_bytes = block_rsv->size;
+
+       /*
+        * after the snapshot is created, COWing tree blocks may use more
+        * space than it frees. So we should make sure there is enough
+        * reserved space.
+        */
+       index = trans->transid & 0x1;
+       if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+               num_bytes += block_rsv->size -
+                            (block_rsv->reserved + block_rsv->freed[index]);
+       }
+
+       *bytes_to_reserve += num_bytes;
+}
+
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+                               struct btrfs_pending_snapshot *pending)
+{
+       struct btrfs_root *root = pending->root;
+       struct btrfs_root *snap = pending->snap;
+       struct btrfs_block_rsv *block_rsv;
+       u64 num_bytes;
+       int index;
+       int ret;
+
+       if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+               return;
+
+       /* refill source subvolume's orphan block reservation */
+       block_rsv = root->orphan_block_rsv;
+       index = trans->transid & 0x1;
+       if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+               num_bytes = block_rsv->size -
+                           (block_rsv->reserved + block_rsv->freed[index]);
+               ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+                                             root->orphan_block_rsv,
+                                             num_bytes);
+               BUG_ON(ret);
+       }
+
+       /* setup orphan block reservation for the snapshot */
+       block_rsv = btrfs_alloc_block_rsv(snap);
+       BUG_ON(!block_rsv);
+
+       btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
+       snap->orphan_block_rsv = block_rsv;
+
+       num_bytes = root->orphan_block_rsv->size;
+       ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+                                     block_rsv, num_bytes);
+       BUG_ON(ret);
+
+#if 0
+       /* insert orphan item for the snapshot */
+       WARN_ON(!root->orphan_item_inserted);
+       ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+                                      snap->root_key.objectid);
+       BUG_ON(ret);
+       snap->orphan_item_inserted = 1;
+#endif
+}
+
+enum btrfs_orphan_cleanup_state {
+       ORPHAN_CLEANUP_STARTED  = 1,
+       ORPHAN_CLEANUP_DONE     = 2,
+};
+
+/*
+ * This is called in transaction commmit time. If there are no orphan
+ * files in the subvolume, it removes orphan item and frees block_rsv
+ * structure.
+ */
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root)
+{
+       int ret;
+
+       if (!list_empty(&root->orphan_list) ||
+           root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
+               return;
+
+       if (root->orphan_item_inserted &&
+           btrfs_root_refs(&root->root_item) > 0) {
+               ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+                                           root->root_key.objectid);
+               BUG_ON(ret);
+               root->orphan_item_inserted = 0;
+       }
+
+       if (root->orphan_block_rsv) {
+               WARN_ON(root->orphan_block_rsv->size > 0);
+               btrfs_free_block_rsv(root, root->orphan_block_rsv);
+               root->orphan_block_rsv = NULL;
+       }
+}
+
 /*
  * This creates an orphan entry for the given inode in case something goes
  * wrong in the middle of an unlink/truncate.
+ *
+ * NOTE: caller of this function should reserve 5 units of metadata for
+ *      this function.
  */
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       int ret = 0;
+       struct btrfs_block_rsv *block_rsv = NULL;
+       int reserve = 0;
+       int insert = 0;
+       int ret;
 
-       spin_lock(&root->list_lock);
+       if (!root->orphan_block_rsv) {
+               block_rsv = btrfs_alloc_block_rsv(root);
+               BUG_ON(!block_rsv);
+       }
 
-       /* already on the orphan list, we're good */
-       if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-               spin_unlock(&root->list_lock);
-               return 0;
+       spin_lock(&root->orphan_lock);
+       if (!root->orphan_block_rsv) {
+               root->orphan_block_rsv = block_rsv;
+       } else if (block_rsv) {
+               btrfs_free_block_rsv(root, block_rsv);
+               block_rsv = NULL;
        }
 
-       list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+       if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+               list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+#if 0
+               /*
+                * For proper ENOSPC handling, we should do orphan
+                * cleanup when mounting. But this introduces backward
+                * compatibility issue.
+                */
+               if (!xchg(&root->orphan_item_inserted, 1))
+                       insert = 2;
+               else
+                       insert = 1;
+#endif
+               insert = 1;
+       } else {
+               WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
+       }
 
-       spin_unlock(&root->list_lock);
+       if (!BTRFS_I(inode)->orphan_meta_reserved) {
+               BTRFS_I(inode)->orphan_meta_reserved = 1;
+               reserve = 1;
+       }
+       spin_unlock(&root->orphan_lock);
 
-       /*
-        * insert an orphan item to track this unlinked/truncated file
-        */
-       ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+       if (block_rsv)
+               btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
 
-       return ret;
+       /* grab metadata reservation from transaction handle */
+       if (reserve) {
+               ret = btrfs_orphan_reserve_metadata(trans, inode);
+               BUG_ON(ret);
+       }
+
+       /* insert an orphan item to track this unlinked/truncated file */
+       if (insert >= 1) {
+               ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+               BUG_ON(ret);
+       }
+
+       /* insert an orphan item to track subvolume contains orphan files */
+       if (insert >= 2) {
+               ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+                                              root->root_key.objectid);
+               BUG_ON(ret);
+       }
+       return 0;
 }
 
 /*
@@ -1895,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       int delete_item = 0;
+       int release_rsv = 0;
        int ret = 0;
 
-       spin_lock(&root->list_lock);
-
-       if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-               spin_unlock(&root->list_lock);
-               return 0;
+       spin_lock(&root->orphan_lock);
+       if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+               list_del_init(&BTRFS_I(inode)->i_orphan);
+               delete_item = 1;
        }
 
-       list_del_init(&BTRFS_I(inode)->i_orphan);
-       if (!trans) {
-               spin_unlock(&root->list_lock);
-               return 0;
+       if (BTRFS_I(inode)->orphan_meta_reserved) {
+               BTRFS_I(inode)->orphan_meta_reserved = 0;
+               release_rsv = 1;
        }
+       spin_unlock(&root->orphan_lock);
 
-       spin_unlock(&root->list_lock);
+       if (trans && delete_item) {
+               ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+               BUG_ON(ret);
+       }
 
-       ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+       if (release_rsv)
+               btrfs_orphan_release_metadata(inode);
 
-       return ret;
+       return 0;
 }
 
 /*
@@ -1931,16 +2243,17 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
        struct inode *inode;
        int ret = 0, nr_unlink = 0, nr_truncate = 0;
 
-       path = btrfs_alloc_path();
-       if (!path)
+       if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
                return;
+
+       path = btrfs_alloc_path();
+       BUG_ON(!path);
        path->reada = -1;
 
        key.objectid = BTRFS_ORPHAN_OBJECTID;
        btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
        key.offset = (u64)-1;
 
-
        while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0) {
@@ -1982,17 +2295,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
                found_key.objectid = found_key.offset;
                found_key.type = BTRFS_INODE_ITEM_KEY;
                found_key.offset = 0;
-               inode = btrfs_iget(root->fs_info->sb, &found_key, root);
-               if (IS_ERR(inode))
-                       break;
+               inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
+               BUG_ON(IS_ERR(inode));
 
                /*
                 * add this inode to the orphan list so btrfs_orphan_del does
                 * the proper thing when we hit it
                 */
-               spin_lock(&root->list_lock);
+               spin_lock(&root->orphan_lock);
                list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-               spin_unlock(&root->list_lock);
+               spin_unlock(&root->orphan_lock);
 
                /*
                 * if this is a bad inode, means we actually succeeded in
@@ -2001,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
                 * do a destroy_inode
                 */
                if (is_bad_inode(inode)) {
-                       trans = btrfs_start_transaction(root, 1);
+                       trans = btrfs_start_transaction(root, 0);
                        btrfs_orphan_del(trans, inode);
                        btrfs_end_transaction(trans, root);
                        iput(inode);
@@ -2019,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
                /* this will do delete_inode and everything for us */
                iput(inode);
        }
+       btrfs_free_path(path);
 
-       if (nr_unlink)
-               printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
-       if (nr_truncate)
-               printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
+       root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
 
-       btrfs_free_path(path);
+       if (root->orphan_block_rsv)
+               btrfs_block_rsv_release(root, root->orphan_block_rsv,
+                                       (u64)-1);
+
+       if (root->orphan_block_rsv || root->orphan_item_inserted) {
+               trans = btrfs_join_transaction(root, 1);
+               btrfs_end_transaction(trans, root);
+       }
+
+       if (nr_unlink)
+               printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
+       if (nr_truncate)
+               printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
 }
 
 /*
@@ -2344,17 +2666,201 @@ out:
        return ret;
 }
 
+/* helper to check if there is any shared block in the path */
+static int check_path_shared(struct btrfs_root *root,
+                            struct btrfs_path *path)
+{
+       struct extent_buffer *eb;
+       int level;
+       int ret;
+       u64 refs;
+
+       for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+               if (!path->nodes[level])
+                       break;
+               eb = path->nodes[level];
+               if (!btrfs_block_can_be_shared(root, eb))
+                       continue;
+               ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
+                                              &refs, NULL);
+               if (refs > 1)
+                       return 1;
+       }
+       return 0;
+}
+
+/*
+ * helper to start transaction for unlink and rmdir.
+ *
+ * unlink and rmdir are special in btrfs, they do not always free space.
+ * so in enospc case, we should make sure they will free space before
+ * allowing them to use the global metadata reservation.
+ */
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
+                                                      struct dentry *dentry)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(dir)->root;
+       struct btrfs_path *path;
+       struct btrfs_inode_ref *ref;
+       struct btrfs_dir_item *di;
+       struct inode *inode = dentry->d_inode;
+       u64 index;
+       int check_link = 1;
+       int err = -ENOSPC;
+       int ret;
+
+       trans = btrfs_start_transaction(root, 10);
+       if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+               return trans;
+
+       if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+               return ERR_PTR(-ENOSPC);
+
+       /* check if there is someone else holds reference */
+       if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
+               return ERR_PTR(-ENOSPC);
+
+       if (atomic_read(&inode->i_count) > 2)
+               return ERR_PTR(-ENOSPC);
+
+       if (xchg(&root->fs_info->enospc_unlink, 1))
+               return ERR_PTR(-ENOSPC);
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               root->fs_info->enospc_unlink = 0;
+               return ERR_PTR(-ENOMEM);
+       }
+
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               root->fs_info->enospc_unlink = 0;
+               return trans;
+       }
+
+       path->skip_locking = 1;
+       path->search_commit_root = 1;
+
+       ret = btrfs_lookup_inode(trans, root, path,
+                               &BTRFS_I(dir)->location, 0);
+       if (ret < 0) {
+               err = ret;
+               goto out;
+       }
+       if (ret == 0) {
+               if (check_path_shared(root, path))
+                       goto out;
+       } else {
+               check_link = 0;
+       }
+       btrfs_release_path(root, path);
+
+       ret = btrfs_lookup_inode(trans, root, path,
+                               &BTRFS_I(inode)->location, 0);
+       if (ret < 0) {
+               err = ret;
+               goto out;
+       }
+       if (ret == 0) {
+               if (check_path_shared(root, path))
+                       goto out;
+       } else {
+               check_link = 0;
+       }
+       btrfs_release_path(root, path);
+
+       if (ret == 0 && S_ISREG(inode->i_mode)) {
+               ret = btrfs_lookup_file_extent(trans, root, path,
+                                              inode->i_ino, (u64)-1, 0);
+               if (ret < 0) {
+                       err = ret;
+                       goto out;
+               }
+               BUG_ON(ret == 0);
+               if (check_path_shared(root, path))
+                       goto out;
+               btrfs_release_path(root, path);
+       }
+
+       if (!check_link) {
+               err = 0;
+               goto out;
+       }
+
+       di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+                               dentry->d_name.name, dentry->d_name.len, 0);
+       if (IS_ERR(di)) {
+               err = PTR_ERR(di);
+               goto out;
+       }
+       if (di) {
+               if (check_path_shared(root, path))
+                       goto out;
+       } else {
+               err = 0;
+               goto out;
+       }
+       btrfs_release_path(root, path);
+
+       ref = btrfs_lookup_inode_ref(trans, root, path,
+                               dentry->d_name.name, dentry->d_name.len,
+                               inode->i_ino, dir->i_ino, 0);
+       if (IS_ERR(ref)) {
+               err = PTR_ERR(ref);
+               goto out;
+       }
+       BUG_ON(!ref);
+       if (check_path_shared(root, path))
+               goto out;
+       index = btrfs_inode_ref_index(path->nodes[0], ref);
+       btrfs_release_path(root, path);
+
+       di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
+                               dentry->d_name.name, dentry->d_name.len, 0);
+       if (IS_ERR(di)) {
+               err = PTR_ERR(di);
+               goto out;
+       }
+       BUG_ON(ret == -ENOENT);
+       if (check_path_shared(root, path))
+               goto out;
+
+       err = 0;
+out:
+       btrfs_free_path(path);
+       if (err) {
+               btrfs_end_transaction(trans, root);
+               root->fs_info->enospc_unlink = 0;
+               return ERR_PTR(err);
+       }
+
+       trans->block_rsv = &root->fs_info->global_block_rsv;
+       return trans;
+}
+
+static void __unlink_end_trans(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root)
+{
+       if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+               BUG_ON(!root->fs_info->enospc_unlink);
+               root->fs_info->enospc_unlink = 0;
+       }
+       btrfs_end_transaction_throttle(trans, root);
+}
+
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 {
-       struct btrfs_root *root;
+       struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_trans_handle *trans;
        struct inode *inode = dentry->d_inode;
        int ret;
        unsigned long nr = 0;
 
-       root = BTRFS_I(dir)->root;
-
-       trans = btrfs_start_transaction(root, 1);
+       trans = __unlink_start_trans(dir, dentry);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
 
        btrfs_set_trans_block_group(trans, dir);
 
@@ -2362,13 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
        ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
                                 dentry->d_name.name, dentry->d_name.len);
+       BUG_ON(ret);
 
-       if (inode->i_nlink == 0)
+       if (inode->i_nlink == 0) {
                ret = btrfs_orphan_add(trans, inode);
+               BUG_ON(ret);
+       }
 
        nr = trans->blocks_used;
-
-       btrfs_end_transaction_throttle(trans, root);
+       __unlink_end_trans(trans, root);
        btrfs_btree_balance_dirty(root, nr);
        return ret;
 }
@@ -2440,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
        int err = 0;
-       int ret;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_trans_handle *trans;
        unsigned long nr = 0;
@@ -2449,7 +2956,10 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
            inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
                return -ENOTEMPTY;
 
-       trans = btrfs_start_transaction(root, 1);
+       trans = __unlink_start_trans(dir, dentry);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
+
        btrfs_set_trans_block_group(trans, dir);
 
        if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@ -2471,11 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
                btrfs_i_size_write(inode, 0);
 out:
        nr = trans->blocks_used;
-       ret = btrfs_end_transaction_throttle(trans, root);
+       __unlink_end_trans(trans, root);
        btrfs_btree_balance_dirty(root, nr);
 
-       if (ret && !err)
-               err = ret;
        return err;
 }
 
@@ -2662,37 +3170,40 @@ out:
  * min_type is the minimum key type to truncate down to.  If set to 0, this
  * will kill all the items on this inode, including the INODE_ITEM_KEY.
  */
-noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
-                                       struct btrfs_root *root,
-                                       struct inode *inode,
-                                       u64 new_size, u32 min_type)
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root,
+                              struct inode *inode,
+                              u64 new_size, u32 min_type)
 {
-       int ret;
        struct btrfs_path *path;
-       struct btrfs_key key;
-       struct btrfs_key found_key;
-       u32 found_type = (u8)-1;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
+       struct btrfs_key found_key;
        u64 extent_start = 0;
        u64 extent_num_bytes = 0;
        u64 extent_offset = 0;
        u64 item_end = 0;
+       u64 mask = root->sectorsize - 1;
+       u32 found_type = (u8)-1;
        int found_extent;
        int del_item;
        int pending_del_nr = 0;
        int pending_del_slot = 0;
        int extent_type = -1;
        int encoding;
-       u64 mask = root->sectorsize - 1;
+       int ret;
+       int err = 0;
+
+       BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
        if (root->ref_cows)
                btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+
        path = btrfs_alloc_path();
        BUG_ON(!path);
        path->reada = -1;
 
-       /* FIXME, add redo link to tree so we don't leak on crash */
        key.objectid = inode->i_ino;
        key.offset = (u64)-1;
        key.type = (u8)-1;
@@ -2700,17 +3211,17 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 search_again:
        path->leave_spinning = 1;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-       if (ret < 0)
-               goto error;
+       if (ret < 0) {
+               err = ret;
+               goto out;
+       }
 
        if (ret > 0) {
                /* there are no items in the tree for us to truncate, we're
                 * done
                 */
-               if (path->slots[0] == 0) {
-                       ret = 0;
-                       goto error;
-               }
+               if (path->slots[0] == 0)
+                       goto out;
                path->slots[0]--;
        }
 
@@ -2745,28 +3256,17 @@ search_again:
                        }
                        item_end--;
                }
-               if (item_end < new_size) {
-                       if (found_type == BTRFS_DIR_ITEM_KEY)
-                               found_type = BTRFS_INODE_ITEM_KEY;
-                       else if (found_type == BTRFS_EXTENT_ITEM_KEY)
-                               found_type = BTRFS_EXTENT_DATA_KEY;
-                       else if (found_type == BTRFS_EXTENT_DATA_KEY)
-                               found_type = BTRFS_XATTR_ITEM_KEY;
-                       else if (found_type == BTRFS_XATTR_ITEM_KEY)
-                               found_type = BTRFS_INODE_REF_KEY;
-                       else if (found_type)
-                               found_type--;
-                       else
+               if (found_type > min_type) {
+                       del_item = 1;
+               } else {
+                       if (item_end < new_size)
                                break;
-                       btrfs_set_key_type(&key, found_type);
-                       goto next;
+                       if (found_key.offset >= new_size)
+                               del_item = 1;
+                       else
+                               del_item = 0;
                }
-               if (found_key.offset >= new_size)
-                       del_item = 1;
-               else
-                       del_item = 0;
                found_extent = 0;
-
                /* FIXME, shrink the extent if the ref count is only 1 */
                if (found_type != BTRFS_EXTENT_DATA_KEY)
                        goto delete;
@@ -2853,42 +3353,37 @@ delete:
                                                inode->i_ino, extent_offset);
                        BUG_ON(ret);
                }
-next:
-               if (path->slots[0] == 0) {
-                       if (pending_del_nr)
-                               goto del_pending;
-                       btrfs_release_path(root, path);
-                       if (found_type == BTRFS_INODE_ITEM_KEY)
-                               break;
-                       goto search_again;
-               }
 
-               path->slots[0]--;
-               if (pending_del_nr &&
-                   path->slots[0] + 1 != pending_del_slot) {
-                       struct btrfs_key debug;
-del_pending:
-                       btrfs_item_key_to_cpu(path->nodes[0], &debug,
-                                             pending_del_slot);
-                       ret = btrfs_del_items(trans, root, path,
-                                             pending_del_slot,
-                                             pending_del_nr);
-                       BUG_ON(ret);
-                       pending_del_nr = 0;
+               if (found_type == BTRFS_INODE_ITEM_KEY)
+                       break;
+
+               if (path->slots[0] == 0 ||
+                   path->slots[0] != pending_del_slot) {
+                       if (root->ref_cows) {
+                               err = -EAGAIN;
+                               goto out;
+                       }
+                       if (pending_del_nr) {
+                               ret = btrfs_del_items(trans, root, path,
+                                               pending_del_slot,
+                                               pending_del_nr);
+                               BUG_ON(ret);
+                               pending_del_nr = 0;
+                       }
                        btrfs_release_path(root, path);
-                       if (found_type == BTRFS_INODE_ITEM_KEY)
-                               break;
                        goto search_again;
+               } else {
+                       path->slots[0]--;
                }
        }
-       ret = 0;
-error:
+out:
        if (pending_del_nr) {
                ret = btrfs_del_items(trans, root, path, pending_del_slot,
                                      pending_del_nr);
+               BUG_ON(ret);
        }
        btrfs_free_path(path);
-       return ret;
+       return err;
 }
 
 /*
@@ -2901,6 +3396,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_ordered_extent *ordered;
+       struct extent_state *cached_state = NULL;
        char *kaddr;
        u32 blocksize = root->sectorsize;
        pgoff_t index = from >> PAGE_CACHE_SHIFT;
@@ -2912,12 +3408,17 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 
        if ((offset & (blocksize - 1)) == 0)
                goto out;
+       ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+       if (ret)
+               goto out;
 
        ret = -ENOMEM;
 again:
        page = grab_cache_page(mapping, index);
-       if (!page)
+       if (!page) {
+               btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
                goto out;
+       }
 
        page_start = page_offset(page);
        page_end = page_start + PAGE_CACHE_SIZE - 1;
@@ -2937,12 +3438,14 @@ again:
        }
        wait_on_page_writeback(page);
 
-       lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+       lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+                        GFP_NOFS);
        set_page_extent_mapped(page);
 
        ordered = btrfs_lookup_ordered_extent(inode, page_start);
        if (ordered) {
-               unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+               unlock_extent_cached(io_tree, page_start, page_end,
+                                    &cached_state, GFP_NOFS);
                unlock_page(page);
                page_cache_release(page);
                btrfs_start_ordered_extent(inode, ordered, 1);
@@ -2950,7 +3453,18 @@ again:
                goto again;
        }
 
-       btrfs_set_extent_delalloc(inode, page_start, page_end);
+       clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+                         EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+                         0, 0, &cached_state, GFP_NOFS);
+
+       ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+                                       &cached_state);
+       if (ret) {
+               unlock_extent_cached(io_tree, page_start, page_end,
+                                    &cached_state, GFP_NOFS);
+               goto out_unlock;
+       }
+
        ret = 0;
        if (offset != PAGE_CACHE_SIZE) {
                kaddr = kmap(page);
@@ -2960,9 +3474,12 @@ again:
        }
        ClearPageChecked(page);
        set_page_dirty(page);
-       unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+       unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
+                            GFP_NOFS);
 
 out_unlock:
+       if (ret)
+               btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
        unlock_page(page);
        page_cache_release(page);
 out:
@@ -2974,39 +3491,33 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       struct extent_map *em;
+       struct extent_map *em = NULL;
+       struct extent_state *cached_state = NULL;
        u64 mask = root->sectorsize - 1;
        u64 hole_start = (inode->i_size + mask) & ~mask;
        u64 block_end = (size + mask) & ~mask;
        u64 last_byte;
        u64 cur_offset;
        u64 hole_size;
-       int err;
+       int err = 0;
 
        if (size <= hole_start)
                return 0;
 
-       err = btrfs_check_metadata_free_space(root);
-       if (err)
-               return err;
-
-       btrfs_truncate_page(inode->i_mapping, inode->i_size);
-
        while (1) {
                struct btrfs_ordered_extent *ordered;
                btrfs_wait_ordered_range(inode, hole_start,
                                         block_end - hole_start);
-               lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+               lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+                                &cached_state, GFP_NOFS);
                ordered = btrfs_lookup_ordered_extent(inode, hole_start);
                if (!ordered)
                        break;
-               unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+               unlock_extent_cached(io_tree, hole_start, block_end - 1,
+                                    &cached_state, GFP_NOFS);
                btrfs_put_ordered_extent(ordered);
        }
 
-       trans = btrfs_start_transaction(root, 1);
-       btrfs_set_trans_block_group(trans, inode);
-
        cur_offset = hole_start;
        while (1) {
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
@@ -3014,34 +3525,123 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
                BUG_ON(IS_ERR(em) || !em);
                last_byte = min(extent_map_end(em), block_end);
                last_byte = (last_byte + mask) & ~mask;
-               if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+               if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
                        u64 hint_byte = 0;
                        hole_size = last_byte - cur_offset;
-                       err = btrfs_drop_extents(trans, root, inode,
-                                                cur_offset,
-                                                cur_offset + hole_size,
-                                                block_end,
-                                                cur_offset, &hint_byte, 1);
-                       if (err)
+
+                       trans = btrfs_start_transaction(root, 2);
+                       if (IS_ERR(trans)) {
+                               err = PTR_ERR(trans);
                                break;
+                       }
+                       btrfs_set_trans_block_group(trans, inode);
+
+                       err = btrfs_drop_extents(trans, inode, cur_offset,
+                                                cur_offset + hole_size,
+                                                &hint_byte, 1);
+                       BUG_ON(err);
+
                        err = btrfs_insert_file_extent(trans, root,
                                        inode->i_ino, cur_offset, 0,
                                        0, hole_size, 0, hole_size,
                                        0, 0, 0);
+                       BUG_ON(err);
+
                        btrfs_drop_extent_cache(inode, hole_start,
                                        last_byte - 1, 0);
+
+                       btrfs_end_transaction(trans, root);
                }
                free_extent_map(em);
+               em = NULL;
                cur_offset = last_byte;
-               if (err || cur_offset >= block_end)
+               if (cur_offset >= block_end)
                        break;
        }
 
-       btrfs_end_transaction(trans, root);
-       unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+       free_extent_map(em);
+       unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
+                            GFP_NOFS);
        return err;
 }
 
+static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_trans_handle *trans;
+       unsigned long nr;
+       int ret;
+
+       if (attr->ia_size == inode->i_size)
+               return 0;
+
+       if (attr->ia_size > inode->i_size) {
+               unsigned long limit;
+               limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+               if (attr->ia_size > inode->i_sb->s_maxbytes)
+                       return -EFBIG;
+               if (limit != RLIM_INFINITY && attr->ia_size > limit) {
+                       send_sig(SIGXFSZ, current, 0);
+                       return -EFBIG;
+               }
+       }
+
+       trans = btrfs_start_transaction(root, 5);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
+
+       btrfs_set_trans_block_group(trans, inode);
+
+       ret = btrfs_orphan_add(trans, inode);
+       BUG_ON(ret);
+
+       nr = trans->blocks_used;
+       btrfs_end_transaction(trans, root);
+       btrfs_btree_balance_dirty(root, nr);
+
+       if (attr->ia_size > inode->i_size) {
+               ret = btrfs_cont_expand(inode, attr->ia_size);
+               if (ret) {
+                       btrfs_truncate(inode);
+                       return ret;
+               }
+
+               i_size_write(inode, attr->ia_size);
+               btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+
+               trans = btrfs_start_transaction(root, 0);
+               BUG_ON(IS_ERR(trans));
+               btrfs_set_trans_block_group(trans, inode);
+               trans->block_rsv = root->orphan_block_rsv;
+               BUG_ON(!trans->block_rsv);
+
+               ret = btrfs_update_inode(trans, root, inode);
+               BUG_ON(ret);
+               if (inode->i_nlink > 0) {
+                       ret = btrfs_orphan_del(trans, inode);
+                       BUG_ON(ret);
+               }
+               nr = trans->blocks_used;
+               btrfs_end_transaction(trans, root);
+               btrfs_btree_balance_dirty(root, nr);
+               return 0;
+       }
+
+       /*
+        * We're truncating a file that used to have good data down to
+        * zero. Make sure it gets into the ordered flush list so that
+        * any new writes get down to disk quickly.
+        */
+       if (attr->ia_size == 0)
+               BTRFS_I(inode)->ordered_data_close = 1;
+
+       /* we don't support swapfiles, so vmtruncate shouldn't fail */
+       ret = vmtruncate(inode, attr->ia_size);
+       BUG_ON(ret);
+
+       return 0;
+}
+
 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct inode *inode = dentry->d_inode;
@@ -3052,23 +3652,14 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
                return err;
 
        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
-               if (attr->ia_size > inode->i_size) {
-                       err = btrfs_cont_expand(inode, attr->ia_size);
-                       if (err)
-                               return err;
-               } else if (inode->i_size > 0 &&
-                          attr->ia_size == 0) {
-
-                       /* we're truncating a file that used to have good
-                        * data down to zero.  Make sure it gets into
-                        * the ordered flush list so that any new writes
-                        * get down to disk quickly.
-                        */
-                       BTRFS_I(inode)->ordered_data_close = 1;
-               }
+               err = btrfs_setattr_size(inode, attr);
+               if (err)
+                       return err;
        }
+       attr->ia_valid &= ~ATTR_SIZE;
 
-       err = inode_setattr(inode, attr);
+       if (attr->ia_valid)
+               err = inode_setattr(inode, attr);
 
        if (!err && ((attr->ia_valid & ATTR_MODE)))
                err = btrfs_acl_chmod(inode);
@@ -3089,36 +3680,55 @@ void btrfs_delete_inode(struct inode *inode)
        }
        btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
+       if (root->fs_info->log_root_recovering) {
+               BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
+               goto no_delete;
+       }
+
        if (inode->i_nlink > 0) {
                BUG_ON(btrfs_root_refs(&root->root_item) != 0);
                goto no_delete;
        }
 
        btrfs_i_size_write(inode, 0);
-       trans = btrfs_join_transaction(root, 1);
 
-       btrfs_set_trans_block_group(trans, inode);
-       ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
-       if (ret) {
-               btrfs_orphan_del(NULL, inode);
-               goto no_delete_lock;
-       }
+       while (1) {
+               trans = btrfs_start_transaction(root, 0);
+               BUG_ON(IS_ERR(trans));
+               btrfs_set_trans_block_group(trans, inode);
+               trans->block_rsv = root->orphan_block_rsv;
 
-       btrfs_orphan_del(trans, inode);
+               ret = btrfs_block_rsv_check(trans, root,
+                                           root->orphan_block_rsv, 0, 5);
+               if (ret) {
+                       BUG_ON(ret != -EAGAIN);
+                       ret = btrfs_commit_transaction(trans, root);
+                       BUG_ON(ret);
+                       continue;
+               }
 
-       nr = trans->blocks_used;
-       clear_inode(inode);
+               ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+               if (ret != -EAGAIN)
+                       break;
 
-       btrfs_end_transaction(trans, root);
-       btrfs_btree_balance_dirty(root, nr);
-       return;
+               nr = trans->blocks_used;
+               btrfs_end_transaction(trans, root);
+               trans = NULL;
+               btrfs_btree_balance_dirty(root, nr);
+
+       }
+
+       if (ret == 0) {
+               ret = btrfs_orphan_del(trans, inode);
+               BUG_ON(ret);
+       }
 
-no_delete_lock:
        nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root, nr);
 no_delete:
        clear_inode(inode);
+       return;
 }
 
 /*
@@ -3346,39 +3956,10 @@ again:
        return 0;
 }
 
-static noinline void init_btrfs_i(struct inode *inode)
-{
-       struct btrfs_inode *bi = BTRFS_I(inode);
-
-       bi->generation = 0;
-       bi->sequence = 0;
-       bi->last_trans = 0;
-       bi->logged_trans = 0;
-       bi->delalloc_bytes = 0;
-       bi->reserved_bytes = 0;
-       bi->disk_i_size = 0;
-       bi->flags = 0;
-       bi->index_cnt = (u64)-1;
-       bi->last_unlink_trans = 0;
-       bi->ordered_data_close = 0;
-       extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
-       extent_io_tree_init(&BTRFS_I(inode)->io_tree,
-                            inode->i_mapping, GFP_NOFS);
-       extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
-                            inode->i_mapping, GFP_NOFS);
-       INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
-       INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
-       RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
-       btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
-       mutex_init(&BTRFS_I(inode)->extent_mutex);
-       mutex_init(&BTRFS_I(inode)->log_mutex);
-}
-
 static int btrfs_init_locked_inode(struct inode *inode, void *p)
 {
        struct btrfs_iget_args *args = p;
        inode->i_ino = args->ino;
-       init_btrfs_i(inode);
        BTRFS_I(inode)->root = args->root;
        btrfs_set_inode_space_info(args->root, inode);
        return 0;
@@ -3410,7 +3991,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
  * Returns in *is_new if the inode was read from disk
  */
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-                        struct btrfs_root *root)
+                        struct btrfs_root *root, int *new)
 {
        struct inode *inode;
 
@@ -3425,6 +4006,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 
                inode_tree_add(inode);
                unlock_new_inode(inode);
+               if (new)
+                       *new = 1;
        }
 
        return inode;
@@ -3439,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
        if (!inode)
                return ERR_PTR(-ENOMEM);
 
-       init_btrfs_i(inode);
-
        BTRFS_I(inode)->root = root;
        memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
        BTRFS_I(inode)->dummy_inode = 1;
@@ -3477,7 +4058,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
                return NULL;
 
        if (location.type == BTRFS_INODE_ITEM_KEY) {
-               inode = btrfs_iget(dir->i_sb, &location, root);
+               inode = btrfs_iget(dir->i_sb, &location, root, NULL);
                return inode;
        }
 
@@ -3492,10 +4073,17 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
                else
                        inode = new_simple_dir(dir->i_sb, &location, sub_root);
        } else {
-               inode = btrfs_iget(dir->i_sb, &location, sub_root);
+               inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
        }
        srcu_read_unlock(&root->fs_info->subvol_srcu, index);
 
+       if (root != sub_root) {
+               down_read(&root->fs_info->cleanup_work_sem);
+               if (!(inode->i_sb->s_flags & MS_RDONLY))
+                       btrfs_orphan_cleanup(sub_root);
+               up_read(&root->fs_info->cleanup_work_sem);
+       }
+
        return inode;
 }
 
@@ -3503,12 +4091,14 @@ static int btrfs_dentry_delete(struct dentry *dentry)
 {
        struct btrfs_root *root;
 
-       if (!dentry->d_inode)
-               return 0;
+       if (!dentry->d_inode && !IS_ROOT(dentry))
+               dentry = dentry->d_parent;
 
-       root = BTRFS_I(dentry->d_inode)->root;
-       if (btrfs_root_refs(&root->root_item) == 0)
-               return 1;
+       if (dentry->d_inode) {
+               root = BTRFS_I(dentry->d_inode)->root;
+               if (btrfs_root_refs(&root->root_item) == 0)
+                       return 1;
+       }
        return 0;
 }
 
@@ -3668,7 +4258,11 @@ skip:
 
        /* Reached end of directory/root. Bump pos past the last item. */
        if (key_type == BTRFS_DIR_INDEX_KEY)
-               filp->f_pos = INT_LIMIT(off_t);
+               /*
+                * 32-bit glibc will use getdents64, but then strtol -
+                * so the last number we can serve is this.
+                */
+               filp->f_pos = 0x7fffffff;
        else
                filp->f_pos++;
 nopos:
@@ -3678,16 +4272,16 @@ err:
        return ret;
 }
 
-int btrfs_write_inode(struct inode *inode, int wait)
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        int ret = 0;
 
-       if (root->fs_info->btree_inode == inode)
+       if (BTRFS_I(inode)->dummy_inode)
                return 0;
 
-       if (wait) {
+       if (wbc->sync_mode == WB_SYNC_ALL) {
                trans = btrfs_join_transaction(root, 1);
                btrfs_set_trans_block_group(trans, inode);
                ret = btrfs_commit_transaction(trans, root);
@@ -3705,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
+       int ret;
+
+       if (BTRFS_I(inode)->dummy_inode)
+               return;
 
        trans = btrfs_join_transaction(root, 1);
        btrfs_set_trans_block_group(trans, inode);
-       btrfs_update_inode(trans, root, inode);
+
+       ret = btrfs_update_inode(trans, root, inode);
+       if (ret && ret == -ENOSPC) {
+               /* whoops, lets try again with the full transaction */
+               btrfs_end_transaction(trans, root);
+               trans = btrfs_start_transaction(root, 1);
+               if (IS_ERR(trans)) {
+                       if (printk_ratelimit()) {
+                               printk(KERN_ERR "btrfs: fail to "
+                                      "dirty  inode %lu error %ld\n",
+                                      inode->i_ino, PTR_ERR(trans));
+                       }
+                       return;
+               }
+               btrfs_set_trans_block_group(trans, inode);
+
+               ret = btrfs_update_inode(trans, root, inode);
+               if (ret) {
+                       if (printk_ratelimit()) {
+                               printk(KERN_ERR "btrfs: fail to "
+                                      "dirty  inode %lu error %d\n",
+                                      inode->i_ino, ret);
+                       }
+               }
+       }
        btrfs_end_transaction(trans, root);
 }
 
@@ -3826,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
         * btrfs_get_inode_index_count has an explanation for the magic
         * number
         */
-       init_btrfs_i(inode);
        BTRFS_I(inode)->index_cnt = 2;
        BTRFS_I(inode)->root = root;
        BTRFS_I(inode)->generation = trans->transid;
@@ -3855,16 +4476,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        if (ret != 0)
                goto fail;
 
-       inode->i_uid = current_fsuid();
-
-       if (dir && (dir->i_mode & S_ISGID)) {
-               inode->i_gid = dir->i_gid;
-               if (S_ISDIR(mode))
-                       mode |= S_ISGID;
-       } else
-               inode->i_gid = current_fsgid();
-
-       inode->i_mode = mode;
+       inode_init_owner(inode, dir, mode);
        inode->i_ino = objectid;
        inode_set_bytes(inode, 0);
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -3990,18 +4602,20 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        if (!new_valid_dev(rdev))
                return -EINVAL;
 
-       err = btrfs_check_metadata_free_space(root);
+       err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
        if (err)
-               goto fail;
+               return err;
 
-       trans = btrfs_start_transaction(root, 1);
-       btrfs_set_trans_block_group(trans, dir);
+       /*
+        * 2 for inode item and ref
+        * 2 for dir items
+        * 1 for xattr if selinux is on
+        */
+       trans = btrfs_start_transaction(root, 5);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
 
-       err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-       if (err) {
-               err = -ENOSPC;
-               goto out_unlock;
-       }
+       btrfs_set_trans_block_group(trans, dir);
 
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len,
@@ -4011,7 +4625,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(inode))
                goto out_unlock;
 
-       err = btrfs_init_inode_security(inode, dir);
+       err = btrfs_init_inode_security(trans, inode, dir);
        if (err) {
                drop_inode = 1;
                goto out_unlock;
@@ -4031,12 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 out_unlock:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
-fail:
+       btrfs_btree_balance_dirty(root, nr);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
        }
-       btrfs_btree_balance_dirty(root, nr);
        return err;
 }
 
@@ -4046,23 +4659,25 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct inode *inode = NULL;
-       int err;
        int drop_inode = 0;
+       int err;
        unsigned long nr = 0;
        u64 objectid;
        u64 index = 0;
 
-       err = btrfs_check_metadata_free_space(root);
+       err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
        if (err)
-               goto fail;
-       trans = btrfs_start_transaction(root, 1);
-       btrfs_set_trans_block_group(trans, dir);
+               return err;
+       /*
+        * 2 for inode item and ref
+        * 2 for dir items
+        * 1 for xattr if selinux is on
+        */
+       trans = btrfs_start_transaction(root, 5);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
 
-       err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-       if (err) {
-               err = -ENOSPC;
-               goto out_unlock;
-       }
+       btrfs_set_trans_block_group(trans, dir);
 
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len,
@@ -4073,7 +4688,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(inode))
                goto out_unlock;
 
-       err = btrfs_init_inode_security(inode, dir);
+       err = btrfs_init_inode_security(trans, inode, dir);
        if (err) {
                drop_inode = 1;
                goto out_unlock;
@@ -4095,7 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 out_unlock:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
-fail:
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@ -4118,15 +4732,25 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        if (inode->i_nlink == 0)
                return -ENOENT;
 
+       /* do not allow sys_link's with other subvols of the same device */
+       if (root->objectid != BTRFS_I(inode)->root->objectid)
+               return -EPERM;
+
        btrfs_inc_nlink(inode);
-       err = btrfs_check_metadata_free_space(root);
-       if (err)
-               goto fail;
+
        err = btrfs_set_inode_index(dir, &index);
        if (err)
                goto fail;
 
-       trans = btrfs_start_transaction(root, 1);
+       /*
+        * 1 item for inode ref
+        * 2 items for dir items
+        */
+       trans = btrfs_start_transaction(root, 3);
+       if (IS_ERR(trans)) {
+               err = PTR_ERR(trans);
+               goto fail;
+       }
 
        btrfs_set_trans_block_group(trans, dir);
        atomic_inc(&inode->i_count);
@@ -4164,24 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        u64 index = 0;
        unsigned long nr = 1;
 
-       err = btrfs_check_metadata_free_space(root);
+       err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
        if (err)
-               goto out_unlock;
+               return err;
 
-       trans = btrfs_start_transaction(root, 1);
+       /*
+        * 2 items for inode and ref
+        * 2 items for dir items
+        * 1 for xattr if selinux is on
+        */
+       trans = btrfs_start_transaction(root, 5);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
        btrfs_set_trans_block_group(trans, dir);
 
-       if (IS_ERR(trans)) {
-               err = PTR_ERR(trans);
-               goto out_unlock;
-       }
-
-       err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-       if (err) {
-               err = -ENOSPC;
-               goto out_unlock;
-       }
-
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len,
                                dentry->d_parent->d_inode->i_ino, objectid,
@@ -4194,7 +4814,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
        drop_on_err = 1;
 
-       err = btrfs_init_inode_security(inode, dir);
+       err = btrfs_init_inode_security(trans, inode, dir);
        if (err)
                goto out_fail;
 
@@ -4221,8 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 out_fail:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
-
-out_unlock:
        if (drop_on_err)
                iput(inode);
        btrfs_btree_balance_dirty(root, nr);
@@ -4480,6 +5098,7 @@ again:
                        }
                        flush_dcache_page(page);
                } else if (create && PageUptodate(page)) {
+                       WARN_ON(1);
                        if (!trans) {
                                kunmap(page);
                                free_extent_map(em);
@@ -4576,11 +5195,651 @@ out:
        return em;
 }
 
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+                                                 u64 start, u64 len)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_trans_handle *trans;
+       struct extent_map *em;
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct btrfs_key ins;
+       u64 alloc_hint;
+       int ret;
+
+       btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+
+       trans = btrfs_join_transaction(root, 0);
+       if (!trans)
+               return ERR_PTR(-ENOMEM);
+
+       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+       alloc_hint = get_extent_allocation_hint(inode, start, len);
+       ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+                                  alloc_hint, (u64)-1, &ins, 1);
+       if (ret) {
+               em = ERR_PTR(ret);
+               goto out;
+       }
+
+       em = alloc_extent_map(GFP_NOFS);
+       if (!em) {
+               em = ERR_PTR(-ENOMEM);
+               goto out;
+       }
+
+       em->start = start;
+       em->orig_start = em->start;
+       em->len = ins.offset;
+
+       em->block_start = ins.objectid;
+       em->block_len = ins.offset;
+       em->bdev = root->fs_info->fs_devices->latest_bdev;
+       set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+       while (1) {
+               write_lock(&em_tree->lock);
+               ret = add_extent_mapping(em_tree, em);
+               write_unlock(&em_tree->lock);
+               if (ret != -EEXIST)
+                       break;
+               btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+       }
+
+       ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+                                          ins.offset, ins.offset, 0);
+       if (ret) {
+               btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+               em = ERR_PTR(ret);
+       }
+out:
+       btrfs_end_transaction(trans, root);
+       return em;
+}
+
+/*
+ * returns 1 when the nocow is safe, < 1 on error, 0 if the
+ * block must be cow'd
+ */
+static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+                                     struct inode *inode, u64 offset, u64 len)
+{
+       struct btrfs_path *path;
+       int ret;
+       struct extent_buffer *leaf;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
+       u64 disk_bytenr;
+       u64 backref_offset;
+       u64 extent_end;
+       u64 num_bytes;
+       int slot;
+       int found_type;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+                                      offset, 0);
+       if (ret < 0)
+               goto out;
+
+       slot = path->slots[0];
+       if (ret == 1) {
+               if (slot == 0) {
+                       /* can't find the item, must cow */
+                       ret = 0;
+                       goto out;
+               }
+               slot--;
+       }
+       ret = 0;
+       leaf = path->nodes[0];
+       btrfs_item_key_to_cpu(leaf, &key, slot);
+       if (key.objectid != inode->i_ino ||
+           key.type != BTRFS_EXTENT_DATA_KEY) {
+               /* not our file or wrong item type, must cow */
+               goto out;
+       }
+
+       if (key.offset > offset) {
+               /* Wrong offset, must cow */
+               goto out;
+       }
+
+       fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+       found_type = btrfs_file_extent_type(leaf, fi);
+       if (found_type != BTRFS_FILE_EXTENT_REG &&
+           found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+               /* not a regular extent, must cow */
+               goto out;
+       }
+       disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+       backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+       extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+       if (extent_end < offset + len) {
+               /* extent doesn't include our full range, must cow */
+               goto out;
+       }
+
+       if (btrfs_extent_readonly(root, disk_bytenr))
+               goto out;
+
+       /*
+        * look for other files referencing this extent, if we
+        * find any we must cow
+        */
+       if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+                                 key.offset - backref_offset, disk_bytenr))
+               goto out;
+
+       /*
+        * adjust disk_bytenr and num_bytes to cover just the bytes
+        * in this extent we are about to write.  If there
+        * are any csums in that range we have to cow in order
+        * to keep the csums correct
+        */
+       disk_bytenr += backref_offset;
+       disk_bytenr += offset - key.offset;
+       num_bytes = min(offset + len, extent_end) - offset;
+       if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+                               goto out;
+       /*
+        * all of the above have passed, it is safe to overwrite this extent
+        * without cow
+        */
+       ret = 1;
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create)
+{
+       struct extent_map *em;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       u64 start = iblock << inode->i_blkbits;
+       u64 len = bh_result->b_size;
+       struct btrfs_trans_handle *trans;
+
+       em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+       if (IS_ERR(em))
+               return PTR_ERR(em);
+
+       /*
+        * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+        * io.  INLINE is special, and we could probably kludge it in here, but
+        * it's still buffered so for safety lets just fall back to the generic
+        * buffered path.
+        *
+        * For COMPRESSED we _have_ to read the entire extent in so we can
+        * decompress it, so there will be buffering required no matter what we
+        * do, so go ahead and fallback to buffered.
+        *
+        * We return -ENOTBLK because thats what makes DIO go ahead and go back
+        * to buffered IO.  Don't blame me, this is the price we pay for using
+        * the generic code.
+        */
+       if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+           em->block_start == EXTENT_MAP_INLINE) {
+               free_extent_map(em);
+               return -ENOTBLK;
+       }
+
+       /* Just a good old fashioned hole, return */
+       if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+                       test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+               free_extent_map(em);
+               /* DIO will do one hole at a time, so just unlock a sector */
+               unlock_extent(&BTRFS_I(inode)->io_tree, start,
+                             start + root->sectorsize - 1, GFP_NOFS);
+               return 0;
+       }
+
+       /*
+        * We don't allocate a new extent in the following cases
+        *
+        * 1) The inode is marked as NODATACOW.  In this case we'll just use the
+        * existing extent.
+        * 2) The extent is marked as PREALLOC.  We're good to go here and can
+        * just use the extent.
+        *
+        */
+       if (!create) {
+               len = em->len - (start - em->start);
+               goto map;
+       }
+
+       if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+           ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+            em->block_start != EXTENT_MAP_HOLE)) {
+               int type;
+               int ret;
+               u64 block_start;
+
+               if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+                       type = BTRFS_ORDERED_PREALLOC;
+               else
+                       type = BTRFS_ORDERED_NOCOW;
+               len = min(len, em->len - (start - em->start));
+               block_start = em->block_start + (start - em->start);
+
+               /*
+                * we're not going to log anything, but we do need
+                * to make sure the current transaction stays open
+                * while we look for nocow cross refs
+                */
+               trans = btrfs_join_transaction(root, 0);
+               if (!trans)
+                       goto must_cow;
+
+               if (can_nocow_odirect(trans, inode, start, len) == 1) {
+                       ret = btrfs_add_ordered_extent_dio(inode, start,
+                                          block_start, len, len, type);
+                       btrfs_end_transaction(trans, root);
+                       if (ret) {
+                               free_extent_map(em);
+                               return ret;
+                       }
+                       goto unlock;
+               }
+               btrfs_end_transaction(trans, root);
+       }
+must_cow:
+       /*
+        * this will cow the extent, reset the len in case we changed
+        * it above
+        */
+       len = bh_result->b_size;
+       free_extent_map(em);
+       em = btrfs_new_extent_direct(inode, start, len);
+       if (IS_ERR(em))
+               return PTR_ERR(em);
+       len = min(len, em->len - (start - em->start));
+unlock:
+       clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+                         EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+                         0, NULL, GFP_NOFS);
+map:
+       bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+               inode->i_blkbits;
+       bh_result->b_size = len;
+       bh_result->b_bdev = em->bdev;
+       set_buffer_mapped(bh_result);
+       if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+               set_buffer_new(bh_result);
+
+       free_extent_map(em);
+
+       return 0;
+}
+
+struct btrfs_dio_private {
+       struct inode *inode;
+       u64 logical_offset;
+       u64 disk_bytenr;
+       u64 bytes;
+       u32 *csums;
+       void *private;
+};
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+       struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+       struct bio_vec *bvec = bio->bi_io_vec;
+       struct btrfs_dio_private *dip = bio->bi_private;
+       struct inode *inode = dip->inode;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       u64 start;
+       u32 *private = dip->csums;
+
+       start = dip->logical_offset;
+       do {
+               if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+                       struct page *page = bvec->bv_page;
+                       char *kaddr;
+                       u32 csum = ~(u32)0;
+                       unsigned long flags;
+
+                       local_irq_save(flags);
+                       kaddr = kmap_atomic(page, KM_IRQ0);
+                       csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+                                              csum, bvec->bv_len);
+                       btrfs_csum_final(csum, (char *)&csum);
+                       kunmap_atomic(kaddr, KM_IRQ0);
+                       local_irq_restore(flags);
+
+                       flush_dcache_page(bvec->bv_page);
+                       if (csum != *private) {
+                               printk(KERN_ERR "btrfs csum failed ino %lu off"
+                                     " %llu csum %u private %u\n",
+                                     inode->i_ino, (unsigned long long)start,
+                                     csum, *private);
+                               err = -EIO;
+                       }
+               }
+
+               start += bvec->bv_len;
+               private++;
+               bvec++;
+       } while (bvec <= bvec_end);
+
+       unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+                     dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+       bio->bi_private = dip->private;
+
+       kfree(dip->csums);
+       kfree(dip);
+       dio_end_io(bio, err);
+}
+
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+       struct btrfs_dio_private *dip = bio->bi_private;
+       struct inode *inode = dip->inode;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_trans_handle *trans;
+       struct btrfs_ordered_extent *ordered = NULL;
+       struct extent_state *cached_state = NULL;
+       int ret;
+
+       if (err)
+               goto out_done;
+
+       ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+                                            dip->logical_offset, dip->bytes);
+       if (!ret)
+               goto out_done;
+
+       BUG_ON(!ordered);
+
+       trans = btrfs_join_transaction(root, 1);
+       if (!trans) {
+               err = -ENOMEM;
+               goto out;
+       }
+       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+       if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+               ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, root, inode);
+               err = ret;
+               goto out;
+       }
+
+       lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+                        ordered->file_offset + ordered->len - 1, 0,
+                        &cached_state, GFP_NOFS);
+
+       if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+               ret = btrfs_mark_extent_written(trans, inode,
+                                               ordered->file_offset,
+                                               ordered->file_offset +
+                                               ordered->len);
+               if (ret) {
+                       err = ret;
+                       goto out_unlock;
+               }
+       } else {
+               ret = insert_reserved_file_extent(trans, inode,
+                                                 ordered->file_offset,
+                                                 ordered->start,
+                                                 ordered->disk_len,
+                                                 ordered->len,
+                                                 ordered->len,
+                                                 0, 0, 0,
+                                                 BTRFS_FILE_EXTENT_REG);
+               unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+                                  ordered->file_offset, ordered->len);
+               if (ret) {
+                       err = ret;
+                       WARN_ON(1);
+                       goto out_unlock;
+               }
+       }
+
+       add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+       btrfs_ordered_update_i_size(inode, 0, ordered);
+       btrfs_update_inode(trans, root, inode);
+out_unlock:
+       unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+                            ordered->file_offset + ordered->len - 1,
+                            &cached_state, GFP_NOFS);
+out:
+       btrfs_delalloc_release_metadata(inode, ordered->len);
+       btrfs_end_transaction(trans, root);
+       btrfs_put_ordered_extent(ordered);
+       btrfs_put_ordered_extent(ordered);
+out_done:
+       bio->bi_private = dip->private;
+
+       kfree(dip->csums);
+       kfree(dip);
+       dio_end_io(bio, err);
+}
+
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+                                   struct bio *bio, int mirror_num,
+                                   unsigned long bio_flags, u64 offset)
+{
+       int ret;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
+       BUG_ON(ret);
+       return 0;
+}
+
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+                               loff_t file_offset)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_dio_private *dip;
+       struct bio_vec *bvec = bio->bi_io_vec;
+       u64 start;
+       int skip_sum;
+       int write = rw & (1 << BIO_RW);
+       int ret = 0;
+
+       skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+       dip = kmalloc(sizeof(*dip), GFP_NOFS);
+       if (!dip) {
+               ret = -ENOMEM;
+               goto free_ordered;
+       }
+       dip->csums = NULL;
+
+       if (!skip_sum) {
+               dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+               if (!dip->csums) {
+                       ret = -ENOMEM;
+                       goto free_ordered;
+               }
+       }
+
+       dip->private = bio->bi_private;
+       dip->inode = inode;
+       dip->logical_offset = file_offset;
+
+       start = dip->logical_offset;
+       dip->bytes = 0;
+       do {
+               dip->bytes += bvec->bv_len;
+               bvec++;
+       } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+
+       dip->disk_bytenr = (u64)bio->bi_sector << 9;
+       bio->bi_private = dip;
+
+       if (write)
+               bio->bi_end_io = btrfs_endio_direct_write;
+       else
+               bio->bi_end_io = btrfs_endio_direct_read;
+
+       ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+       if (ret)
+               goto out_err;
+
+       if (write && !skip_sum) {
+               ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+                                  inode, rw, bio, 0, 0,
+                                  dip->logical_offset,
+                                  __btrfs_submit_bio_start_direct_io,
+                                  __btrfs_submit_bio_done);
+               if (ret)
+                       goto out_err;
+               return;
+       } else if (!skip_sum)
+               btrfs_lookup_bio_sums_dio(root, inode, bio,
+                                         dip->logical_offset, dip->csums);
+
+       ret = btrfs_map_bio(root, rw, bio, 0, 1);
+       if (ret)
+               goto out_err;
+       return;
+out_err:
+       kfree(dip->csums);
+       kfree(dip);
+free_ordered:
+       /*
+        * If this is a write, we need to clean up the reserved space and kill
+        * the ordered extent.
+        */
+       if (write) {
+               struct btrfs_ordered_extent *ordered;
+               ordered = btrfs_lookup_ordered_extent(inode,
+                                                     dip->logical_offset);
+               if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+                   !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+                       btrfs_free_reserved_extent(root, ordered->start,
+                                                  ordered->disk_len);
+               btrfs_put_ordered_extent(ordered);
+               btrfs_put_ordered_extent(ordered);
+       }
+       bio_endio(bio, ret);
+}
+
+static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+                       const struct iovec *iov, loff_t offset,
+                       unsigned long nr_segs)
+{
+       int seg;
+       size_t size;
+       unsigned long addr;
+       unsigned blocksize_mask = root->sectorsize - 1;
+       ssize_t retval = -EINVAL;
+       loff_t end = offset;
+
+       if (offset & blocksize_mask)
+               goto out;
+
+       /* Check the memory alignment.  Blocks cannot straddle pages */
+       for (seg = 0; seg < nr_segs; seg++) {
+               addr = (unsigned long)iov[seg].iov_base;
+               size = iov[seg].iov_len;
+               end += size;
+               if ((addr & blocksize_mask) || (size & blocksize_mask)) 
+                       goto out;
+       }
+       retval = 0;
+out:
+       return retval;
+}
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                        const struct iovec *iov, loff_t offset,
                        unsigned long nr_segs)
 {
-       return -EINVAL;
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file->f_mapping->host;
+       struct btrfs_ordered_extent *ordered;
+       struct extent_state *cached_state = NULL;
+       u64 lockstart, lockend;
+       ssize_t ret;
+       int writing = rw & WRITE;
+       int write_bits = 0;
+       size_t count = iov_length(iov, nr_segs);
+
+       if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
+                           offset, nr_segs)) {
+               return 0;
+       }
+
+       lockstart = offset;
+       lockend = offset + count - 1;
+
+       if (writing) {
+               ret = btrfs_delalloc_reserve_space(inode, count);
+               if (ret)
+                       goto out;
+       }
+
+       while (1) {
+               lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                0, &cached_state, GFP_NOFS);
+               /*
+                * We're concerned with the entire range that we're going to be
+                * doing DIO to, so we need to make sure theres no ordered
+                * extents in this range.
+                */
+               ordered = btrfs_lookup_ordered_range(inode, lockstart,
+                                                    lockend - lockstart + 1);
+               if (!ordered)
+                       break;
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                    &cached_state, GFP_NOFS);
+               btrfs_start_ordered_extent(inode, ordered, 1);
+               btrfs_put_ordered_extent(ordered);
+               cond_resched();
+       }
+
+       /*
+        * we don't use btrfs_set_extent_delalloc because we don't want
+        * the dirty or uptodate bits
+        */
+       if (writing) {
+               write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+               ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                    EXTENT_DELALLOC, 0, NULL, &cached_state,
+                                    GFP_NOFS);
+               if (ret) {
+                       clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                        lockend, EXTENT_LOCKED | write_bits,
+                                        1, 0, &cached_state, GFP_NOFS);
+                       goto out;
+               }
+       }
+
+       free_extent_state(cached_state);
+       cached_state = NULL;
+
+       ret = __blockdev_direct_IO(rw, iocb, inode,
+                  BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+                  iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+                  btrfs_submit_direct, 0);
+
+       if (ret < 0 && ret != -EIOCBQUEUED) {
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+                             offset + iov_length(iov, nr_segs) - 1,
+                             EXTENT_LOCKED | write_bits, 1, 0,
+                             &cached_state, GFP_NOFS);
+       } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+               /*
+                * We're falling back to buffered, unlock the section we didn't
+                * do IO on.
+                */
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+                             offset + iov_length(iov, nr_segs) - 1,
+                             EXTENT_LOCKED | write_bits, 1, 0,
+                             &cached_state, GFP_NOFS);
+       }
+out:
+       free_extent_state(cached_state);
+       return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -4656,6 +5915,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
        struct extent_io_tree *tree;
        struct btrfs_ordered_extent *ordered;
+       struct extent_state *cached_state = NULL;
        u64 page_start = page_offset(page);
        u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
@@ -4674,7 +5934,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
                btrfs_releasepage(page, GFP_NOFS);
                return;
        }
-       lock_extent(tree, page_start, page_end, GFP_NOFS);
+       lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
+                        GFP_NOFS);
        ordered = btrfs_lookup_ordered_extent(page->mapping->host,
                                           page_offset(page));
        if (ordered) {
@@ -4684,7 +5945,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
                 */
                clear_extent_bit(tree, page_start, page_end,
                                 EXTENT_DIRTY | EXTENT_DELALLOC |
-                                EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+                                EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
+                                &cached_state, GFP_NOFS);
                /*
                 * whoever cleared the private bit is responsible
                 * for the finish_ordered_io
@@ -4694,11 +5956,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
                                                page_start, page_end);
                }
                btrfs_put_ordered_extent(ordered);
-               lock_extent(tree, page_start, page_end, GFP_NOFS);
+               cached_state = NULL;
+               lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
+                                GFP_NOFS);
        }
        clear_extent_bit(tree, page_start, page_end,
-                EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
-                1, 1, NULL, GFP_NOFS);
+                EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+                EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
        __btrfs_releasepage(page, GFP_NOFS);
 
        ClearPageChecked(page);
@@ -4731,6 +5995,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_ordered_extent *ordered;
+       struct extent_state *cached_state = NULL;
        char *kaddr;
        unsigned long zero_start;
        loff_t size;
@@ -4738,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        u64 page_start;
        u64 page_end;
 
-       ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+       ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
        if (ret) {
                if (ret == -ENOMEM)
                        ret = VM_FAULT_OOM;
@@ -4756,13 +6021,13 @@ again:
 
        if ((page->mapping != inode->i_mapping) ||
            (page_start >= size)) {
-               btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
                /* page got truncated out from underneath us */
                goto out_unlock;
        }
        wait_on_page_writeback(page);
 
-       lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+       lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+                        GFP_NOFS);
        set_page_extent_mapped(page);
 
        /*
@@ -4771,14 +6036,33 @@ again:
         */
        ordered = btrfs_lookup_ordered_extent(inode, page_start);
        if (ordered) {
-               unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+               unlock_extent_cached(io_tree, page_start, page_end,
+                                    &cached_state, GFP_NOFS);
                unlock_page(page);
                btrfs_start_ordered_extent(inode, ordered, 1);
                btrfs_put_ordered_extent(ordered);
                goto again;
        }
 
-       btrfs_set_extent_delalloc(inode, page_start, page_end);
+       /*
+        * XXX - page_mkwrite gets called every time the page is dirtied, even
+        * if it was already dirty, so for space accounting reasons we need to
+        * clear any delalloc bits for the range we are fixing to save.  There
+        * is probably a better way to do this, but for now keep consistent with
+        * prepare_pages in the normal write path.
+        */
+       clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+                         EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+                         0, 0, &cached_state, GFP_NOFS);
+
+       ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+                                       &cached_state);
+       if (ret) {
+               unlock_extent_cached(io_tree, page_start, page_end,
+                                    &cached_state, GFP_NOFS);
+               ret = VM_FAULT_SIGBUS;
+               goto out_unlock;
+       }
        ret = 0;
 
        /* page is wholly or partially inside EOF */
@@ -4797,13 +6081,16 @@ again:
        set_page_dirty(page);
        SetPageUptodate(page);
 
-       BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
-       unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+       BTRFS_I(inode)->last_trans = root->fs_info->generation;
+       BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+
+       unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
 out_unlock:
        if (!ret)
                return VM_FAULT_LOCKED;
        unlock_page(page);
+       btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 out:
        return ret;
 }
@@ -4816,15 +6103,22 @@ static void btrfs_truncate(struct inode *inode)
        unsigned long nr;
        u64 mask = root->sectorsize - 1;
 
-       if (!S_ISREG(inode->i_mode))
+       if (!S_ISREG(inode->i_mode)) {
+               WARN_ON(1);
                return;
-       if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+       }
+
+       ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+       if (ret)
                return;
 
-       btrfs_truncate_page(inode->i_mapping, inode->i_size);
        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+       btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
-       trans = btrfs_start_transaction(root, 1);
+       trans = btrfs_start_transaction(root, 0);
+       BUG_ON(IS_ERR(trans));
+       btrfs_set_trans_block_group(trans, inode);
+       trans->block_rsv = root->orphan_block_rsv;
 
        /*
         * setattr is responsible for setting the ordered_data_close flag,
@@ -4846,21 +6140,47 @@ static void btrfs_truncate(struct inode *inode)
        if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
                btrfs_add_ordered_operation(trans, root, inode);
 
-       btrfs_set_trans_block_group(trans, inode);
-       btrfs_i_size_write(inode, inode->i_size);
+       while (1) {
+               if (!trans) {
+                       trans = btrfs_start_transaction(root, 0);
+                       BUG_ON(IS_ERR(trans));
+                       btrfs_set_trans_block_group(trans, inode);
+                       trans->block_rsv = root->orphan_block_rsv;
+               }
 
-       ret = btrfs_orphan_add(trans, inode);
-       if (ret)
-               goto out;
-       /* FIXME, add redo link to tree so we don't leak on crash */
-       ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
-                                     BTRFS_EXTENT_DATA_KEY);
-       btrfs_update_inode(trans, root, inode);
+               ret = btrfs_block_rsv_check(trans, root,
+                                           root->orphan_block_rsv, 0, 5);
+               if (ret) {
+                       BUG_ON(ret != -EAGAIN);
+                       ret = btrfs_commit_transaction(trans, root);
+                       BUG_ON(ret);
+                       trans = NULL;
+                       continue;
+               }
+
+               ret = btrfs_truncate_inode_items(trans, root, inode,
+                                                inode->i_size,
+                                                BTRFS_EXTENT_DATA_KEY);
+               if (ret != -EAGAIN)
+                       break;
+
+               ret = btrfs_update_inode(trans, root, inode);
+               BUG_ON(ret);
 
-       ret = btrfs_orphan_del(trans, inode);
+               nr = trans->blocks_used;
+               btrfs_end_transaction(trans, root);
+               trans = NULL;
+               btrfs_btree_balance_dirty(root, nr);
+       }
+
+       if (ret == 0 && inode->i_nlink > 0) {
+               ret = btrfs_orphan_del(trans, inode);
+               BUG_ON(ret);
+       }
+
+       ret = btrfs_update_inode(trans, root, inode);
        BUG_ON(ret);
 
-out:
        nr = trans->blocks_used;
        ret = btrfs_end_transaction_throttle(trans, root);
        BUG_ON(ret);
@@ -4911,16 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 struct inode *btrfs_alloc_inode(struct super_block *sb)
 {
        struct btrfs_inode *ei;
+       struct inode *inode;
 
        ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;
+
+       ei->root = NULL;
+       ei->space_info = NULL;
+       ei->generation = 0;
+       ei->sequence = 0;
        ei->last_trans = 0;
+       ei->last_sub_trans = 0;
        ei->logged_trans = 0;
+       ei->delalloc_bytes = 0;
+       ei->reserved_bytes = 0;
+       ei->disk_i_size = 0;
+       ei->flags = 0;
+       ei->index_cnt = (u64)-1;
+       ei->last_unlink_trans = 0;
+
+       spin_lock_init(&ei->accounting_lock);
+       atomic_set(&ei->outstanding_extents, 0);
+       ei->reserved_extents = 0;
+
+       ei->ordered_data_close = 0;
+       ei->orphan_meta_reserved = 0;
+       ei->dummy_inode = 0;
+       ei->force_compress = 0;
+
+       inode = &ei->vfs_inode;
+       extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+       extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+       extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+       mutex_init(&ei->log_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
        INIT_LIST_HEAD(&ei->i_orphan);
+       INIT_LIST_HEAD(&ei->delalloc_inodes);
        INIT_LIST_HEAD(&ei->ordered_operations);
-       return &ei->vfs_inode;
+       RB_CLEAR_NODE(&ei->rb_node);
+
+       return inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
@@ -4930,6 +6281,16 @@ void btrfs_destroy_inode(struct inode *inode)
 
        WARN_ON(!list_empty(&inode->i_dentry));
        WARN_ON(inode->i_data.nrpages);
+       WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+       WARN_ON(BTRFS_I(inode)->reserved_extents);
+
+       /*
+        * This can happen where we create an inode, but somebody else also
+        * created the same inode and we need to destroy the one we already
+        * created.
+        */
+       if (!root)
+               goto free;
 
        /*
         * Make sure we're properly removed from the ordered operation
@@ -4942,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
                spin_unlock(&root->fs_info->ordered_extent_lock);
        }
 
-       spin_lock(&root->list_lock);
+       spin_lock(&root->orphan_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-               printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
-                      " list\n", inode->i_ino);
-               dump_stack();
+               printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
+                      inode->i_ino);
+               list_del_init(&BTRFS_I(inode)->i_orphan);
        }
-       spin_unlock(&root->list_lock);
+       spin_unlock(&root->orphan_lock);
 
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4966,13 +6327,13 @@ void btrfs_destroy_inode(struct inode *inode)
        }
        inode_tree_del(inode);
        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+free:
        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
 void btrfs_drop_inode(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-
        if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
                generic_delete_inode(inode);
        else
@@ -5069,11 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (S_ISDIR(old_inode->i_mode) && new_inode &&
            new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
                return -ENOTEMPTY;
-
-       ret = btrfs_check_metadata_free_space(root);
-       if (ret)
-               return ret;
-
        /*
         * we're using rename to replace one file with another.
         * and the replacement file is large.  Start IO on it now so
@@ -5086,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        /* close the racy window with snapshot create/destroy ioctl */
        if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
                down_read(&root->fs_info->subvol_sem);
+       /*
+        * We want to reserve the absolute worst case amount of items.  So if
+        * both inodes are subvols and we need to unlink them then that would
+        * require 4 item modifications, but if they are both normal inodes it
+        * would require 5 item modifications, so we'll assume their normal
+        * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+        * should cover the worst case number of items we'll modify.
+        */
+       trans = btrfs_start_transaction(root, 20);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
 
-       trans = btrfs_start_transaction(root, 1);
        btrfs_set_trans_block_group(trans, new_dir);
 
        if (dest != root)
@@ -5185,6 +6551,7 @@ out_fail:
 
        if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
                up_read(&root->fs_info->subvol_sem);
+
        return ret;
 }
 
@@ -5192,7 +6559,7 @@ out_fail:
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
        struct list_head *head = &root->fs_info->delalloc_inodes;
        struct btrfs_inode *binode;
@@ -5211,7 +6578,10 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
                spin_unlock(&root->fs_info->delalloc_lock);
                if (inode) {
                        filemap_flush(inode->i_mapping);
-                       iput(inode);
+                       if (delay_iput)
+                               btrfs_add_delayed_iput(inode);
+                       else
+                               iput(inode);
                }
                cond_resched();
                spin_lock(&root->fs_info->delalloc_lock);
@@ -5233,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
        return 0;
 }
 
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+{
+       struct btrfs_inode *binode;
+       struct inode *inode = NULL;
+
+       spin_lock(&root->fs_info->delalloc_lock);
+       while (!list_empty(&root->fs_info->delalloc_inodes)) {
+               binode = list_entry(root->fs_info->delalloc_inodes.next,
+                                   struct btrfs_inode, delalloc_inodes);
+               inode = igrab(&binode->vfs_inode);
+               if (inode) {
+                       list_move_tail(&binode->delalloc_inodes,
+                                      &root->fs_info->delalloc_inodes);
+                       break;
+               }
+
+               list_del_init(&binode->delalloc_inodes);
+               cond_resched_lock(&root->fs_info->delalloc_lock);
+       }
+       spin_unlock(&root->fs_info->delalloc_lock);
+
+       if (inode) {
+               write_inode_now(inode, 0);
+               if (delay_iput)
+                       btrfs_add_delayed_iput(inode);
+               else
+                       iput(inode);
+               return 1;
+       }
+       return 0;
+}
+
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                         const char *symname)
 {
@@ -5256,19 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
                return -ENAMETOOLONG;
 
-       err = btrfs_check_metadata_free_space(root);
+       err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
        if (err)
-               goto out_fail;
+               return err;
+       /*
+        * 2 items for inode item and ref
+        * 2 items for dir items
+        * 1 item for xattr if selinux is on
+        */
+       trans = btrfs_start_transaction(root, 5);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
 
-       trans = btrfs_start_transaction(root, 1);
        btrfs_set_trans_block_group(trans, dir);
 
-       err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-       if (err) {
-               err = -ENOSPC;
-               goto out_unlock;
-       }
-
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len,
                                dentry->d_parent->d_inode->i_ino, objectid,
@@ -5278,7 +6681,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(inode))
                goto out_unlock;
 
-       err = btrfs_init_inode_security(inode, dir);
+       err = btrfs_init_inode_security(trans, inode, dir);
        if (err) {
                drop_inode = 1;
                goto out_unlock;
@@ -5340,7 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 out_unlock:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
-out_fail:
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@ -5349,56 +6751,68 @@ out_fail:
        return err;
 }
 
-static int prealloc_file_range(struct btrfs_trans_handle *trans,
-                              struct inode *inode, u64 start, u64 end,
-                              u64 locked_end, u64 alloc_hint, int mode)
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+                             u64 start, u64 num_bytes, u64 min_size,
+                             loff_t actual_len, u64 *alloc_hint)
 {
+       struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
-       u64 alloc_size;
        u64 cur_offset = start;
-       u64 num_bytes = end - start;
        int ret = 0;
 
        while (num_bytes > 0) {
-               alloc_size = min(num_bytes, root->fs_info->max_extent);
-               ret = btrfs_reserve_extent(trans, root, alloc_size,
-                                          root->sectorsize, 0, alloc_hint,
-                                          (u64)-1, &ins, 1);
+               trans = btrfs_start_transaction(root, 3);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       break;
+               }
+
+               ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+                                          0, *alloc_hint, (u64)-1, &ins, 1);
                if (ret) {
-                       WARN_ON(1);
-                       goto out;
+                       btrfs_end_transaction(trans, root);
+                       break;
                }
+
                ret = insert_reserved_file_extent(trans, inode,
                                                  cur_offset, ins.objectid,
                                                  ins.offset, ins.offset,
-                                                 ins.offset, locked_end,
-                                                 0, 0, 0,
+                                                 ins.offset, 0, 0, 0,
                                                  BTRFS_FILE_EXTENT_PREALLOC);
                BUG_ON(ret);
                btrfs_drop_extent_cache(inode, cur_offset,
                                        cur_offset + ins.offset -1, 0);
+
                num_bytes -= ins.offset;
                cur_offset += ins.offset;
-               alloc_hint = ins.objectid + ins.offset;
-       }
-out:
-       if (cur_offset > start) {
+               *alloc_hint = ins.objectid + ins.offset;
+
                inode->i_ctime = CURRENT_TIME;
                BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-                   cur_offset > i_size_read(inode))
-                       btrfs_i_size_write(inode, cur_offset);
+                   (actual_len > inode->i_size) &&
+                   (cur_offset > inode->i_size)) {
+                       if (cur_offset > actual_len)
+                               i_size_write(inode, actual_len);
+                       else
+                               i_size_write(inode, cur_offset);
+                       i_size_write(inode, cur_offset);
+                       btrfs_ordered_update_i_size(inode, cur_offset, NULL);
+               }
+
                ret = btrfs_update_inode(trans, root, inode);
                BUG_ON(ret);
-       }
 
+               btrfs_end_transaction(trans, root);
+       }
        return ret;
 }
 
 static long btrfs_fallocate(struct inode *inode, int mode,
                            loff_t offset, loff_t len)
 {
+       struct extent_state *cached_state = NULL;
        u64 cur_offset;
        u64 last_byte;
        u64 alloc_start;
@@ -5407,8 +6821,6 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        u64 locked_end;
        u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
        struct extent_map *em;
-       struct btrfs_trans_handle *trans;
-       struct btrfs_root *root;
        int ret;
 
        alloc_start = offset & ~mask;
@@ -5427,10 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                        goto out;
        }
 
-       root = BTRFS_I(inode)->root;
-
-       ret = btrfs_check_data_free_space(root, inode,
-                                         alloc_end - alloc_start);
+       ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
        if (ret)
                goto out;
 
@@ -5438,27 +6847,20 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        while (1) {
                struct btrfs_ordered_extent *ordered;
 
-               trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
-               if (!trans) {
-                       ret = -EIO;
-                       goto out_free;
-               }
-
                /* the extent lock is ordered inside the running
                 * transaction
                 */
-               lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-                           GFP_NOFS);
+               lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+                                locked_end, 0, &cached_state, GFP_NOFS);
                ordered = btrfs_lookup_first_ordered_extent(inode,
                                                            alloc_end - 1);
                if (ordered &&
                    ordered->file_offset + ordered->len > alloc_start &&
                    ordered->file_offset < alloc_end) {
                        btrfs_put_ordered_extent(ordered);
-                       unlock_extent(&BTRFS_I(inode)->io_tree,
-                                     alloc_start, locked_end, GFP_NOFS);
-                       btrfs_end_transaction(trans, BTRFS_I(inode)->root);
-
+                       unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                                            alloc_start, locked_end,
+                                            &cached_state, GFP_NOFS);
                        /*
                         * we can't wait on the range with the transaction
                         * running or with the extent lock held
@@ -5479,17 +6881,19 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                BUG_ON(IS_ERR(em) || !em);
                last_byte = min(extent_map_end(em), alloc_end);
                last_byte = (last_byte + mask) & ~mask;
-               if (em->block_start == EXTENT_MAP_HOLE) {
-                       ret = prealloc_file_range(trans, inode, cur_offset,
-                                       last_byte, locked_end + 1,
-                                       alloc_hint, mode);
+               if (em->block_start == EXTENT_MAP_HOLE ||
+                   (cur_offset >= inode->i_size &&
+                    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+                       ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
+                                                       last_byte - cur_offset,
+                                                       1 << inode->i_blkbits,
+                                                       offset + len,
+                                                       &alloc_hint);
                        if (ret < 0) {
                                free_extent_map(em);
                                break;
                        }
                }
-               if (em->block_start <= EXTENT_MAP_LAST_BYTE)
-                       alloc_hint = em->block_start;
                free_extent_map(em);
 
                cur_offset = last_byte;
@@ -5498,12 +6902,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                        break;
                }
        }
-       unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-                     GFP_NOFS);
+       unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+                            &cached_state, GFP_NOFS);
 
-       btrfs_end_transaction(trans, BTRFS_I(inode)->root);
-out_free:
-       btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start);
+       btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 out:
        mutex_unlock(&inode->i_mutex);
        return ret;
@@ -5566,6 +6968,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
        .readpage_io_failed_hook = btrfs_io_failed_hook,
        .set_bit_hook = btrfs_set_bit_hook,
        .clear_bit_hook = btrfs_clear_bit_hook,
+       .merge_extent_hook = btrfs_merge_extent_hook,
+       .split_extent_hook = btrfs_split_extent_hook,
 };
 
 /*
@@ -5632,6 +7036,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
        .removexattr    = btrfs_removexattr,
 };
 
-struct dentry_operations btrfs_dentry_operations = {
+const struct dentry_operations btrfs_dentry_operations = {
        .d_delete       = btrfs_dentry_delete,
 };