NFSv4: Eliminate nfs4_path_walk()
[safe/jmp/linux-2.6] / fs / btrfs / relocation.c
index b23dc20..e558dd9 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/rbtree.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -121,6 +122,15 @@ struct inodevec {
        int nr;
 };
 
+#define MAX_EXTENTS 128
+
+struct file_extent_cluster {
+       u64 start;
+       u64 end;
+       u64 boundary[MAX_EXTENTS];
+       unsigned int nr;
+};
+
 struct reloc_control {
        /* block group to relocate */
        struct btrfs_block_group_cache *block_group;
@@ -161,14 +171,14 @@ struct async_merge {
 
 static void mapping_tree_init(struct mapping_tree *tree)
 {
-       tree->rb_root.rb_node = NULL;
+       tree->rb_root = RB_ROOT;
        spin_lock_init(&tree->lock);
 }
 
 static void backref_cache_init(struct backref_cache *cache)
 {
        int i;
-       cache->rb_root.rb_node = NULL;
+       cache->rb_root = RB_ROOT;
        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
                INIT_LIST_HEAD(&cache->pending[i]);
        spin_lock_init(&cache->lock);
@@ -670,6 +680,8 @@ again:
                        err = ret;
                        goto out;
                }
+               if (ret > 0 && path2->slots[level] > 0)
+                       path2->slots[level]--;
 
                eb = path2->nodes[level];
                WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
@@ -1550,6 +1562,20 @@ static int invalidate_extent_cache(struct btrfs_root *root,
        return 0;
 }
 
+static void put_inodes(struct list_head *list)
+{
+       struct inodevec *ivec;
+       while (!list_empty(list)) {
+               ivec = list_entry(list->next, struct inodevec, list);
+               list_del(&ivec->list);
+               while (ivec->nr > 0) {
+                       ivec->nr--;
+                       iput(ivec->inode[ivec->nr]);
+               }
+               kfree(ivec);
+       }
+}
+
 static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key)
 
@@ -1609,6 +1635,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
                BUG_ON(level == 0);
                path->lowest_level = level;
                ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
+               path->lowest_level = 0;
                if (ret < 0) {
                        btrfs_free_path(path);
                        return ret;
@@ -1711,6 +1738,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 
                btrfs_btree_balance_dirty(root, nr);
 
+               /*
+                * put inodes outside transaction, otherwise we may deadlock.
+                */
+               put_inodes(&inode_list);
+
                if (replaced && rc->stage == UPDATE_DATA_PTRS)
                        invalidate_extent_cache(root, &key, &next_key);
        }
@@ -1740,19 +1772,7 @@ out:
 
        btrfs_btree_balance_dirty(root, nr);
 
-       /*
-        * put inodes while we aren't holding the tree locks
-        */
-       while (!list_empty(&inode_list)) {
-               struct inodevec *ivec;
-               ivec = list_entry(inode_list.next, struct inodevec, list);
-               list_del(&ivec->list);
-               while (ivec->nr > 0) {
-                       ivec->nr--;
-                       iput(ivec->inode[ivec->nr]);
-               }
-               kfree(ivec);
-       }
+       put_inodes(&inode_list);
 
        if (replaced && rc->stage == UPDATE_DATA_PTRS)
                invalidate_extent_cache(root, &key, &next_key);
@@ -1788,7 +1808,7 @@ static void merge_func(struct btrfs_work *work)
                btrfs_end_transaction(trans, root);
        }
 
-       btrfs_drop_dead_root(reloc_root);
+       btrfs_drop_snapshot(reloc_root, 0);
 
        if (atomic_dec_and_test(async->num_pending))
                complete(async->done);
@@ -2075,9 +2095,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 
                        ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
                        BUG_ON(ret);
-
-                       btrfs_tree_unlock(eb);
-                       free_extent_buffer(eb);
                }
                if (!lowest) {
                        btrfs_tree_unlock(upper->eb);
@@ -2180,7 +2197,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
                                struct reloc_control *rc)
 {
        if (test_range_bit(&rc->processed_blocks, bytenr,
-                          bytenr + blocksize - 1, EXTENT_DIRTY, 1))
+                          bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
                return 1;
        return 0;
 }
@@ -2529,51 +2546,94 @@ out:
 }
 
 static noinline_for_stack
-int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
+int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
+                        u64 block_start)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct extent_map *em;
+       int ret = 0;
+
+       em = alloc_extent_map(GFP_NOFS);
+       if (!em)
+               return -ENOMEM;
+
+       em->start = start;
+       em->len = end + 1 - start;
+       em->block_len = em->len;
+       em->block_start = block_start;
+       em->bdev = root->fs_info->fs_devices->latest_bdev;
+       set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+       lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+       while (1) {
+               write_lock(&em_tree->lock);
+               ret = add_extent_mapping(em_tree, em);
+               write_unlock(&em_tree->lock);
+               if (ret != -EEXIST) {
+                       free_extent_map(em);
+                       break;
+               }
+               btrfs_drop_extent_cache(inode, start, end, 0);
+       }
+       unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+       return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+                                       struct file_extent_cluster *cluster)
 {
        u64 page_start;
        u64 page_end;
-       unsigned long i;
-       unsigned long first_index;
+       u64 offset = BTRFS_I(inode)->index_cnt;
+       unsigned long index;
        unsigned long last_index;
-       unsigned int total_read = 0;
-       unsigned int total_dirty = 0;
+       unsigned int dirty_page = 0;
        struct page *page;
        struct file_ra_state *ra;
-       struct btrfs_ordered_extent *ordered;
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+       int nr = 0;
        int ret = 0;
 
+       if (!cluster->nr)
+               return 0;
+
        ra = kzalloc(sizeof(*ra), GFP_NOFS);
        if (!ra)
                return -ENOMEM;
 
+       index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+       last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+
        mutex_lock(&inode->i_mutex);
-       first_index = start >> PAGE_CACHE_SHIFT;
-       last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
 
-       /* make sure the dirty trick played by the caller work */
-       ret = invalidate_inode_pages2_range(inode->i_mapping,
-                                           first_index, last_index);
+       i_size_write(inode, cluster->end + 1 - offset);
+       ret = setup_extent_mapping(inode, cluster->start - offset,
+                                  cluster->end - offset, cluster->start);
        if (ret)
                goto out_unlock;
 
        file_ra_state_init(ra, inode->i_mapping);
 
-       for (i = first_index ; i <= last_index; i++) {
-               if (total_read % ra->ra_pages == 0) {
-                       btrfs_force_ra(inode->i_mapping, ra, NULL, i,
-                               min(last_index, ra->ra_pages + i - 1));
-               }
-               total_read++;
-again:
-               if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
-                       BUG_ON(1);
-               page = grab_cache_page(inode->i_mapping, i);
+       WARN_ON(cluster->start != cluster->boundary[0]);
+       while (index <= last_index) {
+               page = find_lock_page(inode->i_mapping, index);
                if (!page) {
-                       ret = -ENOMEM;
-                       goto out_unlock;
+                       page_cache_sync_readahead(inode->i_mapping,
+                                                 ra, NULL, index,
+                                                 last_index + 1 - index);
+                       page = grab_cache_page(inode->i_mapping, index);
+                       if (!page) {
+                               ret = -ENOMEM;
+                               goto out_unlock;
+                       }
+               }
+
+               if (PageReadahead(page)) {
+                       page_cache_async_readahead(inode->i_mapping,
+                                                  ra, NULL, page, index,
+                                                  last_index + 1 - index);
                }
+
                if (!PageUptodate(page)) {
                        btrfs_readpage(NULL, page);
                        lock_page(page);
@@ -2584,75 +2644,79 @@ again:
                                goto out_unlock;
                        }
                }
-               wait_on_page_writeback(page);
 
                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
                page_end = page_start + PAGE_CACHE_SIZE - 1;
-               lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
-               ordered = btrfs_lookup_ordered_extent(inode, page_start);
-               if (ordered) {
-                       unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-                       unlock_page(page);
-                       page_cache_release(page);
-                       btrfs_start_ordered_extent(inode, ordered, 1);
-                       btrfs_put_ordered_extent(ordered);
-                       goto again;
-               }
+
+               lock_extent(&BTRFS_I(inode)->io_tree,
+                           page_start, page_end, GFP_NOFS);
+
                set_page_extent_mapped(page);
 
-               if (i == first_index)
-                       set_extent_bits(io_tree, page_start, page_end,
+               if (nr < cluster->nr &&
+                   page_start + offset == cluster->boundary[nr]) {
+                       set_extent_bits(&BTRFS_I(inode)->io_tree,
+                                       page_start, page_end,
                                        EXTENT_BOUNDARY, GFP_NOFS);
-               btrfs_set_extent_delalloc(inode, page_start, page_end);
+                       nr++;
+               }
+               btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
 
                set_page_dirty(page);
-               total_dirty++;
+               dirty_page++;
 
-               unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+               unlock_extent(&BTRFS_I(inode)->io_tree,
+                             page_start, page_end, GFP_NOFS);
                unlock_page(page);
                page_cache_release(page);
+
+               index++;
+               if (nr < cluster->nr &&
+                   page_end + 1 + offset == cluster->boundary[nr]) {
+                       balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+                                                          dirty_page);
+                       dirty_page = 0;
+               }
        }
+       if (dirty_page) {
+               balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+                                                  dirty_page);
+       }
+       WARN_ON(nr != cluster->nr);
 out_unlock:
        mutex_unlock(&inode->i_mutex);
        kfree(ra);
-       balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
        return ret;
 }
 
 static noinline_for_stack
-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
+int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
+                        struct file_extent_cluster *cluster)
 {
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-       struct extent_map *em;
-       u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
-       u64 end = start + extent_key->offset - 1;
-
-       em = alloc_extent_map(GFP_NOFS);
-       em->start = start;
-       em->len = extent_key->offset;
-       em->block_len = extent_key->offset;
-       em->block_start = extent_key->objectid;
-       em->bdev = root->fs_info->fs_devices->latest_bdev;
-       set_bit(EXTENT_FLAG_PINNED, &em->flags);
+       int ret;
 
-       /* setup extent map to cheat btrfs_readpage */
-       lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
-       while (1) {
-               int ret;
-               spin_lock(&em_tree->lock);
-               ret = add_extent_mapping(em_tree, em);
-               spin_unlock(&em_tree->lock);
-               if (ret != -EEXIST) {
-                       free_extent_map(em);
-                       break;
-               }
-               btrfs_drop_extent_cache(inode, start, end, 0);
+       if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
+               ret = relocate_file_extent_cluster(inode, cluster);
+               if (ret)
+                       return ret;
+               cluster->nr = 0;
        }
-       unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
 
-       return relocate_inode_pages(inode, start, extent_key->offset);
+       if (!cluster->nr)
+               cluster->start = extent_key->objectid;
+       else
+               BUG_ON(cluster->nr >= MAX_EXTENTS);
+       cluster->end = extent_key->objectid + extent_key->offset - 1;
+       cluster->boundary[cluster->nr] = extent_key->objectid;
+       cluster->nr++;
+
+       if (cluster->nr >= MAX_EXTENTS) {
+               ret = relocate_file_extent_cluster(inode, cluster);
+               if (ret)
+                       return ret;
+               cluster->nr = 0;
+       }
+       return 0;
 }
 
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -3198,10 +3262,12 @@ static int check_extent_flags(u64 flags)
        return 0;
 }
 
+
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 {
        struct rb_root blocks = RB_ROOT;
        struct btrfs_key key;
+       struct file_extent_cluster *cluster;
        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_path *path;
        struct btrfs_extent_item *ei;
@@ -3211,9 +3277,18 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
        int ret;
        int err = 0;
 
+       cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
+       if (!cluster)
+               return -ENOMEM;
+
        path = btrfs_alloc_path();
-       if (!path)
+       if (!path) {
+               kfree(cluster);
                return -ENOMEM;
+       }
+
+       rc->extents_found = 0;
+       rc->extents_skipped = 0;
 
        rc->search_start = rc->block_group->key.objectid;
        clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
@@ -3301,14 +3376,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
                }
 
                nr = trans->blocks_used;
-               btrfs_end_transaction_throttle(trans, rc->extent_root);
+               btrfs_end_transaction(trans, rc->extent_root);
                trans = NULL;
                btrfs_btree_balance_dirty(rc->extent_root, nr);
 
                if (rc->stage == MOVE_DATA_EXTENTS &&
                    (flags & BTRFS_EXTENT_FLAG_DATA)) {
                        rc->found_file_extent = 1;
-                       ret = relocate_data_extent(rc->data_inode, &key);
+                       ret = relocate_data_extent(rc->data_inode,
+                                                  &key, cluster);
                        if (ret < 0) {
                                err = ret;
                                break;
@@ -3323,6 +3399,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
                btrfs_btree_balance_dirty(rc->extent_root, nr);
        }
 
+       if (!err) {
+               ret = relocate_file_extent_cluster(rc->data_inode, cluster);
+               if (ret < 0)
+                       err = ret;
+       }
+
+       kfree(cluster);
+
        rc->create_reloc_root = 0;
        smp_mb();
 
@@ -3343,8 +3427,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 }
 
 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root,
-                                u64 objectid, u64 size)
+                                struct btrfs_root *root, u64 objectid)
 {
        struct btrfs_path *path;
        struct btrfs_inode_item *item;
@@ -3363,7 +3446,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
        memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
        btrfs_set_inode_generation(leaf, item, 1);
-       btrfs_set_inode_size(leaf, item, size);
+       btrfs_set_inode_size(leaf, item, 0);
        btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
        btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
        btrfs_mark_buffer_dirty(leaf);
@@ -3399,18 +3482,13 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
        if (err)
                goto out;
 
-       err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
-       BUG_ON(err);
-
-       err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
-                                      group->key.offset, 0, group->key.offset,
-                                      0, 0, 0);
+       err = __insert_orphan_inode(trans, root, objectid);
        BUG_ON(err);
 
        key.objectid = objectid;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
-       inode = btrfs_iget(root->fs_info->sb, &key, root);
+       inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
        BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
        BTRFS_I(inode)->index_cnt = group->key.objectid;
 
@@ -3450,7 +3528,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
        BUG_ON(!rc->block_group);
 
        btrfs_init_workers(&rc->workers, "relocate",
-                          fs_info->thread_pool_size);
+                          fs_info->thread_pool_size, NULL);
 
        rc->extent_root = extent_root;
        btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
@@ -3466,18 +3544,19 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
               (unsigned long long)rc->block_group->key.objectid,
               (unsigned long long)rc->block_group->flags);
 
-       btrfs_start_delalloc_inodes(fs_info->tree_root);
-       btrfs_wait_ordered_extents(fs_info->tree_root, 0);
+       btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+       btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
 
        while (1) {
-               mutex_lock(&fs_info->cleaner_mutex);
-               btrfs_clean_old_snapshots(fs_info->tree_root);
-               mutex_unlock(&fs_info->cleaner_mutex);
-
                rc->extents_found = 0;
                rc->extents_skipped = 0;
 
+               mutex_lock(&fs_info->cleaner_mutex);
+
+               btrfs_clean_old_snapshots(fs_info->tree_root);
                ret = relocate_block_group(rc);
+
+               mutex_unlock(&fs_info->cleaner_mutex);
                if (ret < 0) {
                        err = ret;
                        break;
@@ -3509,10 +3588,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
                }
        }
 
-       filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
-                                rc->block_group->key.objectid,
-                                rc->block_group->key.objectid +
-                                rc->block_group->key.offset - 1);
+       filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
+                                    rc->block_group->key.objectid,
+                                    rc->block_group->key.objectid +
+                                    rc->block_group->key.offset - 1);
 
        WARN_ON(rc->block_group->pinned > 0);
        WARN_ON(rc->block_group->reserved > 0);
@@ -3525,6 +3604,26 @@ out:
        return err;
 }
 
+static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
+{
+       struct btrfs_trans_handle *trans;
+       int ret;
+
+       trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
+
+       memset(&root->root_item.drop_progress, 0,
+               sizeof(root->root_item.drop_progress));
+       root->root_item.drop_level = 0;
+       btrfs_set_root_refs(&root->root_item, 0);
+       ret = btrfs_update_root(trans, root->fs_info->tree_root,
+                               &root->root_key, &root->root_item);
+       BUG_ON(ret);
+
+       ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
+       BUG_ON(ret);
+       return 0;
+}
+
 /*
  * recover relocation interrupted by system crash.
  *
@@ -3584,8 +3683,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
                        fs_root = read_fs_root(root->fs_info,
                                               reloc_root->root_key.offset);
                        if (IS_ERR(fs_root)) {
-                               err = PTR_ERR(fs_root);
-                               goto out;
+                               ret = PTR_ERR(fs_root);
+                               if (ret != -ENOENT) {
+                                       err = ret;
+                                       goto out;
+                               }
+                               mark_garbage_root(reloc_root);
                        }
                }
 
@@ -3608,7 +3711,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
        mapping_tree_init(&rc->reloc_root_tree);
        INIT_LIST_HEAD(&rc->reloc_roots);
        btrfs_init_workers(&rc->workers, "relocate",
-                          root->fs_info->thread_pool_size);
+                          root->fs_info->thread_pool_size, NULL);
        rc->extent_root = root->fs_info->extent_root;
 
        set_reloc_control(rc);
@@ -3662,6 +3765,8 @@ out:
                                       BTRFS_DATA_RELOC_TREE_OBJECTID);
                if (IS_ERR(fs_root))
                        err = PTR_ERR(fs_root);
+               else
+                       btrfs_orphan_cleanup(fs_root);
        }
        return err;
 }