Btrfs: extent_map and data=ordered fixes for space balancing
author Zheng Yan <zheng.yan@oracle.com>
Fri, 26 Sep 2008 14:05:38 +0000 (10:05 -0400)
committer Chris Mason <chris.mason@oracle.com>
Fri, 26 Sep 2008 14:05:38 +0000 (10:05 -0400)
* Add an EXTENT_BOUNDARY state bit to keep the writepage code
from merging data extents that are in the process of being
relocated.  This allows us to do accounting for them properly.

* The balancing code relocates data extents independent of the underlying
inode.  The extent_map code was modified to properly account for
things moving around (invalidating extent_map caches in the inode).

* Don't take the drop_mutex in the create_subvol ioctl.  It isn't
required.

* Fix walking of the ordered extent list to avoid races with sys_unlink

* Change the lock ordering rules.  Transaction start goes outside
the drop_mutex.  This allows btrfs_commit_transaction to directly
drop the relocation trees.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file.c
fs/btrfs/inode-map.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ordered-data.c
fs/btrfs/transaction.c

index 50aea8c..f9cd409 100644 (file)
@@ -290,7 +290,6 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
                    struct extent_buffer **cow_ret, u64 prealloc_dest)
 {
        u64 search_start;
-       u64 header_trans;
        int ret;
 
        if (trans->transaction != root->fs_info->running_transaction) {
@@ -304,9 +303,9 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
                WARN_ON(1);
        }
 
-       header_trans = btrfs_header_generation(buf);
        spin_lock(&root->fs_info->hash_lock);
-       if (header_trans == trans->transid &&
+       if (btrfs_header_generation(buf) == trans->transid &&
+           btrfs_header_owner(buf) == root->root_key.objectid &&
            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
                *cow_ret = buf;
                spin_unlock(&root->fs_info->hash_lock);
@@ -1300,6 +1299,7 @@ again:
                        /* is a cow on this block not required */
                        spin_lock(&root->fs_info->hash_lock);
                        if (btrfs_header_generation(b) == trans->transid &&
+                           btrfs_header_owner(b) == root->root_key.objectid &&
                            !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
                                spin_unlock(&root->fs_info->hash_lock);
                                goto cow_done;
@@ -1396,7 +1396,8 @@ cow_done:
 
                        /* this is only true while dropping a snapshot */
                        if (level == lowest_level) {
-                               break;
+                               ret = 0;
+                               goto done;
                        }
 
                        blocknr = btrfs_node_blockptr(b, slot);
index b9f9f81..3e62a1b 100644 (file)
@@ -1486,6 +1486,9 @@ static inline struct dentry *fdentry(struct file *file)
 
 /* extent-tree.c */
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, u64 bytenr,
+                           u64 num_bytes, u32 *refs);
 int btrfs_update_pinned_extents(struct btrfs_root *root,
                                u64 bytenr, u64 num, int pin);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1812,6 +1815,8 @@ void btrfs_destroy_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+                           struct btrfs_root *root, int wait);
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
                                struct btrfs_root *root);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
@@ -1824,13 +1829,17 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 int btrfs_update_inode(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct inode *inode);
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
+int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
+void btrfs_orphan_cleanup(struct btrfs_root *root);
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 
 /* file.c */
 int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end);
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+                           int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
 extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
index e3a25be..8bd1b40 100644 (file)
@@ -292,7 +292,7 @@ static int merge_state(struct extent_io_tree *tree,
        struct extent_state *other;
        struct rb_node *other_node;
 
-       if (state->state & EXTENT_IOBITS)
+       if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
                return 0;
 
        other_node = rb_prev(&state->rb_node);
@@ -1070,7 +1070,8 @@ search_again:
 
        while(1) {
                state = rb_entry(node, struct extent_state, rb_node);
-               if (found && state->start != cur_start) {
+               if (found && (state->start != cur_start ||
+                             (state->state & EXTENT_BOUNDARY))) {
                        goto out;
                }
                if (!(state->state & EXTENT_DELALLOC)) {
@@ -1078,7 +1079,7 @@ search_again:
                                *end = state->end;
                        goto out;
                }
-               if (!found) {
+               if (!found && !(state->state & EXTENT_BOUNDARY)) {
                        struct extent_state *prev_state;
                        struct rb_node *prev_node = node;
                        while(1) {
@@ -1088,7 +1089,11 @@ search_again:
                                prev_state = rb_entry(prev_node,
                                                      struct extent_state,
                                                      rb_node);
-                               if (!(prev_state->state & EXTENT_DELALLOC))
+                               if ((prev_state->end + 1 != state->start) ||
+                                   !(prev_state->state & EXTENT_DELALLOC))
+                                       break;
+                               if ((cur_start - prev_state->start) * 2 >
+                                    max_bytes)
                                        break;
                                state = prev_state;
                                node = prev_node;
index 3cb411a..c9d1908 100644 (file)
@@ -15,6 +15,7 @@
 #define EXTENT_BUFFER_FILLED (1 << 8)
 #define EXTENT_ORDERED (1 << 9)
 #define EXTENT_ORDERED_METADATA (1 << 10)
+#define EXTENT_BOUNDARY (1 << 11)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /*
index 8856570..1b7e51a 100644 (file)
@@ -294,7 +294,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
                                                       last_pos_in_file,
                                                       0, 0, hole_size, 0);
                        btrfs_drop_extent_cache(inode, last_pos_in_file,
-                                       last_pos_in_file + hole_size -1);
+                                       last_pos_in_file + hole_size - 1, 0);
                        mutex_unlock(&BTRFS_I(inode)->extent_mutex);
                        btrfs_check_file(root, inode);
                }
@@ -337,7 +337,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
                inline_size -= start_pos;
                err = insert_inline_extent(trans, root, inode, start_pos,
                                           inline_size, pages, 0, num_pages);
-               btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1);
+               btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
                BUG_ON(err);
                mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
@@ -362,7 +362,8 @@ out_unlock:
        return err;
 }
 
-int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+                           int skip_pinned)
 {
        struct extent_map *em;
        struct extent_map *split = NULL;
@@ -371,6 +372,7 @@ int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
        u64 len = end - start + 1;
        int ret;
        int testend = 1;
+       unsigned long flags;
 
        WARN_ON(end < start);
        if (end == (u64)-1) {
@@ -389,6 +391,23 @@ int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
                        spin_unlock(&em_tree->lock);
                        break;
                }
+               flags = em->flags;
+               if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+                       spin_unlock(&em_tree->lock);
+                       if (em->start <= start &&
+                           (!testend || em->start + em->len >= start + len)) {
+                               free_extent_map(em);
+                               break;
+                       }
+                       if (start < em->start) {
+                               len = em->start - start;
+                       } else {
+                               len = start + len - (em->start + em->len);
+                               start = em->start + em->len;
+                       }
+                       free_extent_map(em);
+                       continue;
+               }
                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
                remove_extent_mapping(em_tree, em);
 
@@ -398,7 +417,7 @@ int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
                        split->len = start - em->start;
                        split->block_start = em->block_start;
                        split->bdev = em->bdev;
-                       split->flags = em->flags;
+                       split->flags = flags;
                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret);
                        free_extent_map(split);
@@ -412,7 +431,7 @@ int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
                        split->start = start + len;
                        split->len = em->start + em->len - (start + len);
                        split->bdev = em->bdev;
-                       split->flags = em->flags;
+                       split->flags = flags;
 
                        split->block_start = em->block_start + diff;
 
@@ -541,7 +560,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
        int recow;
        int ret;
 
-       btrfs_drop_extent_cache(inode, start, end - 1);
+       btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
        path = btrfs_alloc_path();
        if (!path)
index cd6171c..80038c5 100644 (file)
@@ -117,10 +117,14 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
                                        *objectid = last_ino;
                                        goto found;
                                }
+                       } else if (key.objectid > search_start) {
+                               *objectid = search_start;
+                               goto found;
                        }
                }
                if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
                        break;
+
                start_found = 1;
                last_ino = key.objectid + 1;
                path->slots[0]++;
index 48a3dc0..4516fbf 100644 (file)
@@ -135,7 +135,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 
        BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
        mutex_lock(&BTRFS_I(inode)->extent_mutex);
-       btrfs_drop_extent_cache(inode, start, start + num_bytes - 1);
+       btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
        mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
        while(num_bytes > 0) {
@@ -163,7 +163,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
                                break;
                        }
                        btrfs_drop_extent_cache(inode, start,
-                                               start + ins.offset - 1);
+                                               start + ins.offset - 1, 0);
                }
                mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
@@ -587,7 +587,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 
        btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
                                ordered_extent->file_offset +
-                               ordered_extent->len - 1);
+                               ordered_extent->len - 1, 0);
        mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
        ins.objectid = ordered_extent->start;
@@ -880,7 +880,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
        int ret = 0, nr_unlink = 0, nr_truncate = 0;
 
        /* don't do orphan cleanup if the fs is readonly. */
-       if (root->inode->i_sb->s_flags & MS_RDONLY)
+       if (root->fs_info->sb->s_flags & MS_RDONLY)
                return;
 
        path = btrfs_alloc_path();
@@ -892,8 +892,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
        btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
        key.offset = (u64)-1;
 
-       trans = btrfs_start_transaction(root, 1);
-       btrfs_set_trans_block_group(trans, root->inode);
 
        while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -933,7 +931,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
                 * crossing root thing.  we store the inode number in the
                 * offset of the orphan item.
                 */
-               inode = btrfs_iget_locked(root->inode->i_sb,
+               inode = btrfs_iget_locked(root->fs_info->sb,
                                          found_key.offset, root);
                if (!inode)
                        break;
@@ -965,7 +963,9 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
                 * do a destroy_inode
                 */
                if (is_bad_inode(inode)) {
+                       trans = btrfs_start_transaction(root, 1);
                        btrfs_orphan_del(trans, inode);
+                       btrfs_end_transaction(trans, root);
                        iput(inode);
                        continue;
                }
@@ -988,7 +988,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
                printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
 
        btrfs_free_path(path);
-       btrfs_end_transaction(trans, root);
 }
 
 void btrfs_read_locked_inode(struct inode *inode)
@@ -1343,8 +1342,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        u64 mask = root->sectorsize - 1;
 
        if (root->ref_cows)
-               btrfs_drop_extent_cache(inode,
-                                       new_size & (~mask), (u64)-1);
+               btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
        path = btrfs_alloc_path();
        path->reada = -1;
        BUG_ON(!path);
@@ -1677,7 +1675,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
                                                       hole_start, 0, 0,
                                                       hole_size, 0);
                        btrfs_drop_extent_cache(inode, hole_start,
-                                               (u64)-1);
+                                               (u64)-1, 0);
                        btrfs_check_file(root, inode);
                }
                mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@@ -1843,6 +1841,24 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
                args->root == BTRFS_I(inode)->root);
 }
 
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+                           struct btrfs_root *root, int wait)
+{
+       struct inode *inode;
+       struct btrfs_iget_args args;
+       args.ino = objectid;
+       args.root = root;
+
+       if (wait) {
+               inode = ilookup5(s, objectid, btrfs_find_actor,
+                                (void *)&args);
+       } else {
+               inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
+                                       (void *)&args);
+       }
+       return inode;
+}
+
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
                                struct btrfs_root *root)
 {
@@ -3266,7 +3282,7 @@ void btrfs_destroy_inode(struct inode *inode)
                        btrfs_put_ordered_extent(ordered);
                }
        }
-       btrfs_drop_extent_cache(inode, 0, (u64)-1);
+       btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
@@ -3412,16 +3428,22 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 {
        struct list_head *head = &root->fs_info->delalloc_inodes;
        struct btrfs_inode *binode;
+       struct inode *inode;
        unsigned long flags;
 
        spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
        while(!list_empty(head)) {
                binode = list_entry(head->next, struct btrfs_inode,
                                    delalloc_inodes);
-               atomic_inc(&binode->vfs_inode.i_count);
+               inode = igrab(&binode->vfs_inode);
+               if (!inode)
+                       list_del_init(&binode->delalloc_inodes);
                spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
-               filemap_write_and_wait(binode->vfs_inode.i_mapping);
-               iput(&binode->vfs_inode);
+               if (inode) {
+                       filemap_write_and_wait(inode->i_mapping);
+                       iput(inode);
+               }
+               cond_resched();
                spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
        }
        spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
index 4c6e0c1..04de767 100644 (file)
@@ -444,12 +444,10 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root,
                goto out;
        }
 
-       mutex_lock(&root->fs_info->drop_mutex);
        if (root == root->fs_info->tree_root)
                ret = create_subvol(root, vol_args->name, namelen);
        else
                ret = create_snapshot(root, vol_args->name, namelen);
-       mutex_unlock(&root->fs_info->drop_mutex);
 out:
        kfree(vol_args);
        return ret;
index da6d43e..951eacf 100644 (file)
@@ -309,7 +309,6 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 {
        struct list_head splice;
        struct list_head *cur;
-       struct list_head *tmp;
        struct btrfs_ordered_extent *ordered;
        struct inode *inode;
 
@@ -317,37 +316,38 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 
        spin_lock(&root->fs_info->ordered_extent_lock);
        list_splice_init(&root->fs_info->ordered_extents, &splice);
-       list_for_each_safe(cur, tmp, &splice) {
+       while (!list_empty(&splice)) {
                cur = splice.next;
                ordered = list_entry(cur, struct btrfs_ordered_extent,
                                     root_extent_list);
                if (nocow_only &&
                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+                       list_move(&ordered->root_extent_list,
+                                 &root->fs_info->ordered_extents);
                        cond_resched_lock(&root->fs_info->ordered_extent_lock);
                        continue;
                }
 
                list_del_init(&ordered->root_extent_list);
                atomic_inc(&ordered->refs);
-               inode = ordered->inode;
 
                /*
-                * the inode can't go away until all the pages are gone
-                * and the pages won't go away while there is still
-                * an ordered extent and the ordered extent won't go
-                * away until it is off this list.  So, we can safely
-                * increment i_count here and call iput later
+                * the inode may be getting freed (in sys_unlink path).
                 */
-               atomic_inc(&inode->i_count);
+               inode = igrab(ordered->inode);
+
                spin_unlock(&root->fs_info->ordered_extent_lock);
 
-               btrfs_start_ordered_extent(inode, ordered, 1);
-               btrfs_put_ordered_extent(ordered);
-               iput(inode);
+               if (inode) {
+                       btrfs_start_ordered_extent(inode, ordered, 1);
+                       btrfs_put_ordered_extent(ordered);
+                       iput(inode);
+               } else {
+                       btrfs_put_ordered_extent(ordered);
+               }
 
                spin_lock(&root->fs_info->ordered_extent_lock);
        }
-       list_splice_init(&splice, &root->fs_info->ordered_extents);
        spin_unlock(&root->fs_info->ordered_extent_lock);
        return 0;
 }
index 656baef..8c83cf4 100644 (file)
@@ -109,6 +109,7 @@ noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
                        spin_lock_init(&dirty->root->node_lock);
                        spin_lock_init(&dirty->root->list_lock);
                        mutex_init(&dirty->root->objectid_mutex);
+                       mutex_init(&dirty->root->log_mutex);
                        INIT_LIST_HEAD(&dirty->root->dead_list);
                        dirty->root->node = root->commit_root;
                        dirty->root->commit_root = NULL;
@@ -590,13 +591,14 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
                root = dirty->latest_root;
                atomic_inc(&root->fs_info->throttles);
 
-               mutex_lock(&root->fs_info->drop_mutex);
                while(1) {
                        trans = btrfs_start_transaction(tree_root, 1);
+                       mutex_lock(&root->fs_info->drop_mutex);
                        ret = btrfs_drop_snapshot(trans, dirty->root);
                        if (ret != -EAGAIN) {
                                break;
                        }
+                       mutex_unlock(&root->fs_info->drop_mutex);
 
                        err = btrfs_update_root(trans,
                                        tree_root,
@@ -608,10 +610,8 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
                        ret = btrfs_end_transaction(trans, tree_root);
                        BUG_ON(ret);
 
-                       mutex_unlock(&root->fs_info->drop_mutex);
                        btrfs_btree_balance_dirty(tree_root, nr);
                        cond_resched();
-                       mutex_lock(&root->fs_info->drop_mutex);
                }
                BUG_ON(ret);
                atomic_dec(&root->fs_info->throttles);
@@ -689,7 +689,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 
        key.objectid = objectid;
-       key.offset = 1;
+       key.offset = trans->transid;
        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
        old = btrfs_lock_root_node(root);