Btrfs: add extra flushing for renames and truncates

author Chris Mason <chris.mason@oracle.com>

Tue, 31 Mar 2009 17:27:11 +0000 (13:27 -0400)

committer Chris Mason <chris.mason@oracle.com>

Tue, 31 Mar 2009 18:27:58 +0000 (14:27 -0400)
author Chris Mason <chris.mason@oracle.com>
Tue, 31 Mar 2009 17:27:11 +0000 (13:27 -0400)
committer Chris Mason <chris.mason@oracle.com>
Tue, 31 Mar 2009 18:27:58 +0000 (14:27 -0400)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h

index 3af4cfb..b30986f 100644 (file)
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
          */
         struct list_head delalloc_inodes;
  
+       /*
+        * list for tracking inodes that must be sent to disk before a
+        * rename or truncate commit
+        */
+       struct list_head ordered_operations;
+
         /* the space_info for where this inode's data allocations are done */
         struct btrfs_space_info *space_info;
  
@@ -122,6 +128,18 @@ struct btrfs_inode {
          */
         u64 last_unlink_trans;
  
+       /*
+        * ordered_data_close is set by truncate when a file that used
+        * to have good data has been truncated to zero.  When it is set
+        * the btrfs file release call will add this inode to the
+        * ordered operations list so that we make sure to flush out any
+        * new data the application may have written before commit.
+        *
+        * yes, its silly to have a single bitflag, but we might grow more
+        * of these.
+        */
+       unsigned ordered_data_close:1;
+
         struct inode vfs_inode;
  };
  
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h

index 2737fac..f48905e 100644 (file)
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
  
  #define BTRFS_MAX_LEVEL 8
  
+/*
+ * files bigger than this get some pre-flushing when they are added
+ * to the ordered operations list.  That way we limit the total
+ * work done by the commit
+ */
+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
+
  /* holds pointers to all of the tree roots */
  #define BTRFS_ROOT_TREE_OBJECTID 1ULL
  
@@ -727,6 +734,15 @@ struct btrfs_fs_info {
         struct mutex volume_mutex;
         struct mutex tree_reloc_mutex;
  
+       /*
+        * this protects the ordered operations list only while we are
+        * processing all of the entries on it.  This way we make
+        * sure the commit code doesn't find the list temporarily empty
+        * because another function happens to be doing non-waiting preflush
+        * before jumping into the main commit.
+        */
+       struct mutex ordered_operations_mutex;
+
         struct list_head trans_list;
         struct list_head hashers;
         struct list_head dead_roots;
@@ -741,10 +757,29 @@ struct btrfs_fs_info {
          * ordered extents
          */
         spinlock_t ordered_extent_lock;
+
+       /*
+        * all of the data=ordered extents pending writeback
+        * these can span multiple transactions and basically include
+        * every dirty data page that isn't from nodatacow
+        */
         struct list_head ordered_extents;
+
+       /*
+        * all of the inodes that have delalloc bytes.  It is possible for
+        * this list to be empty even when there is still dirty data=ordered
+        * extents waiting to finish IO.
+        */
         struct list_head delalloc_inodes;
  
         /*
+        * special rename and truncate targets that must be on disk before
+        * we're allowed to commit.  This is basically the ext3 style
+        * data=ordered list.
+        */
+       struct list_head ordered_operations;
+
+       /*
          * there is a pool of worker threads for checksumming during writes
          * and a pool for checksumming after reads.  This is because readers
          * can run with FS locks held, and the writers may be waiting for
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c

index 9244cd7..1747dfd 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1572,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         INIT_LIST_HEAD(&fs_info->dead_roots);
         INIT_LIST_HEAD(&fs_info->hashers);
         INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+       INIT_LIST_HEAD(&fs_info->ordered_operations);
         spin_lock_init(&fs_info->delalloc_lock);
         spin_lock_init(&fs_info->new_trans_lock);
         spin_lock_init(&fs_info->ref_cache_lock);
@@ -1643,6 +1644,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         insert_inode_hash(fs_info->btree_inode);
  
         mutex_init(&fs_info->trans_mutex);
+       mutex_init(&fs_info->ordered_operations_mutex);
         mutex_init(&fs_info->tree_log_mutex);
         mutex_init(&fs_info->drop_mutex);
         mutex_init(&fs_info->pinned_mutex);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c

index 32d10a6..9c9fb46 100644 (file)
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1161,6 +1161,20 @@ out_nolock:
                 page_cache_release(pinned[1]);
         *ppos = pos;
  
+       /*
+        * we want to make sure fsync finds this change
+        * but we haven't joined a transaction running right now.
+        *
+        * Later on, someone is sure to update the inode and get the
+        * real transid recorded.
+        *
+        * We set last_trans now to the fs_info generation + 1,
+        * this will either be one more than the running transaction
+        * or the generation used for the next transaction if there isn't
+        * one running right now.
+        */
+       BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+
         if (num_written > 0 && will_write) {
                 struct btrfs_trans_handle *trans;
  
@@ -1194,6 +1208,18 @@ out_nolock:
  
  int btrfs_release_file(struct inode *inode, struct file *filp)
  {
+       /*
+        * ordered_data_close is set by settattr when we are about to truncate
+        * a file from a non-zero size to a zero size.  This tries to
+        * flush down new bytes that may have been written if the
+        * application were using truncate to replace a file in place.
+        */
+       if (BTRFS_I(inode)->ordered_data_close) {
+               BTRFS_I(inode)->ordered_data_close = 0;
+               btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+               if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+                       filemap_flush(inode->i_mapping);
+       }
         if (filp->private_data)
                 btrfs_ioctl_trans_end(filp);
         return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index bffd79f..1cff528 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2907,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
         if (err)
                 return err;
  
-       if (S_ISREG(inode->i_mode) &&
-           attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
-               err = btrfs_cont_expand(inode, attr->ia_size);
-               if (err)
-                       return err;
+       if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+               if (attr->ia_size > inode->i_size) {
+                       err = btrfs_cont_expand(inode, attr->ia_size);
+                       if (err)
+                               return err;
+               } else if (inode->i_size > 0 &&
+                          attr->ia_size == 0) {
+
+                       /* we're truncating a file that used to have good
+                        * data down to zero.  Make sure it gets into
+                        * the ordered flush list so that any new writes
+                        * get down to disk quickly.
+                        */
+                       BTRFS_I(inode)->ordered_data_close = 1;
+               }
         }
  
         err = inode_setattr(inode, attr);
@@ -3050,6 +3060,7 @@ static noinline void init_btrfs_i(struct inode *inode)
         extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
                              inode->i_mapping, GFP_NOFS);
         INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
+       INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
         btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
         mutex_init(&BTRFS_I(inode)->extent_mutex);
         mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -4419,6 +4430,8 @@ again:
         }
         ClearPageChecked(page);
         set_page_dirty(page);
+
+       BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
  
  out_unlock:
@@ -4444,6 +4457,27 @@ static void btrfs_truncate(struct inode *inode)
         btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
  
         trans = btrfs_start_transaction(root, 1);
+
+       /*
+        * setattr is responsible for setting the ordered_data_close flag,
+        * but that is only tested during the last file release.  That
+        * could happen well after the next commit, leaving a great big
+        * window where new writes may get lost if someone chooses to write
+        * to this file after truncating to zero
+        *
+        * The inode doesn't have any dirty data here, and so if we commit
+        * this is a noop.  If someone immediately starts writing to the inode
+        * it is very likely we'll catch some of their writes in this
+        * transaction, and the commit will find this file on the ordered
+        * data list with good things to send down.
+        *
+        * This is a best effort solution, there is still a window where
+        * using truncate to replace the contents of the file will
+        * end up with a zero length file after a crash.
+        */
+       if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+               btrfs_add_ordered_operation(trans, root, inode);
+
         btrfs_set_trans_block_group(trans, inode);
         btrfs_i_size_write(inode, inode->i_size);
  
@@ -4520,12 +4554,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
         ei->i_acl = BTRFS_ACL_NOT_CACHED;
         ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
         INIT_LIST_HEAD(&ei->i_orphan);
+       INIT_LIST_HEAD(&ei->ordered_operations);
         return &ei->vfs_inode;
  }
  
  void btrfs_destroy_inode(struct inode *inode)
  {
         struct btrfs_ordered_extent *ordered;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+
         WARN_ON(!list_empty(&inode->i_dentry));
         WARN_ON(inode->i_data.nrpages);
  
@@ -4536,13 +4573,24 @@ void btrfs_destroy_inode(struct inode *inode)
             BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
                 posix_acl_release(BTRFS_I(inode)->i_default_acl);
  
-       spin_lock(&BTRFS_I(inode)->root->list_lock);
+       /*
+        * Make sure we're properly removed from the ordered operation
+        * lists.
+        */
+       smp_mb();
+       if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
+               spin_lock(&root->fs_info->ordered_extent_lock);
+               list_del_init(&BTRFS_I(inode)->ordered_operations);
+               spin_unlock(&root->fs_info->ordered_extent_lock);
+       }
+
+       spin_lock(&root->list_lock);
         if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
                 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
                        " list\n", inode->i_ino);
                 dump_stack();
         }
-       spin_unlock(&BTRFS_I(inode)->root->list_lock);
+       spin_unlock(&root->list_lock);
  
         while (1) {
                 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4667,9 +4715,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         if (ret)
                 goto out_unlock;
  
+       /*
+        * we're using rename to replace one file with another.
+        * and the replacement file is large.  Start IO on it now so
+        * we don't add too much work to the end of the transaction
+        */
+       if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
+           new_inode->i_size &&
+           old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+               filemap_flush(old_inode->i_mapping);
+
         trans = btrfs_start_transaction(root, 1);
  
         /*
+        * make sure the inode gets flushed if it is replacing
+        * something.
+        */
+       if (new_inode && new_inode->i_size &&
+           old_inode && S_ISREG(old_inode->i_mode)) {
+               btrfs_add_ordered_operation(trans, root, old_inode);
+       }
+
+       /*
          * this is an ugly little race, but the rename is required to make
          * sure that if we crash, the inode is either at the old name
          * or the new one.  pinning the log transaction lets us make sure
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c

index 77c2411..53c87b1 100644 (file)
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
  
         spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
         list_del_init(&entry->root_extent_list);
+
+       /*
+        * we have no more ordered extents for this inode and
+        * no dirty pages.  We can safely remove it from the
+        * list of ordered extents
+        */
+       if (RB_EMPTY_ROOT(&tree->tree) &&
+           !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+               list_del_init(&BTRFS_I(inode)->ordered_operations);
+       }
         spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
  
         mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
  }
  
  /*
+ * this is used during transaction commit to write all the inodes
+ * added to the ordered operation list.  These files must be fully on
+ * disk before the transaction commits.
+ *
+ * we have two modes here, one is to just start the IO via filemap_flush
+ * and the other is to wait for all the io.  When we wait, we have an
+ * extra check to make sure the ordered operation list really is empty
+ * before we return
+ */
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+{
+       struct btrfs_inode *btrfs_inode;
+       struct inode *inode;
+       struct list_head splice;
+
+       INIT_LIST_HEAD(&splice);
+
+       mutex_lock(&root->fs_info->ordered_operations_mutex);
+       spin_lock(&root->fs_info->ordered_extent_lock);
+again:
+       list_splice_init(&root->fs_info->ordered_operations, &splice);
+
+       while (!list_empty(&splice)) {
+               btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+                                  ordered_operations);
+
+               inode = &btrfs_inode->vfs_inode;
+
+               list_del_init(&btrfs_inode->ordered_operations);
+
+               /*
+                * the inode may be getting freed (in sys_unlink path).
+                */
+               inode = igrab(inode);
+
+               if (!wait && inode) {
+                       list_add_tail(&BTRFS_I(inode)->ordered_operations,
+                             &root->fs_info->ordered_operations);
+               }
+               spin_unlock(&root->fs_info->ordered_extent_lock);
+
+               if (inode) {
+                       if (wait)
+                               btrfs_wait_ordered_range(inode, 0, (u64)-1);
+                       else
+                               filemap_flush(inode->i_mapping);
+                       iput(inode);
+               }
+
+               cond_resched();
+               spin_lock(&root->fs_info->ordered_extent_lock);
+       }
+       if (wait && !list_empty(&root->fs_info->ordered_operations))
+               goto again;
+
+       spin_unlock(&root->fs_info->ordered_extent_lock);
+       mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+       return 0;
+}
+
+/*
   * Used to start IO or wait for a given ordered extent to finish.
   *
   * If wait is one, this effectively waits on page writeback for all the pages
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
  
         return ret;
  }
+
+/*
+ * add a given inode to the list of inodes that must be fully on
+ * disk before a transaction commit finishes.
+ *
+ * This basically gives us the ext3 style data=ordered mode, and it is mostly
+ * used to make sure renamed files are fully on disk.
+ *
+ * It is a noop if the inode is already fully on disk.
+ *
+ * If trans is not null, we'll do a friendly check for a transaction that
+ * is already flushing things and force the IO down ourselves.
+ */
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct inode *inode)
+{
+       u64 last_mod;
+
+       last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
+
+       /*
+        * if this file hasn't been changed since the last transaction
+        * commit, we can safely return without doing anything
+        */
+       if (last_mod < root->fs_info->last_trans_committed)
+               return 0;
+
+       /*
+        * the transaction is already committing.  Just start the IO and
+        * don't bother with all of this list nonsense
+        */
+       if (trans && root->fs_info->running_transaction->blocked) {
+               btrfs_wait_ordered_range(inode, 0, (u64)-1);
+               return 0;
+       }
+
+       spin_lock(&root->fs_info->ordered_extent_lock);
+       if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
+               list_add_tail(&BTRFS_I(inode)->ordered_operations,
+                             &root->fs_info->ordered_operations);
+       }
+       spin_unlock(&root->fs_info->ordered_extent_lock);
+
+       return 0;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h

index ab66d5e..3d31c88 100644 (file)
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
  int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
                            loff_t end, int sync_mode);
  int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct inode *inode);
  #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c

index 9c8f158..664782c 100644 (file)
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -975,6 +975,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         int should_grow = 0;
         unsigned long now = get_seconds();
  
+       btrfs_run_ordered_operations(root, 0);
+
         /* make a pass through all the delayed refs we have so far
          * any runnings procs may add more while we are here
          */
@@ -1056,6 +1058,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                         BUG_ON(ret);
                 }
  
+               /*
+                * rename don't use btrfs_join_transaction, so, once we
+                * set the transaction to blocked above, we aren't going
+                * to get any new ordered operations.  We can safely run
+                * it here and no for sure that nothing new will be added
+                * to the list
+                */
+               btrfs_run_ordered_operations(root, 1);
+
                 smp_mb();
                 if (cur_trans->num_writers > 1 || should_grow)
                         schedule_timeout(timeout);
author	Chris Mason <chris.mason@oracle.com>
	Tue, 31 Mar 2009 17:27:11 +0000 (13:27 -0400)
committer	Chris Mason <chris.mason@oracle.com>
	Tue, 31 Mar 2009 18:27:58 +0000 (14:27 -0400)
fs/btrfs/btrfs_inode.h		patch \| blob \| history
fs/btrfs/ctree.h		patch \| blob \| history
fs/btrfs/disk-io.c		patch \| blob \| history
fs/btrfs/file.c		patch \| blob \| history
fs/btrfs/inode.c		patch \| blob \| history
fs/btrfs/ordered-data.c		patch \| blob \| history
fs/btrfs/ordered-data.h		patch \| blob \| history
fs/btrfs/transaction.c		patch \| blob \| history