ext4: Wait for proper transaction commit on fsync
author: Jan Kara <jack@suse.cz>
Wed, 9 Dec 2009 04:51:10 +0000 (23:51 -0500)
committer: Theodore Ts'o <tytso@mit.edu>
Wed, 9 Dec 2009 04:51:10 +0000 (23:51 -0500)
We cannot rely on buffer dirty bits during fsync because pdflush can run
before fsync is called and clear the dirty bits without forcing a
transaction commit. Instead, we track which transaction last changed the
inode and which transaction last changed its block allocation, and we force
the relevant transaction to disk on fsync.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
fs/ext4/ext4.h
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/fsync.c
fs/ext4/inode.c
fs/ext4/super.c

index 4cfc2f0..ab31e65 100644 (file)
@@ -709,6 +709,13 @@ struct ext4_inode_info {
        struct list_head i_aio_dio_complete_list;
        /* current io_end structure for async DIO write*/
        ext4_io_end_t *cur_aio_dio;
+
+       /*
+        * Transactions that contain inode's metadata needed to complete
+        * fsync and fdatasync, respectively.
+        */
+       tid_t i_sync_tid;
+       tid_t i_datasync_tid;
 };
 
 /*
index 2c2b262..05eca81 100644 (file)
@@ -249,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
        return 0;
 }
 
+static inline void ext4_update_inode_fsync_trans(handle_t *handle,
+                                                struct inode *inode,
+                                                int datasync)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+
+       if (ext4_handle_valid(handle)) {
+               ei->i_sync_tid = handle->h_transaction->t_tid;
+               if (datasync)
+                       ei->i_datasync_tid = handle->h_transaction->t_tid;
+       }
+}
+
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
 
index 5967f18..700206e 100644 (file)
@@ -3058,6 +3058,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
                ret = ext4_convert_unwritten_extents_dio(handle, inode,
                                                        path);
+               if (ret >= 0)
+                       ext4_update_inode_fsync_trans(handle, inode, 1);
                goto out2;
        }
        /* buffered IO case */
@@ -3085,6 +3087,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        ret = ext4_ext_convert_to_initialized(handle, inode,
                                                path, iblock,
                                                max_blocks);
+       if (ret >= 0)
+               ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
        if (ret <= 0) {
                err = ret;
@@ -3323,10 +3327,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        allocated = ext4_ext_get_actual_len(&newex);
        set_buffer_new(bh_result);
 
-       /* Cache only when it is _not_ an uninitialized extent */
-       if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+       /*
+        * Cache the extent and update transaction to commit on fdatasync only
+        * when it is _not_ an uninitialized extent.
+        */
+       if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
                ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
                                                EXT4_EXT_CACHE_EXTENT);
+               ext4_update_inode_fsync_trans(handle, inode, 1);
+       } else
+               ext4_update_inode_fsync_trans(handle, inode, 0);
 out:
        if (allocated > max_blocks)
                allocated = max_blocks;
index a3c2507..0b22497 100644 (file)
 int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
        struct inode *inode = dentry->d_inode;
+       struct ext4_inode_info *ei = EXT4_I(inode);
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-       int err, ret = 0;
+       int ret;
+       tid_t commit_tid;
 
        J_ASSERT(ext4_journal_current_handle() == NULL);
 
        trace_ext4_sync_file(file, dentry, datasync);
 
+       if (inode->i_sb->s_flags & MS_RDONLY)
+               return 0;
+
        ret = flush_aio_dio_completed_IO(inode);
        if (ret < 0)
                return ret;
+       
+       if (!journal)
+               return simple_fsync(file, dentry, datasync);
+
        /*
-        * data=writeback:
+        * data=writeback,ordered:
         *  The caller's filemap_fdatawrite()/wait will sync the data.
-        *  sync_inode() will sync the metadata
-        *
-        * data=ordered:
-        *  The caller's filemap_fdatawrite() will write the data and
-        *  sync_inode() will write the inode if it is dirty.  Then the caller's
-        *  filemap_fdatawait() will wait on the pages.
+        *  Metadata is in the journal, we wait for proper transaction to
+        *  commit here.
         *
         * data=journal:
         *  filemap_fdatawrite won't do anything (the buffers are clean).
@@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
        if (ext4_should_journal_data(inode))
                return ext4_force_commit(inode->i_sb);
 
-       if (!journal)
-               ret = sync_mapping_buffers(inode->i_mapping);
-
-       if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-               goto out;
-
-       /*
-        * The VFS has written the file data.  If the inode is unaltered
-        * then we need not start a commit.
-        */
-       if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
-               struct writeback_control wbc = {
-                       .sync_mode = WB_SYNC_ALL,
-                       .nr_to_write = 0, /* sys_fsync did this */
-               };
-               err = sync_inode(inode, &wbc);
-               if (ret == 0)
-                       ret = err;
-       }
-out:
-       if (journal && (journal->j_flags & JBD2_BARRIER))
+       commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+       if (jbd2_log_start_commit(journal, commit_tid))
+               jbd2_log_wait_commit(journal, commit_tid);
+       else if (journal->j_flags & JBD2_BARRIER)
                blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
        return ret;
 }
index 958c3ff..f1bc1e3 100644 (file)
@@ -983,6 +983,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
                goto cleanup;
 
        set_buffer_new(bh_result);
+
+       ext4_update_inode_fsync_trans(handle, inode, 1);
 got_it:
        map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
        if (count > blocks_to_boundary)
@@ -4738,6 +4740,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        struct ext4_inode *raw_inode;
        struct ext4_inode_info *ei;
        struct inode *inode;
+       journal_t *journal = EXT4_SB(sb)->s_journal;
        long ret;
        int block;
 
@@ -4802,6 +4805,31 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                ei->i_data[block] = raw_inode->i_block[block];
        INIT_LIST_HEAD(&ei->i_orphan);
 
+       /*
+        * Set transaction id's of transactions that have to be committed
+        * to finish f[data]sync. We set them to currently running transaction
+        * as we cannot be sure that the inode or some of its metadata isn't
+        * part of the transaction - the inode could have been reclaimed and
+        * now it is reread from disk.
+        */
+       if (journal) {
+               transaction_t *transaction;
+               tid_t tid;
+
+               spin_lock(&journal->j_state_lock);
+               if (journal->j_running_transaction)
+                       transaction = journal->j_running_transaction;
+               else
+                       transaction = journal->j_committing_transaction;
+               if (transaction)
+                       tid = transaction->t_tid;
+               else
+                       tid = journal->j_commit_sequence;
+               spin_unlock(&journal->j_state_lock);
+               ei->i_sync_tid = tid;
+               ei->i_datasync_tid = tid;
+       }
+
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
                if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
@@ -5056,6 +5084,7 @@ static int ext4_do_update_inode(handle_t *handle,
                err = rc;
        ei->i_state &= ~EXT4_STATE_NEW;
 
+       ext4_update_inode_fsync_trans(handle, inode, 0);
 out_brelse:
        brelse(bh);
        ext4_std_error(inode->i_sb, err);
index 8ab0c95..2b13dcf 100644 (file)
@@ -706,6 +706,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        spin_lock_init(&(ei->i_block_reservation_lock));
        INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
        ei->cur_aio_dio = NULL;
+       ei->i_sync_tid = 0;
+       ei->i_datasync_tid = 0;
 
        return &ei->vfs_inode;
 }