ext4: Use end_io callback to avoid direct I/O fallback to buffered I/O

author Mingming Cao <cmm@us.ibm.com>

Mon, 28 Sep 2009 19:48:41 +0000 (15:48 -0400)

committer Theodore Ts'o <tytso@mit.edu>

Mon, 28 Sep 2009 19:48:41 +0000 (15:48 -0400)
author Mingming Cao <cmm@us.ibm.com>
Mon, 28 Sep 2009 19:48:41 +0000 (15:48 -0400)
committer Theodore Ts'o <tytso@mit.edu>
Mon, 28 Sep 2009 19:48:41 +0000 (15:48 -0400)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h

index 2b4293a..ccb4dbf 100644 (file)
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -999,6 +999,9 @@ struct ext4_sb_info {
  
         unsigned int s_log_groups_per_flex;
         struct flex_groups *s_flex_groups;
+
+       /* workqueue for dio unwritten */
+       struct workqueue_struct *dio_unwritten_wq;
  };
  
  static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index da4f2ec..5633af6 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
  #include <linux/namei.h>
  #include <linux/uio.h>
  #include <linux/bio.h>
+#include <linux/workqueue.h>
  
  #include "ext4_jbd2.h"
  #include "xattr.h"
@@ -3356,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
  }
  
  /*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
   * If the O_DIRECT write will extend the file then add this inode to the
   * orphan list.  So recovery will truncate it back to the original size
   * if the machine crashes during the write.
@@ -3364,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
   * crashes then stale disk data _may_ be exposed inside the file. But current
   * VFS code falls back into buffered path in that case so we are safe.
   */
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
                               const struct iovec *iov, loff_t offset,
                               unsigned long nr_segs)
  {
@@ -3438,6 +3441,198 @@ out:
         return ret;
  }
  
+/*
+ * get_block callback used for direct IO writes that lie within i_size.
+ * At most DIO_MAX_BLOCKS blocks are mapped per call.
+ */
+static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+                  struct buffer_head *bh_result, int create)
+{
+       handle_t *handle = NULL;
+       int ret = 0;
+       unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+       int dio_credits;
+
+       /*
+        * The DIO VFS code passes create = 0 for a write to the
+        * middle of a file.  It does this to avoid allocating blocks
+        * for holes, so that a parallel buffered read (which does not
+        * hold i_mutex) cannot expose stale data while the direct IO
+        * write has not completed.  A DIO request on a hole therefore
+        * normally falls back to buffered IO.
+        *
+        * ext4 extent-based files support fallocate, so blocks for a
+        * hole can be allocated as an uninitialized extent; a parallel
+        * buffered read then zeroes out the page for as long as the
+        * DIO write to the hole has not completed.
+        *
+        * When we get here we know this is a direct IO write to the
+        * middle of the file (< i_size), so it is safe to override the
+        * create flag passed in by the VFS.
+        */
+       create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+
+       if (max_blocks > DIO_MAX_BLOCKS)
+               max_blocks = DIO_MAX_BLOCKS;
+       dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+       handle = ext4_journal_start(inode, dio_credits);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out;
+       }
+       ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+                             create);
+       if (ret > 0) {
+               bh_result->b_size = (ret << inode->i_blkbits);
+               ret = 0;
+       }
+       ext4_journal_stop(handle);
+out:
+       return ret;
+}
+
+#define                DIO_AIO         0x1
+/* Free an io_end allocated by ext4_init_io_end(). */
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+       kfree(io);
+}
+
+/*
+ * IO write completion for unwritten extents; runs from the workqueue.
+ * Converts the written range to written extents and frees the io_end.
+ */
+static void ext4_end_dio_unwritten(struct work_struct *work)
+{
+       ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+       struct inode *inode = io->inode;
+       loff_t offset = io->offset;
+       size_t size = io->size;
+       int ret = 0;
+       int aio = io->flag & DIO_AIO;
+
+       if (aio)
+               mutex_lock(&inode->i_mutex);
+       /* a range that ends beyond i_size is left unconverted */
+       if (offset + size <= i_size_read(inode))
+               ret = ext4_convert_unwritten_extents(inode, offset, size);
+
+       if (ret < 0)
+               printk(KERN_EMERG "%s: failed to convert unwritten "
+                       "extents to written extents, error is %d\n",
+                       __func__, ret);
+
+       ext4_free_io_end(io);
+       if (aio)
+               mutex_unlock(&inode->i_mutex);
+}
+
+/* Allocate an io_end tracking one DIO write to @inode; NULL if kmalloc fails */
+static ext4_io_end_t *ext4_init_io_end(struct inode *inode, unsigned int flag)
+{
+       ext4_io_end_t *io_end = kmalloc(sizeof(*io_end), GFP_NOFS);
+
+       if (!io_end)
+               return NULL;
+
+       /* offset and size stay zero until ext4_end_io_dio() fills them in */
+       io_end->inode = inode;
+       io_end->flag = flag;
+       io_end->offset = 0;
+       io_end->size = 0;
+       io_end->error = 0;
+       INIT_WORK(&io_end->work, ext4_end_dio_unwritten);
+       return io_end;
+}
+
+/* DIO end_io: queue conversion of the written range to written extents */
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+                           ssize_t size, void *private)
+{
+       ext4_io_end_t *io_end = iocb->private;
+       struct workqueue_struct *wq;
+
+       /* no io_end attached or zero-sized completion: nothing to convert */
+       if (!io_end || !size)
+               return;
+       io_end->offset = offset;
+       io_end->size = size;
+       wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+       /* Convert unwritten extents to written; the work frees io_end */
+       queue_work(wq, &io_end->work);
+       /* a synchronous iocb waits here for the queued work to finish */
+       if (is_sync_kiocb(iocb))
+               flush_workqueue(wq);
+
+       iocb->private = NULL;
+}
+/*
+ * Direct I/O for extent-mapped files.
+ *
+ * A write that lies entirely within i_size is done without falling back
+ * to buffered I/O, even over holes and preallocated extents:
+ *
+ * For holes, blocks are allocated and marked as uninitialized.  If the
+ * blocks were preallocated, we make sure the extent is split, but still
+ * keep the range being written as uninitialized.
+ *
+ * When the end_io callback is called at the last IO completion time,
+ * those extents are converted to written extents.
+ *
+ * Reads, and writes that extend the file, are passed on to
+ * ext4_ind_direct_IO().
+ *
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+                             const struct iovec *iov, loff_t offset,
+                             unsigned long nr_segs)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file->f_mapping->host;
+       ssize_t ret;
+       size_t count = iov_length(iov, nr_segs);
+       loff_t final_size = offset + count;
+
+       if (rw == WRITE && final_size <= inode->i_size) {
+               /*
+                * Blocks allocated for holes are marked uninitialized so a
+                * parallel buffered read cannot expose stale data before
+                * the DIO completes; previously fallocated extents are
+                * mapped but stay uninitialized.  The end_io callback
+                * converts those unwritten extents to written.
+                */
+               iocb->private = ext4_init_io_end(inode,
+                               is_sync_kiocb(iocb) ? 0 : DIO_AIO);
+               if (!iocb->private)
+                       return -ENOMEM;
+               ret = blockdev_direct_IO(rw, iocb, inode,
+                                        inode->i_sb->s_bdev, iov,
+                                        offset, nr_segs,
+                                        ext4_get_block_dio_write,
+                                        ext4_end_io_dio);
+               /* end_io did not hand io_end to the workqueue: free it */
+               if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+                       ext4_free_io_end(iocb->private);
+                       iocb->private = NULL;
+               }
+               return ret;
+       }
+       return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
+/* Dispatch direct I/O by mapping scheme: extent-mapped vs. indirect-mapped */
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+                             const struct iovec *iov, loff_t offset,
+                             unsigned long nr_segs)
+{
+       struct inode *inode = iocb->ki_filp->f_mapping->host;
+
+       if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+               return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+       return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
  /*
   * Pages can be marked dirty completely asynchronously from ext4's journalling
   * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
diff --git a/fs/ext4/super.c b/fs/ext4/super.c

index 1681773..1a03ea9 100644 (file)
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -580,6 +580,9 @@ static void ext4_put_super(struct super_block *sb)
         struct ext4_super_block *es = sbi->s_es;
         int i, err;
  
+       flush_workqueue(sbi->dio_unwritten_wq);
+       destroy_workqueue(sbi->dio_unwritten_wq);
+
         lock_super(sb);
         lock_kernel();
         if (sb->s_dirt)
@@ -2801,6 +2804,12 @@ no_journal:
                         clear_opt(sbi->s_mount_opt, NOBH);
                 }
         }
+       EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
+       if (!EXT4_SB(sb)->dio_unwritten_wq) {
+               printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+               goto failed_mount_wq;
+       }
+
         /*
          * The jbd2_journal_load will have done any necessary log recovery,
          * so we can safely mount the rest of the filesystem now.
@@ -2913,6 +2922,8 @@ cantfind_ext4:
  
  failed_mount4:
         ext4_msg(sb, KERN_ERR, "mount failed");
+       destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+failed_mount_wq:
         ext4_release_system_zone(sb);
         if (sbi->s_journal) {
                 jbd2_journal_destroy(sbi->s_journal);
author	Mingming Cao <cmm@us.ibm.com>
	Mon, 28 Sep 2009 19:48:41 +0000 (15:48 -0400)
committer	Theodore Ts'o <tytso@mit.edu>
	Mon, 28 Sep 2009 19:48:41 +0000 (15:48 -0400)
fs/ext4/ext4.h		patch \| blob \| history
fs/ext4/inode.c		patch \| blob \| history
fs/ext4/super.c		patch \| blob \| history