xfs: bump up nr_to_write in xfs_vm_writepage

[safe/jmp/linux-2.6] / fs / xfs / linux-2.6 / xfs_lrw.c
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c

index c73d3c1..7078974 100644 (file)
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -23,7 +23,6 @@
  #include "xfs_trans.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
-#include "xfs_dir.h"
  #include "xfs_dir2.h"
  #include "xfs_alloc.h"
  #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
  #include "xfs_bmap_btree.h"
  #include "xfs_alloc_btree.h"
  #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
  #include "xfs_dir2_sf.h"
  #include "xfs_attr_sf.h"
  #include "xfs_dinode.h"
@@ -44,14 +42,12 @@
  #include "xfs_error.h"
  #include "xfs_itable.h"
  #include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
  #include "xfs_attr.h"
  #include "xfs_inode_item.h"
  #include "xfs_buf_item.h"
  #include "xfs_utils.h"
  #include "xfs_iomap.h"
+#include "xfs_vnodeops.h"
  
  #include <linux/capability.h>
  #include <linux/writeback.h>
@@ -61,14 +57,12 @@
  void
  xfs_rw_enter_trace(
         int                     tag,
-       xfs_iocore_t            *io,
+       xfs_inode_t             *ip,
         void                    *data,
         size_t                  segs,
         loff_t                  offset,
         int                     ioflags)
  {
-       xfs_inode_t     *ip = XFS_IO_INODE(io);
-
         if (ip->i_rwtrace == NULL)
                 return;
         ktrace_enter(ip->i_rwtrace,
@@ -81,9 +75,9 @@ xfs_rw_enter_trace(
                 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
                 (void *)((unsigned long)(offset & 0xffffffff)),
                 (void *)((unsigned long)ioflags),
-               (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
-               (void *)((unsigned long)(io->io_new_size & 0xffffffff)),
-               (void *)NULL,
+               (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)),
+               (void *)((unsigned long)(ip->i_new_size & 0xffffffff)),
+               (void *)((unsigned long)current_pid()),
                 (void *)NULL,
                 (void *)NULL,
                 (void *)NULL,
@@ -92,13 +86,12 @@ xfs_rw_enter_trace(
  
  void
  xfs_inval_cached_trace(
-       xfs_iocore_t    *io,
+       xfs_inode_t     *ip,
         xfs_off_t       offset,
         xfs_off_t       len,
         xfs_off_t       first,
         xfs_off_t       last)
  {
-       xfs_inode_t     *ip = XFS_IO_INODE(io);
  
         if (ip->i_rwtrace == NULL)
                 return;
@@ -113,7 +106,7 @@ xfs_inval_cached_trace(
                 (void *)((unsigned long)(first & 0xffffffff)),
                 (void *)((unsigned long)((last >> 32) & 0xffffffff)),
                 (void *)((unsigned long)(last & 0xffffffff)),
-               (void *)NULL,
+               (void *)((unsigned long)current_pid()),
                 (void *)NULL,
                 (void *)NULL,
                 (void *)NULL,
@@ -134,56 +127,38 @@ xfs_inval_cached_trace(
   */
  STATIC int
  xfs_iozero(
-       struct inode            *ip,    /* inode                        */
+       struct xfs_inode        *ip,    /* inode                        */
         loff_t                  pos,    /* offset in file               */
-       size_t                  count,  /* size of data to zero         */
-       loff_t                  end_size)       /* max file size to set */
+       size_t                  count)  /* size of data to zero         */
  {
-       unsigned                bytes;
         struct page             *page;
         struct address_space    *mapping;
-       char                    *kaddr;
         int                     status;
  
-       mapping = ip->i_mapping;
+       mapping = VFS_I(ip)->i_mapping;
         do {
-               unsigned long index, offset;
+               unsigned offset, bytes;
+               void *fsdata;
  
                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
-               index = pos >> PAGE_CACHE_SHIFT;
                 bytes = PAGE_CACHE_SIZE - offset;
                 if (bytes > count)
                         bytes = count;
  
-               status = -ENOMEM;
-               page = grab_cache_page(mapping, index);
-               if (!page)
+               status = pagecache_write_begin(NULL, mapping, pos, bytes,
+                                       AOP_FLAG_UNINTERRUPTIBLE,
+                                       &page, &fsdata);
+               if (status)
                         break;
  
-               kaddr = kmap(page);
-               status = mapping->a_ops->prepare_write(NULL, page, offset,
-                                                       offset + bytes);
-               if (status) {
-                       goto unlock;
-               }
-
-               memset((void *) (kaddr + offset), 0, bytes);
-               flush_dcache_page(page);
-               status = mapping->a_ops->commit_write(NULL, page, offset,
-                                                       offset + bytes);
-               if (!status) {
-                       pos += bytes;
-                       count -= bytes;
-                       if (pos > i_size_read(ip))
-                               i_size_write(ip, pos < end_size ? pos : end_size);
-               }
+               zero_user(page, offset, bytes);
  
-unlock:
-               kunmap(page);
-               unlock_page(page);
-               page_cache_release(page);
-               if (status)
-                       break;
+               status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
+                                       page, fsdata);
+               WARN_ON(status <= 0); /* can't return less than zero! */
+               pos += bytes;
+               count -= bytes;
+               status = 0;
         } while (count);
  
         return (-status);
@@ -191,27 +166,21 @@ unlock:
  
  ssize_t                        /* bytes read, or (-)  error */
  xfs_read(
-       bhv_desc_t              *bdp,
+       xfs_inode_t             *ip,
         struct kiocb            *iocb,
         const struct iovec      *iovp,
         unsigned int            segs,
         loff_t                  *offset,
-       int                     ioflags,
-       cred_t                  *credp)
+       int                     ioflags)
  {
         struct file             *file = iocb->ki_filp;
         struct inode            *inode = file->f_mapping->host;
+       xfs_mount_t             *mp = ip->i_mount;
         size_t                  size = 0;
-       ssize_t                 ret;
+       ssize_t                 ret = 0;
         xfs_fsize_t             n;
-       xfs_inode_t             *ip;
-       xfs_mount_t             *mp;
-       vnode_t                 *vp;
         unsigned long           seg;
  
-       ip = XFS_BHVTOI(bdp);
-       vp = BHV_TO_VNODE(bdp);
-       mp = ip->i_mount;
  
         XFS_STATS_INC(xs_read_calls);
  
@@ -231,11 +200,11 @@ xfs_read(
  
         if (unlikely(ioflags & IO_ISDIRECT)) {
                 xfs_buftarg_t   *target =
-                       (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
+                       XFS_IS_REALTIME_INODE(ip) ?
                                 mp->m_rtdev_targp : mp->m_ddev_targp;
-               if ((*offset & target->pbr_smask) ||
-                   (size & target->pbr_smask)) {
-                       if (*offset == ip->i_d.di_size) {
+               if ((*offset & target->bt_smask) ||
+                   (size & target->bt_smask)) {
+                       if (*offset == ip->i_size) {
                                 return (0);
                         }
                         return -XFS_ERROR(EINVAL);
@@ -249,106 +218,157 @@ xfs_read(
         if (n < size)
                 size = n;
  
-       if (XFS_FORCED_SHUTDOWN(mp)) {
+       if (XFS_FORCED_SHUTDOWN(mp))
                 return -EIO;
-       }
  
         if (unlikely(ioflags & IO_ISDIRECT))
                 mutex_lock(&inode->i_mutex);
         xfs_ilock(ip, XFS_IOLOCK_SHARED);
  
-       if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
-           !(ioflags & IO_INVIS)) {
-               vrwlock_t locktype = VRWLOCK_READ;
+       if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
                 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
+               int iolock = XFS_IOLOCK_SHARED;
+
+               ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
+                                       dmflags, &iolock);
+               if (ret) {
+                       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+                       if (unlikely(ioflags & IO_ISDIRECT))
+                               mutex_unlock(&inode->i_mutex);
+                       return ret;
+               }
+       }
  
-               ret = -XFS_SEND_DATA(mp, DM_EVENT_READ,
-                                       BHV_TO_VNODE(bdp), *offset, size,
-                                       dmflags, &locktype);
+       if (unlikely(ioflags & IO_ISDIRECT)) {
+               if (inode->i_mapping->nrpages)
+                       ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
+                                                   -1, FI_REMAPF_LOCKED);
+               mutex_unlock(&inode->i_mutex);
                 if (ret) {
                         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-                       goto unlock_isem;
+                       return ret;
                 }
         }
  
-       xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
+       xfs_rw_enter_trace(XFS_READ_ENTER, ip,
                                 (void *)iovp, segs, *offset, ioflags);
-       ret = __generic_file_aio_read(iocb, iovp, segs, offset);
+
+       iocb->ki_pos = *offset;
+       ret = generic_file_aio_read(iocb, iovp, segs, *offset);
         if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
                 ret = wait_on_sync_kiocb(iocb);
         if (ret > 0)
                 XFS_STATS_ADD(xs_read_bytes, ret);
  
         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
-       if (likely(!(ioflags & IO_INVIS)))
-               xfs_ichgtime_fast(ip, inode, XFS_ICHGTIME_ACC);
-
-unlock_isem:
-       if (unlikely(ioflags & IO_ISDIRECT))
-               mutex_unlock(&inode->i_mutex);
         return ret;
  }
  
  ssize_t
-xfs_sendfile(
-       bhv_desc_t              *bdp,
-       struct file             *filp,
-       loff_t                  *offset,
-       int                     ioflags,
+xfs_splice_read(
+       xfs_inode_t             *ip,
+       struct file             *infilp,
+       loff_t                  *ppos,
+       struct pipe_inode_info  *pipe,
         size_t                  count,
-       read_actor_t            actor,
-       void                    *target,
-       cred_t                  *credp)
+       int                     flags,
+       int                     ioflags)
  {
+       xfs_mount_t             *mp = ip->i_mount;
         ssize_t                 ret;
-       xfs_fsize_t             n;
-       xfs_inode_t             *ip;
-       xfs_mount_t             *mp;
-       vnode_t                 *vp;
-
-       ip = XFS_BHVTOI(bdp);
-       vp = BHV_TO_VNODE(bdp);
-       mp = ip->i_mount;
  
         XFS_STATS_INC(xs_read_calls);
-
-       n = XFS_MAXIOFFSET(mp) - *offset;
-       if ((n <= 0) || (count == 0))
-               return 0;
-
-       if (n < count)
-               count = n;
-
         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                 return -EIO;
  
         xfs_ilock(ip, XFS_IOLOCK_SHARED);
  
-       if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
-           (!(ioflags & IO_INVIS))) {
-               vrwlock_t locktype = VRWLOCK_READ;
+       if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
+               int iolock = XFS_IOLOCK_SHARED;
                 int error;
  
-               error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count,
-                                     FILP_DELAY_FLAG(filp), &locktype);
+               error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
+                                       FILP_DELAY_FLAG(infilp), &iolock);
                 if (error) {
                         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
                         return -error;
                 }
         }
-       xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore,
-                  (void *)(unsigned long)target, count, *offset, ioflags);
-       ret = generic_file_sendfile(filp, offset, count, actor, target);
+       xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, ip,
+                          pipe, count, *ppos, ioflags);
+       ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+       if (ret > 0)
+               XFS_STATS_ADD(xs_read_bytes, ret);
  
         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+       return ret;
+}
+
+ssize_t
+xfs_splice_write(
+       xfs_inode_t             *ip,
+       struct pipe_inode_info  *pipe,
+       struct file             *outfilp,
+       loff_t                  *ppos,
+       size_t                  count,
+       int                     flags,
+       int                     ioflags)
+{
+       xfs_mount_t             *mp = ip->i_mount;
+       ssize_t                 ret;
+       struct inode            *inode = outfilp->f_mapping->host;
+       xfs_fsize_t             isize, new_size;
  
+       XFS_STATS_INC(xs_write_calls);
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+               return -EIO;
+
+       xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+       if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
+               int iolock = XFS_IOLOCK_EXCL;
+               int error;
+
+               error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
+                                       FILP_DELAY_FLAG(outfilp), &iolock);
+               if (error) {
+                       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+                       return -error;
+               }
+       }
+
+       new_size = *ppos + count;
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       if (new_size > ip->i_size)
+               ip->i_new_size = new_size;
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+       xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, ip,
+                          pipe, count, *ppos, ioflags);
+       ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
         if (ret > 0)
-               XFS_STATS_ADD(xs_read_bytes, ret);
+               XFS_STATS_ADD(xs_write_bytes, ret);
  
-       if (likely(!(ioflags & IO_INVIS)))
-               xfs_ichgtime_fast(ip, LINVFS_GET_IP(vp), XFS_ICHGTIME_ACC);
+       isize = i_size_read(inode);
+       if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
+               *ppos = isize;
+
+       if (*ppos > ip->i_size) {
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               if (*ppos > ip->i_size)
+                       ip->i_size = *ppos;
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       }
  
+       if (ip->i_new_size) {
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               ip->i_new_size = 0;
+               if (ip->i_d.di_size > ip->i_size)
+                       ip->i_d.di_size = ip->i_size;
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       }
+       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
         return ret;
  }
  
@@ -360,23 +380,19 @@ xfs_sendfile(
   */
  STATIC int                             /* error (positive) */
  xfs_zero_last_block(
-       struct inode    *ip,
-       xfs_iocore_t    *io,
-       xfs_fsize_t     isize,
-       xfs_fsize_t     end_size)
+       xfs_inode_t     *ip,
+       xfs_fsize_t     offset,
+       xfs_fsize_t     isize)
  {
         xfs_fileoff_t   last_fsb;
-       xfs_mount_t     *mp;
+       xfs_mount_t     *mp = ip->i_mount;
         int             nimaps;
         int             zero_offset;
         int             zero_len;
         int             error = 0;
         xfs_bmbt_irec_t imap;
-       loff_t          loff;
-
-       ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
  
-       mp = io->io_mount;
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
  
         zero_offset = XFS_B_FSB_OFFSET(mp, isize);
         if (zero_offset == 0) {
@@ -389,8 +405,8 @@ xfs_zero_last_block(
  
         last_fsb = XFS_B_TO_FSBT(mp, isize);
         nimaps = 1;
-       error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
-                         &nimaps, NULL);
+       error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
+                         &nimaps, NULL, NULL);
         if (error) {
                 return error;
         }
@@ -407,14 +423,14 @@ xfs_zero_last_block(
          * out sync.  We need to drop the ilock while we do this so we
          * don't deadlock when the buffer cache calls back to us.
          */
-       XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
-       loff = XFS_FSB_TO_B(mp, last_fsb);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
  
         zero_len = mp->m_sb.sb_blocksize - zero_offset;
+       if (isize + zero_len > offset)
+               zero_len = offset - isize;
+       error = xfs_iozero(ip, isize, zero_len);
  
-       error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
-
-       XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
         ASSERT(error >= 0);
         return error;
  }
@@ -432,37 +448,31 @@ xfs_zero_last_block(
  
  int                                    /* error (positive) */
  xfs_zero_eof(
-       vnode_t         *vp,
-       xfs_iocore_t    *io,
+       xfs_inode_t     *ip,
         xfs_off_t       offset,         /* starting I/O offset */
-       xfs_fsize_t     isize,          /* current inode size */
-       xfs_fsize_t     end_size)       /* terminal inode size */
+       xfs_fsize_t     isize)          /* current inode size */
  {
-       struct inode    *ip = LINVFS_GET_IP(vp);
+       xfs_mount_t     *mp = ip->i_mount;
         xfs_fileoff_t   start_zero_fsb;
         xfs_fileoff_t   end_zero_fsb;
         xfs_fileoff_t   zero_count_fsb;
         xfs_fileoff_t   last_fsb;
-       xfs_extlen_t    buf_len_fsb;
-       xfs_mount_t     *mp;
+       xfs_fileoff_t   zero_off;
+       xfs_fsize_t     zero_len;
         int             nimaps;
         int             error = 0;
         xfs_bmbt_irec_t imap;
  
-       ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
-       ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
         ASSERT(offset > isize);
  
-       mp = io->io_mount;
-
         /*
          * First handle zeroing the block on which isize resides.
          * We only zero a part of that block so it is handled specially.
          */
-       error = xfs_zero_last_block(ip, io, isize, end_size);
+       error = xfs_zero_last_block(ip, offset, isize);
         if (error) {
-               ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
-               ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
+               ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
                 return error;
         }
  
@@ -490,11 +500,10 @@ xfs_zero_eof(
         while (start_zero_fsb <= end_zero_fsb) {
                 nimaps = 1;
                 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-               error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
-                                 0, NULL, 0, &imap, &nimaps, NULL);
+               error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
+                                 0, NULL, 0, &imap, &nimaps, NULL, NULL);
                 if (error) {
-                       ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
-                       ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
+                       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
                         return error;
                 }
                 ASSERT(nimaps > 0);
@@ -514,96 +523,64 @@ xfs_zero_eof(
                 }
  
                 /*
-                * There are blocks in the range requested.
-                * Zero them a single write at a time.  We actually
-                * don't zero the entire range returned if it is
-                * too big and simply loop around to get the rest.
-                * That is not the most efficient thing to do, but it
-                * is simple and this path should not be exercised often.
-                */
-               buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
-                                             mp->m_writeio_blocks << 8);
-               /*
+                * There are blocks we need to zero.
                  * Drop the inode lock while we're doing the I/O.
                  * We'll still have the iolock to protect us.
                  */
-               XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
  
-               error = xfs_iozero(ip,
-                                  XFS_FSB_TO_B(mp, start_zero_fsb),
-                                  XFS_FSB_TO_B(mp, buf_len_fsb),
-                                  end_size);
+               zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
+               zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
  
+               if ((zero_off + zero_len) > offset)
+                       zero_len = offset - zero_off;
+
+               error = xfs_iozero(ip, zero_off, zero_len);
                 if (error) {
                         goto out_lock;
                 }
  
-               start_zero_fsb = imap.br_startoff + buf_len_fsb;
+               start_zero_fsb = imap.br_startoff + imap.br_blockcount;
                 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
  
-               XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
         }
  
         return 0;
  
  out_lock:
-
-       XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
         ASSERT(error >= 0);
         return error;
  }
  
  ssize_t                                /* bytes written, or (-) error */
  xfs_write(
-       bhv_desc_t              *bdp,
+       struct xfs_inode        *xip,
         struct kiocb            *iocb,
         const struct iovec      *iovp,
         unsigned int            nsegs,
         loff_t                  *offset,
-       int                     ioflags,
-       cred_t                  *credp)
+       int                     ioflags)
  {
         struct file             *file = iocb->ki_filp;
         struct address_space    *mapping = file->f_mapping;
         struct inode            *inode = mapping->host;
         unsigned long           segs = nsegs;
-       xfs_inode_t             *xip;
         xfs_mount_t             *mp;
         ssize_t                 ret = 0, error = 0;
         xfs_fsize_t             isize, new_size;
-       xfs_iocore_t            *io;
-       vnode_t                 *vp;
-       unsigned long           seg;
         int                     iolock;
         int                     eventsent = 0;
-       vrwlock_t               locktype;
         size_t                  ocount = 0, count;
         loff_t                  pos;
-       int                     need_isem = 1, need_flush = 0;
+       int                     need_i_mutex;
  
         XFS_STATS_INC(xs_write_calls);
  
-       vp = BHV_TO_VNODE(bdp);
-       xip = XFS_BHVTOI(bdp);
-
-       for (seg = 0; seg < segs; seg++) {
-               const struct iovec *iv = &iovp[seg];
-
-               /*
-                * If any segment has a negative length, or the cumulative
-                * length ever wraps negative then return -EINVAL.
-                */
-               ocount += iv->iov_len;
-               if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
-                       return -EINVAL;
-               if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
-                       continue;
-               if (seg == 0)
-                       return -EFAULT;
-               segs = seg;
-               ocount -= iv->iov_len;  /* This segment is no good */
-               break;
-       }
+       error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
+       if (error)
+               return error;
  
         count = ocount;
         pos = *offset;
@@ -611,97 +588,87 @@ xfs_write(
         if (count == 0)
                 return 0;
  
-       io = &xip->i_iocore;
-       mp = io->io_mount;
+       mp = xip->i_mount;
+
+       xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
  
         if (XFS_FORCED_SHUTDOWN(mp))
                 return -EIO;
  
-       fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);
-
-       if (ioflags & IO_ISDIRECT) {
-               xfs_buftarg_t   *target =
-                       (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
-                               mp->m_rtdev_targp : mp->m_ddev_targp;
-
-               if ((pos & target->pbr_smask) || (count & target->pbr_smask))
-                       return XFS_ERROR(-EINVAL);
-
-               if (!VN_CACHED(vp) && pos < i_size_read(inode))
-                       need_isem = 0;
-
-               if (VN_CACHED(vp))
-                       need_flush = 1;
-       }
-
  relock:
-       if (need_isem) {
+       if (ioflags & IO_ISDIRECT) {
+               iolock = XFS_IOLOCK_SHARED;
+               need_i_mutex = 0;
+       } else {
                 iolock = XFS_IOLOCK_EXCL;
-               locktype = VRWLOCK_WRITE;
-
+               need_i_mutex = 1;
                 mutex_lock(&inode->i_mutex);
-       } else {
-               iolock = XFS_IOLOCK_SHARED;
-               locktype = VRWLOCK_WRITE_DIRECT;
         }
  
         xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
  
-       isize = i_size_read(inode);
-
-       if (file->f_flags & O_APPEND)
-               *offset = isize;
-
  start:
         error = -generic_write_checks(file, &pos, &count,
                                         S_ISBLK(inode->i_mode));
         if (error) {
                 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
-               goto out_unlock_isem;
+               goto out_unlock_mutex;
         }
  
-       new_size = pos + count;
-       if (new_size > isize)
-               io->io_new_size = new_size;
-
-       if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
+       if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
             !(ioflags & IO_INVIS) && !eventsent)) {
-               loff_t          savedsize = pos;
                 int             dmflags = FILP_DELAY_FLAG(file);
  
-               if (need_isem)
+               if (need_i_mutex)
                         dmflags |= DM_FLAGS_IMUX;
  
                 xfs_iunlock(xip, XFS_ILOCK_EXCL);
-               error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
-                                     pos, count,
-                                     dmflags, &locktype);
+               error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
+                                     pos, count, dmflags, &iolock);
                 if (error) {
-                       xfs_iunlock(xip, iolock);
-                       goto out_unlock_isem;
+                       goto out_unlock_internal;
                 }
                 xfs_ilock(xip, XFS_ILOCK_EXCL);
                 eventsent = 1;
  
                 /*
-                * The iolock was dropped and reaquired in XFS_SEND_DATA
+                * The iolock was dropped and reacquired in XFS_SEND_DATA
                  * so we have to recheck the size when appending.
                  * We will only "goto start;" once, since having sent the
                  * event prevents another call to XFS_SEND_DATA, which is
                  * what allows the size to change in the first place.
                  */
-               if ((file->f_flags & O_APPEND) && savedsize != isize) {
-                       pos = isize = xip->i_d.di_size;
+               if ((file->f_flags & O_APPEND) && pos != xip->i_size)
                         goto start;
-               }
         }
  
-       if (likely(!(ioflags & IO_INVIS))) {
-               file_update_time(file);
-               xfs_ichgtime_fast(xip, inode,
-                                 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       if (ioflags & IO_ISDIRECT) {
+               xfs_buftarg_t   *target =
+                       XFS_IS_REALTIME_INODE(xip) ?
+                               mp->m_rtdev_targp : mp->m_ddev_targp;
+
+               if ((pos & target->bt_smask) || (count & target->bt_smask)) {
+                       xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+                       return XFS_ERROR(-EINVAL);
+               }
+
+               if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
+                       xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+                       iolock = XFS_IOLOCK_EXCL;
+                       need_i_mutex = 1;
+                       mutex_lock(&inode->i_mutex);
+                       xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
+                       goto start;
+               }
         }
  
+       new_size = pos + count;
+       if (new_size > xip->i_size)
+               xip->i_new_size = new_size;
+
+       if (likely(!(ioflags & IO_INVIS)))
+               xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
         /*
          * If the offset is beyond the size of the file, we have a couple
          * of things to do. First, if there is already space allocated
@@ -711,12 +678,11 @@ start:
          * to zero it out up to the new size.
          */
  
-       if (pos > isize) {
-               error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos,
-                                       isize, pos + count);
+       if (pos > xip->i_size) {
+               error = xfs_zero_eof(xip, pos, xip->i_size);
                 if (error) {
-                       xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
-                       goto out_unlock_isem;
+                       xfs_iunlock(xip, XFS_ILOCK_EXCL);
+                       goto out_unlock_internal;
                 }
         }
         xfs_iunlock(xip, XFS_ILOCK_EXCL);
@@ -734,36 +700,37 @@ start:
              !capable(CAP_FSETID)) {
                 error = xfs_write_clear_setuid(xip);
                 if (likely(!error))
-                       error = -remove_suid(file->f_dentry);
+                       error = -file_remove_suid(file);
                 if (unlikely(error)) {
-                       xfs_iunlock(xip, iolock);
-                       goto out_unlock_isem;
+                       goto out_unlock_internal;
                 }
         }
  
-retry:
         /* We can write back this queue in page reclaim */
         current->backing_dev_info = mapping->backing_dev_info;
  
         if ((ioflags & IO_ISDIRECT)) {
-               if (need_flush) {
-                       xfs_inval_cached_trace(io, pos, -1,
-                                       ctooff(offtoct(pos)), -1);
-                       VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)),
+               if (mapping->nrpages) {
+                       WARN_ON(need_i_mutex == 0);
+                       xfs_inval_cached_trace(xip, pos, -1,
+                                       (pos & PAGE_CACHE_MASK), -1);
+                       error = xfs_flushinval_pages(xip,
+                                       (pos & PAGE_CACHE_MASK),
                                         -1, FI_REMAPF_LOCKED);
+                       if (error)
+                               goto out_unlock_internal;
                 }
  
-               if (need_isem) {
+               if (need_i_mutex) {
                         /* demote the lock now the cached pages are gone */
-                       XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
+                       xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
                         mutex_unlock(&inode->i_mutex);
  
                         iolock = XFS_IOLOCK_SHARED;
-                       locktype = VRWLOCK_WRITE_DIRECT;
-                       need_isem = 0;
+                       need_i_mutex = 0;
                 }
  
-               xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs,
+               xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs,
                                 *offset, ioflags);
                 ret = generic_file_direct_write(iocb, iovp,
                                 &segs, pos, offset, count, ocount);
@@ -778,16 +745,31 @@ retry:
                         pos += ret;
                         count -= ret;
  
-                       need_isem = 1;
                         ioflags &= ~IO_ISDIRECT;
                         xfs_iunlock(xip, iolock);
                         goto relock;
                 }
         } else {
-               xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs,
+               int enospc = 0;
+               ssize_t ret2 = 0;
+
+write_retry:
+               xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
                                 *offset, ioflags);
-               ret = generic_file_buffered_write(iocb, iovp, segs,
+               ret2 = generic_file_buffered_write(iocb, iovp, segs,
                                 pos, offset, count, ret);
+               /*
+                * if we just got an ENOSPC, flush the inode now we
+                * aren't holding any page locks and retry *once*
+                */
+               if (ret2 == -ENOSPC && !enospc) {
+                       error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
+                       if (error)
+                               goto out_unlock_internal;
+                       enospc = 1;
+                       goto write_retry;
+               }
+               ret = ret2;
         }
  
         current->backing_dev_info = NULL;
@@ -795,35 +777,31 @@ retry:
         if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
                 ret = wait_on_sync_kiocb(iocb);
  
-       if ((ret == -ENOSPC) &&
-           DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
-           !(ioflags & IO_INVIS)) {
+       isize = i_size_read(inode);
+       if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
+               *offset = isize;
+
+       if (*offset > xip->i_size) {
+               xfs_ilock(xip, XFS_ILOCK_EXCL);
+               if (*offset > xip->i_size)
+                       xip->i_size = *offset;
+               xfs_iunlock(xip, XFS_ILOCK_EXCL);
+       }
  
-               xfs_rwunlock(bdp, locktype);
-               if (need_isem)
+       if (ret == -ENOSPC &&
+           DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
+               xfs_iunlock(xip, iolock);
+               if (need_i_mutex)
                         mutex_unlock(&inode->i_mutex);
-               error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
-                               DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
+               error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
+                               DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
                                 0, 0, 0); /* Delay flag intentionally  unused */
-               if (error)
-                       goto out_nounlocks;
-               if (need_isem)
+               if (need_i_mutex)
                         mutex_lock(&inode->i_mutex);
-               xfs_rwlock(bdp, locktype);
-               pos = xip->i_d.di_size;
-               ret = 0;
-               goto retry;
-       }
-
-       if (*offset > xip->i_d.di_size) {
-               xfs_ilock(xip, XFS_ILOCK_EXCL);
-               if (*offset > xip->i_d.di_size) {
-                       xip->i_d.di_size = *offset;
-                       i_size_write(inode, *offset);
-                       xip->i_update_core = 1;
-                       xip->i_update_size = 1;
-               }
-               xfs_iunlock(xip, XFS_ILOCK_EXCL);
+               xfs_ilock(xip, iolock);
+               if (error)
+                       goto out_unlock_internal;
+               goto start;
         }
  
         error = -ret;
@@ -834,96 +812,41 @@ retry:
  
         /* Handle various SYNC-type writes */
         if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
-               /*
-                * If we're treating this as O_DSYNC and we have not updated the
-                * size, force the log.
-                */
-               if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
-                   !(xip->i_update_size)) {
-                       xfs_inode_log_item_t    *iip = xip->i_itemp;
-
-                       /*
-                        * If an allocation transaction occurred
-                        * without extending the size, then we have to force
-                        * the log up the proper point to ensure that the
-                        * allocation is permanent.  We can't count on
-                        * the fact that buffered writes lock out direct I/O
-                        * writes - the direct I/O write could have extended
-                        * the size nontransactionally, then finished before
-                        * we started.  xfs_write_file will think that the file
-                        * didn't grow but the update isn't safe unless the
-                        * size change is logged.
-                        *
-                        * Force the log if we've committed a transaction
-                        * against the inode or if someone else has and
-                        * the commit record hasn't gone to disk (e.g.
-                        * the inode is pinned).  This guarantees that
-                        * all changes affecting the inode are permanent
-                        * when we return.
-                        */
-                       if (iip && iip->ili_last_lsn) {
-                               xfs_log_force(mp, iip->ili_last_lsn,
-                                               XFS_LOG_FORCE | XFS_LOG_SYNC);
-                       } else if (xfs_ipincount(xip) > 0) {
-                               xfs_log_force(mp, (xfs_lsn_t)0,
-                                               XFS_LOG_FORCE | XFS_LOG_SYNC);
-                       }
-
-               } else {
-                       xfs_trans_t     *tp;
-
-                       /*
-                        * O_SYNC or O_DSYNC _with_ a size update are handled
-                        * the same way.
-                        *
-                        * If the write was synchronous then we need to make
-                        * sure that the inode modification time is permanent.
-                        * We'll have updated the timestamp above, so here
-                        * we use a synchronous transaction to log the inode.
-                        * It's not fast, but it's necessary.
-                        *
-                        * If this a dsync write and the size got changed
-                        * non-transactionally, then we need to ensure that
-                        * the size change gets logged in a synchronous
-                        * transaction.
-                        */
+               int error2;
  
-                       tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
-                       if ((error = xfs_trans_reserve(tp, 0,
-                                                     XFS_SWRITE_LOG_RES(mp),
-                                                     0, 0, 0))) {
-                               /* Transaction reserve failed */
-                               xfs_trans_cancel(tp, 0);
-                       } else {
-                               /* Transaction reserve successful */
-                               xfs_ilock(xip, XFS_ILOCK_EXCL);
-                               xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
-                               xfs_trans_ihold(tp, xip);
-                               xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
-                               xfs_trans_set_sync(tp);
-                               error = xfs_trans_commit(tp, 0, NULL);
-                               xfs_iunlock(xip, XFS_ILOCK_EXCL);
-                       }
-                       if (error)
-                               goto out_unlock_internal;
-               }
-       
-               xfs_rwunlock(bdp, locktype);
-               if (need_isem)
+               xfs_iunlock(xip, iolock);
+               if (need_i_mutex)
                         mutex_unlock(&inode->i_mutex);
-
-               error = sync_page_range(inode, mapping, pos, ret);
+               error2 = sync_page_range(inode, mapping, pos, ret);
                 if (!error)
-                       error = ret;
-               return error;
+                       error = error2;
+               if (need_i_mutex)
+                       mutex_lock(&inode->i_mutex);
+               xfs_ilock(xip, iolock);
+               error2 = xfs_write_sync_logforce(mp, xip);
+               if (!error)
+                       error = error2;
         }
  
   out_unlock_internal:
-       xfs_rwunlock(bdp, locktype);
- out_unlock_isem:
-       if (need_isem)
+       if (xip->i_new_size) {
+               xfs_ilock(xip, XFS_ILOCK_EXCL);
+               xip->i_new_size = 0;
+               /*
+                * If this was a direct or synchronous I/O that failed (such
+                * as ENOSPC) then part of the I/O may have been written to
+                * disk before the error occurred.  In this case the on-disk
+                * file size may have been adjusted beyond the in-memory file
+                * size and now needs to be truncated back.
+                */
+               if (xip->i_d.di_size > xip->i_size)
+                       xip->i_d.di_size = xip->i_size;
+               xfs_iunlock(xip, XFS_ILOCK_EXCL);
+       }
+       xfs_iunlock(xip, iolock);
+ out_unlock_mutex:
+       if (need_i_mutex)
                 mutex_unlock(&inode->i_mutex);
- out_nounlocks:
         return -error;
  }
  
@@ -936,13 +859,7 @@ retry:
  int
  xfs_bdstrat_cb(struct xfs_buf *bp)
  {
-       xfs_mount_t     *mp;
-
-       mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
-       if (!XFS_FORCED_SHUTDOWN(mp)) {
-               pagebuf_iorequest(bp);
-               return 0;
-       } else {
+       if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
                 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
                 /*
                  * Metadata write that didn't get logged but
@@ -955,50 +872,29 @@ xfs_bdstrat_cb(struct xfs_buf *bp)
                 else
                         return (xfs_bioerror(bp));
         }
-}
-
  
-int
-xfs_bmap(bhv_desc_t    *bdp,
-       xfs_off_t       offset,
-       ssize_t         count,
-       int             flags,
-       xfs_iomap_t     *iomapp,
-       int             *niomaps)
-{
-       xfs_inode_t     *ip = XFS_BHVTOI(bdp);
-       xfs_iocore_t    *io = &ip->i_iocore;
-
-       ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
-       ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
-              ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
-
-       return xfs_iomap(io, offset, count, flags, iomapp, niomaps);
+       xfs_buf_iorequest(bp);
+       return 0;
  }
  
  /*
- * Wrapper around bdstrat so that we can stop data
- * from going to disk in case we are shutting down the filesystem.
- * Typically user data goes thru this path; one of the exceptions
- * is the superblock.
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem.  Typically user data goes thru this
+ * path; one of the exceptions is the superblock.
   */
-int
+void
  xfsbdstrat(
         struct xfs_mount        *mp,
         struct xfs_buf          *bp)
  {
         ASSERT(mp);
         if (!XFS_FORCED_SHUTDOWN(mp)) {
-               /* Grio redirection would go here
-                * if (XFS_BUF_IS_GRIO(bp)) {
-                */
-
-               pagebuf_iorequest(bp);
-               return 0;
+               xfs_buf_iorequest(bp);
+               return;
         }
  
         xfs_buftrace("XFSBDSTRAT IOERROR", bp);
-       return (xfs_bioerror_relse(bp));
+       xfs_bioerror_relse(bp);
  }
  
  /*