SAFE public projects git trees. - safe/jmp/linux-2.6/blob - fs/xfs/linux-2.6/xfs_file.c

   1 /*
   2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include "xfs.h"
  19 #include "xfs_fs.h"
  20 #include "xfs_bit.h"
  21 #include "xfs_log.h"
  22 #include "xfs_inum.h"
  23 #include "xfs_sb.h"
  24 #include "xfs_ag.h"
  25 #include "xfs_dir2.h"
  26 #include "xfs_trans.h"
  27 #include "xfs_dmapi.h"
  28 #include "xfs_mount.h"
  29 #include "xfs_bmap_btree.h"
  30 #include "xfs_alloc_btree.h"
  31 #include "xfs_ialloc_btree.h"
  32 #include "xfs_alloc.h"
  33 #include "xfs_btree.h"
  34 #include "xfs_attr_sf.h"
  35 #include "xfs_dir2_sf.h"
  36 #include "xfs_dinode.h"
  37 #include "xfs_inode.h"
  38 #include "xfs_bmap.h"
  39 #include "xfs_error.h"
  40 #include "xfs_rw.h"
  41 #include "xfs_vnodeops.h"
  42 #include "xfs_da_btree.h"
  43 #include "xfs_ioctl.h"
  44 #include "xfs_trace.h"
  45
  46 #include <linux/dcache.h>
  47
  48 static const struct vm_operations_struct xfs_file_vm_ops;
  49
  50 /*
  51  *      xfs_iozero
  52  *
  53  *      xfs_iozero clears the specified range of buffer supplied,
  54  *      and marks all the affected blocks as valid and modified.  If
  55  *      an affected block is not allocated, it will be allocated.  If
  56  *      an affected block is not completely overwritten, and is not
  57  *      valid before the operation, it will be read from disk before
  58  *      being partially zeroed.
  59  */
  60 STATIC int
  61 xfs_iozero(
  62         struct xfs_inode        *ip,    /* inode                        */
  63         loff_t                  pos,    /* offset in file               */
  64         size_t                  count)  /* size of data to zero         */
  65 {
  66         struct page             *page;
  67         struct address_space    *mapping;
  68         int                     status;
  69
  70         mapping = VFS_I(ip)->i_mapping;
  71         do {
  72                 unsigned offset, bytes;
  73                 void *fsdata;
  74
  75                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
  76                 bytes = PAGE_CACHE_SIZE - offset;
  77                 if (bytes > count)
  78                         bytes = count;
  79
  80                 status = pagecache_write_begin(NULL, mapping, pos, bytes,
  81                                         AOP_FLAG_UNINTERRUPTIBLE,
  82                                         &page, &fsdata);
  83                 if (status)
  84                         break;
  85
  86                 zero_user(page, offset, bytes);
  87
  88                 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
  89                                         page, fsdata);
  90                 WARN_ON(status <= 0); /* can't return less than zero! */
  91                 pos += bytes;
  92                 count -= bytes;
  93                 status = 0;
  94         } while (count);
  95
  96         return (-status);
  97 }
  98
  99 STATIC ssize_t
 100 xfs_file_aio_read(
 101         struct kiocb            *iocb,
 102         const struct iovec      *iovp,
 103         unsigned long           nr_segs,
 104         loff_t                  pos)
 105 {
 106         struct file             *file = iocb->ki_filp;
 107         struct inode            *inode = file->f_mapping->host;
 108         struct xfs_inode        *ip = XFS_I(inode);
 109         struct xfs_mount        *mp = ip->i_mount;
 110         size_t                  size = 0;
 111         ssize_t                 ret = 0;
 112         int                     ioflags = 0;
 113         xfs_fsize_t             n;
 114         unsigned long           seg;
 115
 116         XFS_STATS_INC(xs_read_calls);
 117
 118         BUG_ON(iocb->ki_pos != pos);
 119
 120         if (unlikely(file->f_flags & O_DIRECT))
 121                 ioflags |= IO_ISDIRECT;
 122         if (file->f_mode & FMODE_NOCMTIME)
 123                 ioflags |= IO_INVIS;
 124
 125         /* START copy & waste from filemap.c */
 126         for (seg = 0; seg < nr_segs; seg++) {
 127                 const struct iovec *iv = &iovp[seg];
 128
 129                 /*
 130                  * If any segment has a negative length, or the cumulative
 131                  * length ever wraps negative then return -EINVAL.
 132                  */
 133                 size += iv->iov_len;
 134                 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
 135                         return XFS_ERROR(-EINVAL);
 136         }
 137         /* END copy & waste from filemap.c */
 138
 139         if (unlikely(ioflags & IO_ISDIRECT)) {
 140                 xfs_buftarg_t   *target =
 141                         XFS_IS_REALTIME_INODE(ip) ?
 142                                 mp->m_rtdev_targp : mp->m_ddev_targp;
 143                 if ((iocb->ki_pos & target->bt_smask) ||
 144                     (size & target->bt_smask)) {
 145                         if (iocb->ki_pos == ip->i_size)
 146                                 return 0;
 147                         return -XFS_ERROR(EINVAL);
 148                 }
 149         }
 150
 151         n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
 152         if (n <= 0 || size == 0)
 153                 return 0;
 154
 155         if (n < size)
 156                 size = n;
 157
 158         if (XFS_FORCED_SHUTDOWN(mp))
 159                 return -EIO;
 160
 161         if (unlikely(ioflags & IO_ISDIRECT))
 162                 mutex_lock(&inode->i_mutex);
 163         xfs_ilock(ip, XFS_IOLOCK_SHARED);
 164
 165         if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
 166                 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
 167                 int iolock = XFS_IOLOCK_SHARED;
 168
 169                 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
 170                                         dmflags, &iolock);
 171                 if (ret) {
 172                         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 173                         if (unlikely(ioflags & IO_ISDIRECT))
 174                                 mutex_unlock(&inode->i_mutex);
 175                         return ret;
 176                 }
 177         }
 178
 179         if (unlikely(ioflags & IO_ISDIRECT)) {
 180                 if (inode->i_mapping->nrpages) {
 181                         ret = -xfs_flushinval_pages(ip,
 182                                         (iocb->ki_pos & PAGE_CACHE_MASK),
 183                                         -1, FI_REMAPF_LOCKED);
 184                 }
 185                 mutex_unlock(&inode->i_mutex);
 186                 if (ret) {
 187                         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 188                         return ret;
 189                 }
 190         }
 191
 192         trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
 193
 194         ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
 195         if (ret > 0)
 196                 XFS_STATS_ADD(xs_read_bytes, ret);
 197
 198         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 199         return ret;
 200 }
 201
 202 STATIC ssize_t
 203 xfs_file_splice_read(
 204         struct file             *infilp,
 205         loff_t                  *ppos,
 206         struct pipe_inode_info  *pipe,
 207         size_t                  count,
 208         unsigned int            flags)
 209 {
 210         struct xfs_inode        *ip = XFS_I(infilp->f_mapping->host);
 211         struct xfs_mount        *mp = ip->i_mount;
 212         int                     ioflags = 0;
 213         ssize_t                 ret;
 214
 215         XFS_STATS_INC(xs_read_calls);
 216
 217         if (infilp->f_mode & FMODE_NOCMTIME)
 218                 ioflags |= IO_INVIS;
 219
 220         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 221                 return -EIO;
 222
 223         xfs_ilock(ip, XFS_IOLOCK_SHARED);
 224
 225         if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
 226                 int iolock = XFS_IOLOCK_SHARED;
 227                 int error;
 228
 229                 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
 230                                         FILP_DELAY_FLAG(infilp), &iolock);
 231                 if (error) {
 232                         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 233                         return -error;
 234                 }
 235         }
 236
 237         trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 238
 239         ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 240         if (ret > 0)
 241                 XFS_STATS_ADD(xs_read_bytes, ret);
 242
 243         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 244         return ret;
 245 }
 246
 247 STATIC ssize_t
 248 xfs_file_splice_write(
 249         struct pipe_inode_info  *pipe,
 250         struct file             *outfilp,
 251         loff_t                  *ppos,
 252         size_t                  count,
 253         unsigned int            flags)
 254 {
 255         struct inode            *inode = outfilp->f_mapping->host;
 256         struct xfs_inode        *ip = XFS_I(inode);
 257         struct xfs_mount        *mp = ip->i_mount;
 258         xfs_fsize_t             isize, new_size;
 259         int                     ioflags = 0;
 260         ssize_t                 ret;
 261
 262         XFS_STATS_INC(xs_write_calls);
 263
 264         if (outfilp->f_mode & FMODE_NOCMTIME)
 265                 ioflags |= IO_INVIS;
 266
 267         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 268                 return -EIO;
 269
 270         xfs_ilock(ip, XFS_IOLOCK_EXCL);
 271
 272         if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
 273                 int iolock = XFS_IOLOCK_EXCL;
 274                 int error;
 275
 276                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
 277                                         FILP_DELAY_FLAG(outfilp), &iolock);
 278                 if (error) {
 279                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 280                         return -error;
 281                 }
 282         }
 283
 284         new_size = *ppos + count;
 285
 286         xfs_ilock(ip, XFS_ILOCK_EXCL);
 287         if (new_size > ip->i_size)
 288                 ip->i_new_size = new_size;
 289         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 290
 291         trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
 292
 293         ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
 294         if (ret > 0)
 295                 XFS_STATS_ADD(xs_write_bytes, ret);
 296
 297         isize = i_size_read(inode);
 298         if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
 299                 *ppos = isize;
 300
 301         if (*ppos > ip->i_size) {
 302                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 303                 if (*ppos > ip->i_size)
 304                         ip->i_size = *ppos;
 305                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 306         }
 307
 308         if (ip->i_new_size) {
 309                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 310                 ip->i_new_size = 0;
 311                 if (ip->i_d.di_size > ip->i_size)
 312                         ip->i_d.di_size = ip->i_size;
 313                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 314         }
 315         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 316         return ret;
 317 }
 318
 319 /*
 320  * This routine is called to handle zeroing any space in the last
 321  * block of the file that is beyond the EOF.  We do this since the
 322  * size is being increased without writing anything to that block
 323  * and we don't want anyone to read the garbage on the disk.
 324  */
 325 STATIC int                              /* error (positive) */
 326 xfs_zero_last_block(
 327         xfs_inode_t     *ip,
 328         xfs_fsize_t     offset,
 329         xfs_fsize_t     isize)
 330 {
 331         xfs_fileoff_t   last_fsb;
 332         xfs_mount_t     *mp = ip->i_mount;
 333         int             nimaps;
 334         int             zero_offset;
 335         int             zero_len;
 336         int             error = 0;
 337         xfs_bmbt_irec_t imap;
 338
 339         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 340
 341         zero_offset = XFS_B_FSB_OFFSET(mp, isize);
 342         if (zero_offset == 0) {
 343                 /*
 344                  * There are no extra bytes in the last block on disk to
 345                  * zero, so return.
 346                  */
 347                 return 0;
 348         }
 349
 350         last_fsb = XFS_B_TO_FSBT(mp, isize);
 351         nimaps = 1;
 352         error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
 353                           &nimaps, NULL, NULL);
 354         if (error) {
 355                 return error;
 356         }
 357         ASSERT(nimaps > 0);
 358         /*
 359          * If the block underlying isize is just a hole, then there
 360          * is nothing to zero.
 361          */
 362         if (imap.br_startblock == HOLESTARTBLOCK) {
 363                 return 0;
 364         }
 365         /*
 366          * Zero the part of the last block beyond the EOF, and write it
 367          * out sync.  We need to drop the ilock while we do this so we
 368          * don't deadlock when the buffer cache calls back to us.
 369          */
 370         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 371
 372         zero_len = mp->m_sb.sb_blocksize - zero_offset;
 373         if (isize + zero_len > offset)
 374                 zero_len = offset - isize;
 375         error = xfs_iozero(ip, isize, zero_len);
 376
 377         xfs_ilock(ip, XFS_ILOCK_EXCL);
 378         ASSERT(error >= 0);
 379         return error;
 380 }
 381
 382 /*
 383  * Zero any on disk space between the current EOF and the new,
 384  * larger EOF.  This handles the normal case of zeroing the remainder
 385  * of the last block in the file and the unusual case of zeroing blocks
 386  * out beyond the size of the file.  This second case only happens
 387  * with fixed size extents and when the system crashes before the inode
 388  * size was updated but after blocks were allocated.  If fill is set,
 389  * then any holes in the range are filled and zeroed.  If not, the holes
 390  * are left alone as holes.
 391  */
 392
 393 int                                     /* error (positive) */
 394 xfs_zero_eof(
 395         xfs_inode_t     *ip,
 396         xfs_off_t       offset,         /* starting I/O offset */
 397         xfs_fsize_t     isize)          /* current inode size */
 398 {
 399         xfs_mount_t     *mp = ip->i_mount;
 400         xfs_fileoff_t   start_zero_fsb;
 401         xfs_fileoff_t   end_zero_fsb;
 402         xfs_fileoff_t   zero_count_fsb;
 403         xfs_fileoff_t   last_fsb;
 404         xfs_fileoff_t   zero_off;
 405         xfs_fsize_t     zero_len;
 406         int             nimaps;
 407         int             error = 0;
 408         xfs_bmbt_irec_t imap;
 409
 410         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
 411         ASSERT(offset > isize);
 412
 413         /*
 414          * First handle zeroing the block on which isize resides.
 415          * We only zero a part of that block so it is handled specially.
 416          */
 417         error = xfs_zero_last_block(ip, offset, isize);
 418         if (error) {
 419                 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
 420                 return error;
 421         }
 422
 423         /*
 424          * Calculate the range between the new size and the old
 425          * where blocks needing to be zeroed may exist.  To get the
 426          * block where the last byte in the file currently resides,
 427          * we need to subtract one from the size and truncate back
 428          * to a block boundary.  We subtract 1 in case the size is
 429          * exactly on a block boundary.
 430          */
 431         last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
 432         start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
 433         end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
 434         ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
 435         if (last_fsb == end_zero_fsb) {
 436                 /*
 437                  * The size was only incremented on its last block.
 438                  * We took care of that above, so just return.
 439                  */
 440                 return 0;
 441         }
 442
 443         ASSERT(start_zero_fsb <= end_zero_fsb);
 444         while (start_zero_fsb <= end_zero_fsb) {
 445                 nimaps = 1;
 446                 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
 447                 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
 448                                   0, NULL, 0, &imap, &nimaps, NULL, NULL);
 449                 if (error) {
 450                         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
 451                         return error;
 452                 }
 453                 ASSERT(nimaps > 0);
 454
 455                 if (imap.br_state == XFS_EXT_UNWRITTEN ||
 456                     imap.br_startblock == HOLESTARTBLOCK) {
 457                         /*
 458                          * This loop handles initializing pages that were
 459                          * partially initialized by the code below this
 460                          * loop. It basically zeroes the part of the page
 461                          * that sits on a hole and sets the page as P_HOLE
 462                          * and calls remapf if it is a mapped file.
 463                          */
 464                         start_zero_fsb = imap.br_startoff + imap.br_blockcount;
 465                         ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
 466                         continue;
 467                 }
 468
 469                 /*
 470                  * There are blocks we need to zero.
 471                  * Drop the inode lock while we're doing the I/O.
 472                  * We'll still have the iolock to protect us.
 473                  */
 474                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 475
 476                 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
 477                 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
 478
 479                 if ((zero_off + zero_len) > offset)
 480                         zero_len = offset - zero_off;
 481
 482                 error = xfs_iozero(ip, zero_off, zero_len);
 483                 if (error) {
 484                         goto out_lock;
 485                 }
 486
 487                 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
 488                 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
 489
 490                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 491         }
 492
 493         return 0;
 494
 495 out_lock:
 496         xfs_ilock(ip, XFS_ILOCK_EXCL);
 497         ASSERT(error >= 0);
 498         return error;
 499 }
 500
 501 STATIC ssize_t
 502 xfs_file_aio_write(
 503         struct kiocb            *iocb,
 504         const struct iovec      *iovp,
 505         unsigned long           nr_segs,
 506         loff_t                  pos)
 507 {
 508         struct file             *file = iocb->ki_filp;
 509         struct address_space    *mapping = file->f_mapping;
 510         struct inode            *inode = mapping->host;
 511         struct xfs_inode        *ip = XFS_I(inode);
 512         struct xfs_mount        *mp = ip->i_mount;
 513         ssize_t                 ret = 0, error = 0;
 514         int                     ioflags = 0;
 515         xfs_fsize_t             isize, new_size;
 516         int                     iolock;
 517         int                     eventsent = 0;
 518         size_t                  ocount = 0, count;
 519         int                     need_i_mutex;
 520
 521         XFS_STATS_INC(xs_write_calls);
 522
 523         BUG_ON(iocb->ki_pos != pos);
 524
 525         if (unlikely(file->f_flags & O_DIRECT))
 526                 ioflags |= IO_ISDIRECT;
 527         if (file->f_mode & FMODE_NOCMTIME)
 528                 ioflags |= IO_INVIS;
 529
 530         error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
 531         if (error)
 532                 return error;
 533
 534         count = ocount;
 535         if (count == 0)
 536                 return 0;
 537
 538         xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
 539
 540         if (XFS_FORCED_SHUTDOWN(mp))
 541                 return -EIO;
 542
 543 relock:
 544         if (ioflags & IO_ISDIRECT) {
 545                 iolock = XFS_IOLOCK_SHARED;
 546                 need_i_mutex = 0;
 547         } else {
 548                 iolock = XFS_IOLOCK_EXCL;
 549                 need_i_mutex = 1;
 550                 mutex_lock(&inode->i_mutex);
 551         }
 552
 553         xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
 554
 555 start:
 556         error = -generic_write_checks(file, &pos, &count,
 557                                         S_ISBLK(inode->i_mode));
 558         if (error) {
 559                 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
 560                 goto out_unlock_mutex;
 561         }
 562
 563         if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
 564             !(ioflags & IO_INVIS) && !eventsent)) {
 565                 int             dmflags = FILP_DELAY_FLAG(file);
 566
 567                 if (need_i_mutex)
 568                         dmflags |= DM_FLAGS_IMUX;
 569
 570                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 571                 error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
 572                                       pos, count, dmflags, &iolock);
 573                 if (error) {
 574                         goto out_unlock_internal;
 575                 }
 576                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 577                 eventsent = 1;
 578
 579                 /*
 580                  * The iolock was dropped and reacquired in XFS_SEND_DATA
 581                  * so we have to recheck the size when appending.
 582                  * We will only "goto start;" once, since having sent the
 583                  * event prevents another call to XFS_SEND_DATA, which is
 584                  * what allows the size to change in the first place.
 585                  */
 586                 if ((file->f_flags & O_APPEND) && pos != ip->i_size)
 587                         goto start;
 588         }
 589
 590         if (ioflags & IO_ISDIRECT) {
 591                 xfs_buftarg_t   *target =
 592                         XFS_IS_REALTIME_INODE(ip) ?
 593                                 mp->m_rtdev_targp : mp->m_ddev_targp;
 594
 595                 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
 596                         xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
 597                         return XFS_ERROR(-EINVAL);
 598                 }
 599
 600                 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
 601                         xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
 602                         iolock = XFS_IOLOCK_EXCL;
 603                         need_i_mutex = 1;
 604                         mutex_lock(&inode->i_mutex);
 605                         xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
 606                         goto start;
 607                 }
 608         }
 609
 610         new_size = pos + count;
 611         if (new_size > ip->i_size)
 612                 ip->i_new_size = new_size;
 613
 614         if (likely(!(ioflags & IO_INVIS)))
 615                 file_update_time(file);
 616
 617         /*
 618          * If the offset is beyond the size of the file, we have a couple
 619          * of things to do. First, if there is already space allocated
 620          * we need to either create holes or zero the disk or ...
 621          *
 622          * If there is a page where the previous size lands, we need
 623          * to zero it out up to the new size.
 624          */
 625
 626         if (pos > ip->i_size) {
 627                 error = xfs_zero_eof(ip, pos, ip->i_size);
 628                 if (error) {
 629                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 630                         goto out_unlock_internal;
 631                 }
 632         }
 633         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 634
 635         /*
 636          * If we're writing the file then make sure to clear the
 637          * setuid and setgid bits if the process is not being run
 638          * by root.  This keeps people from modifying setuid and
 639          * setgid binaries.
 640          */
 641         error = -file_remove_suid(file);
 642         if (unlikely(error))
 643                 goto out_unlock_internal;
 644
 645         /* We can write back this queue in page reclaim */
 646         current->backing_dev_info = mapping->backing_dev_info;
 647
 648         if ((ioflags & IO_ISDIRECT)) {
 649                 if (mapping->nrpages) {
 650                         WARN_ON(need_i_mutex == 0);
 651                         error = xfs_flushinval_pages(ip,
 652                                         (pos & PAGE_CACHE_MASK),
 653                                         -1, FI_REMAPF_LOCKED);
 654                         if (error)
 655                                 goto out_unlock_internal;
 656                 }
 657
 658                 if (need_i_mutex) {
 659                         /* demote the lock now the cached pages are gone */
 660                         xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 661                         mutex_unlock(&inode->i_mutex);
 662
 663                         iolock = XFS_IOLOCK_SHARED;
 664                         need_i_mutex = 0;
 665                 }
 666
 667                 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
 668                 ret = generic_file_direct_write(iocb, iovp,
 669                                 &nr_segs, pos, &iocb->ki_pos, count, ocount);
 670
 671                 /*
 672                  * direct-io write to a hole: fall through to buffered I/O
 673                  * for completing the rest of the request.
 674                  */
 675                 if (ret >= 0 && ret != count) {
 676                         XFS_STATS_ADD(xs_write_bytes, ret);
 677
 678                         pos += ret;
 679                         count -= ret;
 680
 681                         ioflags &= ~IO_ISDIRECT;
 682                         xfs_iunlock(ip, iolock);
 683                         goto relock;
 684                 }
 685         } else {
 686                 int enospc = 0;
 687                 ssize_t ret2 = 0;
 688
 689 write_retry:
 690                 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
 691                 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
 692                                 pos, &iocb->ki_pos, count, ret);
 693                 /*
 694                  * if we just got an ENOSPC, flush the inode now we
 695                  * aren't holding any page locks and retry *once*
 696                  */
 697                 if (ret2 == -ENOSPC && !enospc) {
 698                         error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
 699                         if (error)
 700                                 goto out_unlock_internal;
 701                         enospc = 1;
 702                         goto write_retry;
 703                 }
 704                 ret = ret2;
 705         }
 706
 707         current->backing_dev_info = NULL;
 708
 709         isize = i_size_read(inode);
 710         if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
 711                 iocb->ki_pos = isize;
 712
 713         if (iocb->ki_pos > ip->i_size) {
 714                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 715                 if (iocb->ki_pos > ip->i_size)
 716                         ip->i_size = iocb->ki_pos;
 717                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 718         }
 719
 720         if (ret == -ENOSPC &&
 721             DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
 722                 xfs_iunlock(ip, iolock);
 723                 if (need_i_mutex)
 724                         mutex_unlock(&inode->i_mutex);
 725                 error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
 726                                 DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
 727                                 0, 0, 0); /* Delay flag intentionally  unused */
 728                 if (need_i_mutex)
 729                         mutex_lock(&inode->i_mutex);
 730                 xfs_ilock(ip, iolock);
 731                 if (error)
 732                         goto out_unlock_internal;
 733                 goto start;
 734         }
 735
 736         error = -ret;
 737         if (ret <= 0)
 738                 goto out_unlock_internal;
 739
 740         XFS_STATS_ADD(xs_write_bytes, ret);
 741
 742         /* Handle various SYNC-type writes */
 743         if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
 744                 loff_t end = pos + ret - 1;
 745                 int error2;
 746
 747                 xfs_iunlock(ip, iolock);
 748                 if (need_i_mutex)
 749                         mutex_unlock(&inode->i_mutex);
 750
 751                 error2 = filemap_write_and_wait_range(mapping, pos, end);
 752                 if (!error)
 753                         error = error2;
 754                 if (need_i_mutex)
 755                         mutex_lock(&inode->i_mutex);
 756                 xfs_ilock(ip, iolock);
 757
 758                 error2 = xfs_fsync(ip);
 759                 if (!error)
 760                         error = error2;
 761         }
 762
 763  out_unlock_internal:
 764         if (ip->i_new_size) {
 765                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 766                 ip->i_new_size = 0;
 767                 /*
 768                  * If this was a direct or synchronous I/O that failed (such
 769                  * as ENOSPC) then part of the I/O may have been written to
 770                  * disk before the error occured.  In this case the on-disk
 771                  * file size may have been adjusted beyond the in-memory file
 772                  * size and now needs to be truncated back.
 773                  */
 774                 if (ip->i_d.di_size > ip->i_size)
 775                         ip->i_d.di_size = ip->i_size;
 776                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 777         }
 778         xfs_iunlock(ip, iolock);
 779  out_unlock_mutex:
 780         if (need_i_mutex)
 781                 mutex_unlock(&inode->i_mutex);
 782         return -error;
 783 }
 784
 785 STATIC int
 786 xfs_file_open(
 787         struct inode    *inode,
 788         struct file     *file)
 789 {
 790         if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
 791                 return -EFBIG;
 792         if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
 793                 return -EIO;
 794         return 0;
 795 }
 796
 797 STATIC int
 798 xfs_dir_open(
 799         struct inode    *inode,
 800         struct file     *file)
 801 {
 802         struct xfs_inode *ip = XFS_I(inode);
 803         int             mode;
 804         int             error;
 805
 806         error = xfs_file_open(inode, file);
 807         if (error)
 808                 return error;
 809
 810         /*
 811          * If there are any blocks, read-ahead block 0 as we're almost
 812          * certain to have the next operation be a read there.
 813          */
 814         mode = xfs_ilock_map_shared(ip);
 815         if (ip->i_d.di_nextents > 0)
 816                 xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
 817         xfs_iunlock(ip, mode);
 818         return 0;
 819 }
 820
 821 STATIC int
 822 xfs_file_release(
 823         struct inode    *inode,
 824         struct file     *filp)
 825 {
 826         return -xfs_release(XFS_I(inode));
 827 }
 828
 829 /*
 830  * We ignore the datasync flag here because a datasync is effectively
 831  * identical to an fsync. That is, datasync implies that we need to write
 832  * only the metadata needed to be able to access the data that is written
 833  * if we crash after the call completes. Hence if we are writing beyond
 834  * EOF we have to log the inode size change as well, which makes it a
 835  * full fsync. If we don't write beyond EOF, the inode core will be
 836  * clean in memory and so we don't need to log the inode, just like
 837  * fsync.
 838  */
 839 STATIC int
 840 xfs_file_fsync(
 841         struct file             *file,
 842         struct dentry           *dentry,
 843         int                     datasync)
 844 {
 845         struct xfs_inode        *ip = XFS_I(dentry->d_inode);
 846
 847         xfs_iflags_clear(ip, XFS_ITRUNCATED);
 848         return -xfs_fsync(ip);
 849 }
 850
 851 STATIC int
 852 xfs_file_readdir(
 853         struct file     *filp,
 854         void            *dirent,
 855         filldir_t       filldir)
 856 {
 857         struct inode    *inode = filp->f_path.dentry->d_inode;
 858         xfs_inode_t     *ip = XFS_I(inode);
 859         int             error;
 860         size_t          bufsize;
 861
 862         /*
 863          * The Linux API doesn't pass down the total size of the buffer
 864          * we read into down to the filesystem.  With the filldir concept
 865          * it's not needed for correct information, but the XFS dir2 leaf
 866          * code wants an estimate of the buffer size to calculate it's
 867          * readahead window and size the buffers used for mapping to
 868          * physical blocks.
 869          *
 870          * Try to give it an estimate that's good enough, maybe at some
 871          * point we can change the ->readdir prototype to include the
 872          * buffer size.  For now we use the current glibc buffer size.
 873          */
 874         bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
 875
 876         error = xfs_readdir(ip, dirent, bufsize,
 877                                 (xfs_off_t *)&filp->f_pos, filldir);
 878         if (error)
 879                 return -error;
 880         return 0;
 881 }
 882
 883 STATIC int
 884 xfs_file_mmap(
 885         struct file     *filp,
 886         struct vm_area_struct *vma)
 887 {
 888         vma->vm_ops = &xfs_file_vm_ops;
 889         vma->vm_flags |= VM_CAN_NONLINEAR;
 890
 891         file_accessed(filp);
 892         return 0;
 893 }
 894
 895 /*
 896  * mmap()d file has taken write protection fault and is being made
 897  * writable. We can set the page state up correctly for a writable
 898  * page, which means we can do correct delalloc accounting (ENOSPC
 899  * checking!) and unwritten extent mapping.
 900  */
 901 STATIC int
 902 xfs_vm_page_mkwrite(
 903         struct vm_area_struct   *vma,
 904         struct vm_fault         *vmf)
 905 {
 906         return block_page_mkwrite(vma, vmf, xfs_get_blocks);
 907 }
 908
 909 const struct file_operations xfs_file_operations = {
 910         .llseek         = generic_file_llseek,
 911         .read           = do_sync_read,
 912         .write          = do_sync_write,
 913         .aio_read       = xfs_file_aio_read,
 914         .aio_write      = xfs_file_aio_write,
 915         .splice_read    = xfs_file_splice_read,
 916         .splice_write   = xfs_file_splice_write,
 917         .unlocked_ioctl = xfs_file_ioctl,
 918 #ifdef CONFIG_COMPAT
 919         .compat_ioctl   = xfs_file_compat_ioctl,
 920 #endif
 921         .mmap           = xfs_file_mmap,
 922         .open           = xfs_file_open,
 923         .release        = xfs_file_release,
 924         .fsync          = xfs_file_fsync,
 925 #ifdef HAVE_FOP_OPEN_EXEC
 926         .open_exec      = xfs_file_open_exec,
 927 #endif
 928 };
 929
 930 const struct file_operations xfs_dir_file_operations = {
 931         .open           = xfs_dir_open,
 932         .read           = generic_read_dir,
 933         .readdir        = xfs_file_readdir,
 934         .llseek         = generic_file_llseek,
 935         .unlocked_ioctl = xfs_file_ioctl,
 936 #ifdef CONFIG_COMPAT
 937         .compat_ioctl   = xfs_file_compat_ioctl,
 938 #endif
 939         .fsync          = xfs_file_fsync,
 940 };
 941
 942 static const struct vm_operations_struct xfs_file_vm_ops = {
 943         .fault          = filemap_fault,
 944         .page_mkwrite   = xfs_vm_page_mkwrite,
 945 };