SAFE public projects git trees. - safe/jmp/linux-2.6/blob - fs/xfs/xfs_vnodeops.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 #include "xfs.h"
  20 #include "xfs_fs.h"
  21 #include "xfs_types.h"
  22 #include "xfs_bit.h"
  23 #include "xfs_log.h"
  24 #include "xfs_inum.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_sb.h"
  27 #include "xfs_ag.h"
  28 #include "xfs_dir2.h"
  29 #include "xfs_dmapi.h"
  30 #include "xfs_mount.h"
  31 #include "xfs_da_btree.h"
  32 #include "xfs_bmap_btree.h"
  33 #include "xfs_alloc_btree.h"
  34 #include "xfs_ialloc_btree.h"
  35 #include "xfs_dir2_sf.h"
  36 #include "xfs_attr_sf.h"
  37 #include "xfs_dinode.h"
  38 #include "xfs_inode.h"
  39 #include "xfs_inode_item.h"
  40 #include "xfs_itable.h"
  41 #include "xfs_btree.h"
  42 #include "xfs_ialloc.h"
  43 #include "xfs_alloc.h"
  44 #include "xfs_bmap.h"
  45 #include "xfs_attr.h"
  46 #include "xfs_rw.h"
  47 #include "xfs_error.h"
  48 #include "xfs_quota.h"
  49 #include "xfs_utils.h"
  50 #include "xfs_rtalloc.h"
  51 #include "xfs_trans_space.h"
  52 #include "xfs_log_priv.h"
  53 #include "xfs_filestream.h"
  54 #include "xfs_vnodeops.h"
  55 /*
      * xfs_open
      *
      * Open-time hook for the in-core inode ip.  It refuses to proceed when
      * the filesystem has been forcibly shut down; otherwise its only work
      * is to kick off a read-ahead of block 0 of the data fork for
      * directories that have at least one extent.
      *
      * ip: in-core inode being opened; ip->i_mount and ip->i_d are read.
      *
      * Returns XFS_ERROR(EIO) on a forced shutdown, 0 in every other case
      * (the outcome of the read-ahead does not affect the return value).
      */
  56 int
  57 xfs_open(
  58         xfs_inode_t     *ip)
  59 {
  60         int             mode;   /* value returned by xfs_ilock_map_shared(), handed to xfs_iunlock() */
  61
  62         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
  63                 return XFS_ERROR(EIO);
  64
  65         /*
  66          * If it's a directory with any blocks, read-ahead block 0
  67          * as we're almost certain to have the next operation be a read there.
  68          */
  69         if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
  70                 mode = xfs_ilock_map_shared(ip);
                      /*
                       * di_nextents is tested a second time now that the lock
                       * is held (presumably because the first, unlocked test
                       * may be stale -- confirm).  The (void) cast discards
                       * the read-ahead result on purpose.
                       */
  71                 if (ip->i_d.di_nextents > 0)
  72                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
  73                 xfs_iunlock(ip, mode);
  74         }
  75         return 0;
  76 }
  77
  78 /*
  79  * xfs_getattr
      *
      * Copy attributes of the in-core inode ip into *vap.  The fields are
      * filled in stages; after each stage vap->va_mask is examined and the
      * function jumps to all_done as soon as the caller has asked for
      * nothing beyond what has already been filled in.
      *
      * ip:    in-core inode to read from.
      * vap:   caller's attribute buffer; va_mask is read to decide how far
      *        to go, the other va_* fields are written.
      * flags: when ATTR_LAZY is set the inode lock is not taken; otherwise
      *        XFS_ILOCK_SHARED is held across the copy.
      *
      * Returns XFS_ERROR(EIO) if the filesystem has been forcibly shut
      * down, 0 otherwise.
  80  */
  81 int
  82 xfs_getattr(
  83         xfs_inode_t     *ip,
  84         bhv_vattr_t     *vap,
  85         int             flags)
  86 {
  87         bhv_vnode_t     *vp = XFS_ITOV(ip);
  88         xfs_mount_t     *mp = ip->i_mount;
  89
  90         xfs_itrace_entry(ip);
  91
  92         if (XFS_FORCED_SHUTDOWN(mp))
  93                 return XFS_ERROR(EIO);
  94
              /* Matching unlock is at all_done, under the same ATTR_LAZY test. */
  95         if (!(flags & ATTR_LAZY))
  96                 xfs_ilock(ip, XFS_ILOCK_SHARED);
  97
              /* Size is always filled in; a request for exactly the size ends here. */
  98         vap->va_size = XFS_ISIZE(ip);
  99         if (vap->va_mask == XFS_AT_SIZE)
 100                 goto all_done;
 101
              /* Block count covers on-disk blocks plus delayed-allocation blocks. */
 102         vap->va_nblocks =
 103                 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
 104         vap->va_nodeid = ip->i_ino;
 105 #if XFS_BIG_INUMS
 106         vap->va_nodeid += mp->m_inoadd;
 107 #endif
 108         vap->va_nlink = ip->i_d.di_nlink;
 109
 110         /*
 111          * Quick exit for non-stat callers
 112          */
              /*
               * NOTE(review): XFS_AT_BLKSIZE is part of this exit mask, yet
               * va_blocksize is only assigned further down (in the switch).
               * A caller asking for just the block size leaves here without
               * it being written by this function -- confirm that is intended.
               */
 113         if ((vap->va_mask &
 114             ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
 115               XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
 116                 goto all_done;
 117
 118         /*
 119          * Copy from in-core inode.
 120          */
 121         vap->va_mode = ip->i_d.di_mode;
 122         vap->va_uid = ip->i_d.di_uid;
 123         vap->va_gid = ip->i_d.di_gid;
 124         vap->va_projid = ip->i_d.di_projid;
 125
 126         /*
 127          * Check vnode type block/char vs. everything else.
 128          */
              /*
               * Device nodes report their device number and BLKDEV_IOSIZE;
               * every other type reports rdev 0 and a block size that
               * depends on whether the inode is a realtime inode.
               */
 129         switch (ip->i_d.di_mode & S_IFMT) {
 130         case S_IFBLK:
 131         case S_IFCHR:
 132                 vap->va_rdev = ip->i_df.if_u2.if_rdev;
 133                 vap->va_blocksize = BLKDEV_IOSIZE;
 134                 break;
 135         default:
 136                 vap->va_rdev = 0;
 137
 138                 if (!(XFS_IS_REALTIME_INODE(ip))) {
 139                         vap->va_blocksize = xfs_preferred_iosize(mp);
 140                 } else {
 141
 142                         /*
 143                          * If the file blocks are being allocated from a
 144                          * realtime partition, then return the inode's
 145                          * realtime extent size or the realtime volume's
 146                          * extent size.
 147                          */
 148                         vap->va_blocksize =
 149                                 xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
 150                 }
 151                 break;
 152         }
 153
              /*
               * atime is obtained through the vnode helper; mtime and ctime
               * are copied straight from the in-core inode.
               */
 154         vn_atime_to_timespec(vp, &vap->va_atime);
 155         vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
 156         vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
 157         vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
 158         vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
 159
 160         /*
 161          * Exit for stat callers.  See if any of the rest of the fields
 162          * to be filled in are needed.
 163          */
 164         if ((vap->va_mask &
 165              (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 166               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 167                 goto all_done;
 168
 169         /*
 170          * Convert di_flags to xflags.
 171          */
 172         vap->va_xflags = xfs_ip2xflags(ip);
 173
 174         /*
 175          * Exit for inode revalidate.  See if any of the rest of
 176          * the fields to be filled in are needed.
 177          */
 178         if ((vap->va_mask &
 179              (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 180               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 181                 goto all_done;
 182
              /* di_extsize is scaled by the superblock block-size shift. */
 183         vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
              /*
               * Extent counts: when XFS_IFEXTENTS is set in the fork flags
               * the count is derived from if_bytes, otherwise the di_*
               * counter from the inode core is used.  An inode with no
               * attribute fork (i_afp == NULL) reports 0 attribute extents.
               */
 184         vap->va_nextents =
 185                 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
 186                         ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
 187                         ip->i_d.di_nextents;
 188         if (ip->i_afp)
 189                 vap->va_anextents =
 190                         (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
 191                                 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
 192                                  ip->i_d.di_anextents;
 193         else
 194                 vap->va_anextents = 0;
 195         vap->va_gen = ip->i_d.di_gen;
 196
              /* Common exit: drop the shared inode lock if it was taken above. */
 197  all_done:
 198         if (!(flags & ATTR_LAZY))
 199                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 200         return 0;
 201 }
 202
 203
 204 /*
 205  * xfs_setattr
 206  */
 207 int
 208 xfs_setattr(
 209         xfs_inode_t             *ip,
 210         bhv_vattr_t             *vap,
 211         int                     flags,
 212         cred_t                  *credp)
 213 {
 214         bhv_vnode_t             *vp = XFS_ITOV(ip);
 215         xfs_mount_t             *mp = ip->i_mount;
 216         xfs_trans_t             *tp;
 217         int                     mask;
 218         int                     code;
 219         uint                    lock_flags;
 220         uint                    commit_flags=0;
 221         uid_t                   uid=0, iuid=0;
 222         gid_t                   gid=0, igid=0;
 223         int                     timeflags = 0;
 224         xfs_prid_t              projid=0, iprojid=0;
 225         int                     mandlock_before, mandlock_after;
 226         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
 227         int                     file_owner;
 228         int                     need_iolock = 1;
 229
 230         xfs_itrace_entry(ip);
 231
 232         if (mp->m_flags & XFS_MOUNT_RDONLY)
 233                 return XFS_ERROR(EROFS);
 234
 235         /*
 236          * Cannot set certain attributes.
 237          */
 238         mask = vap->va_mask;
 239         if (mask & XFS_AT_NOSET) {
 240                 return XFS_ERROR(EINVAL);
 241         }
 242
 243         if (XFS_FORCED_SHUTDOWN(mp))
 244                 return XFS_ERROR(EIO);
 245
 246         /*
 247          * Timestamps do not need to be logged and hence do not
 248          * need to be done within a transaction.
 249          */
 250         if (mask & XFS_AT_UPDTIMES) {
 251                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
 252                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
 253                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
 254                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
 255                 xfs_ichgtime(ip, timeflags);
 256                 return 0;
 257         }
 258
 259         olddquot1 = olddquot2 = NULL;
 260         udqp = gdqp = NULL;
 261
 262         /*
 263          * If disk quotas is on, we make sure that the dquots do exist on disk,
 264          * before we start any other transactions. Trying to do this later
 265          * is messy. We don't care to take a readlock to look at the ids
 266          * in inode here, because we can't hold it across the trans_reserve.
 267          * If the IDs do change before we take the ilock, we're covered
 268          * because the i_*dquot fields will get updated anyway.
 269          */
 270         if (XFS_IS_QUOTA_ON(mp) &&
 271             (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
 272                 uint    qflags = 0;
 273
 274                 if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
 275                         uid = vap->va_uid;
 276                         qflags |= XFS_QMOPT_UQUOTA;
 277                 } else {
 278                         uid = ip->i_d.di_uid;
 279                 }
 280                 if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
 281                         gid = vap->va_gid;
 282                         qflags |= XFS_QMOPT_GQUOTA;
 283                 }  else {
 284                         gid = ip->i_d.di_gid;
 285                 }
 286                 if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
 287                         projid = vap->va_projid;
 288                         qflags |= XFS_QMOPT_PQUOTA;
 289                 }  else {
 290                         projid = ip->i_d.di_projid;
 291                 }
 292                 /*
 293                  * We take a reference when we initialize udqp and gdqp,
 294                  * so it is important that we never blindly double trip on
 295                  * the same variable. See xfs_create() for an example.
 296                  */
 297                 ASSERT(udqp == NULL);
 298                 ASSERT(gdqp == NULL);
 299                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
 300                                          &udqp, &gdqp);
 301                 if (code)
 302                         return code;
 303         }
 304
 305         /*
 306          * For the other attributes, we acquire the inode lock and
 307          * first do an error checking pass.
 308          */
 309         tp = NULL;
 310         lock_flags = XFS_ILOCK_EXCL;
 311         if (flags & ATTR_NOLOCK)
 312                 need_iolock = 0;
 313         if (!(mask & XFS_AT_SIZE)) {
 314                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
 315                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
 316                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 317                         commit_flags = 0;
 318                         if ((code = xfs_trans_reserve(tp, 0,
 319                                                      XFS_ICHANGE_LOG_RES(mp), 0,
 320                                                      0, 0))) {
 321                                 lock_flags = 0;
 322                                 goto error_return;
 323                         }
 324                 }
 325         } else {
 326                 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
 327                     !(flags & ATTR_DMI)) {
 328                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
 329                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
 330                                 vap->va_size, 0, dmflags, NULL);
 331                         if (code) {
 332                                 lock_flags = 0;
 333                                 goto error_return;
 334                         }
 335                 }
 336                 if (need_iolock)
 337                         lock_flags |= XFS_IOLOCK_EXCL;
 338         }
 339
 340         xfs_ilock(ip, lock_flags);
 341
 342         /* boolean: are we the file owner? */
 343         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
 344
 345         /*
 346          * Change various properties of a file.
 347          * Only the owner or users with CAP_FOWNER
 348          * capability may do these things.
 349          */
 350         if (mask &
 351             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
 352              XFS_AT_GID|XFS_AT_PROJID)) {
 353                 /*
 354                  * CAP_FOWNER overrides the following restrictions:
 355                  *
 356                  * The user ID of the calling process must be equal
 357                  * to the file owner ID, except in cases where the
 358                  * CAP_FSETID capability is applicable.
 359                  */
 360                 if (!file_owner && !capable(CAP_FOWNER)) {
 361                         code = XFS_ERROR(EPERM);
 362                         goto error_return;
 363                 }
 364
 365                 /*
 366                  * CAP_FSETID overrides the following restrictions:
 367                  *
 368                  * The effective user ID of the calling process shall match
 369                  * the file owner when setting the set-user-ID and
 370                  * set-group-ID bits on that file.
 371                  *
 372                  * The effective group ID or one of the supplementary group
 373                  * IDs of the calling process shall match the group owner of
 374                  * the file when setting the set-group-ID bit on that file
 375                  */
 376                 if (mask & XFS_AT_MODE) {
 377                         mode_t m = 0;
 378
 379                         if ((vap->va_mode & S_ISUID) && !file_owner)
 380                                 m |= S_ISUID;
 381                         if ((vap->va_mode & S_ISGID) &&
 382                             !in_group_p((gid_t)ip->i_d.di_gid))
 383                                 m |= S_ISGID;
 384 #if 0
 385                         /* Linux allows this, Irix doesn't. */
 386                         if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
 387                                 m |= S_ISVTX;
 388 #endif
 389                         if (m && !capable(CAP_FSETID))
 390                                 vap->va_mode &= ~m;
 391                 }
 392         }
 393
 394         /*
 395          * Change file ownership.  Must be the owner or privileged.
 396          * If the system was configured with the "restricted_chown"
 397          * option, the owner is not permitted to give away the file,
 398          * and can change the group id only to a group of which he
 399          * or she is a member.
 400          */
 401         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 402                 /*
 403                  * These IDs could have changed since we last looked at them.
 404                  * But, we're assured that if the ownership did change
 405                  * while we didn't have the inode locked, inode's dquot(s)
 406                  * would have changed also.
 407                  */
 408                 iuid = ip->i_d.di_uid;
 409                 iprojid = ip->i_d.di_projid;
 410                 igid = ip->i_d.di_gid;
 411                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
 412                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
 413                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
 414                          iprojid;
 415
 416                 /*
 417                  * CAP_CHOWN overrides the following restrictions:
 418                  *
 419                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
 420                  * shall override the restriction that a process cannot
 421                  * change the user ID of a file it owns and the restriction
 422                  * that the group ID supplied to the chown() function
 423                  * shall be equal to either the group ID or one of the
 424                  * supplementary group IDs of the calling process.
 425                  */
 426                 if (restricted_chown &&
 427                     (iuid != uid || (igid != gid &&
 428                                      !in_group_p((gid_t)gid))) &&
 429                     !capable(CAP_CHOWN)) {
 430                         code = XFS_ERROR(EPERM);
 431                         goto error_return;
 432                 }
 433                 /*
 434                  * Do a quota reservation only if uid/projid/gid is actually
 435                  * going to change.
 436                  */
 437                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
 438                     (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
 439                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
 440                         ASSERT(tp);
 441                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
 442                                                 capable(CAP_FOWNER) ?
 443                                                 XFS_QMOPT_FORCE_RES : 0);
 444                         if (code)       /* out of quota */
 445                                 goto error_return;
 446                 }
 447         }
 448
 449         /*
 450          * Truncate file.  Must have write permission and not be a directory.
 451          */
 452         if (mask & XFS_AT_SIZE) {
 453                 /* Short circuit the truncate case for zero length files */
 454                 if ((vap->va_size == 0) &&
 455                    (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
 456                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 457                         lock_flags &= ~XFS_ILOCK_EXCL;
 458                         if (mask & XFS_AT_CTIME)
 459                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 460                         code = 0;
 461                         goto error_return;
 462                 }
 463
 464                 if (VN_ISDIR(vp)) {
 465                         code = XFS_ERROR(EISDIR);
 466                         goto error_return;
 467                 } else if (!VN_ISREG(vp)) {
 468                         code = XFS_ERROR(EINVAL);
 469                         goto error_return;
 470                 }
 471                 /*
 472                  * Make sure that the dquots are attached to the inode.
 473                  */
 474                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
 475                         goto error_return;
 476         }
 477
 478         /*
 479          * Change file access or modified times.
 480          */
 481         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 482                 if (!file_owner) {
 483                         if ((flags & ATTR_UTIME) &&
 484                             !capable(CAP_FOWNER)) {
 485                                 code = XFS_ERROR(EPERM);
 486                                 goto error_return;
 487                         }
 488                 }
 489         }
 490
 491         /*
 492          * Change extent size or realtime flag.
 493          */
 494         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 495                 /*
 496                  * Can't change extent size if any extents are allocated.
 497                  */
 498                 if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
 499                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
 500                      vap->va_extsize) ) {
 501                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 502                         goto error_return;
 503                 }
 504
 505                 /*
 506                  * Can't change realtime flag if any extents are allocated.
 507                  */
 508                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
 509                     (mask & XFS_AT_XFLAGS) &&
 510                     (XFS_IS_REALTIME_INODE(ip)) !=
 511                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 512                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 513                         goto error_return;
 514                 }
 515                 /*
 516                  * Extent size must be a multiple of the appropriate block
 517                  * size, if set at all.
 518                  */
 519                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
 520                         xfs_extlen_t    size;
 521
 522                         if (XFS_IS_REALTIME_INODE(ip) ||
 523                             ((mask & XFS_AT_XFLAGS) &&
 524                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
 525                                 size = mp->m_sb.sb_rextsize <<
 526                                        mp->m_sb.sb_blocklog;
 527                         } else {
 528                                 size = mp->m_sb.sb_blocksize;
 529                         }
 530                         if (vap->va_extsize % size) {
 531                                 code = XFS_ERROR(EINVAL);
 532                                 goto error_return;
 533                         }
 534                 }
 535                 /*
 536                  * If realtime flag is set then must have realtime data.
 537                  */
 538                 if ((mask & XFS_AT_XFLAGS) &&
 539                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 540                         if ((mp->m_sb.sb_rblocks == 0) ||
 541                             (mp->m_sb.sb_rextsize == 0) ||
 542                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
 543                                 code = XFS_ERROR(EINVAL);
 544                                 goto error_return;
 545                         }
 546                 }
 547
 548                 /*
 549                  * Can't modify an immutable/append-only file unless
 550                  * we have appropriate permission.
 551                  */
 552                 if ((mask & XFS_AT_XFLAGS) &&
 553                     (ip->i_d.di_flags &
 554                                 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
 555                      (vap->va_xflags &
 556                                 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
 557                     !capable(CAP_LINUX_IMMUTABLE)) {
 558                         code = XFS_ERROR(EPERM);
 559                         goto error_return;
 560                 }
 561         }
 562
 563         /*
 564          * Now we can make the changes.  Before we join the inode
 565          * to the transaction, if XFS_AT_SIZE is set then take care of
 566          * the part of the truncation that must be done without the
 567          * inode lock.  This needs to be done before joining the inode
 568          * to the transaction, because the inode cannot be unlocked
 569          * once it is a part of the transaction.
 570          */
 571         if (mask & XFS_AT_SIZE) {
 572                 code = 0;
 573                 if ((vap->va_size > ip->i_size) &&
 574                     (flags & ATTR_NOSIZETOK) == 0) {
 575                         code = xfs_igrow_start(ip, vap->va_size, credp);
 576                 }
 577                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 578
 579                 /*
 580                  * We are going to log the inode size change in this
 581                  * transaction so any previous writes that are beyond the on
 582                  * disk EOF and the new EOF that have not been written out need
 583                  * to be written here. If we do not write the data out, we
 584                  * expose ourselves to the null files problem.
 585                  *
 586                  * Only flush from the on disk size to the smaller of the in
 587                  * memory file size or the new size as that's the range we
 588                  * really care about here and prevents waiting for other data
 589                  * not within the range we care about here.
 590                  */
 591                 if (!code &&
 592                     (ip->i_size != ip->i_d.di_size) &&
 593                     (vap->va_size > ip->i_d.di_size)) {
 594                         code = xfs_flush_pages(ip,
 595                                         ip->i_d.di_size, vap->va_size,
 596                                         XFS_B_ASYNC, FI_NONE);
 597                 }
 598
 599                 /* wait for all I/O to complete */
 600                 vn_iowait(ip);
 601
 602                 if (!code)
 603                         code = xfs_itruncate_data(ip, vap->va_size);
 604                 if (code) {
 605                         ASSERT(tp == NULL);
 606                         lock_flags &= ~XFS_ILOCK_EXCL;
 607                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
 608                         goto error_return;
 609                 }
 610                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 611                 if ((code = xfs_trans_reserve(tp, 0,
 612                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
 613                                              XFS_TRANS_PERM_LOG_RES,
 614                                              XFS_ITRUNCATE_LOG_COUNT))) {
 615                         xfs_trans_cancel(tp, 0);
 616                         if (need_iolock)
 617                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 618                         return code;
 619                 }
 620                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 621                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 622         }
 623
 624         if (tp) {
 625                 xfs_trans_ijoin(tp, ip, lock_flags);
 626                 xfs_trans_ihold(tp, ip);
 627         }
 628
 629         /* determine whether mandatory locking mode changes */
 630         mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
 631
 632         /*
 633          * Truncate file.  Must have write permission and not be a directory.
 634          */
 635         if (mask & XFS_AT_SIZE) {
 636                 if (vap->va_size > ip->i_size) {
 637                         xfs_igrow_finish(tp, ip, vap->va_size,
 638                             !(flags & ATTR_DMI));
 639                 } else if ((vap->va_size <= ip->i_size) ||
 640                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
 641                         /*
 642                          * signal a sync transaction unless
 643                          * we're truncating an already unlinked
 644                          * file on a wsync filesystem
 645                          */
 646                         code = xfs_itruncate_finish(&tp, ip,
 647                                             (xfs_fsize_t)vap->va_size,
 648                                             XFS_DATA_FORK,
 649                                             ((ip->i_d.di_nlink != 0 ||
 650                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
 651                                              ? 1 : 0));
 652                         if (code)
 653                                 goto abort_return;
 654                         /*
 655                          * Truncated "down", so we're removing references
 656                          * to old data here - if we now delay flushing for
 657                          * a long time, we expose ourselves unduly to the
 658                          * notorious NULL files problem.  So, we mark this
 659                          * vnode and flush it when the file is closed, and
 660                          * do not wait the usual (long) time for writeout.
 661                          */
 662                         xfs_iflags_set(ip, XFS_ITRUNCATED);
 663                 }
 664                 /*
 665                  * Have to do this even if the file's size doesn't change.
 666                  */
 667                 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 668         }
 669
 670         /*
 671          * Change file access modes.
 672          */
 673         if (mask & XFS_AT_MODE) {
 674                 ip->i_d.di_mode &= S_IFMT;
 675                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
 676
 677                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 678                 timeflags |= XFS_ICHGTIME_CHG;
 679         }
 680
 681         /*
 682          * Change file ownership.  Must be the owner or privileged.
 683          * If the system was configured with the "restricted_chown"
 684          * option, the owner is not permitted to give away the file,
 685          * and can change the group id only to a group of which he
 686          * or she is a member.
 687          */
 688         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 689                 /*
 690                  * CAP_FSETID overrides the following restrictions:
 691                  *
 692                  * The set-user-ID and set-group-ID bits of a file will be
 693                  * cleared upon successful return from chown()
 694                  */
 695                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
 696                     !capable(CAP_FSETID)) {
 697                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 698                 }
 699
 700                 /*
 701                  * Change the ownerships and register quota modifications
 702                  * in the transaction.
 703                  */
 704                 if (iuid != uid) {
 705                         if (XFS_IS_UQUOTA_ON(mp)) {
 706                                 ASSERT(mask & XFS_AT_UID);
 707                                 ASSERT(udqp);
 708                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 709                                                         &ip->i_udquot, udqp);
 710                         }
 711                         ip->i_d.di_uid = uid;
 712                 }
 713                 if (igid != gid) {
 714                         if (XFS_IS_GQUOTA_ON(mp)) {
 715                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
 716                                 ASSERT(mask & XFS_AT_GID);
 717                                 ASSERT(gdqp);
 718                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 719                                                         &ip->i_gdquot, gdqp);
 720                         }
 721                         ip->i_d.di_gid = gid;
 722                 }
 723                 if (iprojid != projid) {
 724                         if (XFS_IS_PQUOTA_ON(mp)) {
 725                                 ASSERT(!XFS_IS_GQUOTA_ON(mp));
 726                                 ASSERT(mask & XFS_AT_PROJID);
 727                                 ASSERT(gdqp);
 728                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 729                                                         &ip->i_gdquot, gdqp);
 730                         }
 731                         ip->i_d.di_projid = projid;
 732                         /*
 733                          * We may have to rev the inode as well as
 734                          * the superblock version number since projids didn't
 735                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
 736                          */
 737                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
 738                                 xfs_bump_ino_vers2(tp, ip);
 739                 }
 740
 741                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 742                 timeflags |= XFS_ICHGTIME_CHG;
 743         }
 744
 745
 746         /*
 747          * Change file access or modified times.
 748          */
 749         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 750                 if (mask & XFS_AT_ATIME) {
 751                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
 752                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
 753                         ip->i_update_core = 1;
 754                         timeflags &= ~XFS_ICHGTIME_ACC;
 755                 }
 756                 if (mask & XFS_AT_MTIME) {
 757                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
 758                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
 759                         timeflags &= ~XFS_ICHGTIME_MOD;
 760                         timeflags |= XFS_ICHGTIME_CHG;
 761                 }
 762                 if (tp && (flags & ATTR_UTIME))
 763                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 764         }
 765
 766         /*
 767          * Change XFS-added attributes.
 768          */
 769         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 770                 if (mask & XFS_AT_EXTSIZE) {
 771                         /*
 772                          * Converting bytes to fs blocks.
 773                          */
 774                         ip->i_d.di_extsize = vap->va_extsize >>
 775                                 mp->m_sb.sb_blocklog;
 776                 }
 777                 if (mask & XFS_AT_XFLAGS) {
 778                         uint    di_flags;
 779
 780                         /* can't set PREALLOC this way, just preserve it */
 781                         di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
 782                         if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
 783                                 di_flags |= XFS_DIFLAG_IMMUTABLE;
 784                         if (vap->va_xflags & XFS_XFLAG_APPEND)
 785                                 di_flags |= XFS_DIFLAG_APPEND;
 786                         if (vap->va_xflags & XFS_XFLAG_SYNC)
 787                                 di_flags |= XFS_DIFLAG_SYNC;
 788                         if (vap->va_xflags & XFS_XFLAG_NOATIME)
 789                                 di_flags |= XFS_DIFLAG_NOATIME;
 790                         if (vap->va_xflags & XFS_XFLAG_NODUMP)
 791                                 di_flags |= XFS_DIFLAG_NODUMP;
 792                         if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
 793                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
 794                         if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
 795                                 di_flags |= XFS_DIFLAG_NODEFRAG;
 796                         if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
 797                                 di_flags |= XFS_DIFLAG_FILESTREAM;
 798                         if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
 799                                 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
 800                                         di_flags |= XFS_DIFLAG_RTINHERIT;
 801                                 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
 802                                         di_flags |= XFS_DIFLAG_NOSYMLINKS;
 803                                 if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
 804                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 805                         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
 806                                 if (vap->va_xflags & XFS_XFLAG_REALTIME)
 807                                         di_flags |= XFS_DIFLAG_REALTIME;
 808                                 if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
 809                                         di_flags |= XFS_DIFLAG_EXTSIZE;
 810                         }
 811                         ip->i_d.di_flags = di_flags;
 812                 }
 813                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 814                 timeflags |= XFS_ICHGTIME_CHG;
 815         }
 816
 817         /*
 818          * Change file inode change time only if XFS_AT_CTIME set
 819          * AND we have been called by a DMI function.
 820          */
 821
 822         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
 823                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
 824                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
 825                 ip->i_update_core = 1;
 826                 timeflags &= ~XFS_ICHGTIME_CHG;
 827         }
 828
 829         /*
 830          * Send out timestamp changes that need to be set to the
 831          * current time.  Not done when called by a DMI function.
 832          */
 833         if (timeflags && !(flags & ATTR_DMI))
 834                 xfs_ichgtime(ip, timeflags);
 835
 836         XFS_STATS_INC(xs_ig_attrchg);
 837
 838         /*
 839          * If this is a synchronous mount, make sure that the
 840          * transaction goes to disk before returning to the user.
 841          * This is slightly sub-optimal in that truncates require
 842          * two sync transactions instead of one for wsync filesystems.
 843          * One for the truncate and one for the timestamps since we
 844          * don't want to change the timestamps unless we're sure the
 845          * truncate worked.  Truncates are less than 1% of the laddis
 846          * mix so this probably isn't worth the trouble to optimize.
 847          */
 848         code = 0;
 849         if (tp) {
 850                 if (mp->m_flags & XFS_MOUNT_WSYNC)
 851                         xfs_trans_set_sync(tp);
 852
 853                 code = xfs_trans_commit(tp, commit_flags);
 854         }
 855
 856         /*
 857          * If the (regular) file's mandatory locking mode changed, then
 858          * notify the vnode.  We do this under the inode lock to prevent
 859          * racing calls to vop_vnode_change.
 860          */
 861         mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
 862
 863         xfs_iunlock(ip, lock_flags);
 864
 865         /*
 866          * Release any dquot(s) the inode had kept before chown.
 867          */
 868         XFS_QM_DQRELE(mp, olddquot1);
 869         XFS_QM_DQRELE(mp, olddquot2);
 870         XFS_QM_DQRELE(mp, udqp);
 871         XFS_QM_DQRELE(mp, gdqp);
 872
 873         if (code) {
 874                 return code;
 875         }
 876
 877         if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
 878             !(flags & ATTR_DMI)) {
 879                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
 880                                         NULL, DM_RIGHT_NULL, NULL, NULL,
 881                                         0, 0, AT_DELAY_FLAG(flags));
 882         }
 883         return 0;
 884
 885  abort_return:
 886         commit_flags |= XFS_TRANS_ABORT;
 887         /* FALLTHROUGH */
 888  error_return:
 889         XFS_QM_DQRELE(mp, udqp);
 890         XFS_QM_DQRELE(mp, gdqp);
 891         if (tp) {
 892                 xfs_trans_cancel(tp, commit_flags);
 893         }
 894         if (lock_flags != 0) {
 895                 xfs_iunlock(ip, lock_flags);
 896         }
 897         return code;
 898 }
 899
 900 /*
 901  * The maximum pathlen is 1024 bytes. Since the minimum file system
 902  * blocksize is 512 bytes, we can get a max of 2 extents back from
 903  * bmapi.
 904  */
 905 #define SYMLINK_MAPS 2
 906
 907 STATIC int
 908 xfs_readlink_bmap(
 909         xfs_inode_t     *ip,
 910         char            *link)
 911 {
 912         xfs_mount_t     *mp = ip->i_mount;
 913         int             pathlen = ip->i_d.di_size;
 914         int             nmaps = SYMLINK_MAPS;
 915         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
 916         xfs_daddr_t     d;
 917         int             byte_cnt;
 918         int             n;
 919         xfs_buf_t       *bp;
 920         int             error = 0;
 921
 922         error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
 923                         mval, &nmaps, NULL, NULL);
 924         if (error)
 925                 goto out;
 926
 927         for (n = 0; n < nmaps; n++) {
 928                 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
 929                 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 930
 931                 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
 932                 error = XFS_BUF_GETERROR(bp);
 933                 if (error) {
 934                         xfs_ioerror_alert("xfs_readlink",
 935                                   ip->i_mount, bp, XFS_BUF_ADDR(bp));
 936                         xfs_buf_relse(bp);
 937                         goto out;
 938                 }
 939                 if (pathlen < byte_cnt)
 940                         byte_cnt = pathlen;
 941                 pathlen -= byte_cnt;
 942
 943                 memcpy(link, XFS_BUF_PTR(bp), byte_cnt);
 944                 xfs_buf_relse(bp);
 945         }
 946
 947         link[ip->i_d.di_size] = '\0';
 948         error = 0;
 949
 950  out:
 951         return error;
 952 }
 953
 954 int
 955 xfs_readlink(
 956         xfs_inode_t     *ip,
 957         char            *link)
 958 {
 959         xfs_mount_t     *mp = ip->i_mount;
 960         int             pathlen;
 961         int             error = 0;
 962
 963         xfs_itrace_entry(ip);
 964
 965         if (XFS_FORCED_SHUTDOWN(mp))
 966                 return XFS_ERROR(EIO);
 967
 968         xfs_ilock(ip, XFS_ILOCK_SHARED);
 969
 970         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
 971         ASSERT(ip->i_d.di_size <= MAXPATHLEN);
 972
 973         pathlen = ip->i_d.di_size;
 974         if (!pathlen)
 975                 goto out;
 976
 977         if (ip->i_df.if_flags & XFS_IFINLINE) {
 978                 memcpy(link, ip->i_df.if_u1.if_data, pathlen);
 979                 link[pathlen] = '\0';
 980         } else {
 981                 error = xfs_readlink_bmap(ip, link);
 982         }
 983
 984  out:
 985         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 986         return error;
 987 }
 988
 989 /*
 990  * xfs_fsync
 991  *
 992  * This is called to sync the inode and its data out to disk.
 993  * We need to hold the I/O lock while flushing the data, and
 994  * the inode lock while flushing the inode.  The inode lock CANNOT
 995  * be held while flushing the data, so acquire after we're done
 996  * with that.
 997  */
 998 int
 999 xfs_fsync(
1000         xfs_inode_t     *ip,
1001         int             flag,
1002         xfs_off_t       start,
1003         xfs_off_t       stop)
1004 {
1005         xfs_trans_t     *tp;
1006         int             error;
1007         int             log_flushed = 0, changed = 1;
1008
1009         xfs_itrace_entry(ip);
1010
1011         ASSERT(start >= 0 && stop >= -1);
1012
1013         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1014                 return XFS_ERROR(EIO);
1015
1016         if (flag & FSYNC_DATA)
1017                 filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
1018
1019         /*
1020          * We always need to make sure that the required inode state
1021          * is safe on disk.  The vnode might be clean but because
1022          * of committed transactions that haven't hit the disk yet.
1023          * Likewise, there could be unflushed non-transactional
1024          * changes to the inode core that have to go to disk.
1025          *
1026          * The following code depends on one assumption:  that
1027          * any transaction that changes an inode logs the core
1028          * because it has to change some field in the inode core
1029          * (typically nextents or nblocks).  That assumption
1030          * implies that any transactions against an inode will
1031          * catch any non-transactional updates.  If inode-altering
1032          * transactions exist that violate this assumption, the
1033          * code breaks.  Right now, it figures that if the involved
1034          * update_* field is clear and the inode is unpinned, the
1035          * inode is clean.  Either it's been flushed or it's been
1036          * committed and the commit has hit the disk unpinning the inode.
1037          * (Note that xfs_inode_item_format() called at commit clears
1038          * the update_* fields.)
1039          */
1040         xfs_ilock(ip, XFS_ILOCK_SHARED);
1041
1042         /* If we are flushing data then we care about update_size
1043          * being set, otherwise we care about update_core
1044          */
1045         if ((flag & FSYNC_DATA) ?
1046                         (ip->i_update_size == 0) :
1047                         (ip->i_update_core == 0)) {
1048                 /*
1049                  * Timestamps/size haven't changed since last inode
1050                  * flush or inode transaction commit.  That means
1051                  * either nothing got written or a transaction
1052                  * committed which caught the updates.  If the
1053                  * latter happened and the transaction hasn't
1054                  * hit the disk yet, the inode will be still
1055                  * be pinned.  If it is, force the log.
1056                  */
1057
1058                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1059
1060                 if (xfs_ipincount(ip)) {
1061                         _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1062                                       XFS_LOG_FORCE |
1063                                       ((flag & FSYNC_WAIT)
1064                                        ? XFS_LOG_SYNC : 0),
1065                                       &log_flushed);
1066                 } else {
1067                         /*
1068                          * If the inode is not pinned and nothing
1069                          * has changed we don't need to flush the
1070                          * cache.
1071                          */
1072                         changed = 0;
1073                 }
1074                 error = 0;
1075         } else  {
1076                 /*
1077                  * Kick off a transaction to log the inode
1078                  * core to get the updates.  Make it
1079                  * sync if FSYNC_WAIT is passed in (which
1080                  * is done by everybody but specfs).  The
1081                  * sync transaction will also force the log.
1082                  */
1083                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1084                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1085                 if ((error = xfs_trans_reserve(tp, 0,
1086                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1087                                 0, 0, 0)))  {
1088                         xfs_trans_cancel(tp, 0);
1089                         return error;
1090                 }
1091                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1092
1093                 /*
1094                  * Note - it's possible that we might have pushed
1095                  * ourselves out of the way during trans_reserve
1096                  * which would flush the inode.  But there's no
1097                  * guarantee that the inode buffer has actually
1098                  * gone out yet (it's delwri).  Plus the buffer
1099                  * could be pinned anyway if it's part of an
1100                  * inode in another recent transaction.  So we
1101                  * play it safe and fire off the transaction anyway.
1102                  */
1103                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1104                 xfs_trans_ihold(tp, ip);
1105                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1106                 if (flag & FSYNC_WAIT)
1107                         xfs_trans_set_sync(tp);
1108                 error = _xfs_trans_commit(tp, 0, &log_flushed);
1109
1110                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1111         }
1112
1113         if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
1114                 /*
1115                  * If the log write didn't issue an ordered tag we need
1116                  * to flush the disk cache for the data device now.
1117                  */
1118                 if (!log_flushed)
1119                         xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
1120
1121                 /*
1122                  * If this inode is on the RT dev we need to flush that
1123                  * cache as well.
1124                  */
1125                 if (XFS_IS_REALTIME_INODE(ip))
1126                         xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
1127         }
1128
1129         return error;
1130 }
1131
1132 /*
1133  * This is called by xfs_inactive to free any blocks beyond eof
1134  * when the link count isn't zero and by xfs_dm_punch_hole() when
1135  * punching a hole to EOF.
1136  */
1137 int
1138 xfs_free_eofblocks(
1139         xfs_mount_t     *mp,
1140         xfs_inode_t     *ip,
1141         int             flags)
1142 {
1143         xfs_trans_t     *tp;
1144         int             error;
1145         xfs_fileoff_t   end_fsb;
1146         xfs_fileoff_t   last_fsb;
1147         xfs_filblks_t   map_len;
1148         int             nimaps;
1149         xfs_bmbt_irec_t imap;
1150         int             use_iolock = (flags & XFS_FREE_EOF_LOCK);
1151
1152         /*
1153          * Figure out if there are any blocks beyond the end
1154          * of the file.  If not, then there is nothing to do.
1155          */
1156         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
1157         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1158         map_len = last_fsb - end_fsb;
1159         if (map_len <= 0)
1160                 return 0;
1161
1162         nimaps = 1;
1163         xfs_ilock(ip, XFS_ILOCK_SHARED);
1164         error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
1165                           NULL, 0, &imap, &nimaps, NULL, NULL);
1166         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1167
1168         if (!error && (nimaps != 0) &&
1169             (imap.br_startblock != HOLESTARTBLOCK ||
1170              ip->i_delayed_blks)) {
1171                 /*
1172                  * Attach the dquots to the inode up front.
1173                  */
1174                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1175                         return error;
1176
1177                 /*
1178                  * There are blocks after the end of file.
1179                  * Free them up now by truncating the file to
1180                  * its current size.
1181                  */
1182                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1183
1184                 /*
1185                  * Do the xfs_itruncate_start() call before
1186                  * reserving any log space because
1187                  * itruncate_start will call into the buffer
1188                  * cache and we can't
1189                  * do that within a transaction.
1190                  */
1191                 if (use_iolock)
1192                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
1193                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1194                                     ip->i_size);
1195                 if (error) {
1196                         xfs_trans_cancel(tp, 0);
1197                         if (use_iolock)
1198                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1199                         return error;
1200                 }
1201
1202                 error = xfs_trans_reserve(tp, 0,
1203                                           XFS_ITRUNCATE_LOG_RES(mp),
1204                                           0, XFS_TRANS_PERM_LOG_RES,
1205                                           XFS_ITRUNCATE_LOG_COUNT);
1206                 if (error) {
1207                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1208                         xfs_trans_cancel(tp, 0);
1209                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1210                         return error;
1211                 }
1212
1213                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1214                 xfs_trans_ijoin(tp, ip,
1215                                 XFS_IOLOCK_EXCL |
1216                                 XFS_ILOCK_EXCL);
1217                 xfs_trans_ihold(tp, ip);
1218
1219                 error = xfs_itruncate_finish(&tp, ip,
1220                                              ip->i_size,
1221                                              XFS_DATA_FORK,
1222                                              0);
1223                 /*
1224                  * If we get an error at this point we
1225                  * simply don't bother truncating the file.
1226                  */
1227                 if (error) {
1228                         xfs_trans_cancel(tp,
1229                                          (XFS_TRANS_RELEASE_LOG_RES |
1230                                           XFS_TRANS_ABORT));
1231                 } else {
1232                         error = xfs_trans_commit(tp,
1233                                                 XFS_TRANS_RELEASE_LOG_RES);
1234                 }
1235                 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
1236                                             : XFS_ILOCK_EXCL));
1237         }
1238         return error;
1239 }
1240
1241 /*
1242  * Free a symlink that has blocks associated with it.
1243  */
1244 STATIC int
1245 xfs_inactive_symlink_rmt(
1246         xfs_inode_t     *ip,
1247         xfs_trans_t     **tpp)
1248 {
1249         xfs_buf_t       *bp;
1250         int             committed;
1251         int             done;
1252         int             error;
1253         xfs_fsblock_t   first_block;
1254         xfs_bmap_free_t free_list;
1255         int             i;
1256         xfs_mount_t     *mp;
1257         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1258         int             nmaps;
1259         xfs_trans_t     *ntp;
1260         int             size;
1261         xfs_trans_t     *tp;
1262
1263         tp = *tpp;
1264         mp = ip->i_mount;
1265         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1266         /*
1267          * We're freeing a symlink that has some
1268          * blocks allocated to it.  Free the
1269          * blocks here.  We know that we've got
1270          * either 1 or 2 extents and that we can
1271          * free them all in one bunmapi call.
1272          */
1273         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1274         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1275                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1276                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1277                 xfs_trans_cancel(tp, 0);
1278                 *tpp = NULL;
1279                 return error;
1280         }
1281         /*
1282          * Lock the inode, fix the size, and join it to the transaction.
1283          * Hold it so in the normal path, we still have it locked for
1284          * the second transaction.  In the error paths we need it
1285          * held so the cancel won't rele it, see below.
1286          */
1287         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1288         size = (int)ip->i_d.di_size;
1289         ip->i_d.di_size = 0;
1290         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1291         xfs_trans_ihold(tp, ip);
1292         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1293         /*
1294          * Find the block(s) so we can inval and unmap them.
1295          */
1296         done = 0;
1297         XFS_BMAP_INIT(&free_list, &first_block);
1298         nmaps = ARRAY_SIZE(mval);
1299         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1300                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1301                         &free_list, NULL)))
1302                 goto error0;
1303         /*
1304          * Invalidate the block(s).
1305          */
1306         for (i = 0; i < nmaps; i++) {
1307                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1308                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1309                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1310                 xfs_trans_binval(tp, bp);
1311         }
1312         /*
1313          * Unmap the dead block(s) to the free_list.
1314          */
1315         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1316                         &first_block, &free_list, NULL, &done)))
1317                 goto error1;
1318         ASSERT(done);
1319         /*
1320          * Commit the first transaction.  This logs the EFI and the inode.
1321          */
1322         if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
1323                 goto error1;
1324         /*
1325          * The transaction must have been committed, since there were
1326          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1327          * The new tp has the extent freeing and EFDs.
1328          */
1329         ASSERT(committed);
1330         /*
1331          * The first xact was committed, so add the inode to the new one.
1332          * Mark it dirty so it will be logged and moved forward in the log as
1333          * part of every commit.
1334          */
1335         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1336         xfs_trans_ihold(tp, ip);
1337         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1338         /*
1339          * Get a new, empty transaction to return to our caller.
1340          */
1341         ntp = xfs_trans_dup(tp);
1342         /*
1343          * Commit the transaction containing extent freeing and EFDs.
1344          * If we get an error on the commit here or on the reserve below,
1345          * we need to unlock the inode since the new transaction doesn't
1346          * have the inode attached.
1347          */
1348         error = xfs_trans_commit(tp, 0);
1349         tp = ntp;
1350         if (error) {
1351                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1352                 goto error0;
1353         }
1354         /*
1355          * Remove the memory for extent descriptions (just bookkeeping).
1356          */
1357         if (ip->i_df.if_bytes)
1358                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1359         ASSERT(ip->i_df.if_bytes == 0);
1360         /*
1361          * Put an itruncate log reservation in the new transaction
1362          * for our caller.
1363          */
1364         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1365                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1366                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1367                 goto error0;
1368         }
1369         /*
1370          * Return with the inode locked but not joined to the transaction.
1371          */
1372         *tpp = tp;
1373         return 0;
1374
1375  error1:
1376         xfs_bmap_cancel(&free_list);
1377  error0:
1378         /*
1379          * Have to come here with the inode locked and either
1380          * (held and in the transaction) or (not in the transaction).
1381          * If the inode isn't held then cancel would iput it, but
1382          * that's wrong since this is inactive and the vnode ref
1383          * count is 0 already.
1384          * Cancel won't do anything to the inode if held, but it still
1385          * needs to be locked until the cancel is done, if it was
1386          * joined to the transaction.
1387          */
1388         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1389         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1390         *tpp = NULL;
1391         return error;
1392
1393 }
1394
1395 STATIC int
1396 xfs_inactive_symlink_local(
1397         xfs_inode_t     *ip,
1398         xfs_trans_t     **tpp)
1399 {
1400         int             error;
1401
1402         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1403         /*
1404          * We're freeing a symlink which fit into
1405          * the inode.  Just free the memory used
1406          * to hold the old symlink.
1407          */
1408         error = xfs_trans_reserve(*tpp, 0,
1409                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1410                                   0, XFS_TRANS_PERM_LOG_RES,
1411                                   XFS_ITRUNCATE_LOG_COUNT);
1412
1413         if (error) {
1414                 xfs_trans_cancel(*tpp, 0);
1415                 *tpp = NULL;
1416                 return error;
1417         }
1418         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1419
1420         /*
1421          * Zero length symlinks _can_ exist.
1422          */
1423         if (ip->i_df.if_bytes > 0) {
1424                 xfs_idata_realloc(ip,
1425                                   -(ip->i_df.if_bytes),
1426                                   XFS_DATA_FORK);
1427                 ASSERT(ip->i_df.if_bytes == 0);
1428         }
1429         return 0;
1430 }
1431
1432 STATIC int
1433 xfs_inactive_attrs(
1434         xfs_inode_t     *ip,
1435         xfs_trans_t     **tpp)
1436 {
1437         xfs_trans_t     *tp;
1438         int             error;
1439         xfs_mount_t     *mp;
1440
1441         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1442         tp = *tpp;
1443         mp = ip->i_mount;
1444         ASSERT(ip->i_d.di_forkoff != 0);
1445         xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1446         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1447
1448         error = xfs_attr_inactive(ip);
1449         if (error) {
1450                 *tpp = NULL;
1451                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1452                 return error; /* goto out */
1453         }
1454
1455         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1456         error = xfs_trans_reserve(tp, 0,
1457                                   XFS_IFREE_LOG_RES(mp),
1458                                   0, XFS_TRANS_PERM_LOG_RES,
1459                                   XFS_INACTIVE_LOG_COUNT);
1460         if (error) {
1461                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1462                 xfs_trans_cancel(tp, 0);
1463                 *tpp = NULL;
1464                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1465                 return error;
1466         }
1467
1468         xfs_ilock(ip, XFS_ILOCK_EXCL);
1469         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1470         xfs_trans_ihold(tp, ip);
1471         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1472
1473         ASSERT(ip->i_d.di_anextents == 0);
1474
1475         *tpp = tp;
1476         return 0;
1477 }
1478
1479 int
1480 xfs_release(
1481         xfs_inode_t     *ip)
1482 {
1483         bhv_vnode_t     *vp = XFS_ITOV(ip);
1484         xfs_mount_t     *mp = ip->i_mount;
1485         int             error;
1486
1487         if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
1488                 return 0;
1489
1490         /* If this is a read-only mount, don't do this (would generate I/O) */
1491         if (mp->m_flags & XFS_MOUNT_RDONLY)
1492                 return 0;
1493
1494         if (!XFS_FORCED_SHUTDOWN(mp)) {
1495                 int truncated;
1496
1497                 /*
1498                  * If we are using filestreams, and we have an unlinked
1499                  * file that we are processing the last close on, then nothing
1500                  * will be able to reopen and write to this file. Purge this
1501                  * inode from the filestreams cache so that it doesn't delay
1502                  * teardown of the inode.
1503                  */
1504                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1505                         xfs_filestream_deassociate(ip);
1506
1507                 /*
1508                  * If we previously truncated this file and removed old data
1509                  * in the process, we want to initiate "early" writeout on
1510                  * the last close.  This is an attempt to combat the notorious
1511                  * NULL files problem which is particularly noticable from a
1512                  * truncate down, buffered (re-)write (delalloc), followed by
1513                  * a crash.  What we are effectively doing here is
1514                  * significantly reducing the time window where we'd otherwise
1515                  * be exposed to that problem.
1516                  */
1517                 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1518                 if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1519                         xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
1520         }
1521
1522         if (ip->i_d.di_nlink != 0) {
1523                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1524                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1525                        ip->i_delayed_blks > 0)) &&
1526                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1527                     (!(ip->i_d.di_flags &
1528                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1529                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1530                         if (error)
1531                                 return error;
1532                 }
1533         }
1534
1535         return 0;
1536 }
1537
/*
 * xfs_inactive
 *
 * This is called when the vnode reference count for the vnode
 * goes to zero.  If the file has been unlinked, then it must
 * now be truncated.  Also, we clear all of the read-ahead state
 * kept for the inode here since the file is now closed.
 * NOTE(review): no read-ahead handling is visible in the body below;
 * confirm whether that sentence is still accurate.
 *
 * Outline of the paths through this function:
 *  - inode already free or vnode bad: nothing to do.
 *  - read-only mount: nothing to do (after the DMAPI destroy event).
 *  - inode still linked (di_nlink != 0): at most trim blocks past EOF.
 *  - inode unlinked: truncate a regular file / tear down a symlink,
 *    remove the attribute fork, free the inode, adjust the inode quota
 *    count and detach the dquots.
 *
 * Every return path, success or failure, returns VN_INACTIVE_CACHE;
 * errors from the individual steps are not passed back to the caller.
 */
int
xfs_inactive(
        xfs_inode_t     *ip)
{
        bhv_vnode_t     *vp = XFS_ITOV(ip);
        xfs_bmap_free_t free_list;
        xfs_fsblock_t   first_block;
        int             committed;
        xfs_trans_t     *tp;
        xfs_mount_t     *mp;
        int             error;
        int             truncate;

        xfs_itrace_entry(ip);

        /*
         * If the inode is already free, then there can be nothing
         * to clean up here.
         */
        if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
                ASSERT(ip->i_df.if_real_bytes == 0);
                ASSERT(ip->i_df.if_broot_bytes == 0);
                return VN_INACTIVE_CACHE;
        }

        /*
         * Only do a truncate if it's a regular file with
         * some actual space in it.  It's OK to look at the
         * inode's fields without the lock because we're the
         * only one with a reference to the inode.
         */
        truncate = ((ip->i_d.di_nlink == 0) &&
            ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
             (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
            ((ip->i_d.di_mode & S_IFMT) == S_IFREG));

        mp = ip->i_mount;

        /* Tell DMAPI the unlinked inode is going away; the result is ignored. */
        if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) {
                (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
        }

        error = 0;

        /* If this is a read-only mount, don't do this (would generate I/O) */
        if (mp->m_flags & XFS_MOUNT_RDONLY)
                goto out;

        /*
         * Still linked: the inode stays around, so the only work is to
         * trim blocks beyond EOF for a regular file that has a size,
         * cached pages or delayed allocations and whose extents are read
         * in.  Preallocated and append-only files are skipped unless they
         * have delayed allocation blocks.
         */
        if (ip->i_d.di_nlink != 0) {
                if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
                     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
                       ip->i_delayed_blks > 0)) &&
                      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
                     (!(ip->i_d.di_flags &
                                (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
                      (ip->i_delayed_blks != 0)))) {
                        error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
                        if (error)
                                return VN_INACTIVE_CACHE;
                }
                goto out;
        }

        ASSERT(ip->i_d.di_nlink == 0);

        /* Attach the dquots before the inode count is adjusted below. */
        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
                return VN_INACTIVE_CACHE;

        /*
         * From here on the inode is unlinked.  Each of the three branches
         * below leaves tp reserved with the inode joined and held in it and
         * both the iolock and the inode lock held exclusively.
         */
        tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
        if (truncate) {
                /*
                 * Do the xfs_itruncate_start() call before
                 * reserving any log space because itruncate_start
                 * will call into the buffer cache and we can't
                 * do that within a transaction.
                 */
                xfs_ilock(ip, XFS_IOLOCK_EXCL);

                error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
                if (error) {
                        xfs_trans_cancel(tp, 0);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }

                error = xfs_trans_reserve(tp, 0,
                                          XFS_ITRUNCATE_LOG_RES(mp),
                                          0, XFS_TRANS_PERM_LOG_RES,
                                          XFS_ITRUNCATE_LOG_COUNT);
                if (error) {
                        /* Don't call itruncate_cleanup */
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }

                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);

                /*
                 * normally, we have to run xfs_itruncate_finish sync.
                 * But if filesystem is wsync and we're in the inactive
                 * path, then we know that nlink == 0, and that the
                 * xaction that made nlink == 0 is permanently committed
                 * since xfs_remove runs as a synchronous transaction.
                 */
                error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
                                (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));

                if (error) {
                        xfs_trans_cancel(tp,
                                XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }
        } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {

                /*
                 * If we get an error while cleaning up a
                 * symlink we bail out.
                 */
                /* Targets too large for the inode fork are stored remotely. */
                error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
                        xfs_inactive_symlink_rmt(ip, &tp) :
                        xfs_inactive_symlink_local(ip, &tp);

                if (error) {
                        ASSERT(tp == NULL);
                        return VN_INACTIVE_CACHE;
                }

                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);
        } else {
                /* Nothing to truncate: just reserve log space for the free. */
                error = xfs_trans_reserve(tp, 0,
                                          XFS_IFREE_LOG_RES(mp),
                                          0, XFS_TRANS_PERM_LOG_RES,
                                          XFS_INACTIVE_LOG_COUNT);
                if (error) {
                        ASSERT(XFS_FORCED_SHUTDOWN(mp));
                        xfs_trans_cancel(tp, 0);
                        return VN_INACTIVE_CACHE;
                }

                xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                xfs_trans_ihold(tp, ip);
        }

        /*
         * If there are attributes associated with the file
         * then blow them away now.  The code calls a routine
         * that recursively deconstructs the attribute fork.
         * We need to just commit the current transaction
         * because we can't use it for xfs_attr_inactive().
         */
        if (ip->i_d.di_anextents > 0) {
                error = xfs_inactive_attrs(ip, &tp);
                /*
                 * If we got an error, the transaction is already
                 * cancelled, and the inode is unlocked. Just get out.
                 */
                 if (error)
                         return VN_INACTIVE_CACHE;
        } else if (ip->i_afp) {
                /* No attribute extents on disk; only the in-core fork remains. */
                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
        }

        /*
         * Free the inode.
         */
        XFS_BMAP_INIT(&free_list, &first_block);
        error = xfs_ifree(tp, ip, &free_list);
        if (error) {
                /*
                 * If we fail to free the inode, shut down.  The cancel
                 * might do that, we need to make sure.  Otherwise the
                 * inode might be lost for a long time or forever.
                 */
                if (!XFS_FORCED_SHUTDOWN(mp)) {
                        cmn_err(CE_NOTE,
                "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
                                error, mp->m_fsname);
                        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
                }
                xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
        } else {
                /*
                 * Credit the quota account(s). The inode is gone.
                 */
                XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

                /*
                 * Just ignore errors at this point.  There is
                 * nothing we can do except to try to keep going.
                 */
                (void) xfs_bmap_finish(&tp,  &free_list, &committed);
                (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
        }
        /*
         * Release the dquots held by inode, if any.
         */
        XFS_QM_DQDETACH(mp, ip);

        /* The inode was held in the transaction, so unlock it by hand. */
        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);

 out:
        return VN_INACTIVE_CACHE;
}
1756
1757
1758 int
1759 xfs_lookup(
1760         xfs_inode_t             *dp,
1761         bhv_vname_t             *dentry,
1762         bhv_vnode_t             **vpp)
1763 {
1764         xfs_inode_t             *ip;
1765         xfs_ino_t               e_inum;
1766         int                     error;
1767         uint                    lock_mode;
1768
1769         xfs_itrace_entry(dp);
1770
1771         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1772                 return XFS_ERROR(EIO);
1773
1774         lock_mode = xfs_ilock_map_shared(dp);
1775         error = xfs_dir_lookup_int(dp, lock_mode, dentry, &e_inum, &ip);
1776         if (!error) {
1777                 *vpp = XFS_ITOV(ip);
1778                 xfs_itrace_ref(ip);
1779         }
1780         xfs_iunlock_map_shared(dp, lock_mode);
1781         return error;
1782 }
1783
/*
 * xfs_create
 *
 * Create a new directory entry for the name carried by 'dentry' in
 * directory 'dp', backed by a newly allocated inode.
 *
 *   dp     - directory to create the entry in
 *   dentry - supplies the name and its length (VNAME / VNAMELEN)
 *   mode   - mode passed to xfs_dir_ialloc() for the new inode
 *   rdev   - device number passed to xfs_dir_ialloc()
 *   vpp    - out: vnode of the new inode; must point at NULL on entry
 *   credp  - credentials used for quota lookup and inode allocation
 *
 * Returns 0 on success with *vpp set and an extra reference taken on the
 * new inode for the caller.  On failure returns the error and leaves
 * *vpp untouched.
 *
 * DMAPI: a DM_EVENT_CREATE event is sent first (if enabled) and its error
 * aborts the create.  A DM_EVENT_POSTCREATE event is sent from std_return
 * (if enabled) when the create succeeded, or when it failed after the
 * create event had been sent.
 */
int
xfs_create(
        xfs_inode_t             *dp,
        bhv_vname_t             *dentry,
        mode_t                  mode,
        xfs_dev_t               rdev,
        bhv_vnode_t             **vpp,
        cred_t                  *credp)
{
        char                    *name = VNAME(dentry);
        xfs_mount_t             *mp = dp->i_mount;
        bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
        xfs_inode_t             *ip;
        bhv_vnode_t             *vp = NULL;
        xfs_trans_t             *tp;
        int                     error;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        boolean_t               unlock_dp_on_error = B_FALSE;
        int                     dm_event_sent = 0;
        uint                    cancel_flags;
        int                     committed;
        xfs_prid_t              prid;
        struct xfs_dquot        *udqp, *gdqp;
        uint                    resblks;
        int                     namelen;

        ASSERT(!*vpp);
        xfs_itrace_entry(dp);

        namelen = VNAMELEN(dentry);

        if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
                                dir_vp, DM_RIGHT_NULL, NULL,
                                DM_RIGHT_NULL, name, NULL,
                                mode, 0, 0);

                if (error)
                        return error;
                dm_event_sent = 1;
        }

        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);

        /* Return through std_return after this point. */

        /* The new inode takes the directory's project id if it inherits one. */
        udqp = gdqp = NULL;
        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                prid = dp->i_d.di_projid;
        else
                prid = (xfs_prid_t)dfltprid;

        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = XFS_QM_DQVOPALLOC(mp, dp,
                        current_fsuid(credp), current_fsgid(credp), prid,
                        XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
        if (error)
                goto std_return;

        ip = NULL;

        tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        resblks = XFS_CREATE_SPACE_RES(mp, namelen);
        /*
         * Initially assume that the file does not exist and
         * reserve the resources for that case.  If that is not
         * the case we'll drop the one we have and get a more
         * appropriate transaction later.
         */
        error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
        if (error == ENOSPC) {
                /* No room for the block reservation: retry without one. */
                resblks = 0;
                error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
                                XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
        }
        if (error) {
                /* Nothing was reserved, so there is no log res to release. */
                cancel_flags = 0;
                goto error_return;
        }

        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
        unlock_dp_on_error = B_TRUE;

        XFS_BMAP_INIT(&free_list, &first_block);

        ASSERT(ip == NULL);

        /*
         * Reserve disk quota and the inode.
         */
        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
        if (error)
                goto error_return;

        /*
         * Without a block reservation, check up front that the name can
         * be entered into the directory.
         */
        if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
                goto error_return;
        error = xfs_dir_ialloc(&tp, dp, mode, 1,
                        rdev, credp, prid, resblks > 0,
                        &ip, &committed);
        if (error) {
                /* ENOSPC is cancelled without XFS_TRANS_ABORT; the rest abort. */
                if (error == ENOSPC)
                        goto error_return;
                goto abort_return;
        }
        xfs_itrace_ref(ip);

        /*
         * At this point, we've gotten a newly allocated inode.
         * It is locked (and joined to the transaction).
         */

        ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));

        /*
         * Now we join the directory inode to the transaction.  We do not do it
         * earlier because xfs_dir_ialloc might commit the previous transaction
         * (and release all the locks).  An error from here on will result in
         * the transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
        VN_HOLD(dir_vp);
        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        unlock_dp_on_error = B_FALSE;

        error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
                                        &first_block, &free_list, resblks ?
                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
                ASSERT(error != ENOSPC);
                goto abort_return;
        }
        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

        /*
         * If this is a synchronous mount, make sure that the
         * create transaction goes to disk before returning to
         * the user.
         */
        if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
                xfs_trans_set_sync(tp);
        }

        dp->i_gen++;

        /*
         * Attach the dquot(s) to the inodes and modify them incore.
         * These ids of the inode couldn't have changed since the new
         * inode has been locked ever since it was created.
         */
        XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);

        /*
         * xfs_trans_commit normally decrements the vnode ref count
         * when it unlocks the inode. Since we want to return the
         * vnode to the caller, we bump the vnode ref count now.
         */
        IHOLD(ip);
        vp = XFS_ITOV(ip);

        error = xfs_bmap_finish(&tp, &free_list, &committed);
        if (error) {
                xfs_bmap_cancel(&free_list);
                goto abort_rele;
        }

        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
        if (error) {
                /*
                 * Drop the reference taken above and clear tp so that
                 * error_return does not cancel the transaction.
                 */
                IRELE(ip);
                tp = NULL;
                goto error_return;
        }

        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        *vpp = vp;

        /* Fallthrough to std_return with error = 0  */

std_return:
        /*
         * Common exit.  The post-create event carries the new vnode on
         * success (NULL otherwise) together with the final error code.
         */
        if ((*vpp || (error != 0 && dm_event_sent != 0)) &&
            DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
                        dir_vp, DM_RIGHT_NULL,
                        *vpp ? vp:NULL,
                        DM_RIGHT_NULL, name, NULL,
                        mode, error, 0);
        }
        return error;

 abort_return:
        cancel_flags |= XFS_TRANS_ABORT;
        /* FALLTHROUGH */

 error_return:
        if (tp != NULL)
                xfs_trans_cancel(tp, cancel_flags);

        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        /* Only set while dp is locked but not yet joined to the transaction. */
        if (unlock_dp_on_error)
                xfs_iunlock(dp, XFS_ILOCK_EXCL);

        goto std_return;

 abort_rele:
        /*
         * Wait until after the current transaction is aborted to
         * release the inode.  This prevents recursive transactions
         * and deadlocks from xfs_inactive.
         */
        cancel_flags |= XFS_TRANS_ABORT;
        xfs_trans_cancel(tp, cancel_flags);
        IRELE(ip);

        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);

        goto std_return;
}
2012
#ifdef DEBUG
/*
 * Some counters to see if (and how often) we are hitting some deadlock
 * prevention code paths.  They are only built in DEBUG kernels and are
 * updated by xfs_lock_dir_and_entry() without any locking of their own.
 */

int xfs_rm_locks;               /* calls to xfs_lock_dir_and_entry() */
int xfs_rm_lock_delays;         /* times a retry backed off with delay(1) */
int xfs_rm_attempts;            /* failed trylocks that forced a retry */
#endif
2023
2024 /*
2025  * The following routine will lock the inodes associated with the
2026  * directory and the named entry in the directory. The locks are
2027  * acquired in increasing inode number.
2028  *
2029  * If the entry is "..", then only the directory is locked. The
2030  * vnode ref count will still include that from the .. entry in
2031  * this case.
2032  *
2033  * There is a deadlock we need to worry about. If the locked directory is
2034  * in the AIL, it might be blocking up the log. The next inode we lock
2035  * could be already locked by another thread waiting for log space (e.g
2036  * a permanent log reservation with a long running transaction (see
2037  * xfs_itruncate_finish)). To solve this, we must check if the directory
2038  * is in the ail and use lock_nowait. If we can't lock, we need to
2039  * drop the inode lock on the directory and try again. xfs_iunlock will
2040  * potentially push the tail if we were holding up the log.
2041  */
2042 STATIC int
2043 xfs_lock_dir_and_entry(
2044         xfs_inode_t     *dp,
2045         xfs_inode_t     *ip)    /* inode of entry 'name' */
2046 {
2047         int             attempts;
2048         xfs_ino_t       e_inum;
2049         xfs_inode_t     *ips[2];
2050         xfs_log_item_t  *lp;
2051
2052 #ifdef DEBUG
2053         xfs_rm_locks++;
2054 #endif
2055         attempts = 0;
2056
2057 again:
2058         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2059
2060         e_inum = ip->i_ino;
2061
2062         xfs_itrace_ref(ip);
2063
2064         /*
2065          * We want to lock in increasing inum. Since we've already
2066          * acquired the lock on the directory, we may need to release
2067          * if if the inum of the entry turns out to be less.
2068          */
2069         if (e_inum > dp->i_ino) {
2070                 /*
2071                  * We are already in the right order, so just
2072                  * lock on the inode of the entry.
2073                  * We need to use nowait if dp is in the AIL.
2074                  */
2075
2076                 lp = (xfs_log_item_t *)dp->i_itemp;
2077                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2078                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2079                                 attempts++;
2080 #ifdef DEBUG
2081                                 xfs_rm_attempts++;
2082 #endif
2083
2084                                 /*
2085                                  * Unlock dp and try again.
2086                                  * xfs_iunlock will try to push the tail
2087                                  * if the inode is in the AIL.
2088                                  */
2089
2090                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2091
2092                                 if ((attempts % 5) == 0) {
2093                                         delay(1); /* Don't just spin the CPU */
2094 #ifdef DEBUG
2095                                         xfs_rm_lock_delays++;
2096 #endif
2097                                 }
2098                                 goto again;
2099                         }
2100                 } else {
2101                         xfs_ilock(ip, XFS_ILOCK_EXCL);
2102                 }
2103         } else if (e_inum < dp->i_ino) {
2104                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2105
2106                 ips[0] = ip;
2107                 ips[1] = dp;
2108                 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2109         }
2110         /* else  e_inum == dp->i_ino */
2111         /*     This can happen if we're asked to lock /x/..
2112          *     the entry is "..", which is also the parent directory.
2113          */
2114
2115         return 0;
2116 }
2117
#ifdef DEBUG
/*
 * Statistics on how xfs_lock_inodes() calls went; only built in DEBUG
 * kernels.  Each call bumps exactly one of the first four counters
 * according to how many trylock failures it saw.
 */
int xfs_locked_n;               /* calls that needed no retry at all */
int xfs_small_retries;          /* calls with 1 to 4 failed attempts */
int xfs_middle_retries;         /* calls with 5 to 99 failed attempts */
int xfs_lots_retries;           /* calls with 100 or more failed attempts */
int xfs_lock_delays;            /* times a retry backed off with delay(1) */
#endif
2125
2126 /*
2127  * Bump the subclass so xfs_lock_inodes() acquires each lock with
2128  * a different value
2129  */
2130 static inline int
2131 xfs_lock_inumorder(int lock_mode, int subclass)
2132 {
2133         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
2134                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
2135         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
2136                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
2137
2138         return lock_mode;
2139 }
2140
2141 /*
2142  * The following routine will lock n inodes in exclusive mode.
2143  * We assume the caller calls us with the inodes in i_ino order.
2144  *
2145  * We need to detect deadlock where an inode that we lock
2146  * is in the AIL and we start waiting for another inode that is locked
2147  * by a thread in a long running transaction (such as truncate). This can
2148  * result in deadlock since the long running trans might need to wait
2149  * for the inode we just locked in order to push the tail and free space
2150  * in the log.
2151  */
2152 void
2153 xfs_lock_inodes(
2154         xfs_inode_t     **ips,
2155         int             inodes,
2156         int             first_locked,
2157         uint            lock_mode)
2158 {
2159         int             attempts = 0, i, j, try_lock;
2160         xfs_log_item_t  *lp;
2161
2162         ASSERT(ips && (inodes >= 2)); /* we need at least two */
2163
2164         if (first_locked) {
2165                 try_lock = 1;
2166                 i = 1;
2167         } else {
2168                 try_lock = 0;
2169                 i = 0;
2170         }
2171
2172 again:
2173         for (; i < inodes; i++) {
2174                 ASSERT(ips[i]);
2175
2176                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
2177                         continue;
2178
2179                 /*
2180                  * If try_lock is not set yet, make sure all locked inodes
2181                  * are not in the AIL.
2182                  * If any are, set try_lock to be used later.
2183                  */
2184
2185                 if (!try_lock) {
2186                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
2187                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2188                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2189                                         try_lock++;
2190                                 }
2191                         }
2192                 }
2193
2194                 /*
2195                  * If any of the previous locks we have locked is in the AIL,
2196                  * we must TRY to get the second and subsequent locks. If
2197                  * we can't get any, we must release all we have
2198                  * and try again.
2199                  */
2200
2201                 if (try_lock) {
2202                         /* try_lock must be 0 if i is 0. */
2203                         /*
2204                          * try_lock means we have an inode locked
2205                          * that is in the AIL.
2206                          */
2207                         ASSERT(i != 0);
2208                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
2209                                 attempts++;
2210
2211                                 /*
2212                                  * Unlock all previous guys and try again.
2213                                  * xfs_iunlock will try to push the tail
2214                                  * if the inode is in the AIL.
2215                                  */
2216
2217                                 for(j = i - 1; j >= 0; j--) {
2218
2219                                         /*
2220                                          * Check to see if we've already
2221                                          * unlocked this one.
2222                                          * Not the first one going back,
2223                                          * and the inode ptr is the same.
2224                                          */
2225                                         if ((j != (i - 1)) && ips[j] ==
2226                                                                 ips[j+1])
2227                                                 continue;
2228
2229                                         xfs_iunlock(ips[j], lock_mode);
2230                                 }
2231
2232                                 if ((attempts % 5) == 0) {
2233                                         delay(1); /* Don't just spin the CPU */
2234 #ifdef DEBUG
2235                                         xfs_lock_delays++;
2236 #endif
2237                                 }
2238                                 i = 0;
2239                                 try_lock = 0;
2240                                 goto again;
2241                         }
2242                 } else {
2243                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
2244                 }
2245         }
2246
2247 #ifdef DEBUG
2248         if (attempts) {
2249                 if (attempts < 5) xfs_small_retries++;
2250                 else if (attempts < 100) xfs_middle_retries++;
2251                 else xfs_lots_retries++;
2252         } else {
2253                 xfs_locked_n++;
2254         }
2255 #endif
2256 }
2257
/*
 * REMOVE_DEBUG_TRACE(x) records x in remove_which_error_return on DEBUG
 * kernels and expands to nothing otherwise.  xfs_remove() passes __LINE__
 * so that the variable identifies which error exit was taken.  The DEBUG
 * expansion is a bare { } block, so it should only be used as a full
 * statement inside braces, not as the body of an unbraced if/else.
 */
#ifdef  DEBUG
#define REMOVE_DEBUG_TRACE(x)   {remove_which_error_return = (x);}
int remove_which_error_return = 0;
#else /* ! DEBUG */
#define REMOVE_DEBUG_TRACE(x)
#endif  /* ! DEBUG */
2264
2265 int
2266 xfs_remove(
2267         xfs_inode_t             *dp,
2268         bhv_vname_t             *dentry)
2269 {
2270         bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
2271         char                    *name = VNAME(dentry);
2272         xfs_mount_t             *mp = dp->i_mount;
2273         xfs_inode_t             *ip;
2274         xfs_trans_t             *tp = NULL;
2275         int                     error = 0;
2276         xfs_bmap_free_t         free_list;
2277         xfs_fsblock_t           first_block;
2278         int                     cancel_flags;
2279         int                     committed;
2280         int                     dm_di_mode = 0;
2281         int                     link_zero;
2282         uint                    resblks;
2283         int                     namelen;
2284
2285         xfs_itrace_entry(dp);
2286
2287         if (XFS_FORCED_SHUTDOWN(mp))
2288                 return XFS_ERROR(EIO);
2289
2290         namelen = VNAMELEN(dentry);
2291
2292         if (!xfs_get_dir_entry(dentry, &ip)) {
2293                 dm_di_mode = ip->i_d.di_mode;
2294                 IRELE(ip);
2295         }
2296
2297         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2298                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2299                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2300                                         name, NULL, dm_di_mode, 0, 0);
2301                 if (error)
2302                         return error;
2303         }
2304
2305         /* From this point on, return through std_return */
2306         ip = NULL;
2307
2308         /*
2309          * We need to get a reference to ip before we get our log
2310          * reservation. The reason for this is that we cannot call
2311          * xfs_iget for an inode for which we do not have a reference
2312          * once we've acquired a log reservation. This is because the
2313          * inode we are trying to get might be in xfs_inactive going
2314          * for a log reservation. Since we'll have to wait for the
2315          * inactive code to complete before returning from xfs_iget,
2316          * we need to make sure that we don't have log space reserved
2317          * when we call xfs_iget.  Instead we get an unlocked reference
2318          * to the inode before getting our log reservation.
2319          */
2320         error = xfs_get_dir_entry(dentry, &ip);
2321         if (error) {
2322                 REMOVE_DEBUG_TRACE(__LINE__);
2323                 goto std_return;
2324         }
2325
2326         dm_di_mode = ip->i_d.di_mode;
2327
2328         xfs_itrace_entry(ip);
2329         xfs_itrace_ref(ip);
2330
2331         error = XFS_QM_DQATTACH(mp, dp, 0);
2332         if (!error && dp != ip)
2333                 error = XFS_QM_DQATTACH(mp, ip, 0);
2334         if (error) {
2335                 REMOVE_DEBUG_TRACE(__LINE__);
2336                 IRELE(ip);
2337                 goto std_return;
2338         }
2339
2340         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2341         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2342         /*
2343          * We try to get the real space reservation first,
2344          * allowing for directory btree deletion(s) implying
2345          * possible bmap insert(s).  If we can't get the space
2346          * reservation then we use 0 instead, and avoid the bmap
2347          * btree insert(s) in the directory code by, if the bmap
2348          * insert tries to happen, instead trimming the LAST
2349          * block from the directory.
2350          */
2351         resblks = XFS_REMOVE_SPACE_RES(mp);
2352         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2353                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2354         if (error == ENOSPC) {
2355                 resblks = 0;
2356                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2357                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2358         }
2359         if (error) {
2360                 ASSERT(error != ENOSPC);
2361                 REMOVE_DEBUG_TRACE(__LINE__);
2362                 xfs_trans_cancel(tp, 0);
2363                 IRELE(ip);
2364                 return error;
2365         }
2366
2367         error = xfs_lock_dir_and_entry(dp, ip);
2368         if (error) {
2369                 REMOVE_DEBUG_TRACE(__LINE__);
2370                 xfs_trans_cancel(tp, cancel_flags);
2371                 IRELE(ip);
2372                 goto std_return;
2373         }
2374
2375         /*
2376          * At this point, we've gotten both the directory and the entry
2377          * inodes locked.
2378          */
2379         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2380         if (dp != ip) {
2381                 /*
2382                  * Increment vnode ref count only in this case since
2383                  * there's an extra vnode reference in the case where
2384                  * dp == ip.
2385                  */
2386                 IHOLD(dp);
2387                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2388         }
2389
2390         /*
2391          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2392          */
2393         XFS_BMAP_INIT(&free_list, &first_block);
2394         error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
2395                                         &first_block, &free_list, 0);
2396         if (error) {
2397                 ASSERT(error != ENOENT);
2398                 REMOVE_DEBUG_TRACE(__LINE__);
2399                 goto error1;
2400         }
2401         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2402
2403         dp->i_gen++;
2404         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2405
2406         error = xfs_droplink(tp, ip);
2407         if (error) {
2408                 REMOVE_DEBUG_TRACE(__LINE__);
2409                 goto error1;
2410         }
2411
2412         /* Determine if this is the last link while
2413          * we are in the transaction.
2414          */
2415         link_zero = (ip)->i_d.di_nlink==0;
2416
2417         /*
2418          * Take an extra ref on the inode so that it doesn't
2419          * go to xfs_inactive() from within the commit.
2420          */
2421         IHOLD(ip);
2422
2423         /*
2424          * If this is a synchronous mount, make sure that the
2425          * remove transaction goes to disk before returning to
2426          * the user.
2427          */
2428         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2429                 xfs_trans_set_sync(tp);
2430         }
2431
2432         error = xfs_bmap_finish(&tp, &free_list, &committed);
2433         if (error) {
2434                 REMOVE_DEBUG_TRACE(__LINE__);
2435                 goto error_rele;
2436         }
2437
2438         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2439         if (error) {
2440                 IRELE(ip);
2441                 goto std_return;
2442         }
2443
2444         /*
2445          * If we are using filestreams, kill the stream association.
2446          * If the file is still open it may get a new one but that
2447          * will get killed on last close in xfs_close() so we don't
2448          * have to worry about that.
2449          */
2450         if (link_zero && xfs_inode_is_filestream(ip))
2451                 xfs_filestream_deassociate(ip);
2452
2453         xfs_itrace_exit(ip);
2454         IRELE(ip);
2455
2456 /*      Fall through to std_return with error = 0 */
2457  std_return:
2458         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
2459                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2460                                 dir_vp, DM_RIGHT_NULL,
2461                                 NULL, DM_RIGHT_NULL,
2462                                 name, NULL, dm_di_mode, error, 0);
2463         }
2464         return error;
2465
2466  error1:
2467         xfs_bmap_cancel(&free_list);
2468         cancel_flags |= XFS_TRANS_ABORT;
2469         xfs_trans_cancel(tp, cancel_flags);
2470         goto std_return;
2471
2472  error_rele:
2473         /*
2474          * In this case make sure to not release the inode until after
2475          * the current transaction is aborted.  Releasing it beforehand
2476          * can cause us to go to xfs_inactive and start a recursive
2477          * transaction which can easily deadlock with the current one.
2478          */
2479         xfs_bmap_cancel(&free_list);
2480         cancel_flags |= XFS_TRANS_ABORT;
2481         xfs_trans_cancel(tp, cancel_flags);
2482
2483         IRELE(ip);
2484
2485         goto std_return;
2486 }
2487
2488 int
2489 xfs_link(
2490         xfs_inode_t             *tdp,
2491         bhv_vnode_t             *src_vp,
2492         bhv_vname_t             *dentry)
2493 {
2494         bhv_vnode_t             *target_dir_vp = XFS_ITOV(tdp);
2495         xfs_mount_t             *mp = tdp->i_mount;
2496         xfs_inode_t             *sip = xfs_vtoi(src_vp);
2497         xfs_trans_t             *tp;
2498         xfs_inode_t             *ips[2];
2499         int                     error;
2500         xfs_bmap_free_t         free_list;
2501         xfs_fsblock_t           first_block;
2502         int                     cancel_flags;
2503         int                     committed;
2504         int                     resblks;
2505         char                    *target_name = VNAME(dentry);
2506         int                     target_namelen;
2507
2508         xfs_itrace_entry(tdp);
2509         xfs_itrace_entry(xfs_vtoi(src_vp));
2510
2511         target_namelen = VNAMELEN(dentry);
2512         ASSERT(!VN_ISDIR(src_vp));
2513
2514         if (XFS_FORCED_SHUTDOWN(mp))
2515                 return XFS_ERROR(EIO);
2516
2517         if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
2518                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2519                                         target_dir_vp, DM_RIGHT_NULL,
2520                                         src_vp, DM_RIGHT_NULL,
2521                                         target_name, NULL, 0, 0, 0);
2522                 if (error)
2523                         return error;
2524         }
2525
2526         /* Return through std_return after this point. */
2527
2528         error = XFS_QM_DQATTACH(mp, sip, 0);
2529         if (!error && sip != tdp)
2530                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2531         if (error)
2532                 goto std_return;
2533
2534         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2535         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2536         resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2537         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2538                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2539         if (error == ENOSPC) {
2540                 resblks = 0;
2541                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2542                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2543         }
2544         if (error) {
2545                 cancel_flags = 0;
2546                 goto error_return;
2547         }
2548
2549         if (sip->i_ino < tdp->i_ino) {
2550                 ips[0] = sip;
2551                 ips[1] = tdp;
2552         } else {
2553                 ips[0] = tdp;
2554                 ips[1] = sip;
2555         }
2556
2557         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2558
2559         /*
2560          * Increment vnode ref counts since xfs_trans_commit &
2561          * xfs_trans_cancel will both unlock the inodes and
2562          * decrement the associated ref counts.
2563          */
2564         VN_HOLD(src_vp);
2565         VN_HOLD(target_dir_vp);
2566         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2567         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2568
2569         /*
2570          * If the source has too many links, we can't make any more to it.
2571          */
2572         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2573                 error = XFS_ERROR(EMLINK);
2574                 goto error_return;
2575         }
2576
2577         /*
2578          * If we are using project inheritance, we only allow hard link
2579          * creation in our tree when the project IDs are the same; else
2580          * the tree quota mechanism could be circumvented.
2581          */
2582         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2583                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2584                 error = XFS_ERROR(EXDEV);
2585                 goto error_return;
2586         }
2587
2588         if (resblks == 0 &&
2589             (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
2590                 goto error_return;
2591
2592         XFS_BMAP_INIT(&free_list, &first_block);
2593
2594         error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
2595                                    sip->i_ino, &first_block, &free_list,
2596                                    resblks);
2597         if (error)
2598                 goto abort_return;
2599         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2600         tdp->i_gen++;
2601         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2602
2603         error = xfs_bumplink(tp, sip);
2604         if (error)
2605                 goto abort_return;
2606
2607         /*
2608          * If this is a synchronous mount, make sure that the
2609          * link transaction goes to disk before returning to
2610          * the user.
2611          */
2612         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2613                 xfs_trans_set_sync(tp);
2614         }
2615
2616         error = xfs_bmap_finish (&tp, &free_list, &committed);
2617         if (error) {
2618                 xfs_bmap_cancel(&free_list);
2619                 goto abort_return;
2620         }
2621
2622         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2623         if (error)
2624                 goto std_return;
2625
2626         /* Fall through to std_return with error = 0. */
2627 std_return:
2628         if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
2629                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2630                                 target_dir_vp, DM_RIGHT_NULL,
2631                                 src_vp, DM_RIGHT_NULL,
2632                                 target_name, NULL, 0, error, 0);
2633         }
2634         return error;
2635
2636  abort_return:
2637         cancel_flags |= XFS_TRANS_ABORT;
2638         /* FALLTHROUGH */
2639
2640  error_return:
2641         xfs_trans_cancel(tp, cancel_flags);
2642         goto std_return;
2643 }
2644
2645
2646 int
2647 xfs_mkdir(
2648         xfs_inode_t             *dp,
2649         bhv_vname_t             *dentry,
2650         mode_t                  mode,
2651         bhv_vnode_t             **vpp,
2652         cred_t                  *credp)
2653 {
2654         bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
2655         char                    *dir_name = VNAME(dentry);
2656         int                     dir_namelen = VNAMELEN(dentry);
2657         xfs_mount_t             *mp = dp->i_mount;
2658         xfs_inode_t             *cdp;   /* inode of created dir */
2659         bhv_vnode_t             *cvp;   /* vnode of created dir */
2660         xfs_trans_t             *tp;
2661         int                     cancel_flags;
2662         int                     error;
2663         int                     committed;
2664         xfs_bmap_free_t         free_list;
2665         xfs_fsblock_t           first_block;
2666         boolean_t               unlock_dp_on_error = B_FALSE;
2667         boolean_t               created = B_FALSE;
2668         int                     dm_event_sent = 0;
2669         xfs_prid_t              prid;
2670         struct xfs_dquot        *udqp, *gdqp;
2671         uint                    resblks;
2672
2673         if (XFS_FORCED_SHUTDOWN(mp))
2674                 return XFS_ERROR(EIO);
2675
2676         tp = NULL;
2677
2678         if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
2679                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2680                                         dir_vp, DM_RIGHT_NULL, NULL,
2681                                         DM_RIGHT_NULL, dir_name, NULL,
2682                                         mode, 0, 0);
2683                 if (error)
2684                         return error;
2685                 dm_event_sent = 1;
2686         }
2687
2688         /* Return through std_return after this point. */
2689
2690         xfs_itrace_entry(dp);
2691
2692         mp = dp->i_mount;
2693         udqp = gdqp = NULL;
2694         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2695                 prid = dp->i_d.di_projid;
2696         else
2697                 prid = (xfs_prid_t)dfltprid;
2698
2699         /*
2700          * Make sure that we have allocated dquot(s) on disk.
2701          */
2702         error = XFS_QM_DQVOPALLOC(mp, dp,
2703                         current_fsuid(credp), current_fsgid(credp), prid,
2704                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2705         if (error)
2706                 goto std_return;
2707
2708         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2709         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2710         resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2711         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2712                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2713         if (error == ENOSPC) {
2714                 resblks = 0;
2715                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2716                                           XFS_TRANS_PERM_LOG_RES,
2717                                           XFS_MKDIR_LOG_COUNT);
2718         }
2719         if (error) {
2720                 cancel_flags = 0;
2721                 goto error_return;
2722         }
2723
2724         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2725         unlock_dp_on_error = B_TRUE;
2726
2727         /*
2728          * Check for directory link count overflow.
2729          */
2730         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2731                 error = XFS_ERROR(EMLINK);
2732                 goto error_return;
2733         }
2734
2735         /*
2736          * Reserve disk quota and the inode.
2737          */
2738         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2739         if (error)
2740                 goto error_return;
2741
2742         if (resblks == 0 &&
2743             (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
2744                 goto error_return;
2745         /*
2746          * create the directory inode.
2747          */
2748         error = xfs_dir_ialloc(&tp, dp, mode, 2,
2749                         0, credp, prid, resblks > 0,
2750                 &cdp, NULL);
2751         if (error) {
2752                 if (error == ENOSPC)
2753                         goto error_return;
2754                 goto abort_return;
2755         }
2756         xfs_itrace_ref(cdp);
2757
2758         /*
2759          * Now we add the directory inode to the transaction.
2760          * We waited until now since xfs_dir_ialloc might start
2761          * a new transaction.  Had we joined the transaction
2762          * earlier, the locks might have gotten released. An error
2763          * from here on will result in the transaction cancel
2764          * unlocking dp so don't do it explicitly in the error path.
2765          */
2766         VN_HOLD(dir_vp);
2767         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2768         unlock_dp_on_error = B_FALSE;
2769
2770         XFS_BMAP_INIT(&free_list, &first_block);
2771
2772         error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
2773                                    &first_block, &free_list, resblks ?
2774                                    resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2775         if (error) {
2776                 ASSERT(error != ENOSPC);
2777                 goto error1;
2778         }
2779         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2780
2781         /*
2782          * Bump the in memory version number of the parent directory
2783          * so that other processes accessing it will recognize that
2784          * the directory has changed.
2785          */
2786         dp->i_gen++;
2787
2788         error = xfs_dir_init(tp, cdp, dp);
2789         if (error)
2790                 goto error2;
2791
2792         cdp->i_gen = 1;
2793         error = xfs_bumplink(tp, dp);
2794         if (error)
2795                 goto error2;
2796
2797         cvp = XFS_ITOV(cdp);
2798
2799         created = B_TRUE;
2800
2801         *vpp = cvp;
2802         IHOLD(cdp);
2803
2804         /*
2805          * Attach the dquots to the new inode and modify the icount incore.
2806          */
2807         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2808
2809         /*
2810          * If this is a synchronous mount, make sure that the
2811          * mkdir transaction goes to disk before returning to
2812          * the user.
2813          */
2814         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2815                 xfs_trans_set_sync(tp);
2816         }
2817
2818         error = xfs_bmap_finish(&tp, &free_list, &committed);
2819         if (error) {
2820                 IRELE(cdp);
2821                 goto error2;
2822         }
2823
2824         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2825         XFS_QM_DQRELE(mp, udqp);
2826         XFS_QM_DQRELE(mp, gdqp);
2827         if (error) {
2828                 IRELE(cdp);
2829         }
2830
2831         /* Fall through to std_return with error = 0 or errno from
2832          * xfs_trans_commit. */
2833
2834 std_return:
2835         if ((created || (error != 0 && dm_event_sent != 0)) &&
2836             DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
2837                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2838                                         dir_vp, DM_RIGHT_NULL,
2839                                         created ? XFS_ITOV(cdp):NULL,
2840                                         DM_RIGHT_NULL,
2841                                         dir_name, NULL,
2842                                         mode, error, 0);
2843         }
2844         return error;
2845
2846  error2:
2847  error1:
2848         xfs_bmap_cancel(&free_list);
2849  abort_return:
2850         cancel_flags |= XFS_TRANS_ABORT;
2851  error_return:
2852         xfs_trans_cancel(tp, cancel_flags);
2853         XFS_QM_DQRELE(mp, udqp);
2854         XFS_QM_DQRELE(mp, gdqp);
2855
2856         if (unlock_dp_on_error)
2857                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2858
2859         goto std_return;
2860 }
2861
2862 int
2863 xfs_rmdir(
2864         xfs_inode_t             *dp,
2865         bhv_vname_t             *dentry)
2866 {
2867         bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
2868         char                    *name = VNAME(dentry);
2869         int                     namelen = VNAMELEN(dentry);
2870         xfs_mount_t             *mp = dp->i_mount;
2871         xfs_inode_t             *cdp;   /* child directory */
2872         xfs_trans_t             *tp;
2873         int                     error;
2874         xfs_bmap_free_t         free_list;
2875         xfs_fsblock_t           first_block;
2876         int                     cancel_flags;
2877         int                     committed;
2878         int                     dm_di_mode = S_IFDIR;
2879         int                     last_cdp_link;
2880         uint                    resblks;
2881
2882         xfs_itrace_entry(dp);
2883
2884         if (XFS_FORCED_SHUTDOWN(mp))
2885                 return XFS_ERROR(EIO);
2886
2887         if (!xfs_get_dir_entry(dentry, &cdp)) {
2888                 dm_di_mode = cdp->i_d.di_mode;
2889                 IRELE(cdp);
2890         }
2891
2892         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2893                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
2894                                         dir_vp, DM_RIGHT_NULL,
2895                                         NULL, DM_RIGHT_NULL,
2896                                         name, NULL, dm_di_mode, 0, 0);
2897                 if (error)
2898                         return XFS_ERROR(error);
2899         }
2900
2901         /* Return through std_return after this point. */
2902
2903         cdp = NULL;
2904
2905         /*
2906          * We need to get a reference to cdp before we get our log
2907          * reservation.  The reason for this is that we cannot call
2908          * xfs_iget for an inode for which we do not have a reference
2909          * once we've acquired a log reservation.  This is because the
2910          * inode we are trying to get might be in xfs_inactive going
2911          * for a log reservation.  Since we'll have to wait for the
2912          * inactive code to complete before returning from xfs_iget,
2913          * we need to make sure that we don't have log space reserved
2914          * when we call xfs_iget.  Instead we get an unlocked reference
2915          * to the inode before getting our log reservation.
2916          */
2917         error = xfs_get_dir_entry(dentry, &cdp);
2918         if (error) {
2919                 REMOVE_DEBUG_TRACE(__LINE__);
2920                 goto std_return;
2921         }
2922         mp = dp->i_mount;
2923         dm_di_mode = cdp->i_d.di_mode;
2924
2925         /*
2926          * Get the dquots for the inodes.
2927          */
2928         error = XFS_QM_DQATTACH(mp, dp, 0);
2929         if (!error && dp != cdp)
2930                 error = XFS_QM_DQATTACH(mp, cdp, 0);
2931         if (error) {
2932                 IRELE(cdp);
2933                 REMOVE_DEBUG_TRACE(__LINE__);
2934                 goto std_return;
2935         }
2936
2937         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
2938         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2939         /*
2940          * We try to get the real space reservation first,
2941          * allowing for directory btree deletion(s) implying
2942          * possible bmap insert(s).  If we can't get the space
2943          * reservation then we use 0 instead, and avoid the bmap
2944          * btree insert(s) in the directory code by, if the bmap
2945          * insert tries to happen, instead trimming the LAST
2946          * block from the directory.
2947          */
2948         resblks = XFS_REMOVE_SPACE_RES(mp);
2949         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2950                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
2951         if (error == ENOSPC) {
2952                 resblks = 0;
2953                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2954                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
2955         }
2956         if (error) {
2957                 ASSERT(error != ENOSPC);
2958                 cancel_flags = 0;
2959                 IRELE(cdp);
2960                 goto error_return;
2961         }
2962         XFS_BMAP_INIT(&free_list, &first_block);
2963
2964         /*
2965          * Now lock the child directory inode and the parent directory
2966          * inode in the proper order.  This will take care of validating
2967          * that the directory entry for the child directory inode has
2968          * not changed while we were obtaining a log reservation.
2969          */
2970         error = xfs_lock_dir_and_entry(dp, cdp);
2971         if (error) {
2972                 xfs_trans_cancel(tp, cancel_flags);
2973                 IRELE(cdp);
2974                 goto std_return;
2975         }
2976
2977         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2978         if (dp != cdp) {
2979                 /*
2980                  * Only increment the parent directory vnode count if
2981                  * we didn't bump it in looking up cdp.  The only time
2982                  * we don't bump it is when we're looking up ".".
2983                  */
2984                 VN_HOLD(dir_vp);
2985         }
2986
2987         xfs_itrace_ref(cdp);
2988         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
2989
2990         ASSERT(cdp->i_d.di_nlink >= 2);
2991         if (cdp->i_d.di_nlink != 2) {
2992                 error = XFS_ERROR(ENOTEMPTY);
2993                 goto error_return;
2994         }
2995         if (!xfs_dir_isempty(cdp)) {
2996                 error = XFS_ERROR(ENOTEMPTY);
2997                 goto error_return;
2998         }
2999
3000         error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
3001                                         &first_block, &free_list, resblks);
3002         if (error)
3003                 goto error1;
3004
3005         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3006
3007         /*
3008          * Bump the in memory generation count on the parent
3009          * directory so that other can know that it has changed.
3010          */
3011         dp->i_gen++;
3012
3013         /*
3014          * Drop the link from cdp's "..".
3015          */
3016         error = xfs_droplink(tp, dp);
3017         if (error) {
3018                 goto error1;
3019         }
3020
3021         /*
3022          * Drop the link from dp to cdp.
3023          */
3024         error = xfs_droplink(tp, cdp);
3025         if (error) {
3026                 goto error1;
3027         }
3028
3029         /*
3030          * Drop the "." link from cdp to self.
3031          */
3032         error = xfs_droplink(tp, cdp);
3033         if (error) {
3034                 goto error1;
3035         }
3036
3037         /* Determine these before committing transaction */
3038         last_cdp_link = (cdp)->i_d.di_nlink==0;
3039
3040         /*
3041          * Take an extra ref on the child vnode so that it
3042          * does not go to xfs_inactive() from within the commit.
3043          */
3044         IHOLD(cdp);
3045
3046         /*
3047          * If this is a synchronous mount, make sure that the
3048          * rmdir transaction goes to disk before returning to
3049          * the user.
3050          */
3051         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3052                 xfs_trans_set_sync(tp);
3053         }
3054
3055         error = xfs_bmap_finish (&tp, &free_list, &committed);
3056         if (error) {
3057                 xfs_bmap_cancel(&free_list);
3058                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3059                                  XFS_TRANS_ABORT));
3060                 IRELE(cdp);
3061                 goto std_return;
3062         }
3063
3064         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3065         if (error) {
3066                 IRELE(cdp);
3067                 goto std_return;
3068         }
3069
3070
3071         IRELE(cdp);
3072
3073         /* Fall through to std_return with error = 0 or the errno
3074          * from xfs_trans_commit. */
3075  std_return:
3076         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
3077                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3078                                         dir_vp, DM_RIGHT_NULL,
3079                                         NULL, DM_RIGHT_NULL,
3080                                         name, NULL, dm_di_mode,
3081                                         error, 0);
3082         }
3083         return error;
3084
3085  error1:
3086         xfs_bmap_cancel(&free_list);
3087         cancel_flags |= XFS_TRANS_ABORT;
3088         /* FALLTHROUGH */
3089
3090  error_return:
3091         xfs_trans_cancel(tp, cancel_flags);
3092         goto std_return;
3093 }
3094
3095 int
3096 xfs_symlink(
3097         xfs_inode_t             *dp,
3098         bhv_vname_t             *dentry,
3099         char                    *target_path,
3100         mode_t                  mode,
3101         bhv_vnode_t             **vpp,
3102         cred_t                  *credp)
3103 {
3104         bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
3105         xfs_mount_t             *mp = dp->i_mount;
3106         xfs_trans_t             *tp;
3107         xfs_inode_t             *ip;
3108         int                     error;
3109         int                     pathlen;
3110         xfs_bmap_free_t         free_list;
3111         xfs_fsblock_t           first_block;
3112         boolean_t               unlock_dp_on_error = B_FALSE;
3113         uint                    cancel_flags;
3114         int                     committed;
3115         xfs_fileoff_t           first_fsb;
3116         xfs_filblks_t           fs_blocks;
3117         int                     nmaps;
3118         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
3119         xfs_daddr_t             d;
3120         char                    *cur_chunk;
3121         int                     byte_cnt;
3122         int                     n;
3123         xfs_buf_t               *bp;
3124         xfs_prid_t              prid;
3125         struct xfs_dquot        *udqp, *gdqp;
3126         uint                    resblks;
3127         char                    *link_name = VNAME(dentry);
3128         int                     link_namelen;
3129
3130         *vpp = NULL;
3131         error = 0;
3132         ip = NULL;
3133         tp = NULL;
3134
3135         xfs_itrace_entry(dp);
3136
3137         if (XFS_FORCED_SHUTDOWN(mp))
3138                 return XFS_ERROR(EIO);
3139
3140         link_namelen = VNAMELEN(dentry);
3141
3142         /*
3143          * Check component lengths of the target path name.
3144          */
3145         pathlen = strlen(target_path);
3146         if (pathlen >= MAXPATHLEN)      /* total string too long */
3147                 return XFS_ERROR(ENAMETOOLONG);
3148         if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3149                 int len, total;
3150                 char *path;
3151
3152                 for (total = 0, path = target_path; total < pathlen;) {
3153                         /*
3154                          * Skip any slashes.
3155                          */
3156                         while(*path == '/') {
3157                                 total++;
3158                                 path++;
3159                         }
3160
3161                         /*
3162                          * Count up to the next slash or end of path.
3163                          * Error out if the component is bigger than MAXNAMELEN.
3164                          */
3165                         for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3166                                 if (++len >= MAXNAMELEN) {
3167                                         error = ENAMETOOLONG;
3168                                         return error;
3169                                 }
3170                         }
3171                 }
3172         }
3173
3174         if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
3175                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3176                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3177                                         link_name, target_path, 0, 0, 0);
3178                 if (error)
3179                         return error;
3180         }
3181
3182         /* Return through std_return after this point. */
3183
3184         udqp = gdqp = NULL;
3185         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3186                 prid = dp->i_d.di_projid;
3187         else
3188                 prid = (xfs_prid_t)dfltprid;
3189
3190         /*
3191          * Make sure that we have allocated dquot(s) on disk.
3192          */
3193         error = XFS_QM_DQVOPALLOC(mp, dp,
3194                         current_fsuid(credp), current_fsgid(credp), prid,
3195                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3196         if (error)
3197                 goto std_return;
3198
3199         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3200         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3201         /*
3202          * The symlink will fit into the inode data fork?
3203          * There can't be any attributes so we get the whole variable part.
3204          */
3205         if (pathlen <= XFS_LITINO(mp))
3206                 fs_blocks = 0;
3207         else
3208                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3209         resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3210         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3211                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3212         if (error == ENOSPC && fs_blocks == 0) {
3213                 resblks = 0;
3214                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3215                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3216         }
3217         if (error) {
3218                 cancel_flags = 0;
3219                 goto error_return;
3220         }
3221
3222         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
3223         unlock_dp_on_error = B_TRUE;
3224
3225         /*
3226          * Check whether the directory allows new symlinks or not.
3227          */
3228         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3229                 error = XFS_ERROR(EPERM);
3230                 goto error_return;
3231         }
3232
3233         /*
3234          * Reserve disk quota : blocks and inode.
3235          */
3236         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3237         if (error)
3238                 goto error_return;
3239
3240         /*
3241          * Check for ability to enter directory entry, if no space reserved.
3242          */
3243         if (resblks == 0 &&
3244             (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
3245                 goto error_return;
3246         /*
3247          * Initialize the bmap freelist prior to calling either
3248          * bmapi or the directory create code.
3249          */
3250         XFS_BMAP_INIT(&free_list, &first_block);
3251
3252         /*
3253          * Allocate an inode for the symlink.
3254          */
3255         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
3256                                1, 0, credp, prid, resblks > 0, &ip, NULL);
3257         if (error) {
3258                 if (error == ENOSPC)
3259                         goto error_return;
3260                 goto error1;
3261         }
3262         xfs_itrace_ref(ip);
3263
3264         /*
3265          * An error after we've joined dp to the transaction will result in the
3266          * transaction cancel unlocking dp so don't do it explicitly in the
3267          * error path.
3268          */
3269         VN_HOLD(dir_vp);
3270         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3271         unlock_dp_on_error = B_FALSE;
3272
3273         /*
3274          * Also attach the dquot(s) to it, if applicable.
3275          */
3276         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3277
3278         if (resblks)
3279                 resblks -= XFS_IALLOC_SPACE_RES(mp);
3280         /*
3281          * If the symlink will fit into the inode, write it inline.
3282          */
3283         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3284                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3285                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3286                 ip->i_d.di_size = pathlen;
3287
3288                 /*
3289                  * The inode was initially created in extent format.
3290                  */
3291                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3292                 ip->i_df.if_flags |= XFS_IFINLINE;
3293
3294                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3295                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3296
3297         } else {
3298                 first_fsb = 0;
3299                 nmaps = SYMLINK_MAPS;
3300
3301                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3302                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3303                                   &first_block, resblks, mval, &nmaps,
3304                                   &free_list, NULL);
3305                 if (error) {
3306                         goto error1;
3307                 }
3308
3309                 if (resblks)
3310                         resblks -= fs_blocks;
3311                 ip->i_d.di_size = pathlen;
3312                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3313
3314                 cur_chunk = target_path;
3315                 for (n = 0; n < nmaps; n++) {
3316                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3317                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3318                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3319                                                BTOBB(byte_cnt), 0);
3320                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
3321                         if (pathlen < byte_cnt) {
3322                                 byte_cnt = pathlen;
3323                         }
3324                         pathlen -= byte_cnt;
3325
3326                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3327                         cur_chunk += byte_cnt;
3328
3329                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3330                 }
3331         }
3332
3333         /*
3334          * Create the directory entry for the symlink.
3335          */
3336         error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
3337                                    &first_block, &free_list, resblks);
3338         if (error)
3339                 goto error1;
3340         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3341         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3342
3343         /*
3344          * Bump the in memory version number of the parent directory
3345          * so that other processes accessing it will recognize that
3346          * the directory has changed.
3347          */
3348         dp->i_gen++;
3349
3350         /*
3351          * If this is a synchronous mount, make sure that the
3352          * symlink transaction goes to disk before returning to
3353          * the user.
3354          */
3355         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3356                 xfs_trans_set_sync(tp);
3357         }
3358
3359         /*
3360          * xfs_trans_commit normally decrements the vnode ref count
3361          * when it unlocks the inode. Since we want to return the
3362          * vnode to the caller, we bump the vnode ref count now.
3363          */
3364         IHOLD(ip);
3365
3366         error = xfs_bmap_finish(&tp, &free_list, &committed);
3367         if (error) {
3368                 goto error2;
3369         }
3370         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3371         XFS_QM_DQRELE(mp, udqp);
3372         XFS_QM_DQRELE(mp, gdqp);
3373
3374         /* Fall through to std_return with error = 0 or errno from
3375          * xfs_trans_commit     */
3376 std_return:
3377         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
3378                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3379                                         dir_vp, DM_RIGHT_NULL,
3380                                         error ? NULL : XFS_ITOV(ip),
3381                                         DM_RIGHT_NULL, link_name, target_path,
3382                                         0, error, 0);
3383         }
3384
3385         if (!error) {
3386                 bhv_vnode_t *vp;
3387
3388                 ASSERT(ip);
3389                 vp = XFS_ITOV(ip);
3390                 *vpp = vp;
3391         }
3392         return error;
3393
3394  error2:
3395         IRELE(ip);
3396  error1:
3397         xfs_bmap_cancel(&free_list);
3398         cancel_flags |= XFS_TRANS_ABORT;
3399  error_return:
3400         xfs_trans_cancel(tp, cancel_flags);
3401         XFS_QM_DQRELE(mp, udqp);
3402         XFS_QM_DQRELE(mp, gdqp);
3403
3404         if (unlock_dp_on_error)
3405                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3406
3407         goto std_return;
3408 }
3409
3410 int
3411 xfs_rwlock(
3412         xfs_inode_t     *ip,
3413         bhv_vrwlock_t   locktype)
3414 {
3415         if (S_ISDIR(ip->i_d.di_mode))
3416                 return 1;
3417         if (locktype == VRWLOCK_WRITE) {
3418                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3419         } else if (locktype == VRWLOCK_TRY_READ) {
3420                 return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3421         } else if (locktype == VRWLOCK_TRY_WRITE) {
3422                 return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3423         } else {
3424                 ASSERT((locktype == VRWLOCK_READ) ||
3425                        (locktype == VRWLOCK_WRITE_DIRECT));
3426                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3427         }
3428
3429         return 1;
3430 }
3431
3432
3433 void
3434 xfs_rwunlock(
3435         xfs_inode_t     *ip,
3436         bhv_vrwlock_t   locktype)
3437 {
3438         if (S_ISDIR(ip->i_d.di_mode))
3439                 return;
3440         if (locktype == VRWLOCK_WRITE) {
3441                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
3442         } else {
3443                 ASSERT((locktype == VRWLOCK_READ) ||
3444                        (locktype == VRWLOCK_WRITE_DIRECT));
3445                 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3446         }
3447         return;
3448 }
3449
3450
3451 int
3452 xfs_inode_flush(
3453         xfs_inode_t     *ip,
3454         int             flags)
3455 {
3456         xfs_mount_t     *mp = ip->i_mount;
3457         int             error = 0;
3458
3459         if (XFS_FORCED_SHUTDOWN(mp))
3460                 return XFS_ERROR(EIO);
3461
3462         /*
3463          * Bypass inodes which have already been cleaned by
3464          * the inode flush clustering code inside xfs_iflush
3465          */
3466         if (xfs_inode_clean(ip))
3467                 return 0;
3468
3469         /*
3470          * We make this non-blocking if the inode is contended,
3471          * return EAGAIN to indicate to the caller that they
3472          * did not succeed. This prevents the flush path from
3473          * blocking on inodes inside another operation right
3474          * now, they get caught later by xfs_sync.
3475          */
3476         if (flags & FLUSH_SYNC) {
3477                 xfs_ilock(ip, XFS_ILOCK_SHARED);
3478                 xfs_iflock(ip);
3479         } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3480                 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3481                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3482                         return EAGAIN;
3483                 }
3484         } else {
3485                 return EAGAIN;
3486         }
3487
3488         error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
3489                                                     : XFS_IFLUSH_ASYNC_NOBLOCK);
3490         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3491
3492         return error;
3493 }
3494
3495
3496 int
3497 xfs_set_dmattrs(
3498         xfs_inode_t     *ip,
3499         u_int           evmask,
3500         u_int16_t       state)
3501 {
3502         xfs_mount_t     *mp = ip->i_mount;
3503         xfs_trans_t     *tp;
3504         int             error;
3505
3506         if (!capable(CAP_SYS_ADMIN))
3507                 return XFS_ERROR(EPERM);
3508
3509         if (XFS_FORCED_SHUTDOWN(mp))
3510                 return XFS_ERROR(EIO);
3511
3512         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3513         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3514         if (error) {
3515                 xfs_trans_cancel(tp, 0);
3516                 return error;
3517         }
3518         xfs_ilock(ip, XFS_ILOCK_EXCL);
3519         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3520
3521         ip->i_d.di_dmevmask = evmask;
3522         ip->i_d.di_dmstate  = state;
3523
3524         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3525         IHOLD(ip);
3526         error = xfs_trans_commit(tp, 0);
3527
3528         return error;
3529 }
3530
3531 int
3532 xfs_reclaim(
3533         xfs_inode_t     *ip)
3534 {
3535         bhv_vnode_t     *vp = XFS_ITOV(ip);
3536
3537         xfs_itrace_entry(ip);
3538
3539         ASSERT(!VN_MAPPED(vp));
3540
3541         /* bad inode, get out here ASAP */
3542         if (VN_BAD(vp)) {
3543                 xfs_ireclaim(ip);
3544                 return 0;
3545         }
3546
3547         vn_iowait(ip);
3548
3549         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3550
3551         /*
3552          * Make sure the atime in the XFS inode is correct before freeing the
3553          * Linux inode.
3554          */
3555         xfs_synchronize_atime(ip);
3556
3557         /*
3558          * If we have nothing to flush with this inode then complete the
3559          * teardown now, otherwise break the link between the xfs inode and the
3560          * linux inode and clean up the xfs inode later. This avoids flushing
3561          * the inode to disk during the delete operation itself.
3562          *
3563          * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
3564          * first to ensure that xfs_iunpin() will never see an xfs inode
3565          * that has a linux inode being reclaimed. Synchronisation is provided
3566          * by the i_flags_lock.
3567          */
3568         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3569                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3570                 xfs_iflock(ip);
3571                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3572         } else {
3573                 xfs_mount_t     *mp = ip->i_mount;
3574
3575                 /* Protect sync and unpin from us */
3576                 XFS_MOUNT_ILOCK(mp);
3577                 spin_lock(&ip->i_flags_lock);
3578                 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
3579                 vn_to_inode(vp)->i_private = NULL;
3580                 ip->i_vnode = NULL;
3581                 spin_unlock(&ip->i_flags_lock);
3582                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3583                 XFS_MOUNT_IUNLOCK(mp);
3584         }
3585         return 0;
3586 }
3587
3588 int
3589 xfs_finish_reclaim(
3590         xfs_inode_t     *ip,
3591         int             locked,
3592         int             sync_mode)
3593 {
3594         xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
3595         bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
3596         int             error;
3597
3598         if (vp && VN_BAD(vp))
3599                 goto reclaim;
3600
3601         /* The hash lock here protects a thread in xfs_iget_core from
3602          * racing with us on linking the inode back with a vnode.
3603          * Once we have the XFS_IRECLAIM flag set it will not touch
3604          * us.
3605          */
3606         write_lock(&pag->pag_ici_lock);
3607         spin_lock(&ip->i_flags_lock);
3608         if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
3609             (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
3610                 spin_unlock(&ip->i_flags_lock);
3611                 write_unlock(&pag->pag_ici_lock);
3612                 if (locked) {
3613                         xfs_ifunlock(ip);
3614                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3615                 }
3616                 return 1;
3617         }
3618         __xfs_iflags_set(ip, XFS_IRECLAIM);
3619         spin_unlock(&ip->i_flags_lock);
3620         write_unlock(&pag->pag_ici_lock);
3621         xfs_put_perag(ip->i_mount, pag);
3622
3623         /*
3624          * If the inode is still dirty, then flush it out.  If the inode
3625          * is not in the AIL, then it will be OK to flush it delwri as
3626          * long as xfs_iflush() does not keep any references to the inode.
3627          * We leave that decision up to xfs_iflush() since it has the
3628          * knowledge of whether it's OK to simply do a delwri flush of
3629          * the inode or whether we need to wait until the inode is
3630          * pulled from the AIL.
3631          * We get the flush lock regardless, though, just to make sure
3632          * we don't free it while it is being flushed.
3633          */
3634         if (!locked) {
3635                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3636                 xfs_iflock(ip);
3637         }
3638
3639         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3640                 if (ip->i_update_core ||
3641                     ((ip->i_itemp != NULL) &&
3642                      (ip->i_itemp->ili_format.ilf_fields != 0))) {
3643                         error = xfs_iflush(ip, sync_mode);
3644                         /*
3645                          * If we hit an error, typically because of filesystem
3646                          * shutdown, we don't need to let vn_reclaim to know
3647                          * because we're gonna reclaim the inode anyway.
3648                          */
3649                         if (error) {
3650                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3651                                 goto reclaim;
3652                         }
3653                         xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3654                 }
3655
3656                 ASSERT(ip->i_update_core == 0);
3657                 ASSERT(ip->i_itemp == NULL ||
3658                        ip->i_itemp->ili_format.ilf_fields == 0);
3659         }
3660
3661         xfs_ifunlock(ip);
3662         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3663
3664  reclaim:
3665         xfs_ireclaim(ip);
3666         return 0;
3667 }
3668
3669 int
3670 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3671 {
3672         int             purged;
3673         xfs_inode_t     *ip, *n;
3674         int             done = 0;
3675
3676         while (!done) {
3677                 purged = 0;
3678                 XFS_MOUNT_ILOCK(mp);
3679                 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3680                         if (noblock) {
3681                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3682                                         continue;
3683                                 if (xfs_ipincount(ip) ||
3684                                     !xfs_iflock_nowait(ip)) {
3685                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3686                                         continue;
3687                                 }
3688                         }
3689                         XFS_MOUNT_IUNLOCK(mp);
3690                         if (xfs_finish_reclaim(ip, noblock,
3691                                         XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3692                                 delay(1);
3693                         purged = 1;
3694                         break;
3695                 }
3696
3697                 done = !purged;
3698         }
3699
3700         XFS_MOUNT_IUNLOCK(mp);
3701         return 0;
3702 }
3703
3704 /*
3705  * xfs_alloc_file_space()
3706  *      This routine allocates disk space for the given file.
3707  *
3708  *      If alloc_type == 0, this request is for an ALLOCSP type
3709  *      request which will change the file size.  In this case, no
3710  *      DMAPI event will be generated by the call.  A TRUNCATE event
3711  *      will be generated later by xfs_setattr.
3712  *
3713  *      If alloc_type != 0, this request is for a RESVSP type
3714  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
3715  *      lower block boundary byte address is less than the file's
3716  *      length.
3717  *
3718  * RETURNS:
3719  *       0 on success
3720  *      errno on error
3721  *
3722  */
3723 STATIC int
3724 xfs_alloc_file_space(
3725         xfs_inode_t             *ip,
3726         xfs_off_t               offset,
3727         xfs_off_t               len,
3728         int                     alloc_type,
3729         int                     attr_flags)
3730 {
3731         xfs_mount_t             *mp = ip->i_mount;
3732         xfs_off_t               count;
3733         xfs_filblks_t           allocated_fsb;
3734         xfs_filblks_t           allocatesize_fsb;
3735         xfs_extlen_t            extsz, temp;
3736         xfs_fileoff_t           startoffset_fsb;
3737         xfs_fsblock_t           firstfsb;
3738         int                     nimaps;
3739         int                     bmapi_flag;
3740         int                     quota_flag;
3741         int                     rt;
3742         xfs_trans_t             *tp;
3743         xfs_bmbt_irec_t         imaps[1], *imapp;
3744         xfs_bmap_free_t         free_list;
3745         uint                    qblocks, resblks, resrtextents;
3746         int                     committed;
3747         int                     error;
3748
3749         xfs_itrace_entry(ip);
3750
3751         if (XFS_FORCED_SHUTDOWN(mp))
3752                 return XFS_ERROR(EIO);
3753
3754         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
3755                 return error;
3756
3757         if (len <= 0)
3758                 return XFS_ERROR(EINVAL);
3759
3760         rt = XFS_IS_REALTIME_INODE(ip);
3761         extsz = xfs_get_extsz_hint(ip);
3762
3763         count = len;
3764         imapp = &imaps[0];
3765         nimaps = 1;
3766         bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
3767         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
3768         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
3769
3770         /*      Generate a DMAPI event if needed.       */
3771         if (alloc_type != 0 && offset < ip->i_size &&
3772                         (attr_flags&ATTR_DMI) == 0  &&
3773                         DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
3774                 xfs_off_t           end_dmi_offset;
3775
3776                 end_dmi_offset = offset+len;
3777                 if (end_dmi_offset > ip->i_size)
3778                         end_dmi_offset = ip->i_size;
3779                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
3780                         offset, end_dmi_offset - offset,
3781                         0, NULL);
3782                 if (error)
3783                         return error;
3784         }
3785
3786         /*
3787          * Allocate file space until done or until there is an error
3788          */
3789 retry:
3790         while (allocatesize_fsb && !error) {
3791                 xfs_fileoff_t   s, e;
3792
3793                 /*
3794                  * Determine space reservations for data/realtime.
3795                  */
3796                 if (unlikely(extsz)) {
3797                         s = startoffset_fsb;
3798                         do_div(s, extsz);
3799                         s *= extsz;
3800                         e = startoffset_fsb + allocatesize_fsb;
3801                         if ((temp = do_mod(startoffset_fsb, extsz)))
3802                                 e += temp;
3803                         if ((temp = do_mod(e, extsz)))
3804                                 e += extsz - temp;
3805                 } else {
3806                         s = 0;
3807                         e = allocatesize_fsb;
3808                 }
3809
3810                 if (unlikely(rt)) {
3811                         resrtextents = qblocks = (uint)(e - s);
3812                         resrtextents /= mp->m_sb.sb_rextsize;
3813                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
3814                         quota_flag = XFS_QMOPT_RES_RTBLKS;
3815                 } else {
3816                         resrtextents = 0;
3817                         resblks = qblocks = \
3818                                 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
3819                         quota_flag = XFS_QMOPT_RES_REGBLKS;
3820                 }
3821
3822                 /*
3823                  * Allocate and setup the transaction.
3824                  */
3825                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
3826                 error = xfs_trans_reserve(tp, resblks,
3827                                           XFS_WRITE_LOG_RES(mp), resrtextents,
3828                                           XFS_TRANS_PERM_LOG_RES,
3829                                           XFS_WRITE_LOG_COUNT);
3830                 /*
3831                  * Check for running out of space
3832                  */
3833                 if (error) {
3834                         /*
3835                          * Free the transaction structure.
3836                          */
3837                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
3838                         xfs_trans_cancel(tp, 0);
3839                         break;
3840                 }
3841                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3842                 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
3843                                                       qblocks, 0, quota_flag);
3844                 if (error)
3845                         goto error1;
3846
3847                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3848                 xfs_trans_ihold(tp, ip);
3849
3850                 /*
3851                  * Issue the xfs_bmapi() call to allocate the blocks
3852                  */
3853                 XFS_BMAP_INIT(&free_list, &firstfsb);
3854                 error = xfs_bmapi(tp, ip, startoffset_fsb,
3855                                   allocatesize_fsb, bmapi_flag,
3856                                   &firstfsb, 0, imapp, &nimaps,
3857                                   &free_list, NULL);
3858                 if (error) {
3859                         goto error0;
3860                 }
3861
3862                 /*
3863                  * Complete the transaction
3864                  */
3865                 error = xfs_bmap_finish(&tp, &free_list, &committed);
3866                 if (error) {
3867                         goto error0;
3868                 }
3869
3870                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3871                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3872                 if (error) {
3873                         break;
3874                 }
3875
3876                 allocated_fsb = imapp->br_blockcount;
3877
3878                 if (nimaps == 0) {
3879                         error = XFS_ERROR(ENOSPC);
3880                         break;
3881                 }
3882
3883                 startoffset_fsb += allocated_fsb;
3884                 allocatesize_fsb -= allocated_fsb;
3885         }
3886 dmapi_enospc_check:
3887         if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
3888             DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
3889                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
3890                                 XFS_ITOV(ip), DM_RIGHT_NULL,
3891                                 XFS_ITOV(ip), DM_RIGHT_NULL,
3892                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
3893                 if (error == 0)
3894                         goto retry;     /* Maybe DMAPI app. has made space */
3895                 /* else fall through with error from XFS_SEND_DATA */
3896         }
3897
3898         return error;
3899
3900 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
3901         xfs_bmap_cancel(&free_list);
3902         XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
3903
3904 error1: /* Just cancel transaction */
3905         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
3906         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3907         goto dmapi_enospc_check;
3908 }
3909
3910 /*
3911  * Zero file bytes between startoff and endoff inclusive.
3912  * The iolock is held exclusive and no blocks are buffered.
3913  */
3914 STATIC int
3915 xfs_zero_remaining_bytes(
3916         xfs_inode_t             *ip,
3917         xfs_off_t               startoff,
3918         xfs_off_t               endoff)
3919 {
3920         xfs_bmbt_irec_t         imap;
3921         xfs_fileoff_t           offset_fsb;
3922         xfs_off_t               lastoffset;
3923         xfs_off_t               offset;
3924         xfs_buf_t               *bp;
3925         xfs_mount_t             *mp = ip->i_mount;
3926         int                     nimap;
3927         int                     error = 0;
3928
3929         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3930                                 XFS_IS_REALTIME_INODE(ip) ?
3931                                 mp->m_rtdev_targp : mp->m_ddev_targp);
3932
3933         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
3934                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
3935                 nimap = 1;
3936                 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
3937                         NULL, 0, &imap, &nimap, NULL, NULL);
3938                 if (error || nimap < 1)
3939                         break;
3940                 ASSERT(imap.br_blockcount >= 1);
3941                 ASSERT(imap.br_startoff == offset_fsb);
3942                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
3943                 if (lastoffset > endoff)
3944                         lastoffset = endoff;
3945                 if (imap.br_startblock == HOLESTARTBLOCK)
3946                         continue;
3947                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3948                 if (imap.br_state == XFS_EXT_UNWRITTEN)
3949                         continue;
3950                 XFS_BUF_UNDONE(bp);
3951                 XFS_BUF_UNWRITE(bp);
3952                 XFS_BUF_READ(bp);
3953                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
3954                 xfsbdstrat(mp, bp);
3955                 if ((error = xfs_iowait(bp))) {
3956                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
3957                                           mp, bp, XFS_BUF_ADDR(bp));
3958                         break;
3959                 }
3960                 memset(XFS_BUF_PTR(bp) +
3961                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
3962                       0, lastoffset - offset + 1);
3963                 XFS_BUF_UNDONE(bp);
3964                 XFS_BUF_UNREAD(bp);
3965                 XFS_BUF_WRITE(bp);
3966                 xfsbdstrat(mp, bp);
3967                 if ((error = xfs_iowait(bp))) {
3968                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
3969                                           mp, bp, XFS_BUF_ADDR(bp));
3970                         break;
3971                 }
3972         }
3973         xfs_buf_free(bp);
3974         return error;
3975 }
3976
3977 /*
3978  * xfs_free_file_space()
3979  *      This routine frees disk space for the given file.
3980  *
3981  *      This routine is only called by xfs_change_file_space
3982  *      for an UNRESVSP type call.
3983  *
3984  * RETURNS:
3985  *       0 on success
3986  *      errno on error
3987  *
3988  */
3989 STATIC int
3990 xfs_free_file_space(
3991         xfs_inode_t             *ip,
3992         xfs_off_t               offset,
3993         xfs_off_t               len,
3994         int                     attr_flags)
3995 {
3996         bhv_vnode_t             *vp;
3997         int                     committed;
3998         int                     done;
3999         xfs_off_t               end_dmi_offset;
4000         xfs_fileoff_t           endoffset_fsb;
4001         int                     error;
4002         xfs_fsblock_t           firstfsb;
4003         xfs_bmap_free_t         free_list;
4004         xfs_bmbt_irec_t         imap;
4005         xfs_off_t               ioffset;
4006         xfs_extlen_t            mod=0;
4007         xfs_mount_t             *mp;
4008         int                     nimap;
4009         uint                    resblks;
4010         uint                    rounding;
4011         int                     rt;
4012         xfs_fileoff_t           startoffset_fsb;
4013         xfs_trans_t             *tp;
4014         int                     need_iolock = 1;
4015
4016         vp = XFS_ITOV(ip);
4017         mp = ip->i_mount;
4018
4019         xfs_itrace_entry(ip);
4020
4021         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4022                 return error;
4023
4024         error = 0;
4025         if (len <= 0)   /* if nothing being freed */
4026                 return error;
4027         rt = XFS_IS_REALTIME_INODE(ip);
4028         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
4029         end_dmi_offset = offset + len;
4030         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4031
4032         if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 &&
4033             DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
4034                 if (end_dmi_offset > ip->i_size)
4035                         end_dmi_offset = ip->i_size;
4036                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
4037                                 offset, end_dmi_offset - offset,
4038                                 AT_DELAY_FLAG(attr_flags), NULL);
4039                 if (error)
4040                         return error;
4041         }
4042
4043         if (attr_flags & ATTR_NOLOCK)
4044                 need_iolock = 0;
4045         if (need_iolock) {
4046                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
4047                 vn_iowait(ip);  /* wait for the completion of any pending DIOs */
4048         }
4049
4050         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
4051         ioffset = offset & ~(rounding - 1);
4052
4053         if (VN_CACHED(vp) != 0) {
4054                 xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
4055                 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
4056                 if (error)
4057                         goto out_unlock_iolock;
4058         }
4059
4060         /*
4061          * Need to zero the stuff we're not freeing, on disk.
4062          * If its a realtime file & can't use unwritten extents then we
4063          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
4064          * will take care of it for us.
4065          */
4066         if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
4067                 nimap = 1;
4068                 error = xfs_bmapi(NULL, ip, startoffset_fsb,
4069                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4070                 if (error)
4071                         goto out_unlock_iolock;
4072                 ASSERT(nimap == 0 || nimap == 1);
4073                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4074                         xfs_daddr_t     block;
4075
4076                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4077                         block = imap.br_startblock;
4078                         mod = do_div(block, mp->m_sb.sb_rextsize);
4079                         if (mod)
4080                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4081                 }
4082                 nimap = 1;
4083                 error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
4084                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4085                 if (error)
4086                         goto out_unlock_iolock;
4087                 ASSERT(nimap == 0 || nimap == 1);
4088                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4089                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4090                         mod++;
4091                         if (mod && (mod != mp->m_sb.sb_rextsize))
4092                                 endoffset_fsb -= mod;
4093                 }
4094         }
4095         if ((done = (endoffset_fsb <= startoffset_fsb)))
4096                 /*
4097                  * One contiguous piece to clear
4098                  */
4099                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
4100         else {
4101                 /*
4102                  * Some full blocks, possibly two pieces to clear
4103                  */
4104                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
4105                         error = xfs_zero_remaining_bytes(ip, offset,
4106                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
4107                 if (!error &&
4108                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
4109                         error = xfs_zero_remaining_bytes(ip,
4110                                 XFS_FSB_TO_B(mp, endoffset_fsb),
4111                                 offset + len - 1);
4112         }
4113
4114         /*
4115          * free file space until done or until there is an error
4116          */
4117         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4118         while (!error && !done) {
4119
4120                 /*
4121                  * allocate and setup the transaction. Allow this
4122                  * transaction to dip into the reserve blocks to ensure
4123                  * the freeing of the space succeeds at ENOSPC.
4124                  */
4125                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4126                 tp->t_flags |= XFS_TRANS_RESERVE;
4127                 error = xfs_trans_reserve(tp,
4128                                           resblks,
4129                                           XFS_WRITE_LOG_RES(mp),
4130                                           0,
4131                                           XFS_TRANS_PERM_LOG_RES,
4132                                           XFS_WRITE_LOG_COUNT);
4133
4134                 /*
4135                  * check for running out of space
4136                  */
4137                 if (error) {
4138                         /*
4139                          * Free the transaction structure.
4140                          */
4141                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4142                         xfs_trans_cancel(tp, 0);
4143                         break;
4144                 }
4145                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4146                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4147                                 ip->i_udquot, ip->i_gdquot, resblks, 0,
4148                                 XFS_QMOPT_RES_REGBLKS);
4149                 if (error)
4150                         goto error1;
4151
4152                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4153                 xfs_trans_ihold(tp, ip);
4154
4155                 /*
4156                  * issue the bunmapi() call to free the blocks
4157                  */
4158                 XFS_BMAP_INIT(&free_list, &firstfsb);
4159                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
4160                                   endoffset_fsb - startoffset_fsb,
4161                                   0, 2, &firstfsb, &free_list, NULL, &done);
4162                 if (error) {
4163                         goto error0;
4164                 }
4165
4166                 /*
4167                  * complete the transaction
4168                  */
4169                 error = xfs_bmap_finish(&tp, &free_list, &committed);
4170                 if (error) {
4171                         goto error0;
4172                 }
4173
4174                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4175                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4176         }
4177
4178  out_unlock_iolock:
4179         if (need_iolock)
4180                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4181         return error;
4182
4183  error0:
4184         xfs_bmap_cancel(&free_list);
4185  error1:
4186         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4187         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
4188                     XFS_ILOCK_EXCL);
4189         return error;
4190 }
4191
4192 /*
4193  * xfs_change_file_space()
4194  *      This routine allocates or frees disk space for the given file.
4195  *      The user specified parameters are checked for alignment and size
4196  *      limitations.
4197  *
4198  * RETURNS:
4199  *       0 on success
4200  *      errno on error
4201  *
4202  */
4203 int
4204 xfs_change_file_space(
4205         xfs_inode_t     *ip,
4206         int             cmd,
4207         xfs_flock64_t   *bf,
4208         xfs_off_t       offset,
4209         cred_t          *credp,
4210         int             attr_flags)
4211 {
4212         xfs_mount_t     *mp = ip->i_mount;
4213         int             clrprealloc;
4214         int             error;
4215         xfs_fsize_t     fsize;
4216         int             setprealloc;
4217         xfs_off_t       startoffset;
4218         xfs_off_t       llen;
4219         xfs_trans_t     *tp;
4220         bhv_vattr_t     va;
4221
4222         xfs_itrace_entry(ip);
4223
4224         if (!S_ISREG(ip->i_d.di_mode))
4225                 return XFS_ERROR(EINVAL);
4226
4227         switch (bf->l_whence) {
4228         case 0: /*SEEK_SET*/
4229                 break;
4230         case 1: /*SEEK_CUR*/
4231                 bf->l_start += offset;
4232                 break;
4233         case 2: /*SEEK_END*/
4234                 bf->l_start += ip->i_size;
4235                 break;
4236         default:
4237                 return XFS_ERROR(EINVAL);
4238         }
4239
4240         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4241
4242         if (   (bf->l_start < 0)
4243             || (bf->l_start > XFS_MAXIOFFSET(mp))
4244             || (bf->l_start + llen < 0)
4245             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4246                 return XFS_ERROR(EINVAL);
4247
4248         bf->l_whence = 0;
4249
4250         startoffset = bf->l_start;
4251         fsize = ip->i_size;
4252
4253         /*
4254          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4255          * file space.
4256          * These calls do NOT zero the data space allocated to the file,
4257          * nor do they change the file size.
4258          *
4259          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4260          * space.
4261          * These calls cause the new file data to be zeroed and the file
4262          * size to be changed.
4263          */
4264         setprealloc = clrprealloc = 0;
4265
4266         switch (cmd) {
4267         case XFS_IOC_RESVSP:
4268         case XFS_IOC_RESVSP64:
4269                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4270                                                                 1, attr_flags);
4271                 if (error)
4272                         return error;
4273                 setprealloc = 1;
4274                 break;
4275
4276         case XFS_IOC_UNRESVSP:
4277         case XFS_IOC_UNRESVSP64:
4278                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4279                                                                 attr_flags)))
4280                         return error;
4281                 break;
4282
4283         case XFS_IOC_ALLOCSP:
4284         case XFS_IOC_ALLOCSP64:
4285         case XFS_IOC_FREESP:
4286         case XFS_IOC_FREESP64:
4287                 if (startoffset > fsize) {
4288                         error = xfs_alloc_file_space(ip, fsize,
4289                                         startoffset - fsize, 0, attr_flags);
4290                         if (error)
4291                                 break;
4292                 }
4293
4294                 va.va_mask = XFS_AT_SIZE;
4295                 va.va_size = startoffset;
4296
4297                 error = xfs_setattr(ip, &va, attr_flags, credp);
4298
4299                 if (error)
4300                         return error;
4301
4302                 clrprealloc = 1;
4303                 break;
4304
4305         default:
4306                 ASSERT(0);
4307                 return XFS_ERROR(EINVAL);
4308         }
4309
4310         /*
4311          * update the inode timestamp, mode, and prealloc flag bits
4312          */
4313         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4314
4315         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4316                                       0, 0, 0))) {
4317                 /* ASSERT(0); */
4318                 xfs_trans_cancel(tp, 0);
4319                 return error;
4320         }
4321
4322         xfs_ilock(ip, XFS_ILOCK_EXCL);
4323
4324         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4325         xfs_trans_ihold(tp, ip);
4326
4327         if ((attr_flags & ATTR_DMI) == 0) {
4328                 ip->i_d.di_mode &= ~S_ISUID;
4329
4330                 /*
4331                  * Note that we don't have to worry about mandatory
4332                  * file locking being disabled here because we only
4333                  * clear the S_ISGID bit if the Group execute bit is
4334                  * on, but if it was on then mandatory locking wouldn't
4335                  * have been enabled.
4336                  */
4337                 if (ip->i_d.di_mode & S_IXGRP)
4338                         ip->i_d.di_mode &= ~S_ISGID;
4339
4340                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4341         }
4342         if (setprealloc)
4343                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4344         else if (clrprealloc)
4345                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4346
4347         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4348         xfs_trans_set_sync(tp);
4349
4350         error = xfs_trans_commit(tp, 0);
4351
4352         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4353
4354         return error;
4355 }