xfs: Don't flush stale inodes

[safe/jmp/linux-2.6] / fs / xfs / xfs_inode.c
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index 872191b..391d36b 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -47,11 +47,10 @@
  #include "xfs_rw.h"
  #include "xfs_error.h"
  #include "xfs_utils.h"
-#include "xfs_dir2_trace.h"
  #include "xfs_quota.h"
-#include "xfs_acl.h"
  #include "xfs_filestream.h"
  #include "xfs_vnodeops.h"
+#include "xfs_trace.h"
  
  kmem_zone_t *xfs_ifork_zone;
  kmem_zone_t *xfs_inode_zone;
@@ -344,6 +343,16 @@ xfs_iformat(
                 return XFS_ERROR(EFSCORRUPTED);
         }
  
+       if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+                    !ip->i_mount->m_rtdev_targp)) {
+               xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+                       "corrupt dinode %Lu, has realtime flag set.",
+                       ip->i_ino);
+               XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+                                    XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
         switch (ip->i_d.di_mode & S_IFMT) {
         case S_IFIFO:
         case S_IFCHR:
@@ -424,6 +433,19 @@ xfs_iformat(
         case XFS_DINODE_FMT_LOCAL:
                 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
                 size = be16_to_cpu(atp->hdr.totsize);
+
+               if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
+                       xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+                               "corrupt inode %Lu "
+                               "(bad attr fork size %Ld).",
+                               (unsigned long long) ip->i_ino,
+                               (long long) size);
+                       XFS_CORRUPTION_ERROR("xfs_iformat(8)",
+                                            XFS_ERRLEVEL_LOW,
+                                            ip->i_mount, dip);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+
                 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
                 break;
         case XFS_DINODE_FMT_EXTENTS:
@@ -629,7 +651,7 @@ xfs_iformat_btree(
         return 0;
  }
  
-void
+STATIC void
  xfs_dinode_from_disk(
         xfs_icdinode_t          *to,
         xfs_dinode_t            *from)
@@ -870,7 +892,7 @@ xfs_iread(
          * around for a while.  This helps to keep recently accessed
          * meta-data in-core longer.
          */
-        XFS_BUF_SET_REF(bp, XFS_INO_REF);
+       XFS_BUF_SET_REF(bp, XFS_INO_REF);
  
         /*
          * Use xfs_trans_brelse() to release the buffer containing the
@@ -1225,7 +1247,7 @@ xfs_isize_check(
   * In that case the pages will still be in memory, but the inode size
   * will never have been updated.
   */
-xfs_fsize_t
+STATIC xfs_fsize_t
  xfs_file_last_byte(
         xfs_inode_t     *ip)
  {
@@ -1245,8 +1267,10 @@ xfs_file_last_byte(
          * necessary.
          */
         if (ip->i_df.if_flags & XFS_IFEXTENTS) {
+               xfs_ilock(ip, XFS_ILOCK_SHARED);
                 error = xfs_bmap_last_offset(NULL, ip, &last_block,
                         XFS_DATA_FORK);
+               xfs_iunlock(ip, XFS_ILOCK_SHARED);
                 if (error) {
                         last_block = 0;
                 }
@@ -1267,42 +1291,6 @@ xfs_file_last_byte(
         return last_byte;
  }
  
-#if defined(XFS_RW_TRACE)
-STATIC void
-xfs_itrunc_trace(
-       int             tag,
-       xfs_inode_t     *ip,
-       int             flag,
-       xfs_fsize_t     new_size,
-       xfs_off_t       toss_start,
-       xfs_off_t       toss_finish)
-{
-       if (ip->i_rwtrace == NULL) {
-               return;
-       }
-
-       ktrace_enter(ip->i_rwtrace,
-                    (void*)((long)tag),
-                    (void*)ip,
-                    (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff),
-                    (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff),
-                    (void*)((long)flag),
-                    (void*)(unsigned long)((new_size >> 32) & 0xffffffff),
-                    (void*)(unsigned long)(new_size & 0xffffffff),
-                    (void*)(unsigned long)((toss_start >> 32) & 0xffffffff),
-                    (void*)(unsigned long)(toss_start & 0xffffffff),
-                    (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff),
-                    (void*)(unsigned long)(toss_finish & 0xffffffff),
-                    (void*)(unsigned long)current_cpu(),
-                    (void*)(unsigned long)current_pid(),
-                    (void*)NULL,
-                    (void*)NULL,
-                    (void*)NULL);
-}
-#else
-#define        xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
-#endif
-
  /*
   * Start the truncation of the file to new_size.  The new size
   * must be smaller than the current size.  This routine will
@@ -1322,8 +1310,8 @@ xfs_itrunc_trace(
   * direct I/O with the truncate operation.  Also, because we hold
   * the IOLOCK in exclusive mode, we prevent new direct I/Os from being
   * started until the truncate completes and drops the lock. Essentially,
- * the vn_iowait() call forms an I/O barrier that provides strict ordering
- * between direct I/Os and the truncate operation.
+ * the xfs_ioend_wait() call forms an I/O barrier that provides strict
+ * ordering between direct I/Os and the truncate operation.
   *
   * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
   * or XFS_ITRUNC_MAYBE.  The XFS_ITRUNC_MAYBE value should be used
@@ -1354,7 +1342,7 @@ xfs_itruncate_start(
  
         /* wait for the completion of any pending DIOs */
         if (new_size == 0 || new_size < ip->i_size)
-               vn_iowait(ip);
+               xfs_ioend_wait(ip);
  
         /*
          * Call toss_pages or flushinval_pages to get rid of pages
@@ -1385,8 +1373,7 @@ xfs_itruncate_start(
                 return 0;
         }
         last_byte = xfs_file_last_byte(ip);
-       xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
-                        last_byte);
+       trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte);
         if (last_byte > toss_start) {
                 if (flags & XFS_ITRUNC_DEFINITE) {
                         xfs_tosspages(ip, toss_start,
@@ -1490,7 +1477,8 @@ xfs_itruncate_finish(
                 new_size = 0LL;
         }
         first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
-       xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0);
+       trace_xfs_itruncate_finish_start(ip, new_size);
+
         /*
          * The first thing we do is set the size to new_size permanently
          * on disk.  This way we don't have to worry about anyone ever
@@ -1601,10 +1589,10 @@ xfs_itruncate_finish(
                  * in this file with garbage in them once recovery
                  * runs.
                  */
-               XFS_BMAP_INIT(&free_list, &first_block);
+               xfs_bmap_init(&free_list, &first_block);
                 error = xfs_bunmapi(ntp, ip,
                                     first_unmap_block, unmap_len,
-                                   XFS_BMAPI_AFLAG(fork) |
+                                   xfs_bmapi_aflag(fork) |
                                       (sync ? 0 : XFS_BMAPI_ASYNC),
                                     XFS_ITRUNC_MAX_EXTENTS,
                                     &first_block, &free_list,
@@ -1707,7 +1695,7 @@ xfs_itruncate_finish(
         ASSERT((new_size != 0) ||
                (fork == XFS_ATTR_FORK) ||
                (ip->i_d.di_nextents == 0));
-       xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
+       trace_xfs_itruncate_finish_end(ip, new_size);
         return 0;
  }
  
@@ -2450,78 +2438,6 @@ xfs_idestroy_fork(
  }
  
  /*
- * This is called free all the memory associated with an inode.
- * It must free the inode itself and any buffers allocated for
- * if_extents/if_data and if_broot.  It must also free the lock
- * associated with the inode.
- *
- * Note: because we don't initialise everything on reallocation out
- * of the zone, we must ensure we nullify everything correctly before
- * freeing the structure.
- */
-void
-xfs_idestroy(
-       xfs_inode_t     *ip)
-{
-       switch (ip->i_d.di_mode & S_IFMT) {
-       case S_IFREG:
-       case S_IFDIR:
-       case S_IFLNK:
-               xfs_idestroy_fork(ip, XFS_DATA_FORK);
-               break;
-       }
-       if (ip->i_afp)
-               xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
-#ifdef XFS_INODE_TRACE
-       ktrace_free(ip->i_trace);
-#endif
-#ifdef XFS_BMAP_TRACE
-       ktrace_free(ip->i_xtrace);
-#endif
-#ifdef XFS_BTREE_TRACE
-       ktrace_free(ip->i_btrace);
-#endif
-#ifdef XFS_RW_TRACE
-       ktrace_free(ip->i_rwtrace);
-#endif
-#ifdef XFS_ILOCK_TRACE
-       ktrace_free(ip->i_lock_trace);
-#endif
-#ifdef XFS_DIR2_TRACE
-       ktrace_free(ip->i_dir_trace);
-#endif
-       if (ip->i_itemp) {
-               /*
-                * Only if we are shutting down the fs will we see an
-                * inode still in the AIL. If it is there, we should remove
-                * it to prevent a use-after-free from occurring.
-                */
-               xfs_log_item_t  *lip = &ip->i_itemp->ili_item;
-               struct xfs_ail  *ailp = lip->li_ailp;
-
-               ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
-                                      XFS_FORCED_SHUTDOWN(ip->i_mount));
-               if (lip->li_flags & XFS_LI_IN_AIL) {
-                       spin_lock(&ailp->xa_lock);
-                       if (lip->li_flags & XFS_LI_IN_AIL)
-                               xfs_trans_ail_delete(ailp, lip);
-                       else
-                               spin_unlock(&ailp->xa_lock);
-               }
-               xfs_inode_item_destroy(ip);
-               ip->i_itemp = NULL;
-       }
-       /* asserts to verify all state is correct here */
-       ASSERT(atomic_read(&ip->i_iocount) == 0);
-       ASSERT(atomic_read(&ip->i_pincount) == 0);
-       ASSERT(!spin_is_locked(&ip->i_flags_lock));
-       ASSERT(completion_done(&ip->i_flush));
-       kmem_zone_free(xfs_inode_zone, ip);
-}
-
-
-/*
   * Increment the pin count of the given buffer.
   * This value is protected by ipinlock spinlock in the mount structure.
   */
@@ -2629,7 +2545,7 @@ xfs_iextents_copy(
         for (i = 0; i < nrecs; i++) {
                 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
                 start_block = xfs_bmbt_get_startblock(ep);
-               if (ISNULLSTARTBLOCK(start_block)) {
+               if (isnullstartblock(start_block)) {
                         /*
                          * It's a delayed allocation extent, so skip it.
                          */
@@ -2876,7 +2792,6 @@ cluster_corrupt_out:
                         XFS_BUF_CLR_BDSTRAT_FUNC(bp);
                         XFS_BUF_UNDONE(bp);
                         XFS_BUF_STALE(bp);
-                       XFS_BUF_SHUT(bp);
                         XFS_BUF_ERROR(bp,EIO);
                         xfs_biodone(bp);
                 } else {
@@ -2926,10 +2841,14 @@ xfs_iflush(
         mp = ip->i_mount;
  
         /*
-        * If the inode isn't dirty, then just release the inode
-        * flush lock and do nothing.
+        * If the inode isn't dirty, then just release the inode flush lock and
+        * do nothing. Treat stale inodes the same; we cannot rely on the
+        * backing buffer remaining stale in cache for the remaining life of
+        * the stale inode and so xfs_itobp() below may give us a buffer that
+        * no longer contains inodes. Doing this stale check here also
+        * avoids forcing the log on pinned, stale inodes.
          */
-       if (xfs_inode_clean(ip)) {
+       if (xfs_inode_clean(ip) || xfs_iflags_test(ip, XFS_ISTALE)) {
                 xfs_ifunlock(ip);
                 return 0;
         }
@@ -3117,9 +3036,9 @@ xfs_iflush_int(
         SYNCHRONIZE();
  
         /*
-        * Make sure to get the latest atime from the Linux inode.
+        * Make sure to get the latest timestamps from the Linux inode.
          */
-       xfs_synchronize_atime(ip);
+       xfs_synchronize_times(ip);
  
         if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
                                mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
@@ -3301,25 +3220,6 @@ corrupt_out:
         return XFS_ERROR(EFSCORRUPTED);
  }
  
-
-
-#ifdef XFS_ILOCK_TRACE
-ktrace_t       *xfs_ilock_trace_buf;
-
-void
-xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
-{
-       ktrace_enter(ip->i_lock_trace,
-                    (void *)ip,
-                    (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */
-                    (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */
-                    (void *)ra,                /* caller of ilock */
-                    (void *)(unsigned long)current_cpu(),
-                    (void *)(unsigned long)current_pid(),
-                    NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
-}
-#endif
-
  /*
   * Return a pointer to the extent record at file index idx.
   */
@@ -3351,13 +3251,17 @@ xfs_iext_get_ext(
   */
  void
  xfs_iext_insert(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_inode_t     *ip,            /* incore inode pointer */
         xfs_extnum_t    idx,            /* starting index of new items */
         xfs_extnum_t    count,          /* number of inserted items */
-       xfs_bmbt_irec_t *new)           /* items to insert */
+       xfs_bmbt_irec_t *new,           /* items to insert */
+       int             state)          /* type of extent conversion */
  {
+       xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
         xfs_extnum_t    i;              /* extent record index */
  
+       trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
+
         ASSERT(ifp->if_flags & XFS_IFEXTENTS);
         xfs_iext_add(ifp, idx, count);
         for (i = idx; i < idx + count; i++, new++)
@@ -3600,13 +3504,17 @@ xfs_iext_add_indirect_multi(
   */
  void
  xfs_iext_remove(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_inode_t     *ip,            /* incore inode pointer */
         xfs_extnum_t    idx,            /* index to begin removing exts */
-       int             ext_diff)       /* number of extents to remove */
+       int             ext_diff,       /* number of extents to remove */
+       int             state)          /* type of extent conversion */
  {
+       xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
         xfs_extnum_t    nextents;       /* number of extents in file */
         int             new_size;       /* size of extents after removal */
  
+       trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
+
         ASSERT(ext_diff > 0);
         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
         new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
@@ -3888,7 +3796,7 @@ xfs_iext_inline_to_direct(
  /*
   * Resize an extent indirection array to new_size bytes.
   */
-void
+STATIC void
  xfs_iext_realloc_indirect(
         xfs_ifork_t     *ifp,           /* inode fork pointer */
         int             new_size)       /* new indirection array size */
@@ -3913,7 +3821,7 @@ xfs_iext_realloc_indirect(
  /*
   * Switch from indirection array to linear (direct) extent allocations.
   */
-void
+STATIC void
  xfs_iext_indirect_to_direct(
          xfs_ifork_t    *ifp)           /* inode fork pointer */
  {