xfs: fix race in inode cluster freeing failing to stale inodes
[safe/jmp/linux-2.6] / fs / xfs / xfs_inode.c
index 6ba7765..d53c39d 100644 (file)
@@ -1940,10 +1940,10 @@ xfs_ifree_cluster(
        int                     blks_per_cluster;
        int                     nbufs;
        int                     ninodes;
-       int                     i, j, found, pre_flushed;
+       int                     i, j;
        xfs_daddr_t             blkno;
        xfs_buf_t               *bp;
-       xfs_inode_t             *ip, **ip_found;
+       xfs_inode_t             *ip;
        xfs_inode_log_item_t    *iip;
        xfs_log_item_t          *lip;
        struct xfs_perag        *pag;
@@ -1960,114 +1960,97 @@ xfs_ifree_cluster(
                nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
        }
 
-       ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
-
        for (j = 0; j < nbufs; j++, inum += ninodes) {
+               int     found = 0;
+
                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
                                         XFS_INO_TO_AGBNO(mp, inum));
 
+               /*
+                * We obtain and lock the backing buffer first in the process
+                * here, as we have to ensure that any dirty inode that we
+                * can't get the flush lock on is attached to the buffer.
+                * If we scan the in-memory inodes first, then buffer IO can
+                * complete before we get a lock on it, and hence we may fail
+                * to mark all the active inodes on the buffer stale.
+                */
+               bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+                                       mp->m_bsize * blks_per_cluster,
+                                       XBF_LOCK);
+
+               /*
+                * Walk the inodes already attached to the buffer and mark them
+                * stale. These will all have the flush locks held, so an
+                * in-memory inode walk can't lock them.
+                */
+               lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+               while (lip) {
+                       if (lip->li_type == XFS_LI_INODE) {
+                               iip = (xfs_inode_log_item_t *)lip;
+                               ASSERT(iip->ili_logged == 1);
+                               lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
+                               xfs_trans_ail_copy_lsn(mp->m_ail,
+                                                       &iip->ili_flush_lsn,
+                                                       &iip->ili_item.li_lsn);
+                               xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
+                               found++;
+                       }
+                       lip = lip->li_bio_list;
+               }
 
                /*
-                * Look for each inode in memory and attempt to lock it,
-                * we can be racing with flush and tail pushing here.
-                * any inode we get the locks on, add to an array of
-                * inode items to process later.
+                * For each inode in memory attempt to add it to the inode
+                * buffer and set it up for being staled on buffer IO
+                * completion.  This is safe as we've locked out tail pushing
+                * and flushing by locking the buffer.
                 *
-                * The get the buffer lock, we could beat a flush
-                * or tail pushing thread to the lock here, in which
-                * case they will go looking for the inode buffer
-                * and fail, we need some other form of interlock
-                * here.
+                * We have already marked every inode that was part of a
+                * transaction stale above, which means there is no point in
+                * even trying to lock them.
                 */
-               found = 0;
                for (i = 0; i < ninodes; i++) {
                        read_lock(&pag->pag_ici_lock);
                        ip = radix_tree_lookup(&pag->pag_ici_root,
                                        XFS_INO_TO_AGINO(mp, (inum + i)));
 
-                       /* Inode not in memory or we found it already,
-                        * nothing to do
-                        */
+                       /* Inode not in memory or stale, nothing to do */
                        if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
                                read_unlock(&pag->pag_ici_lock);
                                continue;
                        }
 
-                       if (xfs_inode_clean(ip)) {
-                               read_unlock(&pag->pag_ici_lock);
-                               continue;
-                       }
-
-                       /* If we can get the locks then add it to the
-                        * list, otherwise by the time we get the bp lock
-                        * below it will already be attached to the
-                        * inode buffer.
-                        */
-
-                       /* This inode will already be locked - by us, lets
-                        * keep it that way.
-                        */
-
-                       if (ip == free_ip) {
-                               if (xfs_iflock_nowait(ip)) {
-                                       xfs_iflags_set(ip, XFS_ISTALE);
-                                       if (xfs_inode_clean(ip)) {
-                                               xfs_ifunlock(ip);
-                                       } else {
-                                               ip_found[found++] = ip;
-                                       }
-                               }
+                       /* don't try to lock/unlock the current inode */
+                       if (ip != free_ip &&
+                           !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
                                read_unlock(&pag->pag_ici_lock);
                                continue;
                        }
+                       read_unlock(&pag->pag_ici_lock);
 
-                       if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-                               if (xfs_iflock_nowait(ip)) {
-                                       xfs_iflags_set(ip, XFS_ISTALE);
-
-                                       if (xfs_inode_clean(ip)) {
-                                               xfs_ifunlock(ip);
-                                               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                                       } else {
-                                               ip_found[found++] = ip;
-                                       }
-                               } else {
+                       if (!xfs_iflock_nowait(ip)) {
+                               if (ip != free_ip)
                                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                               }
+                               continue;
                        }
-                       read_unlock(&pag->pag_ici_lock);
-               }
 
-               bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 
-                                       mp->m_bsize * blks_per_cluster,
-                                       XBF_LOCK);
-
-               pre_flushed = 0;
-               lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
-               while (lip) {
-                       if (lip->li_type == XFS_LI_INODE) {
-                               iip = (xfs_inode_log_item_t *)lip;
-                               ASSERT(iip->ili_logged == 1);
-                               lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
-                               xfs_trans_ail_copy_lsn(mp->m_ail,
-                                                       &iip->ili_flush_lsn,
-                                                       &iip->ili_item.li_lsn);
-                               xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-                               pre_flushed++;
+                       xfs_iflags_set(ip, XFS_ISTALE);
+                       if (xfs_inode_clean(ip)) {
+                               ASSERT(ip != free_ip);
+                               xfs_ifunlock(ip);
+                               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                               continue;
                        }
-                       lip = lip->li_bio_list;
-               }
 
-               for (i = 0; i < found; i++) {
-                       ip = ip_found[i];
                        iip = ip->i_itemp;
-
                        if (!iip) {
+                               /* inode with unlogged changes only */
+                               ASSERT(ip != free_ip);
                                ip->i_update_core = 0;
                                xfs_ifunlock(ip);
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                                continue;
                        }
+                       found++;
 
                        iip->ili_last_fields = iip->ili_format.ilf_fields;
                        iip->ili_format.ilf_fields = 0;
@@ -2078,17 +2061,16 @@ xfs_ifree_cluster(
                        xfs_buf_attach_iodone(bp,
                                (void(*)(xfs_buf_t*,xfs_log_item_t*))
                                xfs_istale_done, (xfs_log_item_t *)iip);
-                       if (ip != free_ip) {
+
+                       if (ip != free_ip)
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                       }
                }
 
-               if (found || pre_flushed)
+               if (found)
                        xfs_trans_stale_inode_buf(tp, bp);
                xfs_trans_binval(tp, bp);
        }
 
-       kmem_free(ip_found);
        xfs_perag_put(pag);
 }