SAFE public projects git trees. - safe/jmp/linux-2.6/blob - fs/xfs/linux-2.6/xfs_sync.c

   1 /*
   2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include "xfs.h"
  19 #include "xfs_fs.h"
  20 #include "xfs_types.h"
  21 #include "xfs_bit.h"
  22 #include "xfs_log.h"
  23 #include "xfs_inum.h"
  24 #include "xfs_trans.h"
  25 #include "xfs_sb.h"
  26 #include "xfs_ag.h"
  27 #include "xfs_dir2.h"
  28 #include "xfs_dmapi.h"
  29 #include "xfs_mount.h"
  30 #include "xfs_bmap_btree.h"
  31 #include "xfs_alloc_btree.h"
  32 #include "xfs_ialloc_btree.h"
  33 #include "xfs_btree.h"
  34 #include "xfs_dir2_sf.h"
  35 #include "xfs_attr_sf.h"
  36 #include "xfs_inode.h"
  37 #include "xfs_dinode.h"
  38 #include "xfs_error.h"
  39 #include "xfs_mru_cache.h"
  40 #include "xfs_filestream.h"
  41 #include "xfs_vnodeops.h"
  42 #include "xfs_utils.h"
  43 #include "xfs_buf_item.h"
  44 #include "xfs_inode_item.h"
  45 #include "xfs_rw.h"
  46
  47 #include <linux/kthread.h>
  48 #include <linux/freezer.h>
  49
  50 /*
  51  * Sync all the inodes in the given AG according to the
  52  * direction given by the flags.
  53  */
  54 STATIC int
  55 xfs_sync_inodes_ag(
  56         xfs_mount_t     *mp,
  57         int             ag,
  58         int             flags)
  59 {
  60         xfs_perag_t     *pag = &mp->m_perag[ag];
  61         int             nr_found;
  62         int             first_index = 0;
  63         int             error = 0;
  64         int             last_error = 0;
  65         int             fflag = XFS_B_ASYNC;
  66         int             lock_flags = XFS_ILOCK_SHARED;
  67
  68         if (flags & SYNC_DELWRI)
  69                 fflag = XFS_B_DELWRI;
  70         if (flags & SYNC_WAIT)
  71                 fflag = 0;              /* synchronous overrides all */
  72
  73         if (flags & SYNC_DELWRI) {
  74                 /*
  75                  * We need the I/O lock if we're going to call any of
  76                  * the flush/inval routines.
  77                  */
  78                 lock_flags |= XFS_IOLOCK_SHARED;
  79         }
  80
  81         do {
  82                 struct inode    *inode;
  83                 boolean_t       inode_refed;
  84                 xfs_inode_t     *ip = NULL;
  85
  86                 /*
  87                  * use a gang lookup to find the next inode in the tree
  88                  * as the tree is sparse and a gang lookup walks to find
  89                  * the number of objects requested.
  90                  */
  91                 read_lock(&pag->pag_ici_lock);
  92                 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
  93                                 (void**)&ip, first_index, 1);
  94
  95                 if (!nr_found) {
  96                         read_unlock(&pag->pag_ici_lock);
  97                         break;
  98                 }
  99
 100                 /* update the index for the next lookup */
 101                 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 102
 103                 /*
 104                  * skip inodes in reclaim. Let xfs_syncsub do that for
 105                  * us so we don't need to worry.
 106                  */
 107                 if (xfs_iflags_test(ip, (XFS_IRECLAIM|XFS_IRECLAIMABLE))) {
 108                         read_unlock(&pag->pag_ici_lock);
 109                         continue;
 110                 }
 111
 112                 /* bad inodes are dealt with elsewhere */
 113                 inode = VFS_I(ip);
 114                 if (is_bad_inode(inode)) {
 115                         read_unlock(&pag->pag_ici_lock);
 116                         continue;
 117                 }
 118
 119                 /* nothing to sync during shutdown */
 120                 if (XFS_FORCED_SHUTDOWN(mp)) {
 121                         read_unlock(&pag->pag_ici_lock);
 122                         return 0;
 123                 }
 124
 125                 /*
 126                  * If we can't get a reference on the VFS_I, the inode must be
 127                  * in reclaim. If we can get the inode lock without blocking,
 128                  * it is safe to flush the inode because we hold the tree lock
 129                  * and xfs_iextract will block right now. Hence if we lock the
 130                  * inode while holding the tree lock, xfs_ireclaim() is
 131                  * guaranteed to block on the inode lock we now hold and hence
 132                  * it is safe to reference the inode until we drop the inode
 133                  * locks completely.
 134                  */
 135                 inode_refed = B_FALSE;
 136                 if (igrab(inode)) {
 137                         read_unlock(&pag->pag_ici_lock);
 138                         xfs_ilock(ip, lock_flags);
 139                         inode_refed = B_TRUE;
 140                 } else {
 141                         if (!xfs_ilock_nowait(ip, lock_flags)) {
 142                                 /* leave it to reclaim */
 143                                 read_unlock(&pag->pag_ici_lock);
 144                                 continue;
 145                         }
 146                         read_unlock(&pag->pag_ici_lock);
 147                 }
 148
 149                 /*
 150                  * If we have to flush data or wait for I/O completion
 151                  * we need to drop the ilock that we currently hold.
 152                  * If we need to drop the lock, insert a marker if we
 153                  * have not already done so.
 154                  */
 155                 if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
 156                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 157                         error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
 158                         if (flags & SYNC_IOWAIT)
 159                                 vn_iowait(ip);
 160                         xfs_ilock(ip, XFS_ILOCK_SHARED);
 161                 }
 162
 163                 if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
 164                         if (flags & SYNC_WAIT) {
 165                                 xfs_iflock(ip);
 166                                 if (!xfs_inode_clean(ip))
 167                                         error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
 168                                 else
 169                                         xfs_ifunlock(ip);
 170                         } else if (xfs_iflock_nowait(ip)) {
 171                                 if (!xfs_inode_clean(ip))
 172                                         error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
 173                                 else
 174                                         xfs_ifunlock(ip);
 175                         }
 176                 }
 177
 178                 if (lock_flags)
 179                         xfs_iunlock(ip, lock_flags);
 180
 181                 if (inode_refed) {
 182                         IRELE(ip);
 183                 }
 184
 185                 if (error)
 186                         last_error = error;
 187                 /*
 188                  * bail out if the filesystem is corrupted.
 189                  */
 190                 if (error == EFSCORRUPTED)
 191                         return XFS_ERROR(error);
 192
 193         } while (nr_found);
 194
 195         return last_error;
 196 }
 197
 198 int
 199 xfs_sync_inodes(
 200         xfs_mount_t     *mp,
 201         int             flags)
 202 {
 203         int             error;
 204         int             last_error;
 205         int             i;
 206         int             lflags = XFS_LOG_FORCE;
 207
 208         if (mp->m_flags & XFS_MOUNT_RDONLY)
 209                 return 0;
 210         error = 0;
 211         last_error = 0;
 212
 213         if (flags & SYNC_WAIT)
 214                 lflags |= XFS_LOG_SYNC;
 215
 216         for (i = 0; i < mp->m_sb.sb_agcount; i++) {
 217                 if (!mp->m_perag[i].pag_ici_init)
 218                         continue;
 219                 error = xfs_sync_inodes_ag(mp, i, flags);
 220                 if (error)
 221                         last_error = error;
 222                 if (error == EFSCORRUPTED)
 223                         break;
 224         }
 225         if (flags & SYNC_DELWRI)
 226                 xfs_log_force(mp, 0, lflags);
 227
 228         return XFS_ERROR(last_error);
 229 }
 230
 231 STATIC int
 232 xfs_commit_dummy_trans(
 233         struct xfs_mount        *mp,
 234         uint                    log_flags)
 235 {
 236         struct xfs_inode        *ip = mp->m_rootip;
 237         struct xfs_trans        *tp;
 238         int                     error;
 239
 240         /*
 241          * Put a dummy transaction in the log to tell recovery
 242          * that all others are OK.
 243          */
 244         tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
 245         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
 246         if (error) {
 247                 xfs_trans_cancel(tp, 0);
 248                 return error;
 249         }
 250
 251         xfs_ilock(ip, XFS_ILOCK_EXCL);
 252
 253         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 254         xfs_trans_ihold(tp, ip);
 255         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 256         /* XXX(hch): ignoring the error here.. */
 257         error = xfs_trans_commit(tp, 0);
 258
 259         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 260
 261         xfs_log_force(mp, 0, log_flags);
 262         return 0;
 263 }
 264
 265 int
 266 xfs_sync_fsdata(
 267         struct xfs_mount        *mp,
 268         int                     flags)
 269 {
 270         struct xfs_buf          *bp;
 271         struct xfs_buf_log_item *bip;
 272         int                     error = 0;
 273
 274         /*
 275          * If this is xfssyncd() then only sync the superblock if we can
 276          * lock it without sleeping and it is not pinned.
 277          */
 278         if (flags & SYNC_BDFLUSH) {
 279                 ASSERT(!(flags & SYNC_WAIT));
 280
 281                 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
 282                 if (!bp)
 283                         goto out;
 284
 285                 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
 286                 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
 287                         goto out_brelse;
 288         } else {
 289                 bp = xfs_getsb(mp, 0);
 290
 291                 /*
 292                  * If the buffer is pinned then push on the log so we won't
 293                  * get stuck waiting in the write for someone, maybe
 294                  * ourselves, to flush the log.
 295                  *
 296                  * Even though we just pushed the log above, we did not have
 297                  * the superblock buffer locked at that point so it can
 298                  * become pinned in between there and here.
 299                  */
 300                 if (XFS_BUF_ISPINNED(bp))
 301                         xfs_log_force(mp, 0, XFS_LOG_FORCE);
 302         }
 303
 304
 305         if (flags & SYNC_WAIT)
 306                 XFS_BUF_UNASYNC(bp);
 307         else
 308                 XFS_BUF_ASYNC(bp);
 309
 310         return xfs_bwrite(mp, bp);
 311
 312  out_brelse:
 313         xfs_buf_relse(bp);
 314  out:
 315         return error;
 316 }
 317
 318 /*
 319  * First stage of freeze - no more writers will make progress now we are here,
 320  * so we flush delwri and delalloc buffers here, then wait for all I/O to
 321  * complete.  Data is frozen at that point. Metadata is not frozen,
 322  * transactions can still occur here so don't bother flushing the buftarg (i.e
 323  * SYNC_QUIESCE) because it'll just get dirty again.
 324  */
 325 int
 326 xfs_quiesce_data(
 327         struct xfs_mount        *mp)
 328 {
 329         int error;
 330
 331         /* push non-blocking */
 332         xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH);
 333         XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
 334         xfs_filestream_flush(mp);
 335
 336         /* push and block */
 337         xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
 338         XFS_QM_DQSYNC(mp, SYNC_WAIT);
 339
 340         /* write superblock and hoover shutdown errors */
 341         error = xfs_sync_fsdata(mp, 0);
 342
 343         /* flush devices */
 344         XFS_bflush(mp->m_ddev_targp);
 345         if (mp->m_rtdev_targp)
 346                 XFS_bflush(mp->m_rtdev_targp);
 347
 348         return error;
 349 }
 350
 351 /*
 352  * xfs_sync flushes any pending I/O to file system vfsp.
 353  *
 354  * This routine is called by vfs_sync() to make sure that things make it
 355  * out to disk eventually, on sync() system calls to flush out everything,
 356  * and when the file system is unmounted.  For the vfs_sync() case, all
 357  * we really need to do is sync out the log to make all of our meta-data
 358  * updates permanent (except for timestamps).  For calls from pflushd(),
 359  * dirty pages are kept moving by calling pdflush() on the inodes
 360  * containing them.  We also flush the inodes that we can lock without
 361  * sleeping and the superblock if we can lock it without sleeping from
 362  * vfs_sync() so that items at the tail of the log are always moving out.
 363  *
 364  * Flags:
 365  *      SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
 366  *                     to sleep if we can help it.  All we really need
 367  *                     to do is ensure that the log is synced at least
 368  *                     periodically.  We also push the inodes and
 369  *                     superblock if we can lock them without sleeping
 370  *                      and they are not pinned.
 371  *      SYNC_ATTR    - We need to flush the inodes. Now handled by direct calls
 372  *                     to xfs_sync_inodes().
 373  *      SYNC_WAIT    - All the flushes that take place in this call should
 374  *                     be synchronous.
 375  *      SYNC_DELWRI  - This tells us to push dirty pages associated with
 376  *                     inodes.  SYNC_WAIT and SYNC_BDFLUSH are used to
 377  *                     determine if they should be flushed sync, async, or
 378  *                     delwri.
 379  *      SYNC_FSDATA  - This indicates that the caller would like to make
 380  *                     sure the superblock is safe on disk.  We can ensure
 381  *                     this by simply making sure the log gets flushed
 382  *                     if SYNC_BDFLUSH is set, and by actually writing it
 383  *                     out otherwise.
 384  *      SYNC_IOWAIT  - The caller wants us to wait for all data I/O to complete
 385  *                     before we return (including direct I/O). Forms the drain
 386  *                     side of the write barrier needed to safely quiesce the
 387  *                     filesystem.
 388  *
 389  */
 390 int
 391 xfs_sync(
 392         xfs_mount_t     *mp,
 393         int             flags)
 394 {
 395         int             error;
 396         int             last_error = 0;
 397         uint            log_flags = XFS_LOG_FORCE;
 398
 399         ASSERT(!(flags & SYNC_ATTR));
 400
 401         /*
 402          * Get the Quota Manager to flush the dquots.
 403          *
 404          * If XFS quota support is not enabled or this filesystem
 405          * instance does not use quotas XFS_QM_DQSYNC will always
 406          * return zero.
 407          */
 408         error = XFS_QM_DQSYNC(mp, flags);
 409         if (error) {
 410                 /*
 411                  * If we got an IO error, we will be shutting down.
 412                  * So, there's nothing more for us to do here.
 413                  */
 414                 ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
 415                 if (XFS_FORCED_SHUTDOWN(mp))
 416                         return XFS_ERROR(error);
 417         }
 418
 419         if (flags & SYNC_IOWAIT)
 420                 xfs_filestream_flush(mp);
 421
 422         /*
 423          * Sync out the log.  This ensures that the log is periodically
 424          * flushed even if there is not enough activity to fill it up.
 425          */
 426         if (flags & SYNC_WAIT)
 427                 log_flags |= XFS_LOG_SYNC;
 428
 429         xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
 430
 431         if (flags & SYNC_DELWRI) {
 432                 if (flags & SYNC_BDFLUSH)
 433                         xfs_finish_reclaim_all(mp, 1, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 434                 else
 435                         error = xfs_sync_inodes(mp, flags);
 436                 /*
 437                  * Flushing out dirty data above probably generated more
 438                  * log activity, so if this isn't vfs_sync() then flush
 439                  * the log again.
 440                  */
 441                 xfs_log_force(mp, 0, log_flags);
 442         }
 443
 444         if (flags & SYNC_FSDATA) {
 445                 error = xfs_sync_fsdata(mp, flags);
 446                 if (error)
 447                         last_error = error;
 448         }
 449
 450         /*
 451          * Now check to see if the log needs a "dummy" transaction.
 452          */
 453         if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
 454                 error = xfs_commit_dummy_trans(mp, log_flags);
 455                 if (error)
 456                         return error;
 457         }
 458
 459         return XFS_ERROR(last_error);
 460 }
 461
 462 /*
 463  * Enqueue a work item to be picked up by the vfs xfssyncd thread.
 464  * Doing this has two advantages:
 465  * - It saves on stack space, which is tight in certain situations
 466  * - It can be used (with care) as a mechanism to avoid deadlocks.
 467  * Flushing while allocating in a full filesystem requires both.
 468  */
 469 STATIC void
 470 xfs_syncd_queue_work(
 471         struct xfs_mount *mp,
 472         void            *data,
 473         void            (*syncer)(struct xfs_mount *, void *))
 474 {
 475         struct bhv_vfs_sync_work *work;
 476
 477         work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
 478         INIT_LIST_HEAD(&work->w_list);
 479         work->w_syncer = syncer;
 480         work->w_data = data;
 481         work->w_mount = mp;
 482         spin_lock(&mp->m_sync_lock);
 483         list_add_tail(&work->w_list, &mp->m_sync_list);
 484         spin_unlock(&mp->m_sync_lock);
 485         wake_up_process(mp->m_sync_task);
 486 }
 487
 488 /*
 489  * Flush delayed allocate data, attempting to free up reserved space
 490  * from existing allocations.  At this point a new allocation attempt
 491  * has failed with ENOSPC and we are in the process of scratching our
 492  * heads, looking about for more room...
 493  */
 494 STATIC void
 495 xfs_flush_inode_work(
 496         struct xfs_mount *mp,
 497         void            *arg)
 498 {
 499         struct inode    *inode = arg;
 500         filemap_flush(inode->i_mapping);
 501         iput(inode);
 502 }
 503
 504 void
 505 xfs_flush_inode(
 506         xfs_inode_t     *ip)
 507 {
 508         struct inode    *inode = VFS_I(ip);
 509
 510         igrab(inode);
 511         xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
 512         delay(msecs_to_jiffies(500));
 513 }
 514
 515 /*
 516  * This is the "bigger hammer" version of xfs_flush_inode_work...
 517  * (IOW, "If at first you don't succeed, use a Bigger Hammer").
 518  */
 519 STATIC void
 520 xfs_flush_device_work(
 521         struct xfs_mount *mp,
 522         void            *arg)
 523 {
 524         struct inode    *inode = arg;
 525         sync_blockdev(mp->m_super->s_bdev);
 526         iput(inode);
 527 }
 528
 529 void
 530 xfs_flush_device(
 531         xfs_inode_t     *ip)
 532 {
 533         struct inode    *inode = VFS_I(ip);
 534
 535         igrab(inode);
 536         xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
 537         delay(msecs_to_jiffies(500));
 538         xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
 539 }
 540
 541 /*
 542  * Every sync period we need to unpin all items, reclaim inodes, sync
 543  * quota and write out the superblock. We might need to cover the log
 544  * to indicate it is idle.
 545  */
 546 STATIC void
 547 xfs_sync_worker(
 548         struct xfs_mount *mp,
 549         void            *unused)
 550 {
 551         int             error;
 552
 553         if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
 554                 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
 555                 xfs_finish_reclaim_all(mp, 1, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 556                 /* dgc: errors ignored here */
 557                 error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
 558                 error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
 559                 if (xfs_log_need_covered(mp))
 560                         error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
 561         }
 562         mp->m_sync_seq++;
 563         wake_up(&mp->m_wait_single_sync_task);
 564 }
 565
 566 STATIC int
 567 xfssyncd(
 568         void                    *arg)
 569 {
 570         struct xfs_mount        *mp = arg;
 571         long                    timeleft;
 572         bhv_vfs_sync_work_t     *work, *n;
 573         LIST_HEAD               (tmp);
 574
 575         set_freezable();
 576         timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
 577         for (;;) {
 578                 timeleft = schedule_timeout_interruptible(timeleft);
 579                 /* swsusp */
 580                 try_to_freeze();
 581                 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
 582                         break;
 583
 584                 spin_lock(&mp->m_sync_lock);
 585                 /*
 586                  * We can get woken by laptop mode, to do a sync -
 587                  * that's the (only!) case where the list would be
 588                  * empty with time remaining.
 589                  */
 590                 if (!timeleft || list_empty(&mp->m_sync_list)) {
 591                         if (!timeleft)
 592                                 timeleft = xfs_syncd_centisecs *
 593                                                         msecs_to_jiffies(10);
 594                         INIT_LIST_HEAD(&mp->m_sync_work.w_list);
 595                         list_add_tail(&mp->m_sync_work.w_list,
 596                                         &mp->m_sync_list);
 597                 }
 598                 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
 599                         list_move(&work->w_list, &tmp);
 600                 spin_unlock(&mp->m_sync_lock);
 601
 602                 list_for_each_entry_safe(work, n, &tmp, w_list) {
 603                         (*work->w_syncer)(mp, work->w_data);
 604                         list_del(&work->w_list);
 605                         if (work == &mp->m_sync_work)
 606                                 continue;
 607                         kmem_free(work);
 608                 }
 609         }
 610
 611         return 0;
 612 }
 613
 614 int
 615 xfs_syncd_init(
 616         struct xfs_mount        *mp)
 617 {
 618         mp->m_sync_work.w_syncer = xfs_sync_worker;
 619         mp->m_sync_work.w_mount = mp;
 620         mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
 621         if (IS_ERR(mp->m_sync_task))
 622                 return -PTR_ERR(mp->m_sync_task);
 623         return 0;
 624 }
 625
 626 void
 627 xfs_syncd_stop(
 628         struct xfs_mount        *mp)
 629 {
 630         kthread_stop(mp->m_sync_task);
 631 }
 632