diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3475775..1d1088f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -16,6 +16,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
@@ -41,8 +42,9 @@ struct wb_writeback_args {
 	long nr_pages;
 	struct super_block *sb;
 	enum writeback_sync_modes sync_mode;
-	int for_kupdate;
-	int range_cyclic;
+	unsigned int for_kupdate:1;
+	unsigned int range_cyclic:1;
+	unsigned int for_background:1;
 };
 
 /*
@@ -97,6 +99,11 @@ static void bdi_work_clear(struct bdi_work *work)
 {
 	clear_bit(WS_USED_B, &work->state);
 	smp_mb__after_clear_bit();
+	/*
+	 * work can have disappeared at this point. bit waitq functions
+	 * should be able to tolerate this, provided bdi_sched_wait does
+	 * not dereference its pointer argument.
+	 */
 	wake_up_bit(&work->state, WS_USED_B);
 }
 
@@ -113,6 +120,7 @@ static void bdi_work_free(struct rcu_head *head)
 static void wb_work_complete(struct bdi_work *work)
 {
 	const enum writeback_sync_modes sync_mode = work->args.sync_mode;
+	int onstack = bdi_work_on_stack(work);
 
 	/*
 	 * For allocated work, we can clear the done/seen bit right here.
@@ -120,9 +128,9 @@ static void wb_work_complete(struct bdi_work *work)
 	 * to after the RCU grace period, since the stack could be invalidated
 	 * as soon as bdi_work_clear() has done the wakeup.
 	 */
-	if (!bdi_work_on_stack(work))
+	if (!onstack)
 		bdi_work_clear(work);
-	if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
+	if (sync_mode == WB_SYNC_NONE || onstack)
 		call_rcu(&work->rcu_head, bdi_work_free);
 }
 
@@ -151,10 +159,10 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 	BUG_ON(!bdi->wb_cnt);
 
 	/*
-	 * Make sure stores are seen before it appears on the list
+	 * list_add_tail_rcu() contains the necessary barriers to
+	 * make sure the above stores are seen before the item is
+	 * noticed on the list
 	 */
-	smp_mb();
-
 	spin_lock(&bdi->wb_lock);
 	list_add_tail_rcu(&work->list, &bdi->work_list);
 	spin_unlock(&bdi->wb_lock);
@@ -168,13 +176,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 	else {
 		struct bdi_writeback *wb = &bdi->wb;
 
-		/*
-		 * End work now if this wb has no dirty IO pending. Otherwise
-		 * wakeup the handling thread
-		 */
-		if (!wb_has_dirty_io(wb))
-			wb_clear_pending(wb, work);
-		else if (wb->task)
+		if (wb->task)
 			wake_up_process(wb->task);
 	}
 }
@@ -241,6 +243,7 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
 /**
  * bdi_start_writeback - start writeback
  * @bdi: the backing device to write from
+ * @sb: write inodes from this super_block
  * @nr_pages: the number of pages to write
  *
  * Description:
@@ -249,14 +252,25 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
  *   completion. Caller need not hold sb s_umount semaphore.
  *
  */
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
+void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
+			 long nr_pages)
 {
 	struct wb_writeback_args args = {
+		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.nr_pages	= nr_pages,
 		.range_cyclic	= 1,
 	};
 
+	/*
+	 * We treat @nr_pages=0 as the special case to do background writeback,
+	 * ie. to sync pages until the background dirty threshold is reached.
+	 */
+	if (!nr_pages) {
+		args.nr_pages = LONG_MAX;
+		args.for_background = 1;
+	}
+
 	bdi_alloc_queue_work(bdi, &args);
 }
 
@@ -310,7 +324,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
 	 * For inodes being constantly redirtied, dirtied_when can get stuck.
 	 * It _appears_ to be in the future, but is actually in distant past.
 	 * This test is necessary to prevent such wrapped-around relative times
-	 * from permanently stopping the whole pdflush writeback.
+	 * from permanently stopping the whole bdi writeback.
 	 */
 	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
 #endif
@@ -324,13 +338,38 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 			       struct list_head *dispatch_queue,
 				unsigned long *older_than_this)
 {
+	LIST_HEAD(tmp);
+	struct list_head *pos, *node;
+	struct super_block *sb = NULL;
+	struct inode *inode;
+	int do_sb_sort = 0;
+
 	while (!list_empty(delaying_queue)) {
-		struct inode *inode = list_entry(delaying_queue->prev,
-						struct inode, i_list);
+		inode = list_entry(delaying_queue->prev, struct inode, i_list);
 		if (older_than_this &&
 		    inode_dirtied_after(inode, *older_than_this))
 			break;
-		list_move(&inode->i_list, dispatch_queue);
+		if (sb && sb != inode->i_sb)
+			do_sb_sort = 1;
+		sb = inode->i_sb;
+		list_move(&inode->i_list, &tmp);
+	}
+
+	/* just one sb in list, splice to dispatch_queue and we're done */
+	if (!do_sb_sort) {
+		list_splice(&tmp, dispatch_queue);
+		return;
+	}
+
+	/* Move inodes from one superblock together */
+	while (!list_empty(&tmp)) {
+		inode = list_entry(tmp.prev, struct inode, i_list);
+		sb = inode->i_sb;
+		list_for_each_prev_safe(pos, node, &tmp) {
+			inode = list_entry(pos, struct inode, i_list);
+			if (inode->i_sb == sb)
+				list_move(&inode->i_list, dispatch_queue);
+		}
 	}
 }
 
@@ -343,10 +382,10 @@ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
 
-static int write_inode(struct inode *inode, int sync)
+static int write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
-		return inode->i_sb->s_op->write_inode(inode, sync);
+		return inode->i_sb->s_op->write_inode(inode, wbc);
 	return 0;
 }
 
@@ -359,11 +398,11 @@ static void inode_wait_for_writeback(struct inode *inode)
 	wait_queue_head_t *wqh;
 
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
-	do {
+	while (inode->i_state & I_SYNC) {
 		spin_unlock(&inode_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
 		spin_lock(&inode_lock);
-	} while (inode->i_state & I_SYNC);
+	}
 }
 
 /*
@@ -383,7 +422,6 @@ static int
 writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct address_space *mapping = inode->i_mapping;
-	int wait = wbc->sync_mode == WB_SYNC_ALL;
 	unsigned dirty;
 	int ret;
 
@@ -401,7 +439,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		 * We'll have another go at writing back this inode when we
 		 * completed a full scan of b_io.
 		 */
-		if (!wait) {
+		if (wbc->sync_mode != WB_SYNC_ALL) {
 			requeue_io(inode);
 			return 0;
 		}
@@ -414,24 +452,36 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	BUG_ON(inode->i_state & I_SYNC);
 
-	/* Set I_SYNC, reset I_DIRTY */
-	dirty = inode->i_state & I_DIRTY;
+	/* Set I_SYNC, reset I_DIRTY_PAGES */
 	inode->i_state |= I_SYNC;
-	inode->i_state &= ~I_DIRTY;
-
+	inode->i_state &= ~I_DIRTY_PAGES;
 	spin_unlock(&inode_lock);
 
 	ret = do_writepages(mapping, wbc);
 
-	/* Don't write the inode if only I_DIRTY_PAGES was set */
-	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
-		int err = write_inode(inode, wait);
+	/*
+	 * Make sure to wait on the data before writing out the metadata.
+	 * This is important for filesystems that modify metadata on data
+	 * I/O completion.
+	 */
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		int err = filemap_fdatawait(mapping);
 		if (ret == 0)
 			ret = err;
 	}
 
-	if (wait) {
-		int err = filemap_fdatawait(mapping);
+	/*
+	 * Some filesystems may redirty the inode during the writeback
+	 * due to delalloc, clear dirty metadata flags right before
+	 * write_inode()
+	 */
+	spin_lock(&inode_lock);
+	dirty = inode->i_state & I_DIRTY;
+	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+	spin_unlock(&inode_lock);
+	/* Don't write the inode if only I_DIRTY_PAGES was set */
+	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+		int err = write_inode(inode, wbc);
 		if (ret == 0)
 			ret = err;
 	}
@@ -439,8 +489,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
-		if (!(inode->i_state & I_DIRTY) &&
-		    mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
+			/*
+			 * More pages get dirtied by a fast dirtier.
+			 */
+			goto select_queue;
+		} else if (inode->i_state & I_DIRTY) {
+			/*
+			 * At least XFS will redirty the inode during the
+			 * writeback (delalloc) and on io completion (isize).
+			 */
+			redirty_tail(inode);
+		} else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 			/*
 			 * We didn't write back all the pages.  nfs_writepages()
 			 * sometimes bales out without doing anything. Redirty
@@ -462,6 +522,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * soon as the queue becomes uncongested.
 			 */
 			inode->i_state |= I_DIRTY_PAGES;
+select_queue:
 			if (wbc->nr_to_write <= 0) {
 				/*
 				 * slice used up: queue for next turn
@@ -484,12 +545,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			inode->i_state |= I_DIRTY_PAGES;
 			redirty_tail(inode);
 		}
-	} else if (inode->i_state & I_DIRTY) {
-		/*
-		 * Someone redirtied the inode while were writing back
-		 * the pages.
-		 */
-		redirty_tail(inode);
 	} else if (atomic_read(&inode->i_count)) {
 		/*
 		 * The inode is clean, inuse
@@ -506,129 +561,90 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
+static void unpin_sb_for_writeback(struct super_block *sb)
+{
+	up_read(&sb->s_umount);
+	put_super(sb);
+}
+
+enum sb_pin_state {
+	SB_PINNED,
+	SB_NOT_PINNED,
+	SB_PIN_FAILED
+};
+
 /*
  * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
  * before calling writeback. So make sure that we do pin it, so it doesn't
  * go away while we are writing inodes from it.
- *
- * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
- * 1 if we failed.
  */
-static int pin_sb_for_writeback(struct writeback_control *wbc,
-				struct inode *inode)
+static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
+					      struct super_block *sb)
 {
-	struct super_block *sb = inode->i_sb;
-
 	/*
 	 * Caller must already hold the ref for this
 	 */
 	if (wbc->sync_mode == WB_SYNC_ALL) {
 		WARN_ON(!rwsem_is_locked(&sb->s_umount));
-		return 0;
+		return SB_NOT_PINNED;
 	}
-
 	spin_lock(&sb_lock);
 	sb->s_count++;
 	if (down_read_trylock(&sb->s_umount)) {
 		if (sb->s_root) {
 			spin_unlock(&sb_lock);
-			return 0;
+			return SB_PINNED;
 		}
 		/*
 		 * umounted, drop rwsem again and fall through to failure
 		 */
 		up_read(&sb->s_umount);
 	}
-
 	sb->s_count--;
 	spin_unlock(&sb_lock);
-	return 1;
-}
-
-static void unpin_sb_for_writeback(struct writeback_control *wbc,
-				   struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-
-	if (wbc->sync_mode == WB_SYNC_ALL)
-		return;
-
-	up_read(&sb->s_umount);
-	put_super(sb);
+	return SB_PIN_FAILED;
 }
 
-static void writeback_inodes_wb(struct bdi_writeback *wb,
-				struct writeback_control *wbc)
+/*
+ * Write a portion of b_io inodes which belong to @sb.
+ * If @wbc->sb != NULL, then find and write all such
+ * inodes. Otherwise write only ones which go sequentially
+ * in reverse order.
+ * Return 1, if the caller writeback routine should be
+ * interrupted. Otherwise return 0.
+ */
+static int writeback_sb_inodes(struct super_block *sb,
+			       struct bdi_writeback *wb,
			       struct writeback_control *wbc)
 {
-	struct super_block *sb = wbc->sb;
-	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
-	const unsigned long start = jiffies;	/* livelock avoidance */
-
-	spin_lock(&inode_lock);
-
-	if (!wbc->for_kupdate || list_empty(&wb->b_io))
-		queue_io(wb, wbc->older_than_this);
-
 	while (!list_empty(&wb->b_io)) {
-		struct inode *inode = list_entry(wb->b_io.prev,
-						struct inode, i_list);
 		long pages_skipped;
-
-		/*
-		 * super block given and doesn't match, skip this inode
-		 */
-		if (sb && sb != inode->i_sb) {
+		struct inode *inode = list_entry(wb->b_io.prev,
+						 struct inode, i_list);
+		if (wbc->sb && sb != inode->i_sb) {
+			/* super block given and doesn't
+			   match, skip this inode */
 			redirty_tail(inode);
 			continue;
 		}
-
-		if (!bdi_cap_writeback_dirty(wb->bdi)) {
-			redirty_tail(inode);
-			if (is_blkdev_sb) {
-				/*
-				 * Dirty memory-backed blockdev: the ramdisk
-				 * driver does this. Skip just this inode
-				 */
-				continue;
-			}
-
-			/*
-			 * Dirty memory-backed inode against a filesystem other
-			 * than the kernel-internal bdev filesystem. Skip the
-			 * entire superblock.
-			 */
-			break;
-		}
-
+		if (sb != inode->i_sb)
+			/* finish with this superblock */
+			return 0;
 		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
 		}
-
-		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
-			wbc->encountered_congestion = 1;
-			if (!is_blkdev_sb)
-				break;		/* Skip a congested fs */
-			requeue_io(inode);
-			continue;		/* Skip a congested blockdev */
-		}
-
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
 		 */
-		if (inode_dirtied_after(inode, start))
-			break;
-
-		if (pin_sb_for_writeback(wbc, inode)) {
-			requeue_io(inode);
-			continue;
-		}
+		if (inode_dirtied_after(inode, wbc->wb_start))
+			return 1;
 
 		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
-		unpin_sb_for_writeback(wbc, inode);
 		if (wbc->pages_skipped != pages_skipped) {
 			/*
 			 * writeback is not making progress due to locked
@@ -642,12 +658,50 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 		spin_lock(&inode_lock);
 		if (wbc->nr_to_write <= 0) {
 			wbc->more_io = 1;
-			break;
+			return 1;
 		}
 		if (!list_empty(&wb->b_more_io))
 			wbc->more_io = 1;
 	}
+	/* b_io is empty */
+	return 1;
+}
+
+static void writeback_inodes_wb(struct bdi_writeback *wb,
+				struct writeback_control *wbc)
+{
+	int ret = 0;
+
+	wbc->wb_start = jiffies; /* livelock avoidance */
+	spin_lock(&inode_lock);
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
+
+	while (!list_empty(&wb->b_io)) {
+		struct inode *inode = list_entry(wb->b_io.prev,
+						 struct inode, i_list);
+		struct super_block *sb = inode->i_sb;
+		enum sb_pin_state state;
+
+		if (wbc->sb && sb != wbc->sb) {
+			/* super block given and doesn't
+			   match, skip this inode */
+			redirty_tail(inode);
+			continue;
+		}
+		state = pin_sb_for_writeback(wbc, sb);
+
+		if (state == SB_PIN_FAILED) {
+			requeue_io(inode);
+			continue;
+		}
+		ret = writeback_sb_inodes(sb, wb, wbc);
+
+		if (state == SB_PINNED)
+			unpin_sb_for_writeback(sb);
+		if (ret)
+			break;
+	}
 	spin_unlock(&inode_lock);
 	/* Leave any unwritten inodes on b_io */
 }
@@ -702,10 +756,12 @@ static long wb_writeback(struct bdi_writeback *wb,
 		.sync_mode		= args->sync_mode,
 		.older_than_this	= NULL,
 		.for_kupdate		= args->for_kupdate,
+		.for_background		= args->for_background,
 		.range_cyclic		= args->range_cyclic,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
+	struct inode *inode;
 
 	if (wbc.for_kupdate) {
 		wbc.older_than_this = &oldest_jif;
@@ -719,24 +775,19 @@ static long wb_writeback(struct bdi_writeback *wb,
 
 	for (;;) {
 		/*
-		 * Don't flush anything for non-integrity writeback where
-		 * no nr_pages was given
+		 * Stop writeback when nr_pages has been consumed
 		 */
-		if (!args->for_kupdate && args->nr_pages <= 0 &&
-		    args->sync_mode == WB_SYNC_NONE)
+		if (args->nr_pages <= 0)
 			break;
 
 		/*
-		 * If no specific pages were given and this is just a
-		 * periodic background writeout and we are below the
-		 * background dirty threshold, don't do anything
+		 * For background writeout, stop when we are below the
+		 * background dirty threshold
 		 */
-		if (args->for_kupdate && args->nr_pages <= 0 &&
-		    !over_bground_thresh())
+		if (args->for_background && !over_bground_thresh())
 			break;
 
 		wbc.more_io = 0;
-		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
 		writeback_inodes_wb(wb, &wbc);
@@ -744,13 +795,32 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
 		/*
-		 * If we ran out of stuff to write, bail unless more_io got set
+		 * If we consumed everything, see if we have more
+		 */
+		if (wbc.nr_to_write <= 0)
+			continue;
+		/*
+		 * Didn't write everything and we don't have more IO, bail
 		 */
-		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
-			if (wbc.more_io && !wbc.for_kupdate)
-				continue;
+		if (!wbc.more_io)
 			break;
+		/*
+		 * Did we write something? Try for more
+		 */
+		if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+			continue;
+		/*
+		 * Nothing written. Wait for some inode to
+		 * become available for writeback. Otherwise
+		 * we'll just busyloop.
+		 */
+		spin_lock(&inode_lock);
+		if (!list_empty(&wb->b_more_io)) {
+			inode = list_entry(wb->b_more_io.prev,
+					   struct inode, i_list);
+			inode_wait_for_writeback(inode);
 		}
+		spin_unlock(&inode_lock);
 	}
 
 	return wrote;
@@ -772,8 +842,9 @@ static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(work, &bdi->work_list, list) {
-		if (!test_and_clear_bit(wb->nr, &work->seen))
+		if (!test_bit(wb->nr, &work->seen))
 			continue;
+		clear_bit(wb->nr, &work->seen);
 
 		ret = work;
 		break;
@@ -788,6 +859,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 	unsigned long expired;
 	long nr_pages;
 
+	/*
+	 * When set to zero, disable periodic writeback
+	 */
+	if (!dirty_writeback_interval)
+		return 0;
+
 	expired = wb->last_old_flush +
 			msecs_to_jiffies(dirty_writeback_interval * 10);
 	if (time_before(jiffies, expired))
@@ -883,8 +960,17 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 				break;
 		}
 
-		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-		schedule_timeout_interruptible(wait_jiffies);
+		if (dirty_writeback_interval) {
+			wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+			schedule_timeout_interruptible(wait_jiffies);
+		} else {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (list_empty_careful(&wb->bdi->work_list) &&
+			    !kthread_should_stop())
+				schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+
 		try_to_freeze();
 	}
 
@@ -1059,9 +1145,6 @@ EXPORT_SYMBOL(__mark_inode_dirty);
  * If older_than_this is non-NULL, then only write out inodes which
  * had their first dirtying at a time earlier than *older_than_this.
  *
- * If we're a pdlfush thread, then implement pdflush collision avoidance
- * against the entire list.
- *
  * If `bdi' is non-zero then we're being asked to writeback a specific queue.
  * This function assumes that the blockdev superblock's inodes are backed by
  * a variety of queues, so all inodes are searched. For other superblocks,
@@ -1140,11 +1223,28 @@ void writeback_inodes_sb(struct super_block *sb)
 	nr_to_write = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	bdi_writeback_all(sb, nr_to_write);
+	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
+ * writeback_inodes_sb_if_idle - start writeback if none underway
+ * @sb: the superblock
+ *
+ * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
+ */
+int writeback_inodes_sb_if_idle(struct super_block *sb)
+{
+	if (!writeback_in_progress(sb->s_bdi)) {
+		writeback_inodes_sb(sb);
+		return 1;
+	} else
+		return 0;
+}
+EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
+
+/**
 * sync_inodes_sb	-	sync sb inode pages
 * @sb: the superblock
 *
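
A note on the superblock-grouping pass the patch adds to move_expired_inodes(): expired inodes are first drained onto a temporary list, and only if more than one superblock was seen does a second pass peel them off one superblock at a time, so b_io ends up with each superblock's inodes adjacent. The sketch below mimics that two-phase logic in plain userspace C. It is an illustration of the algorithm, not kernel code: the list_head helpers are minimal stand-ins for the kernel's <linux/list.h>, the toy inode carries only an integer superblock id, and the single-sb path moves entries one by one where the kernel uses list_splice().

#include <stdio.h>
#include <stddef.h>

/* Minimal stand-ins for the kernel's <linux/list.h> helpers. */
struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD(name) struct list_head name = { &(name), &(name) }

static void __list_del(struct list_head *e)
{
	e->next->prev = e->prev;
	e->prev->next = e->next;
}

static void list_add_tail(struct list_head *e, struct list_head *head)
{
	e->prev = head->prev;
	e->next = head;
	head->prev->next = e;
	head->prev = e;
}

static void list_move(struct list_head *e, struct list_head *head)
{
	__list_del(e);
	list_add_tail(e, head);
}

static int list_empty(const struct list_head *head)
{
	return head->next == head;
}

#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Toy inode: only the fields the grouping pass looks at. */
struct inode {
	int i_sb;		/* stand-in for inode->i_sb */
	struct list_head i_list;
};

/*
 * Two-phase grouping, mirroring the patched move_expired_inodes():
 * drain the dirty list into tmp, and if inodes from more than one
 * superblock were seen, peel tmp apart one superblock at a time.
 */
static void group_by_sb(struct list_head *dirty, struct list_head *dispatch)
{
	LIST_HEAD(tmp);
	struct list_head *pos, *node;
	struct inode *inode;
	int sb = -1, do_sb_sort = 0;

	while (!list_empty(dirty)) {
		inode = list_entry(dirty->prev, struct inode, i_list);
		if (sb >= 0 && sb != inode->i_sb)
			do_sb_sort = 1;
		sb = inode->i_sb;
		list_move(&inode->i_list, &tmp);
	}

	if (!do_sb_sort) {
		/* One superblock: pass the order through unchanged. */
		while (!list_empty(&tmp))
			list_move(tmp.next, dispatch);
		return;
	}

	/* Move inodes from one superblock together. */
	while (!list_empty(&tmp)) {
		inode = list_entry(tmp.prev, struct inode, i_list);
		sb = inode->i_sb;
		/* open-coded list_for_each_prev_safe(pos, node, &tmp) */
		for (pos = tmp.prev, node = pos->prev; pos != &tmp;
		     pos = node, node = pos->prev) {
			inode = list_entry(pos, struct inode, i_list);
			if (inode->i_sb == sb)
				list_move(&inode->i_list, dispatch);
		}
	}
}

int main(void)
{
	LIST_HEAD(dirty);
	LIST_HEAD(dispatch);
	struct inode inodes[5] = { {1}, {2}, {1}, {2}, {1} };
	struct list_head *pos;
	size_t i;

	for (i = 0; i < 5; i++)
		list_add_tail(&inodes[i].i_list, &dirty);

	group_by_sb(&dirty, &dispatch);

	for (pos = dispatch.next; pos != &dispatch; pos = pos->next)
		printf("sb %d\n", list_entry(pos, struct inode, i_list)->i_sb);
	return 0;
}

Running this prints the five inodes grouped (1, 1, 1, 2, 2) rather than interleaved, which is the point of the change: writing a superblock's inodes back to back lets the new pin_sb_for_writeback()/writeback_sb_inodes() split pin each superblock once per batch instead of once per inode.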
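The new writeback_inodes_sb_if_idle() export at the end of the diff is the piece intended to be called from filesystem code. The sketch below shows a plausible caller; the myfs_* names and the ENOSPC-retry scenario are invented for illustration (the diff itself adds no caller), though delalloc filesystems of this era adopted a similar pattern of flushing dirty pages before failing an allocation.

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/errno.h>

/* Made-up allocator primitive, for illustration only. */
static int myfs_try_reserve(struct super_block *sb, unsigned long nblocks);

static int myfs_reserve_blocks(struct super_block *sb, unsigned long nblocks)
{
	int ret = myfs_try_reserve(sb, nblocks);

	if (ret == -ENOSPC) {
		/*
		 * Dirty delalloc pages may be holding the space we need;
		 * kick per-sb writeback if the flusher isn't already
		 * running for this bdi, then retry once.
		 */
		writeback_inodes_sb_if_idle(sb);
		ret = myfs_try_reserve(sb, nblocks);
	}
	return ret;
}

The "if idle" check matters because the caller may be on a hot path: when the flusher thread is already working on this bdi, starting another full-sb pass would only add s_umount and work-list traffic without freeing space any sooner.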