X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fpage-writeback.c;h=c0d4ce144dec41130e552723c5b3ecd4dca1876a;hb=99219a3fbc2dcf2eaa954f7b2ac27299fd7894cd;hp=0166ea15c9ee8d61f08ddc6e100de841e59205c7;hpb=8d06afab73a75f40ae2864e6c296356bab1ab473;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0166ea1..c0d4ce1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -23,12 +23,15 @@ #include #include #include +#include #include #include #include #include #include #include +#include +#include /* * The maximum number of pages to writeout in a single bdflush/kupdate @@ -45,8 +48,7 @@ */ static long ratelimit_pages = 32; -static long total_pages; /* The total number of pages in the machine. */ -static int dirty_exceeded; /* Dirty mem may be over limit */ +static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ /* * When balance_dirty_pages decides that the caller needs to perform some @@ -72,15 +74,14 @@ int dirty_background_ratio = 10; int vm_dirty_ratio = 40; /* - * The interval between `kupdate'-style writebacks, in centiseconds - * (hundredths of a second) + * The interval between `kupdate'-style writebacks, in jiffies */ -int dirty_writeback_centisecs = 5 * 100; +int dirty_writeback_interval = 5 * HZ; /* - * The longest number of centiseconds for which data is allowed to remain dirty + * The longest number of jiffies for which data is allowed to remain dirty */ -int dirty_expire_centisecs = 30 * 100; +int dirty_expire_interval = 30 * HZ; /* * Flag that makes the machine dump writes/reads and block dirtyings. @@ -88,7 +89,8 @@ int dirty_expire_centisecs = 30 * 100; int block_dump; /* - * Flag that puts the machine in "laptop mode". + * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: + * a full sync is triggered after this time elapses without any disk activity. */ int laptop_mode; @@ -99,22 +101,6 @@ EXPORT_SYMBOL(laptop_mode); static void background_writeout(unsigned long _min_pages); -struct writeback_state -{ - unsigned long nr_dirty; - unsigned long nr_unstable; - unsigned long nr_mapped; - unsigned long nr_writeback; -}; - -static void get_writeback_state(struct writeback_state *wbs) -{ - wbs->nr_dirty = read_page_state(nr_dirty); - wbs->nr_unstable = read_page_state(nr_unstable); - wbs->nr_mapped = read_page_state(nr_mapped); - wbs->nr_writeback = read_page_state(nr_writeback); -} - /* * Work out the current dirty-memory clamping and background writeout * thresholds. @@ -133,19 +119,17 @@ static void get_writeback_state(struct writeback_state *wbs) * clamping level. */ static void -get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, - struct address_space *mapping) +get_dirty_limits(long *pbackground, long *pdirty, + struct address_space *mapping) { int background_ratio; /* Percentages */ int dirty_ratio; int unmapped_ratio; long background; long dirty; - unsigned long available_memory = total_pages; + unsigned long available_memory = vm_total_pages; struct task_struct *tsk; - get_writeback_state(wbs); - #ifdef CONFIG_HIGHMEM /* * If this mapping can only allocate from low memory, @@ -156,7 +140,9 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, #endif - unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages; + unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + + global_page_state(NR_ANON_PAGES)) * 100) / + vm_total_pages; dirty_ratio = vm_dirty_ratio; if (dirty_ratio > unmapped_ratio / 2) @@ -189,7 +175,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, */ static void balance_dirty_pages(struct address_space *mapping) { - struct writeback_state wbs; long nr_reclaimable; long background_thresh; long dirty_thresh; @@ -204,15 +189,18 @@ static void balance_dirty_pages(struct address_space *mapping) .sync_mode = WB_SYNC_NONE, .older_than_this = NULL, .nr_to_write = write_chunk, + .range_cyclic = 1, }; - get_dirty_limits(&wbs, &background_thresh, - &dirty_thresh, mapping); - nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) - break; + get_dirty_limits(&background_thresh, &dirty_thresh, mapping); + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= + dirty_thresh) + break; - dirty_exceeded = 1; + if (!dirty_exceeded) + dirty_exceeded = 1; /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked @@ -222,11 +210,14 @@ static void balance_dirty_pages(struct address_space *mapping) */ if (nr_reclaimable) { writeback_inodes(&wbc); - get_dirty_limits(&wbs, &background_thresh, - &dirty_thresh, mapping); - nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) - break; + get_dirty_limits(&background_thresh, + &dirty_thresh, mapping); + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + if (nr_reclaimable + + global_page_state(NR_WRITEBACK) + <= dirty_thresh) + break; pages_written += write_chunk - wbc.nr_to_write; if (pages_written >= write_chunk) break; /* We've done our duty */ @@ -234,8 +225,9 @@ static void balance_dirty_pages(struct address_space *mapping) blk_congestion_wait(WRITE, HZ/10); } - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) - dirty_exceeded = 0; + if (nr_reclaimable + global_page_state(NR_WRITEBACK) + <= dirty_thresh && dirty_exceeded) + dirty_exceeded = 0; if (writeback_in_progress(bdi)) return; /* pdflush is already working this queue */ @@ -253,9 +245,20 @@ static void balance_dirty_pages(struct address_space *mapping) pdflush_operation(background_writeout, 0); } +void set_page_dirty_balance(struct page *page) +{ + if (set_page_dirty(page)) { + struct address_space *mapping = page_mapping(page); + + if (mapping) + balance_dirty_pages_ratelimited(mapping); + } +} + /** - * balance_dirty_pages_ratelimited - balance dirty memory state + * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied + * @nr_pages_dirtied: number of pages which the caller has just dirtied * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's @@ -266,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping) * limit we decrease the ratelimiting by a lot, to prevent individual processes * from overshooting the limit by (ratelimit_pages) each. */ -void balance_dirty_pages_ratelimited(struct address_space *mapping) +void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, + unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(int, ratelimits) = 0; - long ratelimit; + static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; + unsigned long ratelimit; + unsigned long *p; ratelimit = ratelimit_pages; if (dirty_exceeded) @@ -279,24 +284,26 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * Check the rate limiting. Also, we do not want to throttle real-time * tasks in balance_dirty_pages(). Period. */ - if (get_cpu_var(ratelimits)++ >= ratelimit) { - __get_cpu_var(ratelimits) = 0; - put_cpu_var(ratelimits); + preempt_disable(); + p = &__get_cpu_var(ratelimits); + *p += nr_pages_dirtied; + if (unlikely(*p >= ratelimit)) { + *p = 0; + preempt_enable(); balance_dirty_pages(mapping); return; } - put_cpu_var(ratelimits); + preempt_enable(); } -EXPORT_SYMBOL(balance_dirty_pages_ratelimited); +EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); void throttle_vm_writeout(void) { - struct writeback_state wbs; long background_thresh; long dirty_thresh; for ( ; ; ) { - get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); + get_dirty_limits(&background_thresh, &dirty_thresh, NULL); /* * Boost the allowable dirty threshold a bit for page @@ -304,8 +311,9 @@ void throttle_vm_writeout(void) */ dirty_thresh += dirty_thresh / 10; /* wheeee... */ - if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh) - break; + if (global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK) <= dirty_thresh) + break; blk_congestion_wait(WRITE, HZ/10); } } @@ -324,15 +332,16 @@ static void background_writeout(unsigned long _min_pages) .older_than_this = NULL, .nr_to_write = 0, .nonblocking = 1, + .range_cyclic = 1, }; for ( ; ; ) { - struct writeback_state wbs; long background_thresh; long dirty_thresh; - get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); - if (wbs.nr_dirty + wbs.nr_unstable < background_thresh + get_dirty_limits(&background_thresh, &dirty_thresh, NULL); + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) < background_thresh && min_pages <= 0) break; wbc.encountered_congestion = 0; @@ -356,12 +365,9 @@ static void background_writeout(unsigned long _min_pages) */ int wakeup_pdflush(long nr_pages) { - if (nr_pages == 0) { - struct writeback_state wbs; - - get_writeback_state(&wbs); - nr_pages = wbs.nr_dirty + wbs.nr_unstable; - } + if (nr_pages == 0) + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); return pdflush_operation(background_writeout, nr_pages); } @@ -379,8 +385,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); * just walks the superblock inode list, writing back any inodes which are * older than a specific point in time. * - * Try to run once per dirty_writeback_centisecs. But if a writeback event - * takes longer than a dirty_writeback_centisecs interval, then leave a + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * * older_than_this takes precedence over nr_to_write. So we'll only write back @@ -392,7 +398,6 @@ static void wb_kupdate(unsigned long arg) unsigned long start_jif; unsigned long next_jif; long nr_to_write; - struct writeback_state wbs; struct writeback_control wbc = { .bdi = NULL, .sync_mode = WB_SYNC_NONE, @@ -400,15 +405,16 @@ static void wb_kupdate(unsigned long arg) .nr_to_write = 0, .nonblocking = 1, .for_kupdate = 1, + .range_cyclic = 1, }; sync_supers(); - get_writeback_state(&wbs); - oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; + oldest_jif = jiffies - dirty_expire_interval; start_jif = jiffies; - next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; - nr_to_write = wbs.nr_dirty + wbs.nr_unstable + + next_jif = start_jif + dirty_writeback_interval; + nr_to_write = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); while (nr_to_write > 0) { wbc.encountered_congestion = 0; @@ -424,7 +430,7 @@ static void wb_kupdate(unsigned long arg) } if (time_before(next_jif, jiffies + HZ)) next_jif = jiffies + HZ; - if (dirty_writeback_centisecs) + if (dirty_writeback_interval) mod_timer(&wb_timer, next_jif); } @@ -434,11 +440,11 @@ static void wb_kupdate(unsigned long arg) int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); - if (dirty_writeback_centisecs) { + proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); + if (dirty_writeback_interval) { mod_timer(&wb_timer, - jiffies + (dirty_writeback_centisecs * HZ) / 100); - } else { + jiffies + dirty_writeback_interval); + } else { del_timer(&wb_timer); } return 0; @@ -467,7 +473,7 @@ static void laptop_timer_fn(unsigned long unused) */ void laptop_io_completion(void) { - mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ); + mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); } /* @@ -497,23 +503,23 @@ void laptop_sync_completion(void) * will write six megabyte chunks, max. */ -static void set_ratelimit(void) +void writeback_set_ratelimit(void) { - ratelimit_pages = total_pages / (num_online_cpus() * 32); + ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; } -static int +static int __cpuinit ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) { - set_ratelimit(); + writeback_set_ratelimit(); return 0; } -static struct notifier_block ratelimit_nb = { +static struct notifier_block __cpuinitdata ratelimit_nb = { .notifier_call = ratelimit_handler, .next = NULL, }; @@ -528,9 +534,7 @@ void __init page_writeback_init(void) long buffer_pages = nr_free_buffer_pages(); long correction; - total_pages = nr_free_pagecache_pages(); - - correction = (100 * 4 * buffer_pages) / total_pages; + correction = (100 * 4 * buffer_pages) / vm_total_pages; if (correction < 100) { dirty_background_ratio *= correction; @@ -543,18 +547,157 @@ void __init page_writeback_init(void) if (vm_dirty_ratio <= 0) vm_dirty_ratio = 1; } - mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100); - set_ratelimit(); + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); } +/** + * generic_writepages - walk the list of dirty pages of the given + * address space and writepage() all of them. + * + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * + * This is a library function, which implements the writepages() + * address_space_operation. + * + * If a page is already under I/O, generic_writepages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + * + * Derived from mpage_writepages() - if you fix this you should check that + * also! + */ +int generic_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + int (*writepage)(struct page *page, struct writeback_control *wbc); + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int range_whole = 0; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + writepage = mapping->a_ops->writepage; + + /* deal with chardevs and other special file */ + if (!writepage) + return 0; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc); + if (ret) { + if (ret == -ENOSPC) + set_bit(AS_ENOSPC, &mapping->flags); + else + set_bit(AS_EIO, &mapping->flags); + } + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) + unlock_page(page); + if (ret || (--(wbc->nr_to_write) <= 0)) + done = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + } + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + return ret; +} + +EXPORT_SYMBOL(generic_writepages); + int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { + int ret; + if (wbc->nr_to_write <= 0) return 0; + wbc->for_writepages = 1; if (mapping->a_ops->writepages) - return mapping->a_ops->writepages(mapping, wbc); - return generic_writepages(mapping, wbc); + ret = mapping->a_ops->writepages(mapping, wbc); + else + ret = generic_writepages(mapping, wbc); + wbc->for_writepages = 0; + return ret; } /** @@ -614,8 +757,6 @@ EXPORT_SYMBOL(write_one_page); */ int __set_page_dirty_nobuffers(struct page *page) { - int ret = 0; - if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; @@ -626,7 +767,8 @@ int __set_page_dirty_nobuffers(struct page *page) if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); if (mapping_cap_account_dirty(mapping)) - inc_page_state(nr_dirty); + __inc_zone_page_state(page, + NR_FILE_DIRTY); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } @@ -637,8 +779,9 @@ int __set_page_dirty_nobuffers(struct page *page) I_DIRTY_PAGES); } } + return 1; } - return ret; + return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -664,12 +807,16 @@ int fastcall set_page_dirty(struct page *page) if (likely(mapping)) { int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; - if (spd) - return (*spd)(page); - return __set_page_dirty_buffers(page); +#ifdef CONFIG_BLOCK + if (!spd) + spd = __set_page_dirty_buffers; +#endif + return (*spd)(page); + } + if (!PageDirty(page)) { + if (!TestSetPageDirty(page)) + return 1; } - if (!PageDirty(page)) - SetPageDirty(page); return 0; } EXPORT_SYMBOL(set_page_dirty); @@ -688,7 +835,7 @@ int set_page_dirty_lock(struct page *page) { int ret; - lock_page(page); + lock_page_nosync(page); ret = set_page_dirty(page); unlock_page(page); return ret; @@ -711,8 +858,14 @@ int test_clear_page_dirty(struct page *page) page_index(page), PAGECACHE_TAG_DIRTY); write_unlock_irqrestore(&mapping->tree_lock, flags); - if (mapping_cap_account_dirty(mapping)) - dec_page_state(nr_dirty); + /* + * We can continue to use `mapping' here because the + * page is locked, which pins the address_space + */ + if (mapping_cap_account_dirty(mapping)) { + page_mkclean(page); + dec_zone_page_state(page, NR_FILE_DIRTY); + } return 1; } write_unlock_irqrestore(&mapping->tree_lock, flags); @@ -742,8 +895,10 @@ int clear_page_dirty_for_io(struct page *page) if (mapping) { if (TestClearPageDirty(page)) { - if (mapping_cap_account_dirty(mapping)) - dec_page_state(nr_dirty); + if (mapping_cap_account_dirty(mapping)) { + page_mkclean(page); + dec_zone_page_state(page, NR_FILE_DIRTY); + } return 1; } return 0; @@ -801,6 +956,15 @@ int test_set_page_writeback(struct page *page) EXPORT_SYMBOL(test_set_page_writeback); /* + * Wakes up tasks that are being throttled due to writeback congestion + */ +void writeback_congestion_end(void) +{ + blk_congestion_end(WRITE); +} +EXPORT_SYMBOL(writeback_congestion_end); + +/* * Return true if any of the pages in the mapping are marged with the * passed tag. */