X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fpage-writeback.c;h=30351f0063acb125f4581f0a1c6197662254d8ab;hb=58984ce21d315b70df1a43644df7416ea7c9bfd8;hp=ded57d52806052deb17a5fdadbda6dac3dca9799;hpb=06d6cf6959d22037fcec598f4f954db5db3d7356;p=safe%2Fjmp%2Flinux-2.6

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ded57d5..30351f0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -7,7 +7,7 @@
  * Contains functions related to writing back dirty pages at the
  * address_space level.
  *
- * 10Apr2002	akpm@zip.com.au
+ * 10Apr2002	Andrew Morton
  *		Initial version
  */
 
@@ -66,7 +66,13 @@ static inline long sync_writeback_pages(void)
 /*
  * Start background writeback (via pdflush) at this percentage
  */
-int dirty_background_ratio = 5;
+int dirty_background_ratio = 10;
+
+/*
+ * dirty_background_bytes starts at 0 (disabled) so that it is a function of
+ * dirty_background_ratio * the amount of dirtyable memory
+ */
+unsigned long dirty_background_bytes;
 
 /*
  * free highmem will not be subtracted from the total free memory
@@ -77,17 +83,23 @@ int vm_highmem_is_dirtyable;
 /*
  * The generator of dirty data starts writeback at this percentage
  */
-int vm_dirty_ratio = 10;
+int vm_dirty_ratio = 20;
+
+/*
+ * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
+ * vm_dirty_ratio * the amount of dirtyable memory
+ */
+unsigned long vm_dirty_bytes;
 
 /*
- * The interval between `kupdate'-style writebacks, in jiffies
+ * The interval between `kupdate'-style writebacks
  */
-int dirty_writeback_interval = 5 * HZ;
+unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
 
 /*
- * The longest number of jiffies for which data is allowed to remain dirty
+ * The longest time for which data is allowed to remain dirty
 */
-int dirty_expire_interval = 30 * HZ;
+unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
 
 /*
  * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -126,8 +138,6 @@ static void background_writeout(unsigned long _min_pages);
 static struct prop_descriptor vm_completions;
 static struct prop_descriptor vm_dirties;
 
-static unsigned long determine_dirtyable_memory(void);
-
 /*
  * couple the period to the dirty_ratio:
  *
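
The two kupdate intervals above are now stored in centiseconds (hundredths of a second) rather than jiffies, and the patch converts them to jiffies only at use time via msecs_to_jiffies(dirty_writeback_interval * 10). A minimal standalone sketch of that conversion; the HZ value and the helper name centisecs_to_jiffies are illustrative assumptions, not kernel code:

#include <stdio.h>

#define HZ 250	/* assumed tick rate; the real value is a kernel build option */

/* A centisecond is 10 ms, so this mirrors msecs_to_jiffies(cs * 10). */
static unsigned long centisecs_to_jiffies(unsigned int cs)
{
	return (unsigned long)cs * 10 * HZ / 1000;
}

int main(void)
{
	unsigned int dirty_writeback_interval = 5 * 100;	/* 5 seconds */

	printf("kupdate period: %u cs = %lu jiffies at HZ=%d\n",
	       dirty_writeback_interval,
	       centisecs_to_jiffies(dirty_writeback_interval), HZ);
	return 0;
}

Writing 500 to the centisecs sysctl therefore always means five seconds, regardless of which HZ the kernel was built with.
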
@@ -137,23 +147,75 @@ static int calc_period_shift(void)
 {
 	unsigned long dirty_total;
 
-	dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+	if (vm_dirty_bytes)
+		dirty_total = vm_dirty_bytes / PAGE_SIZE;
+	else
+		dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
+				100;
 	return 2 + ilog2(dirty_total - 1);
 }
 
 /*
- * update the period when the dirty ratio changes.
+ * update the period when the dirty threshold changes.
  */
+static void update_completion_period(void)
+{
+	int shift = calc_period_shift();
+	prop_change_shift(&vm_completions, shift);
+	prop_change_shift(&vm_dirties, shift);
+}
+
+int dirty_background_ratio_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	if (ret == 0 && write)
+		dirty_background_bytes = 0;
+	return ret;
+}
+
+int dirty_background_bytes_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+	if (ret == 0 && write)
+		dirty_background_ratio = 0;
+	return ret;
+}
+
 int dirty_ratio_handler(struct ctl_table *table, int write,
 		struct file *filp, void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int old_ratio = vm_dirty_ratio;
-	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
 	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
-		int shift = calc_period_shift();
-		prop_change_shift(&vm_completions, shift);
-		prop_change_shift(&vm_dirties, shift);
+		update_completion_period();
+		vm_dirty_bytes = 0;
+	}
+	return ret;
+}
+
+
+int dirty_bytes_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	unsigned long old_bytes = vm_dirty_bytes;
+	int ret;
+
+	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+	if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
+		update_completion_period();
+		vm_dirty_ratio = 0;
 	}
 	return ret;
 }
@@ -178,7 +240,7 @@ void bdi_writeout_inc(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL_GPL(bdi_writeout_inc);
 
-static inline void task_dirty_inc(struct task_struct *tsk)
+void task_dirty_inc(struct task_struct *tsk)
 {
 	prop_inc_single(&vm_dirties, &tsk->dirties);
 }
@@ -331,9 +393,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
 		struct zone *z =
 			&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
 
-		x += zone_page_state(z, NR_FREE_PAGES)
-			+ zone_page_state(z, NR_INACTIVE)
-			+ zone_page_state(z, NR_ACTIVE);
+		x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
 	}
 	/*
 	 * Make sure that the number of highmem pages is never larger
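
The four handlers above make the percentage and byte interfaces mutually exclusive: writing dirty_ratio clears vm_dirty_bytes, writing dirty_bytes clears vm_dirty_ratio, and the background pair behaves the same way. A small userspace model of just that rule; the setter names are illustrative, only the clearing behaviour is taken from the patch:

#include <stdio.h>

/* Userspace stand-ins for the kernel's sysctl-backed variables. */
static unsigned long vm_dirty_bytes;		/* 0 means "disabled" */
static int vm_dirty_ratio = 20;

/* Mirrors dirty_ratio_handler(): setting a ratio disables the byte limit. */
static void set_dirty_ratio(int ratio)
{
	vm_dirty_ratio = ratio;
	vm_dirty_bytes = 0;
}

/* Mirrors dirty_bytes_handler(): setting a byte limit disables the ratio. */
static void set_dirty_bytes(unsigned long bytes)
{
	vm_dirty_bytes = bytes;
	vm_dirty_ratio = 0;
}

int main(void)
{
	set_dirty_bytes(256UL << 20);	/* 256 MB hard cap */
	printf("ratio=%d bytes=%lu\n", vm_dirty_ratio, vm_dirty_bytes);
	set_dirty_ratio(20);		/* back to a percentage */
	printf("ratio=%d bytes=%lu\n", vm_dirty_ratio, vm_dirty_bytes);
	return 0;
}

Whichever knob was written last is therefore the one the limit calculations below will honour.
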
@@ -347,13 +407,17 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
 #endif
 }
 
-static unsigned long determine_dirtyable_memory(void)
+/**
+ * determine_dirtyable_memory - amount of memory that may be used
+ *
+ * Returns the number of pages that can currently be freed and used
+ * by the kernel for direct mappings.
+ */
+unsigned long determine_dirtyable_memory(void)
 {
 	unsigned long x;
 
-	x = global_page_state(NR_FREE_PAGES)
-		+ global_page_state(NR_INACTIVE)
-		+ global_page_state(NR_ACTIVE);
+	x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
 
 	if (!vm_highmem_is_dirtyable)
 		x -= highmem_dirtyable_memory(x);
@@ -362,26 +426,32 @@ static unsigned long determine_dirtyable_memory(void)
 
 void
-get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
-		 struct backing_dev_info *bdi)
+get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
+		 unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
 {
-	int background_ratio;		/* Percentages */
-	int dirty_ratio;
-	long background;
-	long dirty;
+	unsigned long background;
+	unsigned long dirty;
 	unsigned long available_memory = determine_dirtyable_memory();
 	struct task_struct *tsk;
 
-	dirty_ratio = vm_dirty_ratio;
-	if (dirty_ratio < 5)
-		dirty_ratio = 5;
+	if (vm_dirty_bytes)
+		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+	else {
+		int dirty_ratio;
+
+		dirty_ratio = vm_dirty_ratio;
+		if (dirty_ratio < 5)
+			dirty_ratio = 5;
+		dirty = (dirty_ratio * available_memory) / 100;
+	}
 
-	background_ratio = dirty_background_ratio;
-	if (background_ratio >= dirty_ratio)
-		background_ratio = dirty_ratio / 2;
+	if (dirty_background_bytes)
+		background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+	else
+		background = (dirty_background_ratio * available_memory) / 100;
 
-	background = (background_ratio * available_memory) / 100;
-	dirty = (dirty_ratio * available_memory) / 100;
+	if (background >= dirty)
+		background = dirty / 2;
 	tsk = current;
 	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
 		background += background / 4;
@@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping)
 {
 	long nr_reclaimable, bdi_nr_reclaimable;
 	long nr_writeback, bdi_nr_writeback;
-	long background_thresh;
-	long dirty_thresh;
-	long bdi_thresh;
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	unsigned long bdi_thresh;
 	unsigned long pages_written = 0;
 	unsigned long write_chunk = sync_writeback_pages();
 
@@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
 
 void throttle_vm_writeout(gfp_t gfp_mask)
 {
-	long background_thresh;
-	long dirty_thresh;
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
 
         for ( ; ; ) {
 		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
@@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages)
 	};
 
 	for ( ; ; ) {
-		long background_thresh;
-		long dirty_thresh;
+		unsigned long background_thresh;
+		unsigned long dirty_thresh;
 
 		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 		if (global_page_state(NR_FILE_DIRTY) +
@@ -700,9 +770,9 @@ static void wb_kupdate(unsigned long arg)
 	sync_supers();
 
-	oldest_jif = jiffies - dirty_expire_interval;
+	oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval);
 	start_jif = jiffies;
-	next_jif = start_jif + dirty_writeback_interval;
+	next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
 	nr_to_write = global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
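
get_dirty_limits() now derives both thresholds in pages: a non-zero vm_dirty_bytes or dirty_background_bytes wins over the corresponding ratio, the dirty ratio is floored at 5%, and the background threshold is forced below the hard limit. A standalone model of just that selection; the function name dirty_limits and the sample numbers are illustrative, and the PF_LESS_THROTTLE/rt_task boost and the per-bdi split are omitted:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Round-up division, as the kernel's DIV_ROUND_UP() does. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Byte limits take precedence over ratios; background stays below dirty. */
static void dirty_limits(unsigned long available_memory,	/* in pages */
			 unsigned long dirty_bytes, int dirty_ratio,
			 unsigned long background_bytes, int background_ratio,
			 unsigned long *pbackground, unsigned long *pdirty)
{
	unsigned long dirty, background;

	if (dirty_bytes)
		dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
	else {
		if (dirty_ratio < 5)
			dirty_ratio = 5;
		dirty = (dirty_ratio * available_memory) / 100;
	}

	if (background_bytes)
		background = DIV_ROUND_UP(background_bytes, PAGE_SIZE);
	else
		background = (background_ratio * available_memory) / 100;

	if (background >= dirty)
		background = dirty / 2;

	*pbackground = background;
	*pdirty = dirty;
}

int main(void)
{
	unsigned long background, dirty;

	/* 1 GB of dirtyable memory, vm_dirty_bytes = 64 MB, background ratio 10% */
	dirty_limits(262144, 64UL << 20, 20, 0, 10, &background, &dirty);
	printf("dirty=%lu pages, background=%lu pages\n", dirty, background);
	return 0;
}

With these inputs it prints dirty=16384 and background=8192 pages: the 10% background threshold would exceed the 64 MB hard limit, so it is halved instead.
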
@@ -731,9 +801,10 @@ static void wb_kupdate(unsigned long arg)
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+	proc_dointvec(table, write, file, buffer, length, ppos);
 	if (dirty_writeback_interval)
-		mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+		mod_timer(&wb_timer, jiffies +
+			  msecs_to_jiffies(dirty_writeback_interval * 10));
 	else
 		del_timer(&wb_timer);
 	return 0;
@@ -835,7 +906,8 @@ void __init page_writeback_init(void)
 {
 	int shift;
 
-	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+	mod_timer(&wb_timer,
+		  jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
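
Both page_writeback_init() and the sysctl handler above arm wb_timer from the raw centisecond value, and a value of zero now simply deletes the timer, turning periodic kupdate writeback off. A tiny userspace model of that arm/disarm decision; timer_armed and writeback_interval_changed are illustrative names, not kernel API:

#include <stdio.h>
#include <stdbool.h>

static unsigned int dirty_writeback_interval = 5 * 100;	/* centiseconds */
static bool timer_armed;

/* Mirrors the handler: non-zero re-arms the timer, zero disables it. */
static void writeback_interval_changed(unsigned int centisecs)
{
	dirty_writeback_interval = centisecs;
	if (dirty_writeback_interval) {
		timer_armed = true;
		printf("timer armed, fires in %u ms\n",
		       dirty_writeback_interval * 10);
	} else {
		timer_armed = false;
		printf("timer deleted, periodic writeback off\n");
	}
}

int main(void)
{
	writeback_interval_changed(500);	/* 5 seconds */
	writeback_interval_changed(0);		/* disable */
	return 0;
}
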
@@ -868,10 +940,13 @@ int write_cache_pages(struct address_space *mapping,
 	int done = 0;
 	struct pagevec pvec;
 	int nr_pages;
+	pgoff_t uninitialized_var(writeback_index);
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
-	int scanned = 0;
+	pgoff_t done_index;
+	int cycled;
 	int range_whole = 0;
+	long nr_to_write = wbc->nr_to_write;
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
@@ -880,85 +955,149 @@ int write_cache_pages(struct address_space *mapping,
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
-		index = mapping->writeback_index; /* Start from prev offset */
+		writeback_index = mapping->writeback_index; /* prev offset */
+		index = writeback_index;
+		if (index == 0)
+			cycled = 1;
+		else
+			cycled = 0;
 		end = -1;
 	} else {
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
-		scanned = 1;
+		cycled = 1; /* ignore range_cyclic tests */
 	}
 retry:
-	while (!done && (index <= end) &&
-	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-	       PAGECACHE_TAG_DIRTY,
-	       min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
-		unsigned i;
+	done_index = index;
+	while (!done && (index <= end)) {
+		int i;
+
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+			      PAGECACHE_TAG_DIRTY,
+			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (nr_pages == 0)
+			break;
 
-		scanned = 1;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
 			/*
-			 * At this point we hold neither mapping->tree_lock nor
-			 * lock on the page itself: the page may be truncated or
-			 * invalidated (changing page->mapping to NULL), or even
-			 * swizzled back from swapper_space to tmpfs file
-			 * mapping
+			 * At this point, the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or
+			 * even swizzled back from swapper_space to tmpfs file
+			 * mapping. However, page->index will not change
+			 * because we have a reference on the page.
 			 */
+			if (page->index > end) {
+				/*
+				 * can't be range_cyclic (1st pass) because
+				 * end == -1 in that case.
+				 */
+				done = 1;
+				break;
+			}
+
+			done_index = page->index + 1;
+
 			lock_page(page);
+
+			/*
+			 * Page truncated or invalidated. We can freely skip it
+			 * then, even for data integrity operations: the page
+			 * has disappeared concurrently, so there could be no
+			 * real expectation of this data integrity operation
+			 * even if there is now a new, dirty page at the same
+			 * pagecache address.
+			 */
 			if (unlikely(page->mapping != mapping)) {
+continue_unlock:
 				unlock_page(page);
 				continue;
 			}
 
-			if (!wbc->range_cyclic && page->index > end) {
-				done = 1;
-				unlock_page(page);
-				continue;
+			if (!PageDirty(page)) {
+				/* someone wrote it for us */
+				goto continue_unlock;
 			}
 
-			if (wbc->sync_mode != WB_SYNC_NONE)
-				wait_on_page_writeback(page);
-
-			if (PageWriteback(page) ||
-			    !clear_page_dirty_for_io(page)) {
-				unlock_page(page);
-				continue;
+			if (PageWriteback(page)) {
+				if (wbc->sync_mode != WB_SYNC_NONE)
+					wait_on_page_writeback(page);
+				else
+					goto continue_unlock;
 			}
 
-			ret = (*writepage)(page, wbc, data);
+			BUG_ON(PageWriteback(page));
+			if (!clear_page_dirty_for_io(page))
+				goto continue_unlock;
 
-			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
-				unlock_page(page);
-				ret = 0;
+			ret = (*writepage)(page, wbc, data);
+			if (unlikely(ret)) {
+				if (ret == AOP_WRITEPAGE_ACTIVATE) {
+					unlock_page(page);
+					ret = 0;
+				} else {
+					/*
+					 * done_index is set past this page,
+					 * so media errors will not choke
+					 * background writeout for the entire
+					 * file. This has consequences for
+					 * range_cyclic semantics (ie. it may
+					 * not be suitable for data integrity
+					 * writeout).
+					 */
+					done = 1;
+					break;
+				}
+			}
+
+			if (nr_to_write > 0) {
+				nr_to_write--;
+				if (nr_to_write == 0 &&
+				    wbc->sync_mode == WB_SYNC_NONE) {
+					/*
+					 * We stop writing back only if we are
+					 * not doing integrity sync. In case of
+					 * integrity sync we have to keep going
+					 * because someone may be concurrently
+					 * dirtying pages, and we might have
+					 * synced a lot of newly appeared dirty
+					 * pages, but have not synced all of the
+					 * old dirty pages.
+					 */
+					done = 1;
+					break;
+				}
 			}
-			if (ret || (--(wbc->nr_to_write) <= 0))
-				done = 1;
+
 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
 				wbc->encountered_congestion = 1;
 				done = 1;
+				break;
 			}
 		}
 		pagevec_release(&pvec);
 		cond_resched();
 	}
-	if (!scanned && !done) {
+	if (!cycled && !done) {
 		/*
+		 * range_cyclic:
 		 * We hit the last page and there is more work to be done: wrap
 		 * back to the start of the file
 		 */
-		scanned = 1;
+		cycled = 1;
 		index = 0;
+		end = writeback_index - 1;
 		goto retry;
 	}
-	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
-		mapping->writeback_index = index;
+	if (!wbc->no_nrwrite_index_update) {
+		if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
+			mapping->writeback_index = done_index;
+		wbc->nr_to_write = nr_to_write;
+	}
 
-	if (wbc->range_cont)
-		wbc->range_start = index << PAGE_CACHE_SHIFT;
 	return ret;
 }
 EXPORT_SYMBOL(write_cache_pages);
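
The rewritten range_cyclic loop remembers where it started (writeback_index), scans to the end of the file, then wraps around once and stops just short of the starting point, so each page is visited at most once per call. A toy model of that scan order; the page count, starting index and the printf stand in for real pagecache state and ->writepage:

#include <stdio.h>

int main(void)
{
	unsigned long npages = 8;		/* file size, in pages */
	unsigned long writeback_index = 5;	/* saved position from last run */
	unsigned long index = writeback_index;
	unsigned long end = npages - 1;
	int cycled = (index == 0);

retry:
	while (index <= end) {
		printf("write page %lu\n", index);	/* stands in for ->writepage */
		index++;
	}
	if (!cycled) {
		cycled = 1;
		index = 0;
		end = writeback_index - 1;	/* stop where the first pass began */
		goto retry;
	}
	return 0;
}

Starting at page 5 of an 8-page file it visits 5, 6, 7 and then 0 through 4, exactly once each.
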
@@ -1061,6 +1200,20 @@ int __set_page_dirty_no_writeback(struct page *page)
 }
 
 /*
+ * Helper function for set_page_dirty family.
+ * NOTE: This relies on being atomic wrt interrupts.
+ */
+void account_page_dirtied(struct page *page, struct address_space *mapping)
+{
+	if (mapping_cap_account_dirty(mapping)) {
+		__inc_zone_page_state(page, NR_FILE_DIRTY);
+		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+		task_dirty_inc(current);
+		task_io_account_write(PAGE_CACHE_SIZE);
+	}
+}
+
+/*
  * For address_spaces which do not use buffers. Just tag the page as dirty in
  * its radix tree.
  *
@@ -1084,21 +1237,16 @@ int __set_page_dirty_nobuffers(struct page *page)
 		if (!mapping)
 			return 1;
 
-		write_lock_irq(&mapping->tree_lock);
+		spin_lock_irq(&mapping->tree_lock);
 		mapping2 = page_mapping(page);
 		if (mapping2) { /* Race with truncate? */
 			BUG_ON(mapping2 != mapping);
 			WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-			if (mapping_cap_account_dirty(mapping)) {
-				__inc_zone_page_state(page, NR_FILE_DIRTY);
-				__inc_bdi_stat(mapping->backing_dev_info,
-						BDI_RECLAIMABLE);
-				task_io_account_write(PAGE_CACHE_SIZE);
-			}
+			account_page_dirtied(page, mapping);
 			radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 		}
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		if (mapping->host) {
 			/* !PageAnon && !swapper_space */
 			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1125,7 +1273,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
  * If the mapping doesn't provide a set_page_dirty a_op, then
  * just fall through and assume that it wants buffer_heads.
  */
-static int __set_page_dirty(struct page *page)
+int set_page_dirty(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 
@@ -1143,14 +1291,6 @@ static int __set_page_dirty(struct page *page)
 	}
 	return 0;
 }
-
-int set_page_dirty(struct page *page)
-{
-	int ret = __set_page_dirty(page);
-	if (ret)
-		task_dirty_inc(current);
-	return ret;
-}
 EXPORT_SYMBOL(set_page_dirty);
 
 /*
@@ -1254,7 +1394,7 @@ int test_clear_page_writeback(struct page *page)
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
 
-		write_lock_irqsave(&mapping->tree_lock, flags);
+		spin_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestClearPageWriteback(page);
 		if (ret) {
 			radix_tree_tag_clear(&mapping->page_tree,
@@ -1265,7 +1405,7 @@ int test_clear_page_writeback(struct page *page)
 				__bdi_writeout_inc(bdi);
 			}
 		}
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestClearPageWriteback(page);
 	}
@@ -1283,7 +1423,7 @@ int test_set_page_writeback(struct page *page)
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
 
-		write_lock_irqsave(&mapping->tree_lock, flags);
+		spin_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestSetPageWriteback(page);
 		if (!ret) {
 			radix_tree_tag_set(&mapping->page_tree,
@@ -1296,7 +1436,7 @@ int test_set_page_writeback(struct page *page)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestSetPageWriteback(page);
 	}
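
The new account_page_dirtied() folds the per-task dirty accounting (task_dirty_inc(), formerly done by the now-removed set_page_dirty() wrapper) into the same place that bumps the zone and per-bdi counters, under mapping->tree_lock. A userspace mock of that consolidated bookkeeping; the struct and counter names are illustrative, not kernel API:

#include <stdio.h>

struct mock_counters {
	unsigned long nr_file_dirty;	/* zone-wide dirty page count */
	unsigned long bdi_reclaimable;	/* per-backing-device count */
	unsigned long task_dirtied;	/* feeds the per-task dirty ratio */
};

/* All three counters move together, as account_page_dirtied() now does. */
static void mock_account_page_dirtied(struct mock_counters *c)
{
	c->nr_file_dirty++;
	c->bdi_reclaimable++;
	c->task_dirtied++;	/* previously bumped separately in set_page_dirty() */
}

int main(void)
{
	struct mock_counters c = { 0, 0, 0 };

	mock_account_page_dirtied(&c);	/* one page dirtied */
	printf("dirty=%lu bdi=%lu task=%lu\n",
	       c.nr_file_dirty, c.bdi_reclaimable, c.task_dirtied);
	return 0;
}
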