X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fpage-writeback.c;h=30351f0063acb125f4581f0a1c6197662254d8ab;hb=0f181328287db30671e9997329cff71395d4af8b;hp=01b9cb8ccf68fc0b552198330899b1cc9d6d5be2;hpb=bd19e012f6fd3b7309689165ea865cbb7bb88c1e;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 01b9cb8..30351f0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -66,7 +66,13 @@ static inline long sync_writeback_pages(void) /* * Start background writeback (via pdflush) at this percentage */ -int dirty_background_ratio = 5; +int dirty_background_ratio = 10; + +/* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of + * dirty_background_ratio * the amount of dirtyable memory + */ +unsigned long dirty_background_bytes; /* * free highmem will not be subtracted from the total free memory @@ -77,17 +83,23 @@ int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 10; +int vm_dirty_ratio = 20; + +/* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of + * vm_dirty_ratio * the amount of dirtyable memory + */ +unsigned long vm_dirty_bytes; /* - * The interval between `kupdate'-style writebacks, in jiffies + * The interval between `kupdate'-style writebacks */ -int dirty_writeback_interval = 5 * HZ; +unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */ /* - * The longest number of jiffies for which data is allowed to remain dirty + * The longest time for which data is allowed to remain dirty */ -int dirty_expire_interval = 30 * HZ; +unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */ /* * Flag that makes the machine dump writes/reads and block dirtyings. @@ -135,23 +147,75 @@ static int calc_period_shift(void) { unsigned long dirty_total; - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; + if (vm_dirty_bytes) + dirty_total = vm_dirty_bytes / PAGE_SIZE; + else + dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / + 100; return 2 + ilog2(dirty_total - 1); } /* - * update the period when the dirty ratio changes. + * update the period when the dirty threshold changes. */ +static void update_completion_period(void) +{ + int shift = calc_period_shift(); + prop_change_shift(&vm_completions, shift); + prop_change_shift(&vm_dirties, shift); +} + +int dirty_background_ratio_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_bytes = 0; + return ret; +} + +int dirty_background_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_ratio = 0; + return ret; +} + int dirty_ratio_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret; + + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - int shift = calc_period_shift(); - prop_change_shift(&vm_completions, shift); - prop_change_shift(&vm_dirties, shift); + update_completion_period(); + vm_dirty_bytes = 0; + } + return ret; +} + + +int dirty_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + unsigned long old_bytes = vm_dirty_bytes; + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_bytes != old_bytes) { + update_completion_period(); + vm_dirty_ratio = 0; } return ret; } @@ -176,7 +240,7 @@ void bdi_writeout_inc(struct backing_dev_info *bdi) } EXPORT_SYMBOL_GPL(bdi_writeout_inc); -static inline void task_dirty_inc(struct task_struct *tsk) +void task_dirty_inc(struct task_struct *tsk) { prop_inc_single(&vm_dirties, &tsk->dirties); } @@ -362,26 +426,32 @@ unsigned long determine_dirtyable_memory(void) } void -get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, - struct backing_dev_info *bdi) +get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, + unsigned long *pbdi_dirty, struct backing_dev_info *bdi) { - int background_ratio; /* Percentages */ - int dirty_ratio; - long background; - long dirty; + unsigned long background; + unsigned long dirty; unsigned long available_memory = determine_dirtyable_memory(); struct task_struct *tsk; - dirty_ratio = vm_dirty_ratio; - if (dirty_ratio < 5) - dirty_ratio = 5; + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + else { + int dirty_ratio; - background_ratio = dirty_background_ratio; - if (background_ratio >= dirty_ratio) - background_ratio = dirty_ratio / 2; + dirty_ratio = vm_dirty_ratio; + if (dirty_ratio < 5) + dirty_ratio = 5; + dirty = (dirty_ratio * available_memory) / 100; + } - background = (background_ratio * available_memory) / 100; - dirty = (dirty_ratio * available_memory) / 100; + if (dirty_background_bytes) + background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + else + background = (dirty_background_ratio * available_memory) / 100; + + if (background >= dirty) + background = dirty / 2; tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { background += background / 4; @@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; - long background_thresh; - long dirty_thresh; - long bdi_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long bdi_thresh; unsigned long pages_written = 0; unsigned long write_chunk = sync_writeback_pages(); @@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); void throttle_vm_writeout(gfp_t gfp_mask) { - long background_thresh; - long dirty_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; for ( ; ; ) { get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); @@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages) }; for ( ; ; ) { - long background_thresh; - long dirty_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); if (global_page_state(NR_FILE_DIRTY) + @@ -700,9 +770,9 @@ static void wb_kupdate(unsigned long arg) sync_supers(); - oldest_jif = jiffies - dirty_expire_interval; + oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval); start_jif = jiffies; - next_jif = start_jif + dirty_writeback_interval; + next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); nr_to_write = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); @@ -731,9 +801,10 @@ static void wb_kupdate(unsigned long arg) int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, file, buffer, length, ppos); if (dirty_writeback_interval) - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + mod_timer(&wb_timer, jiffies + + msecs_to_jiffies(dirty_writeback_interval * 10)); else del_timer(&wb_timer); return 0; @@ -835,7 +906,8 @@ void __init page_writeback_init(void) { int shift; - mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + mod_timer(&wb_timer, + jiffies + msecs_to_jiffies(dirty_writeback_interval * 10)); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); @@ -899,67 +971,117 @@ int write_cache_pages(struct address_space *mapping, } retry: done_index = index; - while (!done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - unsigned i; + while (!done && (index <= end)) { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - done_index = page->index + 1; - /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. */ - lock_page(page); - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - continue; - } - if (page->index > end) { /* * can't be range_cyclic (1st pass) because * end == -1 in that case. */ done = 1; - unlock_page(page); - continue; + break; } - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); + done_index = page->index + 1; + + lock_page(page); - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { + /* + * Page truncated or invalidated. We can freely skip it + * then, even for data integrity operations: the page + * has disappeared concurrently, so there could be no + * real expectation of this data interity operation + * even if there is now a new, dirty page at the same + * pagecache address. + */ + if (unlikely(page->mapping != mapping)) { +continue_unlock: unlock_page(page); continue; } - ret = (*writepage)(page, wbc, data); + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } - if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { - unlock_page(page); - ret = 0; + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; } - if (ret || (--nr_to_write <= 0)) - done = 1; + + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = (*writepage)(page, wbc, data); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + done = 1; + break; + } + } + + if (nr_to_write > 0) { + nr_to_write--; + if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) { + /* + * We stop writing back only if we are + * not doing integrity sync. In case of + * integrity sync we have to keep going + * because someone may be concurrently + * dirtying pages, and we might have + * synced a lot of newly appeared dirty + * pages, but have not synced all of the + * old dirty pages. + */ + done = 1; + break; + } + } + if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; done = 1; + break; } } pagevec_release(&pvec); cond_resched(); } - if (!cycled) { + if (!cycled && !done) { /* * range_cyclic: * We hit the last page and there is more work to be done: wrap @@ -1078,6 +1200,20 @@ int __set_page_dirty_no_writeback(struct page *page) } /* + * Helper function for set_page_dirty family. + * NOTE: This relies on being atomic wrt interrupts. + */ +void account_page_dirtied(struct page *page, struct address_space *mapping) +{ + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); + task_io_account_write(PAGE_CACHE_SIZE); + } +} + +/* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. * @@ -1106,12 +1242,7 @@ int __set_page_dirty_nobuffers(struct page *page) if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } @@ -1142,7 +1273,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage); * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ -static int __set_page_dirty(struct page *page) +int set_page_dirty(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -1160,14 +1291,6 @@ static int __set_page_dirty(struct page *page) } return 0; } - -int set_page_dirty(struct page *page) -{ - int ret = __set_page_dirty(page); - if (ret) - task_dirty_inc(current); - return ret; -} EXPORT_SYMBOL(set_page_dirty); /*