Fix congestion_wait() sync/async vs read/write confusion

The block layer now accounts congestion per sync/async queue rather than
per read/write direction, but congestion_wait() callers were still passing
READ/WRITE. In the writeback paths below that meant sleeping on the wrong
(sync) queue, since buffered writeback is async I/O; the fix converts those
waiters to BLK_RW_ASYNC. As rendered against this older base, the diff also
carries the related mm/page-writeback.c work it sits on top of: the
dirty_bytes/dirty_background_bytes sysctls, centisecond writeback intervals,
unsigned dirty-threshold types, and the write_cache_pages() fixes.
[safe/jmp/linux-2.6] mm/page-writeback.c
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2e8c2b0..81627eb 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -66,7 +66,13 @@ static inline long sync_writeback_pages(void)
 /*
  * Start background writeback (via pdflush) at this percentage
  */
-int dirty_background_ratio = 5;
+int dirty_background_ratio = 10;
+
+/*
+ * dirty_background_bytes starts at 0 (disabled); while it is zero, the
+ * background threshold is dirty_background_ratio * the dirtyable memory
+ */
+unsigned long dirty_background_bytes;
 
 /*
  * free highmem will not be subtracted from the total free memory
@@ -77,17 +83,23 @@ int vm_highmem_is_dirtyable;
 /*
  * The generator of dirty data starts writeback at this percentage
  */
-int vm_dirty_ratio = 10;
+int vm_dirty_ratio = 20;
+
+/*
+ * vm_dirty_bytes starts at 0 (disabled); while it is zero, the dirty
+ * threshold is vm_dirty_ratio * the dirtyable memory
+ */
+unsigned long vm_dirty_bytes;
 
 /*
- * The interval between `kupdate'-style writebacks, in jiffies
+ * The interval between `kupdate'-style writebacks
  */
-int dirty_writeback_interval = 5 * HZ;
+unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
 
 /*
- * The longest number of jiffies for which data is allowed to remain dirty
+ * The longest time for which data is allowed to remain dirty
  */
-int dirty_expire_interval = 30 * HZ;
+unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
 
 /*
  * Flag that makes the machine dump writes/reads and block dirtyings.
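
The two interval sysctls above now store centiseconds instead of jiffies, so
their values no longer depend on the kernel's HZ choice; later hunks convert
at the point of use with msecs_to_jiffies(interval * 10). A minimal
user-space sketch of that conversion, with HZ fixed at 100 purely for
illustration (a stand-alone model, not the kernel helper):

    #include <stdio.h>

    #define HZ 100  /* assumed tick rate for this illustration only */

    static unsigned long cs_to_jiffies(unsigned int centisecs)
    {
        unsigned long msecs = centisecs * 10UL;  /* centiseconds -> ms */
        return msecs * HZ / 1000;                /* ms -> jiffies */
    }

    int main(void)
    {
        /* dirty_writeback_interval default: 5 * 100 centiseconds = 5 s */
        printf("500 cs -> %lu jiffies at HZ=100\n", cs_to_jiffies(500));
        return 0;
    }
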
@@ -135,23 +147,75 @@ static int calc_period_shift(void)
 {
        unsigned long dirty_total;
 
-       dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+       if (vm_dirty_bytes)
+               dirty_total = vm_dirty_bytes / PAGE_SIZE;
+       else
+               dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
+                               100;
        return 2 + ilog2(dirty_total - 1);
 }
 
 /*
- * update the period when the dirty ratio changes.
+ * update the period when the dirty threshold changes.
  */
+static void update_completion_period(void)
+{
+       int shift = calc_period_shift();
+       prop_change_shift(&vm_completions, shift);
+       prop_change_shift(&vm_dirties, shift);
+}
+
+int dirty_background_ratio_handler(struct ctl_table *table, int write,
+               struct file *filp, void __user *buffer, size_t *lenp,
+               loff_t *ppos)
+{
+       int ret;
+
+       ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+       if (ret == 0 && write)
+               dirty_background_bytes = 0;
+       return ret;
+}
+
+int dirty_background_bytes_handler(struct ctl_table *table, int write,
+               struct file *filp, void __user *buffer, size_t *lenp,
+               loff_t *ppos)
+{
+       int ret;
+
+       ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+       if (ret == 0 && write)
+               dirty_background_ratio = 0;
+       return ret;
+}
+
 int dirty_ratio_handler(struct ctl_table *table, int write,
                struct file *filp, void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
        int old_ratio = vm_dirty_ratio;
-       int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+       int ret;
+
+       ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
        if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
-               int shift = calc_period_shift();
-               prop_change_shift(&vm_completions, shift);
-               prop_change_shift(&vm_dirties, shift);
+               update_completion_period();
+               vm_dirty_bytes = 0;
+       }
+       return ret;
+}
+
+
+int dirty_bytes_handler(struct ctl_table *table, int write,
+               struct file *filp, void __user *buffer, size_t *lenp,
+               loff_t *ppos)
+{
+       unsigned long old_bytes = vm_dirty_bytes;
+       int ret;
+
+       ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+       if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
+               update_completion_period();
+               vm_dirty_ratio = 0;
        }
        return ret;
 }
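
Taken together, the four handlers above make each ratio/bytes pair mutually
exclusive: a successful write to one knob zeroes its counterpart, so exactly
one policy drives the threshold at any time. A stand-alone sketch of that
invariant (the setters are illustrative stand-ins for the sysctl handlers,
not kernel code):

    #include <stdio.h>

    static int vm_dirty_ratio = 20;       /* percentage policy */
    static unsigned long vm_dirty_bytes;  /* absolute policy, 0 = disabled */

    static void set_dirty_ratio(int ratio)
    {
        vm_dirty_ratio = ratio;
        vm_dirty_bytes = 0;   /* bytes knob disabled, as in the patch */
    }

    static void set_dirty_bytes(unsigned long bytes)
    {
        vm_dirty_bytes = bytes;
        vm_dirty_ratio = 0;   /* ratio knob disabled, as in the patch */
    }

    int main(void)
    {
        set_dirty_ratio(20);          /* percentage policy in force */
        set_dirty_bytes(64UL << 20);  /* switch to a 64 MiB absolute limit */
        printf("ratio=%d bytes=%lu\n", vm_dirty_ratio, vm_dirty_bytes);
        return 0;
    }
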
@@ -176,7 +240,7 @@ void bdi_writeout_inc(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL_GPL(bdi_writeout_inc);
 
-static inline void task_dirty_inc(struct task_struct *tsk)
+void task_dirty_inc(struct task_struct *tsk)
 {
        prop_inc_single(&vm_dirties, &tsk->dirties);
 }
@@ -201,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
  * This avoids exceeding the total dirty_limit when the floating averages
  * fluctuate too quickly.
  */
-static void
-clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
+               unsigned long dirty, unsigned long *pbdi_dirty)
 {
-       long avail_dirty;
+       unsigned long avail_dirty;
 
-       avail_dirty = dirty -
-               (global_page_state(NR_FILE_DIRTY) +
+       avail_dirty = global_page_state(NR_FILE_DIRTY) +
                 global_page_state(NR_WRITEBACK) +
                 global_page_state(NR_UNSTABLE_NFS) +
-                global_page_state(NR_WRITEBACK_TEMP));
+                global_page_state(NR_WRITEBACK_TEMP);
 
-       if (avail_dirty < 0)
+       if (avail_dirty < dirty)
+               avail_dirty = dirty - avail_dirty;
+       else
                avail_dirty = 0;
 
        avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
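
The reordering above matters because avail_dirty is now unsigned: the old
"dirty - total" form would wrap past zero instead of going negative whenever
the counters exceeded the limit. A small stand-alone demonstration of the
wraparound and of the compare-then-subtract order the hunk adopts:

    #include <stdio.h>

    int main(void)
    {
        unsigned long dirty = 100, total = 150;  /* total exceeds the limit */

        unsigned long wrong = dirty - total;  /* wraps to a huge value */
        unsigned long safe = (total < dirty) ? dirty - total : 0;

        printf("wrong=%lu safe=%lu\n", wrong, safe);
        return 0;
    }
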
@@ -235,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
  *
  *   dirty -= (dirty/8) * p_{t}
  */
-static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
+static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
 {
        long numerator, denominator;
-       long dirty = *pdirty;
+       unsigned long dirty = *pdirty;
        u64 inv = dirty >> 3;
 
        task_dirties_fraction(tsk, &numerator, &denominator);
@@ -362,26 +427,32 @@ unsigned long determine_dirtyable_memory(void)
 }
 
 void
-get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
-                struct backing_dev_info *bdi)
+get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
+                unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
 {
-       int background_ratio;           /* Percentages */
-       int dirty_ratio;
-       long background;
-       long dirty;
+       unsigned long background;
+       unsigned long dirty;
        unsigned long available_memory = determine_dirtyable_memory();
        struct task_struct *tsk;
 
-       dirty_ratio = vm_dirty_ratio;
-       if (dirty_ratio < 5)
-               dirty_ratio = 5;
+       if (vm_dirty_bytes)
+               dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+       else {
+               int dirty_ratio;
 
-       background_ratio = dirty_background_ratio;
-       if (background_ratio >= dirty_ratio)
-               background_ratio = dirty_ratio / 2;
+               dirty_ratio = vm_dirty_ratio;
+               if (dirty_ratio < 5)
+                       dirty_ratio = 5;
+               dirty = (dirty_ratio * available_memory) / 100;
+       }
 
-       background = (background_ratio * available_memory) / 100;
-       dirty = (dirty_ratio * available_memory) / 100;
+       if (dirty_background_bytes)
+               background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+       else
+               background = (dirty_background_ratio * available_memory) / 100;
+
+       if (background >= dirty)
+               background = dirty / 2;
        tsk = current;
        if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
                background += background / 4;
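
The rewritten get_dirty_limits() gives the byte-based knobs precedence when
non-zero (rounded up to whole pages); otherwise the ratio path applies, with
vm_dirty_ratio floored at 5%, and in either case the background threshold is
clamped below the dirty threshold. A worked sketch under assumed values
(4 KiB pages, 1 GiB of dirtyable memory; the variables model the sysctls):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        unsigned long available_memory = 1UL << 18;  /* 1 GiB in 4 KiB pages */
        unsigned long vm_dirty_bytes = 0;            /* 0: use the ratio */
        int vm_dirty_ratio = 20;
        unsigned long dirty_background_bytes = 0;
        int dirty_background_ratio = 10;
        unsigned long dirty, background;

        if (vm_dirty_bytes)
            dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
        else {
            int ratio = vm_dirty_ratio < 5 ? 5 : vm_dirty_ratio;
            dirty = ratio * available_memory / 100;
        }

        if (dirty_background_bytes)
            background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
        else
            background = dirty_background_ratio * available_memory / 100;

        if (background >= dirty)
            background = dirty / 2;  /* background must stay below dirty */

        /* prints: dirty=52428 background=26214 (in pages) */
        printf("dirty=%lu background=%lu\n", dirty, background);
        return 0;
    }
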
@@ -423,9 +494,9 @@ static void balance_dirty_pages(struct address_space *mapping)
 {
        long nr_reclaimable, bdi_nr_reclaimable;
        long nr_writeback, bdi_nr_writeback;
-       long background_thresh;
-       long dirty_thresh;
-       long bdi_thresh;
+       unsigned long background_thresh;
+       unsigned long dirty_thresh;
+       unsigned long bdi_thresh;
        unsigned long pages_written = 0;
        unsigned long write_chunk = sync_writeback_pages();
 
@@ -470,8 +541,11 @@ static void balance_dirty_pages(struct address_space *mapping)
                 * filesystems (i.e. NFS) in which data may have been
                 * written to the server's write cache, but has not yet
                 * been flushed to permanent storage.
+                * Only move pages to writeback if this bdi is over its
+                * threshold; otherwise wait until the disk writes catch
+                * up.
                 */
-               if (bdi_nr_reclaimable) {
+               if (bdi_nr_reclaimable > bdi_thresh) {
                        writeback_inodes(&wbc);
                        pages_written += write_chunk - wbc.nr_to_write;
                        get_dirty_limits(&background_thresh, &dirty_thresh,
@@ -501,7 +575,7 @@ static void balance_dirty_pages(struct address_space *mapping)
                if (pages_written >= write_chunk)
                        break;          /* We've done our duty */
 
-               congestion_wait(WRITE, HZ/10);
+               congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
 
        if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
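
The congestion_wait() conversion in this hunk (repeated throughout the patch)
is the change the subject line names: the block layer now tracks congestion
per sync/async queue, and buffered writeback is async I/O, yet the legacy
constant WRITE (== 1) selected the sync queue, so these waiters slept on the
wrong list. A stand-alone model of the mix-up; the enum values below mirror
what the series introduces and are an assumption of this note:

    #include <stdio.h>

    enum { BLK_RW_ASYNC = 0, BLK_RW_SYNC = 1 };  /* assumed, per the series */
    enum { OLD_READ = 0, OLD_WRITE = 1 };        /* legacy rw constants */

    static const char *queue_name(int sync)
    {
        return sync ? "sync" : "async";
    }

    int main(void)
    {
        /* old call: congestion_wait(WRITE, HZ/10) */
        printf("WRITE        -> %s queue (wrong for writeback)\n",
               queue_name(OLD_WRITE));
        /* fixed call: congestion_wait(BLK_RW_ASYNC, HZ/10) */
        printf("BLK_RW_ASYNC -> %s queue (what writeback wants)\n",
               queue_name(BLK_RW_ASYNC));
        return 0;
    }
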
@@ -580,8 +654,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
 
 void throttle_vm_writeout(gfp_t gfp_mask)
 {
-       long background_thresh;
-       long dirty_thresh;
+       unsigned long background_thresh;
+       unsigned long dirty_thresh;
 
         for ( ; ; ) {
                get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
@@ -595,7 +669,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
                 if (global_page_state(NR_UNSTABLE_NFS) +
                        global_page_state(NR_WRITEBACK) <= dirty_thresh)
                                break;
-                congestion_wait(WRITE, HZ/10);
+                congestion_wait(BLK_RW_ASYNC, HZ/10);
 
                /*
                 * The caller might hold locks which can prevent IO completion
@@ -624,8 +698,8 @@ static void background_writeout(unsigned long _min_pages)
        };
 
        for ( ; ; ) {
-               long background_thresh;
-               long dirty_thresh;
+               unsigned long background_thresh;
+               unsigned long dirty_thresh;
 
                get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
                if (global_page_state(NR_FILE_DIRTY) +
@@ -641,7 +715,7 @@ static void background_writeout(unsigned long _min_pages)
                if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
                        /* Wrote less than expected */
                        if (wbc.encountered_congestion || wbc.more_io)
-                               congestion_wait(WRITE, HZ/10);
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
                        else
                                break;
                }
@@ -700,9 +774,9 @@ static void wb_kupdate(unsigned long arg)
 
        sync_supers();
 
-       oldest_jif = jiffies - dirty_expire_interval;
+       oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
        start_jif = jiffies;
-       next_jif = start_jif + dirty_writeback_interval;
+       next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
        nr_to_write = global_page_state(NR_FILE_DIRTY) +
                        global_page_state(NR_UNSTABLE_NFS) +
                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
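
wb_kupdate() now derives its expiry window from the centisecond sysctl at use
time rather than storing jiffies directly. A worked sketch of the arithmetic,
assuming HZ == 1000 so that msecs_to_jiffies() is the identity (the jiffies
value is a pretend clock, not the kernel's):

    #include <stdio.h>

    int main(void)
    {
        unsigned long jiffies = 100000;                 /* pretend "now" */
        unsigned int dirty_expire_interval = 30 * 100;  /* centiseconds */

        /* msecs_to_jiffies(x) == x when HZ == 1000 (assumption) */
        unsigned long oldest_jif = jiffies - dirty_expire_interval * 10;

        /* pages dirtied before jiffy 70000 are expired and get written */
        printf("oldest_jif = %lu\n", oldest_jif);
        return 0;
    }
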
@@ -713,7 +787,7 @@ static void wb_kupdate(unsigned long arg)
                writeback_inodes(&wbc);
                if (wbc.nr_to_write > 0) {
                        if (wbc.encountered_congestion || wbc.more_io)
-                               congestion_wait(WRITE, HZ/10);
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
                        else
                                break;  /* All the old data is written */
                }
@@ -731,9 +805,10 @@ static void wb_kupdate(unsigned long arg)
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
        struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
-       proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+       proc_dointvec(table, write, file, buffer, length, ppos);
        if (dirty_writeback_interval)
-               mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+               mod_timer(&wb_timer, jiffies +
+                       msecs_to_jiffies(dirty_writeback_interval * 10));
        else
                del_timer(&wb_timer);
        return 0;
@@ -835,7 +910,8 @@ void __init page_writeback_init(void)
 {
        int shift;
 
-       mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+       mod_timer(&wb_timer,
+                 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
        writeback_set_ratelimit();
        register_cpu_notifier(&ratelimit_nb);
 
@@ -911,15 +987,24 @@ retry:
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
 
-                       done_index = page->index + 1;
-
                        /*
-                        * At this point we hold neither mapping->tree_lock nor
-                        * lock on the page itself: the page may be truncated or
-                        * invalidated (changing page->mapping to NULL), or even
-                        * swizzled back from swapper_space to tmpfs file
-                        * mapping
+                        * At this point, the page may be truncated or
+                        * invalidated (changing page->mapping to NULL), or
+                        * even swizzled back from swapper_space to tmpfs file
+                        * mapping. However, page->index will not change
+                        * because we have a reference on the page.
                         */
+                       if (page->index > end) {
+                               /*
+                                * can't be range_cyclic (1st pass) because
+                                * end == -1 in that case.
+                                */
+                               done = 1;
+                               break;
+                       }
+
+                       done_index = page->index + 1;
+
                        lock_page(page);
 
                        /*
@@ -936,15 +1021,6 @@ continue_unlock:
                                continue;
                        }
 
-                       if (page->index > end) {
-                               /*
-                                * can't be range_cyclic (1st pass) because
-                                * end == -1 in that case.
-                                */
-                               done = 1;
-                               goto continue_unlock;
-                       }
-
                        if (!PageDirty(page)) {
                                /* someone wrote it for us */
                                goto continue_unlock;
@@ -981,20 +1057,35 @@ continue_unlock:
                                }
                        }
 
-                       if (wbc->sync_mode == WB_SYNC_NONE) {
-                               wbc->nr_to_write--;
-                               if (wbc->nr_to_write <= 0)
+                       if (nr_to_write > 0) {
+                               nr_to_write--;
+                               if (nr_to_write == 0 &&
+                                   wbc->sync_mode == WB_SYNC_NONE) {
+                                       /*
+                                        * We stop writing back only if we are
+                                        * not doing integrity sync. In case of
+                                        * integrity sync we have to keep going
+                                        * because someone may be concurrently
+                                        * dirtying pages, and we might have
+                                        * synced a lot of newly appeared dirty
+                                        * pages, but have not synced all of the
+                                        * old dirty pages.
+                                        */
                                        done = 1;
+                                       break;
+                               }
                        }
+
                        if (wbc->nonblocking && bdi_write_congested(bdi)) {
                                wbc->encountered_congestion = 1;
                                done = 1;
+                               break;
                        }
                }
                pagevec_release(&pvec);
                cond_resched();
        }
-       if (!cycled) {
+       if (!cycled && !done) {
                /*
                 * range_cyclic:
                 * We hit the last page and there is more work to be done: wrap
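
The rewritten loop charges every written page against a local nr_to_write but
gives up early only for WB_SYNC_NONE; an integrity sync must keep scanning so
that pages dirtied during the walk cannot crowd out the old ones it was asked
to flush. A simplified stand-alone model of that stop condition:

    #include <stdio.h>

    enum writeback_sync_modes { WB_SYNC_NONE, WB_SYNC_ALL };

    /* stop only when the quota is spent AND we may give up early */
    static int should_stop(long nr_to_write, enum writeback_sync_modes mode)
    {
        return nr_to_write == 0 && mode == WB_SYNC_NONE;
    }

    int main(void)
    {
        printf("%d\n", should_stop(0, WB_SYNC_NONE));  /* 1: stop early */
        printf("%d\n", should_stop(0, WB_SYNC_ALL));   /* 0: keep scanning */
        return 0;
    }
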
@@ -1113,6 +1204,20 @@ int __set_page_dirty_no_writeback(struct page *page)
 }
 
 /*
+ * Helper function for set_page_dirty family.
+ * NOTE: This relies on being atomic wrt interrupts.
+ */
+void account_page_dirtied(struct page *page, struct address_space *mapping)
+{
+       if (mapping_cap_account_dirty(mapping)) {
+               __inc_zone_page_state(page, NR_FILE_DIRTY);
+               __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+               task_dirty_inc(current);
+               task_io_account_write(PAGE_CACHE_SIZE);
+       }
+}
+
+/*
  * For address_spaces which do not use buffers.  Just tag the page as dirty in
  * its radix tree.
  *
@@ -1141,12 +1246,7 @@ int __set_page_dirty_nobuffers(struct page *page)
                if (mapping2) { /* Race with truncate? */
                        BUG_ON(mapping2 != mapping);
                        WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-                       if (mapping_cap_account_dirty(mapping)) {
-                               __inc_zone_page_state(page, NR_FILE_DIRTY);
-                               __inc_bdi_stat(mapping->backing_dev_info,
-                                               BDI_RECLAIMABLE);
-                               task_io_account_write(PAGE_CACHE_SIZE);
-                       }
+                       account_page_dirtied(page, mapping);
                        radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
                }
@@ -1177,7 +1277,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
  * If the mapping doesn't provide a set_page_dirty a_op, then
  * just fall through and assume that it wants buffer_heads.
  */
-static int __set_page_dirty(struct page *page)
+int set_page_dirty(struct page *page)
 {
        struct address_space *mapping = page_mapping(page);
 
@@ -1195,14 +1295,6 @@ static int __set_page_dirty(struct page *page)
        }
        return 0;
 }
-
-int set_page_dirty(struct page *page)
-{
-       int ret = __set_page_dirty(page);
-       if (ret)
-               task_dirty_inc(current);
-       return ret;
-}
 EXPORT_SYMBOL(set_page_dirty);
 
 /*