Fix congestion_wait() sync/async vs read/write confusion
[safe/jmp/linux-2.6] / mm / page-writeback.c
index c17005e..81627eb 100644 (file)
@@ -66,7 +66,7 @@ static inline long sync_writeback_pages(void)
 /*
  * Start background writeback (via pdflush) at this percentage
  */
-int dirty_background_ratio = 5;
+int dirty_background_ratio = 10;
 
 /*
  * dirty_background_bytes starts at 0 (disabled) so that it is a function of
@@ -83,7 +83,7 @@ int vm_highmem_is_dirtyable;
 /*
  * The generator of dirty data starts writeback at this percentage
  */
-int vm_dirty_ratio = 10;
+int vm_dirty_ratio = 20;
 
 /*
  * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
@@ -92,14 +92,14 @@ int vm_dirty_ratio = 10;
 unsigned long vm_dirty_bytes;
 
 /*
- * The interval between `kupdate'-style writebacks, in jiffies
+ * The interval between `kupdate'-style writebacks
  */
-int dirty_writeback_interval = 5 * HZ;
+unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
 
 /*
- * The longest number of jiffies for which data is allowed to remain dirty
+ * The longest time for which data is allowed to remain dirty
  */
-int dirty_expire_interval = 30 * HZ;
+unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
 
 /*
  * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -240,7 +240,7 @@ void bdi_writeout_inc(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL_GPL(bdi_writeout_inc);
 
-static inline void task_dirty_inc(struct task_struct *tsk)
+void task_dirty_inc(struct task_struct *tsk)
 {
        prop_inc_single(&vm_dirties, &tsk->dirties);
 }
@@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
  * This avoids exceeding the total dirty_limit when the floating averages
  * fluctuate too quickly.
  */
-static void
-clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
+               unsigned long dirty, unsigned long *pbdi_dirty)
 {
-       long avail_dirty;
+       unsigned long avail_dirty;
 
-       avail_dirty = dirty -
-               (global_page_state(NR_FILE_DIRTY) +
+       avail_dirty = global_page_state(NR_FILE_DIRTY) +
                 global_page_state(NR_WRITEBACK) +
                 global_page_state(NR_UNSTABLE_NFS) +
-                global_page_state(NR_WRITEBACK_TEMP));
+                global_page_state(NR_WRITEBACK_TEMP);
 
-       if (avail_dirty < 0)
+       if (avail_dirty < dirty)
+               avail_dirty = dirty - avail_dirty;
+       else
                avail_dirty = 0;
 
        avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
@@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
  *
  *   dirty -= (dirty/8) * p_{t}
  */
-static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
+static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
 {
        long numerator, denominator;
-       long dirty = *pdirty;
+       unsigned long dirty = *pdirty;
        u64 inv = dirty >> 3;
 
        task_dirties_fraction(tsk, &numerator, &denominator);
@@ -540,8 +541,11 @@ static void balance_dirty_pages(struct address_space *mapping)
                 * filesystems (i.e. NFS) in which data may have been
                 * written to the server's write cache, but has not yet
                 * been flushed to permanent storage.
+                * Only move pages to writeback if this bdi is over its
+                * threshold; otherwise wait until the disk writes catch
+                * up.
                 */
-               if (bdi_nr_reclaimable) {
+               if (bdi_nr_reclaimable > bdi_thresh) {
                        writeback_inodes(&wbc);
                        pages_written += write_chunk - wbc.nr_to_write;
                        get_dirty_limits(&background_thresh, &dirty_thresh,
@@ -571,7 +575,7 @@ static void balance_dirty_pages(struct address_space *mapping)
                if (pages_written >= write_chunk)
                        break;          /* We've done our duty */
 
-               congestion_wait(WRITE, HZ/10);
+               congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
 
        if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -665,7 +669,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
                 if (global_page_state(NR_UNSTABLE_NFS) +
                        global_page_state(NR_WRITEBACK) <= dirty_thresh)
                                break;
-                congestion_wait(WRITE, HZ/10);
+                congestion_wait(BLK_RW_ASYNC, HZ/10);
 
                /*
                 * The caller might hold locks which can prevent IO completion
@@ -711,7 +715,7 @@ static void background_writeout(unsigned long _min_pages)
                if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
                        /* Wrote less than expected */
                        if (wbc.encountered_congestion || wbc.more_io)
-                               congestion_wait(WRITE, HZ/10);
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
                        else
                                break;
                }
@@ -770,9 +774,9 @@ static void wb_kupdate(unsigned long arg)
 
        sync_supers();
 
-       oldest_jif = jiffies - dirty_expire_interval;
+       oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
        start_jif = jiffies;
-       next_jif = start_jif + dirty_writeback_interval;
+       next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
        nr_to_write = global_page_state(NR_FILE_DIRTY) +
                        global_page_state(NR_UNSTABLE_NFS) +
                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
@@ -783,7 +787,7 @@ static void wb_kupdate(unsigned long arg)
                writeback_inodes(&wbc);
                if (wbc.nr_to_write > 0) {
                        if (wbc.encountered_congestion || wbc.more_io)
-                               congestion_wait(WRITE, HZ/10);
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
                        else
                                break;  /* All the old data is written */
                }
@@ -801,9 +805,10 @@ static void wb_kupdate(unsigned long arg)
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
        struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
-       proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+       proc_dointvec(table, write, file, buffer, length, ppos);
        if (dirty_writeback_interval)
-               mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+               mod_timer(&wb_timer, jiffies +
+                       msecs_to_jiffies(dirty_writeback_interval * 10));
        else
                del_timer(&wb_timer);
        return 0;
@@ -905,7 +910,8 @@ void __init page_writeback_init(void)
 {
        int shift;
 
-       mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+       mod_timer(&wb_timer,
+                 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
        writeback_set_ratelimit();
        register_cpu_notifier(&ratelimit_nb);
 
@@ -1051,20 +1057,23 @@ continue_unlock:
                                }
                        }
 
-                       if (nr_to_write > 0)
+                       if (nr_to_write > 0) {
                                nr_to_write--;
-                       else if (wbc->sync_mode == WB_SYNC_NONE) {
-                               /*
-                                * We stop writing back only if we are not
-                                * doing integrity sync. In case of integrity
-                                * sync we have to keep going because someone
-                                * may be concurrently dirtying pages, and we
-                                * might have synced a lot of newly appeared
-                                * dirty pages, but have not synced all of the
-                                * old dirty pages.
-                                */
-                               done = 1;
-                               break;
+                               if (nr_to_write == 0 &&
+                                   wbc->sync_mode == WB_SYNC_NONE) {
+                                       /*
+                                        * We stop writing back only if we are
+                                        * not doing integrity sync. In case of
+                                        * integrity sync we have to keep going
+                                        * because someone may be concurrently
+                                        * dirtying pages, and we might have
+                                        * synced a lot of newly appeared dirty
+                                        * pages, but have not synced all of the
+                                        * old dirty pages.
+                                        */
+                                       done = 1;
+                                       break;
+                               }
                        }
 
                        if (wbc->nonblocking && bdi_write_congested(bdi)) {
@@ -1076,7 +1085,7 @@ continue_unlock:
                pagevec_release(&pvec);
                cond_resched();
        }
-       if (!cycled) {
+       if (!cycled && !done) {
                /*
                 * range_cyclic:
                 * We hit the last page and there is more work to be done: wrap
@@ -1195,6 +1204,20 @@ int __set_page_dirty_no_writeback(struct page *page)
 }
 
 /*
+ * Helper function for set_page_dirty family.
+ * NOTE: This relies on being atomic wrt interrupts.
+ */
+void account_page_dirtied(struct page *page, struct address_space *mapping)
+{
+       if (mapping_cap_account_dirty(mapping)) {
+               __inc_zone_page_state(page, NR_FILE_DIRTY);
+               __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+               task_dirty_inc(current);
+               task_io_account_write(PAGE_CACHE_SIZE);
+       }
+}
+
+/*
  * For address_spaces which do not use buffers.  Just tag the page as dirty in
  * its radix tree.
  *
@@ -1223,12 +1246,7 @@ int __set_page_dirty_nobuffers(struct page *page)
                if (mapping2) { /* Race with truncate? */
                        BUG_ON(mapping2 != mapping);
                        WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-                       if (mapping_cap_account_dirty(mapping)) {
-                               __inc_zone_page_state(page, NR_FILE_DIRTY);
-                               __inc_bdi_stat(mapping->backing_dev_info,
-                                               BDI_RECLAIMABLE);
-                               task_io_account_write(PAGE_CACHE_SIZE);
-                       }
+                       account_page_dirtied(page, mapping);
                        radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
                }
@@ -1259,7 +1277,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
  * If the mapping doesn't provide a set_page_dirty a_op, then
  * just fall through and assume that it wants buffer_heads.
  */
-static int __set_page_dirty(struct page *page)
+int set_page_dirty(struct page *page)
 {
        struct address_space *mapping = page_mapping(page);
 
@@ -1277,14 +1295,6 @@ static int __set_page_dirty(struct page *page)
        }
        return 0;
 }
-
-int set_page_dirty(struct page *page)
-{
-       int ret = __set_page_dirty(page);
-       if (ret)
-               task_dirty_inc(current);
-       return ret;
-}
 EXPORT_SYMBOL(set_page_dirty);
 
 /*