Fix congestion_wait() sync/async vs read/write confusion

[safe/jmp/linux-2.6] / mm / page-writeback.c
diff --git a/mm/page-writeback.c b/mm/page-writeback.c

index c17005e..81627eb 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -66,7 +66,7 @@ static inline long sync_writeback_pages(void)
  /*
   * Start background writeback (via pdflush) at this percentage
   */
-int dirty_background_ratio = 5;
+int dirty_background_ratio = 10;
  
  /*
   * dirty_background_bytes starts at 0 (disabled) so that it is a function of
@@ -83,7 +83,7 @@ int vm_highmem_is_dirtyable;
  /*
   * The generator of dirty data starts writeback at this percentage
   */
-int vm_dirty_ratio = 10;
+int vm_dirty_ratio = 20;
  
  /*
   * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
@@ -92,14 +92,14 @@ int vm_dirty_ratio = 10;
  unsigned long vm_dirty_bytes;
  
  /*
- * The interval between `kupdate'-style writebacks, in jiffies
+ * The interval between `kupdate'-style writebacks
   */
-int dirty_writeback_interval = 5 * HZ;
+unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
  
  /*
- * The longest number of jiffies for which data is allowed to remain dirty
+ * The longest time for which data is allowed to remain dirty
   */
-int dirty_expire_interval = 30 * HZ;
+unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
  
  /*
   * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -240,7 +240,7 @@ void bdi_writeout_inc(struct backing_dev_info *bdi)
  }
  EXPORT_SYMBOL_GPL(bdi_writeout_inc);
  
-static inline void task_dirty_inc(struct task_struct *tsk)
+void task_dirty_inc(struct task_struct *tsk)
  {
         prop_inc_single(&vm_dirties, &tsk->dirties);
  }
@@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
   * This avoids exceeding the total dirty_limit when the floating averages
   * fluctuate too quickly.
   */
-static void
-clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
+               unsigned long dirty, unsigned long *pbdi_dirty)
  {
-       long avail_dirty;
+       unsigned long avail_dirty;
  
-       avail_dirty = dirty -
-               (global_page_state(NR_FILE_DIRTY) +
+       avail_dirty = global_page_state(NR_FILE_DIRTY) +
                  global_page_state(NR_WRITEBACK) +
                  global_page_state(NR_UNSTABLE_NFS) +
-                global_page_state(NR_WRITEBACK_TEMP));
+                global_page_state(NR_WRITEBACK_TEMP);
  
-       if (avail_dirty < 0)
+       if (avail_dirty < dirty)
+               avail_dirty = dirty - avail_dirty;
+       else
                 avail_dirty = 0;
  
         avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
@@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
   *
   *   dirty -= (dirty/8) * p_{t}
   */
-static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
+static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
  {
         long numerator, denominator;
-       long dirty = *pdirty;
+       unsigned long dirty = *pdirty;
         u64 inv = dirty >> 3;
  
         task_dirties_fraction(tsk, &numerator, &denominator);
@@ -540,8 +541,11 @@ static void balance_dirty_pages(struct address_space *mapping)
                  * filesystems (i.e. NFS) in which data may have been
                  * written to the server's write cache, but has not yet
                  * been flushed to permanent storage.
+                * Only move pages to writeback if this bdi is over its
+                * threshold; otherwise, wait until the disk writes catch
+                * up.
                  */
-               if (bdi_nr_reclaimable) {
+               if (bdi_nr_reclaimable > bdi_thresh) {
                         writeback_inodes(&wbc);
                         pages_written += write_chunk - wbc.nr_to_write;
                         get_dirty_limits(&background_thresh, &dirty_thresh,
@@ -571,7 +575,7 @@ static void balance_dirty_pages(struct address_space *mapping)
                 if (pages_written >= write_chunk)
                         break;          /* We've done our duty */
  
-               congestion_wait(WRITE, HZ/10);
+               congestion_wait(BLK_RW_ASYNC, HZ/10);
         }
  
         if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -665,7 +669,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
                  if (global_page_state(NR_UNSTABLE_NFS) +
                         global_page_state(NR_WRITEBACK) <= dirty_thresh)
                                 break;
-                congestion_wait(WRITE, HZ/10);
+                congestion_wait(BLK_RW_ASYNC, HZ/10);
  
                 /*
                  * The caller might hold locks which can prevent IO completion
@@ -711,7 +715,7 @@ static void background_writeout(unsigned long _min_pages)
                 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
                         /* Wrote less than expected */
                         if (wbc.encountered_congestion || wbc.more_io)
-                               congestion_wait(WRITE, HZ/10);
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
                         else
                                 break;
                 }
@@ -770,9 +774,9 @@ static void wb_kupdate(unsigned long arg)
  
         sync_supers();
  
-       oldest_jif = jiffies - dirty_expire_interval;
+       oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
         start_jif = jiffies;
-       next_jif = start_jif + dirty_writeback_interval;
+       next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
         nr_to_write = global_page_state(NR_FILE_DIRTY) +
                         global_page_state(NR_UNSTABLE_NFS) +
                         (inodes_stat.nr_inodes - inodes_stat.nr_unused);
@@ -783,7 +787,7 @@ static void wb_kupdate(unsigned long arg)
                 writeback_inodes(&wbc);
                 if (wbc.nr_to_write > 0) {
                         if (wbc.encountered_congestion || wbc.more_io)
-                               congestion_wait(WRITE, HZ/10);
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
                         else
                                 break;  /* All the old data is written */
                 }
@@ -801,9 +805,10 @@ static void wb_kupdate(unsigned long arg)
  int dirty_writeback_centisecs_handler(ctl_table *table, int write,
         struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
  {
-       proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+       proc_dointvec(table, write, file, buffer, length, ppos);
         if (dirty_writeback_interval)
-               mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+               mod_timer(&wb_timer, jiffies +
+                       msecs_to_jiffies(dirty_writeback_interval * 10));
         else
                 del_timer(&wb_timer);
         return 0;
@@ -905,7 +910,8 @@ void __init page_writeback_init(void)
  {
         int shift;
  
-       mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+       mod_timer(&wb_timer,
+                 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
         writeback_set_ratelimit();
         register_cpu_notifier(&ratelimit_nb);
  
@@ -1051,20 +1057,23 @@ continue_unlock:
                                 }
                         }
  
-                       if (nr_to_write > 0)
+                       if (nr_to_write > 0) {
                                 nr_to_write--;
-                       else if (wbc->sync_mode == WB_SYNC_NONE) {
-                               /*
-                                * We stop writing back only if we are not
-                                * doing integrity sync. In case of integrity
-                                * sync we have to keep going because someone
-                                * may be concurrently dirtying pages, and we
-                                * might have synced a lot of newly appeared
-                                * dirty pages, but have not synced all of the
-                                * old dirty pages.
-                                */
-                               done = 1;
-                               break;
+                               if (nr_to_write == 0 &&
+                                   wbc->sync_mode == WB_SYNC_NONE) {
+                                       /*
+                                        * We stop writing back only if we are
+                                        * not doing integrity sync. In case of
+                                        * integrity sync we have to keep going
+                                        * because someone may be concurrently
+                                        * dirtying pages, and we might have
+                                        * synced a lot of newly appeared dirty
+                                        * pages, but have not synced all of the
+                                        * old dirty pages.
+                                        */
+                                       done = 1;
+                                       break;
+                               }
                         }
  
                         if (wbc->nonblocking && bdi_write_congested(bdi)) {
@@ -1076,7 +1085,7 @@ continue_unlock:
                 pagevec_release(&pvec);
                 cond_resched();
         }
-       if (!cycled) {
+       if (!cycled && !done) {
                 /*
                  * range_cyclic:
                  * We hit the last page and there is more work to be done: wrap
@@ -1195,6 +1204,20 @@ int __set_page_dirty_no_writeback(struct page *page)
  }
  
  /*
+ * Helper function for set_page_dirty family.
+ * NOTE: This relies on being atomic wrt interrupts.
+ */
+void account_page_dirtied(struct page *page, struct address_space *mapping)
+{
+       if (mapping_cap_account_dirty(mapping)) {
+               __inc_zone_page_state(page, NR_FILE_DIRTY);
+               __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+               task_dirty_inc(current);
+               task_io_account_write(PAGE_CACHE_SIZE);
+       }
+}
+
+/*
   * For address_spaces which do not use buffers.  Just tag the page as dirty in
   * its radix tree.
   *
@@ -1223,12 +1246,7 @@ int __set_page_dirty_nobuffers(struct page *page)
                 if (mapping2) { /* Race with truncate? */
                         BUG_ON(mapping2 != mapping);
                         WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-                       if (mapping_cap_account_dirty(mapping)) {
-                               __inc_zone_page_state(page, NR_FILE_DIRTY);
-                               __inc_bdi_stat(mapping->backing_dev_info,
-                                               BDI_RECLAIMABLE);
-                               task_io_account_write(PAGE_CACHE_SIZE);
-                       }
+                       account_page_dirtied(page, mapping);
                         radix_tree_tag_set(&mapping->page_tree,
                                 page_index(page), PAGECACHE_TAG_DIRTY);
                 }
@@ -1259,7 +1277,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
   * If the mapping doesn't provide a set_page_dirty a_op, then
   * just fall through and assume that it wants buffer_heads.
   */
-static int __set_page_dirty(struct page *page)
+int set_page_dirty(struct page *page)
  {
         struct address_space *mapping = page_mapping(page);
  
@@ -1277,14 +1295,6 @@ static int __set_page_dirty(struct page *page)
         }
         return 0;
  }
-
-int set_page_dirty(struct page *page)
-{
-       int ret = __set_page_dirty(page);
-       if (ret)
-               task_dirty_inc(current);
-       return ret;
-}
  EXPORT_SYMBOL(set_page_dirty);
  
  /*