move page writeback accounting out of macros
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 237107c..63512a9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -67,12 +67,12 @@ static inline long sync_writeback_pages(void)
 /*
  * Start background writeback (via pdflush) at this percentage
  */
-int dirty_background_ratio = 10;
+int dirty_background_ratio = 5;
 
 /*
  * The generator of dirty data starts writeback at this percentage
  */
-int vm_dirty_ratio = 40;
+int vm_dirty_ratio = 10;
 
 /*
  * The interval between `kupdate'-style writebacks, in jiffies
@@ -119,6 +119,44 @@ static void background_writeout(unsigned long _min_pages);
  * We make sure that the background writeout level is below the adjusted
  * clamping level.
  */
+
+static unsigned long highmem_dirtyable_memory(unsigned long total)
+{
+#ifdef CONFIG_HIGHMEM
+       int node;
+       unsigned long x = 0;
+
+       for_each_online_node(node) {
+               struct zone *z =
+                       &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+
+               x += zone_page_state(z, NR_FREE_PAGES)
+                       + zone_page_state(z, NR_INACTIVE)
+                       + zone_page_state(z, NR_ACTIVE);
+       }
+       /*
+        * Make sure that the number of highmem pages is never larger
+        * than the number of the total dirtyable memory. This can only
+        * occur in very strange VM situations but we want to make sure
+        * that this does not occur.
+        */
+       return min(x, total);
+#else
+       return 0;
+#endif
+}
+
+static unsigned long determine_dirtyable_memory(void)
+{
+       unsigned long x;
+
+       x = global_page_state(NR_FREE_PAGES)
+               + global_page_state(NR_INACTIVE)
+               + global_page_state(NR_ACTIVE);
+       x -= highmem_dirtyable_memory(x);
+       return x + 1;   /* Ensure that we never return 0 */
+}
+
 static void
 get_dirty_limits(long *pbackground, long *pdirty,
                                        struct address_space *mapping)
@@ -128,22 +166,12 @@ get_dirty_limits(long *pbackground, long *pdirty,
        int unmapped_ratio;
        long background;
        long dirty;
-       unsigned long available_memory = vm_total_pages;
+       unsigned long available_memory = determine_dirtyable_memory();
        struct task_struct *tsk;
 
-#ifdef CONFIG_HIGHMEM
-       /*
-        * If this mapping can only allocate from low memory,
-        * we exclude high memory from our count.
-        */
-       if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
-               available_memory -= totalhigh_pages;
-#endif
-
-
        unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
                                global_page_state(NR_ANON_PAGES)) * 100) /
-                                       vm_total_pages;
+                                       available_memory;
 
        dirty_ratio = vm_dirty_ratio;
        if (dirty_ratio > unmapped_ratio / 2)
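
This hunk stops at the ratio clamping; the rest of get_dirty_limits() (not shown here) turns the ratios into page counts against the new available_memory value. A minimal userspace sketch of that arithmetic, assuming the conventional (ratio * available_memory) / 100 form; the sketch_ names and the example numbers are illustrative, not part of the patch:

    #include <stdio.h>

    /*
     * Sketch of how get_dirty_limits() turns the sysctl ratios into page
     * counts.  The +25% boost for real-time tasks is paraphrased from the
     * surrounding kernel code of this era and is not part of this hunk.
     */
    static void sketch_dirty_limits(unsigned long available_memory,
                                    int background_ratio, int dirty_ratio,
                                    int is_rt_task,
                                    long *pbackground, long *pdirty)
    {
            long background, dirty;

            if (background_ratio >= dirty_ratio)
                    background_ratio = dirty_ratio / 2;

            background = ((long)background_ratio * available_memory) / 100;
            dirty = ((long)dirty_ratio * available_memory) / 100;

            if (is_rt_task) {              /* rt_task(current) in the kernel */
                    background += background / 4;
                    dirty += dirty / 4;
            }
            *pbackground = background;
            *pdirty = dirty;
    }

    int main(void)
    {
            long bg, dt;

            /* 1 GiB of dirtyable memory in 4 KiB pages, new defaults 5/10 */
            sketch_dirty_limits(262144, 5, 10, 0, &bg, &dt);
            printf("background=%ld pages, dirty=%ld pages\n", bg, dt);
            return 0;
    }
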
@@ -298,11 +326,21 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 }
 EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
 
-void throttle_vm_writeout(void)
+void throttle_vm_writeout(gfp_t gfp_mask)
 {
        long background_thresh;
        long dirty_thresh;
 
+       if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) {
+               /*
+                * The caller might hold locks which can prevent IO completion
+                * or progress in the filesystem.  So we cannot just sit here
+                * waiting for IO to complete.
+                */
+               congestion_wait(WRITE, HZ/10);
+               return;
+       }
+
         for ( ; ; ) {
                get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
 
@@ -319,7 +357,6 @@ void throttle_vm_writeout(void)
         }
 }
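
The new test backs throttle_vm_writeout() off when the caller's allocation context cannot both start I/O and re-enter the filesystem, since such a caller may hold locks that block the very writeback it would otherwise wait for. A small self-contained illustration of the mask containment check; the flag values below are illustrative stand-ins for the real __GFP_* constants in include/linux/gfp.h:

    #include <stdio.h>

    /*
     * Illustrative stand-ins only (the real GFP_NOFS/GFP_NOIO also carry
     * __GFP_WAIT, which is irrelevant to this check).
     */
    #define __GFP_IO   0x40u
    #define __GFP_FS   0x80u
    #define GFP_NOIO   (0u)
    #define GFP_NOFS   (__GFP_IO)
    #define GFP_KERNEL (__GFP_IO | __GFP_FS)

    /* Same condition as throttle_vm_writeout(): back off unless the caller
     * may both start I/O and re-enter the filesystem. */
    static int must_back_off(unsigned int gfp_mask)
    {
            return (gfp_mask & (__GFP_FS | __GFP_IO)) != (__GFP_FS | __GFP_IO);
    }

    int main(void)
    {
            printf("GFP_KERNEL backs off: %d\n", must_back_off(GFP_KERNEL)); /* 0 */
            printf("GFP_NOFS   backs off: %d\n", must_back_off(GFP_NOFS));   /* 1 */
            printf("GFP_NOIO   backs off: %d\n", must_back_off(GFP_NOIO));   /* 1 */
            return 0;
    }
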
 
-
 /*
  * writeback at least _min_pages, and keep writing until the amount of dirty
  * memory is less than the background threshold, or until we're all clean.
@@ -439,15 +476,13 @@ static void wb_kupdate(unsigned long arg)
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
-               struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+       struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
        proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
-       if (dirty_writeback_interval) {
-               mod_timer(&wb_timer,
-                       jiffies + dirty_writeback_interval);
-               } else {
+       if (dirty_writeback_interval)
+               mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+       else
                del_timer(&wb_timer);
-       }
        return 0;
 }
 
@@ -517,7 +552,7 @@ static int __cpuinit
 ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
 {
        writeback_set_ratelimit();
-       return 0;
+       return NOTIFY_DONE;
 }
 
 static struct notifier_block __cpuinitdata ratelimit_nb = {
@@ -526,61 +561,52 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
 };
 
 /*
- * If the machine has a large highmem:lowmem ratio then scale back the default
- * dirty memory thresholds: allowing too much dirty highmem pins an excessive
- * number of buffer_heads.
+ * Called early on to tune the page writeback dirty limits.
+ *
+ * We used to scale dirty pages according to how total memory
+ * related to pages that could be allocated for buffers (by
+ * comparing nr_free_buffer_pages() to vm_total_pages).
+ *
+ * However, that was when we used "dirty_ratio" to scale with
+ * all memory, and we don't do that any more. "dirty_ratio"
+ * is now applied to total non-HIGHPAGE memory (by subtracting
+ * totalhigh_pages from vm_total_pages), and as such we can't
+ * get into the old insane situation any more where we had
+ * large amounts of dirty pages compared to a small amount of
+ * non-HIGHMEM memory.
+ *
+ * But we might still want to scale the dirty_ratio by how
+ * much memory the box has..
  */
 void __init page_writeback_init(void)
 {
-       long buffer_pages = nr_free_buffer_pages();
-       long correction;
-
-       correction = (100 * 4 * buffer_pages) / vm_total_pages;
-
-       if (correction < 100) {
-               dirty_background_ratio *= correction;
-               dirty_background_ratio /= 100;
-               vm_dirty_ratio *= correction;
-               vm_dirty_ratio /= 100;
-
-               if (dirty_background_ratio <= 0)
-                       dirty_background_ratio = 1;
-               if (vm_dirty_ratio <= 0)
-                       vm_dirty_ratio = 1;
-       }
        mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
        writeback_set_ratelimit();
        register_cpu_notifier(&ratelimit_nb);
 }
 
 /**
- * generic_writepages - walk the list of dirty pages of the given
- *                      address space and writepage() all of them.
- *
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
  *
- * This is a library function, which implements the writepages()
- * address_space_operation.
- *
- * If a page is already under I/O, generic_writepages() skips it, even
+ * If a page is already under I/O, write_cache_pages() skips it, even
  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
  * and msync() need to guarantee that all the data which was dirty at the time
  * the call was made get new I/O started against them.  If wbc->sync_mode is
  * WB_SYNC_ALL then we were called for data integrity and we must wait for
  * existing IO to complete.
- *
- * Derived from mpage_writepages() - if you fix this you should check that
- * also!
  */
-int generic_writepages(struct address_space *mapping,
-                      struct writeback_control *wbc)
+int write_cache_pages(struct address_space *mapping,
+                     struct writeback_control *wbc, writepage_t writepage,
+                     void *data)
 {
        struct backing_dev_info *bdi = mapping->backing_dev_info;
        int ret = 0;
        int done = 0;
-       int (*writepage)(struct page *page, struct writeback_control *wbc);
        struct pagevec pvec;
        int nr_pages;
        pgoff_t index;
@@ -593,12 +619,6 @@ int generic_writepages(struct address_space *mapping,
                return 0;
        }
 
-       writepage = mapping->a_ops->writepage;
-
-       /* deal with chardevs and other special file */
-       if (!writepage)
-               return 0;
-
        pagevec_init(&pvec, 0);
        if (wbc->range_cyclic) {
                index = mapping->writeback_index; /* Start from prev offset */
@@ -650,13 +670,7 @@ retry:
                                continue;
                        }
 
-                       ret = (*writepage)(page, wbc);
-                       if (ret) {
-                               if (ret == -ENOSPC)
-                                       set_bit(AS_ENOSPC, &mapping->flags);
-                               else
-                                       set_bit(AS_EIO, &mapping->flags);
-                       }
+                       ret = (*writepage)(page, wbc, data);
 
                        if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
                                unlock_page(page);
@@ -683,6 +697,38 @@ retry:
                mapping->writeback_index = index;
        return ret;
 }
+EXPORT_SYMBOL(write_cache_pages);
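
Exporting write_cache_pages() lets a filesystem reuse the whole dirty-page walk (tag lookup, pagevec handling, wbc->nr_to_write bookkeeping) and supply only its per-page work through the new writepage_t callback plus an opaque data pointer. A hedged sketch of such a caller; the myfs_* names and the context structure are hypothetical, while the in-tree user added below (generic_writepages) simply passes the mapping itself as data:

    #include <linux/fs.h>
    #include <linux/pagemap.h>
    #include <linux/writeback.h>

    /* Hypothetical filesystem-side usage; the myfs_* names are not part of
     * this patch. */
    struct myfs_writeback_ctx {
            struct address_space *mapping;
            unsigned long pages_submitted;
    };

    static int myfs_writepage_cb(struct page *page,
                                 struct writeback_control *wbc, void *data)
    {
            struct myfs_writeback_ctx *ctx = data;
            int ret;

            /* Per-page work; here we simply delegate to ->writepage, which
             * unlocks the page for us. */
            ret = ctx->mapping->a_ops->writepage(page, wbc);
            mapping_set_error(ctx->mapping, ret);
            ctx->pages_submitted++;
            return ret;
    }

    static int myfs_writepages(struct address_space *mapping,
                               struct writeback_control *wbc)
    {
            struct myfs_writeback_ctx ctx = {
                    .mapping = mapping,
            };

            /* write_cache_pages() does the tag lookup, pagevec handling,
             * page locking and wbc->nr_to_write bookkeeping; the callback
             * sees each dirty page already locked. */
            return write_cache_pages(mapping, wbc, myfs_writepage_cb, &ctx);
    }
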
+
+/*
+ * Function used by generic_writepages to call the real writepage
+ * function and set the mapping flags on error
+ */
+static int __writepage(struct page *page, struct writeback_control *wbc,
+                      void *data)
+{
+       struct address_space *mapping = data;
+       int ret = mapping->a_ops->writepage(page, wbc);
+       mapping_set_error(mapping, ret);
+       return ret;
+}
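
__writepage() replaces the open-coded AS_ENOSPC/AS_EIO handling that the earlier hunk removed with a call to mapping_set_error(). That helper is not visible in this diff; reconstructed from the removed lines, it presumably amounts to something like:

    /* Sketch reconstructed from the removed lines above; the real helper is
     * an inline in include/linux/pagemap.h. */
    static inline void mapping_set_error(struct address_space *mapping, int error)
    {
            if (error) {
                    if (error == -ENOSPC)
                            set_bit(AS_ENOSPC, &mapping->flags);
                    else
                            set_bit(AS_EIO, &mapping->flags);
            }
    }
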
+
+/**
+ * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ */
+int generic_writepages(struct address_space *mapping,
+                      struct writeback_control *wbc)
+{
+       /* deal with chardevs and other special file */
+       if (!mapping->a_ops->writepage)
+               return 0;
+
+       return write_cache_pages(mapping, wbc, __writepage, mapping);
+}
 
 EXPORT_SYMBOL(generic_writepages);
 
@@ -703,7 +749,6 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 
 /**
  * write_one_page - write out a single page and optionally wait on I/O
- *
  * @page: the page to write
  * @wait: if true, wait on writeout
  *
@@ -742,6 +787,16 @@ int write_one_page(struct page *page, int wait)
 EXPORT_SYMBOL(write_one_page);
 
 /*
+ * For address_spaces which do not use buffers nor write back.
+ */
+int __set_page_dirty_no_writeback(struct page *page)
+{
+       if (!PageDirty(page))
+               SetPageDirty(page);
+       return 0;
+}
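
__set_page_dirty_no_writeback() only sets PG_dirty: no radix-tree tagging and no dirty accounting, because such pages are never written back. A hedged sketch of how a purely memory-backed filesystem might wire it into its address_space_operations; the myfs_aops name is hypothetical and the simple_* helpers are the stock libfs ones of this era:

    #include <linux/fs.h>
    #include <linux/mm.h>

    /* Hypothetical wiring for a ramfs-like, memory-backed mapping. */
    static struct address_space_operations myfs_aops = {
            .readpage       = simple_readpage,
            .prepare_write  = simple_prepare_write,
            .commit_write   = simple_commit_write,
            .set_page_dirty = __set_page_dirty_no_writeback,
    };
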
+
+/*
  * For address_spaces which do not use buffers.  Just tag the page as dirty in
  * its radix tree.
  *
@@ -769,6 +824,7 @@ int __set_page_dirty_nobuffers(struct page *page)
                mapping2 = page_mapping(page);
                if (mapping2) { /* Race with truncate? */
                        BUG_ON(mapping2 != mapping);
+                       WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
                        if (mapping_cap_account_dirty(mapping)) {
                                __inc_zone_page_state(page, NR_FILE_DIRTY);
                                task_io_account_write(PAGE_CACHE_SIZE);
@@ -845,38 +901,6 @@ int set_page_dirty_lock(struct page *page)
 EXPORT_SYMBOL(set_page_dirty_lock);
 
 /*
- * Clear a page's dirty flag, while caring for dirty memory accounting. 
- * Returns true if the page was previously dirty.
- */
-int test_clear_page_dirty(struct page *page)
-{
-       struct address_space *mapping = page_mapping(page);
-       unsigned long flags;
-
-       if (!mapping)
-               return TestClearPageDirty(page);
-
-       write_lock_irqsave(&mapping->tree_lock, flags);
-       if (TestClearPageDirty(page)) {
-               radix_tree_tag_clear(&mapping->page_tree,
-                               page_index(page), PAGECACHE_TAG_DIRTY);
-               write_unlock_irqrestore(&mapping->tree_lock, flags);
-               /*
-                * We can continue to use `mapping' here because the
-                * page is locked, which pins the address_space
-                */
-               if (mapping_cap_account_dirty(mapping)) {
-                       page_mkclean(page);
-                       dec_zone_page_state(page, NR_FILE_DIRTY);
-               }
-               return 1;
-       }
-       write_unlock_irqrestore(&mapping->tree_lock, flags);
-       return 0;
-}
-EXPORT_SYMBOL(test_clear_page_dirty);
-
-/*
  * Clear a page's dirty flag, while caring for dirty memory accounting.
  * Returns true if the page was previously dirty.
  *
@@ -894,17 +918,54 @@ int clear_page_dirty_for_io(struct page *page)
 {
        struct address_space *mapping = page_mapping(page);
 
-       if (!mapping)
-               return TestClearPageDirty(page);
+       BUG_ON(!PageLocked(page));
 
-       if (TestClearPageDirty(page)) {
-               if (mapping_cap_account_dirty(mapping)) {
-                       page_mkclean(page);
+       ClearPageReclaim(page);
+       if (mapping && mapping_cap_account_dirty(mapping)) {
+               /*
+                * Yes, Virginia, this is indeed insane.
+                *
+                * We use this sequence to make sure that
+                *  (a) we account for dirty stats properly
+                *  (b) we tell the low-level filesystem to
+                *      mark the whole page dirty if it was
+                *      dirty in a pagetable. Only to then
+                *  (c) clean the page again and return 1 to
+                *      cause the writeback.
+                *
+                * This way we avoid all nasty races with the
+                * dirty bit in multiple places and clearing
+                * them concurrently from different threads.
+                *
+                * Note! Normally the "set_page_dirty(page)"
+                * has no effect on the actual dirty bit - since
+                * that will already usually be set. But we
+                * need the side effects, and it can help us
+                * avoid races.
+                *
+                * We basically use the page "master dirty bit"
+                * as a serialization point for all the different
+                * threads doing their things.
+                */
+               if (page_mkclean(page))
+                       set_page_dirty(page);
+               /*
+                * We carefully synchronise fault handlers against
+                * installing a dirty pte and marking the page dirty
+                * at this point. We do this by having them hold the
+                * page lock at some point after installing their
+                * pte, but before marking the page dirty.
+                * Pages are always locked coming in here, so we get
+                * the desired exclusion. See mm/memory.c:do_wp_page()
+                * for more comments.
+                */
+               if (TestClearPageDirty(page)) {
                        dec_zone_page_state(page, NR_FILE_DIRTY);
+                       return 1;
                }
-               return 1;
+               return 0;
        }
-       return 0;
+       return TestClearPageDirty(page);
 }
 EXPORT_SYMBOL(clear_page_dirty_for_io);
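
The BUG_ON above makes the locking contract explicit: callers must hold the page lock so that clear_page_dirty_for_io() is serialised against fault handlers that dirty the page. A hedged sketch of that calling convention from a writeback path's point of view; myfs_write_one_locked_page() is hypothetical and the I/O submission and completion steps are placeholders, not code from this patch:

    #include <linux/mm.h>
    #include <linux/page-flags.h>
    #include <linux/pagemap.h>

    /* Hypothetical per-page writeback step illustrating the contract. */
    static int myfs_write_one_locked_page(struct page *page)
    {
            BUG_ON(!PageLocked(page));      /* same contract enforced above */

            if (!clear_page_dirty_for_io(page)) {
                    /* Someone else cleaned it already; nothing to write. */
                    unlock_page(page);
                    return 0;
            }

            set_page_writeback(page);       /* now bumps NR_WRITEBACK, below */
            unlock_page(page);

            /*
             * Submit the I/O here (bio/buffer_head details omitted).  When
             * it completes, the completion handler ends writeback, which
             * now drops NR_WRITEBACK again:
             */
            end_page_writeback(page);
            return 0;
    }
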
 
@@ -926,6 +987,8 @@ int test_clear_page_writeback(struct page *page)
        } else {
                ret = TestClearPageWriteback(page);
        }
+       if (ret)
+               dec_zone_page_state(page, NR_WRITEBACK);
        return ret;
 }
 
@@ -951,6 +1014,8 @@ int test_set_page_writeback(struct page *page)
        } else {
                ret = TestSetPageWriteback(page);
        }
+       if (!ret)
+               inc_zone_page_state(page, NR_WRITEBACK);
        return ret;
 
 }
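
With the inc/dec moved out of the page-flag macros and into these two helpers, NR_WRITEBACK is maintained like any other per-zone VM counter. A hedged sketch of how the aggregate is typically read back; the actual /proc/meminfo plumbing lives in fs/proc and is not part of this patch:

    #include <linux/kernel.h>
    #include <linux/mmzone.h>
    #include <linux/vmstat.h>

    /* Sketch: reading the counters these helpers now maintain; the real
     * /proc/meminfo code in fs/proc does the equivalent. */
    static void report_writeback_pages(void)
    {
            unsigned long nr_dirty     = global_page_state(NR_FILE_DIRTY);
            unsigned long nr_writeback = global_page_state(NR_WRITEBACK);

            printk(KERN_DEBUG "Dirty: %lu kB, Writeback: %lu kB\n",
                   nr_dirty << (PAGE_SHIFT - 10),
                   nr_writeback << (PAGE_SHIFT - 10));
    }
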