diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bbcb916..74dc57c 100644
@@ -7,7 +7,7 @@
  * Contains functions related to writing back dirty pages at the
  * address_space level.
  *
- * 10Apr2002   akpm@zip.com.au
+ * 10Apr2002   Andrew Morton
  *             Initial version
  */
 
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void)
 int dirty_background_ratio = 5;
 
 /*
+ * dirty_background_bytes starts at 0 (disabled) so that it is a function of
+ * dirty_background_ratio * the amount of dirtyable memory
+ */
+unsigned long dirty_background_bytes;
+
+/*
  * free highmem will not be subtracted from the total free memory
  * for calculating free ratios if vm_highmem_is_dirtyable is true
  */
@@ -80,6 +86,12 @@ int vm_highmem_is_dirtyable;
 int vm_dirty_ratio = 10;
 
 /*
+ * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
+ * vm_dirty_ratio * the amount of dirtyable memory
+ */
+unsigned long vm_dirty_bytes;
+
+/*
  * The interval between `kupdate'-style writebacks, in jiffies
  */
 int dirty_writeback_interval = 5 * HZ;
@@ -126,8 +138,6 @@ static void background_writeout(unsigned long _min_pages);
 static struct prop_descriptor vm_completions;
 static struct prop_descriptor vm_dirties;
 
-static unsigned long determine_dirtyable_memory(void);
-
 /*
  * couple the period to the dirty_ratio:
  *
@@ -137,23 +147,74 @@ static int calc_period_shift(void)
 {
        unsigned long dirty_total;
 
-       dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+       if (vm_dirty_bytes)
+               dirty_total = vm_dirty_bytes / PAGE_SIZE;
+       else
+               dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
+                               100;
        return 2 + ilog2(dirty_total - 1);
 }
 
 /*
- * update the period when the dirty ratio changes.
+ * update the period when the dirty threshold changes.
  */
+static void update_completion_period(void)
+{
+       int shift = calc_period_shift();
+       prop_change_shift(&vm_completions, shift);
+       prop_change_shift(&vm_dirties, shift);
+}
+
+int dirty_background_ratio_handler(struct ctl_table *table, int write,
+               struct file *filp, void __user *buffer, size_t *lenp,
+               loff_t *ppos)
+{
+       int ret;
+
+       ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+       if (ret == 0 && write)
+               dirty_background_bytes = 0;
+       return ret;
+}
+
+int dirty_background_bytes_handler(struct ctl_table *table, int write,
+               struct file *filp, void __user *buffer, size_t *lenp,
+               loff_t *ppos)
+{
+       int ret;
+
+       ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+       if (ret == 0 && write)
+               dirty_background_ratio = 0;
+       return ret;
+}
+
 int dirty_ratio_handler(struct ctl_table *table, int write,
                struct file *filp, void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
        int old_ratio = vm_dirty_ratio;
-       int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+       int ret;
+
+       ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
        if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
-               int shift = calc_period_shift();
-               prop_change_shift(&vm_completions, shift);
-               prop_change_shift(&vm_dirties, shift);
+               update_completion_period();
+               vm_dirty_bytes = 0;
+       }
+       return ret;
+}
+
+int dirty_bytes_handler(struct ctl_table *table, int write,
+               struct file *filp, void __user *buffer, size_t *lenp,
+               loff_t *ppos)
+{
+       unsigned long old_bytes = vm_dirty_bytes;
+       int ret;
+
+       ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+       if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
+               update_completion_period();
+               vm_dirty_ratio = 0;
        }
        return ret;
 }
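
The four handlers above pair each ratio with its byte-based override: a successful write to one member of a pair zeroes the other, and any change to the foreground threshold recomputes the completion period (with vm_dirty_bytes = 64 MiB and 4 KiB pages, dirty_total is 16384 and calc_period_shift() yields 2 + ilog2(16383) = 15). A minimal user-space sketch of exercising the new sysctl, assuming the /proc/sys/vm/dirty_bytes path this series introduces; the 64 MiB value is arbitrary:

#include <stdio.h>

int main(void)
{
        /* Write an absolute limit; the kernel's dirty_bytes_handler
         * zeroes vm_dirty_ratio, so only the byte limit stays active. */
        FILE *f = fopen("/proc/sys/vm/dirty_bytes", "w");

        if (!f) {
                perror("dirty_bytes");
                return 1;
        }
        fprintf(f, "%lu\n", 64UL << 20);
        fclose(f);
        return 0;
}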
@@ -168,7 +230,17 @@ static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
                              bdi->max_prop_frac);
 }
 
-static inline void task_dirty_inc(struct task_struct *tsk)
+void bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __bdi_writeout_inc(bdi);
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(bdi_writeout_inc);
+
+void task_dirty_inc(struct task_struct *tsk)
 {
        prop_inc_single(&vm_dirties, &tsk->dirties);
 }
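
bdi_writeout_inc() wraps __bdi_writeout_inc() with interrupts disabled and is exported so writeback paths outside core mm (FUSE was the motivating user) can credit completed writes to a bdi's completion proportion. A hypothetical sketch of such a caller; my_fs_end_write and its use of page->mapping are illustrative, not part of this patch:

#include <linux/backing-dev.h>
#include <linux/pagemap.h>

/* Hypothetical completion handler in an external filesystem: clear
 * the writeback bit, then credit the page to this bdi's share of
 * the dirty limits. */
static void my_fs_end_write(struct page *page)
{
        struct backing_dev_info *bdi = page->mapping->backing_dev_info;

        end_page_writeback(page);
        bdi_writeout_inc(bdi);
}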
@@ -201,7 +273,8 @@ clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
        avail_dirty = dirty -
                (global_page_state(NR_FILE_DIRTY) +
                 global_page_state(NR_WRITEBACK) +
-                global_page_state(NR_UNSTABLE_NFS));
+                global_page_state(NR_UNSTABLE_NFS) +
+                global_page_state(NR_WRITEBACK_TEMP));
 
        if (avail_dirty < 0)
                avail_dirty = 0;
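
NR_WRITEBACK_TEMP counts pages being written back through temporary buffers (again a FUSE notion); adding it here stops such pages from being treated as headroom when clipping a bdi's dirty limit. A hypothetical sketch of the producer-side accounting; my_fs_start_temp_writeback is illustrative:

#include <linux/vmstat.h>

/* Hypothetical: a filesystem that copies dirty data into a temporary
 * page before issuing I/O accounts that page, so the limits above
 * still see it. */
static void my_fs_start_temp_writeback(struct page *tmp_page)
{
        inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
}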
@@ -320,9 +393,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
                struct zone *z =
                        &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
 
-               x += zone_page_state(z, NR_FREE_PAGES)
-                       + zone_page_state(z, NR_INACTIVE)
-                       + zone_page_state(z, NR_ACTIVE);
+               x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
        }
        /*
         * Make sure that the number of highmem pages is never larger
@@ -336,13 +407,17 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
 #endif
 }
 
-static unsigned long determine_dirtyable_memory(void)
+/**
+ * determine_dirtyable_memory - amount of memory that may be used
+ *
+ * Returns the number of pages that can currently be freed and used
+ * by the kernel for direct mappings.
+ */
+unsigned long determine_dirtyable_memory(void)
 {
        unsigned long x;
 
-       x = global_page_state(NR_FREE_PAGES)
-               + global_page_state(NR_INACTIVE)
-               + global_page_state(NR_ACTIVE);
+       x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
 
        if (!vm_highmem_is_dirtyable)
                x -= highmem_dirtyable_memory(x);
@@ -351,26 +426,32 @@ static unsigned long determine_dirtyable_memory(void)
 }
 
 void
-get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
-                struct backing_dev_info *bdi)
+get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
+                unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
 {
-       int background_ratio;           /* Percentages */
-       int dirty_ratio;
-       long background;
-       long dirty;
+       unsigned long background;
+       unsigned long dirty;
        unsigned long available_memory = determine_dirtyable_memory();
        struct task_struct *tsk;
 
-       dirty_ratio = vm_dirty_ratio;
-       if (dirty_ratio < 5)
-               dirty_ratio = 5;
+       if (vm_dirty_bytes)
+               dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+       else {
+               int dirty_ratio;
 
-       background_ratio = dirty_background_ratio;
-       if (background_ratio >= dirty_ratio)
-               background_ratio = dirty_ratio / 2;
+               dirty_ratio = vm_dirty_ratio;
+               if (dirty_ratio < 5)
+                       dirty_ratio = 5;
+               dirty = (dirty_ratio * available_memory) / 100;
+       }
+
+       if (dirty_background_bytes)
+               background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+       else
+               background = (dirty_background_ratio * available_memory) / 100;
 
-       background = (background_ratio * available_memory) / 100;
-       dirty = (dirty_ratio * available_memory) / 100;
+       if (background >= dirty)
+               background = dirty / 2;
        tsk = current;
        if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
                background += background / 4;
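
Byte-based limits now take precedence over the ratios, and the old "background_ratio >= dirty_ratio" clamp becomes a direct comparison of the computed page counts. A stand-alone user-space mirror of the selection logic, with assumed values, to show the arithmetic:

#include <stdio.h>

#define PAGE_SZ 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long available_memory = 262144;        /* pages: 1 GiB */
        unsigned long vm_dirty_bytes = 64UL << 20;      /* overrides ratio */
        unsigned long dirty_background_bytes = 0;       /* ratio applies */
        int vm_dirty_ratio = 10, dirty_background_ratio = 5;
        unsigned long dirty, background;

        if (vm_dirty_bytes)
                dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SZ);  /* 16384 */
        else
                dirty = vm_dirty_ratio * available_memory / 100;

        if (dirty_background_bytes)
                background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SZ);
        else
                background = dirty_background_ratio * available_memory / 100;

        if (background >= dirty)        /* not taken here: 13107 < 16384 */
                background = dirty / 2;

        printf("dirty=%lu background=%lu (pages)\n", dirty, background);
        return 0;
}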
@@ -412,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping)
 {
        long nr_reclaimable, bdi_nr_reclaimable;
        long nr_writeback, bdi_nr_writeback;
-       long background_thresh;
-       long dirty_thresh;
-       long bdi_thresh;
+       unsigned long background_thresh;
+       unsigned long dirty_thresh;
+       unsigned long bdi_thresh;
        unsigned long pages_written = 0;
        unsigned long write_chunk = sync_writeback_pages();
 
@@ -569,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
 
 void throttle_vm_writeout(gfp_t gfp_mask)
 {
-       long background_thresh;
-       long dirty_thresh;
+       unsigned long background_thresh;
+       unsigned long dirty_thresh;
 
         for ( ; ; ) {
                get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
@@ -613,8 +694,8 @@ static void background_writeout(unsigned long _min_pages)
        };
 
        for ( ; ; ) {
-               long background_thresh;
-               long dirty_thresh;
+               unsigned long background_thresh;
+               unsigned long dirty_thresh;
 
                get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
                if (global_page_state(NR_FILE_DIRTY) +
@@ -857,10 +938,13 @@ int write_cache_pages(struct address_space *mapping,
        int done = 0;
        struct pagevec pvec;
        int nr_pages;
+       pgoff_t uninitialized_var(writeback_index);
        pgoff_t index;
        pgoff_t end;            /* Inclusive */
-       int scanned = 0;
+       pgoff_t done_index;
+       int cycled;
        int range_whole = 0;
+       long nr_to_write = wbc->nr_to_write;
 
        if (wbc->nonblocking && bdi_write_congested(bdi)) {
                wbc->encountered_congestion = 1;
@@ -869,82 +953,149 @@ int write_cache_pages(struct address_space *mapping,
 
        pagevec_init(&pvec, 0);
        if (wbc->range_cyclic) {
-               index = mapping->writeback_index; /* Start from prev offset */
+               writeback_index = mapping->writeback_index; /* prev offset */
+               index = writeback_index;
+               if (index == 0)
+                       cycled = 1;
+               else
+                       cycled = 0;
                end = -1;
        } else {
                index = wbc->range_start >> PAGE_CACHE_SHIFT;
                end = wbc->range_end >> PAGE_CACHE_SHIFT;
                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                        range_whole = 1;
-               scanned = 1;
+               cycled = 1; /* ignore range_cyclic tests */
        }
 retry:
-       while (!done && (index <= end) &&
-              (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-                                             PAGECACHE_TAG_DIRTY,
-                                             min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
-               unsigned i;
+       done_index = index;
+       while (!done && (index <= end)) {
+               int i;
+
+               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                             PAGECACHE_TAG_DIRTY,
+                             min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+               if (nr_pages == 0)
+                       break;
 
-               scanned = 1;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
 
                        /*
-                        * At this point we hold neither mapping->tree_lock nor
-                        * lock on the page itself: the page may be truncated or
-                        * invalidated (changing page->mapping to NULL), or even
-                        * swizzled back from swapper_space to tmpfs file
-                        * mapping
+                        * At this point, the page may be truncated or
+                        * invalidated (changing page->mapping to NULL), or
+                        * even swizzled back from swapper_space to tmpfs file
+                        * mapping. However, page->index will not change
+                        * because we have a reference on the page.
                         */
+                       if (page->index > end) {
+                               /*
+                                * can't be range_cyclic (1st pass) because
+                                * end == -1 in that case.
+                                */
+                               done = 1;
+                               break;
+                       }
+
+                       done_index = page->index + 1;
+
                        lock_page(page);
 
+                       /*
+                        * Page truncated or invalidated. We can freely skip it
+                        * then, even for data integrity operations: the page
+                        * has disappeared concurrently, so there could be no
+                        * real expectation of this data integrity operation
+                        * even if there is now a new, dirty page at the same
+                        * pagecache address.
+                        */
                        if (unlikely(page->mapping != mapping)) {
+continue_unlock:
                                unlock_page(page);
                                continue;
                        }
 
-                       if (!wbc->range_cyclic && page->index > end) {
-                               done = 1;
-                               unlock_page(page);
-                               continue;
+                       if (!PageDirty(page)) {
+                               /* someone wrote it for us */
+                               goto continue_unlock;
                        }
 
-                       if (wbc->sync_mode != WB_SYNC_NONE)
-                               wait_on_page_writeback(page);
-
-                       if (PageWriteback(page) ||
-                           !clear_page_dirty_for_io(page)) {
-                               unlock_page(page);
-                               continue;
+                       if (PageWriteback(page)) {
+                               if (wbc->sync_mode != WB_SYNC_NONE)
+                                       wait_on_page_writeback(page);
+                               else
+                                       goto continue_unlock;
                        }
 
-                       ret = (*writepage)(page, wbc, data);
+                       BUG_ON(PageWriteback(page));
+                       if (!clear_page_dirty_for_io(page))
+                               goto continue_unlock;
 
-                       if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
-                               unlock_page(page);
-                               ret = 0;
+                       ret = (*writepage)(page, wbc, data);
+                       if (unlikely(ret)) {
+                               if (ret == AOP_WRITEPAGE_ACTIVATE) {
+                                       unlock_page(page);
+                                       ret = 0;
+                               } else {
+                                       /*
+                                        * done_index is set past this page,
+                                        * so media errors will not choke
+                                        * background writeout for the entire
+                                        * file. This has consequences for
+                                        * range_cyclic semantics (ie. it may
+                                        * not be suitable for data integrity
+                                        * writeout).
+                                        */
+                                       done = 1;
+                                       break;
+                               }
+                       }
+
+                       if (nr_to_write > 0) {
+                               nr_to_write--;
+                               if (nr_to_write == 0 &&
+                                   wbc->sync_mode == WB_SYNC_NONE) {
+                                       /*
+                                        * We stop writing back only if we are
+                                        * not doing integrity sync. In case of
+                                        * integrity sync we have to keep going
+                                        * because someone may be concurrently
+                                        * dirtying pages, and we might have
+                                        * synced a lot of newly appeared dirty
+                                        * pages, but have not synced all of the
+                                        * old dirty pages.
+                                        */
+                                       done = 1;
+                                       break;
+                               }
                        }
-                       if (ret || (--(wbc->nr_to_write) <= 0))
-                               done = 1;
+
                        if (wbc->nonblocking && bdi_write_congested(bdi)) {
                                wbc->encountered_congestion = 1;
                                done = 1;
+                               break;
                        }
                }
                pagevec_release(&pvec);
                cond_resched();
        }
-       if (!scanned && !done) {
+       if (!cycled && !done) {
                /*
+                * range_cyclic:
                 * We hit the last page and there is more work to be done: wrap
                 * back to the start of the file
                 */
-               scanned = 1;
+               cycled = 1;
                index = 0;
+               end = writeback_index - 1;
                goto retry;
        }
-       if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
-               mapping->writeback_index = index;
+       if (!wbc->no_nrwrite_index_update) {
+               if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
+                       mapping->writeback_index = done_index;
+               wbc->nr_to_write = nr_to_write;
+       }
+
        return ret;
 }
 EXPORT_SYMBOL(write_cache_pages);
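
The rewritten walker is still the backend for generic ->writepages: done_index records the restart point, cycled guarantees a range_cyclic pass wraps at most once (the second pass ends at writeback_index - 1), and per-page errors now stop the loop instead of silently continuing. A hypothetical sketch of a filesystem delegating to it; my_writepage and my_writepages are illustrative:

#include <linux/writeback.h>
#include <linux/pagemap.h>

/* Hypothetical per-page callback: a real one would build and submit
 * I/O; it must unlock the page, per writepage semantics. */
static int my_writepage(struct page *page, struct writeback_control *wbc,
                        void *data)
{
        set_page_writeback(page);
        unlock_page(page);
        /* ... submit the I/O; pretend it completed synchronously ... */
        end_page_writeback(page);
        return 0;
}

static int my_writepages(struct address_space *mapping,
                         struct writeback_control *wbc)
{
        return write_cache_pages(mapping, wbc, my_writepage, NULL);
}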
@@ -1070,7 +1221,7 @@ int __set_page_dirty_nobuffers(struct page *page)
                if (!mapping)
                        return 1;
 
-               write_lock_irq(&mapping->tree_lock);
+               spin_lock_irq(&mapping->tree_lock);
                mapping2 = page_mapping(page);
                if (mapping2) { /* Race with truncate? */
                        BUG_ON(mapping2 != mapping);
@@ -1079,12 +1230,13 @@ int __set_page_dirty_nobuffers(struct page *page)
                                __inc_zone_page_state(page, NR_FILE_DIRTY);
                                __inc_bdi_stat(mapping->backing_dev_info,
                                                BDI_RECLAIMABLE);
+                               task_dirty_inc(current);
                                task_io_account_write(PAGE_CACHE_SIZE);
                        }
                        radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
                }
-               write_unlock_irq(&mapping->tree_lock);
+               spin_unlock_irq(&mapping->tree_lock);
                if (mapping->host) {
                        /* !PageAnon && !swapper_space */
                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1111,7 +1263,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
  * If the mapping doesn't provide a set_page_dirty a_op, then
  * just fall through and assume that it wants buffer_heads.
  */
-static int __set_page_dirty(struct page *page)
+int set_page_dirty(struct page *page)
 {
        struct address_space *mapping = page_mapping(page);
 
@@ -1129,14 +1281,6 @@ static int __set_page_dirty(struct page *page)
        }
        return 0;
 }
-
-int set_page_dirty(struct page *page)
-{
-       int ret = __set_page_dirty(page);
-       if (ret)
-               task_dirty_inc(current);
-       return ret;
-}
 EXPORT_SYMBOL(set_page_dirty);
 
 /*
@@ -1240,7 +1384,7 @@ int test_clear_page_writeback(struct page *page)
                struct backing_dev_info *bdi = mapping->backing_dev_info;
                unsigned long flags;
 
-               write_lock_irqsave(&mapping->tree_lock, flags);
+               spin_lock_irqsave(&mapping->tree_lock, flags);
                ret = TestClearPageWriteback(page);
                if (ret) {
                        radix_tree_tag_clear(&mapping->page_tree,
@@ -1251,7 +1395,7 @@ int test_clear_page_writeback(struct page *page)
                                __bdi_writeout_inc(bdi);
                        }
                }
-               write_unlock_irqrestore(&mapping->tree_lock, flags);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
        } else {
                ret = TestClearPageWriteback(page);
        }
@@ -1269,7 +1413,7 @@ int test_set_page_writeback(struct page *page)
                struct backing_dev_info *bdi = mapping->backing_dev_info;
                unsigned long flags;
 
-               write_lock_irqsave(&mapping->tree_lock, flags);
+               spin_lock_irqsave(&mapping->tree_lock, flags);
                ret = TestSetPageWriteback(page);
                if (!ret) {
                        radix_tree_tag_set(&mapping->page_tree,
@@ -1282,7 +1426,7 @@ int test_set_page_writeback(struct page *page)
                        radix_tree_tag_clear(&mapping->page_tree,
                                                page_index(page),
                                                PAGECACHE_TAG_DIRTY);
-               write_unlock_irqrestore(&mapping->tree_lock, flags);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
        } else {
                ret = TestSetPageWriteback(page);
        }