#include <linux/pagevec.h>
/*
- * The maximum number of pages to writeout in a single bdflush/kupdate
- * operation. We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode. Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES 1024
-
-/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
*/
/*
* When balance_dirty_pages decides that the caller needs to perform some
* non-background writeback, this is how many pages it will attempt to write.
- * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
+ * It should be somewhat larger than dirtied pages to ensure that reasonably
* large amounts of I/O are submitted.
*/
-static inline long sync_writeback_pages(void)
+static inline long sync_writeback_pages(unsigned long dirtied)
{
- return ratelimit_pages + ratelimit_pages / 2;
+ if (dirtied < ratelimit_pages)
+ dirtied = ratelimit_pages;
+
+ return dirtied + dirtied / 2;
}
/* The following parameters are exported via /proc/sys/vm */
/*
- * Start background writeback (via pdflush) at this percentage
+ * Start background writeback (via writeback threads) at this percentage
*/
-int dirty_background_ratio = 5;
+int dirty_background_ratio = 10;
/*
* dirty_background_bytes starts at 0 (disabled) so that it is a function of
/*
* The generator of dirty data starts writeback at this percentage
*/
-int vm_dirty_ratio = 10;
+int vm_dirty_ratio = 20;
/*
* vm_dirty_bytes starts at 0 (disabled) so that it is a function of
unsigned long vm_dirty_bytes;
/*
- * The interval between `kupdate'-style writebacks, in jiffies
+ * The interval between `kupdate'-style writebacks
*/
-int dirty_writeback_interval = 5 * HZ;
+unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
/*
- * The longest number of jiffies for which data is allowed to remain dirty
+ * The longest time for which data is allowed to remain dirty
*/
-int dirty_expire_interval = 30 * HZ;
+unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
/*
* Flag that makes the machine dump writes/reads and block dirtyings.
/* End of sysctl-exported parameters */
-static void background_writeout(unsigned long _min_pages);
-
/*
* Scale the writeback cache size proportional to the relative writeout speeds.
*
}
int dirty_background_ratio_handler(struct ctl_table *table, int write,
- struct file *filp, void __user *buffer, size_t *lenp,
+ void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
- ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
dirty_background_bytes = 0;
return ret;
}
int dirty_background_bytes_handler(struct ctl_table *table, int write,
- struct file *filp, void __user *buffer, size_t *lenp,
+ void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
- ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
dirty_background_ratio = 0;
return ret;
}
int dirty_ratio_handler(struct ctl_table *table, int write,
- struct file *filp, void __user *buffer, size_t *lenp,
+ void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int old_ratio = vm_dirty_ratio;
int ret;
- ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
update_completion_period();
vm_dirty_bytes = 0;
int dirty_bytes_handler(struct ctl_table *table, int write,
- struct file *filp, void __user *buffer, size_t *lenp,
+ void __user *buffer, size_t *lenp,
loff_t *ppos)
{
unsigned long old_bytes = vm_dirty_bytes;
int ret;
- ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
update_completion_period();
vm_dirty_ratio = 0;
* This avoids exceeding the total dirty_limit when the floating averages
* fluctuate too quickly.
*/
-static void
-clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
+ unsigned long dirty, unsigned long *pbdi_dirty)
{
- long avail_dirty;
+ unsigned long avail_dirty;
- avail_dirty = dirty -
- (global_page_state(NR_FILE_DIRTY) +
+ avail_dirty = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_WRITEBACK) +
global_page_state(NR_UNSTABLE_NFS) +
- global_page_state(NR_WRITEBACK_TEMP));
+ global_page_state(NR_WRITEBACK_TEMP);
- if (avail_dirty < 0)
+ if (avail_dirty < dirty)
+ avail_dirty = dirty - avail_dirty;
+ else
avail_dirty = 0;
avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
*
* dirty -= (dirty/8) * p_{t}
*/
-static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
+static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
{
long numerator, denominator;
- long dirty = *pdirty;
+ unsigned long dirty = *pdirty;
u64 inv = dirty >> 3;
task_dirties_fraction(tsk, &numerator, &denominator);
/*
*
*/
-static DEFINE_SPINLOCK(bdi_lock);
static unsigned int bdi_min_ratio;
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
int ret = 0;
- unsigned long flags;
- spin_lock_irqsave(&bdi_lock, flags);
+ spin_lock_bh(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
} else {
ret = -EINVAL;
}
}
- spin_unlock_irqrestore(&bdi_lock, flags);
+ spin_unlock_bh(&bdi_lock);
return ret;
}
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
- unsigned long flags;
int ret = 0;
if (max_ratio > 100)
return -EINVAL;
- spin_lock_irqsave(&bdi_lock, flags);
+ spin_lock_bh(&bdi_lock);
if (bdi->min_ratio > max_ratio) {
ret = -EINVAL;
} else {
bdi->max_ratio = max_ratio;
bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
}
- spin_unlock_irqrestore(&bdi_lock, flags);
+ spin_unlock_bh(&bdi_lock);
return ret;
}
struct zone *z =
&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
- x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
+ x += zone_page_state(z, NR_FREE_PAGES) +
+ zone_reclaimable_pages(z);
}
/*
* Make sure that the number of highmem pages is never larger
{
unsigned long x;
- x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
+ x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
* the caller to perform writeback if the system is over `vm_dirty_ratio'.
- * If we're over `background_thresh' then pdflush is woken to perform some
- * writeout.
+ * If we're over `background_thresh' then the writeback threads are woken to
+ * perform some writeout.
*/
-static void balance_dirty_pages(struct address_space *mapping)
+static void balance_dirty_pages(struct address_space *mapping,
+ unsigned long write_chunk)
{
long nr_reclaimable, bdi_nr_reclaimable;
long nr_writeback, bdi_nr_writeback;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
unsigned long pages_written = 0;
- unsigned long write_chunk = sync_writeback_pages();
+ unsigned long pause = 1;
struct backing_dev_info *bdi = mapping->backing_dev_info;
* filesystems (i.e. NFS) in which data may have been
* written to the server's write cache, but has not yet
* been flushed to permanent storage.
+ * Only move pages to writeback if this bdi is over its
+ * threshold otherwise wait until the disk writes catch
+ * up.
*/
- if (bdi_nr_reclaimable) {
- writeback_inodes(&wbc);
+ if (bdi_nr_reclaimable > bdi_thresh) {
+ writeback_inodes_wbc(&wbc);
pages_written += write_chunk - wbc.nr_to_write;
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);
if (pages_written >= write_chunk)
break; /* We've done our duty */
- congestion_wait(WRITE, HZ/10);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ io_schedule_timeout(pause);
+
+ /*
+ * Increase the delay for each loop, up to our previous
+ * default of taking a 100ms nap.
+ */
+ pause <<= 1;
+ if (pause > HZ / 10)
+ pause = HZ / 10;
}
if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
- return; /* pdflush is already working this queue */
+ return;
/*
* In laptop mode, we wait until hitting the higher threshold before
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
- + global_page_state(NR_UNSTABLE_NFS)
+ (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
+ + global_page_state(NR_UNSTABLE_NFS))
> background_thresh)))
- pdflush_operation(background_writeout, 0);
+ bdi_start_writeback(bdi, NULL, 0);
}
void set_page_dirty_balance(struct page *page, int page_mkwrite)
}
}
+static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
+
/**
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
* @mapping: address_space which was dirtied
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
{
- static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
unsigned long ratelimit;
unsigned long *p;
* tasks in balance_dirty_pages(). Period.
*/
preempt_disable();
- p = &__get_cpu_var(ratelimits);
+ p = &__get_cpu_var(bdp_ratelimits);
*p += nr_pages_dirtied;
if (unlikely(*p >= ratelimit)) {
+ ratelimit = sync_writeback_pages(*p);
*p = 0;
preempt_enable();
- balance_dirty_pages(mapping);
+ balance_dirty_pages(mapping, ratelimit);
return;
}
preempt_enable();
if (global_page_state(NR_UNSTABLE_NFS) +
global_page_state(NR_WRITEBACK) <= dirty_thresh)
break;
- congestion_wait(WRITE, HZ/10);
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
/*
* The caller might hold locks which can prevent IO completion
}
}
-/*
- * writeback at least _min_pages, and keep writing until the amount of dirty
- * memory is less than the background threshold, or until we're all clean.
- */
-static void background_writeout(unsigned long _min_pages)
-{
- long min_pages = _min_pages;
- struct writeback_control wbc = {
- .bdi = NULL,
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .nr_to_write = 0,
- .nonblocking = 1,
- .range_cyclic = 1,
- };
-
- for ( ; ; ) {
- unsigned long background_thresh;
- unsigned long dirty_thresh;
-
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) < background_thresh
- && min_pages <= 0)
- break;
- wbc.more_io = 0;
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- wbc.pages_skipped = 0;
- writeback_inodes(&wbc);
- min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
- /* Wrote less than expected */
- if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(WRITE, HZ/10);
- else
- break;
- }
- }
-}
-
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
- * -1 if all pdflush threads were busy.
- */
-int wakeup_pdflush(long nr_pages)
-{
- if (nr_pages == 0)
- nr_pages = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- return pdflush_operation(background_writeout, nr_pages);
-}
-
-static void wb_timer_fn(unsigned long unused);
static void laptop_timer_fn(unsigned long unused);
-static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
/*
- * Periodic writeback of "old" data.
- *
- * Define "old": the first time one of an inode's pages is dirtied, we mark the
- * dirtying-time in the inode's address_space. So this periodic writeback code
- * just walks the superblock inode list, writing back any inodes which are
- * older than a specific point in time.
- *
- * Try to run once per dirty_writeback_interval. But if a writeback event
- * takes longer than a dirty_writeback_interval interval, then leave a
- * one-second gap.
- *
- * older_than_this takes precedence over nr_to_write. So we'll only write back
- * all dirty pages if they are all attached to "old" mappings.
- */
-static void wb_kupdate(unsigned long arg)
-{
- unsigned long oldest_jif;
- unsigned long start_jif;
- unsigned long next_jif;
- long nr_to_write;
- struct writeback_control wbc = {
- .bdi = NULL,
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = &oldest_jif,
- .nr_to_write = 0,
- .nonblocking = 1,
- .for_kupdate = 1,
- .range_cyclic = 1,
- };
-
- sync_supers();
-
- oldest_jif = jiffies - dirty_expire_interval;
- start_jif = jiffies;
- next_jif = start_jif + dirty_writeback_interval;
- nr_to_write = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
- while (nr_to_write > 0) {
- wbc.more_io = 0;
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- writeback_inodes(&wbc);
- if (wbc.nr_to_write > 0) {
- if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(WRITE, HZ/10);
- else
- break; /* All the old data is written */
- }
- nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- }
- if (time_before(next_jif, jiffies + HZ))
- next_jif = jiffies + HZ;
- if (dirty_writeback_interval)
- mod_timer(&wb_timer, next_jif);
-}
-
-/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
- if (dirty_writeback_interval)
- mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
- else
- del_timer(&wb_timer);
+ proc_dointvec(table, write, buffer, length, ppos);
return 0;
}
-static void wb_timer_fn(unsigned long unused)
-{
- if (pdflush_operation(wb_kupdate, 0) < 0)
- mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
-}
-
-static void laptop_flush(unsigned long unused)
+static void do_laptop_sync(struct work_struct *work)
{
- sys_sync();
+ wakeup_flusher_threads(0);
+ kfree(work);
}
static void laptop_timer_fn(unsigned long unused)
{
- pdflush_operation(laptop_flush, 0);
+ struct work_struct *work;
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ INIT_WORK(work, do_laptop_sync);
+ schedule_work(work);
+ }
}
/*
{
int shift;
- mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
if (wbc->nr_to_write <= 0)
return 0;
- wbc->for_writepages = 1;
if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
else
ret = generic_writepages(mapping, wbc);
- wbc->for_writepages = 0;
return ret;
}
}
/*
+ * Helper function for set_page_dirty family.
+ * NOTE: This relies on being atomic wrt interrupts.
+ */
+void account_page_dirtied(struct page *page, struct address_space *mapping)
+{
+ if (mapping_cap_account_dirty(mapping)) {
+ __inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+ task_dirty_inc(current);
+ task_io_account_write(PAGE_CACHE_SIZE);
+ }
+}
+
+/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
- if (mapping_cap_account_dirty(mapping)) {
- __inc_zone_page_state(page, NR_FILE_DIRTY);
- __inc_bdi_stat(mapping->backing_dev_info,
- BDI_RECLAIMABLE);
- task_dirty_inc(current);
- task_io_account_write(PAGE_CACHE_SIZE);
- }
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
EXPORT_SYMBOL(redirty_page_for_writepage);
/*
+ * Dirty a page.
+ *
+ * For pages with a mapping this should be done under the page lock
+ * for the benefit of asynchronous memory errors who prefer a consistent
+ * dirty state. This rule can be broken in some special cases,
+ * but should be better not to.
+ *
* If the mapping doesn't provide a set_page_dirty a_op, then
* just fall through and assume that it wants buffer_heads.
*/