/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *              Christoph Lameter <christoph@lameter.com>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/cpu.h>

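/*
 * Legacy interface to report active/inactive/free page counts. These
 * helpers simply read the zoned VM counters (per node or globally) that
 * are maintained below; no locking is done, so the values are
 * approximate snapshots.
 */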
void __get_zone_counts(unsigned long *active, unsigned long *inactive,
                        unsigned long *free, struct pglist_data *pgdat)
{
        *active = node_page_state(pgdat->node_id, NR_ACTIVE);
        *inactive = node_page_state(pgdat->node_id, NR_INACTIVE);
        *free = node_page_state(pgdat->node_id, NR_FREE_PAGES);
}

void get_zone_counts(unsigned long *active,
                unsigned long *inactive, unsigned long *free)
{
        *active = global_page_state(NR_ACTIVE);
        *inactive = global_page_state(NR_INACTIVE);
        *free = global_page_state(NR_FREE_PAGES);
}

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
{
        int cpu = 0;
        int i;

        memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

        cpu = first_cpu(*cpumask);
        while (cpu < NR_CPUS) {
                struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

                cpu = next_cpu(cpu, *cpumask);

                if (cpu < NR_CPUS)
                        prefetch(&per_cpu(vm_event_states, cpu));

                for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
                        ret[i] += this->event[i];
        }
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
        sum_vm_events(ret, &cpu_online_map);
}
EXPORT_SYMBOL_GPL(all_vm_events);

#ifdef CONFIG_HOTPLUG
/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
        struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
        int i;

        for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
                count_vm_events(i, fold_state->event[i]);
                fold_state->event[i] = 0;
        }
}
#endif /* CONFIG_HOTPLUG */

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

static int calculate_threshold(struct zone *zone)
{
        int threshold;
        int mem;        /* memory in 128 MB units */

        /*
         * The threshold scales with the number of processors and the amount
         * of memory per zone. More memory means that we can defer updates for
         * longer; more processors could lead to more contention.
         * fls() is used as a cheap way of logarithmic scaling.
         *
         * Some sample thresholds:
         *
         * Threshold    Processors      (fls)   Zonesize        fls(mem+1)
         * ------------------------------------------------------------------
         * 8            1               1       0.9-1 GB        4
         * 16           2               2       0.9-1 GB        4
         * 20           2               2       1-2 GB          5
         * 24           2               2       2-4 GB          6
         * 28           2               2       4-8 GB          7
         * 32           2               2       8-16 GB         8
         * 4            2               2       <128M           1
         * 30           4               3       2-4 GB          5
         * 48           4               3       8-16 GB         8
         * 32           8               4       1-2 GB          4
         * 32           8               4       0.9-1GB         4
         * 10           16              5       <128M           1
         * 40           16              5       900M            4
         * 70           64              7       2-4 GB          5
         * 84           64              7       4-8 GB          6
         * 108          512             9       4-8 GB          6
         * 125          1024            10      8-16 GB         8
         * 125          1024            10      16-32 GB        9
         */

        mem = zone->present_pages >> (27 - PAGE_SHIFT);

        threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

        /*
         * Maximum threshold is 125
         */
        threshold = min(125, threshold);

        return threshold;
}
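
/*
 * Worked example (following the formula above literally): with 2 online
 * CPUs and a 512 MB zone, mem = 4, so
 *
 *      threshold = 2 * fls(2) * (1 + fls(4)) = 2 * 2 * 4 = 16
 *
 * i.e. each CPU may accumulate up to 16 counts of drift per counter
 * before folding its differential into the global counters.
 */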

/*
 * Refresh the thresholds for each zone.
 */
static void refresh_zone_stat_thresholds(void)
{
        struct zone *zone;
        int cpu;
        int threshold;

        for_each_zone(zone) {

                if (!zone->present_pages)
                        continue;

                threshold = calculate_threshold(zone);

                for_each_online_cpu(cpu)
                        zone_pcp(zone, cpu)->stat_threshold = threshold;
        }
}

/*
 * For use when we know that interrupts are disabled.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                                int delta)
{
        struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
        s8 *p = pcp->vm_stat_diff + item;
        long x;

        x = delta + *p;

        if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
                zone_page_state_add(x, zone, item);
                x = 0;
        }
        *p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);

/*
 * For an unknown interrupt state
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                                        int delta)
{
        unsigned long flags;

        local_irq_save(flags);
        __mod_zone_page_state(zone, item, delta);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
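/*
 * Overstep illustration: with stat_threshold = 32, the 33rd local
 * increment folds 33 + 16 = 49 into vm_stat and leaves the per-cpu
 * differential at -16, so roughly another threshold and a half of
 * increments can occur before the next fold is needed.
 */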
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
        s8 *p = pcp->vm_stat_diff + item;

        (*p)++;

        if (unlikely(*p > pcp->stat_threshold)) {
                int overstep = pcp->stat_threshold / 2;

                zone_page_state_add(*p + overstep, zone, item);
                *p = -overstep;
        }
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
        __inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
        struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
        s8 *p = pcp->vm_stat_diff + item;

        (*p)--;

        if (unlikely(*p < -pcp->stat_threshold)) {
                int overstep = pcp->stat_threshold / 2;

                zone_page_state_add(*p - overstep, zone, item);
                *p = overstep;
        }
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
        __dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        unsigned long flags;

        local_irq_save(flags);
        __inc_zone_state(zone, item);
        local_irq_restore(flags);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
        unsigned long flags;
        struct zone *zone;

        zone = page_zone(page);
        local_irq_save(flags);
        __inc_zone_state(zone, item);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
        unsigned long flags;

        local_irq_save(flags);
        __dec_zone_page_state(page, item);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
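
/*
 * Typical usage of the helpers above (illustrative): a path that already
 * runs with interrupts disabled can use the __ variants directly, e.g.
 *
 *      __inc_zone_page_state(page, NR_FILE_PAGES);
 *      ...
 *      __dec_zone_page_state(page, NR_FILE_PAGES);
 *
 * while callers with an unknown interrupt state use inc_zone_page_state(),
 * dec_zone_page_state() or mod_zone_page_state(), which bracket the update
 * with local_irq_save()/local_irq_restore().
 */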

/*
 * Update the zone counters for one cpu.
 */
void refresh_cpu_vm_stats(int cpu)
{
        struct zone *zone;
        int i;
        unsigned long flags;

        for_each_zone(zone) {
                struct per_cpu_pageset *pcp;

                if (!populated_zone(zone))
                        continue;

                pcp = zone_pcp(zone, cpu);

                for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                        if (pcp->vm_stat_diff[i]) {
                                local_irq_save(flags);
                                zone_page_state_add(pcp->vm_stat_diff[i],
                                        zone, i);
                                pcp->vm_stat_diff[i] = 0;
                                local_irq_restore(flags);
                        }
        }
}

static void __refresh_cpu_vm_stats(void *dummy)
{
        refresh_cpu_vm_stats(smp_processor_id());
}

/*
 * Consolidate all counters.
 *
 * Note that consolidation makes the result less inaccurate, but it
 * remains approximate if other processes are allowed to run concurrently.
 */
void refresh_vm_stats(void)
{
        on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
}
EXPORT_SYMBOL(refresh_vm_stats);

#endif

#ifdef CONFIG_NUMA
/*
 * zonelist = the list of zones passed to the allocator
 * z        = the zone from which the allocation occurred.
 *
 * Must be called with interrupts disabled.
 */
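/*
 * Counter semantics (roughly): NUMA_HIT is counted when the allocation
 * came from the preferred (first) zone's node, NUMA_MISS when it had to
 * come from another node, and NUMA_FOREIGN is charged to the preferred
 * zone that was passed over. NUMA_LOCAL vs. NUMA_OTHER records whether
 * the zone used is on the node the allocating CPU is running on.
 */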
void zone_statistics(struct zonelist *zonelist, struct zone *z)
{
        if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
                __inc_zone_state(z, NUMA_HIT);
        } else {
                __inc_zone_state(z, NUMA_MISS);
                __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
        }
        if (z->node == numa_node_id())
                __inc_zone_state(z, NUMA_LOCAL);
        else
                __inc_zone_state(z, NUMA_OTHER);
}
#endif

#ifdef CONFIG_PROC_FS

#include <linux/seq_file.h>

static void *frag_start(struct seq_file *m, loff_t *pos)
{
        pg_data_t *pgdat;
        loff_t node = *pos;
        for (pgdat = first_online_pgdat();
             pgdat && node;
             pgdat = next_online_pgdat(pgdat))
                --node;

        return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
        pg_data_t *pgdat = (pg_data_t *)arg;

        (*pos)++;
        return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
        pg_data_t *pgdat = (pg_data_t *)arg;
        struct zone *zone;
        struct zone *node_zones = pgdat->node_zones;
        unsigned long flags;
        int order;

        for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
                if (!populated_zone(zone))
                        continue;

                spin_lock_irqsave(&zone->lock, flags);
                seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
                for (order = 0; order < MAX_ORDER; ++order)
                        seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
                spin_unlock_irqrestore(&zone->lock, flags);
                seq_putc(m, '\n');
        }
        return 0;
}
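
/*
 * These seq_file operations back /proc/buddyinfo. Each line resembles
 * (illustrative numbers):
 *
 *      Node 0, zone   Normal    216     55    189    101     84 ...
 *
 * i.e. the number of free blocks of each order in that zone.
 */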

const struct seq_operations fragmentation_op = {
        .start  = frag_start,
        .next   = frag_next,
        .stop   = frag_stop,
        .show   = frag_show,
};

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
                                        TEXT_FOR_HIGHMEM(xx)

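/*
 * The names below must stay in the same order as the counters they
 * describe: the zoned VM counters (enum zone_stat_item) come first,
 * followed by the event counters when CONFIG_VM_EVENT_COUNTERS is set;
 * the /proc output code below relies on this layout.
 */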
static const char * const vmstat_text[] = {
        /* Zoned VM counters */
        "nr_free_pages",
        "nr_active",
        "nr_inactive",
        "nr_anon_pages",
        "nr_mapped",
        "nr_file_pages",
        "nr_dirty",
        "nr_writeback",
        "nr_slab_reclaimable",
        "nr_slab_unreclaimable",
        "nr_page_table_pages",
        "nr_unstable",
        "nr_bounce",
        "nr_vmscan_write",

#ifdef CONFIG_NUMA
        "numa_hit",
        "numa_miss",
        "numa_foreign",
        "numa_interleave",
        "numa_local",
        "numa_other",
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
        "pgpgin",
        "pgpgout",
        "pswpin",
        "pswpout",

        TEXTS_FOR_ZONES("pgalloc")

        "pgfree",
        "pgactivate",
        "pgdeactivate",

        "pgfault",
        "pgmajfault",

        TEXTS_FOR_ZONES("pgrefill")
        TEXTS_FOR_ZONES("pgsteal")
        TEXTS_FOR_ZONES("pgscan_kswapd")
        TEXTS_FOR_ZONES("pgscan_direct")

        "pginodesteal",
        "slabs_scanned",
        "kswapd_steal",
        "kswapd_inodesteal",
        "pageoutrun",
        "allocstall",

        "pgrotated",
#endif
};

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
        pg_data_t *pgdat = arg;
        struct zone *zone;
        struct zone *node_zones = pgdat->node_zones;
        unsigned long flags;

        for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
                int i;

                if (!populated_zone(zone))
                        continue;

                spin_lock_irqsave(&zone->lock, flags);
                seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
                seq_printf(m,
                           "\n  pages free     %lu"
                           "\n        min      %lu"
                           "\n        low      %lu"
                           "\n        high     %lu"
                           "\n        scanned  %lu (a: %lu i: %lu)"
                           "\n        spanned  %lu"
                           "\n        present  %lu",
                           zone_page_state(zone, NR_FREE_PAGES),
                           zone->pages_min,
                           zone->pages_low,
                           zone->pages_high,
                           zone->pages_scanned,
                           zone->nr_scan_active, zone->nr_scan_inactive,
                           zone->spanned_pages,
                           zone->present_pages);

                for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                        seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
                                        zone_page_state(zone, i));

                seq_printf(m,
                           "\n        protection: (%lu",
                           zone->lowmem_reserve[0]);
                for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
                        seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
                seq_printf(m,
                           ")"
                           "\n  pagesets");
                for_each_online_cpu(i) {
                        struct per_cpu_pageset *pageset;
                        int j;

                        pageset = zone_pcp(zone, i);
                        for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
                                seq_printf(m,
                                           "\n    cpu: %i pcp: %i"
                                           "\n              count: %i"
                                           "\n              high:  %i"
                                           "\n              batch: %i",
                                           i, j,
                                           pageset->pcp[j].count,
                                           pageset->pcp[j].high,
                                           pageset->pcp[j].batch);
                        }
#ifdef CONFIG_SMP
                        seq_printf(m, "\n  vm stats threshold: %d",
                                        pageset->stat_threshold);
#endif
                }
                seq_printf(m,
                           "\n  all_unreclaimable: %u"
                           "\n  prev_priority:     %i"
                           "\n  start_pfn:         %lu",
                           zone->all_unreclaimable,
                           zone->prev_priority,
                           zone->zone_start_pfn);
                spin_unlock_irqrestore(&zone->lock, flags);
                seq_putc(m, '\n');
        }
        return 0;
}

const struct seq_operations zoneinfo_op = {
        .start  = frag_start, /* iterate over all zones. The same as in
                               * fragmentation. */
        .next   = frag_next,
        .stop   = frag_stop,
        .show   = zoneinfo_show,
};

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
        unsigned long *v;
#ifdef CONFIG_VM_EVENT_COUNTERS
        unsigned long *e;
#endif
        int i;

        if (*pos >= ARRAY_SIZE(vmstat_text))
                return NULL;

#ifdef CONFIG_VM_EVENT_COUNTERS
        v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
                        + sizeof(struct vm_event_state), GFP_KERNEL);
#else
        v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
                        GFP_KERNEL);
#endif
        m->private = v;
        if (!v)
                return ERR_PTR(-ENOMEM);
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                v[i] = global_page_state(i);
#ifdef CONFIG_VM_EVENT_COUNTERS
        e = v + NR_VM_ZONE_STAT_ITEMS;
        all_vm_events(e);
        e[PGPGIN] /= 2;         /* sectors -> kbytes */
        e[PGPGOUT] /= 2;
#endif
        return v + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
        (*pos)++;
        if (*pos >= ARRAY_SIZE(vmstat_text))
                return NULL;
        return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
        unsigned long *l = arg;
        unsigned long off = l - (unsigned long *)m->private;

        seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
        return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
        kfree(m->private);
        m->private = NULL;
}

const struct seq_operations vmstat_op = {
        .start  = vmstat_start,
        .next   = vmstat_next,
        .stop   = vmstat_stop,
        .show   = vmstat_show,
};
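
/*
 * These operations back /proc/vmstat: one "name value" pair per counter,
 * e.g. (illustrative values)
 *
 *      nr_free_pages 81120
 *      nr_dirty 214
 *      pgpgin 123456
 */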

#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
 */
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
                unsigned long action,
                void *hcpu)
{
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_CANCELED:
        case CPU_DEAD:
                refresh_zone_stat_thresholds();
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata vmstat_notifier =
        { &vmstat_cpuup_callback, NULL, 0 };

int __init setup_vmstat(void)
{
        refresh_zone_stat_thresholds();
        register_cpu_notifier(&vmstat_notifier);
        return 0;
}
module_init(setup_vmstat)
#endif