[PATCH] Drop __get_zone_counts()
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *              Christoph Lameter <christoph@lameter.com>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/cpu.h>

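/*
 * Fill in the global active, inactive and free page counts.
 *
 * With __get_zone_counts() dropped, these totals are read directly from
 * the global zoned VM counters.
 */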
void get_zone_counts(unsigned long *active,
                unsigned long *inactive, unsigned long *free)
{
        *active = global_page_state(NR_ACTIVE);
        *inactive = global_page_state(NR_INACTIVE);
        *free = global_page_state(NR_FREE_PAGES);
}

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

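/*
 * Sum the per cpu vm event counters of all cpus in @cpumask into @ret.
 * The next cpu's counters are prefetched while the current cpu's are
 * being added up.
 */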
static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
{
        int cpu = 0;
        int i;

        memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

        cpu = first_cpu(*cpumask);
        while (cpu < NR_CPUS) {
                struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

                cpu = next_cpu(cpu, *cpumask);

                if (cpu < NR_CPUS)
                        prefetch(&per_cpu(vm_event_states, cpu));

                for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
                        ret[i] += this->event[i];
        }
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
        sum_vm_events(ret, &cpu_online_map);
}
EXPORT_SYMBOL_GPL(all_vm_events);

#ifdef CONFIG_HOTPLUG
/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
        struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
        int i;

        for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
                count_vm_events(i, fold_state->event[i]);
                fold_state->event[i] = 0;
        }
}
#endif /* CONFIG_HOTPLUG */

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
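 *
 * Each zone additionally keeps small per cpu differentials in its
 * per_cpu_pageset (vm_stat_diff[]) that are folded into vm_stat once
 * they exceed the zone's stat_threshold.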
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

static int calculate_threshold(struct zone *zone)
{
        int threshold;
        int mem;        /* memory in 128 MB units */

        /*
         * The threshold scales with the number of processors and the amount
         * of memory per zone. More memory means that we can defer updates for
         * longer, more processors could lead to more contention.
         * fls() is used to have a cheap way of logarithmic scaling.
         *
         * Some sample thresholds:
         *
         * Threshold    Processors      (fls)   Zonesize        fls(mem+1)
         * ------------------------------------------------------------------
         * 8            1               1       0.9-1 GB        4
         * 16           2               2       0.9-1 GB        4
         * 20           2               2       1-2 GB          5
         * 24           2               2       2-4 GB          6
         * 28           2               2       4-8 GB          7
         * 32           2               2       8-16 GB         8
         * 4            2               2       <128M           1
         * 30           4               3       2-4 GB          5
         * 48           4               3       8-16 GB         8
         * 32           8               4       1-2 GB          4
         * 32           8               4       0.9-1GB         4
         * 10           16              5       <128M           1
         * 40           16              5       900M            4
         * 70           64              7       2-4 GB          5
         * 84           64              7       4-8 GB          6
         * 108          512             9       4-8 GB          6
         * 125          1024            10      8-16 GB         8
         * 125          1024            10      16-32 GB        9
         */

        mem = zone->present_pages >> (27 - PAGE_SHIFT);

        threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
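        /*
         * Worked example (assuming 4K pages, i.e. PAGE_SHIFT == 12):
         * a 1 GB zone has 262144 present pages, so mem = 262144 >> 15 = 8
         * and fls(mem) = 4.  With two online cpus fls(2) = 2, giving
         * threshold = 2 * 2 * (1 + 4) = 20, the "1-2 GB" sample above.
         */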

        /*
         * Maximum threshold is 125
         */
        threshold = min(125, threshold);

        return threshold;
}

/*
 * Refresh the thresholds for each zone.
 */
static void refresh_zone_stat_thresholds(void)
{
        struct zone *zone;
        int cpu;
        int threshold;

        for_each_zone(zone) {

                if (!zone->present_pages)
                        continue;

                threshold = calculate_threshold(zone);

                for_each_online_cpu(cpu)
                        zone_pcp(zone, cpu)->stat_threshold = threshold;
        }
}

/*
 * For use when we know that interrupts are disabled.
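 *
 * The delta is accumulated in the current cpu's vm_stat_diff for @item
 * and only folded into the global vm_stat counter once the per cpu
 * differential would exceed the zone's stat_threshold. This bounds both
 * the error of the global counter and the number of atomic updates.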
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                                int delta)
{
        struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
        s8 *p = pcp->vm_stat_diff + item;
        long x;

        x = delta + *p;

        if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
                zone_page_state_add(x, zone, item);
                x = 0;
        }
        *p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);

/*
 * For an unknown interrupt state
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                                        int delta)
{
        unsigned long flags;

        local_irq_save(flags);
        __mod_zone_page_state(zone, item, delta);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
        s8 *p = pcp->vm_stat_diff + item;

        (*p)++;

        if (unlikely(*p > pcp->stat_threshold)) {
                int overstep = pcp->stat_threshold / 2;

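                /*
                 * Fold the local count plus half a threshold into the
                 * global counter and leave the local counter at -overstep,
                 * so the next few increments do not immediately cross the
                 * threshold again.
                 */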
                zone_page_state_add(*p + overstep, zone, item);
                *p = -overstep;
        }
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
        __inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
        struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
        s8 *p = pcp->vm_stat_diff + item;

        (*p)--;

        if (unlikely(*p < - pcp->stat_threshold)) {
                int overstep = pcp->stat_threshold / 2;

                zone_page_state_add(*p - overstep, zone, item);
                *p = overstep;
        }
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
        __dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        unsigned long flags;

        local_irq_save(flags);
        __inc_zone_state(zone, item);
        local_irq_restore(flags);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
        unsigned long flags;
        struct zone *zone;

        zone = page_zone(page);
        local_irq_save(flags);
        __inc_zone_state(zone, item);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
        unsigned long flags;

        local_irq_save(flags);
        __dec_zone_page_state(page, item);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

/*
 * Update the zone counters for one cpu.
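 *
 * Every nonzero per cpu differential for that cpu is folded into the
 * global vm_stat counters, with interrupts disabled around each fold.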
 */
void refresh_cpu_vm_stats(int cpu)
{
        struct zone *zone;
        int i;
        unsigned long flags;

        for_each_zone(zone) {
                struct per_cpu_pageset *pcp;

                if (!populated_zone(zone))
                        continue;

                pcp = zone_pcp(zone, cpu);

                for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                        if (pcp->vm_stat_diff[i]) {
                                local_irq_save(flags);
                                zone_page_state_add(pcp->vm_stat_diff[i],
                                        zone, i);
                                pcp->vm_stat_diff[i] = 0;
                                local_irq_restore(flags);
                        }
        }
}

static void __refresh_cpu_vm_stats(void *dummy)
{
        refresh_cpu_vm_stats(smp_processor_id());
}

/*
 * Consolidate all counters.
 *
 * Note that the result is less inaccurate but still inaccurate
 * if concurrent processes are allowed to run.
 */
void refresh_vm_stats(void)
{
        on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
}
EXPORT_SYMBOL(refresh_vm_stats);

#endif

#ifdef CONFIG_NUMA
/*
 * zonelist = the list of zones passed to the allocator
 * z        = the zone from which the allocation occurred.
 *
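 * Counters updated:
 *  NUMA_HIT      allocation came from the preferred (first) zone's node
 *  NUMA_MISS     allocation fell back to a different node
 *  NUMA_FOREIGN  charged to the preferred zone when the allocation went
 *                elsewhere
 *  NUMA_LOCAL    page was allocated on the node of the running cpu
 *  NUMA_OTHER    page was allocated on a remote node
 *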
 * Must be called with interrupts disabled.
 */
void zone_statistics(struct zonelist *zonelist, struct zone *z)
{
        if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
                __inc_zone_state(z, NUMA_HIT);
        } else {
                __inc_zone_state(z, NUMA_MISS);
                __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
        }
        if (z->node == numa_node_id())
                __inc_zone_state(z, NUMA_LOCAL);
        else
                __inc_zone_state(z, NUMA_OTHER);
}
#endif

#ifdef CONFIG_PROC_FS

#include <linux/seq_file.h>

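/*
 * Walk the online node data: return the *pos'th online pgdat, or NULL
 * when we run out of nodes.
 */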
static void *frag_start(struct seq_file *m, loff_t *pos)
{
        pg_data_t *pgdat;
        loff_t node = *pos;
        for (pgdat = first_online_pgdat();
             pgdat && node;
             pgdat = next_online_pgdat(pgdat))
                --node;

        return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
        pg_data_t *pgdat = (pg_data_t *)arg;

        (*pos)++;
        return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
        pg_data_t *pgdat = (pg_data_t *)arg;
        struct zone *zone;
        struct zone *node_zones = pgdat->node_zones;
        unsigned long flags;
        int order;

        for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
                if (!populated_zone(zone))
                        continue;

                spin_lock_irqsave(&zone->lock, flags);
                seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
                for (order = 0; order < MAX_ORDER; ++order)
                        seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
                spin_unlock_irqrestore(&zone->lock, flags);
                seq_putc(m, '\n');
        }
        return 0;
}

const struct seq_operations fragmentation_op = {
        .start  = frag_start,
        .next   = frag_next,
        .stop   = frag_stop,
        .show   = frag_show,
};

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

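/*
 * Expand one event name into one entry per zone type so that the strings
 * line up with the per zone counters, e.g. "pgalloc" becomes "pgalloc_dma",
 * "pgalloc_normal", ... depending on the configured zones.
 */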
#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
                                        TEXT_FOR_HIGHMEM(xx)

static const char * const vmstat_text[] = {
        /* Zoned VM counters */
        "nr_free_pages",
        "nr_active",
        "nr_inactive",
        "nr_anon_pages",
        "nr_mapped",
        "nr_file_pages",
        "nr_dirty",
        "nr_writeback",
        "nr_slab_reclaimable",
        "nr_slab_unreclaimable",
        "nr_page_table_pages",
        "nr_unstable",
        "nr_bounce",
        "nr_vmscan_write",

#ifdef CONFIG_NUMA
        "numa_hit",
        "numa_miss",
        "numa_foreign",
        "numa_interleave",
        "numa_local",
        "numa_other",
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
        "pgpgin",
        "pgpgout",
        "pswpin",
        "pswpout",

        TEXTS_FOR_ZONES("pgalloc")

        "pgfree",
        "pgactivate",
        "pgdeactivate",

        "pgfault",
        "pgmajfault",

        TEXTS_FOR_ZONES("pgrefill")
        TEXTS_FOR_ZONES("pgsteal")
        TEXTS_FOR_ZONES("pgscan_kswapd")
        TEXTS_FOR_ZONES("pgscan_direct")

        "pginodesteal",
        "slabs_scanned",
        "kswapd_steal",
        "kswapd_inodesteal",
        "pageoutrun",
        "allocstall",

        "pgrotated",
#endif
};

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
        pg_data_t *pgdat = arg;
        struct zone *zone;
        struct zone *node_zones = pgdat->node_zones;
        unsigned long flags;

        for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
                int i;

                if (!populated_zone(zone))
                        continue;

                spin_lock_irqsave(&zone->lock, flags);
                seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
                seq_printf(m,
                           "\n  pages free     %lu"
                           "\n        min      %lu"
                           "\n        low      %lu"
                           "\n        high     %lu"
                           "\n        scanned  %lu (a: %lu i: %lu)"
                           "\n        spanned  %lu"
                           "\n        present  %lu",
                           zone_page_state(zone, NR_FREE_PAGES),
                           zone->pages_min,
                           zone->pages_low,
                           zone->pages_high,
                           zone->pages_scanned,
                           zone->nr_scan_active, zone->nr_scan_inactive,
                           zone->spanned_pages,
                           zone->present_pages);

                for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                        seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
                                        zone_page_state(zone, i));

                seq_printf(m,
                           "\n        protection: (%lu",
                           zone->lowmem_reserve[0]);
                for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
                        seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
                seq_printf(m,
                           ")"
                           "\n  pagesets");
                for_each_online_cpu(i) {
                        struct per_cpu_pageset *pageset;
                        int j;

                        pageset = zone_pcp(zone, i);
                        for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
                                seq_printf(m,
                                           "\n    cpu: %i pcp: %i"
                                           "\n              count: %i"
                                           "\n              high:  %i"
                                           "\n              batch: %i",
                                           i, j,
                                           pageset->pcp[j].count,
                                           pageset->pcp[j].high,
                                           pageset->pcp[j].batch);
                        }
#ifdef CONFIG_SMP
                        seq_printf(m, "\n  vm stats threshold: %d",
                                        pageset->stat_threshold);
#endif
                }
                seq_printf(m,
                           "\n  all_unreclaimable: %u"
                           "\n  prev_priority:     %i"
                           "\n  start_pfn:         %lu",
                           zone->all_unreclaimable,
                           zone->prev_priority,
                           zone->zone_start_pfn);
                spin_unlock_irqrestore(&zone->lock, flags);
                seq_putc(m, '\n');
        }
        return 0;
}

const struct seq_operations zoneinfo_op = {
        .start  = frag_start, /* iterate over all zones. The same as in
                               * fragmentation. */
        .next   = frag_next,
        .stop   = frag_stop,
        .show   = zoneinfo_show,
};

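/*
 * Snapshot all zoned VM counters (and, if configured, the vm event
 * counters) into one allocation so that a single pass over /proc/vmstat
 * reports one consistent snapshot. pgpgin/pgpgout are converted from
 * sectors to kilobytes before being reported.
 */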
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
        unsigned long *v;
#ifdef CONFIG_VM_EVENT_COUNTERS
        unsigned long *e;
#endif
        int i;

        if (*pos >= ARRAY_SIZE(vmstat_text))
                return NULL;

#ifdef CONFIG_VM_EVENT_COUNTERS
        v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
                        + sizeof(struct vm_event_state), GFP_KERNEL);
#else
        v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
                        GFP_KERNEL);
#endif
        m->private = v;
        if (!v)
                return ERR_PTR(-ENOMEM);
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                v[i] = global_page_state(i);
#ifdef CONFIG_VM_EVENT_COUNTERS
        e = v + NR_VM_ZONE_STAT_ITEMS;
        all_vm_events(e);
        e[PGPGIN] /= 2;         /* sectors -> kbytes */
        e[PGPGOUT] /= 2;
#endif
        return v + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
        (*pos)++;
        if (*pos >= ARRAY_SIZE(vmstat_text))
                return NULL;
        return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
        unsigned long *l = arg;
        unsigned long off = l - (unsigned long *)m->private;

        seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
        return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
        kfree(m->private);
        m->private = NULL;
}

const struct seq_operations vmstat_op = {
        .start  = vmstat_start,
        .next   = vmstat_next,
        .stop   = vmstat_stop,
        .show   = vmstat_show,
};

#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
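 *
 * The stat thresholds scale with num_online_cpus(), so they must be
 * recomputed whenever a cpu comes or goes.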
 */
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
                unsigned long action,
                void *hcpu)
{
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_CANCELED:
        case CPU_DEAD:
                refresh_zone_stat_thresholds();
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata vmstat_notifier =
        { &vmstat_cpuup_callback, NULL, 0 };

int __init setup_vmstat(void)
{
        refresh_zone_stat_thresholds();
        register_cpu_notifier(&vmstat_notifier);
        return 0;
}
module_init(setup_vmstat)
#endif