[PATCH] Drop __get_zone_counts()
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *              Christoph Lameter <christoph@lameter.com>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/cpu.h>

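/*
 * Fill in the global active, inactive and free page counts.
 *
 * With __get_zone_counts() dropped, these totals are read directly from
 * the global zoned VM counters.
 */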
void get_zone_counts(unsigned long *active,
                unsigned long *inactive, unsigned long *free)
{
        *active = global_page_state(NR_ACTIVE);
        *inactive = global_page_state(NR_INACTIVE);
        *free = global_page_state(NR_FREE_PAGES);
}

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

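/*
 * Sum the per cpu vm event counters of all cpus in @cpumask into @ret.
 * The next cpu's counters are prefetched while the current cpu's are
 * being added up.
 */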
static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
{
        int cpu = 0;
        int i;

        memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

        cpu = first_cpu(*cpumask);
        while (cpu < NR_CPUS) {
                struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

                cpu = next_cpu(cpu, *cpumask);

                if (cpu < NR_CPUS)
                        prefetch(&per_cpu(vm_event_states, cpu));

                for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
                        ret[i] += this->event[i];
        }
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
        sum_vm_events(ret, &cpu_online_map);
}
EXPORT_SYMBOL_GPL(all_vm_events);

#ifdef CONFIG_HOTPLUG
/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
        struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
        int i;

        for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
                count_vm_events(i, fold_state->event[i]);
                fold_state->event[i] = 0;
        }
}
#endif /* CONFIG_HOTPLUG */

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
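 *
 * Each zone additionally keeps small per cpu differentials in its
 * per_cpu_pageset (vm_stat_diff[]) that are folded into vm_stat once
 * they exceed the zone's stat_threshold.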
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

static int calculate_threshold(struct zone *zone)
{
        int threshold;
        int mem;        /* memory in 128 MB units */

        /*
         * The threshold scales with the number of processors and the amount
         * of memory per zone. More memory means that we can defer updates for
         * longer, more processors could lead to more contention.
         * fls() is used to have a cheap way of logarithmic scaling.
         *
         * Some sample thresholds:
         *
         * Threshold    Processors      (fls)   Zonesize        fls(mem+1)
         * ------------------------------------------------------------------
         * 8            1               1       0.9-1 GB        4
         * 16           2               2       0.9-1 GB        4
         * 20           2               2       1-2 GB          5
         * 24           2               2       2-4 GB          6
         * 28           2               2       4-8 GB          7
         * 32           2               2       8-16 GB         8
         * 4            2               2       <128M           1
         * 30           4               3       2-4 GB          5
         * 48           4               3       8-16 GB         8
         * 32           8               4       1-2 GB          4
         * 32           8               4       0.9-1GB         4
         * 10           16              5       <128M           1
         * 40           16              5       900M            4
         * 70           64              7       2-4 GB          5
         * 84           64              7       4-8 GB          6
         * 108          512             9       4-8 GB          6
         * 125          1024            10      8-16 GB         8
         * 125          1024            10      16-32 GB        9
         */

        mem = zone->present_pages >> (27 - PAGE_SHIFT);

        threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
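        /*
         * Worked example (assuming 4K pages, i.e. PAGE_SHIFT == 12):
         * a 1 GB zone has 262144 present pages, so mem = 262144 >> 15 = 8
         * and fls(mem) = 4.  With two online cpus fls(2) = 2, giving
         * threshold = 2 * 2 * (1 + 4) = 20, the "1-2 GB" sample above.
         */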

        /*
         * Maximum threshold is 125
         */
        threshold = min(125, threshold);

        return threshold;
}

/*
 * Refresh the thresholds for each zone.
 */
static void refresh_zone_stat_thresholds(void)
{
        struct zone *zone;
        int cpu;
        int threshold;

        for_each_zone(zone) {

                if (!zone->present_pages)
                        continue;

                threshold = calculate_threshold(zone);

                for_each_online_cpu(cpu)
                        zone_pcp(zone, cpu)->stat_threshold = threshold;
        }
}

/*
 * For use when we know that interrupts are disabled.
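 *
 * The delta is accumulated in the current cpu's vm_stat_diff for @item
 * and only folded into the global vm_stat counter once the per cpu
 * differential would exceed the zone's stat_threshold. This bounds both
 * the error of the global counter and the number of atomic updates.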
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                                int delta)
{
        struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
        s8 *p = pcp->vm_stat_diff + item;
        long x;

        x = delta + *p;

        if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
                zone_page_state_add(x, zone, item);
                x = 0;
        }
        *p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);

/*
 * For an unknown interrupt state
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                                        int delta)
{
        unsigned long flags;

        local_irq_save(flags);
        __mod_zone_page_state(zone, item, delta);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
        s8 *p = pcp->vm_stat_diff + item;

        (*p)++;

        if (unlikely(*p > pcp->stat_threshold)) {
                int overstep = pcp->stat_threshold / 2;

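                /*
                 * Fold the local count plus half a threshold into the
                 * global counter and leave the local counter at -overstep,
                 * so the next few increments do not immediately cross the
                 * threshold again.
                 */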
                zone_page_state_add(*p + overstep, zone, item);
                *p = -overstep;
        }
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
        __inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
        struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
        s8 *p = pcp->vm_stat_diff + item;

        (*p)--;

        if (unlikely(*p < - pcp->stat_threshold)) {
                int overstep = pcp->stat_threshold / 2;

                zone_page_state_add(*p - overstep, zone, item);
                *p = overstep;
        }
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
        __dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        unsigned long flags;

        local_irq_save(flags);
        __inc_zone_state(zone, item);
        local_irq_restore(flags);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
        unsigned long flags;
        struct zone *zone;

        zone = page_zone(page);
        local_irq_save(flags);
        __inc_zone_state(zone, item);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
        unsigned long flags;

        local_irq_save(flags);
        __dec_zone_page_state(page, item);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

/*
 * Update the zone counters for one cpu.
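 *
 * Every nonzero per cpu differential for that cpu is folded into the
 * global vm_stat counters, with interrupts disabled around each fold.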
 */
void refresh_cpu_vm_stats(int cpu)
{
        struct zone *zone;
        int i;
        unsigned long flags;

        for_each_zone(zone) {
                struct per_cpu_pageset *pcp;

                if (!populated_zone(zone))
                        continue;

                pcp = zone_pcp(zone, cpu);

                for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                        if (pcp->vm_stat_diff[i]) {
                                local_irq_save(flags);
                                zone_page_state_add(pcp->vm_stat_diff[i],
                                        zone, i);
                                pcp->vm_stat_diff[i] = 0;
                                local_irq_restore(flags);
                        }
        }
}

static void __refresh_cpu_vm_stats(void *dummy)
{
        refresh_cpu_vm_stats(smp_processor_id());
}

/*
 * Consolidate all counters.
 *
 * Note that the result is less inaccurate but still inaccurate
 * if concurrent processes are allowed to run.
 */
void refresh_vm_stats(void)
{
        on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
}
EXPORT_SYMBOL(refresh_vm_stats);

#endif

#ifdef CONFIG_NUMA
/*
 * zonelist = the list of zones passed to the allocator
 * z        = the zone from which the allocation occurred.
 *
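 * Counters updated:
 *  NUMA_HIT      allocation came from the preferred (first) zone's node
 *  NUMA_MISS     allocation fell back to a different node
 *  NUMA_FOREIGN  charged to the preferred zone when the allocation went
 *                elsewhere
 *  NUMA_LOCAL    page was allocated on the node of the running cpu
 *  NUMA_OTHER    page was allocated on a remote node
 *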
 * Must be called with interrupts disabled.
 */
void zone_statistics(struct zonelist *zonelist, struct zone *z)
{
        if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
                __inc_zone_state(z, NUMA_HIT);
        } else {
                __inc_zone_state(z, NUMA_MISS);
                __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
        }
        if (z->node == numa_node_id())
                __inc_zone_state(z, NUMA_LOCAL);
        else
                __inc_zone_state(z, NUMA_OTHER);
}
#endif

#ifdef CONFIG_PROC_FS

#include <linux/seq_file.h>

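/*
 * Walk the online node data: return the *pos'th online pgdat, or NULL
 * when we run out of nodes.
 */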
static void *frag_start(struct seq_file *m, loff_t *pos)
{
        pg_data_t *pgdat;
        loff_t node = *pos;
        for (pgdat = first_online_pgdat();
             pgdat && node;
             pgdat = next_online_pgdat(pgdat))
                --node;

        return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
        pg_data_t *pgdat = (pg_data_t *)arg;

        (*pos)++;
        return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
        pg_data_t *pgdat = (pg_data_t *)arg;
        struct zone *zone;
        struct zone *node_zones = pgdat->node_zones;
        unsigned long flags;
        int order;

        for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
                if (!populated_zone(zone))
                        continue;

                spin_lock_irqsave(&zone->lock, flags);
                seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
                for (order = 0; order < MAX_ORDER; ++order)
                        seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
                spin_unlock_irqrestore(&zone->lock, flags);
                seq_putc(m, '\n');
        }
        return 0;
}

const struct seq_operations fragmentation_op = {
        .start  = frag_start,
        .next   = frag_next,
        .stop   = frag_stop,
        .show   = frag_show,
};

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

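/*
 * Expand one event name into one entry per zone type so that the strings
 * line up with the per zone counters, e.g. "pgalloc" becomes "pgalloc_dma",
 * "pgalloc_normal", ... depending on the configured zones.
 */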
#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
                                        TEXT_FOR_HIGHMEM(xx)

static const char * const vmstat_text[] = {
        /* Zoned VM counters */
        "nr_free_pages",
        "nr_active",
        "nr_inactive",
        "nr_anon_pages",
        "nr_mapped",
        "nr_file_pages",
        "nr_dirty",
        "nr_writeback",
        "nr_slab_reclaimable",
        "nr_slab_unreclaimable",
        "nr_page_table_pages",
        "nr_unstable",
        "nr_bounce",
        "nr_vmscan_write",

#ifdef CONFIG_NUMA
        "numa_hit",
        "numa_miss",
        "numa_foreign",
        "numa_interleave",
        "numa_local",
        "numa_other",
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
        "pgpgin",
        "pgpgout",
        "pswpin",
        "pswpout",

        TEXTS_FOR_ZONES("pgalloc")

        "pgfree",
        "pgactivate",
        "pgdeactivate",

        "pgfault",
        "pgmajfault",

        TEXTS_FOR_ZONES("pgrefill")
        TEXTS_FOR_ZONES("pgsteal")
        TEXTS_FOR_ZONES("pgscan_kswapd")
        TEXTS_FOR_ZONES("pgscan_direct")

        "pginodesteal",
        "slabs_scanned",
        "kswapd_steal",
        "kswapd_inodesteal",
        "pageoutrun",
        "allocstall",

        "pgrotated",
#endif
};

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
        pg_data_t *pgdat = arg;
        struct zone *zone;
        struct zone *node_zones = pgdat->node_zones;
        unsigned long flags;

        for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
                int i;

                if (!populated_zone(zone))
                        continue;

                spin_lock_irqsave(&zone->lock, flags);
                seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
                seq_printf(m,
                           "\n  pages free     %lu"
                           "\n        min      %lu"
                           "\n        low      %lu"
                           "\n        high     %lu"
                           "\n        scanned  %lu (a: %lu i: %lu)"
                           "\n        spanned  %lu"
                           "\n        present  %lu",
                           zone_page_state(zone, NR_FREE_PAGES),
                           zone->pages_min,
                           zone->pages_low,
                           zone->pages_high,
                           zone->pages_scanned,
                           zone->nr_scan_active, zone->nr_scan_inactive,
                           zone->spanned_pages,
                           zone->present_pages);

                for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                        seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
                                        zone_page_state(zone, i));

                seq_printf(m,
                           "\n        protection: (%lu",
                           zone->lowmem_reserve[0]);
                for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
                        seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
                seq_printf(m,
                           ")"
                           "\n  pagesets");
                for_each_online_cpu(i) {
                        struct per_cpu_pageset *pageset;
                        int j;

                        pageset = zone_pcp(zone, i);
                        for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
                                seq_printf(m,
                                           "\n    cpu: %i pcp: %i"
                                           "\n              count: %i"
                                           "\n              high:  %i"
                                           "\n              batch: %i",
                                           i, j,
                                           pageset->pcp[j].count,
                                           pageset->pcp[j].high,
                                           pageset->pcp[j].batch);
                        }
#ifdef CONFIG_SMP
                        seq_printf(m, "\n  vm stats threshold: %d",
                                        pageset->stat_threshold);
#endif
                }
                seq_printf(m,
                           "\n  all_unreclaimable: %u"
                           "\n  prev_priority:     %i"
                           "\n  start_pfn:         %lu",
                           zone->all_unreclaimable,
                           zone->prev_priority,
                           zone->zone_start_pfn);
                spin_unlock_irqrestore(&zone->lock, flags);
                seq_putc(m, '\n');
        }
        return 0;
}

const struct seq_operations zoneinfo_op = {
        .start  = frag_start, /* iterate over all zones. The same as in
                               * fragmentation. */
        .next   = frag_next,
        .stop   = frag_stop,
        .show   = zoneinfo_show,
};

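/*
 * Snapshot all zoned VM counters (and, if configured, the vm event
 * counters) into one allocation so that a single pass over /proc/vmstat
 * reports one consistent snapshot. pgpgin/pgpgout are converted from
 * sectors to kilobytes before being reported.
 */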
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
        unsigned long *v;
#ifdef CONFIG_VM_EVENT_COUNTERS
        unsigned long *e;
#endif
        int i;

        if (*pos >= ARRAY_SIZE(vmstat_text))
                return NULL;

#ifdef CONFIG_VM_EVENT_COUNTERS
        v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
                        + sizeof(struct vm_event_state), GFP_KERNEL);
#else
        v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
                        GFP_KERNEL);
#endif
        m->private = v;
        if (!v)
                return ERR_PTR(-ENOMEM);
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                v[i] = global_page_state(i);
#ifdef CONFIG_VM_EVENT_COUNTERS
        e = v + NR_VM_ZONE_STAT_ITEMS;
        all_vm_events(e);
        e[PGPGIN] /= 2;         /* sectors -> kbytes */
        e[PGPGOUT] /= 2;
#endif
        return v + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
        (*pos)++;
        if (*pos >= ARRAY_SIZE(vmstat_text))
                return NULL;
        return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
        unsigned long *l = arg;
        unsigned long off = l - (unsigned long *)m->private;

        seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
        return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
        kfree(m->private);
        m->private = NULL;
}

const struct seq_operations vmstat_op = {
        .start  = vmstat_start,
        .next   = vmstat_next,
        .stop   = vmstat_stop,
        .show   = vmstat_show,
};

#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
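 *
 * The stat thresholds scale with num_online_cpus(), so they must be
 * recomputed whenever a cpu comes or goes.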
 */
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
                unsigned long action,
                void *hcpu)
{
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_CANCELED:
        case CPU_DEAD:
                refresh_zone_stat_thresholds();
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata vmstat_notifier =
        { &vmstat_cpuup_callback, NULL, 0 };

int __init setup_vmstat(void)
{
        refresh_zone_stat_thresholds();
        register_cpu_notifier(&vmstat_notifier);
        return 0;
}
module_init(setup_vmstat)
#endif