mm: fix section mismatch warnings

[safe/jmp/linux-2.6] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 40db96a..8b000d6 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -39,6 +39,8 @@
  #include <linux/stop_machine.h>
  #include <linux/sort.h>
  #include <linux/pfn.h>
+#include <linux/backing-dev.h>
+#include <linux/fault-inject.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
@@ -71,7 +73,9 @@ static void __free_pages_ok(struct page *page, unsigned int order);
   * don't need any ZONE_NORMAL reservation
   */
  int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
+#ifdef CONFIG_ZONE_DMA
          256,
+#endif
  #ifdef CONFIG_ZONE_DMA32
          256,
  #endif
@@ -82,15 +86,10 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
  
  EXPORT_SYMBOL(totalram_pages);
  
-/*
- * Used by page_zone() to look up the address of the struct zone whose
- * id is encoded in the upper bits of page->flags
- */
-struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
-EXPORT_SYMBOL(zone_table);
-
-static char *zone_names[MAX_NR_ZONES] = {
+static char * const zone_names[MAX_NR_ZONES] = {
+#ifdef CONFIG_ZONE_DMA
          "DMA",
+#endif
  #ifdef CONFIG_ZONE_DMA32
          "DMA32",
  #endif
@@ -104,7 +103,7 @@ int min_free_kbytes = 1024;
  
  unsigned long __meminitdata nr_kernel_pages;
  unsigned long __meminitdata nr_all_pages;
-static unsigned long __initdata dma_reserve;
+static unsigned long __meminitdata dma_reserve;
  
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
    /*
@@ -127,10 +126,10 @@ static unsigned long __initdata dma_reserve;
      #endif
    #endif
  
-  struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
-  int __initdata nr_nodemap_entries;
-  unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-  unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+  struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
+  int __meminitdata nr_nodemap_entries;
+  unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+  unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
  #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
    unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
    unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
@@ -157,10 +156,8 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
  
  static int page_is_consistent(struct zone *zone, struct page *page)
  {
-#ifdef CONFIG_HOLES_IN_ZONE
-       if (!pfn_valid(page_to_pfn(page)))
+       if (!pfn_valid_within(page_to_pfn(page)))
                 return 0;
-#endif
         if (zone != page_zone(page))
                 return 0;
  
@@ -228,7 +225,7 @@ static void bad_page(struct page *page)
  
  static void free_compound_page(struct page *page)
  {
-       __free_pages_ok(page, (unsigned long)page[1].lru.prev);
+       __free_pages_ok(page, compound_order(page));
  }
  
  static void prep_compound_page(struct page *page, unsigned long order)
@@ -236,13 +233,14 @@ static void prep_compound_page(struct page *page, unsigned long order)
         int i;
         int nr_pages = 1 << order;
  
-       page[1].lru.next = (void *)free_compound_page;  /* set dtor */
-       page[1].lru.prev = (void *)order;
-       for (i = 0; i < nr_pages; i++) {
+       set_compound_page_dtor(page, free_compound_page);
+       set_compound_order(page, order);
+       __SetPageHead(page);
+       for (i = 1; i < nr_pages; i++) {
                 struct page *p = page + i;
  
-               __SetPageCompound(p);
-               set_page_private(p, (unsigned long)page);
+               __SetPageTail(p);
+               p->first_page = page;
         }
  }
  
@@ -251,16 +249,19 @@ static void destroy_compound_page(struct page *page, unsigned long order)
         int i;
         int nr_pages = 1 << order;
  
-       if (unlikely((unsigned long)page[1].lru.prev != order))
+       if (unlikely(compound_order(page) != order))
                 bad_page(page);
  
-       for (i = 0; i < nr_pages; i++) {
+       if (unlikely(!PageHead(page)))
+                       bad_page(page);
+       __ClearPageHead(page);
+       for (i = 1; i < nr_pages; i++) {
                 struct page *p = page + i;
  
-               if (unlikely(!PageCompound(p) |
-                               (page_private(p) != (unsigned long)page)))
+               if (unlikely(!PageTail(p) |
+                               (p->first_page != page)))
                         bad_page(page);
-               __ClearPageCompound(p);
+               __ClearPageTail(p);
         }
  }
  
@@ -347,10 +348,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  static inline int page_is_buddy(struct page *page, struct page *buddy,
                                                                 int order)
  {
-#ifdef CONFIG_HOLES_IN_ZONE
-       if (!pfn_valid(page_to_pfn(buddy)))
+       if (!pfn_valid_within(page_to_pfn(buddy)))
                 return 0;
-#endif
  
         if (page_zone_id(page) != page_zone_id(buddy))
                 return 0;
@@ -400,7 +399,7 @@ static inline void __free_one_page(struct page *page,
         VM_BUG_ON(page_idx & (order_size - 1));
         VM_BUG_ON(bad_range(zone, page));
  
-       zone->free_pages += order_size;
+       __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
         while (order < MAX_ORDER-1) {
                 unsigned long combined_idx;
                 struct free_area *area;
@@ -434,13 +433,18 @@ static inline int free_pages_check(struct page *page)
                         1 << PG_private |
                         1 << PG_locked  |
                         1 << PG_active  |
-                       1 << PG_reclaim |
                         1 << PG_slab    |
                         1 << PG_swapcache |
                         1 << PG_writeback |
                         1 << PG_reserved |
                         1 << PG_buddy ))))
                 bad_page(page);
+       /*
+        * PageReclaim == PageTail. It is only an error
+        * for PageReclaim to be set if PageCompound is clear.
+        */
+       if (unlikely(!PageCompound(page) && PageReclaim(page)))
+               bad_page(page);
         if (PageDirty(page))
                 __ClearPageDirty(page);
         /*
@@ -485,7 +489,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order)
         spin_lock(&zone->lock);
         zone->all_unreclaimable = 0;
         zone->pages_scanned = 0;
-       __free_one_page(page, zone ,order);
+       __free_one_page(page, zone, order);
         spin_unlock(&zone->lock);
  }
  
@@ -601,9 +605,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
  
         page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
                         1 << PG_referenced | 1 << PG_arch_1 |
-                       1 << PG_checked | 1 << PG_mappedtodisk);
+                       1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
         set_page_private(page, 0);
         set_page_refcounted(page);
+
+       arch_alloc_page(page, order);
         kernel_map_pages(page, 1 << order, 1);
  
         if (gfp_flags & __GFP_ZERO)
@@ -634,7 +640,7 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
                 list_del(&page->lru);
                 rmv_page_order(page);
                 area->nr_free--;
-               zone->free_pages -= 1UL << order;
+               __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
                 expand(zone, page, order, current_order, area);
                 return page;
         }
@@ -663,43 +669,51 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
         return i;
  }
  
+#if MAX_NUMNODES > 1
+int nr_node_ids __read_mostly = MAX_NUMNODES;
+EXPORT_SYMBOL(nr_node_ids);
+
+/*
+ * Figure out the number of possible node ids.
+ */
+static void __init setup_nr_node_ids(void)
+{
+       unsigned int node;
+       unsigned int highest = 0;
+
+       for_each_node_mask(node, node_possible_map)
+               highest = node;
+       nr_node_ids = highest + 1;
+}
+#else
+static void __init setup_nr_node_ids(void) {}
+#endif
+
  #ifdef CONFIG_NUMA
  /*
- * Called from the slab reaper to drain pagesets on a particular node that
- * belongs to the currently executing processor.
+ * Called from the vmstat counter updater to drain pagesets of this
+ * currently executing processor on remote nodes after they have
+ * expired.
+ *
   * Note that this function must be called with the thread pinned to
   * a single processor.
   */
-void drain_node_pages(int nodeid)
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
  {
-       int i;
-       enum zone_type z;
         unsigned long flags;
+       int to_drain;
  
-       for (z = 0; z < MAX_NR_ZONES; z++) {
-               struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
-               struct per_cpu_pageset *pset;
-
-               if (!populated_zone(zone))
-                       continue;
-
-               pset = zone_pcp(zone, smp_processor_id());
-               for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
-                       struct per_cpu_pages *pcp;
-
-                       pcp = &pset->pcp[i];
-                       if (pcp->count) {
-                               local_irq_save(flags);
-                               free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-                               pcp->count = 0;
-                               local_irq_restore(flags);
-                       }
-               }
-       }
+       local_irq_save(flags);
+       if (pcp->count >= pcp->batch)
+               to_drain = pcp->batch;
+       else
+               to_drain = pcp->count;
+       free_pages_bulk(zone, to_drain, &pcp->list, 0);
+       pcp->count -= to_drain;
+       local_irq_restore(flags);
  }
  #endif
  
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
  static void __drain_pages(unsigned int cpu)
  {
         unsigned long flags;
@@ -709,6 +723,9 @@ static void __drain_pages(unsigned int cpu)
         for_each_zone(zone) {
                 struct per_cpu_pageset *pset;
  
+               if (!populated_zone(zone))
+                       continue;
+
                 pset = zone_pcp(zone, cpu);
                 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
                         struct per_cpu_pages *pcp;
@@ -721,7 +738,6 @@ static void __drain_pages(unsigned int cpu)
                 }
         }
  }
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
  
  #ifdef CONFIG_PM
  
@@ -742,8 +758,8 @@ void mark_free_pages(struct zone *zone)
                 if (pfn_valid(pfn)) {
                         struct page *page = pfn_to_page(pfn);
  
-                       if (!PageNosave(page))
-                               ClearPageNosaveFree(page);
+                       if (!swsusp_page_is_forbidden(page))
+                               swsusp_unset_page_free(page);
                 }
  
         for (order = MAX_ORDER - 1; order >= 0; --order)
@@ -752,7 +768,7 @@ void mark_free_pages(struct zone *zone)
  
                         pfn = page_to_pfn(list_entry(curr, struct page, lru));
                         for (i = 0; i < (1UL << order); i++)
-                               SetPageNosaveFree(pfn_to_page(pfn + i));
+                               swsusp_set_page_free(pfn_to_page(pfn + i));
                 }
  
         spin_unlock_irqrestore(&zone->lock, flags);
@@ -852,7 +868,7 @@ again:
                 pcp = &zone_pcp(zone, cpu)->pcp[cold];
                 local_irq_save(flags);
                 if (!pcp->count) {
-                       pcp->count += rmqueue_bulk(zone, 0,
+                       pcp->count = rmqueue_bulk(zone, 0,
                                                 pcp->batch, &pcp->list);
                         if (unlikely(!pcp->count))
                                 goto failed;
@@ -892,6 +908,91 @@ failed:
  #define ALLOC_HIGH             0x20 /* __GFP_HIGH set */
  #define ALLOC_CPUSET           0x40 /* check for correct cpuset */
  
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+
+static struct fail_page_alloc_attr {
+       struct fault_attr attr;
+
+       u32 ignore_gfp_highmem;
+       u32 ignore_gfp_wait;
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+       struct dentry *ignore_gfp_highmem_file;
+       struct dentry *ignore_gfp_wait_file;
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+} fail_page_alloc = {
+       .attr = FAULT_ATTR_INITIALIZER,
+       .ignore_gfp_wait = 1,
+       .ignore_gfp_highmem = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+       return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+       if (gfp_mask & __GFP_NOFAIL)
+               return 0;
+       if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+               return 0;
+       if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+               return 0;
+
+       return should_fail(&fail_page_alloc.attr, 1 << order);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+       mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+       struct dentry *dir;
+       int err;
+
+       err = init_fault_attr_dentries(&fail_page_alloc.attr,
+                                      "fail_page_alloc");
+       if (err)
+               return err;
+       dir = fail_page_alloc.attr.dentries.dir;
+
+       fail_page_alloc.ignore_gfp_wait_file =
+               debugfs_create_bool("ignore-gfp-wait", mode, dir,
+                                     &fail_page_alloc.ignore_gfp_wait);
+
+       fail_page_alloc.ignore_gfp_highmem_file =
+               debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+                                     &fail_page_alloc.ignore_gfp_highmem);
+
+       if (!fail_page_alloc.ignore_gfp_wait_file ||
+                       !fail_page_alloc.ignore_gfp_highmem_file) {
+               err = -ENOMEM;
+               debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
+               debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+               cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+       }
+
+       return err;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAIL_PAGE_ALLOC */
+
+static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+       return 0;
+}
+
+#endif /* CONFIG_FAIL_PAGE_ALLOC */
+
  /*
   * Return 1 if free pages are above 'mark'. This takes into account the order
   * of the allocation.
@@ -900,8 +1001,8 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                       int classzone_idx, int alloc_flags)
  {
         /* free_pages my go negative - that's OK */
-       unsigned long min = mark;
-       long free_pages = z->free_pages - (1 << order) + 1;
+       long min = mark;
+       long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
         int o;
  
         if (alloc_flags & ALLOC_HIGH)
@@ -924,31 +1025,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
         return 1;
  }
  
+#ifdef CONFIG_NUMA
  /*
- * get_page_from_freeliest goes through the zonelist trying to allocate
+ * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in last second) found to be nearly full.  See further
+ * comments in mmzone.h.  Reduces cache footprint of zonelist scans
+ * that have to skip over a lot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * task's mems_allowed, or node_online_map.)
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zap'd) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup, until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
+       nodemask_t *allowednodes;       /* zonelist_cache approximation */
+
+       zlc = zonelist->zlcache_ptr;
+       if (!zlc)
+               return NULL;
+
+       if (jiffies - zlc->last_full_zap > 1 * HZ) {
+               bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+               zlc->last_full_zap = jiffies;
+       }
+
+       allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+                                       &cpuset_current_mems_allowed :
+                                       &node_online_map;
+       return allowednodes;
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ *  1) Check that the zone isn't thought to be full (doesn't have its
+ *     bit set in the zonelist_cache fullzones BITMAP).
+ *  2) Check that the zone's node (obtained from the zonelist_cache
+ *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
+ * Return true (non-zero) if zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even if it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+                                               nodemask_t *allowednodes)
+{
+       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
+       int i;                          /* index of *z in zonelist zones */
+       int n;                          /* node that zone *z is on */
+
+       zlc = zonelist->zlcache_ptr;
+       if (!zlc)
+               return 1;
+
+       i = z - zonelist->zones;
+       n = zlc->z_to_n[i];
+
+       /* This zone is worth trying if it is allowed but not full */
+       return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
+
+/*
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
+       int i;                          /* index of *z in zonelist zones */
+
+       zlc = zonelist->zlcache_ptr;
+       if (!zlc)
+               return;
+
+       i = z - zonelist->zones;
+
+       set_bit(i, zlc->fullzones);
+}
+
+#else  /* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+       return NULL;
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+                               nodemask_t *allowednodes)
+{
+       return 1;
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+}
+#endif /* CONFIG_NUMA */
+
+/*
+ * get_page_from_freelist goes through the zonelist trying to allocate
   * a page.
   */
  static struct page *
  get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
                 struct zonelist *zonelist, int alloc_flags)
  {
-       struct zone **z = zonelist->zones;
+       struct zone **z;
         struct page *page = NULL;
-       int classzone_idx = zone_idx(*z);
+       int classzone_idx = zone_idx(zonelist->zones[0]);
         struct zone *zone;
+       nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+       int zlc_active = 0;             /* set if using zonelist_cache */
+       int did_zlc_setup = 0;          /* just call zlc_setup() one time */
  
+zonelist_scan:
         /*
-        * Go through the zonelist once, looking for a zone with enough free.
+        * Scan zonelist, looking for a zone with enough free.
          * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
          */
+       z = zonelist->zones;
+
         do {
+               if (NUMA_BUILD && zlc_active &&
+                       !zlc_zone_worth_trying(zonelist, z, allowednodes))
+                               continue;
                 zone = *z;
                 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
                         zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
                                 break;
                 if ((alloc_flags & ALLOC_CPUSET) &&
-                               !cpuset_zone_allowed(zone, gfp_mask))
-                       continue;
+                       !cpuset_zone_allowed_softwall(zone, gfp_mask))
+                               goto try_next_zone;
  
                 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
                         unsigned long mark;
@@ -958,18 +1188,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
                                 mark = zone->pages_low;
                         else
                                 mark = zone->pages_high;
-                       if (!zone_watermark_ok(zone , order, mark,
-                                   classzone_idx, alloc_flags))
+                       if (!zone_watermark_ok(zone, order, mark,
+                                   classzone_idx, alloc_flags)) {
                                 if (!zone_reclaim_mode ||
                                     !zone_reclaim(zone, gfp_mask, order))
-                                       continue;
+                                       goto this_zone_full;
+                       }
                 }
  
                 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
-               if (page) {
+               if (page)
                         break;
+this_zone_full:
+               if (NUMA_BUILD)
+                       zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+               if (NUMA_BUILD && !did_zlc_setup) {
+                       /* we do zlc_setup after the first zone is tried */
+                       allowednodes = zlc_setup(zonelist, alloc_flags);
+                       zlc_active = 1;
+                       did_zlc_setup = 1;
                 }
         } while (*(++z) != NULL);
+
+       if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+               /* Disable zlc cache for second zonelist scan */
+               zlc_active = 0;
+               goto zonelist_scan;
+       }
         return page;
  }
  
@@ -991,6 +1237,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
  
         might_sleep_if(wait);
  
+       if (should_fail_alloc_page(gfp_mask, order))
+               return NULL;
+
  restart:
         z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
  
@@ -1004,9 +1253,19 @@ restart:
         if (page)
                 goto got_pg;
  
-       do {
+       /*
+        * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
+        * __GFP_NOWARN set) should not cause reclaim since the subsystem
+        * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
+        * using a larger set of nodes after it has established that the
+        * allowed per node queues are empty and that nodes are
+        * over allocated.
+        */
+       if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+               goto nopage;
+
+       for (z = zonelist->zones; *z; z++)
                 wakeup_kswapd(*z, order);
-       } while (*(++z));
  
         /*
          * OK, we're below the kswapd watermark and have kicked background
@@ -1040,6 +1299,7 @@ restart:
  
         /* This allocation should allow future memory freeing. */
  
+rebalance:
         if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
                         && !in_interrupt()) {
                 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
@@ -1050,7 +1310,7 @@ nofail_alloc:
                         if (page)
                                 goto got_pg;
                         if (gfp_mask & __GFP_NOFAIL) {
-                               blk_congestion_wait(WRITE, HZ/50);
+                               congestion_wait(WRITE, HZ/50);
                                 goto nofail_alloc;
                         }
                 }
@@ -1061,7 +1321,6 @@ nofail_alloc:
         if (!wait)
                 goto nopage;
  
-rebalance:
         cond_resched();
  
         /* We now go into synchronous reclaim */
@@ -1113,7 +1372,7 @@ rebalance:
                         do_retry = 1;
         }
         if (do_retry) {
-               blk_congestion_wait(WRITE, HZ/50);
+               congestion_wait(WRITE, HZ/50);
                 goto rebalance;
         }
  
@@ -1193,35 +1452,6 @@ fastcall void free_pages(unsigned long addr, unsigned int order)
  
  EXPORT_SYMBOL(free_pages);
  
-/*
- * Total amount of free (allocatable) RAM:
- */
-unsigned int nr_free_pages(void)
-{
-       unsigned int sum = 0;
-       struct zone *zone;
-
-       for_each_zone(zone)
-               sum += zone->free_pages;
-
-       return sum;
-}
-
-EXPORT_SYMBOL(nr_free_pages);
-
-#ifdef CONFIG_NUMA
-unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
-{
-       unsigned int sum = 0;
-       enum zone_type i;
-
-       for (i = 0; i < MAX_NR_ZONES; i++)
-               sum += pgdat->node_zones[i].free_pages;
-
-       return sum;
-}
-#endif
-
  static unsigned int nr_free_zone_pages(int offset)
  {
         /* Just pick one node, since fallback list is circular */
@@ -1261,14 +1491,14 @@ unsigned int nr_free_pagecache_pages(void)
  static inline void show_node(struct zone *zone)
  {
         if (NUMA_BUILD)
-               printk("Node %ld ", zone_to_nid(zone));
+               printk("Node %d ", zone_to_nid(zone));
  }
  
  void si_meminfo(struct sysinfo *val)
  {
         val->totalram = totalram_pages;
         val->sharedram = 0;
-       val->freeram = nr_free_pages();
+       val->freeram = global_page_state(NR_FREE_PAGES);
         val->bufferram = nr_blockdev_pages();
         val->totalhigh = totalhigh_pages;
         val->freehigh = nr_free_highpages();
@@ -1283,10 +1513,11 @@ void si_meminfo_node(struct sysinfo *val, int nid)
         pg_data_t *pgdat = NODE_DATA(nid);
  
         val->totalram = pgdat->node_present_pages;
-       val->freeram = nr_free_pages_pgdat(pgdat);
+       val->freeram = node_page_state(nid, NR_FREE_PAGES);
  #ifdef CONFIG_HIGHMEM
         val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
-       val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+       val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
+                       NR_FREE_PAGES);
  #else
         val->totalhigh = 0;
         val->freehigh = 0;
@@ -1305,9 +1536,6 @@ void si_meminfo_node(struct sysinfo *val, int nid)
  void show_free_areas(void)
  {
         int cpu;
-       unsigned long active;
-       unsigned long inactive;
-       unsigned long free;
         struct zone *zone;
  
         for_each_zone(zone) {
@@ -1331,20 +1559,19 @@ void show_free_areas(void)
                 }
         }
  
-       get_zone_counts(&active, &inactive, &free);
-
-       printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
-               "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
-               active,
-               inactive,
+       printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+               " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
+               global_page_state(NR_ACTIVE),
+               global_page_state(NR_INACTIVE),
                 global_page_state(NR_FILE_DIRTY),
                 global_page_state(NR_WRITEBACK),
                 global_page_state(NR_UNSTABLE_NFS),
-               nr_free_pages(),
+               global_page_state(NR_FREE_PAGES),
                 global_page_state(NR_SLAB_RECLAIMABLE) +
                         global_page_state(NR_SLAB_UNRECLAIMABLE),
                 global_page_state(NR_FILE_MAPPED),
-               global_page_state(NR_PAGETABLE));
+               global_page_state(NR_PAGETABLE),
+               global_page_state(NR_BOUNCE));
  
         for_each_zone(zone) {
                 int i;
@@ -1365,12 +1592,12 @@ void show_free_areas(void)
                         " all_unreclaimable? %s"
                         "\n",
                         zone->name,
-                       K(zone->free_pages),
+                       K(zone_page_state(zone, NR_FREE_PAGES)),
                         K(zone->pages_min),
                         K(zone->pages_low),
                         K(zone->pages_high),
-                       K(zone->nr_active),
-                       K(zone->nr_inactive),
+                       K(zone_page_state(zone, NR_ACTIVE)),
+                       K(zone_page_state(zone, NR_INACTIVE)),
                         K(zone->present_pages),
                         zone->pages_scanned,
                         (zone->all_unreclaimable ? "yes" : "no")
@@ -1541,6 +1768,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
         }
  }
  
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+       int i;
+
+       for (i = 0; i < MAX_NR_ZONES; i++) {
+               struct zonelist *zonelist;
+               struct zonelist_cache *zlc;
+               struct zone **z;
+
+               zonelist = pgdat->node_zonelists + i;
+               zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+               bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+               for (z = zonelist->zones; *z; z++)
+                       zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
+       }
+}
+
  #else  /* CONFIG_NUMA */
  
  static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1578,14 +1823,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
         }
  }
  
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+       int i;
+
+       for (i = 0; i < MAX_NR_ZONES; i++)
+               pgdat->node_zonelists[i].zlcache_ptr = NULL;
+}
+
  #endif /* CONFIG_NUMA */
  
  /* return values int ....just for stop_machine_run() */
  static int __meminit __build_all_zonelists(void *dummy)
  {
         int nid;
-       for_each_online_node(nid)
+
+       for_each_online_node(nid) {
                 build_zonelists(NODE_DATA(nid));
+               build_zonelist_cache(NODE_DATA(nid));
+       }
         return 0;
  }
  
@@ -1679,15 +1936,24 @@ static inline unsigned long wait_table_bits(unsigned long size)
   * done. Non-atomic initialization, single-pass.
   */
  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
-               unsigned long start_pfn)
+               unsigned long start_pfn, enum memmap_context context)
  {
         struct page *page;
         unsigned long end_pfn = start_pfn + size;
         unsigned long pfn;
  
         for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-               if (!early_pfn_valid(pfn))
-                       continue;
+               /*
+                * There can be holes in boot-time mem_map[]s
+                * handed to this function.  They do not
+                * exist on hotplugged memory.
+                */
+               if (context == MEMMAP_EARLY) {
+                       if (!early_pfn_valid(pfn))
+                               continue;
+                       if (!early_pfn_in_nid(pfn, nid))
+                               continue;
+               }
                 page = pfn_to_page(pfn);
                 set_page_links(page, zone, nid, pfn);
                 init_page_count(page);
@@ -1712,23 +1978,9 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
         }
  }
  
-#define ZONETABLE_INDEX(x, zone_nr)    ((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
-               unsigned long pfn, unsigned long size)
-{
-       unsigned long snum = pfn_to_section_nr(pfn);
-       unsigned long end = pfn_to_section_nr(pfn + size);
-
-       if (FLAGS_HAS_NODE)
-               zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
-       else
-               for (; snum <= end; snum++)
-                       zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
  #ifndef __HAVE_ARCH_MEMMAP_INIT
  #define memmap_init(size, nid, zone, start_pfn) \
-       memmap_init_zone((size), (nid), (zone), (start_pfn))
+       memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
  #endif
  
  static int __cpuinit zone_batchsize(struct zone *zone)
@@ -1878,16 +2130,19 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
         int ret = NOTIFY_OK;
  
         switch (action) {
-               case CPU_UP_PREPARE:
-                       if (process_zones(cpu))
-                               ret = NOTIFY_BAD;
-                       break;
-               case CPU_UP_CANCELED:
-               case CPU_DEAD:
-                       free_zone_pagesets(cpu);
-                       break;
-               default:
-                       break;
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
+               if (process_zones(cpu))
+                       ret = NOTIFY_BAD;
+               break;
+       case CPU_UP_CANCELED:
+       case CPU_UP_CANCELED_FROZEN:
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+               free_zone_pagesets(cpu);
+               break;
+       default:
+               break;
         }
         return ret;
  }
@@ -1910,7 +2165,7 @@ void __init setup_per_cpu_pageset(void)
  
  #endif
  
-static __meminit
+static noinline __init_refok
  int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
  {
         int i;
@@ -1974,7 +2229,8 @@ static __meminit void zone_pcp_init(struct zone *zone)
  
  __meminit int init_currently_empty_zone(struct zone *zone,
                                         unsigned long zone_start_pfn,
-                                       unsigned long size)
+                                       unsigned long size,
+                                       enum memmap_context context)
  {
         struct pglist_data *pgdat = zone->zone_pgdat;
         int ret;
@@ -1997,7 +2253,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
   * Basic iterator support. Return the first range of PFNs for a node
   * Note: nid == MAX_NUMNODES returns first region regardless of node
   */
-static int __init first_active_region_index_in_nid(int nid)
+static int __meminit first_active_region_index_in_nid(int nid)
  {
         int i;
  
@@ -2012,7 +2268,7 @@ static int __init first_active_region_index_in_nid(int nid)
   * Basic iterator support. Return the next active range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns next region regardles of node
+ * Note: nid == MAX_NUMNODES returns next region regardless of node
   */
-static int __init next_active_region_index_in_nid(int index, int nid)
+static int __meminit next_active_region_index_in_nid(int index, int nid)
  {
         for (index = index + 1; index < nr_nodemap_entries; index++)
                 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
@@ -2028,7 +2284,7 @@ static int __init next_active_region_index_in_nid(int index, int nid)
   * was used and there are no special requirements, this is a convenient
   * alternative
   */
-int __init early_pfn_to_nid(unsigned long pfn)
+int __meminit early_pfn_to_nid(unsigned long pfn)
  {
         int i;
  
@@ -2165,7 +2421,7 @@ static void __init account_node_boundary(unsigned int nid,
   * with no available memory, a warning is printed and the start and end
   * PFNs will be 0.
   */
-void __init get_pfn_range_for_nid(unsigned int nid,
+void __meminit get_pfn_range_for_nid(unsigned int nid,
                         unsigned long *start_pfn, unsigned long *end_pfn)
  {
         int i;
@@ -2190,7 +2446,7 @@ void __init get_pfn_range_for_nid(unsigned int nid,
   * Return the number of pages a zone spans in a node, including holes
   * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
   */
-unsigned long __init zone_spanned_pages_in_node(int nid,
+unsigned long __meminit zone_spanned_pages_in_node(int nid,
                                         unsigned long zone_type,
                                         unsigned long *ignored)
  {
@@ -2218,7 +2474,7 @@ unsigned long __init zone_spanned_pages_in_node(int nid,
   * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
   * then all holes in the requested range will be accounted for.
   */
-unsigned long __init __absent_pages_in_range(int nid,
+unsigned long __meminit __absent_pages_in_range(int nid,
                                 unsigned long range_start_pfn,
                                 unsigned long range_end_pfn)
  {
@@ -2258,7 +2514,7 @@ unsigned long __init __absent_pages_in_range(int nid,
  
         /* Account for ranges past physical memory on this node */
         if (range_end_pfn > prev_end_pfn)
-               hole_pages = range_end_pfn -
+               hole_pages += range_end_pfn -
                                 max(range_start_pfn, prev_end_pfn);
  
         return hole_pages;
@@ -2278,7 +2534,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
  }
  
  /* Return the number of page frames in holes in a zone on a node */
-unsigned long __init zone_absent_pages_in_node(int nid,
+unsigned long __meminit zone_absent_pages_in_node(int nid,
                                         unsigned long zone_type,
                                         unsigned long *ignored)
  {
@@ -2314,7 +2570,7 @@ static inline unsigned long zone_absent_pages_in_node(int nid,
  
  #endif
  
-static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
                 unsigned long *zones_size, unsigned long *zholes_size)
  {
         unsigned long realtotalpages, totalpages = 0;
@@ -2378,11 +2634,11 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
                                 "  %s zone: %lu pages exceeds realsize %lu\n",
                                 zone_names[j], memmap_pages, realsize);
  
-               /* Account for reserved DMA pages */
-               if (j == ZONE_DMA && realsize > dma_reserve) {
+               /* Account for reserved pages */
+               if (j == 0 && realsize > dma_reserve) {
                         realsize -= dma_reserve;
-                       printk(KERN_DEBUG "  DMA zone: %lu pages reserved\n",
-                                                               dma_reserve);
+                       printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
+                                       zone_names[0], dma_reserve);
                 }
  
                 if (!is_highmem_idx(j))
@@ -2402,30 +2658,27 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
                 spin_lock_init(&zone->lru_lock);
                 zone_seqlock_init(zone);
                 zone->zone_pgdat = pgdat;
-               zone->free_pages = 0;
  
-               zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+               zone->prev_priority = DEF_PRIORITY;
  
                 zone_pcp_init(zone);
                 INIT_LIST_HEAD(&zone->active_list);
                 INIT_LIST_HEAD(&zone->inactive_list);
                 zone->nr_scan_active = 0;
                 zone->nr_scan_inactive = 0;
-               zone->nr_active = 0;
-               zone->nr_inactive = 0;
                 zap_zone_vm_stats(zone);
                 atomic_set(&zone->reclaim_in_progress, 0);
                 if (!size)
                         continue;
  
-               zonetable_add(zone, nid, j, zone_start_pfn, size);
-               ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+               ret = init_currently_empty_zone(zone, zone_start_pfn,
+                                               size, MEMMAP_EARLY);
                 BUG_ON(ret);
                 zone_start_pfn += size;
         }
  }
  
-static void __init alloc_node_mem_map(struct pglist_data *pgdat)
+static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
  {
         /* Skip empty nodes */
         if (!pgdat->node_spanned_pages)
@@ -2604,17 +2857,23 @@ static void __init sort_node_map(void)
                         cmp_node_active_region, NULL);
  }
  
-/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
+/* Find the lowest pfn for a node */
  unsigned long __init find_min_pfn_for_node(unsigned long nid)
  {
         int i;
+       unsigned long min_pfn = ULONG_MAX;
  
-       /* Assuming a sorted map, the first range found has the starting pfn */
+       /* Take the lowest start_pfn over all of the node's active ranges */
         for_each_active_range_index_in_nid(i, nid)
-               return early_node_map[i].start_pfn;
+               min_pfn = min(min_pfn, early_node_map[i].start_pfn);
  
-       printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
-       return 0;
+       if (min_pfn == ULONG_MAX) {
+               printk(KERN_WARNING
+                       "Could not find start_pfn for node %lu\n", nid);
+               return 0;
+       }
+
+       return min_pfn;
  }
  
  /**
@@ -2663,6 +2922,9 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
         unsigned long nid;
         enum zone_type i;
  
+       /* Sort early_node_map as initialisation assumes it is sorted */
+       sort_node_map();
+
         /* Record where the zone boundaries are */
         memset(arch_zone_lowest_possible_pfn, 0,
                                 sizeof(arch_zone_lowest_possible_pfn));
@@ -2677,9 +2939,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
                         max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
         }
  
-       /* Regions in the early_node_map can be in any order */
-       sort_node_map();
-
         /* Print out the zone ranges */
         printk("Zone PFN ranges:\n");
         for (i = 0; i < MAX_NR_ZONES; i++)
@@ -2696,6 +2955,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
                                                 early_node_map[i].end_pfn);
  
         /* Initialise every node */
+       setup_nr_node_ids();
         for_each_online_node(nid) {
                 pg_data_t *pgdat = NODE_DATA(nid);
                 free_area_init_node(nid, pgdat, NULL,
@@ -2733,13 +2993,12 @@ void __init free_area_init(unsigned long *zones_size)
                         __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
  }
  
-#ifdef CONFIG_HOTPLUG_CPU
  static int page_alloc_cpu_notify(struct notifier_block *self,
                                  unsigned long action, void *hcpu)
  {
         int cpu = (unsigned long)hcpu;
  
-       if (action == CPU_DEAD) {
+       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
                 local_irq_disable();
                 __drain_pages(cpu);
                 vm_events_fold_cpu(cpu);
@@ -2748,7 +3007,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
         }
         return NOTIFY_OK;
  }
-#endif /* CONFIG_HOTPLUG_CPU */
  
  void __init page_alloc_init(void)
  {
@@ -2936,7 +3194,8 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
         struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
  {
         proc_dointvec(table, write, file, buffer, length, ppos);
-       setup_per_zone_pages_min();
+       if (write)
+               setup_per_zone_pages_min();
         return 0;
  }
  
@@ -3052,7 +3311,7 @@ void *__init alloc_large_system_hash(const char *tablename,
         /* allow the kernel cmdline to have a say */
         if (!numentries) {
                 /* round applicable memory size up to nearest megabyte */
-               numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
+               numentries = nr_kernel_pages;
                 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
                 numentries >>= 20 - PAGE_SHIFT;
                 numentries <<= 20 - PAGE_SHIFT;
@@ -3062,6 +3321,10 @@ void *__init alloc_large_system_hash(const char *tablename,
                         numentries >>= (scale - PAGE_SHIFT);
                 else
                         numentries <<= (PAGE_SHIFT - scale);
+
+               /* Make sure we've got at least a 0-order allocation. */
+               if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+                       numentries = PAGE_SIZE / bucketsize;
         }
         numentries = roundup_pow_of_two(numentries);
  
@@ -3074,7 +3337,7 @@ void *__init alloc_large_system_hash(const char *tablename,
         if (numentries > max)
                 numentries = max;
  
-       log2qty = long_log2(numentries);
+       log2qty = ilog2(numentries);
  
         do {
                 size = bucketsize << log2qty;
@@ -3096,7 +3359,7 @@ void *__init alloc_large_system_hash(const char *tablename,
         printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
                tablename,
                (1U << log2qty),
-              long_log2(size) - PAGE_SHIFT,
+              ilog2(size) - PAGE_SHIFT,
                size);
  
         if (_hash_shift)
@@ -3119,3 +3382,5 @@ unsigned long page_to_pfn(struct page *page)
  EXPORT_SYMBOL(pfn_to_page);
  EXPORT_SYMBOL(page_to_pfn);
  #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
+
+