cpuset,mm: update tasks' mems_allowed in time

[safe/jmp/linux-2.6] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 22b15a4..7cc3179 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -46,6 +46,7 @@
  #include <linux/page-isolation.h>
  #include <linux/page_cgroup.h>
  #include <linux/debugobjects.h>
+#include <linux/kmemleak.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
@@ -149,10 +150,6 @@ static unsigned long __meminitdata dma_reserve;
    static int __meminitdata nr_nodemap_entries;
    static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
    static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-  static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
-  static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
    static unsigned long __initdata required_kernelcore;
    static unsigned long __initdata required_movablecore;
    static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
@@ -331,7 +328,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
         for (i = 1; i < nr_pages; i++) {
                 struct page *p = page + i;
  
-               if (unlikely(!PageTail(p) | (p->first_page != page))) {
+               if (unlikely(!PageTail(p) || (p->first_page != page))) {
                         bad_page(page);
                         bad++;
                 }
@@ -922,13 +919,10 @@ static void drain_pages(unsigned int cpu)
         unsigned long flags;
         struct zone *zone;
  
-       for_each_zone(zone) {
+       for_each_populated_zone(zone) {
                 struct per_cpu_pageset *pset;
                 struct per_cpu_pages *pcp;
  
-               if (!populated_zone(zone))
-                       continue;
-
                 pset = zone_pcp(zone, cpu);
  
                 pcp = &pset->pcp;
@@ -1575,17 +1569,15 @@ nofail_alloc:
  
         /* We now go into synchronous reclaim */
         cpuset_memory_pressure_bump();
-       /*
-        * The task's cpuset might have expanded its set of allowable nodes
-        */
-       cpuset_update_task_memory_state();
+
         p->flags |= PF_MEMALLOC;
  
         lockdep_set_current_reclaim_state(gfp_mask);
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
  
-       did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
+       did_some_progress = try_to_free_pages(zonelist, order,
+                                               gfp_mask, nodemask);
  
         p->reclaim_state = NULL;
         lockdep_clear_current_reclaim_state();
@@ -1879,10 +1871,7 @@ void show_free_areas(void)
         int cpu;
         struct zone *zone;
  
-       for_each_zone(zone) {
-               if (!populated_zone(zone))
-                       continue;
-
+       for_each_populated_zone(zone) {
                 show_node(zone);
                 printk("%s per-cpu:\n", zone->name);
  
@@ -1922,12 +1911,9 @@ void show_free_areas(void)
                 global_page_state(NR_PAGETABLE),
                 global_page_state(NR_BOUNCE));
  
-       for_each_zone(zone) {
+       for_each_populated_zone(zone) {
                 int i;
  
-               if (!populated_zone(zone))
-                       continue;
-
                 show_node(zone);
                 printk("%s"
                         " free:%lukB"
@@ -1967,12 +1953,9 @@ void show_free_areas(void)
                 printk("\n");
         }
  
-       for_each_zone(zone) {
+       for_each_populated_zone(zone) {
                 unsigned long nr[MAX_ORDER], flags, order, total = 0;
  
-               if (!populated_zone(zone))
-                       continue;
-
                 show_node(zone);
                 printk("%s: ", zone->name);
  
@@ -2139,7 +2122,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
         int n, val;
         int min_val = INT_MAX;
         int best_node = -1;
-       node_to_cpumask_ptr(tmp, 0);
+       const struct cpumask *tmp = cpumask_of_node(0);
  
         /* Use the local node if we haven't already */
         if (!node_isset(node, *used_node_mask)) {
@@ -2160,8 +2143,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
                 val += (n < node);
  
                 /* Give preference to headless and unused nodes */
-               node_to_cpumask_ptr_next(tmp, n);
-               if (!cpus_empty(*tmp))
+               tmp = cpumask_of_node(n);
+               if (!cpumask_empty(tmp))
                         val += PENALTY_FOR_NODE_WITH_CPUS;
  
                 /* Slight preference for less loaded node */
@@ -2692,6 +2675,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
  
  static int zone_batchsize(struct zone *zone)
  {
+#ifdef CONFIG_MMU
         int batch;
  
         /*
@@ -2717,9 +2701,26 @@ static int zone_batchsize(struct zone *zone)
          * of pages of one half of the possible page colors
          * and the other with pages of the other colors.
          */
-       batch = (1 << (fls(batch + batch/2)-1)) - 1;
+       batch = rounddown_pow_of_two(batch + batch/2) - 1;
  
         return batch;
+
+#else
+       /* The deferral and batching of frees should be suppressed under NOMMU
+        * conditions.
+        *
+        * The problem is that NOMMU needs to be able to allocate large chunks
+        * of contiguous memory as there's no hardware page translation to
+        * assemble apparently contiguous memory from discontiguous pages.
+        *
+        * Queueing large contiguous runs of pages for batching, however,
+        * causes the pages to actually be freed in smaller chunks.  As there
+        * can be a significant delay between the individual batches being
+        * recycled, this leads to the once large chunks of space being
+        * fragmented and becoming unavailable for high-order allocations.
+        */
+       return 0;
+#endif
  }
  
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
@@ -2784,11 +2785,7 @@ static int __cpuinit process_zones(int cpu)
  
         node_set_state(node, N_CPU);    /* this node has a cpu */
  
-       for_each_zone(zone) {
-
-               if (!populated_zone(zone))
-                       continue;
-
+       for_each_populated_zone(zone) {
                 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
                                          GFP_KERNEL, node);
                 if (!zone_pcp(zone, cpu))
@@ -2994,7 +2991,7 @@ static int __meminit next_active_region_index_in_nid(int index, int nid)
   * was used and there are no special requirements, this is a convenient
   * alternative
   */
-int __meminit early_pfn_to_nid(unsigned long pfn)
+int __meminit __early_pfn_to_nid(unsigned long pfn)
  {
         int i;
  
@@ -3005,10 +3002,33 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
                 if (start_pfn <= pfn && pfn < end_pfn)
                         return early_node_map[i].nid;
         }
+       /* This is a memory hole */
+       return -1;
+}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
  
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+       int nid;
+
+       nid = __early_pfn_to_nid(pfn);
+       if (nid >= 0)
+               return nid;
+       /* no node found for this pfn (e.g. a memory hole): use node 0 */
         return 0;
  }
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+       int nid;
+
+       nid = __early_pfn_to_nid(pfn);
+       if (nid >= 0 && nid != node)
+               return false;
+       return true;
+}
+#endif
  
  /* Basic iterator support to walk early_node_map[] */
  #define for_each_active_range_index_in_nid(i, nid) \
@@ -3077,64 +3097,6 @@ void __init sparse_memory_present_with_active_regions(int nid)
  }
  
  /**
- * push_node_boundaries - Push node boundaries to at least the requested boundary
- * @nid: The nid of the node to push the boundary for
- * @start_pfn: The start pfn of the node
- * @end_pfn: The end pfn of the node
- *
- * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
- * time. Specifically, on x86_64, SRAT will report ranges that can potentially
- * be hotplugged even though no physical memory exists. This function allows
- * an arch to push out the node boundaries so mem_map is allocated that can
- * be used later.
- */
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-void __init push_node_boundaries(unsigned int nid,
-               unsigned long start_pfn, unsigned long end_pfn)
-{
-       mminit_dprintk(MMINIT_TRACE, "zoneboundary",
-                       "Entering push_node_boundaries(%u, %lu, %lu)\n",
-                       nid, start_pfn, end_pfn);
-
-       /* Initialise the boundary for this node if necessary */
-       if (node_boundary_end_pfn[nid] == 0)
-               node_boundary_start_pfn[nid] = -1UL;
-
-       /* Update the boundaries */
-       if (node_boundary_start_pfn[nid] > start_pfn)
-               node_boundary_start_pfn[nid] = start_pfn;
-       if (node_boundary_end_pfn[nid] < end_pfn)
-               node_boundary_end_pfn[nid] = end_pfn;
-}
-
-/* If necessary, push the node boundary out for reserve hotadd */
-static void __meminit account_node_boundary(unsigned int nid,
-               unsigned long *start_pfn, unsigned long *end_pfn)
-{
-       mminit_dprintk(MMINIT_TRACE, "zoneboundary",
-                       "Entering account_node_boundary(%u, %lu, %lu)\n",
-                       nid, *start_pfn, *end_pfn);
-
-       /* Return if boundary information has not been provided */
-       if (node_boundary_end_pfn[nid] == 0)
-               return;
-
-       /* Check the boundaries and update if necessary */
-       if (node_boundary_start_pfn[nid] < *start_pfn)
-               *start_pfn = node_boundary_start_pfn[nid];
-       if (node_boundary_end_pfn[nid] > *end_pfn)
-               *end_pfn = node_boundary_end_pfn[nid];
-}
-#else
-void __init push_node_boundaries(unsigned int nid,
-               unsigned long start_pfn, unsigned long end_pfn) {}
-
-static void __meminit account_node_boundary(unsigned int nid,
-               unsigned long *start_pfn, unsigned long *end_pfn) {}
-#endif
-
-
-/**
   * get_pfn_range_for_nid - Return the start and end page frames for a node
   * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
   * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
@@ -3159,9 +3121,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
  
         if (*start_pfn == -1UL)
                 *start_pfn = 0;
-
-       /* Push the node boundaries out if requested */
-       account_node_boundary(nid, start_pfn, end_pfn);
  }
  
  /*
@@ -3767,10 +3726,6 @@ void __init remove_all_active_ranges(void)
  {
         memset(early_node_map, 0, sizeof(early_node_map));
         nr_nodemap_entries = 0;
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-       memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
-       memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
  }
  
  /* Compare two active node_active_regions */
@@ -4589,6 +4544,16 @@ void *__init alloc_large_system_hash(const char *tablename,
         if (_hash_mask)
                 *_hash_mask = (1 << log2qty) - 1;
  
+       /*
+        * If hashdist is set, the table allocation is done with __vmalloc()
+        * which invokes the kmemleak_alloc() callback. This function may also
+        * be called before the slab and kmemleak are initialised when
+        * kmemleak simply buffers the request to be executed later
+        * (GFP_ATOMIC flag ignored in this case).
+        */
+       if (!hashdist)
+               kmemleak_alloc(table, size, 1, GFP_ATOMIC);
+
         return table;
  }