nommu: clamp zone_batchsize() to 0 under NOMMU conditions

[safe/jmp/linux-2.6] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index d0a240f..fe753ec 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,7 +69,7 @@ EXPORT_SYMBOL(node_states);
  
  unsigned long totalram_pages __read_mostly;
  unsigned long totalreserve_pages __read_mostly;
-long nr_swap_pages;
+unsigned long highest_memmap_pfn __read_mostly;
  int percpu_pagelist_fraction;
  
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -223,19 +223,41 @@ static inline int bad_range(struct zone *zone, struct page *page)
  
  static void bad_page(struct page *page)
  {
-       printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
-               "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
-               current->comm, page, (int)(2*sizeof(unsigned long)),
-               (unsigned long)page->flags, page->mapping,
-               page_mapcount(page), page_count(page));
+       static unsigned long resume;
+       static unsigned long nr_shown;
+       static unsigned long nr_unshown;
+
+       /*
+        * Allow a burst of 60 reports, then keep quiet for that minute;
+        * or allow a steady drip of one report per second.
+        */
+       if (nr_shown == 60) {
+               if (time_before(jiffies, resume)) {
+                       nr_unshown++;
+                       goto out;
+               }
+               if (nr_unshown) {
+                       printk(KERN_ALERT
+                             "BUG: Bad page state: %lu messages suppressed\n",
+                               nr_unshown);
+                       nr_unshown = 0;
+               }
+               nr_shown = 0;
+       }
+       if (nr_shown++ == 0)
+               resume = jiffies + 60 * HZ;
+
+       printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",
+               current->comm, page_to_pfn(page));
+       printk(KERN_ALERT
+               "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
+               page, (void *)page->flags, page_count(page),
+               page_mapcount(page), page->mapping, page->index);
  
-       printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
-               KERN_EMERG "Backtrace:\n");
         dump_stack();
-       page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD;
-       set_page_count(page, 0);
-       reset_page_mapcount(page);
-       page->mapping = NULL;
+out:
+       /* Leave bad fields for debug, except PageBuddy could make trouble */
+       __ClearPageBuddy(page);
         add_taint(TAINT_BAD_PAGE);
  }
  
@@ -263,40 +285,60 @@ void prep_compound_page(struct page *page, unsigned long order)
  {
         int i;
         int nr_pages = 1 << order;
-       struct page *p = page + 1;
  
         set_compound_page_dtor(page, free_compound_page);
         set_compound_order(page, order);
         __SetPageHead(page);
-       for (i = 1; i < nr_pages; i++, p++) {
-               if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
-                       p = pfn_to_page(page_to_pfn(page) + i);
+       for (i = 1; i < nr_pages; i++) {
+               struct page *p = page + i;
+
                 __SetPageTail(p);
                 p->first_page = page;
         }
  }
  
-static void destroy_compound_page(struct page *page, unsigned long order)
+#ifdef CONFIG_HUGETLBFS
+void prep_compound_gigantic_page(struct page *page, unsigned long order)
  {
         int i;
         int nr_pages = 1 << order;
         struct page *p = page + 1;
  
-       if (unlikely(compound_order(page) != order))
+       set_compound_page_dtor(page, free_compound_page);
+       set_compound_order(page, order);
+       __SetPageHead(page);
+       for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+               __SetPageTail(p);
+               p->first_page = page;
+       }
+}
+#endif
+
+static int destroy_compound_page(struct page *page, unsigned long order)
+{
+       int i;
+       int nr_pages = 1 << order;
+       int bad = 0;
+
+       if (unlikely(compound_order(page) != order) ||
+           unlikely(!PageHead(page))) {
                 bad_page(page);
+               bad++;
+       }
  
-       if (unlikely(!PageHead(page)))
-                       bad_page(page);
         __ClearPageHead(page);
-       for (i = 1; i < nr_pages; i++, p++) {
-               if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
-                       p = pfn_to_page(page_to_pfn(page) + i);
  
-               if (unlikely(!PageTail(p) |
-                               (p->first_page != page)))
+       for (i = 1; i < nr_pages; i++) {
+               struct page *p = page + i;
+
+               if (unlikely(!PageTail(p) || (p->first_page != page))) {
                         bad_page(page);
+                       bad++;
+               }
                 __ClearPageTail(p);
         }
+
+       return bad;
  }
  
  static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
@@ -416,7 +458,8 @@ static inline void __free_one_page(struct page *page,
         int migratetype = get_pageblock_migratetype(page);
  
         if (unlikely(PageCompound(page)))
-               destroy_compound_page(page, order);
+               if (unlikely(destroy_compound_page(page, order)))
+                       return;
  
         page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
  
@@ -453,18 +496,13 @@ static inline int free_pages_check(struct page *page)
         if (unlikely(page_mapcount(page) |
                 (page->mapping != NULL)  |
                 (page_count(page) != 0)  |
-               (page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
+               (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
                 bad_page(page);
-       if (PageDirty(page))
-               __ClearPageDirty(page);
-       if (PageSwapBacked(page))
-               __ClearPageSwapBacked(page);
-       /*
-        * For now, we report if PG_reserved was found set, but do not
-        * clear it, and do not free the page.  But we shall soon need
-        * to do more, for when the ZERO_PAGE count wraps negative.
-        */
-       return PageReserved(page);
+               return 1;
+       }
+       if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
+               page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+       return 0;
  }
  
  /*
@@ -509,11 +547,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
  {
         unsigned long flags;
         int i;
-       int reserved = 0;
+       int bad = 0;
  
         for (i = 0 ; i < (1 << order) ; ++i)
-               reserved += free_pages_check(page + i);
-       if (reserved)
+               bad += free_pages_check(page + i);
+       if (bad)
                 return;
  
         if (!PageHighMem(page)) {
@@ -598,23 +636,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
         if (unlikely(page_mapcount(page) |
                 (page->mapping != NULL)  |
                 (page_count(page) != 0)  |
-               (page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
+               (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
                 bad_page(page);
-
-       /*
-        * For now, we report if PG_reserved was found set, but do not
-        * clear it, and do not allocate the page: as a safety net.
-        */
-       if (PageReserved(page))
                 return 1;
+       }
  
-       page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
-                       1 << PG_referenced | 1 << PG_arch_1 |
-                       1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
-#ifdef CONFIG_UNEVICTABLE_LRU
-                       | 1 << PG_mlocked
-#endif
-                       );
         set_page_private(page, 0);
         set_page_refcounted(page);
  
@@ -896,13 +922,10 @@ static void drain_pages(unsigned int cpu)
         unsigned long flags;
         struct zone *zone;
  
-       for_each_zone(zone) {
+       for_each_populated_zone(zone) {
                 struct per_cpu_pageset *pset;
                 struct per_cpu_pages *pcp;
  
-               if (!populated_zone(zone))
-                       continue;
-
                 pset = zone_pcp(zone, cpu);
  
                 pcp = &pset->pcp;
@@ -1453,6 +1476,8 @@ __alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
         unsigned long did_some_progress;
         unsigned long pages_reclaimed = 0;
  
+       lockdep_trace_alloc(gfp_mask);
+
         might_sleep_if(wait);
  
         if (should_fail_alloc_page(gfp_mask, order))
@@ -1547,13 +1572,21 @@ nofail_alloc:
  
         /* We now go into synchronous reclaim */
         cpuset_memory_pressure_bump();
+       /*
+        * The task's cpuset might have expanded its set of allowable nodes
+        */
+       cpuset_update_task_memory_state();
         p->flags |= PF_MEMALLOC;
+
+       lockdep_set_current_reclaim_state(gfp_mask);
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
  
-       did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
+       did_some_progress = try_to_free_pages(zonelist, order,
+                                               gfp_mask, nodemask);
  
         p->reclaim_state = NULL;
+       lockdep_clear_current_reclaim_state();
         p->flags &= ~PF_MEMALLOC;
  
         cond_resched();
@@ -1844,10 +1877,7 @@ void show_free_areas(void)
         int cpu;
         struct zone *zone;
  
-       for_each_zone(zone) {
-               if (!populated_zone(zone))
-                       continue;
-
+       for_each_populated_zone(zone) {
                 show_node(zone);
                 printk("%s per-cpu:\n", zone->name);
  
@@ -1887,12 +1917,9 @@ void show_free_areas(void)
                 global_page_state(NR_PAGETABLE),
                 global_page_state(NR_BOUNCE));
  
-       for_each_zone(zone) {
+       for_each_populated_zone(zone) {
                 int i;
  
-               if (!populated_zone(zone))
-                       continue;
-
                 show_node(zone);
                 printk("%s"
                         " free:%lukB"
@@ -1932,12 +1959,9 @@ void show_free_areas(void)
                 printk("\n");
         }
  
-       for_each_zone(zone) {
+       for_each_populated_zone(zone) {
                 unsigned long nr[MAX_ORDER], flags, order, total = 0;
  
-               if (!populated_zone(zone))
-                       continue;
-
                 show_node(zone);
                 printk("%s: ", zone->name);
  
@@ -2104,7 +2128,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
         int n, val;
         int min_val = INT_MAX;
         int best_node = -1;
-       node_to_cpumask_ptr(tmp, 0);
+       const struct cpumask *tmp = cpumask_of_node(0);
  
         /* Use the local node if we haven't already */
         if (!node_isset(node, *used_node_mask)) {
@@ -2125,8 +2149,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
                 val += (n < node);
  
                 /* Give preference to headless and unused nodes */
-               node_to_cpumask_ptr_next(tmp, n);
-               if (!cpus_empty(*tmp))
+               tmp = cpumask_of_node(n);
+               if (!cpumask_empty(tmp))
                         val += PENALTY_FOR_NODE_WITH_CPUS;
  
                 /* Slight preference for less loaded node */
@@ -2591,6 +2615,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
         unsigned long pfn;
         struct zone *z;
  
+       if (highest_memmap_pfn < end_pfn - 1)
+               highest_memmap_pfn = end_pfn - 1;
+
         z = &NODE_DATA(nid)->node_zones[zone];
         for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                 /*
@@ -2654,6 +2681,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
  
  static int zone_batchsize(struct zone *zone)
  {
+#ifdef CONFIG_MMU
         int batch;
  
         /*
@@ -2679,9 +2707,26 @@ static int zone_batchsize(struct zone *zone)
          * of pages of one half of the possible page colors
          * and the other with pages of the other colors.
          */
-       batch = (1 << (fls(batch + batch/2)-1)) - 1;
+       batch = rounddown_pow_of_two(batch + batch/2) - 1;
  
         return batch;
+
+#else
+       /* The deferral and batching of frees should be suppressed under NOMMU
+        * conditions.
+        *
+        * The problem is that NOMMU needs to be able to allocate large chunks
+        * of contiguous memory as there's no hardware page translation to
+        * assemble apparently contiguous memory from discontiguous pages.
+        *
+        * Queueing large contiguous runs of pages for batching, however,
+        * causes the pages to actually be freed in smaller chunks.  As there
+        * can be a significant delay between the individual batches being
+        * recycled, this leads to the once large chunks of space being
+        * fragmented and becoming unavailable for high-order allocations.
+        */
+       return 0;
+#endif
  }
  
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
@@ -2746,11 +2791,7 @@ static int __cpuinit process_zones(int cpu)
  
         node_set_state(node, N_CPU);    /* this node has a cpu */
  
-       for_each_zone(zone) {
-
-               if (!populated_zone(zone))
-                       continue;
-
+       for_each_populated_zone(zone) {
                 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
                                          GFP_KERNEL, node);
                 if (!zone_pcp(zone, cpu))
@@ -2956,7 +2997,7 @@ static int __meminit next_active_region_index_in_nid(int index, int nid)
   * was used and there are no special requirements, this is a convenient
   * alternative
   */
-int __meminit early_pfn_to_nid(unsigned long pfn)
+int __meminit __early_pfn_to_nid(unsigned long pfn)
  {
         int i;
  
@@ -2967,10 +3008,33 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
                 if (start_pfn <= pfn && pfn < end_pfn)
                         return early_node_map[i].nid;
         }
+       /* This is a memory hole */
+       return -1;
+}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
  
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+       int nid;
+
+       nid = __early_pfn_to_nid(pfn);
+       if (nid >= 0)
+               return nid;
+       /* No node claims this pfn (e.g. a memory hole): fall back to node 0 */
         return 0;
  }
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+       int nid;
+
+       nid = __early_pfn_to_nid(pfn);
+       if (nid >= 0 && nid != node)
+               return false;
+       return true;
+}
+#endif
  
  /* Basic iterator support to walk early_node_map[] */
  #define for_each_active_range_index_in_nid(i, nid) \
@@ -3363,10 +3427,8 @@ static void __init setup_usemap(struct pglist_data *pgdat,
  {
         unsigned long usemapsize = usemap_size(zonesize);
         zone->pageblock_flags = NULL;
-       if (usemapsize) {
+       if (usemapsize)
                 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
-               memset(zone->pageblock_flags, 0, usemapsize);
-       }
  }
  #else
  static void inline setup_usemap(struct pglist_data *pgdat,
@@ -3451,9 +3513,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                         PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
                 if (realsize >= memmap_pages) {
                         realsize -= memmap_pages;
-                       printk(KERN_DEBUG
-                               "  %s zone: %lu pages used for memmap\n",
-                               zone_names[j], memmap_pages);
+                       if (memmap_pages)
+                               printk(KERN_DEBUG
+                                      "  %s zone: %lu pages used for memmap\n",
+                                      zone_names[j], memmap_pages);
                 } else
                         printk(KERN_WARNING
                                 "  %s zone: %lu pages exceeds realsize %lu\n",
@@ -3491,10 +3554,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                         INIT_LIST_HEAD(&zone->lru[l].list);
                         zone->lru[l].nr_scan = 0;
                 }
-               zone->recent_rotated[0] = 0;
-               zone->recent_rotated[1] = 0;
-               zone->recent_scanned[0] = 0;
-               zone->recent_scanned[1] = 0;
+               zone->reclaim_stat.recent_rotated[0] = 0;
+               zone->reclaim_stat.recent_rotated[1] = 0;
+               zone->reclaim_stat.recent_scanned[0] = 0;
+               zone->reclaim_stat.recent_scanned[1] = 0;
                 zap_zone_vm_stats(zone);
                 zone->flags = 0;
                 if (!size)
@@ -4298,7 +4361,7 @@ void setup_per_zone_pages_min(void)
   *    1TB     101        10GB
   *   10TB     320        32GB
   */
-void setup_per_zone_inactive_ratio(void)
+static void setup_per_zone_inactive_ratio(void)
  {
         struct zone *zone;
  
@@ -4555,19 +4618,6 @@ void *__init alloc_large_system_hash(const char *tablename,
         return table;
  }
  
-#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
-struct page *pfn_to_page(unsigned long pfn)
-{
-       return __pfn_to_page(pfn);
-}
-unsigned long page_to_pfn(struct page *page)
-{
-       return __page_to_pfn(page);
-}
-EXPORT_SYMBOL(pfn_to_page);
-EXPORT_SYMBOL(page_to_pfn);
-#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
-
  /* Return a pointer to the bitmap storing bits affecting a block of pages */
  static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
                                                         unsigned long pfn)