[PATCH] slab debug and ARCH_SLAB_MINALIGN don't get along

[safe/jmp/linux-2.6] / mm / mempolicy.c
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index 67af4ce..fb90723 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,6 +86,9 @@
  #include <linux/swap.h>
  #include <linux/seq_file.h>
  #include <linux/proc_fs.h>
+#include <linux/migrate.h>
+#include <linux/rmap.h>
+#include <linux/security.h>
  
  #include <asm/tlbflush.h>
  #include <asm/uaccess.h>
@@ -95,17 +98,14 @@
  #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)         /* Invert check for nodemask */
  #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)          /* Gather statistics */
  
-/* The number of pages to migrate per call to migrate_pages() */
-#define MIGRATE_CHUNK_SIZE 256
-
-static kmem_cache_t *policy_cache;
-static kmem_cache_t *sn_cache;
+static struct kmem_cache *policy_cache;
+static struct kmem_cache *sn_cache;
  
  #define PDprintk(fmt...)
  
  /* Highest zone. An specific allocation for a zone below that is not
     policied. */
-int policy_zone = ZONE_DMA;
+enum zone_type policy_zone = ZONE_DMA;
  
  struct mempolicy default_policy = {
         .refcnt = ATOMIC_INIT(1), /* never free it */
@@ -137,23 +137,30 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
  static struct zonelist *bind_zonelist(nodemask_t *nodes)
  {
         struct zonelist *zl;
-       int num, max, nd, k;
+       int num, max, nd;
+       enum zone_type k;
  
         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+       max++;                  /* space for zlcache_ptr (see mmzone.h) */
         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
         if (!zl)
                 return NULL;
+       zl->zlcache_ptr = NULL;
         num = 0;
         /* First put in the highest zones from all nodes, then all the next 
            lower zones etc. Avoid empty zones because the memory allocator
            doesn't like them. If you implement node hot removal you
            have to fix that. */
-       for (k = policy_zone; k >= 0; k--) { 
+       k = policy_zone;
+       while (1) {
                 for_each_node_mask(nd, *nodes) { 
                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
                         if (z->present_pages > 0) 
                                 zl->zones[num++] = z;
                 }
+               if (k == 0)
+                       break;
+               k--;
         }
         zl->zones[num] = NULL;
         return zl;
@@ -197,7 +204,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
         return policy;
  }
  
-static void gather_stats(struct page *, void *);
+static void gather_stats(struct page *, void *, int pte_dirty);
  static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                 unsigned long flags);
  
@@ -239,7 +246,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                         continue;
  
                 if (flags & MPOL_MF_STATS)
-                       gather_stats(page, private);
+                       gather_stats(page, private, pte_dirty(*pte));
                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                         migrate_page_add(page, private, flags);
                 else
@@ -330,9 +337,12 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
         int err;
         struct vm_area_struct *first, *vma, *prev;
  
-       /* Clear the LRU lists so pages can be isolated */
-       if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-               lru_add_drain_all();
+       if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+
+               err = migrate_prep();
+               if (err)
+                       return ERR_PTR(err);
+       }
  
         first = find_vma(mm, start);
         if (!first)
@@ -421,6 +431,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
         return mpol_check_policy(mode, nodes);
  }
  
+
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy.  Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+       if (p->mempolicy)
+               p->flags |= PF_MEMPOLICY;
+       else
+               p->flags &= ~PF_MEMPOLICY;
+}
+
+static void mpol_set_task_struct_flag(void)
+{
+       mpol_fix_fork_child_flag(current);
+}
+
  /* Set the process memory policy */
  long do_set_mempolicy(int mode, nodemask_t *nodes)
  {
@@ -433,6 +474,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
                 return PTR_ERR(new);
         mpol_free(current->mempolicy);
         current->mempolicy = new;
+       mpol_set_task_struct_flag();
         if (new && new->policy == MPOL_INTERLEAVE)
                 current->il_next = first_node(new->v.nodes);
         return 0;
@@ -447,7 +489,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
         switch (p->policy) {
         case MPOL_BIND:
                 for (i = 0; p->v.zonelist->zones[i]; i++)
-                       node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+                       node_set(zone_to_nid(p->v.zonelist->zones[i]),
                                 *nodes);
                 break;
         case MPOL_DEFAULT:
@@ -540,92 +582,23 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
         return err;
  }
  
+#ifdef CONFIG_MIGRATION
  /*
   * page migration
   */
-
  static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                 unsigned long flags)
  {
         /*
          * Avoid migrating a page that is shared with others.
          */
-       if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
-               if (isolate_lru_page(page))
-                       list_add_tail(&page->lru, pagelist);
-       }
+       if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
+               isolate_lru_page(page, pagelist);
  }
  
-/*
- * Migrate the list 'pagelist' of pages to a certain destination.
- *
- * Specify destination with either non-NULL vma or dest_node >= 0
- * Return the number of pages not migrated or error code
- */
-static int migrate_pages_to(struct list_head *pagelist,
-                       struct vm_area_struct *vma, int dest)
+static struct page *new_node_page(struct page *page, unsigned long node, int **x)
  {
-       LIST_HEAD(newlist);
-       LIST_HEAD(moved);
-       LIST_HEAD(failed);
-       int err = 0;
-       unsigned long offset = 0;
-       int nr_pages;
-       struct page *page;
-       struct list_head *p;
-
-redo:
-       nr_pages = 0;
-       list_for_each(p, pagelist) {
-               if (vma) {
-                       /*
-                        * The address passed to alloc_page_vma is used to
-                        * generate the proper interleave behavior. We fake
-                        * the address here by an increasing offset in order
-                        * to get the proper distribution of pages.
-                        *
-                        * No decision has been made as to which page
-                        * a certain old page is moved to so we cannot
-                        * specify the correct address.
-                        */
-                       page = alloc_page_vma(GFP_HIGHUSER, vma,
-                                       offset + vma->vm_start);
-                       offset += PAGE_SIZE;
-               }
-               else
-                       page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
-
-               if (!page) {
-                       err = -ENOMEM;
-                       goto out;
-               }
-               list_add_tail(&page->lru, &newlist);
-               nr_pages++;
-               if (nr_pages > MIGRATE_CHUNK_SIZE)
-                       break;
-       }
-       err = migrate_pages(pagelist, &newlist, &moved, &failed);
-
-       putback_lru_pages(&moved);      /* Call release pages instead ?? */
-
-       if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
-               goto redo;
-out:
-       /* Return leftover allocated pages */
-       while (!list_empty(&newlist)) {
-               page = list_entry(newlist.next, struct page, lru);
-               list_del(&page->lru);
-               __free_page(page);
-       }
-       list_splice(&failed, pagelist);
-       if (err < 0)
-               return err;
-
-       /* Calculate number of leftover pages */
-       nr_pages = 0;
-       list_for_each(p, pagelist)
-               nr_pages++;
-       return nr_pages;
+       return alloc_pages_node(node, GFP_HIGHUSER, 0);
  }
  
  /*
@@ -644,11 +617,9 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
  
-       if (!list_empty(&pagelist)) {
-               err = migrate_pages_to(&pagelist, NULL, dest);
-               if (!list_empty(&pagelist))
-                       putback_lru_pages(&pagelist);
-       }
+       if (!list_empty(&pagelist))
+               err = migrate_pages(&pagelist, new_node_page, dest);
+
         return err;
  }
  
@@ -668,6 +639,10 @@ int do_migrate_pages(struct mm_struct *mm,
  
         down_read(&mm->mmap_sem);
  
+       err = migrate_vmas(mm, from_nodes, to_nodes, flags);
+       if (err)
+               goto out;
+
  /*
   * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
   * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
@@ -727,12 +702,38 @@ int do_migrate_pages(struct mm_struct *mm,
                 if (err < 0)
                         break;
         }
-
+out:
         up_read(&mm->mmap_sem);
         if (err < 0)
                 return err;
         return busy;
+
+}
+
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+{
+       struct vm_area_struct *vma = (struct vm_area_struct *)private;
+
+       return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
  }
+#else
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+                               unsigned long flags)
+{
+}
+
+int do_migrate_pages(struct mm_struct *mm,
+       const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+       return -ENOSYS;
+}
+
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+{
+       return NULL;
+}
+#endif
  
  long do_mbind(unsigned long start, unsigned long len,
                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
@@ -748,7 +749,7 @@ long do_mbind(unsigned long start, unsigned long len,
                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
             || mode > MPOL_MAX)
                 return -EINVAL;
-       if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+       if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
                 return -EPERM;
  
         if (start & ~PAGE_MASK)
@@ -793,13 +794,12 @@ long do_mbind(unsigned long start, unsigned long len,
                 err = mbind_range(vma, start, end, new);
  
                 if (!list_empty(&pagelist))
-                       nr_failed = migrate_pages_to(&pagelist, vma, -1);
+                       nr_failed = migrate_pages(&pagelist, new_vma_page,
+                                               (unsigned long)vma);
  
                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
                         err = -EIO;
         }
-       if (!list_empty(&pagelist))
-               putback_lru_pages(&pagelist);
  
         up_write(&mm->mmap_sem);
         mpol_free(new);
@@ -937,24 +937,29 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
         /*
          * Check if this process has the right to modify the specified
          * process. The right exists if the process has administrative
-        * capabilities, superuser priviledges or the same
+        * capabilities, superuser privileges or the same
          * userid as the target process.
          */
         if ((current->euid != task->suid) && (current->euid != task->uid) &&
             (current->uid != task->suid) && (current->uid != task->uid) &&
-           !capable(CAP_SYS_ADMIN)) {
+           !capable(CAP_SYS_NICE)) {
                 err = -EPERM;
                 goto out;
         }
  
         task_nodes = cpuset_mems_allowed(task);
         /* Is the user allowed to access the target nodes? */
-       if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+       if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
                 err = -EPERM;
                 goto out;
         }
  
-       err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+       err = security_task_movememory(task);
+       if (err)
+               goto out;
+
+       err = do_migrate_pages(mm, &old, &new,
+               capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
  out:
         mmput(mm);
         return err;
@@ -1133,7 +1138,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
   */
  unsigned slab_node(struct mempolicy *policy)
  {
-       switch (policy->policy) {
+       int pol = policy ? policy->policy : MPOL_DEFAULT;
+
+       switch (pol) {
         case MPOL_INTERLEAVE:
                 return interleave_nodes(policy);
  
@@ -1142,7 +1149,7 @@ unsigned slab_node(struct mempolicy *policy)
                  * Follow bind policy behavior and start allocation at the
                  * first node.
                  */
-               return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+               return zone_to_nid(policy->v.zonelist->zones[0]);
  
         case MPOL_PREFERRED:
                 if (policy->v.preferred_node >= 0)
@@ -1178,7 +1185,15 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
         if (vma) {
                 unsigned long off;
  
-               off = vma->vm_pgoff;
+               /*
+                * For small pages, shift equals PAGE_SHIFT, so the
+                * right shift below is by zero bits and changes nothing.
+                * For huge pages, vm_pgoff is still counted in small
+                * pages, so we shift off the always-zero low bits to
+                * get an offset in units of huge pages.
+                */
+               BUG_ON(shift < PAGE_SHIFT);
+               off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
                 off += (addr - vma->vm_start) >> shift;
                 return offset_il_node(pol, vma, off);
         } else
@@ -1211,10 +1226,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  
         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
         page = __alloc_pages(gfp, order, zl);
-       if (page && page_zone(page) == zl->zones[0]) {
-               zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
-               put_cpu();
-       }
+       if (page && page_zone(page) == zl->zones[0])
+               inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
         return page;
  }
  
@@ -1281,7 +1294,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
  
         if ((gfp & __GFP_WAIT) && !in_interrupt())
                 cpuset_update_task_memory_state();
-       if (!pol || in_interrupt())
+       if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
                 pol = &default_policy;
         if (pol->policy == MPOL_INTERLEAVE)
                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
@@ -1313,12 +1326,11 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
         atomic_set(&new->refcnt, 1);
         if (new->policy == MPOL_BIND) {
                 int sz = ksize(old->v.zonelist);
-               new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
+               new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL);
                 if (!new->v.zonelist) {
                         kmem_cache_free(policy_cache, new);
                         return ERR_PTR(-ENOMEM);
                 }
-               memcpy(new->v.zonelist, old->v.zonelist, sz);
         }
         return new;
  }
@@ -1640,7 +1652,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
  
                 nodes_clear(nodes);
                 for (z = pol->v.zonelist->zones; *z; z++)
-                       node_set((*z)->zone_pgdat->node_id, nodes);
+                       node_set(zone_to_nid(*z), nodes);
                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
                 nodes = tmp;
  
@@ -1752,70 +1764,148 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
  struct numa_maps {
         unsigned long pages;
         unsigned long anon;
-       unsigned long mapped;
+       unsigned long active;
+       unsigned long writeback;
         unsigned long mapcount_max;
+       unsigned long dirty;
+       unsigned long swapcache;
         unsigned long node[MAX_NUMNODES];
  };
  
-static void gather_stats(struct page *page, void *private)
+static void gather_stats(struct page *page, void *private, int pte_dirty)
  {
         struct numa_maps *md = private;
         int count = page_mapcount(page);
  
-       if (count)
-               md->mapped++;
+       md->pages++;
+       if (pte_dirty || PageDirty(page))
+               md->dirty++;
  
-       if (count > md->mapcount_max)
-               md->mapcount_max = count;
+       if (PageSwapCache(page))
+               md->swapcache++;
  
-       md->pages++;
+       if (PageActive(page))
+               md->active++;
+
+       if (PageWriteback(page))
+               md->writeback++;
  
         if (PageAnon(page))
                 md->anon++;
  
+       if (count > md->mapcount_max)
+               md->mapcount_max = count;
+
         md->node[page_to_nid(page)]++;
-       cond_resched();
  }
  
+#ifdef CONFIG_HUGETLB_PAGE
+static void check_huge_range(struct vm_area_struct *vma,
+               unsigned long start, unsigned long end,
+               struct numa_maps *md)
+{
+       unsigned long addr;
+       struct page *page;
+
+       for (addr = start; addr < end; addr += HPAGE_SIZE) {
+               pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+               pte_t pte;
+
+               if (!ptep)
+                       continue;
+
+               pte = *ptep;
+               if (pte_none(pte))
+                       continue;
+
+               page = pte_page(pte);
+               if (!page)
+                       continue;
+
+               gather_stats(page, md, pte_dirty(*ptep));
+       }
+}
+#else
+static inline void check_huge_range(struct vm_area_struct *vma,
+               unsigned long start, unsigned long end,
+               struct numa_maps *md)
+{
+}
+#endif
+
  int show_numa_map(struct seq_file *m, void *v)
  {
-       struct task_struct *task = m->private;
+       struct proc_maps_private *priv = m->private;
         struct vm_area_struct *vma = v;
         struct numa_maps *md;
+       struct file *file = vma->vm_file;
+       struct mm_struct *mm = vma->vm_mm;
         int n;
         char buffer[50];
  
-       if (!vma->vm_mm)
+       if (!mm)
                 return 0;
  
         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
         if (!md)
                 return 0;
  
-       check_pgd_range(vma, vma->vm_start, vma->vm_end,
-                   &node_online_map, MPOL_MF_STATS, md);
+       mpol_to_str(buffer, sizeof(buffer),
+                           get_vma_policy(priv->task, vma, vma->vm_start));
  
-       if (md->pages) {
-               mpol_to_str(buffer, sizeof(buffer),
-                           get_vma_policy(task, vma, vma->vm_start));
+       seq_printf(m, "%08lx %s", vma->vm_start, buffer);
  
-               seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
-                          vma->vm_start, buffer, md->pages,
-                          md->mapped, md->mapcount_max);
+       if (file) {
+               seq_printf(m, " file=");
+               seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+       } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+               seq_printf(m, " heap");
+       } else if (vma->vm_start <= mm->start_stack &&
+                       vma->vm_end >= mm->start_stack) {
+               seq_printf(m, " stack");
+       }
  
-               if (md->anon)
-                       seq_printf(m," anon=%lu",md->anon);
+       if (is_vm_hugetlb_page(vma)) {
+               check_huge_range(vma, vma->vm_start, vma->vm_end, md);
+               seq_printf(m, " huge");
+       } else {
+               check_pgd_range(vma, vma->vm_start, vma->vm_end,
+                               &node_online_map, MPOL_MF_STATS, md);
+       }
  
-               for_each_online_node(n)
-                       if (md->node[n])
-                               seq_printf(m, " N%d=%lu", n, md->node[n]);
+       if (!md->pages)
+               goto out;
  
-               seq_putc(m, '\n');
-       }
+       if (md->anon)
+               seq_printf(m," anon=%lu",md->anon);
+
+       if (md->dirty)
+               seq_printf(m," dirty=%lu",md->dirty);
+
+       if (md->pages != md->anon && md->pages != md->dirty)
+               seq_printf(m, " mapped=%lu", md->pages);
+
+       if (md->mapcount_max > 1)
+               seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+       if (md->swapcache)
+               seq_printf(m," swapcache=%lu", md->swapcache);
+
+       if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+               seq_printf(m," active=%lu", md->active);
+
+       if (md->writeback)
+               seq_printf(m," writeback=%lu", md->writeback);
+
+       for_each_online_node(n)
+               if (md->node[n])
+                       seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+       seq_putc(m, '\n');
         kfree(md);
  
         if (m->count < m->size)
-               m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+               m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
         return 0;
  }