[ARM] 3454/1: ARM: OMAP: 6/8 Update framebuffer low-level init code, take 2

[safe/jmp/linux-2.6] / mm / mempolicy.c
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index 44b9d69..dec8249 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,6 +86,7 @@
  #include <linux/swap.h>
  #include <linux/seq_file.h>
  #include <linux/proc_fs.h>
+#include <linux/migrate.h>
  
  #include <asm/tlbflush.h>
  #include <asm/uaccess.h>
@@ -95,8 +96,8 @@
  #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)         /* Invert check for nodemask */
  #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)          /* Gather statistics */
  
-static kmem_cache_t *policy_cache;
-static kmem_cache_t *sn_cache;
+static struct kmem_cache *policy_cache;
+static struct kmem_cache *sn_cache;
  
  #define PDprintk(fmt...)
  
@@ -129,19 +130,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
         }
         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
  }
+
  /* Generate a custom zonelist for the BIND policy. */
  static struct zonelist *bind_zonelist(nodemask_t *nodes)
  {
         struct zonelist *zl;
-       int num, max, nd;
+       int num, max, nd, k;
  
         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-       zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+       zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
         if (!zl)
                 return NULL;
         num = 0;
-       for_each_node_mask(nd, *nodes)
-               zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+       /* First put in the highest zones from all nodes, then all the next 
+          lower zones etc. Avoid empty zones because the memory allocator
+          doesn't like them. If you implement node hot removal you
+          have to fix that. */
+       for (k = policy_zone; k >= 0; k--) { 
+               for_each_node_mask(nd, *nodes) { 
+                       struct zone *z = &NODE_DATA(nd)->node_zones[k];
+                       if (z->present_pages > 0) 
+                               zl->zones[num++] = z;
+               }
+       }
         zl->zones[num] = NULL;
         return zl;
  }
@@ -180,58 +191,13 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
                 break;
         }
         policy->policy = mode;
+       policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
         return policy;
  }
  
-/* Check if we are the only process mapping the page in question */
-static inline int single_mm_mapping(struct mm_struct *mm,
-                       struct address_space *mapping)
-{
-       struct vm_area_struct *vma;
-       struct prio_tree_iter iter;
-       int rc = 1;
-
-       spin_lock(&mapping->i_mmap_lock);
-       vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
-               if (mm != vma->vm_mm) {
-                       rc = 0;
-                       goto out;
-               }
-       list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
-               if (mm != vma->vm_mm) {
-                       rc = 0;
-                       goto out;
-               }
-out:
-       spin_unlock(&mapping->i_mmap_lock);
-       return rc;
-}
-
-/*
- * Add a page to be migrated to the pagelist
- */
-static void migrate_page_add(struct vm_area_struct *vma,
-       struct page *page, struct list_head *pagelist, unsigned long flags)
-{
-       /*
-        * Avoid migrating a page that is shared by others and not writable.
-        */
-       if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
-           mapping_writably_mapped(page->mapping) ||
-           single_mm_mapping(vma->vm_mm, page->mapping)) {
-               int rc = isolate_lru_page(page);
-
-               if (rc == 1)
-                       list_add(&page->lru, pagelist);
-               /*
-                * If the isolate attempt was not successful then we just
-                * encountered an unswappable page. Something must be wrong.
-                */
-               WARN_ON(rc == 0);
-       }
-}
-
-static void gather_stats(struct page *, void *);
+static void gather_stats(struct page *, void *, int pte_dirty);
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+                               unsigned long flags);
  
  /* Scan through pages checking if pages follow certain conditions. */
  static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -253,14 +219,27 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                 page = vm_normal_page(vma, addr, *pte);
                 if (!page)
                         continue;
+               /*
+                * The check for PageReserved here is important to avoid
+                * handling zero pages and other pages that may have been
+                * marked special by the system.
+                *
+                * If the PageReserved would not be checked here then f.e.
+                * the location of the zero page could have an influence
+                * on MPOL_MF_STRICT, zero pages would be counted for
+                * the per node stats, and there would be useless attempts
+                * to put zero pages on the migration list.
+                */
+               if (PageReserved(page))
+                       continue;
                 nid = page_to_nid(page);
                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
                         continue;
  
                 if (flags & MPOL_MF_STATS)
-                       gather_stats(page, private);
+                       gather_stats(page, private, pte_dirty(*pte));
                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-                       migrate_page_add(vma, page, private, flags);
+                       migrate_page_add(page, private, flags);
                 else
                         break;
         } while (pte++, addr += PAGE_SIZE, addr != end);
@@ -332,7 +311,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
  static inline int vma_migratable(struct vm_area_struct *vma)
  {
         if (vma->vm_flags & (
-               VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
+               VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
                 return 0;
         return 1;
  }
@@ -349,6 +328,13 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
         int err;
         struct vm_area_struct *first, *vma, *prev;
  
+       if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+
+               err = migrate_prep();
+               if (err)
+                       return ERR_PTR(err);
+       }
+
         first = find_vma(mm, start);
         if (!first)
                 return ERR_PTR(-EFAULT);
@@ -430,95 +416,41 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
         if (!nodes)
                 return 0;
  
-       /* Update current mems_allowed */
-       cpuset_update_current_mems_allowed();
-       /* Ignore nodes not set in current->mems_allowed */
-       cpuset_restrict_to_mems_allowed(nodes->bits);
+       cpuset_update_task_memory_state();
+       if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
+               return -EINVAL;
         return mpol_check_policy(mode, nodes);
  }
  
-static int swap_pages(struct list_head *pagelist)
-{
-       LIST_HEAD(moved);
-       LIST_HEAD(failed);
-       int n;
  
-       n = migrate_pages(pagelist, NULL, &moved, &failed);
-       putback_lru_pages(&failed);
-       putback_lru_pages(&moved);
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy.  Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
  
-       return n;
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+       /* No policy attached: make sure the fast-path hint is off. */
+       if (!p->mempolicy) {
+               p->flags &= ~PF_MEMPOLICY;
+               return;
+       }
+
+       /* A policy is attached: advertise it through PF_MEMPOLICY. */
+       p->flags |= PF_MEMPOLICY;
  }
  
-long do_mbind(unsigned long start, unsigned long len,
-               unsigned long mode, nodemask_t *nmask, unsigned long flags)
+static void mpol_set_task_struct_flag(void)
  {
-       struct vm_area_struct *vma;
-       struct mm_struct *mm = current->mm;
-       struct mempolicy *new;
-       unsigned long end;
-       int err;
-       LIST_HEAD(pagelist);
-
-       if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
-                                     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-           || mode > MPOL_MAX)
-               return -EINVAL;
-       if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
-               return -EPERM;
-
-       if (start & ~PAGE_MASK)
-               return -EINVAL;
-
-       if (mode == MPOL_DEFAULT)
-               flags &= ~MPOL_MF_STRICT;
-
-       len = (len + PAGE_SIZE - 1) & PAGE_MASK;
-       end = start + len;
-
-       if (end < start)
-               return -EINVAL;
-       if (end == start)
-               return 0;
-
-       if (mpol_check_policy(mode, nmask))
-               return -EINVAL;
-
-       new = mpol_new(mode, nmask);
-       if (IS_ERR(new))
-               return PTR_ERR(new);
-
-       /*
-        * If we are using the default policy then operation
-        * on discontinuous address spaces is okay after all
-        */
-       if (!new)
-               flags |= MPOL_MF_DISCONTIG_OK;
-
-       PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
-                       mode,nodes_addr(nodes)[0]);
-
-       down_write(&mm->mmap_sem);
-       vma = check_range(mm, start, end, nmask,
-                         flags | MPOL_MF_INVERT, &pagelist);
-
-       err = PTR_ERR(vma);
-       if (!IS_ERR(vma)) {
-               int nr_failed = 0;
-
-               err = mbind_range(vma, start, end, new);
-               if (!list_empty(&pagelist))
-                       nr_failed = swap_pages(&pagelist);
-
-               if (!err && nr_failed && (flags & MPOL_MF_STRICT))
-                       err = -EIO;
-       }
-       if (!list_empty(&pagelist))
-               putback_lru_pages(&pagelist);
-
-       up_write(&mm->mmap_sem);
-       mpol_free(new);
-       return err;
+       mpol_fix_fork_child_flag(current);
  }
  
  /* Set the process memory policy */
@@ -533,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
                 return PTR_ERR(new);
         mpol_free(current->mempolicy);
         current->mempolicy = new;
+       mpol_set_task_struct_flag();
         if (new && new->policy == MPOL_INTERLEAVE)
                 current->il_next = first_node(new->v.nodes);
         return 0;
@@ -589,7 +522,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
         struct vm_area_struct *vma = NULL;
         struct mempolicy *pol = current->mempolicy;
  
-       cpuset_update_current_mems_allowed();
+       cpuset_update_task_memory_state();
         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
                 return -EINVAL;
         if (flags & MPOL_F_ADDR) {
@@ -640,11 +573,47 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
         return err;
  }
  
+#ifdef CONFIG_MIGRATION
+/*
+ * page migration
+ */
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+                               unsigned long flags)
+{
+       /*
+        * Without MPOL_MF_MOVE_ALL only pages whose map count is exactly
+        * one are queued; any other page is left where it is.
+        */
+       if (!(flags & MPOL_MF_MOVE_ALL) && page_mapcount(page) != 1)
+               return;
+
+       isolate_lru_page(page, pagelist);
+}
+
+/*
+ * Migrate pages from one node to a target node.
+ *
+ * check_range() is run over [mm->mmap->vm_start, TASK_SIZE) with
+ * MPOL_MF_DISCONTIG_OK added to @flags and a node mask holding only
+ * @source; the pages it queues are handed to migrate_pages_to() with
+ * @dest as the target node.
+ *
+ * Returns a negative errno when check_range() itself fails (for example
+ * because migrate_prep() reported an error), otherwise the result of
+ * migrate_pages_to() (error or the number of pages not migrated), or 0
+ * if no page was queued.  Pages left on the list are put back on the LRU.
+ *
+ * NOTE(review): mm->mmap is dereferenced unchecked - presumably callers
+ * only pass an mm that has at least one vma; confirm.
+ */
+int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
+{
+       struct vm_area_struct *vma;
+       nodemask_t nmask;
+       LIST_HEAD(pagelist);
+       int err = 0;
+
+       nodes_clear(nmask);
+       node_set(source, nmask);
+
+       vma = check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+                       flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+       if (IS_ERR(vma)) {
+               /* Do not report success when the scan itself failed. */
+               err = PTR_ERR(vma);
+       } else if (!list_empty(&pagelist)) {
+               err = migrate_pages_to(&pagelist, NULL, dest);
+       }
+
+       /* Whatever is still queued was not migrated: give it back. */
+       if (!list_empty(&pagelist))
+               putback_lru_pages(&pagelist);
+       return err;
+}
+
  /*
- * For now migrate_pages simply swaps out the pages from nodes that are in
- * the source set but not in the target set. In the future, we would
- * want a function that moves pages between the two nodesets in such
- * a way as to preserve the physical layout as much as possible.
+ * Move pages between the two nodesets so as to preserve the physical
+ * layout as much as possible.
   *
   * Returns the number of page that could not be moved.
   */
@@ -652,22 +621,164 @@ int do_migrate_pages(struct mm_struct *mm,
         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
  {
         LIST_HEAD(pagelist);
-       int count = 0;
-       nodemask_t nodes;
+       int busy = 0;
+       int err = 0;
+       nodemask_t tmp;
  
-       nodes_andnot(nodes, *from_nodes, *to_nodes);
+       down_read(&mm->mmap_sem);
  
-       down_read(&mm->mmap_sem);
-       check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
-                       flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+/*
+ * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+ * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
+ * bit in 'tmp', and return that <source, dest> pair for migration.
+ * The pair of nodemasks 'to' and 'from' define the map.
+ *
+ * If no pair of bits is found that way, fallback to picking some
+ * pair of 'source' and 'dest' bits that are not the same.  If the
+ * 'source' and 'dest' bits are the same, this represents a node
+ * that will be migrating to itself, so no pages need move.
+ *
+ * If no bits are left in 'tmp', or if all remaining bits left
+ * in 'tmp' correspond to the same bit in 'to', return false
+ * (nothing left to migrate).
+ *
+ * This lets us pick a pair of nodes to migrate between, such that
+ * if possible the dest node is not already occupied by some other
+ * source node, minimizing the risk of overloading the memory on a
+ * node that would happen if we migrated incoming memory to a node
+ * before migrating outgoing memory from that same node.
+ *
+ * A single scan of tmp is sufficient.  As we go, we remember the
+ * most recent <s, d> pair that moved (s != d).  If we find a pair
+ * that not only moved, but what's better, moved to an empty slot
+ * (d is not set in tmp), then we break out then, with that pair.
+ * Otherwise when we finish scanning tmp, we at least have the
+ * most recent <s, d> pair that moved.  If we get all the way through
+ * the scan of tmp without finding any node that moved, much less
+ * moved to an empty node, then there is nothing left worth migrating.
+ */
  
-       if (!list_empty(&pagelist)) {
-               count = swap_pages(&pagelist);
-               putback_lru_pages(&pagelist);
+       tmp = *from_nodes;
+       while (!nodes_empty(tmp)) {
+               int s,d;
+               int source = -1;
+               int dest = 0;
+
+               for_each_node_mask(s, tmp) {
+                       d = node_remap(s, *from_nodes, *to_nodes);
+                       if (s == d)
+                               continue;
+
+                       source = s;     /* Node moved. Memorize */
+                       dest = d;
+
+                       /* dest not in remaining from nodes? */
+                       if (!node_isset(dest, tmp))
+                               break;
+               }
+               if (source == -1)
+                       break;
+
+               node_clear(source, tmp);
+               err = migrate_to_node(mm, source, dest, flags);
+               if (err > 0)
+                       busy += err;
+               if (err < 0)
+                       break;
         }
  
         up_read(&mm->mmap_sem);
-       return count;
+       if (err < 0)
+               return err;
+       return busy;
+
+}
+
+#else
+
+/*
+ * Variant built when CONFIG_MIGRATION is not set: the page is never
+ * queued, so the call has no effect.
+ */
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+                               unsigned long flags)
+{
+}
+
+/*
+ * Variant built when CONFIG_MIGRATION is not set: no page is moved and
+ * the request always fails with -ENOSYS.
+ */
+int do_migrate_pages(struct mm_struct *mm,
+       const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+       return -ENOSYS;
+}
+#endif
+
+/*
+ * Apply memory policy @mode with node set @nmask to the address range
+ * [@start, @start + @len); @len is rounded up to a multiple of PAGE_SIZE.
+ *
+ * @flags may carry MPOL_MF_STRICT, MPOL_MF_MOVE and MPOL_MF_MOVE_ALL.
+ * Any other bit, or a @mode above MPOL_MAX, is rejected with -EINVAL.
+ * MPOL_MF_MOVE_ALL additionally requires CAP_SYS_NICE (-EPERM otherwise).
+ * MPOL_MF_STRICT is ignored for MPOL_DEFAULT.
+ *
+ * Returns 0 on success (including an empty range) or a negative errno:
+ * -EINVAL for an unaligned @start, a range that wraps, or a policy that
+ * mpol_check_policy() rejects; the error reported by mpol_new(),
+ * check_range() or mbind_range(); and -EIO when MPOL_MF_STRICT is set
+ * and migrate_pages_to() returns non-zero.
+ *
+ * mm->mmap_sem is held for writing around the range check, the policy
+ * update and the migration attempt.
+ */
+long do_mbind(unsigned long start, unsigned long len,
+               unsigned long mode, nodemask_t *nmask, unsigned long flags)
+{
+       struct vm_area_struct *vma;
+       struct mm_struct *mm = current->mm;
+       struct mempolicy *new;
+       unsigned long end;
+       int err;
+       LIST_HEAD(pagelist);
+
+       if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
+                                     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+           || mode > MPOL_MAX)
+               return -EINVAL;
+       if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+               return -EPERM;
+
+       if (start & ~PAGE_MASK)
+               return -EINVAL;
+
+       if (mode == MPOL_DEFAULT)
+               flags &= ~MPOL_MF_STRICT;
+
+       len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+       end = start + len;
+
+       /* end < start means start + len wrapped around. */
+       if (end < start)
+               return -EINVAL;
+       if (end == start)
+               return 0;
+
+       if (mpol_check_policy(mode, nmask))
+               return -EINVAL;
+
+       new = mpol_new(mode, nmask);
+       if (IS_ERR(new))
+               return PTR_ERR(new);
+
+       /*
+        * If we are using the default policy then operation
+        * on discontinuous address spaces is okay after all
+        */
+       if (!new)
+               flags |= MPOL_MF_DISCONTIG_OK;
+
+       /* The node mask parameter is 'nmask'; there is no 'nodes' here. */
+       PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+                       mode,nodes_addr(*nmask)[0]);
+
+       down_write(&mm->mmap_sem);
+       /*
+        * MPOL_MF_INVERT: act on the pages that are NOT on one of the
+        * nodes in nmask, i.e. the ones violating the new policy.
+        */
+       vma = check_range(mm, start, end, nmask,
+                         flags | MPOL_MF_INVERT, &pagelist);
+
+       err = PTR_ERR(vma);
+       if (!IS_ERR(vma)) {
+               int nr_failed = 0;
+
+               err = mbind_range(vma, start, end, new);
+
+               if (!list_empty(&pagelist))
+                       nr_failed = migrate_pages_to(&pagelist, vma, -1);
+
+               if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+                       err = -EIO;
+       }
+
+       /* Anything still queued was not migrated: return it to the LRU. */
+       if (!list_empty(&pagelist))
+               putback_lru_pages(&pagelist);
+
+       up_write(&mm->mmap_sem);
+       mpol_free(new);
+       return err;
  }
  
  /*
@@ -686,6 +797,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
         nodes_clear(*nodes);
         if (maxnode == 0 || !nmask)
                 return 0;
+       if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+               return -EINVAL;
  
         nlongs = BITS_TO_LONGS(maxnode);
         if ((maxnode % BITS_PER_LONG) == 0)
@@ -764,9 +877,6 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
         return do_set_mempolicy(mode, &nodes);
  }
  
-/* Macro needed until Paul implements this function in kernel/cpusets.c */
-#define cpuset_mems_allowed(task) node_online_map
-
  asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
                 const unsigned long __user *old_nodes,
                 const unsigned long __user *new_nodes)
@@ -802,24 +912,25 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
         /*
          * Check if this process has the right to modify the specified
          * process. The right exists if the process has administrative
-        * capabilities, superuser priviledges or the same
+        * capabilities, superuser privileges or the same
          * userid as the target process.
          */
         if ((current->euid != task->suid) && (current->euid != task->uid) &&
             (current->uid != task->suid) && (current->uid != task->uid) &&
-           !capable(CAP_SYS_ADMIN)) {
+           !capable(CAP_SYS_NICE)) {
                 err = -EPERM;
                 goto out;
         }
  
         task_nodes = cpuset_mems_allowed(task);
         /* Is the user allowed to access the target nodes? */
-       if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+       if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
                 err = -EPERM;
                 goto out;
         }
  
-       err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+       err = do_migrate_pages(mm, &old, &new,
+               capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
  out:
         mmput(mm);
         return err;
@@ -932,8 +1043,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
  #endif
  
  /* Return effective policy for a VMA */
-struct mempolicy *
-get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
+static struct mempolicy * get_vma_policy(struct task_struct *task,
+               struct vm_area_struct *vma, unsigned long addr)
  {
         struct mempolicy *pol = task->mempolicy;
  
@@ -992,6 +1103,33 @@ static unsigned interleave_nodes(struct mempolicy *policy)
         return nid;
  }
  
+/*
+ * Pick the node the next slab entry should be allocated from, as
+ * dictated by @policy:
+ *  - MPOL_INTERLEAVE: whatever interleave_nodes() hands out next;
+ *  - MPOL_BIND: the node owning the first zone of the policy zonelist,
+ *    mirroring where a bind allocation would start;
+ *  - MPOL_PREFERRED: the preferred node when one is set (>= 0);
+ *  - anything else, or no preferred node: the local node.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+       const int mode = policy->policy;
+
+       if (mode == MPOL_INTERLEAVE)
+               return interleave_nodes(policy);
+
+       if (mode == MPOL_BIND)
+               return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+       if (mode == MPOL_PREFERRED && policy->v.preferred_node >= 0)
+               return policy->v.preferred_node;
+
+       return numa_node_id();
+}
+
  /* Do static interleaving for a VMA with known offset. */
  static unsigned offset_il_node(struct mempolicy *pol,
                 struct vm_area_struct *vma, unsigned long off)
@@ -1023,6 +1161,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
                 return interleave_nodes(pol);
  }
  
+#ifdef CONFIG_HUGETLBFS
  /* Return a zonelist suitable for a huge page allocation. */
  struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
  {
@@ -1036,6 +1175,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
         }
         return zonelist_policy(GFP_HIGHUSER, pol);
  }
+#endif
  
  /* Allocate a page in interleaved policy.
     Own path because it needs to do special accounting. */
@@ -1081,7 +1221,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
  {
         struct mempolicy *pol = get_vma_policy(current, vma, addr);
  
-       cpuset_update_current_mems_allowed();
+       cpuset_update_task_memory_state();
  
         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
                 unsigned nid;
@@ -1107,7 +1247,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
   *     interrupt context and apply the current process NUMA policy.
   *     Returns NULL when no page can be allocated.
   *
- *     Don't call cpuset_update_current_mems_allowed() unless
+ *     Don't call cpuset_update_task_memory_state() unless
   *     1) it's ok to take cpuset_sem (can WAIT), and
   *     2) allocating for current task (not interrupt).
   */
@@ -1116,7 +1256,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
         struct mempolicy *pol = current->mempolicy;
  
         if ((gfp & __GFP_WAIT) && !in_interrupt())
-               cpuset_update_current_mems_allowed();
+               cpuset_update_task_memory_state();
         if (!pol || in_interrupt())
                 pol = &default_policy;
         if (pol->policy == MPOL_INTERLEAVE)
@@ -1125,6 +1265,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
  }
  EXPORT_SYMBOL(alloc_pages_current);
  
+/*
+ * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
+ * with the mems_allowed returned by cpuset_mems_allowed().  This
+ * keeps mempolicies cpuset relative after its cpuset moves.  See
+ * further kernel/cpuset.c update_nodemask().
+ */
+void *cpuset_being_rebound;
+
  /* Slow path of a mempolicy copy */
  struct mempolicy *__mpol_copy(struct mempolicy *old)
  {
@@ -1132,6 +1281,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
  
         if (!new)
                 return ERR_PTR(-ENOMEM);
+       if (current_cpuset_is_being_rebound()) {
+               nodemask_t mems = cpuset_mems_allowed(current);
+               mpol_rebind_policy(old, &mems);
+       }
         *new = *old;
         atomic_set(&new->refcnt, 1);
         if (new->policy == MPOL_BIND) {
@@ -1338,6 +1491,30 @@ restart:
         return 0;
  }
  
+/*
+ * Set up the shared policy @info: an empty rb-tree root and its lock.
+ * For a @policy other than MPOL_DEFAULT, a mempolicy is built with
+ * mpol_new(@policy, @policy_nodes) and installed over [0, TASK_SIZE)
+ * via mpol_set_shared_policy().  If mpol_new() fails nothing is
+ * installed, i.e. the shared policy stays at the default.
+ */
+void mpol_shared_policy_init(struct shared_policy *info, int policy,
+                               nodemask_t *policy_nodes)
+{
+       struct vm_area_struct pvma;
+       struct mempolicy *newpol;
+
+       info->root = RB_ROOT;
+       spin_lock_init(&info->lock);
+
+       if (policy == MPOL_DEFAULT)
+               return;
+
+       newpol = mpol_new(policy, policy_nodes);
+       if (IS_ERR(newpol))
+               return;
+
+       /* Pseudo-vma that only describes the range the policy covers. */
+       memset(&pvma, 0, sizeof(struct vm_area_struct));
+       pvma.vm_end = TASK_SIZE;
+       mpol_set_shared_policy(info, &pvma, newpol);
+
+       /* Drop the reference obtained from mpol_new(). */
+       mpol_free(newpol);
+}
+
  int mpol_set_shared_policy(struct shared_policy *info,
                         struct vm_area_struct *vma, struct mempolicy *npol)
  {
@@ -1406,25 +1583,31 @@ void numa_default_policy(void)
  }
  
  /* Migrate a policy to a different set of nodes */
-static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
-                                                       const nodemask_t *new)
+void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
  {
+       nodemask_t *mpolmask;
         nodemask_t tmp;
  
         if (!pol)
                 return;
+       mpolmask = &pol->cpuset_mems_allowed;
+       if (nodes_equal(*mpolmask, *newmask))
+               return;
  
         switch (pol->policy) {
         case MPOL_DEFAULT:
                 break;
         case MPOL_INTERLEAVE:
-               nodes_remap(tmp, pol->v.nodes, *old, *new);
+               nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
                 pol->v.nodes = tmp;
-               current->il_next = node_remap(current->il_next, *old, *new);
+               /*
+                * Remap il_next against the old mask before *mpolmask is
+                * overwritten: once both masks are equal node_remap()
+                * returns il_next unchanged and the remap is lost.
+                */
+               current->il_next = node_remap(current->il_next,
+                                               *mpolmask, *newmask);
+               *mpolmask = *newmask;
                 break;
         case MPOL_PREFERRED:
                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
-                                                               *old, *new);
+                                               *mpolmask, *newmask);
+               *mpolmask = *newmask;
                 break;
         case MPOL_BIND: {
                 nodemask_t nodes;
@@ -1434,7 +1617,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
                 nodes_clear(nodes);
                 for (z = pol->v.zonelist->zones; *z; z++)
                         node_set((*z)->zone_pgdat->node_id, nodes);
-               nodes_remap(tmp, nodes, *old, *new);
+               nodes_remap(tmp, nodes, *mpolmask, *newmask);
                 nodes = tmp;
  
                 zonelist = bind_zonelist(&nodes);
@@ -1449,6 +1632,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
                         kfree(pol->v.zonelist);
                         pol->v.zonelist = zonelist;
                 }
+               *mpolmask = *newmask;
                 break;
         }
         default:
@@ -1458,14 +1642,29 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
  }
  
  /*
- * Someone moved this task to different nodes.  Fixup mempolicies.
+ * Wrapper for mpol_rebind_policy() that just requires task
+ * pointer, and updates task mempolicy.
+ */
+
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+{
+       /*
+        * A task without a mempolicy needs no special casing here:
+        * mpol_rebind_policy() returns early for a NULL policy.
+        */
+       mpol_rebind_policy(tsk->mempolicy, new);
+}
+
+/*
+ * Rebind each vma in mm to new nodemask.
   *
- * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
- * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
+ * Call holding a reference to mm.  Takes mm->mmap_sem during call.
   */
-void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
  {
-       rebind_policy(current->mempolicy, old, new);
+       struct vm_area_struct *vma;
+
+       down_write(&mm->mmap_sem);
+       for (vma = mm->mmap; vma; vma = vma->vm_next)
+               mpol_rebind_policy(vma->vm_policy, new);
+       up_write(&mm->mmap_sem);
  }
  
  /*
@@ -1529,66 +1728,145 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
  struct numa_maps {
         unsigned long pages;
         unsigned long anon;
-       unsigned long mapped;
+       unsigned long active;
+       unsigned long writeback;
         unsigned long mapcount_max;
+       unsigned long dirty;
+       unsigned long swapcache;
         unsigned long node[MAX_NUMNODES];
  };
  
-static void gather_stats(struct page *page, void *private)
+static void gather_stats(struct page *page, void *private, int pte_dirty)
  {
         struct numa_maps *md = private;
         int count = page_mapcount(page);
  
-       if (count)
-               md->mapped++;
+       md->pages++;
+       if (pte_dirty || PageDirty(page))
+               md->dirty++;
  
-       if (count > md->mapcount_max)
-               md->mapcount_max = count;
+       if (PageSwapCache(page))
+               md->swapcache++;
  
-       md->pages++;
+       if (PageActive(page))
+               md->active++;
+
+       if (PageWriteback(page))
+               md->writeback++;
  
         if (PageAnon(page))
                 md->anon++;
  
+       if (count > md->mapcount_max)
+               md->mapcount_max = count;
+
         md->node[page_to_nid(page)]++;
         cond_resched();
  }
  
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Walk [@start, @end) of @vma in HPAGE_SIZE steps and pass every huge
+ * page that is actually mapped to gather_stats(), accumulating into @md.
+ */
+static void check_huge_range(struct vm_area_struct *vma,
+               unsigned long start, unsigned long end,
+               struct numa_maps *md)
+{
+       unsigned long hva;
+
+       for (hva = start; hva < end; hva += HPAGE_SIZE) {
+               pte_t *hptep = huge_pte_offset(vma->vm_mm, hva & HPAGE_MASK);
+               struct page *hpage;
+               pte_t entry;
+
+               /* No page table entry for this address. */
+               if (!hptep)
+                       continue;
+
+               /* Entry exists but nothing is mapped. */
+               entry = *hptep;
+               if (pte_none(entry))
+                       continue;
+
+               hpage = pte_page(entry);
+               if (hpage)
+                       gather_stats(hpage, md, pte_dirty(*hptep));
+       }
+}
+#else
+/*
+ * Variant built when CONFIG_HUGETLB_PAGE is not set: there is nothing
+ * to account, so @md is left untouched.
+ */
+static inline void check_huge_range(struct vm_area_struct *vma,
+               unsigned long start, unsigned long end,
+               struct numa_maps *md)
+{
+}
+#endif
+
  int show_numa_map(struct seq_file *m, void *v)
  {
         struct task_struct *task = m->private;
         struct vm_area_struct *vma = v;
         struct numa_maps *md;
+       struct file *file = vma->vm_file;
+       struct mm_struct *mm = vma->vm_mm;
         int n;
         char buffer[50];
  
-       if (!vma->vm_mm)
+       if (!mm)
                 return 0;
  
         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
         if (!md)
                 return 0;
  
-       check_pgd_range(vma, vma->vm_start, vma->vm_end,
-                   &node_online_map, MPOL_MF_STATS, md);
+       mpol_to_str(buffer, sizeof(buffer),
+                       get_vma_policy(task, vma, vma->vm_start));
  
-       if (md->pages) {
-               mpol_to_str(buffer, sizeof(buffer),
-                           get_vma_policy(task, vma, vma->vm_start));
+       seq_printf(m, "%08lx %s", vma->vm_start, buffer);
  
-               seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
-                          vma->vm_start, buffer, md->pages,
-                          md->mapped, md->mapcount_max);
+       if (file) {
+               seq_printf(m, " file=");
+               seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+       } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+               seq_printf(m, " heap");
+       } else if (vma->vm_start <= mm->start_stack &&
+                       vma->vm_end >= mm->start_stack) {
+               seq_printf(m, " stack");
+       }
  
-               if (md->anon)
-                       seq_printf(m," anon=%lu",md->anon);
+       if (is_vm_hugetlb_page(vma)) {
+               check_huge_range(vma, vma->vm_start, vma->vm_end, md);
+               seq_printf(m, " huge");
+       } else {
+               check_pgd_range(vma, vma->vm_start, vma->vm_end,
+                               &node_online_map, MPOL_MF_STATS, md);
+       }
  
-               for_each_online_node(n)
-                       if (md->node[n])
-                               seq_printf(m, " N%d=%lu", n, md->node[n]);
+       if (!md->pages)
+               goto out;
  
-               seq_putc(m, '\n');
-       }
+       if (md->anon)
+               seq_printf(m," anon=%lu",md->anon);
+
+       if (md->dirty)
+               seq_printf(m," dirty=%lu",md->dirty);
+
+       if (md->pages != md->anon && md->pages != md->dirty)
+               seq_printf(m, " mapped=%lu", md->pages);
+
+       if (md->mapcount_max > 1)
+               seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+       if (md->swapcache)
+               seq_printf(m," swapcache=%lu", md->swapcache);
+
+       if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+               seq_printf(m," active=%lu", md->active);
+
+       if (md->writeback)
+               seq_printf(m," writeback=%lu", md->writeback);
+
+       for_each_online_node(n)
+               if (md->node[n])
+                       seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+       seq_putc(m, '\n');
         kfree(md);
  
         if (m->count < m->size)