V4L/DVB (4859): Fix initializations on some video_ioctl2 handlers

[safe/jmp/linux-2.6] / kernel / cpuset.c
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index 44d13c2..2c3b443 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,22 +4,20 @@
   *  Processor and Memory placement constraints for sets of tasks.
   *
   *  Copyright (C) 2003 BULL SA.
- *  Copyright (C) 2004 Silicon Graphics, Inc.
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
   *
   *  Portions derived from Patrick Mochel's sysfs code.
   *  sysfs is Copyright (c) 2001-3 Patrick Mochel
- *  Portions Copyright (c) 2004 Silicon Graphics, Inc.
   *
- *  2003-10-10 Written by Simon Derr <simon.derr@bull.net>
+ *  2003-10-10 Written by Simon Derr.
   *  2003-10-22 Updates by Stephen Hemminger.
- *  2004 May-July Rework by Paul Jackson <pj@sgi.com>
+ *  2004 May-July Rework by Paul Jackson.
   *
   *  This file is subject to the terms and conditions of the GNU General Public
   *  License.  See the file COPYING in the main directory of the Linux
   *  distribution for more details.
   */
  
-#include <linux/config.h>
  #include <linux/cpu.h>
  #include <linux/cpumask.h>
  #include <linux/cpuset.h>
@@ -42,6 +40,7 @@
  #include <linux/rcupdate.h>
  #include <linux/sched.h>
  #include <linux/seq_file.h>
+#include <linux/security.h>
  #include <linux/slab.h>
  #include <linux/smp_lock.h>
  #include <linux/spinlock.h>
@@ -108,7 +107,9 @@ typedef enum {
         CS_MEM_EXCLUSIVE,
         CS_MEMORY_MIGRATE,
         CS_REMOVED,
-       CS_NOTIFY_ON_RELEASE
+       CS_NOTIFY_ON_RELEASE,
+       CS_SPREAD_PAGE,
+       CS_SPREAD_SLAB,
  } cpuset_flagbits_t;
  
  /* convenient tests for these bits */
@@ -137,8 +138,18 @@ static inline int is_memory_migrate(const struct cpuset *cs)
         return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
  }
  
+static inline int is_spread_page(const struct cpuset *cs)
+{
+       return test_bit(CS_SPREAD_PAGE, &cs->flags);
+}
+
+static inline int is_spread_slab(const struct cpuset *cs)
+{
+       return test_bit(CS_SPREAD_SLAB, &cs->flags);
+}
+
  /*
- * Increment this atomic integer everytime any cpuset changes its
+ * Increment this integer every time any cpuset changes its
   * mems_allowed value.  Users of cpusets can track this generation
   * number, and avoid having to lock and reload mems_allowed unless
   * the cpuset they're using changes generation.
@@ -152,8 +163,11 @@ static inline int is_memory_migrate(const struct cpuset *cs)
   * on every visit to __alloc_pages(), to efficiently check whether
   * its current->cpuset->mems_allowed has changed, requiring an update
   * of its current->mems_allowed.
+ *
+ * Since cpuset_mems_generation is guarded by manage_mutex,
+ * there is no need to mark it atomic.
   */
-static atomic_t cpuset_mems_generation = ATOMIC_INIT(1);
+static int cpuset_mems_generation;
  
  static struct cpuset top_cpuset = {
         .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
@@ -226,7 +240,7 @@ static struct super_block *cpuset_sb;
   * A cpuset can only be deleted if both its 'count' of using tasks
   * is zero, and its list of 'children' cpusets is empty.  Since all
   * tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * least one task in the system (init), therefore, top_cpuset
   * always has either children cpusets and/or using tasks.  So we don't
   * need a special hack to ensure that top_cpuset cannot be deleted.
   *
@@ -275,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode)
                 inode->i_mode = mode;
                 inode->i_uid = current->fsuid;
                 inode->i_gid = current->fsgid;
-               inode->i_blksize = PAGE_CACHE_SIZE;
                 inode->i_blocks = 0;
                 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                 inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
@@ -364,7 +377,7 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
                 inode->i_op = &simple_dir_inode_operations;
                 inode->i_fop = &simple_dir_operations;
                 /* directories start off with i_nlink == 2 (for "." entry) */
-               inode->i_nlink++;
+               inc_nlink(inode);
         } else {
                 return -ENOMEM;
         }
@@ -378,11 +391,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
         return 0;
  }
  
-static struct super_block *cpuset_get_sb(struct file_system_type *fs_type,
-                                       int flags, const char *unused_dev_name,
-                                       void *data)
+static int cpuset_get_sb(struct file_system_type *fs_type,
+                        int flags, const char *unused_dev_name,
+                        void *data, struct vfsmount *mnt)
  {
-       return get_sb_single(fs_type, flags, data, cpuset_fill_super);
+       return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt);
  }
  
  static struct file_system_type cpuset_fs_type = {
@@ -400,8 +413,8 @@ static struct file_system_type cpuset_fs_type = {
   *
   *
   * When reading/writing to a file:
- *     - the cpuset to use in file->f_dentry->d_parent->d_fsdata
- *     - the 'cftype' of the file is file->f_dentry->d_fsdata
+ *     - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
+ *     - the 'cftype' of the file is file->f_path.dentry->d_fsdata
   */
  
  struct cftype {
@@ -602,12 +615,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
   * current->cpuset if a task has its memory placement changed.
   * Do not call this routine if in_interrupt().
   *
- * Call without callback_mutex or task_lock() held.  May be called
- * with or without manage_mutex held.  Doesn't need task_lock to guard
- * against another task changing a non-NULL cpuset pointer to NULL,
- * as that is only done by a task on itself, and if the current task
- * is here, it is not simultaneously in the exit code NULL'ing its
- * cpuset pointer.  This routine also might acquire callback_mutex and
+ * Call without callback_mutex or task_lock() held.  May be
+ * called with or without manage_mutex held.  Thanks in part to
+ * 'the_top_cpuset_hack', the tasks cpuset pointer will never
+ * be NULL.  This routine also might acquire callback_mutex and
   * current->mm->mmap_sem during call.
   *
   * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -657,6 +668,14 @@ void cpuset_update_task_memory_state(void)
                 cs = tsk->cpuset;       /* Maybe changed when task not locked */
                 guarantee_online_mems(cs, &tsk->mems_allowed);
                 tsk->cpuset_mems_generation = cs->mems_generation;
+               if (is_spread_page(cs))
+                       tsk->flags |= PF_SPREAD_PAGE;
+               else
+                       tsk->flags &= ~PF_SPREAD_PAGE;
+               if (is_spread_slab(cs))
+                       tsk->flags |= PF_SPREAD_SLAB;
+               else
+                       tsk->flags &= ~PF_SPREAD_SLAB;
                 task_unlock(tsk);
                 mutex_unlock(&callback_mutex);
                 mpol_rebind_task(tsk, &tsk->mems_allowed);
@@ -710,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
         }
  
         /* Remaining checks don't apply to root cpuset */
-       if ((par = cur->parent) == NULL)
+       if (cur == &top_cpuset)
                 return 0;
  
+       par = cur->parent;
+
         /* We must be a subset of our parent cpuset */
         if (!is_cpuset_subset(trial, par))
                 return -EACCES;
@@ -742,6 +763,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
   *
   * Call with manage_mutex held.  May nest a call to the
   * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * Must not be called holding callback_mutex, because we must
+ * not call lock_cpu_hotplug() while holding callback_mutex.
   */
  
  static void update_cpu_domains(struct cpuset *cur)
@@ -761,7 +784,7 @@ static void update_cpu_domains(struct cpuset *cur)
                 if (is_cpu_exclusive(c))
                         cpus_andnot(pspan, pspan, c->cpus_allowed);
         }
-       if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+       if (!is_cpu_exclusive(cur)) {
                 cpus_or(pspan, pspan, cur->cpus_allowed);
                 if (cpus_equal(pspan, cur->cpus_allowed))
                         return;
@@ -794,6 +817,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
         struct cpuset trialcs;
         int retval, cpus_unchanged;
  
+       /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+       if (cs == &top_cpuset)
+               return -EACCES;
+
         trialcs = *cs;
         retval = cpulist_parse(buf, trialcs.cpus_allowed);
         if (retval < 0)
@@ -814,6 +841,55 @@ static int update_cpumask(struct cpuset *cs, char *buf)
  }
  
  /*
+ * cpuset_migrate_mm
+ *
+ *    Migrate memory region from one set of nodes to another.
+ *
+ *    Temporarily set tasks mems_allowed to target nodes of migration,
+ *    so that the migration code can allocate pages on these nodes.
+ *
+ *    Call holding manage_mutex, so our current->cpuset won't change
+ *    during this call, as manage_mutex holds off any attach_task()
+ *    calls.  Therefore we don't need to take task_lock around the
+ *    call to guarantee_online_mems(), as we know no one is changing
+ *    our tasks cpuset.
+ *
+ *    Hold callback_mutex around the two modifications of our tasks
+ *    mems_allowed to synchronize with cpuset_mems_allowed().
+ *
+ *    While the mm_struct we are migrating is typically from some
+ *    other task, the task_struct mems_allowed that we are hacking
+ *    is for our current task, which must allocate new pages for that
+ *    migrating memory region.
+ *
+ *    We call cpuset_update_task_memory_state() before hacking
+ *    our tasks mems_allowed, so that we are assured of being in
+ *    sync with our tasks cpuset, and in particular, callbacks to
+ *    cpuset_update_task_memory_state() from nested page allocations
+ *    won't see any mismatch of our cpuset and task mems_generation
+ *    values, so won't overwrite our hacked tasks mems_allowed
+ *    nodemask.
+ */
+
+static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
+                                                       const nodemask_t *to)
+{
+       struct task_struct *tsk = current;
+
+       cpuset_update_task_memory_state();
+
+       mutex_lock(&callback_mutex);
+       tsk->mems_allowed = *to;
+       mutex_unlock(&callback_mutex);
+
+       do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+
+       mutex_lock(&callback_mutex);
+       guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
+       mutex_unlock(&callback_mutex);
+}
+
+/*
   * Handle user request to change the 'mems' memory placement
   * of a cpuset.  Needs to validate the request, update the
   * cpusets mems_allowed and mems_generation, and for each
@@ -838,6 +914,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
         int fudge;
         int retval;
  
+       /* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
+       if (cs == &top_cpuset)
+               return -EACCES;
+
         trialcs = *cs;
         retval = nodelist_parse(buf, trialcs.mems_allowed);
         if (retval < 0)
@@ -858,7 +938,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
  
         mutex_lock(&callback_mutex);
         cs->mems_allowed = trialcs.mems_allowed;
-       cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+       cs->mems_generation = cpuset_mems_generation++;
         mutex_unlock(&callback_mutex);
  
         set_cpuset_being_rebound(cs);           /* causes mpol_copy() rebind */
@@ -925,10 +1005,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
                 struct mm_struct *mm = mmarray[i];
  
                 mpol_rebind_mm(mm, &cs->mems_allowed);
-               if (migrate) {
-                       do_migrate_pages(mm, &oldmem, &cs->mems_allowed,
-                                                       MPOL_MF_MOVE_ALL);
-               }
+               if (migrate)
+                       cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
                 mmput(mm);
         }
  
@@ -956,7 +1034,8 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
  /*
   * update_flag - read a 0 or a 1 in a file and update associated flag
   * bit:        the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *                             CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
+ *                             CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
+ *                             CS_SPREAD_PAGE, CS_SPREAD_SLAB)
   * cs: the cpuset to update
   * buf:        the buffer where we read the 0 or 1
   *
@@ -983,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
         cpu_exclusive_changed =
                 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
         mutex_lock(&callback_mutex);
-       if (turning_on)
-               set_bit(bit, &cs->flags);
-       else
-               clear_bit(bit, &cs->flags);
+       cs->flags = trialcs.flags;
         mutex_unlock(&callback_mutex);
  
         if (cpu_exclusive_changed)
@@ -995,7 +1071,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
  }
  
  /*
- * Frequency meter - How fast is some event occuring?
+ * Frequency meter - How fast is some event occurring?
   *
   * These routines manage a digitally filtered, constant time based,
   * event frequency meter.  There are four routines:
@@ -1109,6 +1185,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
         cpumask_t cpus;
         nodemask_t from, to;
         struct mm_struct *mm;
+       int retval;
  
         if (sscanf(pidbuf, "%d", &pid) != 1)
                 return -EIO;
@@ -1137,11 +1214,22 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
                 get_task_struct(tsk);
         }
  
+       retval = security_task_setscheduler(tsk, 0, NULL);
+       if (retval) {
+               put_task_struct(tsk);
+               return retval;
+       }
+
         mutex_lock(&callback_mutex);
  
         task_lock(tsk);
         oldcs = tsk->cpuset;
-       if (!oldcs) {
+       /*
+        * After getting 'oldcs' cpuset ptr, be sure still not exiting.
+        * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
+        * then fail this attach_task(), to avoid breaking top_cpuset.count.
+        */
+       if (tsk->flags & PF_EXITING) {
                 task_unlock(tsk);
                 mutex_unlock(&callback_mutex);
                 put_task_struct(tsk);
@@ -1162,11 +1250,11 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
         mm = get_task_mm(tsk);
         if (mm) {
                 mpol_rebind_mm(mm, &to);
+               if (is_memory_migrate(cs))
+                       cpuset_migrate_mm(mm, &from, &to);
                 mmput(mm);
         }
  
-       if (is_memory_migrate(cs))
-               do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
         put_task_struct(tsk);
         synchronize_rcu();
         if (atomic_dec_and_test(&oldcs->count))
@@ -1187,21 +1275,24 @@ typedef enum {
         FILE_NOTIFY_ON_RELEASE,
         FILE_MEMORY_PRESSURE_ENABLED,
         FILE_MEMORY_PRESSURE,
+       FILE_SPREAD_PAGE,
+       FILE_SPREAD_SLAB,
         FILE_TASKLIST,
  } cpuset_filetype_t;
  
-static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf,
+static ssize_t cpuset_common_file_write(struct file *file,
+                                       const char __user *userbuf,
                                         size_t nbytes, loff_t *unused_ppos)
  {
-       struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
-       struct cftype *cft = __d_cft(file->f_dentry);
+       struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
+       struct cftype *cft = __d_cft(file->f_path.dentry);
         cpuset_filetype_t type = cft->private;
         char *buffer;
         char *pathbuf = NULL;
         int retval = 0;
  
         /* Crude upper limit on largest legitimate cpulist user might write. */
-       if (nbytes > 100 + 6 * NR_CPUS)
+       if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
                 return -E2BIG;
  
         /* +1 for nul-terminator */
@@ -1246,6 +1337,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
         case FILE_MEMORY_PRESSURE:
                 retval = -EACCES;
                 break;
+       case FILE_SPREAD_PAGE:
+               retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
+               cs->mems_generation = cpuset_mems_generation++;
+               break;
+       case FILE_SPREAD_SLAB:
+               retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
+               cs->mems_generation = cpuset_mems_generation++;
+               break;
         case FILE_TASKLIST:
                 retval = attach_task(cs, buffer, &pathbuf);
                 break;
@@ -1268,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
                                                 size_t nbytes, loff_t *ppos)
  {
         ssize_t retval = 0;
-       struct cftype *cft = __d_cft(file->f_dentry);
+       struct cftype *cft = __d_cft(file->f_path.dentry);
         if (!cft)
                 return -ENODEV;
  
@@ -1318,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
  static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
                                 size_t nbytes, loff_t *ppos)
  {
-       struct cftype *cft = __d_cft(file->f_dentry);
-       struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+       struct cftype *cft = __d_cft(file->f_path.dentry);
+       struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
         cpuset_filetype_t type = cft->private;
         char *page;
         ssize_t retval = 0;
@@ -1355,6 +1454,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
         case FILE_MEMORY_PRESSURE:
                 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
                 break;
+       case FILE_SPREAD_PAGE:
+               *s++ = is_spread_page(cs) ? '1' : '0';
+               break;
+       case FILE_SPREAD_SLAB:
+               *s++ = is_spread_slab(cs) ? '1' : '0';
+               break;
         default:
                 retval = -EINVAL;
                 goto out;
@@ -1371,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt
                                                                 loff_t *ppos)
  {
         ssize_t retval = 0;
-       struct cftype *cft = __d_cft(file->f_dentry);
+       struct cftype *cft = __d_cft(file->f_path.dentry);
         if (!cft)
                 return -ENODEV;
  
@@ -1393,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
         if (err)
                 return err;
  
-       cft = __d_cft(file->f_dentry);
+       cft = __d_cft(file->f_path.dentry);
         if (!cft)
                 return -ENODEV;
         if (cft->open)
@@ -1406,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
  
  static int cpuset_file_release(struct inode *inode, struct file *file)
  {
-       struct cftype *cft = __d_cft(file->f_dentry);
+       struct cftype *cft = __d_cft(file->f_path.dentry);
         if (cft->release)
                 return cft->release(inode, file);
         return 0;
@@ -1427,7 +1532,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
         return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
  }
  
-static struct file_operations cpuset_file_operations = {
+static const struct file_operations cpuset_file_operations = {
         .read = cpuset_file_read,
         .write = cpuset_file_write,
         .llseek = generic_file_llseek,
@@ -1460,7 +1565,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode)
                 inode->i_fop = &simple_dir_operations;
  
                 /* start off with i_nlink == 2 (for "." entry) */
-               inode->i_nlink++;
+               inc_nlink(inode);
         } else if (S_ISREG(mode)) {
                 inode->i_size = 0;
                 inode->i_fop = &cpuset_file_operations;
@@ -1493,7 +1598,7 @@ static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
         error = cpuset_create_file(dentry, S_IFDIR | mode);
         if (!error) {
                 dentry->d_fsdata = cs;
-               parent->d_inode->i_nlink++;
+               inc_nlink(parent->d_inode);
                 cs->dentry = dentry;
         }
         dput(dentry);
@@ -1595,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
   */
  static int cpuset_tasks_open(struct inode *unused, struct file *file)
  {
-       struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+       struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
         struct ctr_struct *ctr;
         pid_t *pidarray;
         int npids;
@@ -1718,6 +1823,16 @@ static struct cftype cft_memory_pressure = {
         .private = FILE_MEMORY_PRESSURE,
  };
  
+static struct cftype cft_spread_page = {
+       .name = "memory_spread_page",
+       .private = FILE_SPREAD_PAGE,
+};
+
+static struct cftype cft_spread_slab = {
+       .name = "memory_spread_slab",
+       .private = FILE_SPREAD_SLAB,
+};
+
  static int cpuset_populate_dir(struct dentry *cs_dentry)
  {
         int err;
@@ -1736,6 +1851,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
                 return err;
         if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
                 return err;
+       if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0)
+               return err;
+       if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
+               return err;
         if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
                 return err;
         return 0;
@@ -1764,12 +1883,16 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
         cs->flags = 0;
         if (notify_on_release(parent))
                 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+       if (is_spread_page(parent))
+               set_bit(CS_SPREAD_PAGE, &cs->flags);
+       if (is_spread_slab(parent))
+               set_bit(CS_SPREAD_SLAB, &cs->flags);
         cs->cpus_allowed = CPU_MASK_NONE;
         cs->mems_allowed = NODE_MASK_NONE;
         atomic_set(&cs->count, 0);
         INIT_LIST_HEAD(&cs->sibling);
         INIT_LIST_HEAD(&cs->children);
-       cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+       cs->mems_generation = cpuset_mems_generation++;
         fmeter_init(&cs->fmeter);
  
         cs->parent = parent;
@@ -1808,6 +1931,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
         return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
  }
  
+/*
+ * Locking note on the strange update_flag() call below:
+ *
+ * If the cpuset being removed is marked cpu_exclusive, then simulate
+ * turning cpu_exclusive off, which will call update_cpu_domains().
+ * The lock_cpu_hotplug() call in update_cpu_domains() must not be
+ * made while holding callback_mutex.  Elsewhere the kernel nests
+ * callback_mutex inside lock_cpu_hotplug() calls.  So the reverse
+ * nesting would risk an ABBA deadlock.
+ */
+
  static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
  {
         struct cpuset *cs = dentry->d_fsdata;
@@ -1827,11 +1961,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
                 mutex_unlock(&manage_mutex);
                 return -EBUSY;
         }
+       if (is_cpu_exclusive(cs)) {
+               int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
+               if (retval < 0) {
+                       mutex_unlock(&manage_mutex);
+                       return retval;
+               }
+       }
         parent = cs->parent;
         mutex_lock(&callback_mutex);
         set_bit(CS_REMOVED, &cs->flags);
-       if (is_cpu_exclusive(cs))
-               update_cpu_domains(cs);
         list_del(&cs->sibling); /* delete my sibling from parent->children */
         spin_lock(&cs->dentry->d_lock);
         d = dget(cs->dentry);
@@ -1859,7 +1998,7 @@ int __init cpuset_init_early(void)
         struct task_struct *tsk = current;
  
         tsk->cpuset = &top_cpuset;
-       tsk->cpuset->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+       tsk->cpuset->mems_generation = cpuset_mems_generation++;
         return 0;
  }
  
@@ -1878,7 +2017,7 @@ int __init cpuset_init(void)
         top_cpuset.mems_allowed = NODE_MASK_ALL;
  
         fmeter_init(&top_cpuset.fmeter);
-       top_cpuset.mems_generation = atomic_inc_return(&cpuset_mems_generation);
+       top_cpuset.mems_generation = cpuset_mems_generation++;
  
         init_task.cpuset = &top_cpuset;
  
@@ -1894,7 +2033,7 @@ int __init cpuset_init(void)
         }
         root = cpuset_mount->mnt_sb->s_root;
         root->d_fsdata = &top_cpuset;
-       root->d_inode->i_nlink++;
+       inc_nlink(root->d_inode);
         top_cpuset.dentry = root;
         root->d_inode->i_op = &cpuset_dir_inode_operations;
         number_of_cpusets = 1;
@@ -1906,6 +2045,100 @@ out:
         return err;
  }
  
+/*
+ * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets.  If this removes the
+ * last CPU or node from a cpuset, then the guarantee_online_cpus()
+ * or guarantee_online_mems() code will use that emptied cpuset's
+ * parent's online CPUs or nodes.  Cpusets that were already empty of
+ * CPUs or nodes are left empty.
+ *
+ * This routine is intentionally inefficient in a couple of regards.
+ * It will check all cpusets in a subtree even if the top cpuset of
+ * the subtree has no offline CPUs or nodes.  It checks both CPUs and
+ * nodes, even though the caller could have been coded to know that
+ * only one of CPUs or nodes needed to be checked on a given call.
+ * This was done to minimize text size rather than cpu cycles.
+ *
+ * Call with both manage_mutex and callback_mutex held.
+ *
+ * Recursive, on depth of cpuset subtree.
+ */
+
+static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+{
+       struct cpuset *c;
+
+       /* Each of our child cpusets mems must be online */
+       list_for_each_entry(c, &cur->children, sibling) {
+               guarantee_online_cpus_mems_in_subtree(c);
+               if (!cpus_empty(c->cpus_allowed))
+                       guarantee_online_cpus(c, &c->cpus_allowed);
+               if (!nodes_empty(c->mems_allowed))
+                       guarantee_online_mems(c, &c->mems_allowed);
+       }
+}
+
+/*
+ * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
+ * cpu_online_map and node_online_map.  Force the top cpuset to track
+ * what's online after any CPU or memory node hotplug or unplug event.
+ *
+ * To ensure that we don't remove a CPU or node from the top cpuset
+ * that is currently in use by a child cpuset (which would violate
+ * the rule that cpusets must be subsets of their parent), we first
+ * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ *
+ * Since there are two callers of this routine, one for CPU hotplug
+ * events and one for memory node hotplug events, we could have coded
+ * two separate routines here.  We code it as a single common routine
+ * in order to minimize text size.
+ */
+
+static void common_cpu_mem_hotplug_unplug(void)
+{
+       mutex_lock(&manage_mutex);
+       mutex_lock(&callback_mutex);
+
+       guarantee_online_cpus_mems_in_subtree(&top_cpuset);
+       top_cpuset.cpus_allowed = cpu_online_map;
+       top_cpuset.mems_allowed = node_online_map;
+
+       mutex_unlock(&callback_mutex);
+       mutex_unlock(&manage_mutex);
+}
+
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period.  This is necessary in order to make cpusets transparent
+ * (of no effect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
+ */
+
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+                               unsigned long phase, void *cpu)
+{
+       common_cpu_mem_hotplug_unplug();
+       return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Keep top_cpuset.mems_allowed tracking node_online_map.
+ * Call this routine anytime after you change node_online_map.
+ * See also the previous routine cpuset_handle_cpuhp().
+ */
+
+void cpuset_track_online_nodes(void)
+{
+       common_cpu_mem_hotplug_unplug();
+}
+#endif
+
  /**
   * cpuset_init_smp - initialize cpus_allowed
   *
@@ -1916,6 +2149,8 @@ void __init cpuset_init_smp(void)
  {
         top_cpuset.cpus_allowed = cpu_online_map;
         top_cpuset.mems_allowed = node_online_map;
+
+       hotcpu_notifier(cpuset_handle_cpuhp, 0);
  }
  
  /**
@@ -1969,7 +2204,7 @@ void cpuset_fork(struct task_struct *child)
   * because tsk is already marked PF_EXITING, so attach_task() won't
   * mess with it, or task is a failed fork, never visible to attach_task.
   *
- * Hack:
+ * the_top_cpuset_hack:
   *
   *    Set the exiting tasks cpuset to the root cpuset (top_cpuset).
   *
@@ -2008,7 +2243,7 @@ void cpuset_exit(struct task_struct *tsk)
         struct cpuset *cs;
  
         cs = tsk->cpuset;
-       tsk->cpuset = &top_cpuset;      /* Hack - see comment above */
+       tsk->cpuset = &top_cpuset;      /* the_top_cpuset_hack - see above */
  
         if (notify_on_release(cs)) {
                 char *pathbuf = NULL;
@@ -2085,7 +2320,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
         int i;
  
         for (i = 0; zl->zones[i]; i++) {
-               int nid = zl->zones[i]->zone_pgdat->node_id;
+               int nid = zone_to_nid(zl->zones[i]);
  
                 if (node_isset(nid, current->mems_allowed))
                         return 1;
@@ -2129,30 +2364,37 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
   * So only GFP_KERNEL allocations, if all nodes in the cpuset are
   * short of memory, might require taking the callback_mutex mutex.
   *
- * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
- * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
- * hardwall cpusets - no allocation on a node outside the cpuset is
- * allowed (unless in interrupt, of course).
+ * The first call here from mm/page_alloc.c:get_page_from_freelist()
+ * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so
+ * no allocation on a node outside the cpuset is allowed (unless in
+ * interrupt, of course).
   *
- * The second loop doesn't even call here for GFP_ATOMIC requests
- * (if the __alloc_pages() local variable 'wait' is set).  That check
- * and the checks below have the combined affect in the second loop of
- * the __alloc_pages() routine that:
+ * The second pass through get_page_from_freelist() doesn't even call
+ * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
+ * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
+ * in alloc_flags.  That logic and the checks below have the combined
+ * effect that:
   *     in_interrupt - any node ok (current task context irrelevant)
   *     GFP_ATOMIC   - any node ok
   *     GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
   *     GFP_USER     - only nodes in current tasks mems allowed ok.
+ *
+ * Rule:
+ *    Don't call cpuset_zone_allowed() if you can't sleep, unless you
+ *    pass in the __GFP_HARDWALL flag set in gfp_mask, which disables
+ *    the code that might scan up ancestor cpusets and sleep.
   **/
  
  int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
  {
         int node;                       /* node that zone z is on */
         const struct cpuset *cs;        /* current cpuset ancestors */
-       int allowed = 1;                /* is allocation in zone z allowed? */
+       int allowed;                    /* is allocation in zone z allowed? */
  
-       if (in_interrupt())
+       if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
                 return 1;
-       node = z->zone_pgdat->node_id;
+       node = zone_to_nid(z);
+       might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
         if (node_isset(node, current->mems_allowed))
                 return 1;
         if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
@@ -2201,6 +2443,44 @@ void cpuset_unlock(void)
  }
  
  /**
+ * cpuset_mem_spread_node() - On which node to begin search for a page
+ *
+ * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
+ * tasks in a cpuset with is_spread_page or is_spread_slab set),
+ * and if the memory allocation used cpuset_mem_spread_node()
+ * to determine on which node to start looking, as it will for
+ * certain page cache or slab cache pages such as used for file
+ * system buffers and inode caches, then instead of starting on the
+ * local node to look for a free page, rather spread the starting
+ * node around the task's mems_allowed nodes.
+ *
+ * We don't have to worry about the returned node being offline
+ * because "it can't happen", and even if it did, it would be ok.
+ *
+ * The routines calling guarantee_online_mems() are careful to
+ * only set nodes in task->mems_allowed that are online.  So it
+ * should not be possible for the following code to return an
+ * offline node.  But if it did, that would be ok, as this routine
+ * is not returning the node where the allocation must be, only
+ * the node where the search should start.  The zonelist passed to
+ * __alloc_pages() will include all nodes.  If the slab allocator
+ * is passed an offline node, it will fall back to the local node.
+ * See kmem_cache_alloc_node().
+ */
+
+int cpuset_mem_spread_node(void)
+{
+       int node;       /* node on which the caller should start its search */
+
+       node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+       if (node == MAX_NUMNODES)       /* NOTE(review): presumably "no later node" - confirm */
+               node = first_node(current->mems_allowed);       /* restart from the first allowed node */
+       current->cpuset_mem_spread_rotor = node;        /* next call continues after this node */
+       return node;
+}
+EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
+
+/**
   * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
   * @p: pointer to task_struct of some other task.
   *
@@ -2215,7 +2495,7 @@ void cpuset_unlock(void)
  int cpuset_excl_nodes_overlap(const struct task_struct *p)
  {
         const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
-       int overlap = 0;                /* do cpusets overlap? */
+       int overlap = 1;                /* do cpusets overlap? */
  
         task_lock(current);
         if (current->flags & PF_EXITING) {
@@ -2281,46 +2561,52 @@ void __cpuset_memory_pressure_bump(void)
   *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
   *    doesn't really matter if tsk->cpuset changes after we read it,
   *    and we take manage_mutex, keeping attach_task() from changing it
- *    anyway.
+ *    anyway.  No need to check that tsk->cpuset != NULL, thanks to
+ *    the_top_cpuset_hack in cpuset_exit(), which sets an exiting task's
+ *    cpuset to top_cpuset.
   */
-
  static int proc_cpuset_show(struct seq_file *m, void *v)
  {
-       struct cpuset *cs;
+       struct pid *pid;
         struct task_struct *tsk;
         char *buf;
-       int retval = 0;
+       int retval;
  
+       retval = -ENOMEM;
         buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
         if (!buf)
-               return -ENOMEM;
+               goto out;
+
+       retval = -ESRCH;
+       pid = m->private;
+       tsk = get_pid_task(pid, PIDTYPE_PID);
+       if (!tsk)
+               goto out_free;
  
-       tsk = m->private;
+       retval = -EINVAL;
         mutex_lock(&manage_mutex);
-       cs = tsk->cpuset;
-       if (!cs) {
-               retval = -EINVAL;
-               goto out;
-       }
  
-       retval = cpuset_path(cs, buf, PAGE_SIZE);
+       retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
         if (retval < 0)
-               goto out;
+               goto out_unlock;
         seq_puts(m, buf);
         seq_putc(m, '\n');
-out:
+out_unlock:
         mutex_unlock(&manage_mutex);
+       put_task_struct(tsk);
+out_free:
         kfree(buf);
+out:
         return retval;
  }
  
  static int cpuset_open(struct inode *inode, struct file *file)
  {
-       struct task_struct *tsk = PROC_I(inode)->task;
-       return single_open(file, proc_cpuset_show, tsk);
+       struct pid *pid = PROC_I(inode)->pid;
+       return single_open(file, proc_cpuset_show, pid);
  }
  
-struct file_operations proc_cpuset_operations = {
+const struct file_operations proc_cpuset_operations = {
         .open           = cpuset_open,
         .read           = seq_read,
         .llseek         = seq_lseek,