+ n = 0;
+
+ /* Load up mmarray[] with mm reference for each task in cpuset. */
+ cgroup_iter_start(cs->css.cgroup, &it);
+ while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
+ struct mm_struct *mm;
+
+ if (n >= ntasks) {
+ printk(KERN_WARNING
+ "Cpuset mempolicy rebind incomplete.\n");
+ break;
+ }
+ mm = get_task_mm(p);
+ if (!mm)
+ continue;
+ mmarray[n++] = mm;
+ }
+ cgroup_iter_end(cs->css.cgroup, &it);
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Now that we've dropped the tasklist spinlock, we can
+ * rebind the vma mempolicies of each mm in mmarray[] to their
+ * new cpuset, and release that mm. The mpol_rebind_mm()
+ * call takes mmap_sem, which we couldn't take while holding
+ * tasklist_lock. Forks can happen again now - the mpol_copy()
+ * cpuset_being_rebound check will catch such forks, and rebind
+ * their vma mempolicies too. Because we still hold the global
+ * cpuset manage_mutex, we know that no other rebind effort will
+ * be contending for the global variable cpuset_being_rebound.
+ * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+ * is idempotent. Also migrate pages in each mm to new nodes.
+ */
+ migrate = is_memory_migrate(cs);
+ for (i = 0; i < n; i++) {
+ struct mm_struct *mm = mmarray[i];
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
+ mmput(mm);
+ }
+
+ /* We're done rebinding vma's to this cpuset's new mems_allowed. */
+ kfree(mmarray);
+ cpuset_being_rebound = NULL;
+ retval = 0;
+done:
+ return retval;
+}
+
+/*
+ * Report whether the cpuset of the calling task is the one currently
+ * recorded in cpuset_being_rebound.
+ *
+ * Returns 1 when they match, 0 otherwise.
+ */
+int current_cpuset_is_being_rebound(void)
+{
+	if (task_cs(current) == cpuset_being_rebound)
+		return 1;
+	return 0;
+}
+
+/*
+ * update_memory_pressure_enabled - set the global memory pressure switch
+ * cs: not used by this routine
+ * buf: text parsed as a base-10 number; non-zero enables, zero disables
+ *
+ * Always returns 0.
+ *
+ * Call with manage_mutex held.
+ */
+
+static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
+{
+	cpuset_memory_pressure_enabled = simple_strtoul(buf, NULL, 10) ? 1 : 0;
+	return 0;
+}
+
+/*
+ * update_flag - parse a 0 or a 1 from a buffer and apply it to one flag bit
+ * bit: which flag to change (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
+ *      CS_SCHED_LOAD_BALANCE,
+ *      CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
+ *      CS_SPREAD_PAGE, CS_SPREAD_SLAB)
+ * cs: the cpuset whose flags are updated
+ * buf: text parsed as a base-10 number; non-zero sets the bit, zero clears it
+ *
+ * The change is first applied to a local copy of the cpuset and passed
+ * to validate_change(); a negative result from that check is returned
+ * unchanged and the cpuset is left untouched.  Otherwise the new flags
+ * are stored into the cpuset under callback_mutex and 0 is returned.
+ *
+ * Call with manage_mutex held.
+ */
+
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
+{
+	struct cpuset trialcs = *cs;
+	int have_cpus, balance_toggled;
+	int err;
+
+	if (simple_strtoul(buf, NULL, 10) != 0)
+		set_bit(bit, &trialcs.flags);
+	else
+		clear_bit(bit, &trialcs.flags);
+
+	err = validate_change(cs, &trialcs);
+	if (err < 0)
+		return err;
+
+	/*
+	 * Both conditions are sampled before cs->flags is overwritten,
+	 * so the old and new load balance settings can still be compared.
+	 */
+	have_cpus = !cpus_empty(trialcs.cpus_allowed);
+	balance_toggled = (is_sched_load_balance(cs) !=
+			   is_sched_load_balance(&trialcs));
+
+	mutex_lock(&callback_mutex);
+	cs->flags = trialcs.flags;
+	mutex_unlock(&callback_mutex);
+
+	/* Nothing more to do unless load balancing changed on a populated set. */
+	if (!have_cpus || !balance_toggled)
+		return 0;
+
+	rebuild_sched_domains();
+	return 0;
+}
+
+/*
+ * Frequency meter - How fast is some event occurring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter. There are four routines:
+ * fmeter_init() - initialize a frequency meter.
+ * fmeter_markevent() - called each time the event happens.
+ * fmeter_getrate() - returns the recent rate of such events.
+ * fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR). The time unit
+ * is 1 second. Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ *
+ * With an FM_COEF of 933, and a time base of 1 second, the filter
+ * has a half-life of 10 seconds, meaning that if the events quit
+ * happening, then the rate returned from the fmeter_getrate()
+ * will be cut in half each 10 seconds, until it converges to zero.
+ *
+ * It is not worth doing a real infinitely recursive filter. If more
+ * than FM_MAXTICKS ticks have elapsed since the last filter event,
+ * just compute FM_MAXTICKS ticks worth, by which point the level
+ * will be stable.
+ *
+ * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
+ * arithmetic overflow in the fmeter_update() routine.
+ *
+ * Given the simple 32 bit integer arithmetic used, this meter works
+ * best for reporting rates between one per millisecond (msec) and
+ * one per 32 (approx) seconds. At constant rates faster than one
+ * per msec it maxes out at values just under 1,000,000. At constant
+ * rates between one per msec, and one per second it will stabilize
+ * to a value N*1000, where N is the rate of events per second.
+ * At constant rates between one per second and one per 32 seconds,
+ * it will be choppy, moving up on the seconds that have an event,
+ * and then decaying until the next event. At rates slower than
+ * about one in 32 seconds, it decays all the way back to zero between
+ * each event.
+ */
+
+#define FM_COEF 933 /* per-tick decay factor (over FM_SCALE); half-life of 10 secs */
+#define FM_MAXTICKS ((time_t)99) /* cap on decay steps per update; level is stable beyond this */
+#define FM_MAXCNT 1000000 /* ceiling on unprocessed cnt, avoids overflow in fmeter_update() */
+#define FM_SCALE 1000 /* faux fixed point scale: 3 decimal digits of precision */
+
+/*
+ * Put a frequency meter into its starting state: lock initialized,
+ * timestamp, filtered value and pending event count all zero.
+ */
+static void fmeter_init(struct fmeter *fmp)
+{
+	spin_lock_init(&fmp->lock);
+	fmp->time = 0;
+	fmp->val = 0;
+	fmp->cnt = 0;
+}
+
+/*
+ * Internal meter update - age the filtered value by the number of
+ * seconds elapsed since the last update (at most FM_MAXTICKS steps),
+ * then fold in the events counted so far and reset that count.
+ *
+ * Does nothing if no time has elapsed since the previous update.
+ * fmeter_markevent() and fmeter_getrate() call this with fmp->lock held.
+ */
+static void fmeter_update(struct fmeter *fmp)
+{
+	time_t now = get_seconds();
+	time_t elapsed = now - fmp->time;
+	time_t step;
+
+	if (!elapsed)
+		return;
+
+	/* Decay once per elapsed tick, but never more than FM_MAXTICKS times. */
+	if (elapsed > FM_MAXTICKS)
+		elapsed = FM_MAXTICKS;
+	for (step = 0; step < elapsed; step++)
+		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
+	fmp->time = now;
+
+	/* Add the weighted contribution of the events seen since last time. */
+	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
+	fmp->cnt = 0;
+}
+
+/*
+ * Record one event: bring the meter up to date, then add FM_SCALE to
+ * the pending count, clamping it at FM_MAXCNT.  Runs under fmp->lock.
+ */
+static void fmeter_markevent(struct fmeter *fmp)
+{
+	spin_lock(&fmp->lock);
+	fmeter_update(fmp);
+	fmp->cnt += FM_SCALE;
+	if (fmp->cnt > FM_MAXCNT)
+		fmp->cnt = FM_MAXCNT;
+	spin_unlock(&fmp->lock);
+}
+
+/*
+ * Bring the meter up to date and return its filtered value.  The update
+ * and the read are both done under fmp->lock.
+ */
+static int fmeter_getrate(struct fmeter *fmp)
+{
+	int rate;
+
+	spin_lock(&fmp->lock);
+	fmeter_update(fmp);
+	rate = fmp->val;
+	spin_unlock(&fmp->lock);
+
+	return rate;
+}
+
+/*
+ * Decide whether tsk may be attached to the cpuset behind cont.
+ *
+ * Returns -ENOSPC if that cpuset has no cpus or no memory nodes;
+ * otherwise returns whatever security_task_setscheduler() reports
+ * for the task.
+ */
+static int cpuset_can_attach(struct cgroup_subsys *ss,
+			     struct cgroup *cont, struct task_struct *tsk)
+{
+	struct cpuset *cs = cgroup_cs(cont);
+
+	if (cpus_empty(cs->cpus_allowed))
+		return -ENOSPC;
+	if (nodes_empty(cs->mems_allowed))
+		return -ENOSPC;
+
+	return security_task_setscheduler(tsk, 0, NULL);
+}
+
+static void cpuset_attach(struct cgroup_subsys *ss,
+ struct cgroup *cont, struct cgroup *oldcont,
+ struct task_struct *tsk)
+{
+ cpumask_t cpus;
+ nodemask_t from, to;
+ struct mm_struct *mm;
+ struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *oldcs = cgroup_cs(oldcont);
+
+ mutex_lock(&callback_mutex);