X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fcgroup.c;h=4fd90e12977236f54a4dd29168bc2db04bb1cdbc;hb=5f1664f92b2247111b7d37e454a050b76ac61b7f;hp=a9433f50e53d0670b29ee22aea50d7f840c080ca;hpb=102a775e3647628727ae83a9a6abf0564c3ca7cb;p=safe%2Fjmp%2Flinux-2.6

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a9433f5..4fd90e1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
  */
 
 #include
+#include
 #include
 #include
 #include
@@ -50,6 +51,7 @@
 #include
 #include
 #include
+#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 
 #include
@@ -165,6 +167,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
  */
 static int need_forkexit_callback __read_mostly;
 
+#ifdef CONFIG_PROVE_LOCKING
+int cgroup_lock_is_held(void)
+{
+	return lockdep_is_held(&cgroup_mutex);
+}
+#else /* #ifdef CONFIG_PROVE_LOCKING */
+int cgroup_lock_is_held(void)
+{
+	return mutex_is_locked(&cgroup_mutex);
+}
+#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
+
+EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
+
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
@@ -266,6 +282,12 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
 	return &css_set_table[index];
 }
 
+static void free_css_set_rcu(struct rcu_head *obj)
+{
+	struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+	kfree(cg);
+}
+
 /* We don't maintain the lists running through each css_set to its
  * task until after the first call to cgroup_iter_start(). This
  * reduces the fork()/exit() overhead for people who have cgroups
@@ -309,7 +331,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 	}
 	write_unlock(&css_set_lock);
-	kfree(cg);
+	call_rcu(&cg->rcu_head, free_css_set_rcu);
 }
 
 /*
@@ -696,7 +718,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
 static const struct inode_operations cgroup_dir_inode_operations;
-static struct file_operations proc_cgroupstats_operations;
+static const struct file_operations proc_cgroupstats_operations;
 
 static struct backing_dev_info cgroup_backing_dev_info = {
 	.name = "cgroup",
@@ -776,6 +798,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 */
 		deactivate_super(cgrp->root->sb);
 
+		/*
+		 * if we're getting rid of the cgroup, refcount should ensure
+		 * that there are no pidlists left.
+		 */
+		BUG_ON(!list_empty(&cgrp->pidlists));
+
 		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
 	}
 	iput(inode);
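[Annotation: the two css_set hunks above swap a direct kfree() for an RCU-deferred free, so readers that walk css_set pointers under rcu_read_lock() can never trip over freed memory. A minimal sketch of that pattern, using a hypothetical struct foo in place of the real css_set:]

    #include <linux/kernel.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    /* illustrative stand-in for css_set, not the real structure */
    struct foo {
            int data;
            struct rcu_head rcu_head;  /* embedded, so call_rcu() needs no allocation */
    };

    static void free_foo_rcu(struct rcu_head *head)
    {
            /* runs after a grace period: no RCU reader can still see the object */
            kfree(container_of(head, struct foo, rcu_head));
    }

    static void put_foo(struct foo *f)
    {
            /* instead of kfree(f): defer the free past all current readers */
            call_rcu(&f->rcu_head, free_foo_rcu);
    }

[The embedded rcu_head is what lets free_css_set_rcu() recover the enclosing css_set with container_of().]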
@@ -1121,8 +1149,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->css_sets);
 	INIT_LIST_HEAD(&cgrp->release_list);
-	init_rwsem(&(cgrp->tasks.mutex));
-	init_rwsem(&(cgrp->procs.mutex));
+	INIT_LIST_HEAD(&cgrp->pidlists);
+	mutex_init(&cgrp->pidlist_mutex);
 }
 
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1539,7 +1567,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, tsk);
+			retval = ss->can_attach(ss, cgrp, tsk, false);
 			if (retval)
 				return retval;
 		}
@@ -1577,7 +1605,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, tsk);
+			ss->attach(ss, cgrp, oldcgrp, tsk, false);
 	}
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
 	synchronize_rcu();
@@ -1697,14 +1725,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
 		return -EFAULT;
 
 	buffer[nbytes] = 0;	/* nul-terminate */
-	strstrip(buffer);
 	if (cft->write_u64) {
-		u64 val = simple_strtoull(buffer, &end, 0);
+		u64 val = simple_strtoull(strstrip(buffer), &end, 0);
 		if (*end)
 			return -EINVAL;
 		retval = cft->write_u64(cgrp, cft, val);
 	} else {
-		s64 val = simple_strtoll(buffer, &end, 0);
+		s64 val = simple_strtoll(strstrip(buffer), &end, 0);
 		if (*end)
 			return -EINVAL;
 		retval = cft->write_s64(cgrp, cft, val);
@@ -1740,8 +1767,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
 	}
 
 	buffer[nbytes] = 0;	/* nul-terminate */
-	strstrip(buffer);
-	retval = cft->write_string(cgrp, cft, buffer);
+	retval = cft->write_string(cgrp, cft, strstrip(buffer));
 	if (!retval)
 		retval = nbytes;
 out:
@@ -1850,7 +1876,7 @@ static int cgroup_seqfile_release(struct inode *inode, struct file *file)
 	return single_release(inode, file);
 }
 
-static struct file_operations cgroup_seqfile_operations = {
+static const struct file_operations cgroup_seqfile_operations = {
 	.read = seq_read,
 	.write = cgroup_file_write,
 	.llseek = seq_lseek,
@@ -1909,7 +1935,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
 }
 
-static struct file_operations cgroup_file_operations = {
+static const struct file_operations cgroup_file_operations = {
 	.read = cgroup_file_read,
 	.write = cgroup_file_write,
 	.llseek = generic_file_llseek,
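[Annotation: the cgroup_write_X64()/cgroup_write_string() hunks above fix a misuse of strstrip(). It trims trailing whitespace in place, but leading whitespace is only skipped via its return value, so calling it and then parsing the original buffer silently keeps the leading blanks. A userspace sketch with a simplified stand-in (my_strstrip() is illustrative, not the kernel function):]

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* simplified analogue of the kernel's strstrip(): trims trailing
     * whitespace in place, returns a pointer past leading whitespace */
    static char *my_strstrip(char *s)
    {
            size_t len = strlen(s);
            while (len && isspace((unsigned char)s[len - 1]))
                    s[--len] = '\0';
            while (isspace((unsigned char)*s))
                    s++;
            return s;
    }

    int main(void)
    {
            char buf1[] = "  42\n";
            my_strstrip(buf1);                    /* old, buggy usage: result discarded */
            printf("[%s]\n", buf1);               /* prints "[  42]" - blanks remain    */

            char buf2[] = "  42\n";
            printf("[%s]\n", my_strstrip(buf2));  /* fixed usage prints "[42]"          */
            return 0;
    }

[That is exactly why the patch feeds strstrip()'s return value straight into simple_strtoull()/simple_strtoll() instead of the raw buffer.]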
@@ -2345,6 +2371,42 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  */
 
 /*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
+{
+	if (PIDLIST_TOO_LARGE(count))
+		return vmalloc(count * sizeof(pid_t));
+	else
+		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+static void pidlist_free(void *p)
+{
+	if (is_vmalloc_addr(p))
+		vfree(p);
+	else
+		kfree(p);
+}
+static void *pidlist_resize(void *p, int newcount)
+{
+	void *newlist;
+	/* note: if new alloc fails, old p will still be valid either way */
+	if (is_vmalloc_addr(p)) {
+		newlist = vmalloc(newcount * sizeof(pid_t));
+		if (!newlist)
+			return NULL;
+		memcpy(newlist, p, newcount * sizeof(pid_t));
+		vfree(p);
+	} else {
+		newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
+	}
+	return newlist;
+}
+
+/*
  * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
  * If the new stripped list is sufficiently smaller and there's enough memory
  * to allocate a new buffer, will let go of the unneeded memory. Returns the
@@ -2383,7 +2445,7 @@ after:
 	 * we'll just stay with what we've got.
 	 */
 	if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
-		newlist = krealloc(list, dest * sizeof(pid_t), GFP_KERNEL);
+		newlist = pidlist_resize(list, dest);
 		if (newlist)
 			*p = newlist;
 	}
@@ -2396,9 +2458,58 @@ static int cmppid(const void *a, const void *b)
 }
 
 /*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+						  enum cgroup_filetype type)
+{
+	struct cgroup_pidlist *l;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+	/*
+	 * We can't drop the pidlist_mutex before taking the l->mutex in case
+	 * the last ref-holder is trying to remove l from the list at the same
+	 * time. Holding the pidlist_mutex precludes somebody taking whichever
+	 * list we find out from under us - compare release_pid_array().
+	 */
+	mutex_lock(&cgrp->pidlist_mutex);
+	list_for_each_entry(l, &cgrp->pidlists, links) {
+		if (l->key.type == type && l->key.ns == ns) {
+			/* found a matching list - drop the extra refcount */
+			put_pid_ns(ns);
+			/* make sure l doesn't vanish out from under us */
+			down_write(&l->mutex);
+			mutex_unlock(&cgrp->pidlist_mutex);
+			return l;
+		}
+	}
+	/* entry not found; create a new one */
+	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+	if (!l) {
+		mutex_unlock(&cgrp->pidlist_mutex);
+		put_pid_ns(ns);
+		return l;
+	}
+	init_rwsem(&l->mutex);
+	down_write(&l->mutex);
+	l->key.type = type;
+	l->key.ns = ns;
+	l->use_count = 0; /* don't increment here */
+	l->list = NULL;
+	l->owner = cgrp;
+	list_add(&l->links, &cgrp->pidlists);
+	mutex_unlock(&cgrp->pidlist_mutex);
+	return l;
+}
+
+/*
  * Load a cgroup's pidarray with either procs' tgids or tasks' pids
  */
-static int pidlist_array_load(struct cgroup *cgrp, bool procs)
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+			      struct cgroup_pidlist **lp)
 {
 	pid_t *array;
 	int length;
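[Annotation: pidlist_allocate()/pidlist_free() above fall back to vmalloc() once the pid array outgrows two pages, because large physically contiguous kmalloc() requests become unreliable; is_vmalloc_addr() then selects the matching free routine. The same shape in isolation, with hypothetical names (buf_alloc()/buf_free() are illustrative, not kernel API):]

    #include <linux/mm.h>       /* is_vmalloc_addr() */
    #include <linux/slab.h>
    #include <linux/vmalloc.h>

    /* illustrative helper pair: small buffers come from the slab,
     * anything over two pages from vmalloc */
    static void *buf_alloc(size_t bytes)
    {
            if (bytes > PAGE_SIZE * 2)
                    return vmalloc(bytes);
            return kmalloc(bytes, GFP_KERNEL);
    }

    static void buf_free(void *p)
    {
            /* is_vmalloc_addr() tells the two address ranges apart, so
             * callers never need to remember which allocator was used */
            if (is_vmalloc_addr(p))
                    vfree(p);
            else
                    kfree(p);
    }

[Note also that pidlist_resize() copies only newcount entries on the vmalloc path, which is safe here because its sole caller, pidlist_uniq(), only ever shrinks the array.]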
@@ -2414,7 +2525,7 @@ static int pidlist_array_load(struct cgroup *cgrp, bool procs)
 	 * show up until sometime later on.
 	 */
 	length = cgroup_task_count(cgrp);
-	array = kmalloc(length * sizeof(pid_t), GFP_KERNEL);
+	array = pidlist_allocate(length);
 	if (!array)
 		return -ENOMEM;
 	/* now, populate the array */
@@ -2423,7 +2534,10 @@
 		if (unlikely(n == length))
 			break;
 		/* get tgid or pid for procs or tasks file respectively */
-		pid = (procs ? task_tgid_vnr(tsk) : task_pid_vnr(tsk));
+		if (type == CGROUP_FILE_PROCS)
+			pid = task_tgid_vnr(tsk);
+		else
+			pid = task_pid_vnr(tsk);
 		if (pid > 0) /* make sure to only use valid results */
 			array[n++] = pid;
 	}
@@ -2431,19 +2545,20 @@
 	length = n;
 	/* now sort & (if procs) strip out duplicates */
 	sort(array, length, sizeof(pid_t), cmppid, NULL);
-	if (procs) {
+	if (type == CGROUP_FILE_PROCS)
 		length = pidlist_uniq(&array, length);
-		l = &(cgrp->procs);
-	} else {
-		l = &(cgrp->tasks);
+	l = cgroup_pidlist_find(cgrp, type);
+	if (!l) {
+		pidlist_free(array);
+		return -ENOMEM;
 	}
-	/* store array in cgroup, freeing old if necessary */
-	down_write(&l->mutex);
-	kfree(l->list);
+	/* store array, freeing old if necessary - lock already held */
+	pidlist_free(l->list);
 	l->list = array;
 	l->length = length;
 	l->use_count++;
 	up_write(&l->mutex);
+	*lp = l;
 	return 0;
 }
@@ -2586,13 +2701,26 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
 
 static void cgroup_release_pid_array(struct cgroup_pidlist *l)
 {
+	/*
+	 * the case where we're the last user of this particular pidlist will
+	 * have us remove it from the cgroup's list, which entails taking the
+	 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
+	 * pidlist_mutex, we have to take pidlist_mutex first.
+	 */
+	mutex_lock(&l->owner->pidlist_mutex);
 	down_write(&l->mutex);
 	BUG_ON(!l->use_count);
 	if (!--l->use_count) {
-		kfree(l->list);
-		l->list = NULL;
-		l->length = 0;
+		/* we're the last user if refcount is 0; remove and free */
+		list_del(&l->links);
+		mutex_unlock(&l->owner->pidlist_mutex);
+		pidlist_free(l->list);
+		put_pid_ns(l->key.ns);
+		up_write(&l->mutex);
+		kfree(l);
+		return;
 	}
+	mutex_unlock(&l->owner->pidlist_mutex);
 	up_write(&l->mutex);
 }
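[Annotation: cgroup_release_pid_array() above encodes a strict lock order: the cgroup-wide pidlist_mutex is always taken before a pidlist's own rwsem, on the lookup path (cgroup_pidlist_find()) as well as on release, so a list can never be freed between being found and being locked. A condensed sketch of that discipline, with hypothetical bucket/entry types standing in for the cgroup and its pidlists:]

    #include <linux/list.h>
    #include <linux/mutex.h>
    #include <linux/rwsem.h>
    #include <linux/slab.h>

    struct bucket {
            struct mutex lock;            /* like cgrp->pidlist_mutex */
            struct list_head entries;
    };

    struct entry {
            struct rw_semaphore rwsem;    /* like cgroup_pidlist->mutex */
            int use_count;
            struct list_head link;
            struct bucket *owner;
    };

    static void entry_put(struct entry *e)
    {
            mutex_lock(&e->owner->lock);  /* container lock first ...   */
            down_write(&e->rwsem);        /* ... then the entry's lock  */
            if (!--e->use_count) {
                    list_del(&e->link);   /* nobody can find it anymore */
                    mutex_unlock(&e->owner->lock);
                    up_write(&e->rwsem);
                    kfree(e);
                    return;
            }
            mutex_unlock(&e->owner->lock);
            up_write(&e->rwsem);
    }

[Taking the container lock first on both paths is what prevents the lookup from racing with the final put that unlinks and frees the entry.]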
@@ -2623,10 +2751,10 @@ static const struct file_operations cgroup_pidlist_operations = {
  * in the cgroup.
  */
 /* helper function for the two below it */
-static int cgroup_pidlist_open(struct file *file, bool procs)
+static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-	struct cgroup_pidlist *l = (procs ? &cgrp->procs : &cgrp->tasks);
+	struct cgroup_pidlist *l;
 	int retval;
 
 	/* Nothing to do for write-only files */
@@ -2634,7 +2762,7 @@
 		return 0;
 
 	/* have the array populated */
-	retval = pidlist_array_load(cgrp, procs);
+	retval = pidlist_array_load(cgrp, type, &l);
 	if (retval)
 		return retval;
 	/* configure file information */
@@ -2650,11 +2778,11 @@
 }
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
 {
-	return cgroup_pidlist_open(file, false);
+	return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
 }
 static int cgroup_procs_open(struct inode *unused, struct file *file)
 {
-	return cgroup_pidlist_open(file, true);
+	return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
 }
 
 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
@@ -2823,14 +2951,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	for_each_subsys(root, ss) {
 		struct cgroup_subsys_state *css = ss->create(ss, cgrp);
+
 		if (IS_ERR(css)) {
 			err = PTR_ERR(css);
 			goto err_destroy;
 		}
 		init_cgroup_css(css, ss, cgrp);
-		if (ss->use_id)
-			if (alloc_css_id(ss, parent, cgrp))
+		if (ss->use_id) {
+			err = alloc_css_id(ss, parent, cgrp);
+			if (err)
 				goto err_destroy;
+		}
 		/* At error, ->destroy() callback has to free assigned ID. */
 	}
@@ -3253,7 +3384,7 @@ static int cgroup_open(struct inode *inode, struct file *file)
 	return single_open(file, proc_cgroup_show, pid);
 }
 
-struct file_operations proc_cgroup_operations = {
+const struct file_operations proc_cgroup_operations = {
 	.open = cgroup_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
@@ -3282,7 +3413,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file)
 	return single_open(file, proc_cgroupstats_show, NULL);
 }
 
-static struct file_operations proc_cgroupstats_operations = {
+static const struct file_operations proc_cgroupstats_operations = {
 	.open = cgroupstats_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
@@ -3592,8 +3723,10 @@ static void check_for_release(struct cgroup *cgrp)
 void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
+	int val;
 	rcu_read_lock();
-	if (atomic_dec_return(&css->refcnt) == 1) {
+	val = atomic_dec_return(&css->refcnt);
+	if (val == 1) {
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
@@ -3601,6 +3734,7 @@ void __css_put(struct cgroup_subsys_state *css)
 		cgroup_wakeup_rmdir_waiter(cgrp);
 	}
 	rcu_read_unlock();
+	WARN_ON_ONCE(val < 1);
 }
 
 /*
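[Annotation: the __css_put() hunks above make the biased refcount explicit: css->refcnt rests at 1, so a post-decrement value of 1 means the last external reference is gone, while anything below 1 is an underflow worth warning about exactly once, outside the RCU read section. A stripped-down sketch of the same discipline; sketch_get()/sketch_put() and the standalone counter are illustrative only:]

    #include <linux/atomic.h>
    #include <linux/kernel.h>

    /* biased refcount: the counter rests at 1, never at 0 */
    static atomic_t ref = ATOMIC_INIT(1);

    static void sketch_get(void)
    {
            atomic_inc(&ref);
    }

    static void sketch_put(void)
    {
            int val = atomic_dec_return(&ref);

            if (val == 1)
                    pr_debug("last external reference dropped\n");
            WARN_ON_ONCE(val < 1);  /* underflow: more puts than gets */
    }

[Capturing the decremented value in a local, as the patch does, is what allows the underflow check to run after rcu_read_unlock() without re-reading a counter that may have changed.]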