X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fcgroup.c;h=7ccba4bc5e3b815a9ac9a9c5a0e40d8e70b5c780;hb=906010b2134e14a2e377decbadd357b3d0ab9c6a;hp=a9433f50e53d0670b29ee22aea50d7f840c080ca;hpb=102a775e3647628727ae83a9a6abf0564c3ca7cb;p=safe%2Fjmp%2Flinux-2.6

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a9433f5..7ccba4b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -50,6 +50,7 @@
 #include 
 #include 
 #include 
+#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include 
@@ -266,6 +267,12 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
 	return &css_set_table[index];
 }
 
+static void free_css_set_rcu(struct rcu_head *obj)
+{
+	struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+	kfree(cg);
+}
+
 /* We don't maintain the lists running through each css_set to its
  * task until after the first call to cgroup_iter_start(). This
  * reduces the fork()/exit() overhead for people who have cgroups
@@ -309,7 +316,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 	}
 
 	write_unlock(&css_set_lock);
-	kfree(cg);
+	call_rcu(&cg->rcu_head, free_css_set_rcu);
 }
 
 /*
@@ -776,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 */
 		deactivate_super(cgrp->root->sb);
 
+		/*
+		 * if we're getting rid of the cgroup, refcount should ensure
+		 * that there are no pidlists left.
+		 */
+		BUG_ON(!list_empty(&cgrp->pidlists));
+
 		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
 	}
 	iput(inode);
@@ -1121,8 +1134,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->css_sets);
 	INIT_LIST_HEAD(&cgrp->release_list);
-	init_rwsem(&(cgrp->tasks.mutex));
-	init_rwsem(&(cgrp->procs.mutex));
+	INIT_LIST_HEAD(&cgrp->pidlists);
+	mutex_init(&cgrp->pidlist_mutex);
 }
 
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1539,7 +1552,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, tsk);
+			retval = ss->can_attach(ss, cgrp, tsk, false);
 			if (retval)
 				return retval;
 		}
@@ -1577,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, tsk);
+			ss->attach(ss, cgrp, oldcgrp, tsk, false);
 	}
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
 	synchronize_rcu();
@@ -2345,6 +2358,42 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  */
 
 /*
+ * The following three functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
+{
+	if (PIDLIST_TOO_LARGE(count))
+		return vmalloc(count * sizeof(pid_t));
+	else
+		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+static void pidlist_free(void *p)
+{
+	if (is_vmalloc_addr(p))
+		vfree(p);
+	else
+		kfree(p);
+}
+static void *pidlist_resize(void *p, int newcount)
+{
+	void *newlist;
+	/* note: if new alloc fails, old p will still be valid either way */
+	if (is_vmalloc_addr(p)) {
+		newlist = vmalloc(newcount * sizeof(pid_t));
+		if (!newlist)
+			return NULL;
+		memcpy(newlist, p, newcount * sizeof(pid_t));
+		vfree(p);
+	} else {
+		newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
+	}
+	return newlist;
+}
+
+/*
  * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
  * If the new stripped list is sufficiently smaller and there's enough memory
  * to allocate a new buffer, will let go of the unneeded memory. Returns the
@@ -2383,7 +2432,7 @@ after:
 	 * we'll just stay with what we've got.
 	 */
 	if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
-		newlist = krealloc(list, dest * sizeof(pid_t), GFP_KERNEL);
+		newlist = pidlist_resize(list, dest);
 		if (newlist)
 			*p = newlist;
 	}
@@ -2396,9 +2445,58 @@ static int cmppid(const void *a, const void *b)
 }
 
 /*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+						  enum cgroup_filetype type)
+{
+	struct cgroup_pidlist *l;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+	/*
+	 * We can't drop the pidlist_mutex before taking the l->mutex in case
+	 * the last ref-holder is trying to remove l from the list at the same
+	 * time. Holding the pidlist_mutex precludes somebody taking whichever
+	 * list we find out from under us - compare release_pid_array().
+	 */
+	mutex_lock(&cgrp->pidlist_mutex);
+	list_for_each_entry(l, &cgrp->pidlists, links) {
+		if (l->key.type == type && l->key.ns == ns) {
+			/* found a matching list - drop the extra refcount */
+			put_pid_ns(ns);
+			/* make sure l doesn't vanish out from under us */
+			down_write(&l->mutex);
+			mutex_unlock(&cgrp->pidlist_mutex);
+			return l;
+		}
+	}
+	/* entry not found; create a new one */
+	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+	if (!l) {
+		mutex_unlock(&cgrp->pidlist_mutex);
+		put_pid_ns(ns);
+		return l;
+	}
+	init_rwsem(&l->mutex);
+	down_write(&l->mutex);
+	l->key.type = type;
+	l->key.ns = ns;
+	l->use_count = 0; /* don't increment here */
+	l->list = NULL;
+	l->owner = cgrp;
+	list_add(&l->links, &cgrp->pidlists);
+	mutex_unlock(&cgrp->pidlist_mutex);
+	return l;
+}
+
+/*
  * Load a cgroup's pidarray with either procs' tgids or tasks' pids
  */
-static int pidlist_array_load(struct cgroup *cgrp, bool procs)
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+			      struct cgroup_pidlist **lp)
 {
 	pid_t *array;
 	int length;
@@ -2414,7 +2512,7 @@ static int pidlist_array_load(struct cgroup *cgrp, bool procs)
 	 * show up until sometime later on.
	 */
 	length = cgroup_task_count(cgrp);
-	array = kmalloc(length * sizeof(pid_t), GFP_KERNEL);
+	array = pidlist_allocate(length);
 	if (!array)
 		return -ENOMEM;
 	/* now, populate the array */
@@ -2423,7 +2521,10 @@
 		if (unlikely(n == length))
 			break;
 		/* get tgid or pid for procs or tasks file respectively */
-		pid = (procs ? task_tgid_vnr(tsk) : task_pid_vnr(tsk));
+		if (type == CGROUP_FILE_PROCS)
+			pid = task_tgid_vnr(tsk);
+		else
+			pid = task_pid_vnr(tsk);
 		if (pid > 0) /* make sure to only use valid results */
 			array[n++] = pid;
 	}
@@ -2431,19 +2532,20 @@
 	length = n;
 	/* now sort & (if procs) strip out duplicates */
 	sort(array, length, sizeof(pid_t), cmppid, NULL);
-	if (procs) {
+	if (type == CGROUP_FILE_PROCS)
 		length = pidlist_uniq(&array, length);
-		l = &(cgrp->procs);
-	} else {
-		l = &(cgrp->tasks);
+	l = cgroup_pidlist_find(cgrp, type);
+	if (!l) {
+		pidlist_free(array);
+		return -ENOMEM;
 	}
-	/* store array in cgroup, freeing old if necessary */
-	down_write(&l->mutex);
-	kfree(l->list);
+	/* store array, freeing old if necessary - lock already held */
+	pidlist_free(l->list);
 	l->list = array;
 	l->length = length;
 	l->use_count++;
 	up_write(&l->mutex);
+	*lp = l;
 	return 0;
 }
 
@@ -2586,13 +2688,26 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
 
 static void cgroup_release_pid_array(struct cgroup_pidlist *l)
 {
+	/*
+	 * the case where we're the last user of this particular pidlist will
+	 * have us remove it from the cgroup's list, which entails taking the
+	 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
+	 * pidlist_mutex, we have to take pidlist_mutex first.
+	 */
+	mutex_lock(&l->owner->pidlist_mutex);
 	down_write(&l->mutex);
 	BUG_ON(!l->use_count);
 	if (!--l->use_count) {
-		kfree(l->list);
-		l->list = NULL;
-		l->length = 0;
+		/* we're the last user if refcount is 0; remove and free */
+		list_del(&l->links);
+		mutex_unlock(&l->owner->pidlist_mutex);
+		pidlist_free(l->list);
+		put_pid_ns(l->key.ns);
+		up_write(&l->mutex);
+		kfree(l);
+		return;
 	}
+	mutex_unlock(&l->owner->pidlist_mutex);
 	up_write(&l->mutex);
 }
 
@@ -2623,10 +2738,10 @@ static const struct file_operations cgroup_pidlist_operations = {
  * in the cgroup.
  */
 /* helper function for the two below it */
-static int cgroup_pidlist_open(struct file *file, bool procs)
+static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-	struct cgroup_pidlist *l = (procs ? &cgrp->procs : &cgrp->tasks);
+	struct cgroup_pidlist *l;
 	int retval;
 
 	/* Nothing to do for write-only files */
@@ -2634,7 +2749,7 @@
 		return 0;
 
 	/* have the array populated */
-	retval = pidlist_array_load(cgrp, procs);
+	retval = pidlist_array_load(cgrp, type, &l);
 	if (retval)
 		return retval;
 	/* configure file information */
@@ -2650,11 +2765,11 @@
 }
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
 {
-	return cgroup_pidlist_open(file, false);
+	return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
}
 static int cgroup_procs_open(struct inode *unused, struct file *file)
 {
-	return cgroup_pidlist_open(file, true);
+	return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
 }
 
 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
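
A note on the allocation helpers: pidlist_allocate()/pidlist_free() simply pick an allocator by size, kmalloc() for anything up to two pages and vmalloc() beyond that, and is_vmalloc_addr() lets the free path recover which one was used. Below is a minimal userspace sketch of the same pattern, for illustration only: every name in it is invented, malloc() stands in for kmalloc(), mmap() for vmalloc(), and because userspace has no is_vmalloc_addr(), the free path re-derives the decision from the element count instead of the pointer.

#define _DEFAULT_SOURCE
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* mirrors PIDLIST_TOO_LARGE(): "large" means more than two pages */
static int too_large(size_t count)
{
	return count * sizeof(int) > 2 * (size_t)sysconf(_SC_PAGESIZE);
}

static void *list_alloc(size_t count)
{
	if (too_large(count)) {
		void *p = mmap(NULL, count * sizeof(int),
			       PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		return p == MAP_FAILED ? NULL : p;
	}
	return malloc(count * sizeof(int));
}

static void list_free(void *p, size_t count)
{
	if (too_large(count))	/* same test as at allocation time */
		munmap(p, count * sizeof(int));
	else
		free(p);
}

int main(void)
{
	size_t small = 16, big = 1 << 16;	/* 64K entries: past the threshold */
	int *a = list_alloc(small);
	int *b = list_alloc(big);

	if (!a || !b)
		return 1;
	memset(b, 0, big * sizeof(int));	/* the large mapping is usable */
	list_free(a, small);
	list_free(b, big);
	return 0;
}

One subtlety in pidlist_resize() is worth calling out: on the vmalloc path it copies newcount entries out of the old buffer, which is safe only because its lone caller, pidlist_uniq(), only ever shrinks the list; a general-purpose resize would copy the smaller of the old and new counts. The in-code TODOs already acknowledge that these helpers are a stopgap.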
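
The locking rule that keeps cgroup_pidlist_find() and cgroup_release_pid_array() safe against each other is that both take the cgroup-wide pidlist_mutex before any per-list lock, so a last-user teardown can never race with a concurrent lookup of the same list. The pthread sketch below mirrors that protocol; it is an analogue with invented names (entry_get()/entry_put() standing in for find/release) and a plain mutex where the kernel uses an rwsem, not the kernel code itself.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	pthread_mutex_t lock;	/* plays the role of l->mutex */
	int use_count;
	struct entry *next;
};

struct registry {
	pthread_mutex_t lock;	/* plays the role of cgrp->pidlist_mutex */
	struct entry *head;
};

/* find-or-create; returns with the entry's use_count raised */
static struct entry *entry_get(struct registry *r)
{
	struct entry *e;

	pthread_mutex_lock(&r->lock);		/* registry lock first */
	e = r->head;				/* (real code matches a key here) */
	if (e) {
		pthread_mutex_lock(&e->lock);	/* per-entry lock second */
		pthread_mutex_unlock(&r->lock);
		e->use_count++;
		pthread_mutex_unlock(&e->lock);
		return e;
	}
	e = calloc(1, sizeof(*e));		/* not found: create and link */
	if (e) {
		pthread_mutex_init(&e->lock, NULL);
		e->use_count = 1;	/* the kernel defers this to the loader */
		r->head = e;
	}
	pthread_mutex_unlock(&r->lock);
	return e;
}

static void entry_put(struct registry *r, struct entry *e)
{
	/* same order as entry_get(): registry lock, then entry lock */
	pthread_mutex_lock(&r->lock);
	pthread_mutex_lock(&e->lock);
	if (--e->use_count == 0) {
		/* unlink under the registry lock: no new lookup can find e */
		r->head = e->next;
		pthread_mutex_unlock(&r->lock);
		pthread_mutex_unlock(&e->lock);
		pthread_mutex_destroy(&e->lock);
		free(e);
		return;
	}
	pthread_mutex_unlock(&e->lock);
	pthread_mutex_unlock(&r->lock);
}

int main(void)
{
	struct registry r = { PTHREAD_MUTEX_INITIALIZER, NULL };
	struct entry *a = entry_get(&r);	/* creates, use_count = 1 */
	struct entry *b = entry_get(&r);	/* finds it, use_count = 2 */

	entry_put(&r, b);
	entry_put(&r, a);			/* last user: unlinks and frees */
	printf("registry empty: %s\n", r.head == NULL ? "yes" : "no");
	return 0;
}

The visible simplification: cgroup_pidlist_find() returns with l->mutex still write-held so that pidlist_array_load() can install the freshly built array before any reader sees the list, whereas entry_get() drops the per-entry lock before returning.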