X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fexit.c;h=f80dec3f1875db702b7729cdf91e4d2931cfcbbf;hb=f6490438fce5902f840d1f0f905295077c635e7a;hp=9e459fefda77191218466814a9355c85d4b09ea7;hpb=06c93e875747f3020d997220b3e7c98083acc7c3;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/exit.c b/kernel/exit.c index 9e459fe..f80dec3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -13,12 +13,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -38,20 +40,32 @@ #include #include #include -#include #include #include /* for audit_free() */ #include #include #include +#include +#include +#include #include #include #include #include +#include "cred-internals.h" + +DEFINE_TRACE(sched_process_free); +DEFINE_TRACE(sched_process_exit); +DEFINE_TRACE(sched_process_wait); static void exit_mm(struct task_struct * tsk); +static inline int task_detached(struct task_struct *p) +{ + return p->exit_signal == -1; +} + static void __unhash_process(struct task_struct *p) { nr_threads--; @@ -64,7 +78,7 @@ static void __unhash_process(struct task_struct *p) __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); - remove_parent(p); + list_del_init(&p->sibling); } /* @@ -78,7 +92,6 @@ static void __exit_signal(struct task_struct *tsk) BUG_ON(!sig); BUG_ON(!atomic_read(&sig->count)); - rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); @@ -105,51 +118,65 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, tsk->utime); - sig->stime = cputime_add(sig->stime, tsk->stime); - sig->gtime = cputime_add(sig->gtime, tsk->gtime); + sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; + task_io_accounting_add(&sig->ioac, &tsk->ioac); sig = NULL; /* Marker for below. */ } __unhash_process(tsk); + /* + * Do this under ->siglock, we can race with another thread + * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. + */ + flush_sigqueue(&tsk->pending); + tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); - rcu_read_unlock(); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - flush_sigqueue(&tsk->pending); if (sig) { flush_sigqueue(&sig->shared_pending); taskstats_tgid_free(sig); + /* + * Make sure ->signal can't go away under rq->lock, + * see account_group_exec_runtime(). + */ + task_rq_unlock_wait(tsk); __cleanup_signal(sig); } } static void delayed_put_task_struct(struct rcu_head *rhp) { - put_task_struct(container_of(rhp, struct task_struct, rcu)); + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + trace_sched_process_free(tsk); + put_task_struct(tsk); } + void release_task(struct task_struct * p) { struct task_struct *leader; int zap_leader; repeat: - atomic_dec(&p->user->processes); + tracehook_prepare_release_task(p); + /* don't need to get the RCU readlock here - the process is dead and + * can't be modifying its own credentials */ + atomic_dec(&__task_cred(p)->user->processes); + proc_flush_task(p); write_lock_irq(&tasklist_lock); - ptrace_unlink(p); - BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); + tracehook_finish_release_task(p); __exit_signal(p); /* @@ -160,7 +187,7 @@ repeat: zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { - BUG_ON(leader->exit_signal == -1); + BUG_ON(task_detached(leader)); do_notify_parent(leader, leader->exit_signal); /* * If we were the last child thread and the leader has @@ -170,7 +197,14 @@ repeat: * do_notify_parent() will have marked it self-reaping in * that case. */ - zap_leader = (leader->exit_signal == -1); + zap_leader = task_detached(leader); + + /* + * This maintains the invariant that release_task() + * only runs on a task in EXIT_DEAD, just for sanity. + */ + if (zap_leader) + leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); @@ -214,20 +248,19 @@ struct pid *session_of_pgrp(struct pid *pgrp) static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; - int ret = 1; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if (p == ignored_task - || p->exit_state - || is_global_init(p->real_parent)) + if ((p == ignored_task) || + (p->exit_state && thread_group_empty(p)) || + is_global_init(p->real_parent)) continue; + if (task_pgrp(p->real_parent) != pgrp && - task_session(p->real_parent) == task_session(p)) { - ret = 0; - break; - } + task_session(p->real_parent) == task_session(p)) + return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return ret; /* (sighing) "Often!" */ + + return 1; } int is_current_pgrp_orphaned(void) @@ -255,6 +288,37 @@ static int has_stopped_jobs(struct pid *pgrp) return retval; } +/* + * Check to see if any process groups have become orphaned as + * a result of our exiting, and if they have any stopped jobs, + * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ +static void +kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) +{ + struct pid *pgrp = task_pgrp(tsk); + struct task_struct *ignored_task = tsk; + + if (!parent) + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ + parent = tsk->real_parent; + else + /* reparent: our child is in a different pgrp than + * we are, and it was the only connection outside. + */ + ignored_task = NULL; + + if (task_pgrp(parent) != pgrp && + task_session(parent) == task_session(tsk) && + will_become_orphaned_pgrp(pgrp, ignored_task) && + has_stopped_jobs(pgrp)) { + __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); + } +} + /** * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd * @@ -273,9 +337,8 @@ static void reparent_to_kthreadd(void) ptrace_unlink(current); /* Reparent to init */ - remove_parent(current); current->real_parent = current->parent = kthreadd_task; - add_parent(current); + list_move_tail(¤t->sibling, ¤t->real_parent->children); /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; @@ -285,34 +348,33 @@ static void reparent_to_kthreadd(void) /* cpus_allowed? */ /* rt_priority? */ /* signals? */ - security_task_reparent_to_init(current); memcpy(current->signal->rlim, init_task.signal->rlim, sizeof(current->signal->rlim)); - atomic_inc(&(INIT_USER->__count)); + + atomic_inc(&init_cred.usage); + commit_creds(&init_cred); write_unlock_irq(&tasklist_lock); - switch_uid(INIT_USER); } -void __set_special_pids(pid_t session, pid_t pgrp) +void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; + pid_t nr = pid_nr(pid); - if (task_session_nr(curr) != session) { - detach_pid(curr, PIDTYPE_SID); - set_task_session(curr, session); - attach_pid(curr, PIDTYPE_SID, find_pid(session)); + if (task_session(curr) != pid) { + change_pid(curr, PIDTYPE_SID, pid); + set_task_session(curr, nr); } - if (task_pgrp_nr(curr) != pgrp) { - detach_pid(curr, PIDTYPE_PGID); - set_task_pgrp(curr, pgrp); - attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp)); + if (task_pgrp(curr) != pid) { + change_pid(curr, PIDTYPE_PGID, pid); + set_task_pgrp(curr, nr); } } -static void set_special_pids(pid_t session, pid_t pgrp) +static void set_special_pids(struct pid *pid) { write_lock_irq(&tasklist_lock); - __set_special_pids(session, pgrp); + __set_special_pids(pid); write_unlock_irq(&tasklist_lock); } @@ -381,9 +443,13 @@ void daemonize(const char *name, ...) * We don't want to have TIF_FREEZE set if the system-wide hibernation * or suspend transition begins right now. */ - current->flags |= PF_NOFREEZE; + current->flags |= (PF_NOFREEZE | PF_KTHREAD); - set_special_pids(1, 1); + if (current->nsproxy != &init_nsproxy) { + get_nsproxy(&init_nsproxy); + switch_task_namespaces(current, &init_nsproxy); + } + set_special_pids(&init_struct_pid); proc_clear_tty(current); /* Block and flush all signals */ @@ -398,11 +464,6 @@ void daemonize(const char *name, ...) current->fs = fs; atomic_inc(&fs->count); - if (current->nsproxy != init_task.nsproxy) { - get_nsproxy(init_task.nsproxy); - switch_task_namespaces(current, init_task.nsproxy); - } - exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); @@ -458,7 +519,7 @@ struct files_struct *get_files_struct(struct task_struct *task) return files; } -void fastcall put_files_struct(struct files_struct *files) +void put_files_struct(struct files_struct *files) { struct fdtable *fdt; @@ -477,10 +538,9 @@ void fastcall put_files_struct(struct files_struct *files) } } -EXPORT_SYMBOL(put_files_struct); - -void reset_files_struct(struct task_struct *tsk, struct files_struct *files) +void reset_files_struct(struct files_struct *files) { + struct task_struct *tsk = current; struct files_struct *old; old = tsk->files; @@ -489,9 +549,8 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files) task_unlock(tsk); put_files_struct(old); } -EXPORT_SYMBOL(reset_files_struct); -static void __exit_files(struct task_struct *tsk) +void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; @@ -503,33 +562,17 @@ static void __exit_files(struct task_struct *tsk) } } -void exit_files(struct task_struct *tsk) -{ - __exit_files(tsk); -} - -static void __put_fs_struct(struct fs_struct *fs) +void put_fs_struct(struct fs_struct *fs) { /* No need to hold fs->lock if we are killing it */ if (atomic_dec_and_test(&fs->count)) { - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { - dput(fs->altroot); - mntput(fs->altrootmnt); - } + path_put(&fs->root); + path_put(&fs->pwd); kmem_cache_free(fs_cachep, fs); } } -void put_fs_struct(struct fs_struct *fs) -{ - __put_fs_struct(fs); -} - -static void __exit_fs(struct task_struct *tsk) +void exit_fs(struct task_struct *tsk) { struct fs_struct * fs = tsk->fs; @@ -537,16 +580,96 @@ static void __exit_fs(struct task_struct *tsk) task_lock(tsk); tsk->fs = NULL; task_unlock(tsk); - __put_fs_struct(fs); + put_fs_struct(fs); } } -void exit_fs(struct task_struct *tsk) +EXPORT_SYMBOL_GPL(exit_fs); + +#ifdef CONFIG_MM_OWNER +/* + * Task p is exiting and it owned mm, lets find a new owner for it + */ +static inline int +mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) { - __exit_fs(tsk); + /* + * If there are other users of the mm and the owner (us) is exiting + * we need to find a new owner to take on the responsibility. + */ + if (atomic_read(&mm->mm_users) <= 1) + return 0; + if (mm->owner != p) + return 0; + return 1; } -EXPORT_SYMBOL_GPL(exit_fs); +void mm_update_next_owner(struct mm_struct *mm) +{ + struct task_struct *c, *g, *p = current; + +retry: + if (!mm_need_new_owner(mm, p)) + return; + + read_lock(&tasklist_lock); + /* + * Search in the children + */ + list_for_each_entry(c, &p->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search in the siblings + */ + list_for_each_entry(c, &p->parent->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search through everything else. We should not get + * here often + */ + do_each_thread(g, c) { + if (c->mm == mm) + goto assign_new_owner; + } while_each_thread(g, c); + + read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL. + */ + mm->owner = NULL; + return; + +assign_new_owner: + BUG_ON(c == p); + get_task_struct(c); + /* + * The task_lock protects c->mm from changing. + * We always want mm->owner->mm == mm + */ + task_lock(c); + /* + * Delay read_unlock() till we have the task_lock() + * to ensure that c does not slip away underneath us + */ + read_unlock(&tasklist_lock); + if (c->mm != mm) { + task_unlock(c); + put_task_struct(c); + goto retry; + } + mm->owner = c; + task_unlock(c); + put_task_struct(c); +} +#endif /* CONFIG_MM_OWNER */ /* * Turn us into a lazy TLB process if we @@ -555,26 +678,40 @@ EXPORT_SYMBOL_GPL(exit_fs); static void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; + struct core_state *core_state; mm_release(tsk, mm); if (!mm) return; /* * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_waiters + * We must hold mmap_sem around checking core_state * and clearing tsk->mm. The core-inducing thread - * will increment core_waiters for each thread in the + * will increment ->nr_threads for each thread in the * group with ->mm != NULL. */ down_read(&mm->mmap_sem); - if (mm->core_waiters) { + core_state = mm->core_state; + if (core_state) { + struct core_thread self; up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - if (!--mm->core_waiters) - complete(mm->core_startup_done); - up_write(&mm->mmap_sem); - wait_for_completion(&mm->core_done); + self.task = tsk; + self.next = xchg(&core_state->dumper.next, &self); + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. + */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); + + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_task_state(tsk, TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); @@ -587,74 +724,121 @@ static void exit_mm(struct task_struct * tsk) /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); task_unlock(tsk); + mm_update_next_owner(mm); mmput(mm); } -static void -reparent_thread(struct task_struct *p, struct task_struct *father, int traced) +/* + * Return nonzero if @parent's children should reap themselves. + * + * Called with write_lock_irq(&tasklist_lock) held. + */ +static int ignoring_children(struct task_struct *parent) { - if (p->pdeath_signal) - /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + int ret; + struct sighand_struct *psig = parent->sighand; + unsigned long flags; + spin_lock_irqsave(&psig->siglock, flags); + ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || + (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT)); + spin_unlock_irqrestore(&psig->siglock, flags); + return ret; +} - /* Move the child from its dying parent to the new one. */ - if (unlikely(traced)) { - /* Preserve ptrace links if someone else is tracing this child. */ - list_del_init(&p->ptrace_list); - if (p->parent != p->real_parent) - list_add(&p->ptrace_list, &p->real_parent->ptrace_children); - } else { - /* If this child is being traced, then we're the one tracing it - * anyway, so let go of it. +/* + * Detach all tasks we were using ptrace on. + * Any that need to be release_task'd are put on the @dead list. + * + * Called with write_lock(&tasklist_lock) held. + */ +static void ptrace_exit(struct task_struct *parent, struct list_head *dead) +{ + struct task_struct *p, *n; + int ign = -1; + + list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { + __ptrace_unlink(p); + + if (p->exit_state != EXIT_ZOMBIE) + continue; + + /* + * If it's a zombie, our attachedness prevented normal + * parent notification or self-reaping. Do notification + * now if it would have happened earlier. If it should + * reap itself, add it to the @dead list. We can't call + * release_task() here because we already hold tasklist_lock. + * + * If it's our own child, there is no notification to do. + * But if our normal children self-reap, then this child + * was prevented by ptrace and we must reap it now. */ - p->ptrace = 0; - remove_parent(p); - p->parent = p->real_parent; - add_parent(p); + if (!task_detached(p) && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, parent)) + do_notify_parent(p, p->exit_signal); + else { + if (ign < 0) + ign = ignoring_children(parent); + if (ign) + p->exit_signal = -1; + } + } - if (task_is_traced(p)) { + if (task_detached(p)) { /* - * If it was at a trace stop, turn it into - * a normal stop since it's no longer being - * traced. + * Mark it as in the process of being reaped. */ - ptrace_untrace(p); + p->exit_state = EXIT_DEAD; + list_add(&p->ptrace_entry, dead); } } +} + +/* + * Finish up exit-time ptrace cleanup. + * + * Called without locks. + */ +static void ptrace_exit_finish(struct task_struct *parent, + struct list_head *dead) +{ + struct task_struct *p, *n; + + BUG_ON(!list_empty(&parent->ptraced)); + + list_for_each_entry_safe(p, n, dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } +} + +static void reparent_thread(struct task_struct *p, struct task_struct *father) +{ + if (p->pdeath_signal) + /* We already hold the tasklist_lock here. */ + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + + list_move_tail(&p->sibling, &p->real_parent->children); /* If this is a threaded reparent there is no need to * notify anyone anything has happened. */ - if (p->real_parent->group_leader == father->group_leader) + if (same_thread_group(p->real_parent, father)) return; /* We don't want people slaying init. */ - if (p->exit_signal != -1) + if (!task_detached(p)) p->exit_signal = SIGCHLD; /* If we'd notified the old parent about this child's death, * also notify the new parent. */ - if (!traced && p->exit_state == EXIT_ZOMBIE && - p->exit_signal != -1 && thread_group_empty(p)) + if (!ptrace_reparented(p) && + p->exit_state == EXIT_ZOMBIE && + !task_detached(p) && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); - /* - * process group orphan check - * Case ii: Our child is in a different pgrp - * than we are, and it was the only connection - * outside, so the child pgrp is now orphaned. - */ - if ((task_pgrp(p) != task_pgrp(father)) && - (task_session(p) == task_session(father))) { - struct pid *pgrp = task_pgrp(p); - - if (will_become_orphaned_pgrp(pgrp, NULL) && - has_stopped_jobs(pgrp)) { - __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); - } - } + kill_orphaned_pgrp(p, father); } /* @@ -664,104 +848,73 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * the child reaper process (ie "init") in our pid * space. */ -static void forget_original_parent(struct task_struct *father) +static struct task_struct *find_new_reaper(struct task_struct *father) { - struct task_struct *p, *n, *reaper = father; - struct list_head ptrace_dead; - - INIT_LIST_HEAD(&ptrace_dead); - - write_lock_irq(&tasklist_lock); - - do { - reaper = next_thread(reaper); - if (reaper == father) { - reaper = task_child_reaper(father); - break; - } - } while (reaper->flags & PF_EXITING); + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *thread; - /* - * There are only two places where our children can be: - * - * - in our child list - * - in our ptraced child list - * - * Search them and reparent children. - */ - list_for_each_entry_safe(p, n, &father->children, sibling) { - int ptrace; - - ptrace = p->ptrace; - - /* if father isn't the real parent, then ptrace must be enabled */ - BUG_ON(father != p->real_parent && !ptrace); + thread = father; + while_each_thread(father, thread) { + if (thread->flags & PF_EXITING) + continue; + if (unlikely(pid_ns->child_reaper == father)) + pid_ns->child_reaper = thread; + return thread; + } - if (father == p->real_parent) { - /* reparent with a reaper, real father it's us */ - p->real_parent = reaper; - reparent_thread(p, father, 0); - } else { - /* reparent ptraced task to its real parent */ - __ptrace_unlink (p); - if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && - thread_group_empty(p)) - do_notify_parent(p, p->exit_signal); - } + if (unlikely(pid_ns->child_reaper == father)) { + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) + panic("Attempted to kill init!"); + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); /* - * if the ptraced child is a zombie with exit_signal == -1 - * we must collect it before we exit, or it will remain - * zombie forever since we prevented it from self-reap itself - * while it was being traced by us, to be able to see it in wait4. + * We can not clear ->child_reaper or leave it alone. + * There may by stealth EXIT_DEAD tasks on ->children, + * forget_original_parent() must move them somewhere. */ - if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) - list_add(&p->ptrace_list, &ptrace_dead); + pid_ns->child_reaper = init_pid_ns.child_reaper; } - list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) { + return pid_ns->child_reaper; +} + +static void forget_original_parent(struct task_struct *father) +{ + struct task_struct *p, *n, *reaper; + LIST_HEAD(ptrace_dead); + + write_lock_irq(&tasklist_lock); + reaper = find_new_reaper(father); + /* + * First clean up ptrace if we were using it. + */ + ptrace_exit(father, &ptrace_dead); + + list_for_each_entry_safe(p, n, &father->children, sibling) { p->real_parent = reaper; - reparent_thread(p, father, 1); + if (p->parent == father) { + BUG_ON(p->ptrace); + p->parent = p->real_parent; + } + reparent_thread(p, father); } write_unlock_irq(&tasklist_lock); BUG_ON(!list_empty(&father->children)); - BUG_ON(!list_empty(&father->ptrace_children)); - - list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) { - list_del_init(&p->ptrace_list); - release_task(p); - } + ptrace_exit_finish(father, &ptrace_dead); } /* * Send signals to all our closest relatives so that they know * to properly mourn us.. */ -static void exit_notify(struct task_struct *tsk) +static void exit_notify(struct task_struct *tsk, int group_dead) { - int state; - struct task_struct *t; - struct pid *pgrp; - - if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) - && !thread_group_empty(tsk)) { - /* - * This occurs when there was a race between our exit - * syscall and a group signal choosing us as the one to - * wake up. It could be that we are the only thread - * alerted to check for pending signals, but another thread - * should be woken now to take the signal since we will not. - * Now we'll wake all the threads in the group just to make - * sure someone gets all the pending signals. - */ - spin_lock_irq(&tsk->sighand->siglock); - for (t = next_thread(tsk); t != tsk; t = next_thread(t)) - if (!signal_pending(t) && !(t->flags & PF_EXITING)) - recalc_sigpending_and_wake(t); - spin_unlock_irq(&tsk->sighand->siglock); - } + int signal; + void *cookie; /* * This does two things: @@ -775,25 +928,8 @@ static void exit_notify(struct task_struct *tsk) exit_task_namespaces(tsk); write_lock_irq(&tasklist_lock); - /* - * Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - * - * Case i: Our father is in a different pgrp than we are - * and we were the only connection outside, so our pgrp - * is about to become orphaned. - */ - t = tsk->real_parent; - - pgrp = task_pgrp(tsk); - if ((task_pgrp(t) != pgrp) && - (task_session(t) == task_session(tsk)) && - will_become_orphaned_pgrp(pgrp, tsk) && - has_stopped_jobs(pgrp)) { - __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); - } + if (group_dead) + kill_orphaned_pgrp(tsk->group_leader, NULL); /* Let father know we died * @@ -809,38 +945,30 @@ static void exit_notify(struct task_struct *tsk) * we have changed execution domain as these two values started * the same after a fork. */ - if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && - ( tsk->parent_exec_id != t->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id) - && !capable(CAP_KILL)) + if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && + (tsk->parent_exec_id != tsk->real_parent->self_exec_id || + tsk->self_exec_id != tsk->parent_exec_id) && + !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; + signal = tracehook_notify_death(tsk, &cookie, group_dead); + if (signal >= 0) + signal = do_notify_parent(tsk, signal); - /* If something other than our normal parent is ptracing us, then - * send it a SIGCHLD instead of honoring exit_signal. exit_signal - * only has special meaning to our real parent. - */ - if (tsk->exit_signal != -1 && thread_group_empty(tsk)) { - int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD; - do_notify_parent(tsk, signal); - } else if (tsk->ptrace) { - do_notify_parent(tsk, SIGCHLD); - } - - state = EXIT_ZOMBIE; - if (tsk->exit_signal == -1 && likely(!tsk->ptrace)) - state = EXIT_DEAD; - tsk->exit_state = state; + tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; + /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && - tsk->signal->notify_count < 0 && - tsk->signal->group_exit_task) + tsk->signal->group_exit_task && + tsk->signal->notify_count < 0) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); + tracehook_report_death(tsk, signal, cookie, group_dead); + /* If the process is dead, release it - nobody will wait for it */ - if (state == EXIT_DEAD) + if (signal == DEATH_REAP) release_task(tsk); } @@ -872,40 +1000,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -static inline void exit_child_reaper(struct task_struct *tsk) -{ - if (likely(tsk->group_leader != task_child_reaper(tsk))) - return; - - if (tsk->nsproxy->pid_ns == &init_pid_ns) - panic("Attempted to kill init!"); - - /* - * @tsk is the last thread in the 'cgroup-init' and is exiting. - * Terminate all remaining processes in the namespace and reap them - * before exiting @tsk. - * - * Note that @tsk (last thread of cgroup-init) may not necessarily - * be the child-reaper (i.e main thread of cgroup-init) of the - * namespace i.e the child_reaper may have already exited. - * - * Even after a child_reaper exits, we let it inherit orphaned children, - * because, pid_ns->child_reaper remains valid as long as there is - * at least one living sub-thread in the cgroup init. - - * This living sub-thread of the cgroup-init will be notified when - * a child inherited by the 'child-reaper' exits (do_notify_parent() - * uses __group_send_sig_info()). Further, when reaping child processes, - * do_wait() iterates over children of all living sub threads. - - * i.e even though 'child_reaper' thread is listed as the parent of the - * orphaned children, any living sub-thread in the cgroup-init can - * perform the role of the child_reaper. - */ - zap_pid_ns_processes(tsk->nsproxy->pid_ns); -} - -fastcall NORET_TYPE void do_exit(long code) +NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -919,10 +1014,7 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(current->ptrace & PT_TRACE_EXIT)) { - current->ptrace_message = code; - ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); - } + tracehook_report_exit(&code); /* * We're taking recursive faults here in do_exit. Safest is to just @@ -941,13 +1033,11 @@ fastcall NORET_TYPE void do_exit(long code) * task into the wait for ever nirwana as well. */ tsk->flags |= PF_EXITPIDONE; - if (tsk->io_context) - exit_io_context(); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } - tsk->flags |= PF_EXITING; + exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against * an exiting task cleaning up the robust pi futexes. @@ -961,25 +1051,13 @@ fastcall NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - if (tsk->mm) { - update_hiwater_rss(tsk->mm); - update_hiwater_vm(tsk->mm); - } + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { - exit_child_reaper(tsk); hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); } acct_collect(code, group_dead); -#ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) - exit_robust_list(tsk); -#ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) - compat_exit_robust_list(tsk); -#endif -#endif if (group_dead) tty_audit_exit(); if (unlikely(tsk->audit_context)) @@ -992,13 +1070,14 @@ fastcall NORET_TYPE void do_exit(long code) if (group_dead) acct_process(); + trace_sched_process_exit(tsk); + exit_sem(tsk); - __exit_files(tsk); - __exit_fs(tsk); + exit_files(tsk); + exit_fs(tsk); check_stack_usage(); exit_thread(); cgroup_exit(tsk, 1); - exit_keys(tsk); if (group_dead && tsk->signal->leader) disassociate_ctty(1); @@ -1008,9 +1087,9 @@ fastcall NORET_TYPE void do_exit(long code) module_put(tsk->binfmt->module); proc_exit_connector(tsk); - exit_notify(tsk); + exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA - mpol_free(tsk->mempolicy); + mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif #ifdef CONFIG_FUTEX @@ -1043,7 +1122,6 @@ fastcall NORET_TYPE void do_exit(long code) preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; - schedule(); BUG(); /* Avoid "noreturn function does return". */ @@ -1063,7 +1141,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code) EXPORT_SYMBOL(complete_and_exit); -asmlinkage long sys_exit(int error_code) +SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } @@ -1075,19 +1153,21 @@ asmlinkage long sys_exit(int error_code) NORET_TYPE void do_group_exit(int exit_code) { + struct signal_struct *sig = current->signal; + BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - if (current->signal->flags & SIGNAL_GROUP_EXIT) - exit_code = current->signal->group_exit_code; + if (signal_group_exit(sig)) + exit_code = sig->group_exit_code; else if (!thread_group_empty(current)) { - struct signal_struct *const sig = current->signal; struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); - if (sig->flags & SIGNAL_GROUP_EXIT) + if (signal_group_exit(sig)) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else { sig->group_exit_code = exit_code; + sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); @@ -1102,35 +1182,33 @@ do_group_exit(int exit_code) * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ -asmlinkage void sys_exit_group(int error_code) +SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); + /* NOTREACHED */ + return 0; +} + +static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +{ + struct pid *pid = NULL; + if (type == PIDTYPE_PID) + pid = task->pids[type].pid; + else if (type < PIDTYPE_MAX) + pid = task->group_leader->pids[type].pid; + return pid; } -static int eligible_child(pid_t pid, int options, struct task_struct *p) +static int eligible_child(enum pid_type type, struct pid *pid, int options, + struct task_struct *p) { int err; - struct pid_namespace *ns; - ns = current->nsproxy->pid_ns; - if (pid > 0) { - if (task_pid_nr_ns(p, ns) != pid) - return 0; - } else if (!pid) { - if (task_pgrp_nr_ns(p, ns) != task_pgrp_vnr(current)) - return 0; - } else if (pid != -1) { - if (task_pgrp_nr_ns(p, ns) != -pid) + if (type < PIDTYPE_MAX) { + if (task_pid_type(p, type) != pid) return 0; } - /* - * Do not consider detached threads that are - * not ptraced: - */ - if (p->exit_signal == -1 && !p->ptrace) - return 0; - /* Wait for all children (clone and not) if __WALL is set; * otherwise, wait for clone children *only* if __WCLONE is * set; otherwise, wait for non-clone children *only*. (Note: @@ -1139,12 +1217,6 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p) if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) && !(options & __WALL)) return 0; - /* - * Do not consider thread group leaders that are - * in a non-empty thread group: - */ - if (delay_group_leader(p)) - return 2; err = security_task_wait(p); if (err) @@ -1184,26 +1256,22 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_zombie(struct task_struct *p, int noreap, +static int wait_task_zombie(struct task_struct *p, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { unsigned long state; int retval, status, traced; - struct pid_namespace *ns; + pid_t pid = task_pid_vnr(p); + uid_t uid = __task_cred(p)->uid; - ns = current->nsproxy->pid_ns; + if (!likely(options & WEXITED)) + return 0; - if (unlikely(noreap)) { - pid_t pid = task_pid_nr_ns(p, ns); - uid_t uid = p->uid; + if (unlikely(options & WNOWAIT)) { int exit_code = p->exit_code; int why, status; - if (unlikely(p->exit_state != EXIT_ZOMBIE)) - return 0; - if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) - return 0; get_task_struct(p); read_unlock(&tasklist_lock); if ((exit_code & 0x7f) == 0) { @@ -1227,12 +1295,12 @@ static int wait_task_zombie(struct task_struct *p, int noreap, return 0; } - /* traced means p->ptrace, but not vice versa */ - traced = (p->real_parent != p->parent); + traced = ptrace_reparented(p); if (likely(!traced)) { struct signal_struct *psig; struct signal_struct *sig; + struct task_cputime cputime; /* * The resource counters for the group leader are in its @@ -1248,20 +1316,23 @@ static int wait_task_zombie(struct task_struct *p, int noreap, * need to protect the access to p->parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. + * + * We use thread_group_cputime() to get times for the thread + * group, which consolidates times for all threads in the + * group including the group leader. */ + thread_group_cputime(p, &cputime); spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; psig->cutime = cputime_add(psig->cutime, - cputime_add(p->utime, - cputime_add(sig->utime, - sig->cutime))); + cputime_add(cputime.utime, + sig->cutime)); psig->cstime = cputime_add(psig->cstime, - cputime_add(p->stime, - cputime_add(sig->stime, - sig->cstime))); + cputime_add(cputime.stime, + sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, @@ -1281,6 +1352,8 @@ static int wait_task_zombie(struct task_struct *p, int noreap, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; + task_io_accounting_add(&psig->ioac, &p->ioac); + task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->parent->sighand->siglock); } @@ -1314,11 +1387,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap, retval = put_user(status, &infop->si_status); } if (!retval && infop) - retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid); + retval = put_user(pid, &infop->si_pid); if (!retval && infop) - retval = put_user(p->uid, &infop->si_uid); + retval = put_user(uid, &infop->si_uid); if (!retval) - retval = task_pid_nr_ns(p, ns); + retval = pid; if (traced) { write_lock_irq(&tasklist_lock); @@ -1329,9 +1402,9 @@ static int wait_task_zombie(struct task_struct *p, int noreap, * If it's still not detached after that, don't release * it now. */ - if (p->exit_signal != -1) { + if (!task_detached(p)) { do_notify_parent(p, p->exit_signal); - if (p->exit_signal != -1) { + if (!task_detached(p)) { p->exit_state = EXIT_ZOMBIE; p = NULL; } @@ -1350,21 +1423,42 @@ static int wait_task_zombie(struct task_struct *p, int noreap, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, - int noreap, struct siginfo __user *infop, +static int wait_task_stopped(int ptrace, struct task_struct *p, + int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { - int retval, exit_code; + int retval, exit_code, why; + uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; - if (!p->exit_code) + if (!(options & WUNTRACED)) return 0; - if (delayed_group_leader && !(p->ptrace & PT_PTRACED) && - p->signal->group_stop_count > 0) + + exit_code = 0; + spin_lock_irq(&p->sighand->siglock); + + if (unlikely(!task_is_stopped_or_traced(p))) + goto unlock_sig; + + if (!ptrace && p->signal->group_stop_count > 0) /* * A group stop is in progress and this is the group leader. * We won't report until all threads have stopped. */ + goto unlock_sig; + + exit_code = p->exit_code; + if (!exit_code) + goto unlock_sig; + + if (!unlikely(options & WNOWAIT)) + p->exit_code = 0; + + /* don't need the RCU readlock here as we're holding a spinlock */ + uid = __task_cred(p)->uid; +unlock_sig: + spin_unlock_irq(&p->sighand->siglock); + if (!exit_code) return 0; /* @@ -1374,65 +1468,15 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ - pid = task_pid_nr_ns(p, current->nsproxy->pid_ns); get_task_struct(p); + pid = task_pid_vnr(p); + why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); - if (unlikely(noreap)) { - uid_t uid = p->uid; - int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; - - exit_code = p->exit_code; - if (unlikely(!exit_code) || unlikely(p->exit_state)) - goto bail_ref; + if (unlikely(options & WNOWAIT)) return wait_noreap_copyout(p, pid, uid, why, exit_code, infop, ru); - } - - write_lock_irq(&tasklist_lock); - - /* - * This uses xchg to be atomic with the thread resuming and setting - * it. It must also be done with the write lock held to prevent a - * race with the EXIT_ZOMBIE case. - */ - exit_code = xchg(&p->exit_code, 0); - if (unlikely(p->exit_state)) { - /* - * The task resumed and then died. Let the next iteration - * catch it in EXIT_ZOMBIE. Note that exit_code might - * already be zero here if it resumed and did _exit(0). - * The task itself is dead and won't touch exit_code again; - * other processors in this function are locked out. - */ - p->exit_code = exit_code; - exit_code = 0; - } - if (unlikely(exit_code == 0)) { - /* - * Another thread in this function got to it first, or it - * resumed, or it resumed and then died. - */ - write_unlock_irq(&tasklist_lock); -bail_ref: - put_task_struct(p); - /* - * We are returning to the wait loop without having successfully - * removed the process and having released the lock. We cannot - * continue, since the "p" task pointer is potentially stale. - * - * Return -EAGAIN, and do_wait() will restart the loop from the - * beginning. Do _not_ re-acquire the lock. - */ - return -EAGAIN; - } - - /* move to end of parent's list to avoid starvation */ - remove_parent(p); - add_parent(p); - - write_unlock_irq(&tasklist_lock); retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; if (!retval && stat_addr) @@ -1442,15 +1486,13 @@ bail_ref: if (!retval && infop) retval = put_user(0, &infop->si_errno); if (!retval && infop) - retval = put_user((short)((p->ptrace & PT_PTRACED) - ? CLD_TRAPPED : CLD_STOPPED), - &infop->si_code); + retval = put_user((short)why, &infop->si_code); if (!retval && infop) retval = put_user(exit_code, &infop->si_status); if (!retval && infop) retval = put_user(pid, &infop->si_pid); if (!retval && infop) - retval = put_user(p->uid, &infop->si_uid); + retval = put_user(uid, &infop->si_uid); if (!retval) retval = pid; put_task_struct(p); @@ -1465,14 +1507,16 @@ bail_ref: * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_continued(struct task_struct *p, int noreap, +static int wait_task_continued(struct task_struct *p, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { int retval; pid_t pid; uid_t uid; - struct pid_namespace *ns; + + if (!unlikely(options & WCONTINUED)) + return 0; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; @@ -1483,13 +1527,12 @@ static int wait_task_continued(struct task_struct *p, int noreap, spin_unlock_irq(&p->sighand->siglock); return 0; } - if (!noreap) + if (!unlikely(options & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; + uid = __task_cred(p)->uid; spin_unlock_irq(&p->sighand->siglock); - ns = current->nsproxy->pid_ns; - pid = task_pid_nr_ns(p, ns); - uid = p->uid; + pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); @@ -1499,7 +1542,7 @@ static int wait_task_continued(struct task_struct *p, int noreap, if (!retval && stat_addr) retval = put_user(0xffff, stat_addr); if (!retval) - retval = task_pid_nr_ns(p, ns); + retval = pid; } else { retval = wait_noreap_copyout(p, pid, uid, CLD_CONTINUED, SIGCONT, @@ -1510,152 +1553,184 @@ static int wait_task_continued(struct task_struct *p, int noreap, return retval; } - -static inline int my_ptrace_child(struct task_struct *p) +/* + * Consider @p for a wait by @parent. + * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; + * then *@notask_error is 0 if @p is an eligible child, + * or another error from security_task_wait(), or still -ECHILD. + */ +static int wait_consider_task(struct task_struct *parent, int ptrace, + struct task_struct *p, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) { - if (!(p->ptrace & PT_PTRACED)) + int ret = eligible_child(type, pid, options, p); + if (!ret) + return ret; + + if (unlikely(ret < 0)) { + /* + * If we have not yet seen any eligible child, + * then let this error code replace -ECHILD. + * A permission error will give the user a clue + * to look for security policy problems, rather + * than for mysterious wait bugs. + */ + if (*notask_error) + *notask_error = ret; + } + + if (likely(!ptrace) && unlikely(p->ptrace)) { + /* + * This child is hidden by ptrace. + * We aren't allowed to see it now, but eventually we will. + */ + *notask_error = 0; + return 0; + } + + if (p->exit_state == EXIT_DEAD) return 0; - if (!(p->ptrace & PT_ATTACHED)) - return 1; + + /* + * We don't reap group leaders with subthreads. + */ + if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) + return wait_task_zombie(p, options, infop, stat_addr, ru); + + /* + * It's stopped or running now, so it might + * later continue, exit, or stop again. + */ + *notask_error = 0; + + if (task_is_stopped_or_traced(p)) + return wait_task_stopped(ptrace, p, options, + infop, stat_addr, ru); + + return wait_task_continued(p, options, infop, stat_addr, ru); +} + +/* + * Do the work of do_wait() for one thread in the group, @tsk. + * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; then + * *@notask_error is 0 if there were any eligible children, + * or another error from security_task_wait(), or still -ECHILD. + */ +static int do_wait_thread(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + + list_for_each_entry(p, &tsk->children, sibling) { + /* + * Do not consider detached threads. + */ + if (!task_detached(p)) { + int ret = wait_consider_task(tsk, 0, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } + } + + return 0; +} + +static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + /* - * This child was PTRACE_ATTACH'd. We should be seeing it only if - * we are the attacher. If we are the real parent, this is a race - * inside ptrace_attach. It is waiting for the tasklist_lock, - * which we have to switch the parent links, but has already set - * the flags in p->ptrace. + * Traditionally we see ptrace'd stopped tasks regardless of options. */ - return (p->parent != p->real_parent); + options |= WUNTRACED; + + list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { + int ret = wait_consider_task(tsk, 1, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } + + return 0; } -static long do_wait(pid_t pid, int options, struct siginfo __user *infop, - int __user *stat_addr, struct rusage __user *ru) +static long do_wait(enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) { DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; - int flag, retval; - int allowed, denied; + int retval; + + trace_sched_process_wait(pid); add_wait_queue(¤t->signal->wait_chldexit,&wait); repeat: /* - * We will set this flag if we see any child that might later + * If there is nothing that can match our critiera just get out. + * We will clear @retval to zero if we see any child that might later * match our criteria, even if we are not able to reap it yet. */ - flag = 0; - allowed = denied = 0; + retval = -ECHILD; + if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) + goto end; + current->state = TASK_INTERRUPTIBLE; read_lock(&tasklist_lock); tsk = current; do { - struct task_struct *p; - int ret; - - list_for_each_entry(p, &tsk->children, sibling) { - ret = eligible_child(pid, options, p); - if (!ret) - continue; - - if (unlikely(ret < 0)) { - denied = ret; - continue; - } - allowed = 1; - - if (task_is_stopped_or_traced(p)) { - /* - * It's stopped now, so it might later - * continue, exit, or stop again. - * - * When we hit the race with PTRACE_ATTACH, we - * will not report this child. But the race - * means it has not yet been moved to our - * ptrace_children list, so we need to set the - * flag here to avoid a spurious ECHILD when - * the race happens with the only child. - */ - flag = 1; - - if (!my_ptrace_child(p)) { - if (task_is_traced(p)) - continue; - if (!(options & WUNTRACED)) - continue; - } - - retval = wait_task_stopped(p, ret == 2, - (options & WNOWAIT), infop, - stat_addr, ru); - if (retval == -EAGAIN) - goto repeat; - if (retval != 0) /* He released the lock. */ - goto end; - } else if (p->exit_state == EXIT_DEAD) { - continue; - } else if (p->exit_state == EXIT_ZOMBIE) { - /* - * Eligible but we cannot release it yet: - */ - if (ret == 2) - goto check_continued; - if (!likely(options & WEXITED)) - continue; - retval = wait_task_zombie(p, - (options & WNOWAIT), infop, - stat_addr, ru); - /* He released the lock. */ - if (retval != 0) - goto end; - } else { -check_continued: - /* - * It's running now, so it might later - * exit, stop, or stop and then continue. - */ - flag = 1; - if (!unlikely(options & WCONTINUED)) - continue; - retval = wait_task_continued(p, - (options & WNOWAIT), infop, - stat_addr, ru); - if (retval != 0) /* He released the lock. */ - goto end; - } - } - if (!flag) { - list_for_each_entry(p, &tsk->ptrace_children, - ptrace_list) { - if (!eligible_child(pid, options, p)) - continue; - flag = 1; - break; - } + int tsk_result = do_wait_thread(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (!tsk_result) + tsk_result = ptrace_do_wait(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (tsk_result) { + /* + * tasklist_lock is unlocked and we have a final result. + */ + retval = tsk_result; + goto end; } + if (options & __WNOTHREAD) break; tsk = next_thread(tsk); BUG_ON(tsk->signal != current->signal); } while (tsk != current); - read_unlock(&tasklist_lock); - if (flag) { - retval = 0; - if (options & WNOHANG) - goto end; + + if (!retval && !(options & WNOHANG)) { retval = -ERESTARTSYS; - if (signal_pending(current)) - goto end; - schedule(); - goto repeat; + if (!signal_pending(current)) { + schedule(); + goto repeat; + } } - retval = -ECHILD; - if (unlikely(denied) && !allowed) - retval = denied; + end: current->state = TASK_RUNNING; remove_wait_queue(¤t->signal->wait_chldexit,&wait); if (infop) { if (retval > 0) - retval = 0; + retval = 0; else { /* * For a WNOHANG return, clear out all the fields @@ -1679,10 +1754,11 @@ end: return retval; } -asmlinkage long sys_waitid(int which, pid_t pid, - struct siginfo __user *infop, int options, - struct rusage __user *ru) +SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, + infop, int, options, struct rusage __user *, ru) { + struct pid *pid = NULL; + enum pid_type type; long ret; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) @@ -1692,40 +1768,61 @@ asmlinkage long sys_waitid(int which, pid_t pid, switch (which) { case P_ALL: - pid = -1; + type = PIDTYPE_MAX; break; case P_PID: - if (pid <= 0) + type = PIDTYPE_PID; + if (upid <= 0) return -EINVAL; break; case P_PGID: - if (pid <= 0) + type = PIDTYPE_PGID; + if (upid <= 0) return -EINVAL; - pid = -pid; break; default: return -EINVAL; } - ret = do_wait(pid, options, infop, NULL, ru); + if (type < PIDTYPE_MAX) + pid = find_get_pid(upid); + ret = do_wait(type, pid, options, infop, NULL, ru); + put_pid(pid); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(5, ret, which, upid, infop, options, ru); return ret; } -asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, - int options, struct rusage __user *ru) +SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, + int, options, struct rusage __user *, ru) { + struct pid *pid = NULL; + enum pid_type type; long ret; if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; - ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru); + + if (upid == -1) + type = PIDTYPE_MAX; + else if (upid < 0) { + type = PIDTYPE_PGID; + pid = find_get_pid(-upid); + } else if (upid == 0) { + type = PIDTYPE_PGID; + pid = get_pid(task_pgrp(current)); + } else /* upid > 0 */ { + type = PIDTYPE_PID; + pid = find_get_pid(upid); + } + + ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru); + put_pid(pid); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(4, ret, upid, stat_addr, options, ru); return ret; } @@ -1735,7 +1832,7 @@ asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ -asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options) +SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return sys_wait4(pid, stat_addr, options, NULL); }