X-Git-Url: http://ftp.safe.ca/?p=safe%2Fjmp%2Flinux-2.6;a=blobdiff_plain;f=kernel%2Fexit.c;h=abf9cf3b95c609f12ccb0c6992cc3a8221bdcf8b;hp=8f6185e69b691265746f81a1ca4ce9de9fd8a640;hb=9d2a3f31d6d7832cd441eeda08bc2266cdd5d972;hpb=da7978b0348d497688541e2d2f5739aa2a2c334f diff --git a/kernel/exit.c b/kernel/exit.c index 8f6185e..abf9cf3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -39,24 +40,27 @@ #include #include #include -#include #include #include /* for audit_free() */ #include #include #include +#include +#include +#include +#include #include #include #include #include +#include "cred-internals.h" -static void exit_mm(struct task_struct * tsk); +DEFINE_TRACE(sched_process_free); +DEFINE_TRACE(sched_process_exit); +DEFINE_TRACE(sched_process_wait); -static inline int task_detached(struct task_struct *p) -{ - return p->exit_signal == -1; -} +static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) { @@ -70,7 +74,7 @@ static void __unhash_process(struct task_struct *p) __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); - remove_parent(p); + list_del_init(&p->sibling); } /* @@ -84,7 +88,6 @@ static void __exit_signal(struct task_struct *tsk) BUG_ON(!sig); BUG_ON(!atomic_read(&sig->count)); - rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); @@ -111,15 +114,16 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, tsk->utime); - sig->stime = cputime_add(sig->stime, tsk->stime); - sig->gtime = cputime_add(sig->gtime, tsk->gtime); + sig->utime = cputime_add(sig->utime, task_utime(tsk)); + sig->stime = cputime_add(sig->stime, task_stime(tsk)); + sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -135,32 +139,43 @@ static void __exit_signal(struct task_struct *tsk) tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); - rcu_read_unlock(); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); if (sig) { flush_sigqueue(&sig->shared_pending); taskstats_tgid_free(sig); + /* + * Make sure ->signal can't go away under rq->lock, + * see account_group_exec_runtime(). + */ + task_rq_unlock_wait(tsk); __cleanup_signal(sig); } } static void delayed_put_task_struct(struct rcu_head *rhp) { - put_task_struct(container_of(rhp, struct task_struct, rcu)); + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + trace_sched_process_free(tsk); + put_task_struct(tsk); } + void release_task(struct task_struct * p) { struct task_struct *leader; int zap_leader; repeat: - atomic_dec(&p->user->processes); + tracehook_prepare_release_task(p); + /* don't need to get the RCU readlock here - the process is dead and + * can't be modifying its own credentials */ + atomic_dec(&__task_cred(p)->user->processes); + proc_flush_task(p); write_lock_irq(&tasklist_lock); - ptrace_unlink(p); - BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); + tracehook_finish_release_task(p); __exit_signal(p); /* @@ -182,6 +197,13 @@ repeat: * that case. */ zap_leader = task_detached(leader); + + /* + * This maintains the invariant that release_task() + * only runs on a task in EXIT_DEAD, just for sanity. + */ + if (zap_leader) + leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); @@ -314,9 +336,8 @@ static void reparent_to_kthreadd(void) ptrace_unlink(current); /* Reparent to init */ - remove_parent(current); current->real_parent = current->parent = kthreadd_task; - add_parent(current); + list_move_tail(¤t->sibling, ¤t->real_parent->children); /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; @@ -326,27 +347,23 @@ static void reparent_to_kthreadd(void) /* cpus_allowed? */ /* rt_priority? */ /* signals? */ - security_task_reparent_to_init(current); memcpy(current->signal->rlim, init_task.signal->rlim, sizeof(current->signal->rlim)); - atomic_inc(&(INIT_USER->__count)); + + atomic_inc(&init_cred.usage); + commit_creds(&init_cred); write_unlock_irq(&tasklist_lock); - switch_uid(INIT_USER); } void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - pid_t nr = pid_nr(pid); - if (task_session(curr) != pid) { + if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); - set_task_session(curr, nr); - } - if (task_pgrp(curr) != pid) { + + if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); - set_task_pgrp(curr, nr); - } } static void set_special_pids(struct pid *pid) @@ -404,7 +421,6 @@ EXPORT_SYMBOL(disallow_signal); void daemonize(const char *name, ...) { va_list args; - struct fs_struct *fs; sigset_t blocked; va_start(args, name); @@ -421,7 +437,7 @@ void daemonize(const char *name, ...) * We don't want to have TIF_FREEZE set if the system-wide hibernation * or suspend transition begins right now. */ - current->flags |= PF_NOFREEZE; + current->flags |= (PF_NOFREEZE | PF_KTHREAD); if (current->nsproxy != &init_nsproxy) { get_nsproxy(&init_nsproxy); @@ -437,11 +453,7 @@ void daemonize(const char *name, ...) /* Become as one with the init task */ - exit_fs(current); /* current->fs->count--; */ - fs = init_task.fs; - current->fs = fs; - atomic_inc(&fs->count); - + daemonize_fs_struct(); exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); @@ -540,32 +552,6 @@ void exit_files(struct task_struct *tsk) } } -void put_fs_struct(struct fs_struct *fs) -{ - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { - path_put(&fs->root); - path_put(&fs->pwd); - if (fs->altroot.dentry) - path_put(&fs->altroot); - kmem_cache_free(fs_cachep, fs); - } -} - -void exit_fs(struct task_struct *tsk) -{ - struct fs_struct * fs = tsk->fs; - - if (fs) { - task_lock(tsk); - tsk->fs = NULL; - task_unlock(tsk); - put_fs_struct(fs); - } -} - -EXPORT_SYMBOL_GPL(exit_fs); - #ifdef CONFIG_MM_OWNER /* * Task p is exiting and it owned mm, lets find a new owner for it @@ -577,8 +563,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) * If there are other users of the mm and the owner (us) is exiting * we need to find a new owner to take on the responsibility. */ - if (!mm) - return 0; if (atomic_read(&mm->mm_users) <= 1) return 0; if (mm->owner != p) @@ -621,6 +605,12 @@ retry: } while_each_thread(g, c); read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL. + */ + mm->owner = NULL; return; assign_new_owner: @@ -641,7 +631,6 @@ assign_new_owner: put_task_struct(c); goto retry; } - cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); put_task_struct(c); @@ -655,26 +644,40 @@ assign_new_owner: static void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; + struct core_state *core_state; mm_release(tsk, mm); if (!mm) return; /* * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_waiters + * We must hold mmap_sem around checking core_state * and clearing tsk->mm. The core-inducing thread - * will increment core_waiters for each thread in the + * will increment ->nr_threads for each thread in the * group with ->mm != NULL. */ down_read(&mm->mmap_sem); - if (mm->core_waiters) { + core_state = mm->core_state; + if (core_state) { + struct core_thread self; up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - if (!--mm->core_waiters) - complete(mm->core_startup_done); - up_write(&mm->mmap_sem); - wait_for_completion(&mm->core_done); + self.task = tsk; + self.next = xchg(&core_state->dumper.next, &self); + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. + */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); + + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_task_state(tsk, TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); @@ -691,134 +694,107 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } -static void -reparent_thread(struct task_struct *p, struct task_struct *father, int traced) +/* + * When we die, we re-parent all our children. + * Try to give them to another thread in our thread + * group, and if no such member exists, give it to + * the child reaper process (ie "init") in our pid + * space. + */ +static struct task_struct *find_new_reaper(struct task_struct *father) { - if (p->pdeath_signal) - /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *thread; - /* Move the child from its dying parent to the new one. */ - if (unlikely(traced)) { - /* Preserve ptrace links if someone else is tracing this child. */ - list_del_init(&p->ptrace_list); - if (ptrace_reparented(p)) - list_add(&p->ptrace_list, &p->real_parent->ptrace_children); - } else { - /* If this child is being traced, then we're the one tracing it - * anyway, so let go of it. - */ - p->ptrace = 0; - remove_parent(p); - p->parent = p->real_parent; - add_parent(p); + thread = father; + while_each_thread(father, thread) { + if (thread->flags & PF_EXITING) + continue; + if (unlikely(pid_ns->child_reaper == father)) + pid_ns->child_reaper = thread; + return thread; + } - if (task_is_traced(p)) { - /* - * If it was at a trace stop, turn it into - * a normal stop since it's no longer being - * traced. - */ - ptrace_untrace(p); - } + if (unlikely(pid_ns->child_reaper == father)) { + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) + panic("Attempted to kill init!"); + + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + /* + * We can not clear ->child_reaper or leave it alone. + * There may by stealth EXIT_DEAD tasks on ->children, + * forget_original_parent() must move them somewhere. + */ + pid_ns->child_reaper = init_pid_ns.child_reaper; } - /* If this is a threaded reparent there is no need to + return pid_ns->child_reaper; +} + +/* +* Any that need to be release_task'd are put on the @dead list. + */ +static void reparent_thread(struct task_struct *father, struct task_struct *p, + struct list_head *dead) +{ + if (p->pdeath_signal) + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + + list_move_tail(&p->sibling, &p->real_parent->children); + + if (task_detached(p)) + return; + /* + * If this is a threaded reparent there is no need to * notify anyone anything has happened. */ if (same_thread_group(p->real_parent, father)) return; /* We don't want people slaying init. */ - if (!task_detached(p)) - p->exit_signal = SIGCHLD; + p->exit_signal = SIGCHLD; - /* If we'd notified the old parent about this child's death, - * also notify the new parent. - */ - if (!traced && p->exit_state == EXIT_ZOMBIE && - !task_detached(p) && thread_group_empty(p)) + /* If it has exited notify the new parent about this child's death. */ + if (!p->ptrace && + p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { do_notify_parent(p, p->exit_signal); + if (task_detached(p)) { + p->exit_state = EXIT_DEAD; + list_move_tail(&p->sibling, dead); + } + } kill_orphaned_pgrp(p, father); } -/* - * When we die, we re-parent all our children. - * Try to give them to another thread in our thread - * group, and if no such member exists, give it to - * the child reaper process (ie "init") in our pid - * space. - */ static void forget_original_parent(struct task_struct *father) { - struct task_struct *p, *n, *reaper = father; - struct list_head ptrace_dead; + struct task_struct *p, *n, *reaper; + LIST_HEAD(dead_children); - INIT_LIST_HEAD(&ptrace_dead); + exit_ptrace(father); write_lock_irq(&tasklist_lock); + reaper = find_new_reaper(father); - do { - reaper = next_thread(reaper); - if (reaper == father) { - reaper = task_child_reaper(father); - break; - } - } while (reaper->flags & PF_EXITING); - - /* - * There are only two places where our children can be: - * - * - in our child list - * - in our ptraced child list - * - * Search them and reparent children. - */ list_for_each_entry_safe(p, n, &father->children, sibling) { - int ptrace; - - ptrace = p->ptrace; - - /* if father isn't the real parent, then ptrace must be enabled */ - BUG_ON(father != p->real_parent && !ptrace); - - if (father == p->real_parent) { - /* reparent with a reaper, real father it's us */ - p->real_parent = reaper; - reparent_thread(p, father, 0); - } else { - /* reparent ptraced task to its real parent */ - __ptrace_unlink (p); - if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) && - thread_group_empty(p)) - do_notify_parent(p, p->exit_signal); - } - - /* - * if the ptraced child is a detached zombie we must collect - * it before we exit, or it will remain zombie forever since - * we prevented it from self-reap itself while it was being - * traced by us, to be able to see it in wait4. - */ - if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p))) - list_add(&p->ptrace_list, &ptrace_dead); - } - - list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) { p->real_parent = reaper; - reparent_thread(p, father, 1); + if (p->parent == father) { + BUG_ON(p->ptrace); + p->parent = p->real_parent; + } + reparent_thread(father, p, &dead_children); } - write_unlock_irq(&tasklist_lock); + BUG_ON(!list_empty(&father->children)); - BUG_ON(!list_empty(&father->ptrace_children)); - list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) { - list_del_init(&p->ptrace_list); + list_for_each_entry_safe(p, n, &dead_children, sibling) { + list_del_init(&p->sibling); release_task(p); } - } /* @@ -827,7 +803,8 @@ static void forget_original_parent(struct task_struct *father) */ static void exit_notify(struct task_struct *tsk, int group_dead) { - int state; + int signal; + void *cookie; /* * This does two things: @@ -860,37 +837,27 @@ static void exit_notify(struct task_struct *tsk, int group_dead) */ if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && (tsk->parent_exec_id != tsk->real_parent->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id) && - !capable(CAP_KILL)) + tsk->self_exec_id != tsk->parent_exec_id)) tsk->exit_signal = SIGCHLD; - /* If something other than our normal parent is ptracing us, then - * send it a SIGCHLD instead of honoring exit_signal. exit_signal - * only has special meaning to our real parent. - */ - if (!task_detached(tsk) && thread_group_empty(tsk)) { - int signal = ptrace_reparented(tsk) ? - SIGCHLD : tsk->exit_signal; - do_notify_parent(tsk, signal); - } else if (tsk->ptrace) { - do_notify_parent(tsk, SIGCHLD); - } + signal = tracehook_notify_death(tsk, &cookie, group_dead); + if (signal >= 0) + signal = do_notify_parent(tsk, signal); - state = EXIT_ZOMBIE; - if (task_detached(tsk) && likely(!tsk->ptrace)) - state = EXIT_DEAD; - tsk->exit_state = state; + tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && - tsk->signal->notify_count < 0 && - tsk->signal->group_exit_task) + tsk->signal->group_exit_task && + tsk->signal->notify_count < 0) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); + tracehook_report_death(tsk, signal, cookie, group_dead); + /* If the process is dead, release it - nobody will wait for it */ - if (state == EXIT_DEAD) + if (signal == DEATH_REAP) release_task(tsk); } @@ -899,12 +866,9 @@ static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; - unsigned long *n = end_of_stack(current); unsigned long free; - while (*n == 0) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(current); + free = stack_not_used(current); if (free >= lowest_to_date) return; @@ -922,39 +886,6 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -static inline void exit_child_reaper(struct task_struct *tsk) -{ - if (likely(tsk->group_leader != task_child_reaper(tsk))) - return; - - if (tsk->nsproxy->pid_ns == &init_pid_ns) - panic("Attempted to kill init!"); - - /* - * @tsk is the last thread in the 'cgroup-init' and is exiting. - * Terminate all remaining processes in the namespace and reap them - * before exiting @tsk. - * - * Note that @tsk (last thread of cgroup-init) may not necessarily - * be the child-reaper (i.e main thread of cgroup-init) of the - * namespace i.e the child_reaper may have already exited. - * - * Even after a child_reaper exits, we let it inherit orphaned children, - * because, pid_ns->child_reaper remains valid as long as there is - * at least one living sub-thread in the cgroup init. - - * This living sub-thread of the cgroup-init will be notified when - * a child inherited by the 'child-reaper' exits (do_notify_parent() - * uses __group_send_sig_info()). Further, when reaping child processes, - * do_wait() iterates over children of all living sub threads. - - * i.e even though 'child_reaper' thread is listed as the parent of the - * orphaned children, any living sub-thread in the cgroup-init can - * perform the role of the child_reaper. - */ - zap_pid_ns_processes(tsk->nsproxy->pid_ns); -} - NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; @@ -969,10 +900,7 @@ NORET_TYPE void do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(current->ptrace & PT_TRACE_EXIT)) { - current->ptrace_message = code; - ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); - } + tracehook_report_exit(&code); /* * We're taking recursive faults here in do_exit. Safest is to just @@ -991,12 +919,12 @@ NORET_TYPE void do_exit(long code) * task into the wait for ever nirwana as well. */ tsk->flags |= PF_EXITPIDONE; - if (tsk->io_context) - exit_io_context(); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } + exit_irq_thread(); + exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against @@ -1011,25 +939,13 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - if (tsk->mm) { - update_hiwater_rss(tsk->mm); - update_hiwater_vm(tsk->mm); - } + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { - exit_child_reaper(tsk); hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); } acct_collect(code, group_dead); -#ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) - exit_robust_list(tsk); -#ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) - compat_exit_robust_list(tsk); -#endif -#endif if (group_dead) tty_audit_exit(); if (unlikely(tsk->audit_context)) @@ -1042,13 +958,14 @@ NORET_TYPE void do_exit(long code) if (group_dead) acct_process(); + trace_sched_process_exit(tsk); + exit_sem(tsk); exit_files(tsk); exit_fs(tsk); check_stack_usage(); exit_thread(); cgroup_exit(tsk, 1); - exit_keys(tsk); if (group_dead && tsk->signal->leader) disassociate_ctty(1); @@ -1093,7 +1010,6 @@ NORET_TYPE void do_exit(long code) preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; - schedule(); BUG(); /* Avoid "noreturn function does return". */ @@ -1113,7 +1029,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code) EXPORT_SYMBOL(complete_and_exit); -asmlinkage long sys_exit(int error_code) +SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } @@ -1154,9 +1070,11 @@ do_group_exit(int exit_code) * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ -asmlinkage void sys_exit_group(int error_code) +SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); + /* NOTREACHED */ + return 0; } static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) @@ -1179,13 +1097,6 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options, return 0; } - /* - * Do not consider detached threads that are - * not ptraced: - */ - if (task_detached(p) && !p->ptrace) - return 0; - /* Wait for all children (clone and not) if __WALL is set; * otherwise, wait for clone children *only* if __WCLONE is * set; otherwise, wait for non-clone children *only*. (Note: @@ -1196,14 +1107,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options, return 0; err = security_task_wait(p); - if (likely(!err)) - return 1; + if (err) + return err; - if (type != PIDTYPE_PID) - return 0; - /* This child was explicitly requested, abort */ - read_unlock(&tasklist_lock); - return err; + return 1; } static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, @@ -1237,16 +1144,19 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_zombie(struct task_struct *p, int noreap, +static int wait_task_zombie(struct task_struct *p, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { unsigned long state; int retval, status, traced; pid_t pid = task_pid_vnr(p); + uid_t uid = __task_cred(p)->uid; - if (unlikely(noreap)) { - uid_t uid = p->uid; + if (!likely(options & WEXITED)) + return 0; + + if (unlikely(options & WNOWAIT)) { int exit_code = p->exit_code; int why, status; @@ -1278,6 +1188,7 @@ static int wait_task_zombie(struct task_struct *p, int noreap, if (likely(!traced)) { struct signal_struct *psig; struct signal_struct *sig; + struct task_cputime cputime; /* * The resource counters for the group leader are in its @@ -1293,20 +1204,23 @@ static int wait_task_zombie(struct task_struct *p, int noreap, * need to protect the access to p->parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. + * + * We use thread_group_cputime() to get times for the thread + * group, which consolidates times for all threads in the + * group including the group leader. */ + thread_group_cputime(p, &cputime); spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; psig->cutime = cputime_add(psig->cutime, - cputime_add(p->utime, - cputime_add(sig->utime, - sig->cutime))); + cputime_add(cputime.utime, + sig->cutime)); psig->cstime = cputime_add(psig->cstime, - cputime_add(p->stime, - cputime_add(sig->stime, - sig->cstime))); + cputime_add(cputime.stime, + sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, @@ -1326,6 +1240,8 @@ static int wait_task_zombie(struct task_struct *p, int noreap, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; + task_io_accounting_add(&psig->ioac, &p->ioac); + task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->parent->sighand->siglock); } @@ -1361,7 +1277,7 @@ static int wait_task_zombie(struct task_struct *p, int noreap, if (!retval && infop) retval = put_user(pid, &infop->si_pid); if (!retval && infop) - retval = put_user(p->uid, &infop->si_uid); + retval = put_user(uid, &infop->si_uid); if (!retval) retval = pid; @@ -1389,41 +1305,51 @@ static int wait_task_zombie(struct task_struct *p, int noreap, return retval; } +static int *task_stopped_code(struct task_struct *p, bool ptrace) +{ + if (ptrace) { + if (task_is_stopped_or_traced(p)) + return &p->exit_code; + } else { + if (p->signal->flags & SIGNAL_STOP_STOPPED) + return &p->signal->group_exit_code; + } + return NULL; +} + /* * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_stopped(struct task_struct *p, - int noreap, struct siginfo __user *infop, +static int wait_task_stopped(int ptrace, struct task_struct *p, + int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { - int retval, exit_code, why; + int retval, exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; + if (!(options & WUNTRACED)) + return 0; + exit_code = 0; spin_lock_irq(&p->sighand->siglock); - if (unlikely(!task_is_stopped_or_traced(p))) - goto unlock_sig; - - if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0) - /* - * A group stop is in progress and this is the group leader. - * We won't report until all threads have stopped. - */ + p_code = task_stopped_code(p, ptrace); + if (unlikely(!p_code)) goto unlock_sig; - exit_code = p->exit_code; + exit_code = *p_code; if (!exit_code) goto unlock_sig; - if (!noreap) - p->exit_code = 0; + if (!unlikely(options & WNOWAIT)) + *p_code = 0; - uid = p->uid; + /* don't need the RCU readlock here as we're holding a spinlock */ + uid = __task_cred(p)->uid; unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1438,10 +1364,10 @@ unlock_sig: */ get_task_struct(p); pid = task_pid_vnr(p); - why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; + why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); - if (unlikely(noreap)) + if (unlikely(options & WNOWAIT)) return wait_noreap_copyout(p, pid, uid, why, exit_code, infop, ru); @@ -1475,7 +1401,7 @@ unlock_sig: * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_continued(struct task_struct *p, int noreap, +static int wait_task_continued(struct task_struct *p, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1483,6 +1409,9 @@ static int wait_task_continued(struct task_struct *p, int noreap, pid_t pid; uid_t uid; + if (!unlikely(options & WCONTINUED)) + return 0; + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; @@ -1492,12 +1421,12 @@ static int wait_task_continued(struct task_struct *p, int noreap, spin_unlock_irq(&p->sighand->siglock); return 0; } - if (!noreap) + if (!unlikely(options & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; + uid = __task_cred(p)->uid; spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); - uid = p->uid; get_task_struct(p); read_unlock(&tasklist_lock); @@ -1518,89 +1447,163 @@ static int wait_task_continued(struct task_struct *p, int noreap, return retval; } +/* + * Consider @p for a wait by @parent. + * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; + * then *@notask_error is 0 if @p is an eligible child, + * or another error from security_task_wait(), or still -ECHILD. + */ +static int wait_consider_task(struct task_struct *parent, int ptrace, + struct task_struct *p, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int ret = eligible_child(type, pid, options, p); + if (!ret) + return ret; + + if (unlikely(ret < 0)) { + /* + * If we have not yet seen any eligible child, + * then let this error code replace -ECHILD. + * A permission error will give the user a clue + * to look for security policy problems, rather + * than for mysterious wait bugs. + */ + if (*notask_error) + *notask_error = ret; + } + + if (likely(!ptrace) && unlikely(p->ptrace)) { + /* + * This child is hidden by ptrace. + * We aren't allowed to see it now, but eventually we will. + */ + *notask_error = 0; + return 0; + } + + if (p->exit_state == EXIT_DEAD) + return 0; + + /* + * We don't reap group leaders with subthreads. + */ + if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) + return wait_task_zombie(p, options, infop, stat_addr, ru); + + /* + * It's stopped or running now, so it might + * later continue, exit, or stop again. + */ + *notask_error = 0; + + if (task_stopped_code(p, ptrace)) + return wait_task_stopped(ptrace, p, options, + infop, stat_addr, ru); + + return wait_task_continued(p, options, infop, stat_addr, ru); +} + +/* + * Do the work of do_wait() for one thread in the group, @tsk. + * + * -ECHILD should be in *@notask_error before the first call. + * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns zero if the search for a child should continue; then + * *@notask_error is 0 if there were any eligible children, + * or another error from security_task_wait(), or still -ECHILD. + */ +static int do_wait_thread(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + + list_for_each_entry(p, &tsk->children, sibling) { + /* + * Do not consider detached threads. + */ + if (!task_detached(p)) { + int ret = wait_consider_task(tsk, 0, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } + } + + return 0; +} + +static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, + enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) +{ + struct task_struct *p; + + /* + * Traditionally we see ptrace'd stopped tasks regardless of options. + */ + options |= WUNTRACED; + + list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { + int ret = wait_consider_task(tsk, 1, p, notask_error, + type, pid, options, + infop, stat_addr, ru); + if (ret) + return ret; + } + + return 0; +} + static long do_wait(enum pid_type type, struct pid *pid, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; - int flag, retval; + int retval; + + trace_sched_process_wait(pid); add_wait_queue(¤t->signal->wait_chldexit,&wait); repeat: - /* If there is nothing that can match our critier just get out */ + /* + * If there is nothing that can match our critiera just get out. + * We will clear @retval to zero if we see any child that might later + * match our criteria, even if we are not able to reap it yet. + */ retval = -ECHILD; if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) goto end; - /* - * We will set this flag if we see any child that might later - * match our criteria, even if we are not able to reap it yet. - */ - flag = retval = 0; current->state = TASK_INTERRUPTIBLE; read_lock(&tasklist_lock); tsk = current; do { - struct task_struct *p; - - list_for_each_entry(p, &tsk->children, sibling) { - int ret = eligible_child(type, pid, options, p); - if (!ret) - continue; - - if (unlikely(ret < 0)) { - retval = ret; - } else if (task_is_stopped_or_traced(p)) { - /* - * It's stopped now, so it might later - * continue, exit, or stop again. - */ - flag = 1; - if (!(p->ptrace & PT_PTRACED) && - !(options & WUNTRACED)) - continue; - - retval = wait_task_stopped(p, - (options & WNOWAIT), infop, - stat_addr, ru); - } else if (p->exit_state == EXIT_ZOMBIE && - !delay_group_leader(p)) { - /* - * We don't reap group leaders with subthreads. - */ - if (!likely(options & WEXITED)) - continue; - retval = wait_task_zombie(p, - (options & WNOWAIT), infop, - stat_addr, ru); - } else if (p->exit_state != EXIT_DEAD) { - /* - * It's running now, so it might later - * exit, stop, or stop and then continue. - */ - flag = 1; - if (!unlikely(options & WCONTINUED)) - continue; - retval = wait_task_continued(p, - (options & WNOWAIT), infop, - stat_addr, ru); - } - if (retval != 0) /* tasklist_lock released */ - goto end; - } - if (!flag) { - list_for_each_entry(p, &tsk->ptrace_children, - ptrace_list) { - flag = eligible_child(type, pid, options, p); - if (!flag) - continue; - if (likely(flag > 0)) - break; - retval = flag; - goto end; - } + int tsk_result = do_wait_thread(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (!tsk_result) + tsk_result = ptrace_do_wait(tsk, &retval, + type, pid, options, + infop, stat_addr, ru); + if (tsk_result) { + /* + * tasklist_lock is unlocked and we have a final result. + */ + retval = tsk_result; + goto end; } + if (options & __WNOTHREAD) break; tsk = next_thread(tsk); @@ -1608,16 +1611,14 @@ repeat: } while (tsk != current); read_unlock(&tasklist_lock); - if (flag) { - if (options & WNOHANG) - goto end; + if (!retval && !(options & WNOHANG)) { retval = -ERESTARTSYS; - if (signal_pending(current)) - goto end; - schedule(); - goto repeat; + if (!signal_pending(current)) { + schedule(); + goto repeat; + } } - retval = -ECHILD; + end: current->state = TASK_RUNNING; remove_wait_queue(¤t->signal->wait_chldexit,&wait); @@ -1647,9 +1648,8 @@ end: return retval; } -asmlinkage long sys_waitid(int which, pid_t upid, - struct siginfo __user *infop, int options, - struct rusage __user *ru) +SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, + infop, int, options, struct rusage __user *, ru) { struct pid *pid = NULL; enum pid_type type; @@ -1688,8 +1688,8 @@ asmlinkage long sys_waitid(int which, pid_t upid, return ret; } -asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, - int options, struct rusage __user *ru) +SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, + int, options, struct rusage __user *, ru) { struct pid *pid = NULL; enum pid_type type; @@ -1706,7 +1706,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, pid = find_get_pid(-upid); } else if (upid == 0) { type = PIDTYPE_PGID; - pid = get_pid(task_pgrp(current)); + pid = get_task_pid(current, PIDTYPE_PGID); } else /* upid > 0 */ { type = PIDTYPE_PID; pid = find_get_pid(upid); @@ -1726,7 +1726,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ -asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options) +SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return sys_wait4(pid, stat_addr, options, NULL); }