coredump: simplify core_state->nr_threads calculation
[safe/jmp/linux-2.6] / fs / exec.c
index 9ff6069..c74bb34 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
 
 #include <linux/slab.h>
 #include <linux/file.h>
-#include <linux/mman.h>
-#include <linux/a.out.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
 #include <linux/smp_lock.h>
+#include <linux/swap.h>
 #include <linux/string.h>
 #include <linux/init.h>
-#include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 #include <linux/key.h>
 #include <linux/personality.h>
 #include <linux/binfmts.h>
-#include <linux/swap.h>
 #include <linux/utsname.h>
 #include <linux/pid_namespace.h>
 #include <linux/module.h>
@@ -47,7 +46,6 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
-#include <linux/rmap.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
 #include <linux/kmod.h>
 #endif
 
+#ifdef __alpha__
+/* for /sbin/loader handling in search_binary_handler() */
+#include <linux/a.out.h>
+#endif
+
 int core_uses_pid;
 char core_pattern[CORENAME_MAX_SIZE] = "core";
 int suid_dumpable = 0;
@@ -112,7 +115,7 @@ asmlinkage long sys_uselib(const char __user * library)
                goto out;
 
        error = -EINVAL;
-       if (!S_ISREG(nd.dentry->d_inode->i_mode))
+       if (!S_ISREG(nd.path.dentry->d_inode->i_mode))
                goto exit;
 
        error = vfs_permission(&nd, MAY_READ | MAY_EXEC);
@@ -148,7 +151,7 @@ out:
        return error;
 exit:
        release_open_intent(&nd);
-       path_release(&nd);
+       path_put(&nd.path);
        goto out;
 }
 
@@ -173,8 +176,15 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                return NULL;
 
        if (write) {
-               struct rlimit *rlim = current->signal->rlim;
                unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
+               struct rlimit *rlim;
+
+               /*
+                * We've historically supported up to 32 pages (ARG_MAX)
+                * of argument strings even with small stacks
+                */
+               if (size <= ARG_MAX)
+                       return page;
 
                /*
                 * Limit to 1/4-th the stack size for the argv+env strings.
@@ -183,6 +193,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                 *  - the program will have a reasonable amount of stack left
                 *    to work from.
                 */
+               rlim = current->signal->rlim;
                if (size > rlim[RLIMIT_STACK].rlim_cur / 4) {
                        put_page(page);
                        return NULL;
@@ -528,7 +539,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                /*
                 * when the old and new regions overlap clear from new_end.
                 */
-               free_pgd_range(&tlb, new_end, old_end, new_end,
+               free_pgd_range(tlb, new_end, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : 0);
        } else {
                /*
@@ -537,7 +548,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                 * have constraints on va-space that make this illegal (IA64) -
                 * for the others its just a little faster.
                 */
-               free_pgd_range(&tlb, old_start, old_end, new_end,
+               free_pgd_range(tlb, old_start, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : 0);
        }
        tlb_finish_mmu(tlb, new_end, old_end);
@@ -597,7 +608,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
        bprm->exec -= stack_shift;
 
        down_write(&mm->mmap_sem);
-       vm_flags = vma->vm_flags;
+       vm_flags = VM_STACK_FLAGS;
 
        /*
         * Adjust stack execute permissions; explicitly enable for
@@ -652,7 +663,7 @@ struct file *open_exec(const char *name)
        file = ERR_PTR(err);
 
        if (!err) {
-               struct inode *inode = nd.dentry->d_inode;
+               struct inode *inode = nd.path.dentry->d_inode;
                file = ERR_PTR(-EACCES);
                if (S_ISREG(inode->i_mode)) {
                        int err = vfs_permission(&nd, MAY_EXEC);
@@ -672,7 +683,7 @@ out:
                        }
                }
                release_open_intent(&nd);
-               path_release(&nd);
+               path_put(&nd.path);
        }
        goto out;
 }
@@ -711,12 +722,10 @@ static int exec_mmap(struct mm_struct *mm)
                 * Make sure that if there is a core dump in progress
                 * for the old mm, we get out and die instead of going
                 * through with the exec.  We must hold mmap_sem around
-                * checking core_waiters and changing tsk->mm.  The
-                * core-inducing thread will increment core_waiters for
-                * each thread whose ->mm == old_mm.
+                * checking core_state and changing tsk->mm.
                 */
                down_read(&old_mm->mmap_sem);
-               if (unlikely(old_mm->core_waiters)) {
+               if (unlikely(old_mm->core_state)) {
                        up_read(&old_mm->mmap_sem);
                        return -EINTR;
                }
@@ -727,6 +736,7 @@ static int exec_mmap(struct mm_struct *mm)
        tsk->active_mm = mm;
        activate_mm(active_mm, mm);
        task_unlock(tsk);
+       mm_update_next_owner(old_mm);
        arch_pick_mmap_layout(mm);
        if (old_mm) {
                up_read(&old_mm->mmap_sem);
@@ -757,9 +767,7 @@ static int de_thread(struct task_struct *tsk)
 
        /*
         * Kill all other threads in the thread group.
-        * We must hold tasklist_lock to call zap_other_threads.
         */
-       read_lock(&tasklist_lock);
        spin_lock_irq(lock);
        if (signal_group_exit(sig)) {
                /*
@@ -767,21 +775,10 @@ static int de_thread(struct task_struct *tsk)
                 * return so that the signal is processed.
                 */
                spin_unlock_irq(lock);
-               read_unlock(&tasklist_lock);
                return -EAGAIN;
        }
-
-       /*
-        * child_reaper ignores SIGKILL, change it now.
-        * Reparenting needs write_lock on tasklist_lock,
-        * so it is safe to do it under read_lock.
-        */
-       if (unlikely(tsk->group_leader == task_child_reaper(tsk)))
-               task_active_pid_ns(tsk)->child_reaper = tsk;
-
        sig->group_exit_task = tsk;
        zap_other_threads(tsk);
-       read_unlock(&tasklist_lock);
 
        /* Account for the thread group leader hanging around: */
        count = thread_group_leader(tsk) ? 1 : 2;
@@ -802,7 +799,7 @@ static int de_thread(struct task_struct *tsk)
        if (!thread_group_leader(tsk)) {
                leader = tsk->group_leader;
 
-               sig->notify_count = -1;
+               sig->notify_count = -1; /* for exit_notify() */
                for (;;) {
                        write_lock_irq(&tasklist_lock);
                        if (likely(leader->exit_state))
@@ -812,6 +809,8 @@ static int de_thread(struct task_struct *tsk)
                        schedule();
                }
 
+               if (unlikely(task_child_reaper(tsk) == leader))
+                       task_active_pid_ns(tsk)->child_reaper = tsk;
                /*
                 * The only record we have of the real-time age of a
                 * process, regardless of execs it's done, is start_time.
@@ -861,6 +860,7 @@ static int de_thread(struct task_struct *tsk)
 
 no_thread_group:
        exit_itimers(sig);
+       flush_itimer_signals();
        if (leader)
                release_task(leader);
 
@@ -945,7 +945,6 @@ int flush_old_exec(struct linux_binprm * bprm)
 {
        char * name;
        int i, ch, retval;
-       struct files_struct *files;
        char tcomm[sizeof(current->comm)];
 
        /*
@@ -956,27 +955,18 @@ int flush_old_exec(struct linux_binprm * bprm)
        if (retval)
                goto out;
 
-       /*
-        * Make sure we have private file handles. Ask the
-        * fork helper to do the work for us and the exit
-        * helper to do the cleanup of the old one.
-        */
-       files = current->files;         /* refcounted so safe to hold */
-       retval = unshare_files();
-       if (retval)
-               goto out;
+       set_mm_exe_file(bprm->mm, bprm->file);
+
        /*
         * Release all of the old mmap stuff
         */
        retval = exec_mmap(bprm->mm);
        if (retval)
-               goto mmap_failed;
+               goto out;
 
        bprm->mm = NULL;                /* We're using it now */
 
        /* This is the point of no return */
-       put_files_struct(files);
-
        current->sas_ss_sp = current->sas_ss_size = 0;
 
        if (current->euid == current->uid && current->egid == current->gid)
@@ -1026,8 +1016,6 @@ int flush_old_exec(struct linux_binprm * bprm)
 
        return 0;
 
-mmap_failed:
-       reset_files_struct(current, files);
 out:
        return retval;
 }
@@ -1167,7 +1155,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 {
        int try,retval;
        struct linux_binfmt *fmt;
-#if defined(__alpha__) && defined(CONFIG_ARCH_SUPPORTS_AOUT)
+#ifdef __alpha__
        /* handle /sbin/loader.. */
        {
            struct exec * eh = (struct exec *) bprm->buf;
@@ -1264,6 +1252,12 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 
 EXPORT_SYMBOL(search_binary_handler);
 
+void free_bprm(struct linux_binprm *bprm)
+{
+       free_arg_pages(bprm);
+       kfree(bprm);
+}
+
 /*
  * sys_execve() executes a new program.
  */
@@ -1274,13 +1268,17 @@ int do_execve(char * filename,
 {
        struct linux_binprm *bprm;
        struct file *file;
-       unsigned long env_p;
+       struct files_struct *displaced;
        int retval;
 
+       retval = unshare_files(&displaced);
+       if (retval)
+               goto out_ret;
+
        retval = -ENOMEM;
        bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
        if (!bprm)
-               goto out_ret;
+               goto out_files;
 
        file = open_exec(filename);
        retval = PTR_ERR(file);
@@ -1322,24 +1320,23 @@ int do_execve(char * filename,
        if (retval < 0)
                goto out;
 
-       env_p = bprm->p;
        retval = copy_strings(bprm->argc, argv, bprm);
        if (retval < 0)
                goto out;
-       bprm->argv_len = env_p - bprm->p;
 
+       current->flags &= ~PF_KTHREAD;
        retval = search_binary_handler(bprm,regs);
        if (retval >= 0) {
                /* execve success */
-               free_arg_pages(bprm);
                security_bprm_free(bprm);
                acct_update_integrals(current);
-               kfree(bprm);
+               free_bprm(bprm);
+               if (displaced)
+                       put_files_struct(displaced);
                return retval;
        }
 
 out:
-       free_arg_pages(bprm);
        if (bprm->security)
                security_bprm_free(bprm);
 
@@ -1353,8 +1350,11 @@ out_file:
                fput(bprm->file);
        }
 out_kfree:
-       kfree(bprm);
+       free_bprm(bprm);
 
+out_files:
+       if (displaced)
+               reset_files_struct(displaced);
 out_ret:
        return retval;
 }
@@ -1502,9 +1502,10 @@ out:
        return ispipe;
 }
 
-static void zap_process(struct task_struct *start)
+static int zap_process(struct task_struct *start)
 {
        struct task_struct *t;
+       int nr = 0;
 
        start->signal->flags = SIGNAL_GROUP_EXIT;
        start->signal->group_stop_count = 0;
@@ -1512,72 +1513,99 @@ static void zap_process(struct task_struct *start)
        t = start;
        do {
                if (t != current && t->mm) {
-                       t->mm->core_waiters++;
                        sigaddset(&t->pending.signal, SIGKILL);
                        signal_wake_up(t, 1);
+                       nr++;
                }
-       } while ((t = next_thread(t)) != start);
+       } while_each_thread(start, t);
+
+       return nr;
 }
 
 static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
-                               int exit_code)
+                               struct core_state *core_state, int exit_code)
 {
        struct task_struct *g, *p;
        unsigned long flags;
-       int err = -EAGAIN;
+       int nr = -EAGAIN;
 
        spin_lock_irq(&tsk->sighand->siglock);
        if (!signal_group_exit(tsk->signal)) {
+               mm->core_state = core_state;
                tsk->signal->group_exit_code = exit_code;
-               zap_process(tsk);
-               err = 0;
+               nr = zap_process(tsk);
        }
        spin_unlock_irq(&tsk->sighand->siglock);
-       if (err)
-               return err;
+       if (unlikely(nr < 0))
+               return nr;
 
-       if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
+       if (atomic_read(&mm->mm_users) == nr + 1)
                goto done;
-
+       /*
+        * We should find and kill all tasks which use this mm, and we should
+        * count them correctly into ->nr_threads. We don't take tasklist
+        * lock, but this is safe wrt:
+        *
+        * fork:
+        *      None of sub-threads can fork after zap_process(leader). All
+        *      processes which were created before this point should be
+        *      visible to zap_threads() because copy_process() adds the new
+        *      process to the tail of init_task.tasks list, and lock/unlock
+        *      of ->siglock provides a memory barrier.
+        *
+        * do_exit:
+        *      The caller holds mm->mmap_sem. This means that the task which
+        *      uses this mm can't pass exit_mm(), so it can't exit or clear
+        *      its ->mm.
+        *
+        * de_thread:
+        *      It does list_replace_rcu(&leader->tasks, &current->tasks),
+        *      we must see either old or new leader, this does not matter.
+        *      However, it can change p->sighand, so lock_task_sighand(p)
+        *      must be used. Since p->mm != NULL and we hold ->mmap_sem
+        *      it can't fail.
+        *
+        *      Note also that "g" can be the old leader with ->mm == NULL
+        *      and already unhashed and thus removed from ->thread_group.
+        *      This is OK, __unhash_process()->list_del_rcu() does not
+        *      clear the ->next pointer, we will find the new leader via
+        *      next_thread().
+        */
        rcu_read_lock();
        for_each_process(g) {
                if (g == tsk->group_leader)
                        continue;
-
+               if (g->flags & PF_KTHREAD)
+                       continue;
                p = g;
                do {
                        if (p->mm) {
-                               if (p->mm == mm) {
-                                       /*
-                                        * p->sighand can't disappear, but
-                                        * may be changed by de_thread()
-                                        */
+                               if (unlikely(p->mm == mm)) {
                                        lock_task_sighand(p, &flags);
-                                       zap_process(p);
+                                       nr += zap_process(p);
                                        unlock_task_sighand(p, &flags);
                                }
                                break;
                        }
-               } while ((p = next_thread(p)) != g);
+               } while_each_thread(g, p);
        }
        rcu_read_unlock();
 done:
-       return mm->core_waiters;
+       core_state->nr_threads = nr;
+       return nr;
 }
 
 static int coredump_wait(int exit_code)
 {
        struct task_struct *tsk = current;
        struct mm_struct *mm = tsk->mm;
-       struct completion startup_done;
+       struct core_state core_state;
        struct completion *vfork_done;
        int core_waiters;
 
        init_completion(&mm->core_done);
-       init_completion(&startup_done);
-       mm->core_startup_done = &startup_done;
-
-       core_waiters = zap_threads(tsk, mm, exit_code);
+       init_completion(&core_state.startup);
+       core_waiters = zap_threads(tsk, mm, &core_state, exit_code);
        up_write(&mm->mmap_sem);
 
        if (unlikely(core_waiters < 0))
@@ -1594,9 +1622,9 @@ static int coredump_wait(int exit_code)
        }
 
        if (core_waiters)
-               wait_for_completion(&startup_done);
+               wait_for_completion(&core_state.startup);
+       mm->core_state = NULL;
 fail:
-       BUG_ON(mm->core_waiters);
        return core_waiters;
 }
 
@@ -1674,7 +1702,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
        /*
         * If another thread got here first, or we are not dumpable, bail out.
         */
-       if (mm->core_waiters || !get_dumpable(mm)) {
+       if (mm->core_state || !get_dumpable(mm)) {
                up_write(&mm->mmap_sem);
                goto fail;
        }