coredump: simplify core_state->nr_threads calculation

[safe/jmp/linux-2.6] / fs / exec.c
diff --git a/fs/exec.c b/fs/exec.c

index 9ff6069..c74bb34 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -24,20 +24,19 @@
  
  #include <linux/slab.h>
  #include <linux/file.h>
-#include <linux/mman.h>
-#include <linux/a.out.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
  #include <linux/stat.h>
  #include <linux/fcntl.h>
  #include <linux/smp_lock.h>
+#include <linux/swap.h>
  #include <linux/string.h>
  #include <linux/init.h>
-#include <linux/pagemap.h>
  #include <linux/highmem.h>
  #include <linux/spinlock.h>
  #include <linux/key.h>
  #include <linux/personality.h>
  #include <linux/binfmts.h>
-#include <linux/swap.h>
  #include <linux/utsname.h>
  #include <linux/pid_namespace.h>
  #include <linux/module.h>
@@ -47,7 +46,6 @@
  #include <linux/mount.h>
  #include <linux/security.h>
  #include <linux/syscalls.h>
-#include <linux/rmap.h>
  #include <linux/tsacct_kern.h>
  #include <linux/cn_proc.h>
  #include <linux/audit.h>
@@ -60,6 +58,11 @@
  #include <linux/kmod.h>
  #endif
  
+#ifdef __alpha__
+/* for /sbin/loader handling in search_binary_handler() */
+#include <linux/a.out.h>
+#endif
+
  int core_uses_pid;
  char core_pattern[CORENAME_MAX_SIZE] = "core";
  int suid_dumpable = 0;
@@ -112,7 +115,7 @@ asmlinkage long sys_uselib(const char __user * library)
                 goto out;
  
         error = -EINVAL;
-       if (!S_ISREG(nd.dentry->d_inode->i_mode))
+       if (!S_ISREG(nd.path.dentry->d_inode->i_mode))
                 goto exit;
  
         error = vfs_permission(&nd, MAY_READ | MAY_EXEC);
@@ -148,7 +151,7 @@ out:
         return error;
  exit:
         release_open_intent(&nd);
-       path_release(&nd);
+       path_put(&nd.path);
         goto out;
  }
  
@@ -173,8 +176,15 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                 return NULL;
  
         if (write) {
-               struct rlimit *rlim = current->signal->rlim;
                 unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
+               struct rlimit *rlim;
+
+               /*
+                * We've historically supported up to 32 pages (ARG_MAX)
+                * of argument strings even with small stacks
+                */
+               if (size <= ARG_MAX)
+                       return page;
  
                 /*
                  * Limit to 1/4-th the stack size for the argv+env strings.
@@ -183,6 +193,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                  *  - the program will have a reasonable amount of stack left
                  *    to work from.
                  */
+               rlim = current->signal->rlim;
                 if (size > rlim[RLIMIT_STACK].rlim_cur / 4) {
                         put_page(page);
                         return NULL;
@@ -528,7 +539,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                 /*
                  * when the old and new regions overlap clear from new_end.
                  */
-               free_pgd_range(&tlb, new_end, old_end, new_end,
+               free_pgd_range(tlb, new_end, old_end, new_end,
                         vma->vm_next ? vma->vm_next->vm_start : 0);
         } else {
                 /*
@@ -537,7 +548,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                  * have constraints on va-space that make this illegal (IA64) -
                  * for the others its just a little faster.
                  */
-               free_pgd_range(&tlb, old_start, old_end, new_end,
+               free_pgd_range(tlb, old_start, old_end, new_end,
                         vma->vm_next ? vma->vm_next->vm_start : 0);
         }
         tlb_finish_mmu(tlb, new_end, old_end);
@@ -597,7 +608,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
         bprm->exec -= stack_shift;
  
         down_write(&mm->mmap_sem);
-       vm_flags = vma->vm_flags;
+       vm_flags = VM_STACK_FLAGS;
  
         /*
          * Adjust stack execute permissions; explicitly enable for
@@ -652,7 +663,7 @@ struct file *open_exec(const char *name)
         file = ERR_PTR(err);
  
         if (!err) {
-               struct inode *inode = nd.dentry->d_inode;
+               struct inode *inode = nd.path.dentry->d_inode;
                 file = ERR_PTR(-EACCES);
                 if (S_ISREG(inode->i_mode)) {
                         int err = vfs_permission(&nd, MAY_EXEC);
@@ -672,7 +683,7 @@ out:
                         }
                 }
                 release_open_intent(&nd);
-               path_release(&nd);
+               path_put(&nd.path);
         }
         goto out;
  }
@@ -711,12 +722,10 @@ static int exec_mmap(struct mm_struct *mm)
                  * Make sure that if there is a core dump in progress
                  * for the old mm, we get out and die instead of going
                  * through with the exec.  We must hold mmap_sem around
-                * checking core_waiters and changing tsk->mm.  The
-                * core-inducing thread will increment core_waiters for
-                * each thread whose ->mm == old_mm.
+                * checking core_state and changing tsk->mm.
                  */
                 down_read(&old_mm->mmap_sem);
-               if (unlikely(old_mm->core_waiters)) {
+               if (unlikely(old_mm->core_state)) {
                         up_read(&old_mm->mmap_sem);
                         return -EINTR;
                 }
@@ -727,6 +736,7 @@ static int exec_mmap(struct mm_struct *mm)
         tsk->active_mm = mm;
         activate_mm(active_mm, mm);
         task_unlock(tsk);
+       mm_update_next_owner(old_mm);
         arch_pick_mmap_layout(mm);
         if (old_mm) {
                 up_read(&old_mm->mmap_sem);
@@ -757,9 +767,7 @@ static int de_thread(struct task_struct *tsk)
  
         /*
          * Kill all other threads in the thread group.
-        * We must hold tasklist_lock to call zap_other_threads.
          */
-       read_lock(&tasklist_lock);
         spin_lock_irq(lock);
         if (signal_group_exit(sig)) {
                 /*
@@ -767,21 +775,10 @@ static int de_thread(struct task_struct *tsk)
                  * return so that the signal is processed.
                  */
                 spin_unlock_irq(lock);
-               read_unlock(&tasklist_lock);
                 return -EAGAIN;
         }
-
-       /*
-        * child_reaper ignores SIGKILL, change it now.
-        * Reparenting needs write_lock on tasklist_lock,
-        * so it is safe to do it under read_lock.
-        */
-       if (unlikely(tsk->group_leader == task_child_reaper(tsk)))
-               task_active_pid_ns(tsk)->child_reaper = tsk;
-
         sig->group_exit_task = tsk;
         zap_other_threads(tsk);
-       read_unlock(&tasklist_lock);
  
         /* Account for the thread group leader hanging around: */
         count = thread_group_leader(tsk) ? 1 : 2;
@@ -802,7 +799,7 @@ static int de_thread(struct task_struct *tsk)
         if (!thread_group_leader(tsk)) {
                 leader = tsk->group_leader;
  
-               sig->notify_count = -1;
+               sig->notify_count = -1; /* for exit_notify() */
                 for (;;) {
                         write_lock_irq(&tasklist_lock);
                         if (likely(leader->exit_state))
@@ -812,6 +809,8 @@ static int de_thread(struct task_struct *tsk)
                         schedule();
                 }
  
+               if (unlikely(task_child_reaper(tsk) == leader))
+                       task_active_pid_ns(tsk)->child_reaper = tsk;
                 /*
                  * The only record we have of the real-time age of a
                  * process, regardless of execs it's done, is start_time.
@@ -861,6 +860,7 @@ static int de_thread(struct task_struct *tsk)
  
  no_thread_group:
         exit_itimers(sig);
+       flush_itimer_signals();
         if (leader)
                 release_task(leader);
  
@@ -945,7 +945,6 @@ int flush_old_exec(struct linux_binprm * bprm)
  {
         char * name;
         int i, ch, retval;
-       struct files_struct *files;
         char tcomm[sizeof(current->comm)];
  
         /*
@@ -956,27 +955,18 @@ int flush_old_exec(struct linux_binprm * bprm)
         if (retval)
                 goto out;
  
-       /*
-        * Make sure we have private file handles. Ask the
-        * fork helper to do the work for us and the exit
-        * helper to do the cleanup of the old one.
-        */
-       files = current->files;         /* refcounted so safe to hold */
-       retval = unshare_files();
-       if (retval)
-               goto out;
+       set_mm_exe_file(bprm->mm, bprm->file);
+
         /*
          * Release all of the old mmap stuff
          */
         retval = exec_mmap(bprm->mm);
         if (retval)
-               goto mmap_failed;
+               goto out;
  
         bprm->mm = NULL;                /* We're using it now */
  
         /* This is the point of no return */
-       put_files_struct(files);
-
         current->sas_ss_sp = current->sas_ss_size = 0;
  
         if (current->euid == current->uid && current->egid == current->gid)
@@ -1026,8 +1016,6 @@ int flush_old_exec(struct linux_binprm * bprm)
  
         return 0;
  
-mmap_failed:
-       reset_files_struct(current, files);
  out:
         return retval;
  }
@@ -1167,7 +1155,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
  {
         int try,retval;
         struct linux_binfmt *fmt;
-#if defined(__alpha__) && defined(CONFIG_ARCH_SUPPORTS_AOUT)
+#ifdef __alpha__
         /* handle /sbin/loader.. */
         {
             struct exec * eh = (struct exec *) bprm->buf;
@@ -1264,6 +1252,12 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
  
  EXPORT_SYMBOL(search_binary_handler);
  
+void free_bprm(struct linux_binprm *bprm)
+{
+       free_arg_pages(bprm);
+       kfree(bprm);
+}
+
  /*
   * sys_execve() executes a new program.
   */
@@ -1274,13 +1268,17 @@ int do_execve(char * filename,
  {
         struct linux_binprm *bprm;
         struct file *file;
-       unsigned long env_p;
+       struct files_struct *displaced;
         int retval;
  
+       retval = unshare_files(&displaced);
+       if (retval)
+               goto out_ret;
+
         retval = -ENOMEM;
         bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
         if (!bprm)
-               goto out_ret;
+               goto out_files;
  
         file = open_exec(filename);
         retval = PTR_ERR(file);
@@ -1322,24 +1320,23 @@ int do_execve(char * filename,
         if (retval < 0)
                 goto out;
  
-       env_p = bprm->p;
         retval = copy_strings(bprm->argc, argv, bprm);
         if (retval < 0)
                 goto out;
-       bprm->argv_len = env_p - bprm->p;
  
+       current->flags &= ~PF_KTHREAD;
         retval = search_binary_handler(bprm,regs);
         if (retval >= 0) {
                 /* execve success */
-               free_arg_pages(bprm);
                 security_bprm_free(bprm);
                 acct_update_integrals(current);
-               kfree(bprm);
+               free_bprm(bprm);
+               if (displaced)
+                       put_files_struct(displaced);
                 return retval;
         }
  
  out:
-       free_arg_pages(bprm);
         if (bprm->security)
                 security_bprm_free(bprm);
  
@@ -1353,8 +1350,11 @@ out_file:
                 fput(bprm->file);
         }
  out_kfree:
-       kfree(bprm);
+       free_bprm(bprm);
  
+out_files:
+       if (displaced)
+               reset_files_struct(displaced);
  out_ret:
         return retval;
  }
@@ -1502,9 +1502,10 @@ out:
         return ispipe;
  }
  
-static void zap_process(struct task_struct *start)
+static int zap_process(struct task_struct *start)
  {
         struct task_struct *t;
+       int nr = 0;
  
         start->signal->flags = SIGNAL_GROUP_EXIT;
         start->signal->group_stop_count = 0;
@@ -1512,72 +1513,99 @@ static void zap_process(struct task_struct *start)
         t = start;
         do {
                 if (t != current && t->mm) {
-                       t->mm->core_waiters++;
                         sigaddset(&t->pending.signal, SIGKILL);
                         signal_wake_up(t, 1);
+                       nr++;
                 }
-       } while ((t = next_thread(t)) != start);
+       } while_each_thread(start, t);
+
+       return nr;
  }
  
  static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
-                               int exit_code)
+                               struct core_state *core_state, int exit_code)
  {
         struct task_struct *g, *p;
         unsigned long flags;
-       int err = -EAGAIN;
+       int nr = -EAGAIN;
  
         spin_lock_irq(&tsk->sighand->siglock);
         if (!signal_group_exit(tsk->signal)) {
+               mm->core_state = core_state;
                 tsk->signal->group_exit_code = exit_code;
-               zap_process(tsk);
-               err = 0;
+               nr = zap_process(tsk);
         }
         spin_unlock_irq(&tsk->sighand->siglock);
-       if (err)
-               return err;
+       if (unlikely(nr < 0))
+               return nr;
  
-       if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
+       if (atomic_read(&mm->mm_users) == nr + 1)
                 goto done;
-
+       /*
+        * We should find and kill all tasks which use this mm, and we should
+        * count them correctly into ->nr_threads. We don't take tasklist
+        * lock, but this is safe wrt:
+        *
+        * fork:
+        *      None of the sub-threads can fork after zap_process(leader). All
+        *      processes which were created before this point should be
+        *      visible to zap_threads() because copy_process() adds the new
+        *      process to the tail of init_task.tasks list, and lock/unlock
+        *      of ->siglock provides a memory barrier.
+        *
+        * do_exit:
+        *      The caller holds mm->mmap_sem. This means that the task which
+        *      uses this mm can't pass exit_mm(), so it can't exit or clear
+        *      its ->mm.
+        *
+        * de_thread:
+        *      It does list_replace_rcu(&leader->tasks, &current->tasks);
+        *      we see either the old or the new leader, and either is fine.
+        *      However, it can change p->sighand, so lock_task_sighand(p)
+        *      must be used. Since p->mm != NULL and we hold ->mmap_sem
+        *      it can't fail.
+        *
+        *      Note also that "g" can be the old leader with ->mm == NULL
+        *      and already unhashed and thus removed from ->thread_group.
+        *      This is OK: __unhash_process()->list_del_rcu() does not
+        *      clear the ->next pointer, so we will find the new leader via
+        *      next_thread().
+        */
         rcu_read_lock();
         for_each_process(g) {
                 if (g == tsk->group_leader)
                         continue;
-
+               if (g->flags & PF_KTHREAD)
+                       continue;
                 p = g;
                 do {
                         if (p->mm) {
-                               if (p->mm == mm) {
-                                       /*
-                                        * p->sighand can't disappear, but
-                                        * may be changed by de_thread()
-                                        */
+                               if (unlikely(p->mm == mm)) {
                                         lock_task_sighand(p, &flags);
-                                       zap_process(p);
+                                       nr += zap_process(p);
                                         unlock_task_sighand(p, &flags);
                                 }
                                 break;
                         }
-               } while ((p = next_thread(p)) != g);
+               } while_each_thread(g, p);
         }
         rcu_read_unlock();
  done:
-       return mm->core_waiters;
+       core_state->nr_threads = nr;
+       return nr;
  }
  
  static int coredump_wait(int exit_code)
  {
         struct task_struct *tsk = current;
         struct mm_struct *mm = tsk->mm;
-       struct completion startup_done;
+       struct core_state core_state;
         struct completion *vfork_done;
         int core_waiters;
  
         init_completion(&mm->core_done);
-       init_completion(&startup_done);
-       mm->core_startup_done = &startup_done;
-
-       core_waiters = zap_threads(tsk, mm, exit_code);
+       init_completion(&core_state.startup);
+       core_waiters = zap_threads(tsk, mm, &core_state, exit_code);
         up_write(&mm->mmap_sem);
  
         if (unlikely(core_waiters < 0))
@@ -1594,9 +1622,9 @@ static int coredump_wait(int exit_code)
         }
  
         if (core_waiters)
-               wait_for_completion(&startup_done);
+               wait_for_completion(&core_state.startup);
+       mm->core_state = NULL;
  fail:
-       BUG_ON(mm->core_waiters);
         return core_waiters;
  }
  
@@ -1674,7 +1702,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
         /*
          * If another thread got here first, or we are not dumpable, bail out.
          */
-       if (mm->core_waiters || !get_dumpable(mm)) {
+       if (mm->core_state || !get_dumpable(mm)) {
                 up_write(&mm->mmap_sem);
                 goto fail;
         }