X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Fexec.c;h=6501823b7a1bd80e870abedc263151d887551941;hb=5e43aef530ba39206f7923295388f7ec3c5a7d93;hp=434dba778ccc761befe61c26a62f26037993ba32;hpb=cdd6c482c9ff9c55475ee7392ec8f672eddb7be6;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/exec.c b/fs/exec.c index 434dba7..6501823 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -55,6 +54,7 @@ #include #include #include +#include #include #include @@ -63,6 +63,7 @@ int core_uses_pid; char core_pattern[CORENAME_MAX_SIZE] = "core"; +unsigned int core_pipe_limit; int suid_dumpable = 0; /* The maximal length of core_pattern is also specified in sysctl.c */ @@ -194,7 +195,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * to work from. */ rlim = current->signal->rlim; - if (size > rlim[RLIMIT_STACK].rlim_cur / 4) { + if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) { put_page(page); return NULL; } @@ -241,10 +242,12 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ + BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS; + vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); if (err) goto err; @@ -515,7 +518,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL); + if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) + return -ENOMEM; /* * move the page tables downwards, on failure we rely on @@ -546,15 +550,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) tlb_finish_mmu(tlb, new_end, old_end); /* - * shrink the vma to just the new range. + * Shrink the vma to just the new range. Always succeeds. */ vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); return 0; } -#define EXTRA_STACK_VM_PAGES 20 /* random */ - /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. @@ -570,10 +572,13 @@ int setup_arg_pages(struct linux_binprm *bprm, struct vm_area_struct *prev = NULL; unsigned long vm_flags; unsigned long stack_base; + unsigned long stack_size; + unsigned long stack_expand; + unsigned long rlim_stack; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size to 1GB */ - stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max; + stack_base = rlimit_max(RLIMIT_STACK); if (stack_base > (1 << 30)) stack_base = 1 << 30; @@ -612,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm, else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; + vm_flags |= VM_STACK_INCOMPLETE_SETUP; ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags); @@ -622,16 +628,30 @@ int setup_arg_pages(struct linux_binprm *bprm, /* Move stack pages down in memory. */ if (stack_shift) { ret = shift_arg_pages(vma, stack_shift); - if (ret) { - up_write(&mm->mmap_sem); - return ret; - } + if (ret) + goto out_unlock; } + /* mprotect_fixup is overkill to remove the temporary stack flags */ + vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + + stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ + stack_size = vma->vm_end - vma->vm_start; + /* + * Align this down to a page boundary as expand_stack + * will align it up. + */ + rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; #ifdef CONFIG_STACK_GROWSUP - stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE; + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_start + rlim_stack; + else + stack_base = vma->vm_end + stack_expand; #else - stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE; + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_end - rlim_stack; + else + stack_base = vma->vm_start - stack_expand; #endif ret = expand_stack(vma, stack_base); if (ret) @@ -639,7 +659,7 @@ int setup_arg_pages(struct linux_binprm *bprm, out_unlock: up_write(&mm->mmap_sem); - return 0; + return ret; } EXPORT_SYMBOL(setup_arg_pages); @@ -703,6 +723,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; + sync_mm_rss(tsk, old_mm); mm_release(tsk, old_mm); if (old_mm) { @@ -827,7 +848,9 @@ static int de_thread(struct task_struct *tsk) attach_pid(tsk, PIDTYPE_PID, task_pid(leader)); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); + list_replace_rcu(&leader->tasks, &tsk->tasks); + list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; @@ -845,6 +868,9 @@ static int de_thread(struct task_struct *tsk) sig->notify_count = 0; no_thread_group: + if (current->mm) + setmax_mm_hiwater_rss(&sig->maxrss, current->mm); + exit_itimers(sig); flush_itimer_signals(); @@ -921,6 +947,15 @@ char *get_task_comm(char *buf, struct task_struct *tsk) void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); + + /* + * Threads may access current->comm without holding + * the task lock, so write the string carefully. + * Readers without a lock may see incomplete new + * names but are safe from non-terminating string reads. + */ + memset(tsk->comm, 0, TASK_COMM_LEN); + wmb(); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk); @@ -928,9 +963,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) int flush_old_exec(struct linux_binprm * bprm) { - char * name; - int i, ch, retval; - char tcomm[sizeof(current->comm)]; + int retval; /* * Make sure we have a private signal table and that @@ -951,6 +984,25 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ + current->flags &= ~PF_RANDOMIZE; + flush_thread(); + current->personality &= ~bprm->per_clear; + + return 0; + +out: + return retval; +} +EXPORT_SYMBOL(flush_old_exec); + +void setup_new_exec(struct linux_binprm * bprm) +{ + int i, ch; + char * name; + char tcomm[sizeof(current->comm)]; + + arch_pick_mmap_layout(current->mm); + /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; @@ -972,9 +1024,6 @@ int flush_old_exec(struct linux_binprm * bprm) tcomm[i] = '\0'; set_task_comm(current, tcomm); - current->flags &= ~PF_RANDOMIZE; - flush_thread(); - /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc @@ -990,8 +1039,6 @@ int flush_old_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); } - current->personality &= ~bprm->per_clear; - /* * Flush performance counters when crossing a * security domain: @@ -1006,14 +1053,8 @@ int flush_old_exec(struct linux_binprm * bprm) flush_signal_handlers(current, 0); flush_old_files(current->files); - - return 0; - -out: - return retval; } - -EXPORT_SYMBOL(flush_old_exec); +EXPORT_SYMBOL(setup_new_exec); /* * Prepare credentials and lock ->cred_guard_mutex. @@ -1206,9 +1247,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) retval = security_bprm_check(bprm); if (retval) return retval; - retval = ima_bprm_check(bprm); - if (retval) - return retval; /* kernel module loader fixup */ /* so we don't try to load run modprobe in kernel space. */ @@ -1388,18 +1426,16 @@ out_ret: return retval; } -int set_binfmt(struct linux_binfmt *new) +void set_binfmt(struct linux_binfmt *new) { - struct linux_binfmt *old = current->binfmt; + struct mm_struct *mm = current->mm; - if (new) { - if (!try_module_get(new->module)) - return -1; - } - current->binfmt = new; - if (old) - module_put(old->module); - return 0; + if (mm->binfmt) + module_put(mm->binfmt->module); + + mm->binfmt = new; + if (new) + __module_get(new->module); } EXPORT_SYMBOL(set_binfmt); @@ -1500,7 +1536,7 @@ static int format_corename(char *corename, long signr) /* core limit size */ case 'c': rc = snprintf(out_ptr, out_end - out_ptr, - "%lu", current->signal->rlim[RLIMIT_CORE].rlim_cur); + "%lu", rlimit(RLIMIT_CORE)); if (rc > out_end - out_ptr) goto out; out_ptr += rc; @@ -1528,12 +1564,13 @@ out: return ispipe; } -static int zap_process(struct task_struct *start) +static int zap_process(struct task_struct *start, int exit_code) { struct task_struct *t; int nr = 0; start->signal->flags = SIGNAL_GROUP_EXIT; + start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; t = start; @@ -1558,8 +1595,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, spin_lock_irq(&tsk->sighand->siglock); if (!signal_group_exit(tsk->signal)) { mm->core_state = core_state; - tsk->signal->group_exit_code = exit_code; - nr = zap_process(tsk); + nr = zap_process(tsk, exit_code); } spin_unlock_irq(&tsk->sighand->siglock); if (unlikely(nr < 0)) @@ -1608,7 +1644,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (p->mm) { if (unlikely(p->mm == mm)) { lock_task_sighand(p, &flags); - nr += zap_process(p); + nr += zap_process(p, exit_code); unlock_task_sighand(p, &flags); } break; @@ -1715,52 +1751,127 @@ void set_dumpable(struct mm_struct *mm, int value) } } -int get_dumpable(struct mm_struct *mm) +static int __get_dumpable(unsigned long mm_flags) { int ret; - ret = mm->flags & 0x3; + ret = mm_flags & MMF_DUMPABLE_MASK; return (ret >= 2) ? 2 : ret; } +int get_dumpable(struct mm_struct *mm) +{ + return __get_dumpable(mm->flags); +} + +static void wait_for_dump_helpers(struct file *file) +{ + struct pipe_inode_info *pipe; + + pipe = file->f_path.dentry->d_inode->i_pipe; + + pipe_lock(pipe); + pipe->readers++; + pipe->writers--; + + while ((pipe->readers > 1) && (!signal_pending(current))) { + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + pipe_wait(pipe); + } + + pipe->readers--; + pipe->writers++; + pipe_unlock(pipe); + +} + + +/* + * uhm_pipe_setup + * helper function to customize the process used + * to collect the core in userspace. Specifically + * it sets up a pipe and installs it as fd 0 (stdin) + * for the process. Returns 0 on success, or + * PTR_ERR on failure. + * Note that it also sets the core limit to 1. This + * is a special value that we use to trap recursive + * core dumps + */ +static int umh_pipe_setup(struct subprocess_info *info) +{ + struct file *rp, *wp; + struct fdtable *fdt; + struct coredump_params *cp = (struct coredump_params *)info->data; + struct files_struct *cf = current->files; + + wp = create_write_pipe(0); + if (IS_ERR(wp)) + return PTR_ERR(wp); + + rp = create_read_pipe(wp, 0); + if (IS_ERR(rp)) { + free_write_pipe(wp); + return PTR_ERR(rp); + } + + cp->file = wp; + + sys_close(0); + fd_install(0, rp); + spin_lock(&cf->file_lock); + fdt = files_fdtable(cf); + FD_SET(0, fdt->open_fds); + FD_CLR(0, fdt->close_on_exec); + spin_unlock(&cf->file_lock); + + /* and disallow core files too */ + current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; + + return 0; +} + void do_coredump(long signr, int exit_code, struct pt_regs *regs) { struct core_state core_state; char corename[CORENAME_MAX_SIZE + 1]; struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; - struct inode * inode; - struct file * file; const struct cred *old_cred; struct cred *cred; int retval = 0; int flag = 0; - int ispipe = 0; - unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; - char **helper_argv = NULL; - int helper_argc = 0; - char *delimit; + int ispipe; + static atomic_t core_dump_count = ATOMIC_INIT(0); + struct coredump_params cprm = { + .signr = signr, + .regs = regs, + .limit = rlimit(RLIMIT_CORE), + /* + * We must use the same mm->flags while dumping core to avoid + * inconsistency of bit flags, since this flag is not protected + * by any locks. + */ + .mm_flags = mm->flags, + }; audit_core_dumps(signr); - binfmt = current->binfmt; + binfmt = mm->binfmt; if (!binfmt || !binfmt->core_dump) goto fail; cred = prepare_creds(); - if (!cred) { - retval = -ENOMEM; + if (!cred) goto fail; - } down_write(&mm->mmap_sem); /* * If another thread got here first, or we are not dumpable, bail out. */ - if (mm->core_state || !get_dumpable(mm)) { + if (mm->core_state || !__get_dumpable(cprm.mm_flags)) { up_write(&mm->mmap_sem); - put_cred(cred); - goto fail; + goto fail_creds; } /* @@ -1768,16 +1879,15 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) * process nor do we know its entire history. We only know it * was tainted so we dump it as root in mode 2. */ - if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ + if (__get_dumpable(cprm.mm_flags) == 2) { + /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ cred->fsuid = 0; /* Dump root private */ } retval = coredump_wait(exit_code, &core_state); - if (retval < 0) { - put_cred(cred); - goto fail; - } + if (retval < 0) + goto fail_creds; old_cred = override_creds(cred); @@ -1794,90 +1904,110 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) lock_kernel(); ispipe = format_corename(corename, signr); unlock_kernel(); - /* - * Don't bother to check the RLIMIT_CORE value if core_pattern points - * to a pipe. Since we're not writing directly to the filesystem - * RLIMIT_CORE doesn't really apply, as no actual core file will be - * created unless the pipe reader choses to write out the core file - * at which point file size limits and permissions will be imposed - * as it does with any other process - */ - if ((!ispipe) && (core_limit < binfmt->min_coredump)) - goto fail_unlock; if (ispipe) { - helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); - if (!helper_argv) { - printk(KERN_WARNING "%s failed to allocate memory\n", - __func__); + int dump_count; + char **helper_argv; + + if (cprm.limit == 1) { + /* + * Normally core limits are irrelevant to pipes, since + * we're not writing to the file system, but we use + * cprm.limit of 1 here as a speacial value. Any + * non-1 limit gets set to RLIM_INFINITY below, but + * a limit of 0 skips the dump. This is a consistent + * way to catch recursive crashes. We can still crash + * if the core_pattern binary sets RLIM_CORE = !1 + * but it runs as root, and can do lots of stupid things + * Note that we use task_tgid_vnr here to grab the pid + * of the process group leader. That way we get the + * right pid if a thread in a multi-threaded + * core_pattern process dies. + */ + printk(KERN_WARNING + "Process %d(%s) has RLIMIT_CORE set to 1\n", + task_tgid_vnr(current), current->comm); + printk(KERN_WARNING "Aborting core\n"); goto fail_unlock; } - /* Terminate the string before the first option */ - delimit = strchr(corename, ' '); - if (delimit) - *delimit = '\0'; - delimit = strrchr(helper_argv[0], '/'); - if (delimit) - delimit++; - else - delimit = helper_argv[0]; - if (!strcmp(delimit, current->comm)) { - printk(KERN_NOTICE "Recursive core dump detected, " - "aborting\n"); - goto fail_unlock; + cprm.limit = RLIM_INFINITY; + + dump_count = atomic_inc_return(&core_dump_count); + if (core_pipe_limit && (core_pipe_limit < dump_count)) { + printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n", + task_tgid_vnr(current), current->comm); + printk(KERN_WARNING "Skipping core dump\n"); + goto fail_dropcount; } - core_limit = RLIM_INFINITY; + helper_argv = argv_split(GFP_KERNEL, corename+1, NULL); + if (!helper_argv) { + printk(KERN_WARNING "%s failed to allocate memory\n", + __func__); + goto fail_dropcount; + } - /* SIGPIPE can happen, but it's just never processed */ - if (call_usermodehelper_pipe(corename+1, helper_argv, NULL, - &file)) { + retval = call_usermodehelper_fns(helper_argv[0], helper_argv, + NULL, UMH_WAIT_EXEC, umh_pipe_setup, + NULL, &cprm); + argv_free(helper_argv); + if (retval) { printk(KERN_INFO "Core dump to %s pipe failed\n", corename); - goto fail_unlock; + goto close_fail; } - } else - file = filp_open(corename, + } else { + struct inode *inode; + + if (cprm.limit < binfmt->min_coredump) + goto fail_unlock; + + cprm.file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); - if (IS_ERR(file)) - goto fail_unlock; - inode = file->f_path.dentry->d_inode; - if (inode->i_nlink > 1) - goto close_fail; /* multiple links - don't dump */ - if (!ispipe && d_unhashed(file->f_path.dentry)) - goto close_fail; - - /* AK: actually i see no reason to not allow this for named pipes etc., - but keep the previous behaviour for now. */ - if (!ispipe && !S_ISREG(inode->i_mode)) - goto close_fail; - /* - * Dont allow local users get cute and trick others to coredump - * into their pre-created files: - */ - if (inode->i_uid != current_fsuid()) - goto close_fail; - if (!file->f_op) - goto close_fail; - if (!file->f_op->write) - goto close_fail; - if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0) - goto close_fail; + if (IS_ERR(cprm.file)) + goto fail_unlock; - retval = binfmt->core_dump(signr, regs, file, core_limit); + inode = cprm.file->f_path.dentry->d_inode; + if (inode->i_nlink > 1) + goto close_fail; + if (d_unhashed(cprm.file->f_path.dentry)) + goto close_fail; + /* + * AK: actually i see no reason to not allow this for named + * pipes etc, but keep the previous behaviour for now. + */ + if (!S_ISREG(inode->i_mode)) + goto close_fail; + /* + * Dont allow local users get cute and trick others to coredump + * into their pre-created files. + */ + if (inode->i_uid != current_fsuid()) + goto close_fail; + if (!cprm.file->f_op || !cprm.file->f_op->write) + goto close_fail; + if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) + goto close_fail; + } + retval = binfmt->core_dump(&cprm); if (retval) current->signal->group_exit_code |= 0x80; + + if (ispipe && core_pipe_limit) + wait_for_dump_helpers(cprm.file); close_fail: - filp_close(file, NULL); + if (cprm.file) + filp_close(cprm.file, NULL); +fail_dropcount: + if (ispipe) + atomic_dec(&core_dump_count); fail_unlock: - if (helper_argv) - argv_free(helper_argv); - + coredump_finish(mm); revert_creds(old_cred); +fail_creds: put_cred(cred); - coredump_finish(mm); fail: return; }