X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Ffork.c;h=7b93da72d4a2331a69438e4ac347a477a70aadcd;hb=30cd324e9787ccc9a5ede59742d5409857550692;hp=31a2bad63a0823189f0f5940595970c4a54e0c34;hpb=6c5f3e7b43300508fe3947ff3cfff0f86043bb57;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/fork.c b/kernel/fork.c index 31a2bad..7b93da7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -22,25 +22,32 @@ #include #include #include +#include +#include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include +#include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -53,6 +60,7 @@ #include #include #include +#include #include #include @@ -73,6 +81,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +DEFINE_TRACE(sched_process_fork); + int nr_processes(void) { int cpu; @@ -90,6 +100,23 @@ int nr_processes(void) static struct kmem_cache *task_struct_cachep; #endif +#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR +static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) +{ +#ifdef CONFIG_DEBUG_STACK_USAGE + gfp_t mask = GFP_KERNEL | __GFP_ZERO; +#else + gfp_t mask = GFP_KERNEL; +#endif + return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); +} + +static inline void free_thread_info(struct thread_info *ti) +{ + free_pages((unsigned long)ti, THREAD_SIZE_ORDER); +} +#endif + /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -113,6 +140,7 @@ void free_task(struct task_struct *tsk) prop_local_destroy_single(&tsk->dirties); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); + ftrace_graph_exit_task(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -132,6 +160,14 @@ void __put_task_struct(struct task_struct *tsk) free_task(tsk); } +/* + * macro override instead of weak attribute 
alias, to workaround + * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. + */ +#ifndef arch_task_cache_init +#define arch_task_cache_init() +#endif + void __init fork_init(unsigned long mempages) { #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR @@ -144,6 +180,9 @@ void __init fork_init(unsigned long mempages) ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); #endif + /* do the arch specific task caches init */ + arch_task_cache_init(); + /* * The default maximum number of threads is set to a safe * value: the thread structures can take up at most half @@ -163,6 +202,13 @@ void __init fork_init(unsigned long mempages) init_task.signal->rlim[RLIMIT_NPROC]; } +int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, + struct task_struct *src) +{ + *dst = *src; + return 0; +} + static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; @@ -181,15 +227,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return NULL; } - *tsk = *orig; + err = arch_dup_task_struct(tsk, orig); + if (err) + goto out; + tsk->stack = ti; err = prop_local_init_single(&tsk->dirties); - if (err) { - free_thread_info(ti); - free_task_struct(tsk); - return NULL; - } + if (err) + goto out; setup_thread_stack(tsk, orig); @@ -205,6 +251,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) #endif tsk->splice_pipe = NULL; return tsk; + +out: + free_thread_info(ti); + free_task_struct(tsk); + return NULL; } #ifdef CONFIG_MMU @@ -256,7 +307,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (!tmp) goto fail_nomem; *tmp = *mpnt; - pol = mpol_copy(vma_policy(mpnt)); + pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; @@ -268,20 +319,31 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = file->f_mapping; 
+ get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - - /* insert tmp into the share list, just after mpnt */ - spin_lock(&file->f_mapping->i_mmap_lock); + spin_lock(&mapping->i_mmap_lock); + if (tmp->vm_flags & VM_SHARED) + mapping->i_mmap_writable++; tmp->vm_truncate_count = mpnt->vm_truncate_count; - flush_dcache_mmap_lock(file->f_mapping); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); - flush_dcache_mmap_unlock(file->f_mapping); - spin_unlock(&file->f_mapping->i_mmap_lock); + flush_dcache_mmap_unlock(mapping); + spin_unlock(&mapping->i_mmap_lock); } /* + * Clear hugetlb-related page reserves for children. This only + * affects MAP_PRIVATE mappings. Faults generated by the child + * are not guaranteed to succeed, even if read-only + */ + if (is_vm_hugetlb_page(tmp)) + reset_vma_resv_huge_pages(tmp); + + /* * Link in the new vma and copy the page table entries. */ *pprev = tmp; @@ -349,7 +411,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : MMF_DUMP_FILTER_DEFAULT; - mm->core_waiters = 0; + mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); @@ -358,14 +420,14 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->ioctx_list = NULL; mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; - mm_init_cgroup(mm, p); + mm_init_owner(mm, p); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; + mmu_notifier_mm_init(mm); return mm; } - mm_free_cgroup(mm); free_mm(mm); return NULL; } @@ -390,12 +452,12 @@ struct mm_struct * mm_alloc(void) * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm. 
*/ -void fastcall __mmdrop(struct mm_struct *mm) +void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); mm_free_pgd(mm); - mm_free_cgroup(mm); destroy_context(mm); + mmu_notifier_mm_destroy(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -410,6 +472,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); exit_mmap(mm); + set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); list_del(&mm->mmlist); @@ -424,7 +487,7 @@ EXPORT_SYMBOL_GPL(mmput); /** * get_task_mm - acquire a reference to the task's mm * - * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning + * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning * this kernel workthread has transiently adopted a user mm with use_mm, * to do its AIO) is not set and if so returns a reference to it, after * bumping up the use count. User must release the mm via mmput() @@ -437,7 +500,7 @@ struct mm_struct *get_task_mm(struct task_struct *task) task_lock(task); mm = task->mm; if (mm) { - if (task->flags & PF_BORROWED_MM) + if (task->flags & PF_KTHREAD) mm = NULL; else atomic_inc(&mm->mm_users); @@ -464,6 +527,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) { struct completion *vfork_done = tsk->vfork_done; + /* Get rid of any futexes when releasing the mm */ +#ifdef CONFIG_FUTEX + if (unlikely(tsk->robust_list)) + exit_robust_list(tsk); +#ifdef CONFIG_COMPAT + if (unlikely(tsk->compat_robust_list)) + compat_exit_robust_list(tsk); +#endif +#endif + /* Get rid of any cached register state */ deactivate_mm(tsk, mm); @@ -498,7 +571,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. 
*/ -static struct mm_struct *dup_mm(struct task_struct *tsk) +struct mm_struct *dup_mm(struct task_struct *tsk) { struct mm_struct *mm, *oldmm = current->mm; int err; @@ -522,6 +595,8 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) if (init_new_context(tsk, mm)) goto fail_nocontext; + dup_mm_exe_file(oldmm, mm); + err = dup_mmap(mm, oldmm); if (err) goto free_pt; @@ -600,17 +675,10 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old) rwlock_init(&fs->lock); fs->umask = old->umask; read_lock(&old->lock); - fs->rootmnt = mntget(old->rootmnt); - fs->root = dget(old->root); - fs->pwdmnt = mntget(old->pwdmnt); - fs->pwd = dget(old->pwd); - if (old->altroot) { - fs->altrootmnt = mntget(old->altrootmnt); - fs->altroot = dget(old->altroot); - } else { - fs->altrootmnt = NULL; - fs->altroot = NULL; - } + fs->root = old->root; + path_get(&old->root); + fs->pwd = old->pwd; + path_get(&old->pwd); read_unlock(&old->lock); } return fs; @@ -635,136 +703,6 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int count_open_files(struct fdtable *fdt) -{ - int size = fdt->max_fds; - int i; - - /* Find the last open fd */ - for (i = size/(8*sizeof(long)); i > 0; ) { - if (fdt->open_fds->fds_bits[--i]) - break; - } - i = (i+1) * 8 * sizeof(long); - return i; -} - -static struct files_struct *alloc_files(void) -{ - struct files_struct *newf; - struct fdtable *fdt; - - newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); - if (!newf) - goto out; - - atomic_set(&newf->count, 1); - - spin_lock_init(&newf->file_lock); - newf->next_fd = 0; - fdt = &newf->fdtab; - fdt->max_fds = NR_OPEN_DEFAULT; - fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; - fdt->open_fds = (fd_set *)&newf->open_fds_init; - fdt->fd = &newf->fd_array[0]; - INIT_RCU_HEAD(&fdt->rcu); - fdt->next = NULL; - rcu_assign_pointer(newf->fdt, fdt); -out: - return newf; -} - -/* - * Allocate a new files structure and copy contents from the - * passed in 
files structure. - * errorp will be valid only when the returned files_struct is NULL. - */ -static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) -{ - struct files_struct *newf; - struct file **old_fds, **new_fds; - int open_files, size, i; - struct fdtable *old_fdt, *new_fdt; - - *errorp = -ENOMEM; - newf = alloc_files(); - if (!newf) - goto out; - - spin_lock(&oldf->file_lock); - old_fdt = files_fdtable(oldf); - new_fdt = files_fdtable(newf); - open_files = count_open_files(old_fdt); - - /* - * Check whether we need to allocate a larger fd array and fd set. - * Note: we're not a clone task, so the open count won't change. - */ - if (open_files > new_fdt->max_fds) { - new_fdt->max_fds = 0; - spin_unlock(&oldf->file_lock); - spin_lock(&newf->file_lock); - *errorp = expand_files(newf, open_files-1); - spin_unlock(&newf->file_lock); - if (*errorp < 0) - goto out_release; - new_fdt = files_fdtable(newf); - /* - * Reacquire the oldf lock and a pointer to its fd table - * who knows it may have a new bigger fd table. We need - * the latest pointer. - */ - spin_lock(&oldf->file_lock); - old_fdt = files_fdtable(oldf); - } - - old_fds = old_fdt->fd; - new_fds = new_fdt->fd; - - memcpy(new_fdt->open_fds->fds_bits, - old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, - old_fdt->close_on_exec->fds_bits, open_files/8); - - for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; - if (f) { - get_file(f); - } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. 
- */ - FD_CLR(open_files - i, new_fdt->open_fds); - } - rcu_assign_pointer(*new_fds++, f); - } - spin_unlock(&oldf->file_lock); - - /* compute the remainder to be cleared */ - size = (new_fdt->max_fds - open_files) * sizeof(struct file *); - - /* This is long word aligned thus could use a optimized version */ - memset(new_fds, 0, size); - - if (new_fdt->max_fds > open_files) { - int left = (new_fdt->max_fds-open_files)/8; - int start = open_files / (8 * sizeof(unsigned long)); - - memset(&new_fdt->open_fds->fds_bits[start], 0, left); - memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); - } - - return newf; - -out_release: - kmem_cache_free(files_cachep, newf); -out: - return NULL; -} - static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { struct files_struct *oldf, *newf; @@ -782,12 +720,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) goto out; } - /* - * Note: we may be using current for both targets (See exec.c) - * This works because we cache current->files (old) as oldf. Don't - * break this. - */ - tsk->files = NULL; newf = dup_fd(oldf, &error); if (!newf) goto out; @@ -823,34 +755,6 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return 0; } -/* - * Helper to unshare the files of the current task. - * We don't want to expose copy_files internals to - * the exec layer of the kernel. 
- */ - -int unshare_files(void) -{ - struct files_struct *files = current->files; - int rc; - - BUG_ON(!files); - - /* This can race but the race causes us to copy when we don't - need to and drop the copy */ - if(atomic_read(&files->count) == 1) - { - atomic_inc(&files->count); - return 0; - } - rc = copy_files(0, current); - if(rc) - current->files = files; - return rc; -} - -EXPORT_SYMBOL(unshare_files); - static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; @@ -874,15 +778,44 @@ void __cleanup_sighand(struct sighand_struct *sighand) kmem_cache_free(sighand_cachep, sighand); } + +/* + * Initialize POSIX timer handling for a thread group. + */ +static void posix_cpu_timers_init_group(struct signal_struct *sig) +{ + /* Thread group counters. */ + thread_group_cputime_init(sig); + + /* Expiration times and increments. */ + sig->it_virt_expires = cputime_zero; + sig->it_virt_incr = cputime_zero; + sig->it_prof_expires = cputime_zero; + sig->it_prof_incr = cputime_zero; + + /* Cached expiration times. */ + sig->cputime_expires.prof_exp = cputime_zero; + sig->cputime_expires.virt_exp = cputime_zero; + sig->cputime_expires.sched_exp = 0; + + /* The timer lists. 
*/ + INIT_LIST_HEAD(&sig->cpu_timers[0]); + INIT_LIST_HEAD(&sig->cpu_timers[1]); + INIT_LIST_HEAD(&sig->cpu_timers[2]); +} + static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; int ret; if (clone_flags & CLONE_THREAD) { - atomic_inc(&current->signal->count); - atomic_inc(&current->signal->live); - return 0; + ret = thread_group_cputime_clone_thread(current); + if (likely(!ret)) { + atomic_inc(&current->signal->count); + atomic_inc(&current->signal->live); + } + return ret; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; @@ -902,7 +835,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->group_exit_code = 0; sig->group_exit_task = NULL; sig->group_stop_count = 0; - sig->curr_target = NULL; + sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); @@ -910,38 +843,25 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; - sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = NULL; + sig->tty = NULL; - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; + sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; - sig->sum_sched_runtime = 0; - INIT_LIST_HEAD(&sig->cpu_timers[0]); - INIT_LIST_HEAD(&sig->cpu_timers[1]); - INIT_LIST_HEAD(&sig->cpu_timers[2]); + task_io_accounting_init(&sig->ioac); taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 
task_unlock(current->group_leader); - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { - /* - * New sole thread in the process gets an expiry time - * of the whole CPU time limit. - */ - tsk->it_prof_expires = - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); - } + posix_cpu_timers_init_group(sig); + acct_init_pacct(&sig->pacct); tty_audit_fork(sig); @@ -951,7 +871,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_signal(struct signal_struct *sig) { + thread_group_cputime_free(sig); exit_thread_group_keys(sig); + tty_kref_put(sig->tty); kmem_cache_free(signal_cachep, sig); } @@ -971,8 +893,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags &= ~PF_SUPERPRIV; new_flags |= PF_FORKNOEXEC; - if (!(clone_flags & CLONE_PTRACE)) - p->ptrace = 0; + new_flags |= PF_STARTING; p->flags = new_flags; clear_freeze_flag(p); } @@ -993,6 +914,26 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } +#ifdef CONFIG_MM_OWNER +void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ + mm->owner = p; +} +#endif /* CONFIG_MM_OWNER */ + +/* + * Initialize POSIX timer handling for a single task. + */ +static void posix_cpu_timers_init(struct task_struct *tsk) +{ + tsk->cputime_expires.prof_exp = cputime_zero; + tsk->cputime_expires.virt_exp = cputime_zero; + tsk->cputime_expires.sched_exp = 0; + INIT_LIST_HEAD(&tsk->cpu_timers[0]); + INIT_LIST_HEAD(&tsk->cpu_timers[1]); + INIT_LIST_HEAD(&tsk->cpu_timers[2]); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. 
@@ -1006,7 +947,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, - struct pid *pid) + struct pid *pid, + int trace) { int retval; struct task_struct *p; @@ -1041,7 +983,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif @@ -1094,26 +1036,17 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; + p->default_timer_slack_ns = current->timer_slack_ns; + #ifdef CONFIG_DETECT_SOFTLOCKUP p->last_switch_count = 0; p->last_switch_timestamp = 0; #endif -#ifdef CONFIG_TASK_XACCT - p->rchar = 0; /* I/O counter: bytes read */ - p->wchar = 0; /* I/O counter: bytes written */ - p->syscr = 0; /* I/O counter: read syscalls */ - p->syscw = 0; /* I/O counter: write syscalls */ -#endif - task_io_accounting_init(p); + task_io_accounting_init(&p->ioac); acct_clear_integrals(p); - p->it_virt_expires = cputime_zero; - p->it_prof_expires = cputime_zero; - p->it_sched_expires = 0; - INIT_LIST_HEAD(&p->cpu_timers[0]); - INIT_LIST_HEAD(&p->cpu_timers[1]); - INIT_LIST_HEAD(&p->cpu_timers[2]); + posix_cpu_timers_init(p); p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); @@ -1127,7 +1060,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->audit_context = NULL; cgroup_fork(p); #ifdef CONFIG_NUMA - p->mempolicy = mpol_copy(p->mempolicy); + p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; @@ -1207,11 +1140,19 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } + ftrace_graph_init_task(p); + p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) p->tgid = 
current->tgid; + if (current->nsproxy != p->nsproxy) { + retval = ns_cgroup_clone(p, pid); + if (retval) + goto bad_fork_free_graph; + } + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? @@ -1256,8 +1197,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); - INIT_LIST_HEAD(&p->ptrace_children); - INIT_LIST_HEAD(&p->ptrace_list); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible @@ -1288,7 +1227,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_parent = current->real_parent; else p->real_parent = current; - p->parent = p->real_parent; spin_lock(&current->sighand->siglock); @@ -1305,40 +1243,25 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_free_pid; + goto bad_fork_free_graph; } if (clone_flags & CLONE_THREAD) { p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); - - if (!cputime_eq(current->signal->it_virt_expires, - cputime_zero) || - !cputime_eq(current->signal->it_prof_expires, - cputime_zero) || - current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || - !list_empty(&current->signal->cpu_timers[0]) || - !list_empty(&current->signal->cpu_timers[1]) || - !list_empty(&current->signal->cpu_timers[2])) { - /* - * Have child wake up on its first tick to check - * for process CPU timers. 
- */ - p->it_prof_expires = jiffies_to_cputime(1); - } } if (likely(p->pid)) { - add_parent(p); - if (unlikely(p->ptrace & PT_PTRACED)) - __ptrace_link(p, current->parent); + list_add_tail(&p->sibling, &p->real_parent->children); + tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { if (clone_flags & CLONE_NEWPID) p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; - p->signal->tty = current->signal->tty; + tty_kref_put(p->signal->tty); + p->signal->tty = tty_kref_get(current->signal->tty); set_task_pgrp(p, task_pgrp_nr(current)); set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); @@ -1357,6 +1280,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, cgroup_post_fork(p); return p; +bad_fork_free_graph: + ftrace_graph_exit_task(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); @@ -1385,7 +1310,7 @@ bad_fork_cleanup_security: security_task_free(p); bad_fork_cleanup_policy: #ifdef CONFIG_NUMA - mpol_free(p->mempolicy); + mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); @@ -1416,29 +1341,13 @@ struct task_struct * __cpuinit fork_idle(int cpu) struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, - &init_struct_pid); + &init_struct_pid, 0); if (!IS_ERR(task)) init_idle(task, cpu); return task; } -static int fork_traceflag(unsigned clone_flags) -{ - if (clone_flags & CLONE_UNTRACED) - return 0; - else if (clone_flags & CLONE_VFORK) { - if (current->ptrace & PT_TRACE_VFORK) - return PTRACE_EVENT_VFORK; - } else if ((clone_flags & CSIGNAL) != SIGCHLD) { - if (current->ptrace & PT_TRACE_CLONE) - return PTRACE_EVENT_CLONE; - } else if (current->ptrace & PT_TRACE_FORK) - return PTRACE_EVENT_FORK; - - return 0; -} - /* * Ok, this is the main fork-routine. 
* @@ -1473,14 +1382,14 @@ long do_fork(unsigned long clone_flags, } } - if (unlikely(current->ptrace)) { - trace = fork_traceflag (clone_flags); - if (trace) - clone_flags |= CLONE_PTRACE; - } + /* + * When called from kernel_thread, don't do user tracing stuff. + */ + if (likely(user_mode(regs))) + trace = tracehook_prepare_clone(clone_flags); p = copy_process(clone_flags, stack_start, regs, stack_size, - child_tidptr, NULL); + child_tidptr, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1488,6 +1397,8 @@ long do_fork(unsigned long clone_flags, if (!IS_ERR(p)) { struct completion vfork; + trace_sched_process_fork(current, p); + nr = task_pid_vnr(p); if (clone_flags & CLONE_PARENT_SETTID) @@ -1498,32 +1409,36 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { + audit_finish_fork(p); + tracehook_report_clone(trace, regs, clone_flags, nr, p); + + /* + * We set PF_STARTING at creation in case tracing wants to + * use this to distinguish a fully live task from one that + * hasn't gotten to tracehook_report_clone() yet. Now we + * clear it and set the child going. + */ + p->flags &= ~PF_STARTING; + + if (unlikely(clone_flags & CLONE_STOPPED)) { /* * We'll start up with an immediate SIGSTOP. 
*/ sigaddset(&p->pending.signal, SIGSTOP); set_tsk_thread_flag(p, TIF_SIGPENDING); - } - - if (!(clone_flags & CLONE_STOPPED)) - wake_up_new_task(p, clone_flags); - else __set_task_state(p, TASK_STOPPED); - - if (unlikely (trace)) { - current->ptrace_message = nr; - ptrace_notify ((trace << 8) | SIGTRAP); + } else { + wake_up_new_task(p, clone_flags); } + tracehook_report_clone_complete(trace, regs, + clone_flags, nr, p); + if (clone_flags & CLONE_VFORK) { freezer_do_not_count(); wait_for_completion(&vfork); freezer_count(); - if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { - current->ptrace_message = nr; - ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); - } + tracehook_report_vfork_done(p, nr); } } else { nr = PTR_ERR(p); @@ -1535,7 +1450,7 @@ long do_fork(unsigned long clone_flags, #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -static void sighand_ctor(struct kmem_cache *cachep, void *data) +static void sighand_ctor(void *data) { struct sighand_struct *sighand = data; @@ -1675,18 +1590,6 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp } /* - * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not - * supported yet - */ -static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp) -{ - if (unshare_flags & CLONE_SYSVSEM) - return -EINVAL; - - return 0; -} - -/* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. 
copy_* * functions used by do_fork() cannot be used here directly @@ -1701,8 +1604,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; - struct sem_undo_list *new_ulist = NULL; struct nsproxy *new_nsproxy = NULL; + int do_sysvsem = 0; check_unshare_flags(&unshare_flags); @@ -1714,6 +1617,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) CLONE_NEWNET)) goto bad_unshare_out; + /* + * CLONE_NEWIPC must also detach from the undolist: after switching + * to a new ipc namespace, the semaphore arrays from the old + * namespace are unreachable. + */ + if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) + do_sysvsem = 1; if ((err = unshare_thread(unshare_flags))) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) @@ -1724,13 +1634,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_cleanup_sigh; if ((err = unshare_fd(unshare_flags, &new_fd))) goto bad_unshare_cleanup_vm; - if ((err = unshare_semundo(unshare_flags, &new_ulist))) - goto bad_unshare_cleanup_fd; if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) - goto bad_unshare_cleanup_semundo; + goto bad_unshare_cleanup_fd; - if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { + if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { + if (do_sysvsem) { + /* + * CLONE_SYSVSEM is equivalent to sys_exit(). + */ + exit_sem(current); + } if (new_nsproxy) { switch_task_namespaces(current, new_nsproxy); @@ -1766,7 +1680,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) if (new_nsproxy) put_nsproxy(new_nsproxy); -bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); @@ -1788,3 +1701,27 @@ bad_unshare_cleanup_thread: bad_unshare_out: return err; } + +/* + * Helper to unshare the files of the current task. 
+ * We don't want to expose copy_files internals to + * the exec layer of the kernel. + */ + +int unshare_files(struct files_struct **displaced) +{ + struct task_struct *task = current; + struct files_struct *copy = NULL; + int error; + + error = unshare_fd(CLONE_FILES, &copy); + if (error || !copy) { + *displaced = NULL; + return error; + } + *displaced = task->files; + task_lock(task); + task->files = copy; + task_unlock(task); + return 0; +}