X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Ffork.c;h=7b93da72d4a2331a69438e4ac347a477a70aadcd;hb=30cd324e9787ccc9a5ede59742d5409857550692;hp=a65bfc47177ce7e6c3f6bc68fa7a3b3e1a70dbf2;hpb=73a2bcb0edb9ffb0b007b3546b430e2c6e415eee;p=safe%2Fjmp%2Flinux-2.6

diff --git a/kernel/fork.c b/kernel/fork.c
index a65bfc4..7b93da7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -22,24 +22,32 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
+#include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -51,6 +59,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 
@@ -71,6 +81,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
+DEFINE_TRACE(sched_process_fork);
+
 int nr_processes(void)
 {
 	int cpu;
@@ -88,6 +100,23 @@ int nr_processes(void)
 static struct kmem_cache *task_struct_cachep;
 #endif
 
+#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+{
+#ifdef CONFIG_DEBUG_STACK_USAGE
+	gfp_t mask = GFP_KERNEL | __GFP_ZERO;
+#else
+	gfp_t mask = GFP_KERNEL;
+#endif
+	return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+}
+
+static inline void free_thread_info(struct thread_info *ti)
+{
+	free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+}
+#endif
+
 /* SLAB cache for signal_struct structures (tsk->signal) */
 static struct kmem_cache *signal_cachep;
 
@@ -111,6 +140,7 @@ void free_task(struct task_struct *tsk)
 	prop_local_destroy_single(&tsk->dirties);
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
+	ftrace_graph_exit_task(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
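
The alloc_thread_info() fallback added above sizes a kernel stack as 2^THREAD_SIZE_ORDER contiguous pages and zeroes the area only when CONFIG_DEBUG_STACK_USAGE wants to watermark stack usage. A minimal userspace sketch of that order-based allocation; the constants and the stand-in for __get_free_pages() are illustrative, not kernel API:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define THREAD_SIZE_ORDER 1		/* 2^1 = 2 pages, as on several arches */

static void *get_free_pages_sim(int zero, unsigned int order)
{
	size_t size = PAGE_SIZE << order;	/* order n means 2^n pages */
	void *p = aligned_alloc(PAGE_SIZE, size);

	if (p && zero)
		memset(p, 0, size);		/* the __GFP_ZERO analogue */
	return p;
}

int main(void)
{
	void *stack = get_free_pages_sim(1, THREAD_SIZE_ORDER);

	printf("allocated %lu bytes of stack\n", PAGE_SIZE << THREAD_SIZE_ORDER);
	free(stack);
	return 0;
}
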
@@ -130,6 +160,14 @@ void __put_task_struct(struct task_struct *tsk)
 	free_task(tsk);
 }
 
+/*
+ * macro override instead of weak attribute alias, to work around
+ * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
+ */
+#ifndef arch_task_cache_init
+#define arch_task_cache_init()
+#endif
+
 void __init fork_init(unsigned long mempages)
 {
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
@@ -142,6 +180,9 @@ void __init fork_init(unsigned long mempages)
 			ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
 #endif
 
+	/* do the arch specific task caches init */
+	arch_task_cache_init();
+
 	/*
 	 * The default maximum number of threads is set to a safe
 	 * value: the thread structures can take up at most half
@@ -161,6 +202,13 @@ void __init fork_init(unsigned long mempages)
 		init_task.signal->rlim[RLIMIT_NPROC];
 }
 
+int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
+					       struct task_struct *src)
+{
+	*dst = *src;
+	return 0;
+}
+
 static struct task_struct *dup_task_struct(struct task_struct *orig)
 {
 	struct task_struct *tsk;
@@ -179,15 +227,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 		return NULL;
 	}
 
-	*tsk = *orig;
+	err = arch_dup_task_struct(tsk, orig);
+	if (err)
+		goto out;
+
 	tsk->stack = ti;
 
 	err = prop_local_init_single(&tsk->dirties);
-	if (err) {
-		free_thread_info(ti);
-		free_task_struct(tsk);
-		return NULL;
-	}
+	if (err)
+		goto out;
 
 	setup_thread_stack(tsk, orig);
@@ -203,6 +251,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 #endif
 	tsk->splice_pipe = NULL;
 	return tsk;
+
+out:
+	free_thread_info(ti);
+	free_task_struct(tsk);
+	return NULL;
 }
 
 #ifdef CONFIG_MMU
@@ -254,7 +307,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
-		pol = mpol_copy(vma_policy(mpnt));
+		pol = mpol_dup(vma_policy(mpnt));
 		retval = PTR_ERR(pol);
 		if (IS_ERR(pol))
 			goto fail_nomem_policy;
@@ -266,20 +319,31 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		file = tmp->vm_file;
 		if (file) {
 			struct inode *inode = file->f_path.dentry->d_inode;
+			struct address_space *mapping = file->f_mapping;
+
 			get_file(file);
 			if (tmp->vm_flags & VM_DENYWRITE)
 				atomic_dec(&inode->i_writecount);
-
-			/* insert tmp into the share list, just after mpnt */
-			spin_lock(&file->f_mapping->i_mmap_lock);
+			spin_lock(&mapping->i_mmap_lock);
+			if (tmp->vm_flags & VM_SHARED)
+				mapping->i_mmap_writable++;
 			tmp->vm_truncate_count = mpnt->vm_truncate_count;
-			flush_dcache_mmap_lock(file->f_mapping);
+			flush_dcache_mmap_lock(mapping);
+			/* insert tmp into the share list, just after mpnt */
 			vma_prio_tree_add(tmp, mpnt);
-			flush_dcache_mmap_unlock(file->f_mapping);
-			spin_unlock(&file->f_mapping->i_mmap_lock);
+			flush_dcache_mmap_unlock(mapping);
+			spin_unlock(&mapping->i_mmap_lock);
 		}
 
 		/*
+		 * Clear hugetlb-related page reserves for children. This only
+		 * affects MAP_PRIVATE mappings. Faults generated by the child
+		 * are not guaranteed to succeed, even if read-only
+		 */
+		if (is_vm_hugetlb_page(tmp))
+			reset_vma_resv_huge_pages(tmp);
+
+		/*
 		 * Link in the new vma and copy the page table entries.
 		 */
 		*pprev = tmp;
@@ -324,7 +388,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm)
 
 static inline void mm_free_pgd(struct mm_struct * mm)
 {
-	pgd_free(mm->pgd);
+	pgd_free(mm, mm->pgd);
 }
 #else
 #define dup_mmap(mm, oldmm)	(0)
@@ -339,7 +403,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
 #include
 
-static struct mm_struct * mm_init(struct mm_struct * mm)
+static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
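
arch_dup_task_struct() above is a weak default: an architecture that needs extra copying can define a strong symbol of the same name and the linker prefers it, while everyone else falls back to the plain structure copy. A standalone sketch of the pattern, with illustrative types rather than kernel API:

#include <stdio.h>

struct task { int flags; };

/* generic fallback; a strong definition elsewhere would override it */
int __attribute__((weak)) arch_dup_task(struct task *dst, const struct task *src)
{
	*dst = *src;
	return 0;
}

int main(void)
{
	struct task parent = { 42 }, child = { 0 };

	if (arch_dup_task(&child, &parent) == 0)
		printf("child.flags = %d\n", child.flags);
	return 0;
}

The companion arch_task_cache_init() uses a macro override instead of a weak alias because, as the comment in the hunk notes, empty weak functions miscompiled on gcc 4.1.0 and 4.1.1.
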
@@ -347,7 +411,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->flags = (current->mm) ?
 		current->mm->flags : MMF_DUMP_FILTER_DEFAULT;
-	mm->core_waiters = 0;
+	mm->core_state = NULL;
 	mm->nr_ptes = 0;
 	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
@@ -356,11 +420,14 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	mm->ioctx_list = NULL;
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
+	mm_init_owner(mm, p);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
+		mmu_notifier_mm_init(mm);
 		return mm;
 	}
+
 	free_mm(mm);
 	return NULL;
 }
@@ -375,7 +442,7 @@ struct mm_struct * mm_alloc(void)
 	mm = allocate_mm();
 	if (mm) {
 		memset(mm, 0, sizeof(*mm));
-		mm = mm_init(mm);
+		mm = mm_init(mm, current);
 	}
 	return mm;
 }
@@ -385,13 +452,15 @@ struct mm_struct * mm_alloc(void)
  * is dropped: either by a lazy thread or by
  * mmput. Free the page directory and the mm.
  */
-void fastcall __mmdrop(struct mm_struct *mm)
+void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
 	mm_free_pgd(mm);
 	destroy_context(mm);
+	mmu_notifier_mm_destroy(mm);
 	free_mm(mm);
 }
+EXPORT_SYMBOL_GPL(__mmdrop);
 
 /*
  * Decrement the use count and release all resources for an mm.
@@ -403,6 +472,7 @@ void mmput(struct mm_struct *mm)
 	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
 		exit_mmap(mm);
+		set_mm_exe_file(mm, NULL);
 		if (!list_empty(&mm->mmlist)) {
 			spin_lock(&mmlist_lock);
 			list_del(&mm->mmlist);
@@ -417,7 +487,7 @@ EXPORT_SYMBOL_GPL(mmput);
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
- * Returns %NULL if the task has no mm.  Checks PF_BORROWED_MM (meaning
+ * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
  * this kernel workthread has transiently adopted a user mm with use_mm,
 * to do its AIO) is not set and if so returns a reference to it, after
 * bumping up the use count.  User must release the mm via mmput()
@@ -430,7 +500,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
 	task_lock(task);
 	mm = task->mm;
 	if (mm) {
-		if (task->flags & PF_BORROWED_MM)
+		if (task->flags & PF_KTHREAD)
 			mm = NULL;
 		else
 			atomic_inc(&mm->mm_users);
@@ -457,6 +527,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 {
 	struct completion *vfork_done = tsk->vfork_done;
 
+	/* Get rid of any futexes when releasing the mm */
+#ifdef CONFIG_FUTEX
+	if (unlikely(tsk->robust_list))
+		exit_robust_list(tsk);
+#ifdef CONFIG_COMPAT
+	if (unlikely(tsk->compat_robust_list))
+		compat_exit_robust_list(tsk);
+#endif
+#endif
+
 	/* Get rid of any cached register state */
 	deactivate_mm(tsk, mm);
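
mmput() and the newly exported __mmdrop() above are both "last reference frees" functions built on atomic_dec_and_test(). A small C11 sketch of that idiom, with _sim names standing in for the kernel types:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct mm_sim { atomic_int users; };

static void mm_get(struct mm_sim *mm)
{
	atomic_fetch_add(&mm->users, 1);
}

static void mm_put(struct mm_sim *mm)
{
	/* fetch_sub returns the old value; 1 means we dropped the last ref */
	if (atomic_fetch_sub(&mm->users, 1) == 1) {
		printf("last user gone, freeing\n");
		free(mm);
	}
}

int main(void)
{
	struct mm_sim *mm = malloc(sizeof(*mm));

	atomic_init(&mm->users, 1);
	mm_get(mm);
	mm_put(mm);	/* one reference remains */
	mm_put(mm);	/* frees */
	return 0;
}
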
@@ -491,7 +571,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 * Allocate a new mm structure and copy contents from the
 * mm structure of the passed in task structure.
 */
-static struct mm_struct *dup_mm(struct task_struct *tsk)
+struct mm_struct *dup_mm(struct task_struct *tsk)
 {
 	struct mm_struct *mm, *oldmm = current->mm;
 	int err;
@@ -509,12 +589,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	mm->token_priority = 0;
 	mm->last_interval = 0;
 
-	if (!mm_init(mm))
+	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
 	if (init_new_context(tsk, mm))
 		goto fail_nocontext;
 
+	dup_mm_exe_file(oldmm, mm);
+
 	err = dup_mmap(mm, oldmm);
 	if (err)
 		goto free_pt;
@@ -593,17 +675,10 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
 		rwlock_init(&fs->lock);
 		fs->umask = old->umask;
 		read_lock(&old->lock);
-		fs->rootmnt = mntget(old->rootmnt);
-		fs->root = dget(old->root);
-		fs->pwdmnt = mntget(old->pwdmnt);
-		fs->pwd = dget(old->pwd);
-		if (old->altroot) {
-			fs->altrootmnt = mntget(old->altrootmnt);
-			fs->altroot = dget(old->altroot);
-		} else {
-			fs->altrootmnt = NULL;
-			fs->altroot = NULL;
-		}
+		fs->root = old->root;
+		path_get(&old->root);
+		fs->pwd = old->pwd;
+		path_get(&old->pwd);
 		read_unlock(&old->lock);
 	}
 	return fs;
 }
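
The reworked __copy_fs_struct() above switches from separate mntget()/dget() calls to copying a struct path by value and then pinning each half with path_get(). A sketch of that copy-then-take-a-reference step, with illustrative _sim types:

#include <stdatomic.h>
#include <stdio.h>

struct obj { atomic_int count; };
struct path_sim { struct obj *mnt, *dentry; };

static void path_get_sim(const struct path_sim *p)
{
	atomic_fetch_add(&p->mnt->count, 1);
	atomic_fetch_add(&p->dentry->count, 1);
}

int main(void)
{
	struct obj m, d;
	struct path_sim root, copy;

	atomic_init(&m.count, 1);
	atomic_init(&d.count, 1);
	root.mnt = &m;
	root.dentry = &d;

	copy = root;		/* duplicate the {mnt, dentry} pair by value */
	path_get_sim(&copy);	/* then take one reference on each object */
	printf("mnt refs=%d dentry refs=%d\n",
	       atomic_load(&m.count), atomic_load(&d.count));
	return 0;
}
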
- */ - spin_lock(&oldf->file_lock); - old_fdt = files_fdtable(oldf); - } - - old_fds = old_fdt->fd; - new_fds = new_fdt->fd; - - memcpy(new_fdt->open_fds->fds_bits, - old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, - old_fdt->close_on_exec->fds_bits, open_files/8); - - for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; - if (f) { - get_file(f); - } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. - */ - FD_CLR(open_files - i, new_fdt->open_fds); - } - rcu_assign_pointer(*new_fds++, f); - } - spin_unlock(&oldf->file_lock); - - /* compute the remainder to be cleared */ - size = (new_fdt->max_fds - open_files) * sizeof(struct file *); - - /* This is long word aligned thus could use a optimized version */ - memset(new_fds, 0, size); - - if (new_fdt->max_fds > open_files) { - int left = (new_fdt->max_fds-open_files)/8; - int start = open_files / (8 * sizeof(unsigned long)); - - memset(&new_fdt->open_fds->fds_bits[start], 0, left); - memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); - } - - return newf; - -out_release: - kmem_cache_free(files_cachep, newf); -out: - return NULL; -} - static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { struct files_struct *oldf, *newf; @@ -775,12 +720,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) goto out; } - /* - * Note: we may be using current for both targets (See exec.c) - * This works because we cache current->files (old) as oldf. Don't - * break this. - */ - tsk->files = NULL; newf = dup_fd(oldf, &error); if (!newf) goto out; @@ -791,34 +730,31 @@ out: return error; } -/* - * Helper to unshare the files of the current task. - * We don't want to expose copy_files internals to - * the exec layer of the kernel. - */ - -int unshare_files(void) +static int copy_io(unsigned long clone_flags, struct task_struct *tsk) { - struct files_struct *files = current->files; - int rc; +#ifdef CONFIG_BLOCK + struct io_context *ioc = current->io_context; - BUG_ON(!files); - - /* This can race but the race causes us to copy when we don't - need to and drop the copy */ - if(atomic_read(&files->count) == 1) - { - atomic_inc(&files->count); + if (!ioc) return 0; + /* + * Share io context with parent, if CLONE_IO is set + */ + if (clone_flags & CLONE_IO) { + tsk->io_context = ioc_task_link(ioc); + if (unlikely(!tsk->io_context)) + return -ENOMEM; + } else if (ioprio_valid(ioc->ioprio)) { + tsk->io_context = alloc_io_context(GFP_KERNEL, -1); + if (unlikely(!tsk->io_context)) + return -ENOMEM; + + tsk->io_context->ioprio = ioc->ioprio; } - rc = copy_files(0, current); - if(rc) - current->files = files; - return rc; +#endif + return 0; } -EXPORT_SYMBOL(unshare_files); - static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; @@ -842,15 +778,44 @@ void __cleanup_sighand(struct sighand_struct *sighand) kmem_cache_free(sighand_cachep, sighand); } + +/* + * Initialize POSIX timer handling for a thread group. + */ +static void posix_cpu_timers_init_group(struct signal_struct *sig) +{ + /* Thread group counters. */ + thread_group_cputime_init(sig); + + /* Expiration times and increments. 
@@ -842,15 +778,44 @@ void __cleanup_sighand(struct sighand_struct *sighand)
 		kmem_cache_free(sighand_cachep, sighand);
 }
 
+
+/*
+ * Initialize POSIX timer handling for a thread group.
+ */
+static void posix_cpu_timers_init_group(struct signal_struct *sig)
+{
+	/* Thread group counters. */
+	thread_group_cputime_init(sig);
+
+	/* Expiration times and increments. */
+	sig->it_virt_expires = cputime_zero;
+	sig->it_virt_incr = cputime_zero;
+	sig->it_prof_expires = cputime_zero;
+	sig->it_prof_incr = cputime_zero;
+
+	/* Cached expiration times. */
+	sig->cputime_expires.prof_exp = cputime_zero;
+	sig->cputime_expires.virt_exp = cputime_zero;
+	sig->cputime_expires.sched_exp = 0;
+
+	/* The timer lists. */
+	INIT_LIST_HEAD(&sig->cpu_timers[0]);
+	INIT_LIST_HEAD(&sig->cpu_timers[1]);
+	INIT_LIST_HEAD(&sig->cpu_timers[2]);
+}
+
 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 {
 	struct signal_struct *sig;
 	int ret;
 
 	if (clone_flags & CLONE_THREAD) {
-		atomic_inc(&current->signal->count);
-		atomic_inc(&current->signal->live);
-		return 0;
+		ret = thread_group_cputime_clone_thread(current);
+		if (likely(!ret)) {
+			atomic_inc(&current->signal->count);
+			atomic_inc(&current->signal->live);
+		}
+		return ret;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
 	tsk->signal = sig;
@@ -870,47 +835,33 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->group_exit_code = 0;
 	sig->group_exit_task = NULL;
 	sig->group_stop_count = 0;
-	sig->curr_target = NULL;
+	sig->curr_target = tsk;
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
 
 	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	sig->it_real_incr.tv64 = 0;
 	sig->real_timer.function = it_real_fn;
-	sig->tsk = tsk;
-
-	sig->it_virt_expires = cputime_zero;
-	sig->it_virt_incr = cputime_zero;
-	sig->it_prof_expires = cputime_zero;
-	sig->it_prof_incr = cputime_zero;
 
 	sig->leader = 0;	/* session leadership doesn't inherit */
 	sig->tty_old_pgrp = NULL;
+	sig->tty = NULL;
 
-	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+	sig->cutime = sig->cstime = cputime_zero;
 	sig->gtime = cputime_zero;
 	sig->cgtime = cputime_zero;
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
-	sig->sum_sched_runtime = 0;
-	INIT_LIST_HEAD(&sig->cpu_timers[0]);
-	INIT_LIST_HEAD(&sig->cpu_timers[1]);
-	INIT_LIST_HEAD(&sig->cpu_timers[2]);
+	task_io_accounting_init(&sig->ioac);
 	taskstats_tgid_init(sig);
 
 	task_lock(current->group_leader);
 	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
 	task_unlock(current->group_leader);
 
-	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
-		/*
-		 * New sole thread in the process gets an expiry time
-		 * of the whole CPU time limit.
-		 */
-		tsk->it_prof_expires =
-			secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
-	}
+	posix_cpu_timers_init_group(sig);
+
 	acct_init_pacct(&sig->pacct);
 
 	tty_audit_fork(sig);
@@ -920,7 +871,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 
 void __cleanup_signal(struct signal_struct *sig)
 {
+	thread_group_cputime_free(sig);
 	exit_thread_group_keys(sig);
+	tty_kref_put(sig->tty);
 	kmem_cache_free(signal_cachep, sig);
 }
 
@@ -940,8 +893,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 
 	new_flags &= ~PF_SUPERPRIV;
 	new_flags |= PF_FORKNOEXEC;
-	if (!(clone_flags & CLONE_PTRACE))
-		p->ptrace = 0;
+	new_flags |= PF_STARTING;
 	p->flags = new_flags;
 	clear_freeze_flag(p);
 }
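
copy_flags() above is simple mask-and-set surgery on the inherited flag word: the child never inherits PF_SUPERPRIV, is marked PF_FORKNOEXEC, and, new in this patch, carries PF_STARTING until do_fork() finishes setting it up. A sketch using flag values from kernels of this era (treat the constants as illustrative):

#include <stdio.h>

#define PF_STARTING	0x00000002	/* being created */
#define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
#define PF_SUPERPRIV	0x00000100	/* used super-user privileges */

static unsigned long copy_flags_sim(unsigned long parent_flags)
{
	unsigned long new_flags = parent_flags;

	new_flags &= ~PF_SUPERPRIV;	/* privilege use is not inherited */
	new_flags |= PF_FORKNOEXEC;
	new_flags |= PF_STARTING;	/* not fully set up yet */
	return new_flags;
}

int main(void)
{
	printf("child flags: 0x%08lx\n", copy_flags_sim(PF_SUPERPRIV));
	return 0;
}
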
+ */ +static void posix_cpu_timers_init(struct task_struct *tsk) +{ + tsk->cputime_expires.prof_exp = cputime_zero; + tsk->cputime_expires.virt_exp = cputime_zero; + tsk->cputime_expires.sched_exp = 0; + INIT_LIST_HEAD(&tsk->cpu_timers[0]); + INIT_LIST_HEAD(&tsk->cpu_timers[1]); + INIT_LIST_HEAD(&tsk->cpu_timers[2]); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -975,7 +947,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, - struct pid *pid) + struct pid *pid, + int trace) { int retval; struct task_struct *p; @@ -1010,7 +983,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif @@ -1045,6 +1018,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); +#ifdef CONFIG_PREEMPT_RCU + p->rcu_read_lock_nesting = 0; + p->rcu_flipctr_idx = 0; +#endif /* #ifdef CONFIG_PREEMPT_RCU */ p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); @@ -1057,22 +1034,19 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utimescaled = cputime_zero; p->stimescaled = cputime_zero; p->prev_utime = cputime_zero; + p->prev_stime = cputime_zero; -#ifdef CONFIG_TASK_XACCT - p->rchar = 0; /* I/O counter: bytes read */ - p->wchar = 0; /* I/O counter: bytes written */ - p->syscr = 0; /* I/O counter: read syscalls */ - p->syscw = 0; /* I/O counter: write syscalls */ + p->default_timer_slack_ns = current->timer_slack_ns; + +#ifdef CONFIG_DETECT_SOFTLOCKUP + p->last_switch_count = 0; + p->last_switch_timestamp = 0; #endif - task_io_accounting_init(p); + + task_io_accounting_init(&p->ioac); acct_clear_integrals(p); - p->it_virt_expires = cputime_zero; - p->it_prof_expires = cputime_zero; - p->it_sched_expires = 0; - INIT_LIST_HEAD(&p->cpu_timers[0]); - INIT_LIST_HEAD(&p->cpu_timers[1]); - INIT_LIST_HEAD(&p->cpu_timers[2]); + posix_cpu_timers_init(p); p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); @@ -1081,11 +1055,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_SECURITY p->security = NULL; #endif + p->cap_bset = current->cap_bset; p->io_context = NULL; p->audit_context = NULL; cgroup_fork(p); #ifdef CONFIG_NUMA - p->mempolicy = mpol_copy(p->mempolicy); + p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; @@ -1122,6 +1097,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->blocked_on = NULL; /* not blocked yet */ #endif + /* Perform scheduler related setup. Assign this task to a CPU. 
@@ -1122,6 +1097,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
 
+	/* Perform scheduler related setup. Assign this task to a CPU. */
+	sched_fork(p, clone_flags);
+
 	if ((retval = security_task_alloc(p)))
 		goto bad_fork_cleanup_policy;
 	if ((retval = audit_alloc(p)))
@@ -1143,15 +1121,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_mm;
 	if ((retval = copy_namespaces(clone_flags, p)))
 		goto bad_fork_cleanup_keys;
+	if ((retval = copy_io(clone_flags, p)))
+		goto bad_fork_cleanup_namespaces;
 	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
 	if (retval)
-		goto bad_fork_cleanup_namespaces;
+		goto bad_fork_cleanup_io;
 
 	if (pid != &init_struct_pid) {
 		retval = -ENOMEM;
 		pid = alloc_pid(task_active_pid_ns(p));
 		if (!pid)
-			goto bad_fork_cleanup_namespaces;
+			goto bad_fork_cleanup_io;
 
 		if (clone_flags & CLONE_NEWPID) {
 			retval = pid_ns_prepare_proc(task_active_pid_ns(p));
@@ -1160,11 +1140,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		}
 	}
 
+	ftrace_graph_init_task(p);
+
 	p->pid = pid_nr(pid);
 	p->tgid = p->pid;
 	if (clone_flags & CLONE_THREAD)
 		p->tgid = current->tgid;
 
+	if (current->nsproxy != p->nsproxy) {
+		retval = ns_cgroup_clone(p, pid);
+		if (retval)
+			goto bad_fork_free_graph;
+	}
+
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
 	 * Clear TID on mm_release()?
@@ -1192,6 +1180,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
 #endif
+	clear_all_latency_tracing(p);
 
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
@@ -1208,11 +1197,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 */
 	p->group_leader = p;
 	INIT_LIST_HEAD(&p->thread_group);
-	INIT_LIST_HEAD(&p->ptrace_children);
-	INIT_LIST_HEAD(&p->ptrace_list);
-
-	/* Perform scheduler related setup. Assign this task to a CPU. */
-	sched_fork(p, clone_flags);
 
 	/* Now that the task is set up, run cgroup callbacks if
 	 * necessary. We need to run them before the task is visible
@@ -1223,9 +1207,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
-	/* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
-	p->ioprio = current->ioprio;
-
 	/*
 	 * The task hasn't been attached yet, so its cpus_allowed mask will
 	 * not be changed, nor will its assigned CPU.
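
Everything from write_lock_irq(&tasklist_lock) above until the child becomes visible runs with the writer side held: attaching a task mutates the global list, while mere observers share the reader side. A pthread sketch of the same reader/writer discipline (the kernel's rwlock_t additionally disables interrupts, which has no userspace analogue):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t tasklist = PTHREAD_RWLOCK_INITIALIZER;
static int nr_tasks;

static void attach_task(void)
{
	pthread_rwlock_wrlock(&tasklist);	/* exclusive: list is mutated */
	nr_tasks++;
	pthread_rwlock_unlock(&tasklist);
}

static int count_tasks(void)
{
	int n;

	pthread_rwlock_rdlock(&tasklist);	/* shared: readers coexist */
	n = nr_tasks;
	pthread_rwlock_unlock(&tasklist);
	return n;
}

int main(void)
{
	attach_task();
	printf("%d task(s)\n", count_tasks());
	return 0;
}
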
@@ -1236,6 +1217,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * parent's CPU). This avoids alot of nasty races.
 	 */
 	p->cpus_allowed = current->cpus_allowed;
+	p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
 	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
 			!cpu_online(task_cpu(p))))
 		set_task_cpu(p, smp_processor_id());
@@ -1245,7 +1227,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		p->real_parent = current->real_parent;
 	else
 		p->real_parent = current;
-	p->parent = p->real_parent;
 
 	spin_lock(&current->sighand->siglock);
 
@@ -1262,52 +1243,29 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		spin_unlock(&current->sighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
-		goto bad_fork_free_pid;
+		goto bad_fork_free_graph;
 	}
 
 	if (clone_flags & CLONE_THREAD) {
 		p->group_leader = current->group_leader;
 		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
-
-		if (!cputime_eq(current->signal->it_virt_expires,
-				cputime_zero) ||
-		    !cputime_eq(current->signal->it_prof_expires,
-				cputime_zero) ||
-		    current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
-		    !list_empty(&current->signal->cpu_timers[0]) ||
-		    !list_empty(&current->signal->cpu_timers[1]) ||
-		    !list_empty(&current->signal->cpu_timers[2])) {
-			/*
-			 * Have child wake up on its first tick to check
-			 * for process CPU timers.
-			 */
-			p->it_prof_expires = jiffies_to_cputime(1);
-		}
 	}
 
 	if (likely(p->pid)) {
-		add_parent(p);
-		if (unlikely(p->ptrace & PT_PTRACED))
-			__ptrace_link(p, current->parent);
+		list_add_tail(&p->sibling, &p->real_parent->children);
+		tracehook_finish_clone(p, clone_flags, trace);
 
 		if (thread_group_leader(p)) {
-			if (clone_flags & CLONE_NEWPID) {
+			if (clone_flags & CLONE_NEWPID)
 				p->nsproxy->pid_ns->child_reaper = p;
-				p->signal->tty = NULL;
-				set_task_pgrp(p, p->pid);
-				set_task_session(p, p->pid);
-				attach_pid(p, PIDTYPE_PGID, pid);
-				attach_pid(p, PIDTYPE_SID, pid);
-			} else {
-				p->signal->tty = current->signal->tty;
-				set_task_pgrp(p, task_pgrp_nr(current));
-				set_task_session(p, task_session_nr(current));
-				attach_pid(p, PIDTYPE_PGID,
-						task_pgrp(current));
-				attach_pid(p, PIDTYPE_SID,
-						task_session(current));
-			}
+
+			p->signal->leader_pid = pid;
+			tty_kref_put(p->signal->tty);
+			p->signal->tty = tty_kref_get(current->signal->tty);
+			set_task_pgrp(p, task_pgrp_nr(current));
+			set_task_session(p, task_session_nr(current));
+			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
+			attach_pid(p, PIDTYPE_SID, task_session(current));
 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
 			__get_cpu_var(process_counts)++;
 		}
@@ -1322,9 +1280,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	cgroup_post_fork(p);
 	return p;
 
+bad_fork_free_graph:
+	ftrace_graph_exit_task(p);
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
+bad_fork_cleanup_io:
+	put_io_context(p->io_context);
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_keys:
@@ -1348,7 +1310,7 @@ bad_fork_cleanup_security:
 	security_task_free(p);
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
-	mpol_free(p->mempolicy);
+	mpol_put(p->mempolicy);
 bad_fork_cleanup_cgroup:
 #endif
 	cgroup_exit(p, cgroup_callbacks_done);
@@ -1367,7 +1329,7 @@ fork_out:
 	return ERR_PTR(retval);
 }
 
-noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
+noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
 {
 	memset(regs, 0, sizeof(struct pt_regs));
 	return regs;
 }
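
The new bad_fork_free_graph and bad_fork_cleanup_io labels slot into copy_process()'s existing unwind ladder: each label releases one resource and falls through to the labels below it, so a failure at any depth unwinds exactly what was acquired. A compact sketch of the idiom with hypothetical resources:

#include <stdlib.h>

struct child { void *mm, *io, *ns; };

static struct child *make_child(void)
{
	struct child *c = calloc(1, sizeof(*c));

	if (!c)
		return NULL;
	c->mm = malloc(8);
	if (!c->mm)
		goto bad_free_child;
	c->io = malloc(8);
	if (!c->io)
		goto bad_cleanup_mm;
	c->ns = malloc(8);
	if (!c->ns)
		goto bad_cleanup_io;	/* deepest failure unwinds everything */
	return c;

bad_cleanup_io:
	free(c->io);
bad_cleanup_mm:
	free(c->mm);
bad_free_child:
	free(c);
	return NULL;
}

int main(void)
{
	struct child *c = make_child();

	if (c) {
		free(c->ns);
		free(c->io);
		free(c->mm);
		free(c);
	}
	return 0;
}
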
@@ -1379,29 +1341,13 @@ struct task_struct * __cpuinit fork_idle(int cpu)
 	struct pt_regs regs;
 
 	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
-			&init_struct_pid);
+			&init_struct_pid, 0);
 	if (!IS_ERR(task))
 		init_idle(task, cpu);
 	return task;
 }
 
-static int fork_traceflag(unsigned clone_flags)
-{
-	if (clone_flags & CLONE_UNTRACED)
-		return 0;
-	else if (clone_flags & CLONE_VFORK) {
-		if (current->ptrace & PT_TRACE_VFORK)
-			return PTRACE_EVENT_VFORK;
-	} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
-		if (current->ptrace & PT_TRACE_CLONE)
-			return PTRACE_EVENT_CLONE;
-	} else if (current->ptrace & PT_TRACE_FORK)
-		return PTRACE_EVENT_FORK;
-
-	return 0;
-}
-
 /*
  * Ok, this is the main fork-routine.
  *
@@ -1419,14 +1365,31 @@ long do_fork(unsigned long clone_flags,
 	int trace = 0;
 	long nr;
 
-	if (unlikely(current->ptrace)) {
-		trace = fork_traceflag (clone_flags);
-		if (trace)
-			clone_flags |= CLONE_PTRACE;
+	/*
+	 * We hope to recycle these flags after 2.6.26
+	 */
+	if (unlikely(clone_flags & CLONE_STOPPED)) {
+		static int __read_mostly count = 100;
+
+		if (count > 0 && printk_ratelimit()) {
+			char comm[TASK_COMM_LEN];
+
+			count--;
+			printk(KERN_INFO "fork(): process `%s' used deprecated "
+					"clone flags 0x%lx\n",
+				get_task_comm(comm, current),
+				clone_flags & CLONE_STOPPED);
+		}
 	}
 
+	/*
+	 * When called from kernel_thread, don't do user tracing stuff.
+	 */
+	if (likely(user_mode(regs)))
+		trace = tracehook_prepare_clone(clone_flags);
+
 	p = copy_process(clone_flags, stack_start, regs, stack_size,
-			child_tidptr, NULL);
+			child_tidptr, NULL, trace);
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
 	 */
@@ -1434,13 +1397,9 @@ long do_fork(unsigned long clone_flags,
 	if (!IS_ERR(p)) {
 		struct completion vfork;
 
-		/*
-		 * this is enough to call pid_nr_ns here, but this if
-		 * improves optimisation of regular fork()
-		 */
-		nr = (clone_flags & CLONE_NEWPID) ?
-			task_pid_nr_ns(p, current->nsproxy->pid_ns) :
-				task_pid_vnr(p);
+		trace_sched_process_fork(current, p);
+
+		nr = task_pid_vnr(p);
 
 		if (clone_flags & CLONE_PARENT_SETTID)
 			put_user(nr, parent_tidptr);
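
The CLONE_STOPPED block added above caps its nagging two ways: printk_ratelimit() spaces messages out in time, and a static countdown stops them for good after 100 hits. A sketch of that bounded-warning pattern; the rate limiter is stubbed so the example stands alone, and the flag value is the one kernels of this era used:

#include <stdio.h>

static int printk_ratelimit_sim(void)
{
	return 1;	/* a real limiter would suppress bursts */
}

static void warn_deprecated(const char *comm, unsigned long flags)
{
	static int count = 100;		/* lifetime budget of warnings */

	if (count > 0 && printk_ratelimit_sim()) {
		count--;
		fprintf(stderr, "fork(): process `%s' used deprecated "
			"clone flags 0x%lx\n", comm, flags);
	}
}

int main(void)
{
	warn_deprecated("legacy-app", 0x02000000UL);	/* CLONE_STOPPED */
	return 0;
}
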
@@ -1450,32 +1409,36 @@ long do_fork(unsigned long clone_flags,
 			init_completion(&vfork);
 		}
 
-		if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
+		audit_finish_fork(p);
+		tracehook_report_clone(trace, regs, clone_flags, nr, p);
+
+		/*
+		 * We set PF_STARTING at creation in case tracing wants to
+		 * use this to distinguish a fully live task from one that
+		 * hasn't gotten to tracehook_report_clone() yet.  Now we
+		 * clear it and set the child going.
+		 */
+		p->flags &= ~PF_STARTING;
+
+		if (unlikely(clone_flags & CLONE_STOPPED)) {
 			/*
 			 * We'll start up with an immediate SIGSTOP.
 			 */
 			sigaddset(&p->pending.signal, SIGSTOP);
 			set_tsk_thread_flag(p, TIF_SIGPENDING);
-		}
-
-		if (!(clone_flags & CLONE_STOPPED))
+			__set_task_state(p, TASK_STOPPED);
+		} else {
 			wake_up_new_task(p, clone_flags);
-		else
-			p->state = TASK_STOPPED;
-
-		if (unlikely (trace)) {
-			current->ptrace_message = nr;
-			ptrace_notify ((trace << 8) | SIGTRAP);
 		}
 
+		tracehook_report_clone_complete(trace, regs,
+						clone_flags, nr, p);
+
 		if (clone_flags & CLONE_VFORK) {
 			freezer_do_not_count();
 			wait_for_completion(&vfork);
 			freezer_count();
-			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
-				current->ptrace_message = nr;
-				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
-			}
+			tracehook_report_vfork_done(p, nr);
 		}
 	} else {
 		nr = PTR_ERR(p);
@@ -1487,7 +1450,7 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN	0
 #endif
 
-static void sighand_ctor(struct kmem_cache *cachep, void *data)
+static void sighand_ctor(void *data)
 {
 	struct sighand_struct *sighand = data;
 
@@ -1627,18 +1590,6 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
 }
 
 /*
- * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
- * supported yet
- */
-static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
-{
-	if (unshare_flags & CLONE_SYSVSEM)
-		return -EINVAL;
-
-	return 0;
-}
-
-/*
  * unshare allows a process to 'unshare' part of the process
  * context which was originally shared using clone.  copy_*
  * functions used by do_fork() cannot be used here directly
@@ -1653,8 +1604,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	struct sighand_struct *new_sigh = NULL;
 	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
 	struct files_struct *fd, *new_fd = NULL;
-	struct sem_undo_list *new_ulist = NULL;
 	struct nsproxy *new_nsproxy = NULL;
+	int do_sysvsem = 0;
 
 	check_unshare_flags(&unshare_flags);
 
@@ -1666,6 +1617,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 				CLONE_NEWNET))
 		goto bad_unshare_out;
 
+	/*
+	 * CLONE_NEWIPC must also detach from the undolist: after switching
+	 * to a new ipc namespace, the semaphore arrays from the old
+	 * namespace are unreachable.
+	 */
+	if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
+		do_sysvsem = 1;
 	if ((err = unshare_thread(unshare_flags)))
 		goto bad_unshare_out;
 	if ((err = unshare_fs(unshare_flags, &new_fs)))
@@ -1676,13 +1634,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		goto bad_unshare_cleanup_sigh;
 	if ((err = unshare_fd(unshare_flags, &new_fd)))
 		goto bad_unshare_cleanup_vm;
-	if ((err = unshare_semundo(unshare_flags, &new_ulist)))
-		goto bad_unshare_cleanup_fd;
 	if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
 			new_fs)))
-		goto bad_unshare_cleanup_semundo;
+		goto bad_unshare_cleanup_fd;
 
-	if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) {
+	if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
+		if (do_sysvsem) {
+			/*
+			 * CLONE_SYSVSEM is equivalent to sys_exit().
+			 */
+			exit_sem(current);
+		}
 		if (new_nsproxy) {
 			switch_task_namespaces(current, new_nsproxy);
@@ -1718,7 +1680,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	if (new_nsproxy)
 		put_nsproxy(new_nsproxy);
 
-bad_unshare_cleanup_semundo:
 bad_unshare_cleanup_fd:
 	if (new_fd)
 		put_files_struct(new_fd);
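
CLONE_VFORK above still parks the parent on a completion until the child releases its borrowed mm; only the ptrace notification around it became tracehook_report_vfork_done(). A condition-variable sketch of what struct completion provides; the _sim names are stand-ins, not kernel API (build with -pthread):

#include <pthread.h>
#include <stdio.h>

struct completion_sim {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

static void complete_sim(struct completion_sim *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion_sim(struct completion_sim *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void *child_fn(void *arg)
{
	complete_sim(arg);	/* "vfork_done": child is done with the mm */
	return NULL;
}

int main(void)
{
	struct completion_sim vfork = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
	};
	pthread_t t;

	pthread_create(&t, NULL, child_fn, &vfork);
	wait_for_completion_sim(&vfork);	/* parent blocks, as in vfork() */
	pthread_join(t, NULL);
	puts("child signalled completion");
	return 0;
}
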
+ */ + +int unshare_files(struct files_struct **displaced) +{ + struct task_struct *task = current; + struct files_struct *copy = NULL; + int error; + + error = unshare_fd(CLONE_FILES, ©); + if (error || !copy) { + *displaced = NULL; + return error; + } + *displaced = task->files; + task_lock(task); + task->files = copy; + task_unlock(task); + return 0; +}