X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fsys.c;h=421009cedb51fc97a43b66333299e5217b4bc57f;hb=12b5989be10011387a9da5dee82e5c0d6f9d02e7;hp=5a9d6b075016546d0d72ec590a17cd77985a8947;hpb=71a2224d7d1cefc23a1ac80bba421cc069cc3257;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/sys.c b/kernel/sys.c index 5a9d6b0..421009c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -16,7 +16,10 @@ #include #include #include +#include +#include #include +#include #include #include #include @@ -26,9 +29,11 @@ #include #include #include +#include #include #include +#include #include #include @@ -165,7 +170,7 @@ EXPORT_SYMBOL(notifier_chain_unregister); * of the last notifier function called. */ -int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) +int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) { int ret=NOTIFY_DONE; struct notifier_block *nb = *n; @@ -359,7 +364,103 @@ out_unlock: return retval; } +/** + * emergency_restart - reboot the system + * + * Without shutting down any hardware or taking any locks + * reboot the system. This is called when we know we are in + * trouble so this is our best effort to reboot. This is + * safe to call in interrupt context. + */ +void emergency_restart(void) +{ + machine_emergency_restart(); +} +EXPORT_SYMBOL_GPL(emergency_restart); + +void kernel_restart_prepare(char *cmd) +{ + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); + system_state = SYSTEM_RESTART; + device_shutdown(); +} + +/** + * kernel_restart - reboot the system + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * + * Shutdown everything and perform a clean reboot. + * This is not safe to call in interrupt context. 
+ */ +void kernel_restart(char *cmd) +{ + kernel_restart_prepare(cmd); + if (!cmd) { + printk(KERN_EMERG "Restarting system.\n"); + } else { + printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + } + printk(".\n"); + machine_restart(cmd); +} +EXPORT_SYMBOL_GPL(kernel_restart); +/** + * kernel_kexec - reboot the system + * + * Move into place and start executing a preloaded standalone + * executable. If nothing was preloaded return an error. + */ +void kernel_kexec(void) +{ +#ifdef CONFIG_KEXEC + struct kimage *image; + image = xchg(&kexec_image, NULL); + if (!image) { + return; + } + kernel_restart_prepare(NULL); + printk(KERN_EMERG "Starting new kernel\n"); + machine_shutdown(); + machine_kexec(image); +#endif +} +EXPORT_SYMBOL_GPL(kernel_kexec); + +void kernel_shutdown_prepare(enum system_states state) +{ + notifier_call_chain(&reboot_notifier_list, + (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + system_state = state; + device_shutdown(); +} +/** + * kernel_halt - halt the system + * + * Shutdown everything and perform a clean system halt. + */ +void kernel_halt(void) +{ + kernel_shutdown_prepare(SYSTEM_HALT); + printk(KERN_EMERG "System halted.\n"); + machine_halt(); +} + +EXPORT_SYMBOL_GPL(kernel_halt); + +/** + * kernel_power_off - power_off the system + * + * Shutdown everything and perform a clean system power_off. + */ +void kernel_power_off(void) +{ + kernel_shutdown_prepare(SYSTEM_POWER_OFF); + printk(KERN_EMERG "Power down.\n"); + machine_power_off(); +} +EXPORT_SYMBOL_GPL(kernel_power_off); /* * Reboot system call: for obvious reasons only root may call it, * and even root needs to set up some magic numbers in the registers @@ -384,14 +485,16 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. 
+ */ + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + lock_kernel(); switch (cmd) { case LINUX_REBOOT_CMD_RESTART: - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); - system_state = SYSTEM_RESTART; - device_shutdown(); - printk(KERN_EMERG "Restarting system.\n"); - machine_restart(NULL); + kernel_restart(NULL); break; case LINUX_REBOOT_CMD_CAD_ON: @@ -403,21 +506,13 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user break; case LINUX_REBOOT_CMD_HALT: - notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); - system_state = SYSTEM_HALT; - device_shutdown(); - printk(KERN_EMERG "System halted.\n"); - machine_halt(); + kernel_halt(); unlock_kernel(); do_exit(0); break; case LINUX_REBOOT_CMD_POWER_OFF: - notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); - system_state = SYSTEM_POWER_OFF; - device_shutdown(); - printk(KERN_EMERG "Power down.\n"); - machine_power_off(); + kernel_power_off(); unlock_kernel(); do_exit(0); break; @@ -429,13 +524,14 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user } buffer[sizeof(buffer) - 1] = '\0'; - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); - system_state = SYSTEM_RESTART; - device_shutdown(); - printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); - machine_restart(buffer); + kernel_restart(buffer); break; + case LINUX_REBOOT_CMD_KEXEC: + kernel_kexec(); + unlock_kernel(); + return -EINVAL; + #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: { @@ -455,8 +551,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user static void deferred_cad(void *dummy) { - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); - machine_restart(NULL); + kernel_restart(NULL); } /* @@ -535,6 +630,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) current->egid = new_egid; current->gid = new_rgid; 
key_fsgid_changed(current); + proc_id_connector(current, PROC_EVENT_GID); return 0; } @@ -574,6 +670,7 @@ asmlinkage long sys_setgid(gid_t gid) return -EPERM; key_fsgid_changed(current); + proc_id_connector(current, PROC_EVENT_GID); return 0; } @@ -663,6 +760,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) current->fsuid = current->euid; key_fsuid_changed(current); + proc_id_connector(current, PROC_EVENT_UID); return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); } @@ -710,6 +808,7 @@ asmlinkage long sys_setuid(uid_t uid) current->suid = new_suid; key_fsuid_changed(current); + proc_id_connector(current, PROC_EVENT_UID); return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); } @@ -758,6 +857,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) current->suid = suid; key_fsuid_changed(current); + proc_id_connector(current, PROC_EVENT_UID); return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); } @@ -810,6 +910,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) current->sgid = sgid; key_fsgid_changed(current); + proc_id_connector(current, PROC_EVENT_GID); return 0; } @@ -852,6 +953,7 @@ asmlinkage long sys_setfsuid(uid_t uid) } key_fsuid_changed(current); + proc_id_connector(current, PROC_EVENT_UID); security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); @@ -880,6 +982,7 @@ asmlinkage long sys_setfsgid(gid_t gid) } current->fsgid = gid; key_fsgid_changed(current); + proc_id_connector(current, PROC_EVENT_GID); } return old_fsgid; } @@ -983,10 +1086,11 @@ asmlinkage long sys_times(struct tms __user * tbuf) asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) { struct task_struct *p; + struct task_struct *group_leader = current->group_leader; int err = -EINVAL; if (!pid) - pid = current->pid; + pid = group_leader->pid; if (!pgid) pgid = pid; if (pgid < 0) @@ -1006,16 +1110,16 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if 
(!thread_group_leader(p)) goto out; - if (p->parent == current || p->real_parent == current) { + if (p->real_parent == group_leader) { err = -EPERM; - if (p->signal->session != current->signal->session) + if (p->signal->session != group_leader->signal->session) goto out; err = -EACCES; if (p->did_exec) goto out; } else { err = -ESRCH; - if (p != current) + if (p != group_leader) goto out; } @@ -1027,7 +1131,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) struct task_struct *p; do_each_task_pid(pgid, PIDTYPE_PGID, p) { - if (p->signal->session == current->signal->session) + if (p->signal->session == group_leader->signal->session) goto ok_pgid; } while_each_task_pid(pgid, PIDTYPE_PGID, p); goto out; @@ -1107,27 +1211,25 @@ asmlinkage long sys_getsid(pid_t pid) asmlinkage long sys_setsid(void) { + struct task_struct *group_leader = current->group_leader; struct pid *pid; int err = -EPERM; - if (!thread_group_leader(current)) - return -EINVAL; - - down(&tty_sem); + mutex_lock(&tty_mutex); write_lock_irq(&tasklist_lock); - pid = find_pid(PIDTYPE_PGID, current->pid); + pid = find_pid(PIDTYPE_PGID, group_leader->pid); if (pid) goto out; - current->signal->leader = 1; - __set_special_pids(current->pid, current->pid); - current->signal->tty = NULL; - current->signal->tty_old_pgrp = 0; - err = process_group(current); + group_leader->signal->leader = 1; + __set_special_pids(group_leader->pid, group_leader->pid); + group_leader->signal->tty = NULL; + group_leader->signal->tty_old_pgrp = 0; + err = process_group(group_leader); out: write_unlock_irq(&tasklist_lock); - up(&tty_sem); + mutex_unlock(&tty_mutex); return err; } @@ -1259,7 +1361,7 @@ static void groups_sort(struct group_info *group_info) } /* a simple bsearch */ -static int groups_search(struct group_info *group_info, gid_t grp) +int groups_search(struct group_info *group_info, gid_t grp) { int left, right; @@ -1516,20 +1618,21 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r 
asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) { struct rlimit new_rlim, *old_rlim; + unsigned long it_prof_secs; int retval; if (resource >= RLIM_NLIMITS) return -EINVAL; - if(copy_from_user(&new_rlim, rlim, sizeof(*rlim))) + if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; + if (new_rlim.rlim_cur > new_rlim.rlim_max) + return -EINVAL; old_rlim = current->signal->rlim + resource; if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) - return -EPERM; + return -EPERM; retval = security_task_setrlimit(resource, &new_rlim); if (retval) @@ -1539,19 +1642,40 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) *old_rlim = new_rlim; task_unlock(current->group_leader); - if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY && - (cputime_eq(current->signal->it_prof_expires, cputime_zero) || - new_rlim.rlim_cur <= cputime_to_secs( - current->signal->it_prof_expires))) { - cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur); + if (resource != RLIMIT_CPU) + goto out; + + /* + * RLIMIT_CPU handling. Note that the kernel fails to return an error + * code if it rejected the user's attempt to set RLIMIT_CPU. This is a + * very long-standing error, and fixing it now risks breakage of + * applications, so we live with it + */ + if (new_rlim.rlim_cur == RLIM_INFINITY) + goto out; + + it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); + if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) { + unsigned long rlim_cur = new_rlim.rlim_cur; + cputime_t cputime; + + if (rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". 
So let's cheat and make it one second + * instead + */ + rlim_cur = 1; + } + cputime = secs_to_cputime(rlim_cur); read_lock(&tasklist_lock); spin_lock_irq(&current->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, - &cputime, NULL); + set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); spin_unlock_irq(&current->sighand->siglock); read_unlock(&tasklist_lock); } - +out: return 0; } @@ -1563,9 +1687,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) * a lot simpler! (Which we're not doing right now because we're not * measuring them yet). * - * This expects to be called with tasklist_lock read-locked or better, - * and the siglock not locked. It may momentarily take the siglock. - * * When sampling multiple threads for RUSAGE_SELF, under SMP we might have * races with threads incrementing their own counters. But since word * reads are atomic, we either get new values or old values and we don't @@ -1573,6 +1694,25 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) * the c* fields from p->signal from races with exit.c updating those * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. + * + * tasklist_lock locking optimisation: + * If we are current and single threaded, we do not need to take the tasklist + * lock or the siglock. No one else can take our signal_struct away, + * no one else can reap the children to update signal->c* counters, and + * no one else can race with the signal-> fields. + * If we do not take the tasklist_lock, the signal-> fields could be read + * out of order while another thread was just exiting. So we place a + * read memory barrier when we avoid the lock. On the writer side, + * write memory barrier is implied in __exit_signal as __exit_signal releases + * the siglock spinlock after updating the signal-> fields. 
+ * + * We don't really need the siglock when we access the non c* fields + * of the signal_struct (for RUSAGE_SELF) even in multithreaded + * case, since we take the tasklist lock for read and the non c* signal-> + * fields are updated only in __exit_signal, which is called with + * tasklist_lock taken for write, hence these two threads cannot execute + * concurrently. + * */ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) @@ -1580,13 +1720,26 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; + int need_lock = 0; memset((char *) r, 0, sizeof *r); + utime = stime = cputime_zero; - if (unlikely(!p->signal)) - return; + if (p != current || !thread_group_empty(p)) + need_lock = 1; + + if (need_lock) { + read_lock(&tasklist_lock); + if (unlikely(!p->signal)) { + read_unlock(&tasklist_lock); + return; + } + } else + /* See locking comments above */ + smp_rmb(); switch (who) { + case RUSAGE_BOTH: case RUSAGE_CHILDREN: spin_lock_irqsave(&p->sighand->siglock, flags); utime = p->signal->cutime; @@ -1596,22 +1749,11 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; spin_unlock_irqrestore(&p->sighand->siglock, flags); - cputime_to_timeval(utime, &r->ru_utime); - cputime_to_timeval(stime, &r->ru_stime); - break; + + if (who == RUSAGE_CHILDREN) + break; + case RUSAGE_SELF: - spin_lock_irqsave(&p->sighand->siglock, flags); - utime = stime = cputime_zero; - goto sum_group; - case RUSAGE_BOTH: - spin_lock_irqsave(&p->sighand->siglock, flags); - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - sum_group: utime = cputime_add(utime, p->signal->utime); stime = cputime_add(stime, p->signal->stime); r->ru_nvcsw += 
p->signal->nvcsw; @@ -1628,21 +1770,22 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt += t->maj_flt; t = next_thread(t); } while (t != p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - cputime_to_timeval(utime, &r->ru_utime); - cputime_to_timeval(stime, &r->ru_stime); break; + default: BUG(); } + + if (need_lock) + read_unlock(&tasklist_lock); + cputime_to_timeval(utime, &r->ru_utime); + cputime_to_timeval(stime, &r->ru_stime); } int getrusage(struct task_struct *p, int who, struct rusage __user *ru) { struct rusage r; - read_lock(&tasklist_lock); k_getrusage(p, who, &r); - read_unlock(&tasklist_lock); return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } @@ -1663,7 +1806,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { long error; - int sig; error = security_task_prctl(option, arg2, arg3, arg4, arg5); if (error) @@ -1671,19 +1813,17 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, switch (option) { case PR_SET_PDEATHSIG: - sig = arg2; - if (!valid_signal(sig)) { + if (!valid_signal(arg2)) { error = -EINVAL; break; } - current->pdeath_signal = sig; + current->pdeath_signal = arg2; break; case PR_GET_PDEATHSIG: error = put_user(current->pdeath_signal, (int __user *)arg2); break; case PR_GET_DUMPABLE: - if (current->mm->dumpable) - error = 1; + error = current->mm->dumpable; break; case PR_SET_DUMPABLE: if (arg2 < 0 || arg2 > 2) {