/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/idle.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Powermanagement idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
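/*
 * A minimal sketch (not part of this file) of how a driver might hook
 * the chain above; "example_idle_notify" and "example_idle_nb" are
 * hypothetical names. enter_idle()/exit_idle() invoke the callback with
 * IDLE_START when a CPU is about to idle and IDLE_END when it wakes:
 *
 *	static int example_idle_notify(struct notifier_block *nb,
 *				       unsigned long action, void *data)
 *	{
 *		if (action == IDLE_START)
 *			;	// CPU entering idle: quiesce the device
 *		else if (action == IDLE_END)
 *			;	// CPU busy again: resume full speed
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_idle_nb = {
 *		.notifier_call	= example_idle_notify,
 *	};
 *
 *	// in driver init: idle_notifier_register(&example_idle_nb);
 */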
/*
 * We use this if we don't have any better
 * idle routine..
 */
void default_idle(void)
{
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
	if (!need_resched())
		safe_halt();	/* enables interrupts racelessly */
	else
		local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
}
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick();
		while (!need_resched()) {
			void (*idle)(void);

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
		regs->flags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}
void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	int err;
	struct pt_regs * childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	write_pda(oldrsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val = PR_TSC_ENABLE;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;

	return put_user(val, (unsigned int __user *)adr);
}
int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
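/*
 * Userspace reaches get_tsc_mode()/set_tsc_mode() via prctl(2). A hedged
 * sketch (not part of this file); error handling elided:
 *
 *	#include <sys/prctl.h>
 *
 *	int main(void)
 *	{
 *		int mode;
 *
 *		prctl(PR_GET_TSC, &mode);		// read current mode
 *		prctl(PR_SET_TSC, PR_TSC_SIGSEGV);	// rdtsc now faults
 *		return 0;
 *	}
 */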
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/* we clear debugctl to make sure DS
		 * is not in use when we change it */
		debugctl = 0;
		update_debugctlmsr(0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
	}

	if (next->debugctlmsr != debugctl)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

#ifdef X86_BTS
	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
#endif
}
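/*
 * The io_bitmap handled above is what ioperm(2) installs. A hedged
 * userspace sketch (not part of this file) of how a task comes to have
 * TIF_IO_BITMAP set; requires CAP_SYS_RAWIO, and the port numbers are
 * only an example:
 *
 *	#include <sys/io.h>
 *
 *	int main(void)
 *	{
 *		if (ioperm(0x378, 3, 1))	// enable ports 0x378-0x37a
 *			return 1;
 *		outb(0xff, 0x378);		// now legal from user mode
 *		return 0;
 *	}
 */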
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;

		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;

		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
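/*
 * A hedged userspace sketch (not part of this file) of reaching the
 * entry point above through glibc's clone(3) wrapper. For a raw 64-bit
 * clone syscall the fifth (tls) argument arrives in %r8, which is why
 * copy_thread() feeds childregs->r8 to do_arch_prctl(ARCH_SET_FS):
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <signal.h>
 *	#include <stdlib.h>
 *
 *	static int child_fn(void *arg)
 *	{
 *		return 0;
 *	}
 *
 *	int main(void)
 *	{
 *		char *stack = malloc(64 * 1024);	// child stack
 *
 *		clone(child_fn, stack + 64 * 1024,	// stack grows down
 *		      CLONE_VM | SIGCHLD, NULL);
 *		return 0;
 *	}
 */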
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		    NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
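/*
 * get_wchan() is what backs /proc/<pid>/wchan. A hedged userspace sketch
 * (not part of this file) that prints where a sleeping task is blocked:
 *
 *	#include <stdio.h>
 *
 *	int main(int argc, char **argv)
 *	{
 *		char path[64], buf[128];
 *		FILE *f;
 *
 *		snprintf(path, sizeof(path), "/proc/%s/wchan", argv[1]);
 *		f = fopen(path, "r");
 *		if (f && fgets(buf, sizeof(buf), f))
 *			printf("%s\n", buf);	// e.g. "do_wait"
 *		return 0;
 *	}
 */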
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
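/*
 * A hedged userspace sketch (not part of this file): glibc exposes no
 * arch_prctl() wrapper here, so the call goes through syscall(2).
 * ARCH_GET_FS stores the FS base through the addr argument:
 *
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <asm/prctl.h>
 *
 *	int main(void)
 *	{
 *		unsigned long base;
 *
 *		if (syscall(SYS_arch_prctl, ARCH_GET_FS, &base) == 0)
 *			printf("fs base: %#lx\n", base);
 *		return 0;
 *	}
 */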
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;

	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}