2 * linux/arch/x86_64/entry.S
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
6 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
10 * entry.S contains the system-call and fault low-level handling routines.
12 * NOTE: This code handles signal-recognition, which happens every time
13 * after an interrupt and after each system call.
15 * Normal syscalls and interrupts don't save a full stack frame, this is
16 * only done for syscall tracing, signals or fork/exec et.al.
18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers up to R11.
22 * - full stack frame: Like partial stack frame, but all registers saved.
25 * - CFI macros are used to generate dwarf2 unwind information for better
26 * backtraces. They don't change any code.
27 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
28 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
29 * There are unfortunately lots of special cases where some registers
30 * are not touched. The macro is a big mess that should be cleaned up.
31 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
32 * Gives a full stack frame.
33 * - ENTRY/END Define functions in the symbol table.
34 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
35 * frame that is otherwise undefined after a SYSCALL
36 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
37 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
40 #include <linux/linkage.h>
41 #include <asm/segment.h>
42 #include <asm/cache.h>
43 #include <asm/errno.h>
44 #include <asm/dwarf2.h>
45 #include <asm/calling.h>
46 #include <asm/asm-offsets.h>
48 #include <asm/unistd.h>
49 #include <asm/thread_info.h>
50 #include <asm/hw_irq.h>
52 #include <asm/irqflags.h>
53 #include <asm/paravirt.h>
54 #include <asm/ftrace.h>
56 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57 #include <linux/elf-em.h>
58 #define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
59 #define __AUDIT_ARCH_64BIT 0x80000000
60 #define __AUDIT_ARCH_LE 0x40000000
64 #ifdef CONFIG_FUNCTION_TRACER
65 #ifdef CONFIG_DYNAMIC_FTRACE
71 cmpl $0, function_trace_stop
74 /* taken from glibc */
86 subq $MCOUNT_INSN_SIZE, %rdi
101 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
102 .globl ftrace_graph_call
112 #else /* ! CONFIG_DYNAMIC_FTRACE */
114 cmpl $0, function_trace_stop
117 cmpq $ftrace_stub, ftrace_trace_function
120 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
121 cmpq $ftrace_stub, ftrace_graph_return
122 jnz ftrace_graph_caller
130 /* taken from glibc */
140 movq 0x38(%rsp), %rdi
142 subq $MCOUNT_INSN_SIZE, %rdi
144 call *ftrace_trace_function
157 #endif /* CONFIG_DYNAMIC_FTRACE */
158 #endif /* CONFIG_FUNCTION_TRACER */
160 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
161 ENTRY(ftrace_graph_caller)
162 cmpl $0, function_trace_stop
175 movq 0x38(%rsp), %rsi
177 call prepare_ftrace_return
188 END(ftrace_graph_caller)
191 .globl return_to_handler
205 call ftrace_return_to_handler
222 #ifndef CONFIG_PREEMPT
223 #define retint_kernel retint_restore_args
226 #ifdef CONFIG_PARAVIRT
227 ENTRY(native_usergs_sysret64)
230 #endif /* CONFIG_PARAVIRT */
233 .macro TRACE_IRQS_IRETQ offset=ARGOFFSET
234 #ifdef CONFIG_TRACE_IRQFLAGS
235 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
243 * C code is not supposed to know about undefined top of stack. Every time
244 * a C function with a pt_regs argument is called from the SYSCALL based
245 * fast path FIXUP_TOP_OF_STACK is needed.
246 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
250 /* %rsp:at FRAMEEND */
251 .macro FIXUP_TOP_OF_STACK tmp
252 movq %gs:pda_oldrsp,\tmp
254 movq $__USER_DS,SS(%rsp)
255 movq $__USER_CS,CS(%rsp)
257 movq R11(%rsp),\tmp /* get eflags */
258 movq \tmp,EFLAGS(%rsp)
261 .macro RESTORE_TOP_OF_STACK tmp,offset=0
262 movq RSP-\offset(%rsp),\tmp
263 movq \tmp,%gs:pda_oldrsp
264 movq EFLAGS-\offset(%rsp),\tmp
265 movq \tmp,R11-\offset(%rsp)
268 .macro FAKE_STACK_FRAME child_rip
269 /* push in order ss, rsp, eflags, cs, rip */
271 pushq $__KERNEL_DS /* ss */
272 CFI_ADJUST_CFA_OFFSET 8
273 /*CFI_REL_OFFSET ss,0*/
275 CFI_ADJUST_CFA_OFFSET 8
277 pushq $(1<<9) /* eflags - interrupts on */
278 CFI_ADJUST_CFA_OFFSET 8
279 /*CFI_REL_OFFSET rflags,0*/
280 pushq $__KERNEL_CS /* cs */
281 CFI_ADJUST_CFA_OFFSET 8
282 /*CFI_REL_OFFSET cs,0*/
283 pushq \child_rip /* rip */
284 CFI_ADJUST_CFA_OFFSET 8
286 pushq %rax /* orig rax */
287 CFI_ADJUST_CFA_OFFSET 8
290 .macro UNFAKE_STACK_FRAME
292 CFI_ADJUST_CFA_OFFSET -(6*8)
295 .macro CFI_DEFAULT_STACK start=1
301 CFI_DEF_CFA_OFFSET SS+8
303 CFI_REL_OFFSET r15,R15
304 CFI_REL_OFFSET r14,R14
305 CFI_REL_OFFSET r13,R13
306 CFI_REL_OFFSET r12,R12
307 CFI_REL_OFFSET rbp,RBP
308 CFI_REL_OFFSET rbx,RBX
309 CFI_REL_OFFSET r11,R11
310 CFI_REL_OFFSET r10,R10
313 CFI_REL_OFFSET rax,RAX
314 CFI_REL_OFFSET rcx,RCX
315 CFI_REL_OFFSET rdx,RDX
316 CFI_REL_OFFSET rsi,RSI
317 CFI_REL_OFFSET rdi,RDI
318 CFI_REL_OFFSET rip,RIP
319 /*CFI_REL_OFFSET cs,CS*/
320 /*CFI_REL_OFFSET rflags,EFLAGS*/
321 CFI_REL_OFFSET rsp,RSP
322 /*CFI_REL_OFFSET ss,SS*/
325 * A newly forked process directly context switches into this.
330 push kernel_eflags(%rip)
331 CFI_ADJUST_CFA_OFFSET 8
332 popf # reset kernel eflags
333 CFI_ADJUST_CFA_OFFSET -8
335 GET_THREAD_INFO(%rcx)
336 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
340 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
341 je int_ret_from_sys_call
342 testl $_TIF_IA32,TI_flags(%rcx)
343 jnz int_ret_from_sys_call
344 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
345 jmp ret_from_sys_call
348 call syscall_trace_leave
349 GET_THREAD_INFO(%rcx)
355 * System call entry. Up to 6 arguments in registers are supported.
357 * SYSCALL does not save anything on the stack and does not change the
363 * rax system call number
365 * rcx return address for syscall/sysret, C arg3
368 * r10 arg3 (--> moved to rcx for C)
371 * r11 eflags for syscall/sysret, temporary for C
372 * r12-r15,rbp,rbx saved by C code, not touched.
374 * Interrupts are off on entry.
375 * Only called from user space.
377 * XXX if we had a free scratch register we could save the RSP into the stack frame
378 * and report it properly in ps. Unfortunately we haven't.
380 * When user can change the frames always force IRET. That is because
381 * it deals with uncanonical addresses better. SYSRET has trouble
382 * with them due to bugs in both AMD and Intel CPUs.
388 CFI_DEF_CFA rsp,PDA_STACKOFFSET
390 /*CFI_REGISTER rflags,r11*/
393 * A hypervisor implementation might want to use a label
394 * after the swapgs, so that it can do the swapgs
395 * for the guest and jump here on syscall.
397 ENTRY(system_call_after_swapgs)
399 movq %rsp,%gs:pda_oldrsp
400 movq %gs:pda_kernelstack,%rsp
402 * No need to follow this irqs off/on section - it's straight
405 ENABLE_INTERRUPTS(CLBR_NONE)
407 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
408 movq %rcx,RIP-ARGOFFSET(%rsp)
409 CFI_REL_OFFSET rip,RIP-ARGOFFSET
410 GET_THREAD_INFO(%rcx)
411 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
413 system_call_fastpath:
414 cmpq $__NR_syscall_max,%rax
417 call *sys_call_table(,%rax,8) # XXX: rip relative
418 movq %rax,RAX-ARGOFFSET(%rsp)
420 * Syscall return path ending with SYSRET (fast path)
421 * Has incomplete stack frame and undefined top of stack.
424 movl $_TIF_ALLWORK_MASK,%edi
428 GET_THREAD_INFO(%rcx)
429 DISABLE_INTERRUPTS(CLBR_NONE)
431 movl TI_flags(%rcx),%edx
436 * sysretq will re-enable interrupts:
439 movq RIP-ARGOFFSET(%rsp),%rcx
441 RESTORE_ARGS 0,-ARG_SKIP,1
442 /*CFI_REGISTER rflags,r11*/
443 movq %gs:pda_oldrsp, %rsp
447 /* Handle reschedules */
448 /* edx: work, edi: workmask */
450 bt $TIF_NEED_RESCHED,%edx
453 ENABLE_INTERRUPTS(CLBR_NONE)
455 CFI_ADJUST_CFA_OFFSET 8
458 CFI_ADJUST_CFA_OFFSET -8
461 /* Handle a signal */
464 ENABLE_INTERRUPTS(CLBR_NONE)
465 #ifdef CONFIG_AUDITSYSCALL
466 bt $TIF_SYSCALL_AUDIT,%edx
469 /* edx: work flags (arg3) */
470 leaq do_notify_resume(%rip),%rax
471 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
472 xorl %esi,%esi # oldset -> arg2
473 call ptregscall_common
474 movl $_TIF_WORK_MASK,%edi
475 /* Use IRET because user could have changed frame. This
476 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
477 DISABLE_INTERRUPTS(CLBR_NONE)
482 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
483 jmp ret_from_sys_call
485 #ifdef CONFIG_AUDITSYSCALL
487 * Fast path for syscall audit without full syscall trace.
488 * We just call audit_syscall_entry() directly, and then
489 * jump back to the normal fast path.
492 movq %r10,%r9 /* 6th arg: 4th syscall arg */
493 movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
494 movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
495 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
496 movq %rax,%rsi /* 2nd arg: syscall number */
497 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
498 call audit_syscall_entry
499 LOAD_ARGS 0 /* reload call-clobbered registers */
500 jmp system_call_fastpath
503 * Return fast path for syscall audit. Call audit_syscall_exit()
504 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
508 movq %rax,%rsi /* second arg, syscall return value */
509 cmpq $0,%rax /* is it < 0? */
510 setl %al /* 1 if so, 0 if not */
511 movzbl %al,%edi /* zero-extend that into %edi */
512 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
513 call audit_syscall_exit
514 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
516 #endif /* CONFIG_AUDITSYSCALL */
518 /* Do syscall tracing */
520 #ifdef CONFIG_AUDITSYSCALL
521 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
525 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
526 FIXUP_TOP_OF_STACK %rdi
528 call syscall_trace_enter
530 * Reload arg registers from stack in case ptrace changed them.
531 * We don't reload %rax because syscall_trace_enter() returned
532 * the value it wants us to use in the table lookup.
534 LOAD_ARGS ARGOFFSET, 1
536 cmpq $__NR_syscall_max,%rax
537 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
538 movq %r10,%rcx /* fixup for C */
539 call *sys_call_table(,%rax,8)
540 movq %rax,RAX-ARGOFFSET(%rsp)
541 /* Use IRET because user could have changed frame */
544 * Syscall return path ending with IRET.
545 * Has correct top of stack, but partial stack frame.
547 .globl int_ret_from_sys_call
548 .globl int_with_check
549 int_ret_from_sys_call:
550 DISABLE_INTERRUPTS(CLBR_NONE)
552 testl $3,CS-ARGOFFSET(%rsp)
553 je retint_restore_args
554 movl $_TIF_ALLWORK_MASK,%edi
555 /* edi: mask to check */
558 GET_THREAD_INFO(%rcx)
559 movl TI_flags(%rcx),%edx
562 andl $~TS_COMPAT,TI_status(%rcx)
565 /* Either reschedule or signal or syscall exit tracking needed. */
566 /* First do a reschedule test. */
567 /* edx: work, edi: workmask */
569 bt $TIF_NEED_RESCHED,%edx
572 ENABLE_INTERRUPTS(CLBR_NONE)
574 CFI_ADJUST_CFA_OFFSET 8
577 CFI_ADJUST_CFA_OFFSET -8
578 DISABLE_INTERRUPTS(CLBR_NONE)
582 /* handle signals and tracing -- both require a full stack frame */
585 ENABLE_INTERRUPTS(CLBR_NONE)
587 /* Check for syscall exit trace */
588 testl $_TIF_WORK_SYSCALL_EXIT,%edx
591 CFI_ADJUST_CFA_OFFSET 8
592 leaq 8(%rsp),%rdi # &ptregs -> arg1
593 call syscall_trace_leave
595 CFI_ADJUST_CFA_OFFSET -8
596 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
600 testl $_TIF_DO_NOTIFY_MASK,%edx
602 movq %rsp,%rdi # &ptregs -> arg1
603 xorl %esi,%esi # oldset -> arg2
604 call do_notify_resume
605 1: movl $_TIF_WORK_MASK,%edi
608 DISABLE_INTERRUPTS(CLBR_NONE)
615 * Certain special system calls that need to save a complete full stack frame.
618 .macro PTREGSCALL label,func,arg
621 leaq \func(%rip),%rax
622 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
623 jmp ptregscall_common
629 PTREGSCALL stub_clone, sys_clone, %r8
630 PTREGSCALL stub_fork, sys_fork, %rdi
631 PTREGSCALL stub_vfork, sys_vfork, %rdi
632 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
633 PTREGSCALL stub_iopl, sys_iopl, %rsi
635 ENTRY(ptregscall_common)
637 CFI_ADJUST_CFA_OFFSET -8
638 CFI_REGISTER rip, r11
641 CFI_REGISTER rip, r15
642 FIXUP_TOP_OF_STACK %r11
644 RESTORE_TOP_OF_STACK %r11
646 CFI_REGISTER rip, r11
649 CFI_ADJUST_CFA_OFFSET 8
650 CFI_REL_OFFSET rip, 0
653 END(ptregscall_common)
658 CFI_ADJUST_CFA_OFFSET -8
659 CFI_REGISTER rip, r11
661 FIXUP_TOP_OF_STACK %r11
664 RESTORE_TOP_OF_STACK %r11
667 jmp int_ret_from_sys_call
672 * sigreturn is special because it needs to restore all registers on return.
673 * This cannot be done with SYSRET, so use the IRET return path instead.
675 ENTRY(stub_rt_sigreturn)
678 CFI_ADJUST_CFA_OFFSET -8
681 FIXUP_TOP_OF_STACK %r11
682 call sys_rt_sigreturn
683 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
685 jmp int_ret_from_sys_call
687 END(stub_rt_sigreturn)
690 * initial frame state for interrupts and exceptions
695 CFI_DEF_CFA rsp,SS+8-\ref
696 /*CFI_REL_OFFSET ss,SS-\ref*/
697 CFI_REL_OFFSET rsp,RSP-\ref
698 /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
699 /*CFI_REL_OFFSET cs,CS-\ref*/
700 CFI_REL_OFFSET rip,RIP-\ref
703 /* initial frame state for interrupts (and exceptions without error code) */
704 #define INTR_FRAME _frame RIP
705 /* initial frame state for exceptions with error code (and interrupts with
706 vector already pushed) */
707 #define XCPT_FRAME _frame ORIG_RAX
710 * Interrupt entry/exit.
712 * Interrupt entry points save only callee clobbered registers in fast path.
714 * Entry runs with interrupts off.
717 /* 0(%rsp): interrupt number */
718 .macro interrupt func
721 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
724 * Save rbp twice: One is for marking the stack frame, as usual, and the
725 * other, to fill pt_regs properly. This is because bx comes right
726 * before the last saved register in that structure, and not bp. If the
727 * base pointer were in the place bx is today, this would not be needed.
730 CFI_ADJUST_CFA_OFFSET 8
731 CFI_REL_OFFSET rbp, 0
733 CFI_DEF_CFA_REGISTER rbp
737 /* irqcount is used to check if a CPU is already on an interrupt
738 stack or not. While this is essentially redundant with preempt_count
739 it is a little cheaper to use a separate counter in the PDA
740 (short of moving irq_enter into assembly, which would be too
742 1: incl %gs:pda_irqcount
743 cmoveq %gs:pda_irqstackptr,%rsp
744 push %rbp # backlink for old unwinder
746 * We entered an interrupt context - irqs are off:
752 ENTRY(common_interrupt)
755 /* 0(%rsp): oldrsp-ARGOFFSET */
757 DISABLE_INTERRUPTS(CLBR_NONE)
759 decl %gs:pda_irqcount
761 CFI_DEF_CFA_REGISTER rsp
762 CFI_ADJUST_CFA_OFFSET -8
764 GET_THREAD_INFO(%rcx)
765 testl $3,CS-ARGOFFSET(%rsp)
768 /* Interrupt came from user space */
770 * Has a correct top of stack, but a partial stack frame
771 * %rcx: thread info. Interrupts off.
773 retint_with_reschedule:
774 movl $_TIF_WORK_MASK,%edi
777 movl TI_flags(%rcx),%edx
782 retint_swapgs: /* return to user-space */
784 * The iretq could re-enable interrupts:
786 DISABLE_INTERRUPTS(CLBR_ANY)
791 retint_restore_args: /* return to kernel space */
792 DISABLE_INTERRUPTS(CLBR_ANY)
794 * The iretq could re-enable interrupts:
803 .section __ex_table, "a"
804 .quad irq_return, bad_iret
807 #ifdef CONFIG_PARAVIRT
811 .section __ex_table,"a"
812 .quad native_iret, bad_iret
819 * The iret traps when the %cs or %ss being restored is bogus.
820 * We've lost the original trap vector and error code.
821 * #GPF is the most likely one to get for an invalid selector.
822 * So pretend we completed the iret and took the #GPF in user mode.
824 * We are now running with the kernel GS after exception recovery.
825 * But error_entry expects us to have user GS to match the user %cs,
831 jmp general_protection
835 /* edi: workmask, edx: work */
838 bt $TIF_NEED_RESCHED,%edx
841 ENABLE_INTERRUPTS(CLBR_NONE)
843 CFI_ADJUST_CFA_OFFSET 8
846 CFI_ADJUST_CFA_OFFSET -8
847 GET_THREAD_INFO(%rcx)
848 DISABLE_INTERRUPTS(CLBR_NONE)
853 testl $_TIF_DO_NOTIFY_MASK,%edx
856 ENABLE_INTERRUPTS(CLBR_NONE)
858 movq $-1,ORIG_RAX(%rsp)
859 xorl %esi,%esi # oldset
860 movq %rsp,%rdi # &pt_regs
861 call do_notify_resume
863 DISABLE_INTERRUPTS(CLBR_NONE)
865 GET_THREAD_INFO(%rcx)
866 jmp retint_with_reschedule
868 #ifdef CONFIG_PREEMPT
869 /* Returning to kernel space. Check if we need preemption */
870 /* rcx: threadinfo. interrupts off. */
872 cmpl $0,TI_preempt_count(%rcx)
873 jnz retint_restore_args
874 bt $TIF_NEED_RESCHED,TI_flags(%rcx)
875 jnc retint_restore_args
876 bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
877 jnc retint_restore_args
878 call preempt_schedule_irq
883 END(common_interrupt)
888 .macro apicinterrupt num,func
891 CFI_ADJUST_CFA_OFFSET 8
/*
 * Thermal-event APIC interrupt entry stub: expands the 'apicinterrupt'
 * macro with THERMAL_APIC_VECTOR and the C handler smp_thermal_interrupt.
 */
897 ENTRY(thermal_interrupt)
898 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
899 END(thermal_interrupt)
/*
 * MCE threshold APIC interrupt entry stub: vector THRESHOLD_APIC_VECTOR,
 * dispatched to the C handler mce_threshold_interrupt.
 */
901 ENTRY(threshold_interrupt)
902 apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
903 END(threshold_interrupt)
/*
 * Cross-CPU reschedule IPI entry stub: vector RESCHEDULE_VECTOR,
 * dispatched to the C handler smp_reschedule_interrupt.
 */
906 ENTRY(reschedule_interrupt)
907 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
908 END(reschedule_interrupt)
910 .macro INVALIDATE_ENTRY num
911 ENTRY(invalidate_interrupt\num)
912 apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
913 END(invalidate_interrupt\num)
/*
 * smp_call_function IPI entry stub: vector CALL_FUNCTION_VECTOR,
 * dispatched to the C handler smp_call_function_interrupt.
 */
925 ENTRY(call_function_interrupt)
926 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
927 END(call_function_interrupt)
/*
 * Single-target smp_call_function IPI entry stub: vector
 * CALL_FUNCTION_SINGLE_VECTOR, handler smp_call_function_single_interrupt.
 */
928 ENTRY(call_function_single_interrupt)
929 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
930 END(call_function_single_interrupt)
/*
 * IRQ-migration cleanup IPI entry stub: vector IRQ_MOVE_CLEANUP_VECTOR,
 * dispatched to the C handler smp_irq_move_cleanup_interrupt.
 */
931 ENTRY(irq_move_cleanup_interrupt)
932 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
933 END(irq_move_cleanup_interrupt)
/*
 * Local APIC timer interrupt entry stub: vector LOCAL_TIMER_VECTOR,
 * dispatched to the C handler smp_apic_timer_interrupt.
 */
936 ENTRY(apic_timer_interrupt)
937 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
938 END(apic_timer_interrupt)
/*
 * SGI UV BAU message interrupt entry stub. Note the hard-coded vector
 * number 220 rather than a named constant -- presumably the UV BAU
 * vector; TODO(review): confirm against the platform's vector layout.
 * Dispatched to the C handler uv_bau_message_interrupt.
 */
940 ENTRY(uv_bau_message_intr1)
941 apicinterrupt 220,uv_bau_message_interrupt
942 END(uv_bau_message_intr1)
944 ENTRY(error_interrupt)
945 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
/*
 * Spurious APIC interrupt entry stub: vector SPURIOUS_APIC_VECTOR,
 * dispatched to the C handler smp_spurious_interrupt.
 */
948 ENTRY(spurious_interrupt)
949 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
950 END(spurious_interrupt)
953 * Exception entry points.
957 PARAVIRT_ADJUST_EXCEPTION_FRAME
958 pushq $0 /* push error code/oldrax */
959 CFI_ADJUST_CFA_OFFSET 8
960 pushq %rax /* push real oldrax to the rdi slot */
961 CFI_ADJUST_CFA_OFFSET 8
968 .macro errorentry sym
970 PARAVIRT_ADJUST_EXCEPTION_FRAME
972 CFI_ADJUST_CFA_OFFSET 8
979 /* error code is on the stack already */
980 /* handle NMI like exceptions that can happen everywhere */
981 .macro paranoidentry sym, ist=0, irqtrace=1
985 movl $MSR_GS_BASE,%ecx
993 movq %gs:pda_data_offset, %rbp
999 movq ORIG_RAX(%rsp),%rsi
1000 movq $-1,ORIG_RAX(%rsp)
1002 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1006 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1008 DISABLE_INTERRUPTS(CLBR_NONE)
1015 * "Paranoid" exit path from exception stack.
1016 * Paranoid because this is used by NMIs and cannot take
1017 * any kernel state for granted.
1018 * We don't do kernel preemption checks here, because only
1019 * NMI should be common and it does not enable IRQs and
1020 * cannot get reschedule ticks.
1022 * "trace" is 0 for the NMI handler only, because irq-tracing
1023 * is fundamentally NMI-unsafe. (we cannot change the soft and
1024 * hard flags at once, atomically)
1026 .macro paranoidexit trace=1
1027 /* ebx: no swapgs flag */
1028 paranoid_exit\trace:
1029 testl %ebx,%ebx /* swapgs needed? */
1030 jnz paranoid_restore\trace
1032 jnz paranoid_userspace\trace
1033 paranoid_swapgs\trace:
1038 paranoid_restore\trace:
1041 paranoid_userspace\trace:
1042 GET_THREAD_INFO(%rcx)
1043 movl TI_flags(%rcx),%ebx
1044 andl $_TIF_WORK_MASK,%ebx
1045 jz paranoid_swapgs\trace
1046 movq %rsp,%rdi /* &pt_regs */
1048 movq %rax,%rsp /* switch stack for scheduling */
1049 testl $_TIF_NEED_RESCHED,%ebx
1050 jnz paranoid_schedule\trace
1051 movl %ebx,%edx /* arg3: thread flags */
1055 ENABLE_INTERRUPTS(CLBR_NONE)
1056 xorl %esi,%esi /* arg2: oldset */
1057 movq %rsp,%rdi /* arg1: &pt_regs */
1058 call do_notify_resume
1059 DISABLE_INTERRUPTS(CLBR_NONE)
1063 jmp paranoid_userspace\trace
1064 paranoid_schedule\trace:
1068 ENABLE_INTERRUPTS(CLBR_ANY)
1070 DISABLE_INTERRUPTS(CLBR_ANY)
1074 jmp paranoid_userspace\trace
1079 * Exception entry point. This expects an error code/orig_rax on the stack
1080 * and the exception handler in %rax.
1082 KPROBE_ENTRY(error_entry)
1084 CFI_REL_OFFSET rax,0
1085 /* rdi slot contains rax, oldrax contains error code */
1088 CFI_ADJUST_CFA_OFFSET (14*8)
1089 movq %rsi,13*8(%rsp)
1090 CFI_REL_OFFSET rsi,RSI
1091 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
1092 CFI_REGISTER rax,rsi
1093 movq %rdx,12*8(%rsp)
1094 CFI_REL_OFFSET rdx,RDX
1095 movq %rcx,11*8(%rsp)
1096 CFI_REL_OFFSET rcx,RCX
1097 movq %rsi,10*8(%rsp) /* store rax */
1098 CFI_REL_OFFSET rax,RAX
1100 CFI_REL_OFFSET r8,R8
1102 CFI_REL_OFFSET r9,R9
1104 CFI_REL_OFFSET r10,R10
1106 CFI_REL_OFFSET r11,R11
1108 CFI_REL_OFFSET rbx,RBX
1110 CFI_REL_OFFSET rbp,RBP
1112 CFI_REL_OFFSET r12,R12
1114 CFI_REL_OFFSET r13,R13
1116 CFI_REL_OFFSET r14,R14
1118 CFI_REL_OFFSET r15,R15
1121 je error_kernelspace
1127 CFI_REL_OFFSET rdi,RDI
1129 movq ORIG_RAX(%rsp),%rsi /* get error code */
1130 movq $-1,ORIG_RAX(%rsp)
1132 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
1136 DISABLE_INTERRUPTS(CLBR_NONE)
1138 GET_THREAD_INFO(%rcx)
1141 LOCKDEP_SYS_EXIT_IRQ
1142 movl TI_flags(%rcx),%edx
1143 movl $_TIF_WORK_MASK,%edi
1151 /* There are two places in the kernel that can potentially fault with
1152 usergs. Handle them here. The exception handlers after
1153 iret run with kernel gs again, so don't set the user space flag.
1154 B stepping K8s sometimes report a truncated RIP for IRET
1155 exceptions returning to compat mode. Check for these here too. */
1156 leaq irq_return(%rip),%rcx
1159 movl %ecx,%ecx /* zero extend */
1162 cmpq $gs_change,RIP(%rsp)
1165 KPROBE_END(error_entry)
1167 /* Reload gs selector with exception handling */
1168 /* edi: new selector */
1169 ENTRY(native_load_gs_index)
1172 CFI_ADJUST_CFA_OFFSET 8
1173 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
1177 2: mfence /* workaround */
1180 CFI_ADJUST_CFA_OFFSET -8
1183 ENDPROC(native_load_gs_index)
1185 .section __ex_table,"a"
1187 .quad gs_change,bad_gs
1189 .section .fixup,"ax"
1190 /* running with kernelgs */
1192 SWAPGS /* switch back to user gs */
1199 * Create a kernel thread.
1201 * C extern interface:
1202 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
1204 * asm input arguments:
1205 * rdi: fn, rsi: arg, rdx: flags
1207 ENTRY(kernel_thread)
1209 FAKE_STACK_FRAME $child_rip
1212 # rdi: flags, rsi: usp, rdx: will be &pt_regs
1214 orq kernel_thread_flags(%rip),%rdi
1227 * It isn't worth to check for reschedule here,
1228 * so internally to the x86_64 port you can rely on kernel_thread()
1229 * not to reschedule the child before returning, this avoids the need
1230 * of hacks for example to fork off the per-CPU idle tasks.
1231 * [Hopefully no generic code relies on the reschedule -AK]
1237 ENDPROC(kernel_thread)
1240 pushq $0 # fake return address
1243 * Here we are in the child and the registers are set as they were
1244 * at kernel_thread() invocation in the parent.
1256 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
1258 * C extern interface:
1259 * extern long execve(char *name, char **argv, char **envp)
1261 * asm input arguments:
1262 * rdi: name, rsi: argv, rdx: envp
1264 * We want to fallback into:
1265 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
1267 * do_sys_execve asm fallback arguments:
1268 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
1270 ENTRY(kernel_execve)
1276 movq %rax, RAX(%rsp)
1279 je int_ret_from_sys_call
1284 ENDPROC(kernel_execve)
/*
 * #PF (page fault) entry point. The CPU pushes an error code for this
 * exception, so the 'errorentry' variant is used. KPROBE_ENTRY/KPROBE_END
 * place the stub in the kprobes section so probes cannot recurse into it.
 */
1286 KPROBE_ENTRY(page_fault)
1287 errorentry do_page_fault
1288 KPROBE_END(page_fault)
/*
 * #MF (x87 FPU floating-point error) entry point. No hardware error code
 * is pushed for this exception, hence the 'zeroentry' variant, which
 * supplies a zero error code before calling do_coprocessor_error.
 */
1290 ENTRY(coprocessor_error)
1291 zeroentry do_coprocessor_error
1292 END(coprocessor_error)
/*
 * #XM (SIMD floating-point exception) entry point. No error code is
 * pushed by hardware, so 'zeroentry' is used; C handler is
 * do_simd_coprocessor_error.
 */
1294 ENTRY(simd_coprocessor_error)
1295 zeroentry do_simd_coprocessor_error
1296 END(simd_coprocessor_error)
/*
 * #NM (device not available / FPU lazy-restore trap) entry point.
 * No error code is pushed; dispatched via 'zeroentry' to
 * do_device_not_available.
 */
1298 ENTRY(device_not_available)
1299 zeroentry do_device_not_available
1300 END(device_not_available)
1302 /* runs on exception stack */
1305 PARAVIRT_ADJUST_EXCEPTION_FRAME
1307 CFI_ADJUST_CFA_OFFSET 8
1308 paranoidentry do_debug, DEBUG_STACK
1312 /* runs on exception stack */
1315 PARAVIRT_ADJUST_EXCEPTION_FRAME
1317 CFI_ADJUST_CFA_OFFSET 8
1318 paranoidentry do_nmi, 0, 0
1319 #ifdef CONFIG_TRACE_IRQFLAGS
1329 PARAVIRT_ADJUST_EXCEPTION_FRAME
1331 CFI_ADJUST_CFA_OFFSET 8
1332 paranoidentry do_int3, DEBUG_STACK
1338 zeroentry do_overflow
1346 zeroentry do_invalid_op
/*
 * Coprocessor segment overrun (legacy vector 9) entry point. No error
 * code is pushed; dispatched via 'zeroentry' to
 * do_coprocessor_segment_overrun.
 */
1349 ENTRY(coprocessor_segment_overrun)
1350 zeroentry do_coprocessor_segment_overrun
1351 END(coprocessor_segment_overrun)
1353 /* runs on exception stack */
1356 PARAVIRT_ADJUST_EXCEPTION_FRAME
1357 paranoidentry do_double_fault
1363 errorentry do_invalid_TSS
/*
 * #NP (segment not present) entry point. The CPU pushes an error code
 * (the faulting selector), hence the 'errorentry' variant; C handler is
 * do_segment_not_present.
 */
1366 ENTRY(segment_not_present)
1367 errorentry do_segment_not_present
1368 END(segment_not_present)
1370 /* runs on exception stack */
1371 ENTRY(stack_segment)
1373 PARAVIRT_ADJUST_EXCEPTION_FRAME
1374 paranoidentry do_stack_segment
/*
 * #GP (general protection fault) entry point. An error code is pushed by
 * hardware, so 'errorentry' is used. Wrapped in KPROBE_ENTRY/KPROBE_END
 * so kprobes cannot recurse into the handler; also the jump target of
 * the bad_iret recovery path above.
 */
1379 KPROBE_ENTRY(general_protection)
1380 errorentry do_general_protection
1381 KPROBE_END(general_protection)
/*
 * #AC (alignment check) entry point. Hardware pushes an error code, so
 * 'errorentry' is used; C handler is do_alignment_check.
 */
1383 ENTRY(alignment_check)
1384 errorentry do_alignment_check
1385 END(alignment_check)
1388 zeroentry do_divide_error
/*
 * Spurious-interrupt bug (legacy vector 15) entry point. No error code
 * is pushed; dispatched via 'zeroentry' to do_spurious_interrupt_bug.
 */
1391 ENTRY(spurious_interrupt_bug)
1392 zeroentry do_spurious_interrupt_bug
1393 END(spurious_interrupt_bug)
1395 #ifdef CONFIG_X86_MCE
1396 /* runs on exception stack */
1397 ENTRY(machine_check)
1399 PARAVIRT_ADJUST_EXCEPTION_FRAME
1401 CFI_ADJUST_CFA_OFFSET 8
1402 paranoidentry do_machine_check
1408 /* Call softirq on interrupt stack. Interrupts are off. */
1412 CFI_ADJUST_CFA_OFFSET 8
1413 CFI_REL_OFFSET rbp,0
1415 CFI_DEF_CFA_REGISTER rbp
1416 incl %gs:pda_irqcount
1417 cmove %gs:pda_irqstackptr,%rsp
1418 push %rbp # backlink for old unwinder
1421 CFI_DEF_CFA_REGISTER rsp
1422 CFI_ADJUST_CFA_OFFSET -8
1423 decl %gs:pda_irqcount
1426 ENDPROC(call_softirq)
1428 KPROBE_ENTRY(ignore_sysret)
1433 ENDPROC(ignore_sysret)
/*
 * Xen hypervisor event-channel upcall entry point. No error code is
 * involved, so 'zeroentry' builds the frame and hands off to
 * xen_do_hypervisor_callback (defined below), which switches to the
 * interrupt stack and calls xen_evtchn_do_upcall.
 */
1436 ENTRY(xen_hypervisor_callback)
1437 zeroentry xen_do_hypervisor_callback
1438 END(xen_hypervisor_callback)
1441 # A note on the "critical region" in our callback handler.
1442 # We want to avoid stacking callback handlers due to events occurring
1443 # during handling of the last event. To do this, we keep events disabled
1444 # until we've done all processing. HOWEVER, we must enable events before
1445 # popping the stack frame (can't be done atomically) and so it would still
1446 # be possible to get enough handler activations to overflow the stack.
1447 # Although unlikely, bugs of that kind are hard to track down, so we'd
1448 # like to avoid the possibility.
1449 # So, on entry to the handler we detect whether we interrupted an
1450 # existing activation in its critical region -- if so, we pop the current
1451 # activation and restart the handler using the previous one.
1453 ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1455 /* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
1456 see the correct pointer to the pt_regs */
1457 movq %rdi, %rsp # we don't return, adjust the stack frame
1460 11: incl %gs:pda_irqcount
1462 CFI_DEF_CFA_REGISTER rbp
1463 cmovzq %gs:pda_irqstackptr,%rsp
1464 pushq %rbp # backlink for old unwinder
1465 call xen_evtchn_do_upcall
1467 CFI_DEF_CFA_REGISTER rsp
1468 decl %gs:pda_irqcount
1471 END(do_hypervisor_callback)
1474 # Hypervisor uses this for application faults while it executes.
1475 # We get here for two reasons:
1476 # 1. Fault while reloading DS, ES, FS or GS
1477 # 2. Fault while executing IRET
1478 # Category 1 we do not need to fix up as Xen has already reloaded all segment
1479 # registers that could be reloaded and zeroed the others.
1480 # Category 2 we fix up by killing the current process. We cannot use the
1481 # normal Linux return path in this case because if we use the IRET hypercall
1482 # to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1483 # We distinguish between categories by comparing each saved segment register
1484 # with its current contents: any discrepancy means we are in category 1.
1486 ENTRY(xen_failsafe_callback)
1487 framesz = (RIP-0x30) /* workaround buggy gas */
1489 CFI_REL_OFFSET rcx, 0
1490 CFI_REL_OFFSET r11, 8
1504 /* All segments match their saved values => Category 2 (Bad IRET). */
1510 CFI_ADJUST_CFA_OFFSET -0x30
1512 CFI_ADJUST_CFA_OFFSET 8
1514 CFI_ADJUST_CFA_OFFSET 8
1516 CFI_ADJUST_CFA_OFFSET 8
1517 jmp general_protection
1519 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1525 CFI_ADJUST_CFA_OFFSET -0x30
1527 CFI_ADJUST_CFA_OFFSET 8
1531 END(xen_failsafe_callback)
1533 #endif /* CONFIG_XEN */