X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=arch%2Fx86%2Fkernel%2Fkprobes.c;h=1658efdfb4e5a617b8919de37ca7ca61c4a4306f;hb=3374cd1abd478f767aaedf2c21d109596ff0fe72;hp=884d985b8b8223a0df4c566b29ed0d47204e8453;hpb=57c44c5f6fb0a8002feb258c1af58e1a744b1fcb;p=safe%2Fjmp%2Flinux-2.6 diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 884d985..1658efd 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -48,31 +48,23 @@ #include #include #include +#include +#include #include #include #include #include #include +#include +#include void jprobe_return_end(void); DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#ifdef CONFIG_X86_64 -#define stack_addr(regs) ((unsigned long *)regs->sp) -#else -/* - * "®s->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs - * don't save the ss and esp registers if the CPU is already in kernel - * mode when it traps. So for kprobes, regs->sp and regs->ss are not - * the [nonexistent] saved stack pointer and ss register, but rather - * the top 8 bytes of the pre-int3 stack. So ®s->sp happens to - * point to the top of the pre-int3 stack. - */ -#define stack_addr(regs) ((unsigned long *)®s->sp) -#endif +#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs)) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -106,50 +98,6 @@ static const u32 twobyte_is_boostable[256 / 32] = { /* ----------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -static const u32 onebyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */ - W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ - W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ - W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */ - W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */ - W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */ - W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */ - W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */ - W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; -static const u32 twobyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */ - W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */ - W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */ - W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */ - W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */ - W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */ - W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */ - W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */ - W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */ - W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */ - W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */ - W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; #undef W struct kretprobe_blackpoint kretprobe_blacklist[] = { @@ -159,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { }; const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); -/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -static void __kprobes set_jmp_op(void *from, void *to) +static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) { - struct __arch_jmp_op { - char op; + struct __arch_relative_insn { + u8 op; s32 raddr; - } __attribute__((packed)) * jop; - jop = (struct __arch_jmp_op *)from; - jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); - jop->op = RELATIVEJUMP_INSTRUCTION; + } __attribute__((packed)) *insn; + + insn = (struct __arch_relative_insn *)from; + insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); + insn->op = op; +} + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +static void __kprobes synthesize_reljump(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); } /* @@ -193,6 +147,9 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes) kprobe_opcode_t opcode; kprobe_opcode_t *orig_opcodes = opcodes; + if (search_exception_tables((unsigned long)opcodes)) + return 0; /* Page fault may occur on this address. */ + retry: if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) return 0; @@ -241,6 +198,75 @@ retry: } } +/* Recover the probed instruction at addr for further analysis. */ +static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +{ + struct kprobe *kp; + kp = get_kprobe((void *)addr); + if (!kp) + return -EINVAL; + + /* + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, __copy_instruction() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. + * + * On the other hand, kp->opcode has a copy of the first byte of + * the probed instruction, which is overwritten by int3. And + * the instruction at kp->addr is not modified by kprobes except + * for the first byte, we can recover the original instruction + * from it and kp->opcode. + */ + memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + buf[0] = kp->opcode; + return 0; +} + +/* Dummy buffers for kallsyms_lookup */ +static char __dummy_buf[KSYM_NAME_LEN]; + +/* Check if paddr is at an instruction boundary */ +static int __kprobes can_probe(unsigned long paddr) +{ + int ret; + unsigned long addr, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + + /* + * Check if the instruction has been modified by another + * kprobe, in which case we replace the breakpoint by the + * original instruction in our buffer. + */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + /* + * Another debugging subsystem might insert + * this breakpoint. In that case, we can't + * recover it. + */ + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + addr += insn.length; + } + + return (addr == paddr); +} + /* * Returns non-zero if opcode modifies the interrupt flag. */ @@ -265,86 +291,67 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) } /* - * Adjust the displacement if the instruction uses the %rip-relative - * addressing mode. + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. * If it does, Return the address of the 32-bit displacement word. * If not, return null. * Only applicable to 64-bit x86. */ -static void __kprobes fix_riprel(struct kprobe *p) +static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) { -#ifdef CONFIG_X86_64 - u8 *insn = p->ainsn.insn; - s64 disp; - int need_modrm; - - /* Skip legacy instruction prefixes. */ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; + struct insn insn; + int ret; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + kernel_insn_init(&insn, src); + if (recover) { + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, + (unsigned long)src); + if (ret) + return 0; + kernel_insn_init(&insn, buf); } - break; } + insn_get_length(&insn); + memcpy(dest, insn.kaddr, insn.length); - /* Skip REX instruction prefix. */ - if (is_REX_prefix(insn)) - ++insn; - - if (*insn == 0x0f) { - /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, - (unsigned long *)twobyte_has_modrm); - } else - /* One-byte opcode. */ - need_modrm = test_bit(*insn, - (unsigned long *)onebyte_has_modrm); - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { - /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - ++insn; - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - disp = (u8 *) p->addr + *((s32 *) insn) - - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ - *(s32 *)insn = (s32) disp; - } +#ifdef CONFIG_X86_64 + if (insn_rip_relative(&insn)) { + s64 newdisp; + u8 *disp; + kernel_insn_init(&insn, dest); + insn_get_displacement(&insn); + /* + * The copied instruction uses the %rip-relative addressing + * mode. Adjust the displacement for the difference between + * the original location of this instruction and the location + * of the copy that will actually be run. The tricky bit here + * is making sure that the sign extension happens correctly in + * this calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the %rip + * value and yield the same 64-bit result that the sign- + * extension of the original signed 32-bit displacement would + * have given. + */ + newdisp = (u8 *) src + (s64) insn.displacement.value - + (u8 *) dest; + BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ + disp = (u8 *) dest + insn_offset_displacement(&insn); + *(s32 *) disp = (s32) newdisp; } #endif + return insn.length; } static void __kprobes arch_copy_kprobe(struct kprobe *p) { - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - - fix_riprel(p); + /* + * Copy an instruction without recovering int3, because it will be + * put by another subsystem. + */ + __copy_instruction(p->ainsn.insn, p->addr, 0); if (can_boost(p->addr)) p->ainsn.boostable = 0; @@ -356,6 +363,11 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (alternatives_text_reserved(p->addr, p->addr)) + return -EINVAL; + + if (!can_probe((unsigned long)p->addr)) + return -EILSEQ; /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) @@ -420,18 +432,6 @@ static void __kprobes restore_btf(void) update_debugctlmsr(current->thread.debugctlmsr); } -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - clear_btf(); - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; - /* single step inline if the instruction is an int3 */ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->ip = (unsigned long)p->addr; - else - regs->ip = (unsigned long)p->ainsn.insn; -} - void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -443,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, *sara = (unsigned long) &kretprobe_trampoline; } +#ifdef CONFIG_OPTPROBES +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter); +#else +#define setup_detour_execution(p, regs, reenter) (0) +#endif + static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) + struct kprobe_ctlblk *kcb, int reenter) { -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) + if (setup_detour_execution(p, regs, reenter)) + return; + +#if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ - reset_current_kprobe(); + if (!reenter) + reset_current_kprobe(); + /* + * Reentering boosted probe doesn't reset current_kprobe, + * nor set current_kprobe, because it doesn't use single + * stepping. + */ regs->ip = (unsigned long)p->ainsn.insn; preempt_enable_no_resched(); return; } #endif - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_REENTER; + } else + kcb->kprobe_status = KPROBE_HIT_SS; + /* Prepare real single stepping */ + clear_btf(); + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + /* single step inline if the instruction is an int3 */ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->ip = (unsigned long)p->addr; + else + regs->ip = (unsigned long)p->ainsn.insn; } /* @@ -469,37 +499,21 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, { switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: -#ifdef CONFIG_X86_64 - /* TODO: Provide re-entrancy from post_kprobes_handler() and - * avoid exception stack corruption while single-stepping on - * the instruction of the new probe. - */ - arch_disarm_kprobe(p); - regs->ip = (unsigned long)p->addr; - reset_current_kprobe(); - preempt_enable_no_resched(); - break; -#endif case KPROBE_HIT_ACTIVE: - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; + setup_singlestep(p, regs, kcb, 1); break; case KPROBE_HIT_SS: - if (p == kprobe_running()) { - regs->flags &= ~X86_EFLAGS_TF; - regs->flags |= kcb->kprobe_saved_flags; - return 0; - } else { - /* A probe has been hit in the codepath leading up - * to, or just after, single-stepping of a probed - * instruction. This entire codepath should strictly - * reside in .kprobes.text section. Raise a warning - * to highlight this peculiar case. - */ - } + /* A probe has been hit in the codepath leading up to, or just + * after, single-stepping of a probed instruction. This entire + * codepath should strictly reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless reentering loop + * and eventually a stack overflow. + */ + printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", + p->addr); + dump_kprobe(p); + BUG(); default: /* impossible cases */ WARN_ON(1); @@ -511,7 +525,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they - * remain disabled thorough out this function. + * remain disabled throughout this function. */ static int __kprobes kprobe_handler(struct pt_regs *regs) { @@ -520,20 +534,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) struct kprobe_ctlblk *kcb; addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); - if (*addr != BREAKPOINT_INSTRUCTION) { - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - * Back up over the (now missing) int3 and run - * the original instruction. - */ - regs->ip = (unsigned long)addr; - return 1; - } - /* * We don't want to be preempted for the entire * duration of kprobe processing. We conditionally @@ -562,13 +562,26 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) * more here. */ if (!p->pre_handler || !p->pre_handler(p, regs)) - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } + } else if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + * Back up over the (now missing) int3 and run + * the original instruction. + */ + regs->ip = (unsigned long)addr; + preempt_enable_no_resched(); + return 1; } else if (kprobe_running()) { p = __get_cpu_var(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } } /* else: not a kprobe fault; let the kernel handle it */ @@ -577,6 +590,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) return 0; } +#ifdef CONFIG_X86_64 +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. */ \ + " subq $24, %rsp\n" \ + " pushq %rdi\n" \ + " pushq %rsi\n" \ + " pushq %rdx\n" \ + " pushq %rcx\n" \ + " pushq %rax\n" \ + " pushq %r8\n" \ + " pushq %r9\n" \ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ + " pushq %rbp\n" \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ + " pushq %r15\n" +#define RESTORE_REGS_STRING \ + " popq %r15\n" \ + " popq %r14\n" \ + " popq %r13\n" \ + " popq %r12\n" \ + " popq %rbp\n" \ + " popq %rbx\n" \ + " popq %r11\n" \ + " popq %r10\n" \ + " popq %r9\n" \ + " popq %r8\n" \ + " popq %rax\n" \ + " popq %rcx\n" \ + " popq %rdx\n" \ + " popq %rsi\n" \ + " popq %rdi\n" \ + /* Skip orig_ax, ip, cs */ \ + " addq $24, %rsp\n" +#else +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax and gs. */ \ + " subl $16, %esp\n" \ + " pushl %fs\n" \ + " pushl %ds\n" \ + " pushl %es\n" \ + " pushl %eax\n" \ + " pushl %ebp\n" \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ + " pushl %ecx\n" \ + " pushl %ebx\n" +#define RESTORE_REGS_STRING \ + " popl %ebx\n" \ + " popl %ecx\n" \ + " popl %edx\n" \ + " popl %esi\n" \ + " popl %edi\n" \ + " popl %ebp\n" \ + " popl %eax\n" \ + /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ + " addl $24, %esp\n" +#endif + /* * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. @@ -590,81 +666,24 @@ static void __used __kprobes kretprobe_trampoline_holder(void) /* We don't bother saving the ss register */ " pushq %rsp\n" " pushfq\n" - /* - * Skip cs, ip, orig_ax. - * trampoline_handler() will plug in these values - */ - " subq $24, %rsp\n" - " pushq %rdi\n" - " pushq %rsi\n" - " pushq %rdx\n" - " pushq %rcx\n" - " pushq %rax\n" - " pushq %r8\n" - " pushq %r9\n" - " pushq %r10\n" - " pushq %r11\n" - " pushq %rbx\n" - " pushq %rbp\n" - " pushq %r12\n" - " pushq %r13\n" - " pushq %r14\n" - " pushq %r15\n" + SAVE_REGS_STRING " movq %rsp, %rdi\n" " call trampoline_handler\n" /* Replace saved sp with true return address. */ " movq %rax, 152(%rsp)\n" - " popq %r15\n" - " popq %r14\n" - " popq %r13\n" - " popq %r12\n" - " popq %rbp\n" - " popq %rbx\n" - " popq %r11\n" - " popq %r10\n" - " popq %r9\n" - " popq %r8\n" - " popq %rax\n" - " popq %rcx\n" - " popq %rdx\n" - " popq %rsi\n" - " popq %rdi\n" - /* Skip orig_ax, ip, cs */ - " addq $24, %rsp\n" + RESTORE_REGS_STRING " popfq\n" #else " pushf\n" - /* - * Skip cs, ip, orig_ax. - * trampoline_handler() will plug in these values - */ - " subl $12, %esp\n" - " pushl %fs\n" - " pushl %ds\n" - " pushl %es\n" - " pushl %eax\n" - " pushl %ebp\n" - " pushl %edi\n" - " pushl %esi\n" - " pushl %edx\n" - " pushl %ecx\n" - " pushl %ebx\n" + SAVE_REGS_STRING " movl %esp, %eax\n" " call trampoline_handler\n" /* Move flags to cs */ - " movl 52(%esp), %edx\n" - " movl %edx, 48(%esp)\n" + " movl 56(%esp), %edx\n" + " movl %edx, 52(%esp)\n" /* Replace saved flags with true return address. */ - " movl %eax, 52(%esp)\n" - " popl %ebx\n" - " popl %ecx\n" - " popl %edx\n" - " popl %esi\n" - " popl %edi\n" - " popl %ebp\n" - " popl %eax\n" - /* Skip ip, orig_ax, es, ds, fs */ - " addl $20, %esp\n" + " movl %eax, 56(%esp)\n" + RESTORE_REGS_STRING " popf\n" #endif " ret\n"); @@ -688,6 +707,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) regs->cs = __KERNEL_CS; #else regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; #endif regs->ip = trampoline_address; regs->orig_ax = ~0UL; @@ -831,8 +851,8 @@ static void __kprobes resume_execution(struct kprobe *p, * These instructions can be executed directly if it * jumps back to correct address. */ - set_jmp_op((void *)regs->ip, - (void *)orig_ip + (regs->ip - copy_ip)); + synthesize_reljump((void *)regs->ip, + (void *)orig_ip + (regs->ip - copy_ip)); p->ainsn.boostable = 1; } else { p->ainsn.boostable = -1; @@ -847,7 +867,7 @@ no_change: /* * Interrupts are disabled on entry as trap1 is an interrupt gate and they - * remain disabled thoroughout this function. + * remain disabled throughout this function. */ static int __kprobes post_kprobe_handler(struct pt_regs *regs) { @@ -963,8 +983,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ret = NOTIFY_STOP; break; case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) + if (post_kprobe_handler(args->regs)) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; ret = NOTIFY_STOP; + } break; case DIE_GPF: /* @@ -1053,6 +1079,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } + +#ifdef CONFIG_OPTPROBES + +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +static void __kprobes synthesize_relcall(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ +static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, + unsigned long val) +{ +#ifdef CONFIG_X86_64 + *addr++ = 0x48; + *addr++ = 0xbf; +#else + *addr++ = 0xb8; +#endif + *(unsigned long *)addr = val; +} + +void __kprobes kprobes_optinsn_template_holder(void) +{ + asm volatile ( + ".global optprobe_template_entry\n" + "optprobe_template_entry: \n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rsi\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + /* Move flags to rsp */ + " movq 144(%rsp), %rdx\n" + " movq %rdx, 152(%rsp)\n" + RESTORE_REGS_STRING + /* Skip flags entry */ + " addq $8, %rsp\n" + " popfq\n" +#else /* CONFIG_X86_32 */ + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %edx\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + RESTORE_REGS_STRING + " addl $4, %esp\n" /* skip cs */ + " popf\n" +#endif + ".global optprobe_template_end\n" + "optprobe_template_end: \n"); +} + +#define TMPL_MOVE_IDX \ + ((long)&optprobe_template_val - (long)&optprobe_template_entry) +#define TMPL_CALL_IDX \ + ((long)&optprobe_template_call - (long)&optprobe_template_entry) +#define TMPL_END_IDX \ + ((long)&optprobe_template_end - (long)&optprobe_template_entry) + +#define INT3_SIZE sizeof(kprobe_opcode_t) + +/* Optimized kprobe call back function: called from optinsn */ +static void __kprobes optimized_callback(struct optimized_kprobe *op, + struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + preempt_disable(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + /* Save skipped registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; +#endif + regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->orig_ax = ~0UL; + + __get_cpu_var(current_kprobe) = &op->kp; + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __get_cpu_var(current_kprobe) = NULL; + } + preempt_enable_no_resched(); +} + +static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +{ + int len = 0, ret; + + while (len < RELATIVEJUMP_SIZE) { + ret = __copy_instruction(dest + len, src + len, 1); + if (!ret || !can_boost(dest + len)) + return -EINVAL; + len += ret; + } + /* Check whether the address range is reserved */ + if (ftrace_text_reserved(src, src + len - 1) || + alternatives_text_reserved(src, src + len - 1)) + return -EBUSY; + + return len; +} + +/* Check whether insn is indirect jump */ +static int __kprobes insn_is_indirect_jump(struct insn *insn) +{ + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ + insn->opcode.bytes[0] == 0xea); /* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ + unsigned long target = 0; + + switch (insn->opcode.bytes[0]) { + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + case 0xe3: /* jcxz */ + case 0xe9: /* near relative jump */ + case 0xeb: /* short relative jump */ + break; + case 0x0f: + if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ + break; + return 0; + default: + if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ + break; + return 0; + } + target = (unsigned long)insn->next_byte + insn->immediate.value; + + return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int __kprobes can_optimize(unsigned long paddr) +{ + int ret; + unsigned long addr, size = 0, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + /* Dummy buffers for lookup_symbol_attrs */ + static char __dummy_buf[KSYM_NAME_LEN]; + + /* Lookup symbol including addr */ + if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) + return 0; + + /* Check there is enough space for a relative jump. */ + if (size - offset < RELATIVEJUMP_SIZE) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr - offset + size) { /* Decode until function end */ + if (search_exception_tables(addr)) + /* + * Since some fixup code will jumps into this function, + * we can't optimize kprobe in this function. + */ + return 0; + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + /* Recover address */ + insn.kaddr = (void *)addr; + insn.next_byte = (void *)(addr + insn.length); + /* Check any instructions don't jump into target */ + if (insn_is_indirect_jump(&insn) || + insn_jump_into_range(&insn, paddr + INT3_SIZE, + RELATIVE_ADDR_SIZE)) + return 0; + addr += insn.length; + } + + return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + int i; + struct kprobe *p; + + for (i = 1; i < op->optinsn.size; i++) { + p = get_kprobe(op->kp.addr + i); + if (p && !kprobe_disabled(p)) + return -EEXIST; + } + + return 0; +} + +/* Check the addr is within the optimized instructions. */ +int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, + unsigned long addr) +{ + return ((unsigned long)op->kp.addr <= addr && + (unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static __kprobes +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ + if (op->optinsn.insn) { + free_optinsn_slot(op->optinsn.insn, dirty); + op->optinsn.insn = NULL; + op->optinsn.size = 0; + } +} + +void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + __arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + */ +int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +{ + u8 *buf; + int ret; + long rel; + + if (!can_optimize((unsigned long)op->kp.addr)) + return -EILSEQ; + + op->optinsn.insn = get_optinsn_slot(); + if (!op->optinsn.insn) + return -ENOMEM; + + /* + * Verify if the address gap is in 2GB range, because this uses + * a relative jump. + */ + rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + if (abs(rel) > 0x7fffffff) + return -ERANGE; + + buf = (u8 *)op->optinsn.insn; + + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); + if (ret < 0) { + __arch_remove_optimized_kprobe(op, 0); + return ret; + } + op->optinsn.size = ret; + + /* Copy arch-dep-instance from template */ + memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + + /* Set probe information */ + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + + /* Set probe function call */ + synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + + /* Set returning jmp instruction at the tail of out-of-line buffer */ + synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + (u8 *)op->kp.addr + op->optinsn.size); + + flush_icache_range((unsigned long) buf, + (unsigned long) buf + TMPL_END_IDX + + op->optinsn.size + RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a breakpoint (int3) with a relative jump. */ +int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) +{ + unsigned char jmp_code[RELATIVEJUMP_SIZE]; + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + jmp_code[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&jmp_code[1]) = rel; + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a relative jump with a breakpoint (int3). */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + u8 buf[RELATIVEJUMP_SIZE]; + + /* Set int3 to first byte for kprobes */ + buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); +} + +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter) +{ + struct optimized_kprobe *op; + + if (p->flags & KPROBE_FLAG_OPTIMIZED) { + /* This kprobe is really able to run optimized path. */ + op = container_of(p, struct optimized_kprobe, kp); + /* Detour through copied instructions */ + regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; + if (!reenter) + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + return 0; +} +#endif + int __init arch_init_kprobes(void) { return 0;