Merge branch 'topic/core-cleanup' into for-linus
[safe/jmp/linux-2.6] / arch / x86 / kernel / kprobes.c
index 884d985..1658efd 100644 (file)
 #include <linux/preempt.h>
 #include <linux/module.h>
 #include <linux/kdebug.h>
+#include <linux/kallsyms.h>
+#include <linux/ftrace.h>
 
 #include <asm/cacheflush.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 #include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
 
 void jprobe_return_end(void);
 
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
-#ifdef CONFIG_X86_64
-#define stack_addr(regs) ((unsigned long *)regs->sp)
-#else
-/*
- * "&regs->sp" looks wrong, but it's correct for x86_32.  x86_32 CPUs
- * don't save the ss and esp registers if the CPU is already in kernel
- * mode when it traps.  So for kprobes, regs->sp and regs->ss are not
- * the [nonexistent] saved stack pointer and ss register, but rather
- * the top 8 bytes of the pre-int3 stack.  So &regs->sp happens to
- * point to the top of the pre-int3 stack.
- */
-#define stack_addr(regs) ((unsigned long *)&regs->sp)
-#endif
+#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
 
 #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
        (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
@@ -106,50 +98,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
        /*      -----------------------------------------------         */
        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
 };
-static const u32 onebyte_has_modrm[256 / 32] = {
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-       /*      -----------------------------------------------         */
-       W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
-       W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
-       W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
-       W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
-       W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
-       W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
-       W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
-       W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
-       W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
-       W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
-       W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
-       W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
-       W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
-       W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
-       W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
-       W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1)   /* f0 */
-       /*      -----------------------------------------------         */
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-};
-static const u32 twobyte_has_modrm[256 / 32] = {
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-       /*      -----------------------------------------------         */
-       W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
-       W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
-       W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
-       W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
-       W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
-       W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
-       W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
-       W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
-       W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
-       W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
-       W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
-       W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
-       W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
-       W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
-       W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
-       W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* ff */
-       /*      -----------------------------------------------         */
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-};
 #undef W
 
 struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -159,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
 };
 const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
 
-/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-static void __kprobes set_jmp_op(void *from, void *to)
+static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
 {
-       struct __arch_jmp_op {
-               char op;
+       struct __arch_relative_insn {
+               u8 op;
                s32 raddr;
-       } __attribute__((packed)) * jop;
-       jop = (struct __arch_jmp_op *)from;
-       jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
-       jop->op = RELATIVEJUMP_INSTRUCTION;
+       } __attribute__((packed)) *insn;
+
+       insn = (struct __arch_relative_insn *)from;
+       insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
+       insn->op = op;
+}
+
+/* Insert a jump instruction at address 'from', which jumps to address 'to'. */
+static void __kprobes synthesize_reljump(void *from, void *to)
+{
+       __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
 }
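
For reference, the rel32 operand of the synthesized 5-byte jmp/call is
measured from the end of the instruction. A minimal stand-alone sketch of
the arithmetic, with hypothetical addresses:

	/* Sketch of the displacement math in __synthesize_relative_insn(). */
	#include <stdint.h>

	static int32_t rel32(uintptr_t from, uintptr_t to)
	{
		/* rel32 counts from the byte after the 5-byte instruction */
		return (int32_t)(to - (from + 5));
	}
	/* rel32(0x1000, 0x1040) == 0x3b -> bytes e9 3b 00 00 00 (jmp) */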
 
 /*
@@ -193,6 +147,9 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes)
        kprobe_opcode_t opcode;
        kprobe_opcode_t *orig_opcodes = opcodes;
 
+       if (search_exception_tables((unsigned long)opcodes))
+               return 0;       /* Page fault may occur on this address. */
+
 retry:
        if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
                return 0;
@@ -241,6 +198,75 @@ retry:
        }
 }
 
+/* Recover the probed instruction at addr for further analysis. */
+static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+       struct kprobe *kp;
+       kp = get_kprobe((void *)addr);
+       if (!kp)
+               return -EINVAL;
+
+       /*
+        *  Basically, kp->ainsn.insn holds a copy of the original
+        *  instruction. However, a RIP-relative instruction cannot be
+        *  single-stepped at a different address, so __copy_instruction()
+        *  tweaks its displacement. In that case, we can't recover the
+        *  original instruction from kp->ainsn.insn.
+        *
+        *  On the other hand, kp->opcode holds a copy of the first byte
+        *  of the probed instruction, which int3 overwrote. Since kprobes
+        *  modifies nothing at kp->addr except that first byte, we can
+        *  recover the original instruction from kp->addr and kp->opcode.
+        */
+       memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+       buf[0] = kp->opcode;
+       return 0;
+}
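
As a worked example of the recovery, assume the probed instruction's first
byte was 0x55 (push %rbp):

	/*
	 * Memory at kp->addr:  cc 48 89 e5 ...   (int3 overwrote the 0x55)
	 * kp->opcode:          55
	 * After the memcpy() and the buf[0] = kp->opcode fixup, buf holds
	 * the original instruction stream again: 55 48 89 e5 ...
	 */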
+
+/* Dummy buffer for kallsyms_lookup */
+static char __dummy_buf[KSYM_NAME_LEN];
+
+/* Check if paddr is at an instruction boundary */
+static int __kprobes can_probe(unsigned long paddr)
+{
+       int ret;
+       unsigned long addr, offset = 0;
+       struct insn insn;
+       kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+       if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
+               return 0;
+
+       /* Decode instructions */
+       addr = paddr - offset;
+       while (addr < paddr) {
+               kernel_insn_init(&insn, (void *)addr);
+               insn_get_opcode(&insn);
+
+               /*
+                * Check if the instruction has been modified by another
+                * kprobe, in which case we replace the breakpoint by the
+                * original instruction in our buffer.
+                */
+               if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+                       ret = recover_probed_instruction(buf, addr);
+                       if (ret)
+                               /*
+                                * Another debugging subsystem might insert
+                                * this breakpoint. In that case, we can't
+                                * recover it.
+                                */
+                               return 0;
+                       kernel_insn_init(&insn, buf);
+               }
+               insn_get_length(&insn);
+               addr += insn.length;
+       }
+
+       return (addr == paddr);
+}
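
A worked example of the boundary check, for a hypothetical symbol at
0x1000 whose first instructions are 1, 3 and 5 bytes long:

	/*
	 * Decoding visits 0x1000, 0x1001, 0x1004, 0x1009.
	 * can_probe(0x1004): the loop stops with addr == paddr -> 1.
	 * can_probe(0x1002): the decoder steps over it to 0x1004 != paddr
	 * -> 0, since 0x1002 is inside the 3-byte instruction.
	 */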
+
 /*
  * Returns non-zero if opcode modifies the interrupt flag.
  */
@@ -265,86 +291,67 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
 }
 
 /*
- * Adjust the displacement if the instruction uses the %rip-relative
- * addressing mode.
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
- * If it does, Return the address of the 32-bit displacement word.
- * If not, return null.
- * Only applicable to 64-bit x86.
+ * Returns the length of the copied instruction, or 0 on failure.
+ * The displacement fixup only applies to 64-bit x86.
  */
-static void __kprobes fix_riprel(struct kprobe *p)
+static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
 {
-#ifdef CONFIG_X86_64
-       u8 *insn = p->ainsn.insn;
-       s64 disp;
-       int need_modrm;
-
-       /* Skip legacy instruction prefixes.  */
-       while (1) {
-               switch (*insn) {
-               case 0x66:
-               case 0x67:
-               case 0x2e:
-               case 0x3e:
-               case 0x26:
-               case 0x64:
-               case 0x65:
-               case 0x36:
-               case 0xf0:
-               case 0xf3:
-               case 0xf2:
-                       ++insn;
-                       continue;
+       struct insn insn;
+       int ret;
+       kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+       kernel_insn_init(&insn, src);
+       if (recover) {
+               insn_get_opcode(&insn);
+               if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+                       ret = recover_probed_instruction(buf,
+                                                        (unsigned long)src);
+                       if (ret)
+                               return 0;
+                       kernel_insn_init(&insn, buf);
                }
-               break;
        }
+       insn_get_length(&insn);
+       memcpy(dest, insn.kaddr, insn.length);
 
-       /* Skip REX instruction prefix.  */
-       if (is_REX_prefix(insn))
-               ++insn;
-
-       if (*insn == 0x0f) {
-               /* Two-byte opcode.  */
-               ++insn;
-               need_modrm = test_bit(*insn,
-                                     (unsigned long *)twobyte_has_modrm);
-       } else
-               /* One-byte opcode.  */
-               need_modrm = test_bit(*insn,
-                                     (unsigned long *)onebyte_has_modrm);
-
-       if (need_modrm) {
-               u8 modrm = *++insn;
-               if ((modrm & 0xc7) == 0x05) {
-                       /* %rip+disp32 addressing mode */
-                       /* Displacement follows ModRM byte.  */
-                       ++insn;
-                       /*
-                        * The copied instruction uses the %rip-relative
-                        * addressing mode.  Adjust the displacement for the
-                        * difference between the original location of this
-                        * instruction and the location of the copy that will
-                        * actually be run.  The tricky bit here is making sure
-                        * that the sign extension happens correctly in this
-                        * calculation, since we need a signed 32-bit result to
-                        * be sign-extended to 64 bits when it's added to the
-                        * %rip value and yield the same 64-bit result that the
-                        * sign-extension of the original signed 32-bit
-                        * displacement would have given.
-                        */
-                       disp = (u8 *) p->addr + *((s32 *) insn) -
-                              (u8 *) p->ainsn.insn;
-                       BUG_ON((s64) (s32) disp != disp); /* Sanity check.  */
-                       *(s32 *)insn = (s32) disp;
-               }
+#ifdef CONFIG_X86_64
+       if (insn_rip_relative(&insn)) {
+               s64 newdisp;
+               u8 *disp;
+               kernel_insn_init(&insn, dest);
+               insn_get_displacement(&insn);
+               /*
+                * The copied instruction uses the %rip-relative addressing
+                * mode.  Adjust the displacement for the difference between
+                * the original location of this instruction and the location
+                * of the copy that will actually be run.  The tricky bit here
+                * is making sure that the sign extension happens correctly in
+                * this calculation, since we need a signed 32-bit result to
+                * be sign-extended to 64 bits when it's added to the %rip
+                * value and yield the same 64-bit result that the sign-
+                * extension of the original signed 32-bit displacement would
+                * have given.
+                */
+               newdisp = (u8 *) src + (s64) insn.displacement.value -
+                         (u8 *) dest;
+               BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check.  */
+               disp = (u8 *) dest + insn_offset_displacement(&insn);
+               *(s32 *) disp = (s32) newdisp;
        }
 #endif
+       return insn.length;
 }
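
The displacement arithmetic can be sanity-checked with hypothetical
addresses; the instruction length cancels out because the displacement is
measured from the same offset within both copies:

	/*
	 * "mov 0x1234(%rip), %rax" at src references
	 *   target = src + insn.length + 0x1234
	 * so the copy at dest reaches the same target with
	 *   newdisp = src + 0x1234 - dest
	 * e.g. src = 0xffffffff81002000, dest = 0xffffffffa0004000:
	 *   newdisp = 0x1234 - 0x1f002000, still a valid s32.
	 * The BUG_ON() would fire only if the slot sat more than
	 * +/-2GB away from the probed address.
	 */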
 
 static void __kprobes arch_copy_kprobe(struct kprobe *p)
 {
-       memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-
-       fix_riprel(p);
+       /*
+        * Copy the instruction without recovering an int3: if one is
+        * present at p->addr, it was put there by another subsystem
+        * and must be preserved in the copy.
+        */
+       __copy_instruction(p->ainsn.insn, p->addr, 0);
 
        if (can_boost(p->addr))
                p->ainsn.boostable = 0;
@@ -356,6 +363,11 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
 
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
 {
+       if (alternatives_text_reserved(p->addr, p->addr))
+               return -EINVAL;
+
+       if (!can_probe((unsigned long)p->addr))
+               return -EILSEQ;
        /* insn: must be on special executable page on x86. */
        p->ainsn.insn = get_insn_slot();
        if (!p->ainsn.insn)
@@ -420,18 +432,6 @@ static void __kprobes restore_btf(void)
                update_debugctlmsr(current->thread.debugctlmsr);
 }
 
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
-{
-       clear_btf();
-       regs->flags |= X86_EFLAGS_TF;
-       regs->flags &= ~X86_EFLAGS_IF;
-       /* single step inline if the instruction is an int3 */
-       if (p->opcode == BREAKPOINT_INSTRUCTION)
-               regs->ip = (unsigned long)p->addr;
-       else
-               regs->ip = (unsigned long)p->ainsn.insn;
-}
-
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
                                      struct pt_regs *regs)
 {
@@ -443,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
        *sara = (unsigned long) &kretprobe_trampoline;
 }
 
+#ifdef CONFIG_OPTPROBES
+static int  __kprobes setup_detour_execution(struct kprobe *p,
+                                            struct pt_regs *regs,
+                                            int reenter);
+#else
+#define setup_detour_execution(p, regs, reenter) (0)
+#endif
+
 static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
-                                      struct kprobe_ctlblk *kcb)
+                                      struct kprobe_ctlblk *kcb, int reenter)
 {
-#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
+       if (setup_detour_execution(p, regs, reenter))
+               return;
+
+#if !defined(CONFIG_PREEMPT)
        if (p->ainsn.boostable == 1 && !p->post_handler) {
                /* Boost up -- we can execute copied instructions directly */
-               reset_current_kprobe();
+               if (!reenter)
+                       reset_current_kprobe();
+               /*
+                * Reentering a boosted probe neither resets nor sets
+                * current_kprobe, because the boosted path doesn't use
+                * single stepping.
+                */
                regs->ip = (unsigned long)p->ainsn.insn;
                preempt_enable_no_resched();
                return;
        }
 #endif
-       prepare_singlestep(p, regs);
-       kcb->kprobe_status = KPROBE_HIT_SS;
+       if (reenter) {
+               save_previous_kprobe(kcb);
+               set_current_kprobe(p, regs, kcb);
+               kcb->kprobe_status = KPROBE_REENTER;
+       } else
+               kcb->kprobe_status = KPROBE_HIT_SS;
+       /* Prepare real single stepping */
+       clear_btf();
+       regs->flags |= X86_EFLAGS_TF;
+       regs->flags &= ~X86_EFLAGS_IF;
+       /* single step inline if the instruction is an int3 */
+       if (p->opcode == BREAKPOINT_INSTRUCTION)
+               regs->ip = (unsigned long)p->addr;
+       else
+               regs->ip = (unsigned long)p->ainsn.insn;
 }
 
 /*
@@ -469,37 +499,21 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
 {
        switch (kcb->kprobe_status) {
        case KPROBE_HIT_SSDONE:
-#ifdef CONFIG_X86_64
-               /* TODO: Provide re-entrancy from post_kprobes_handler() and
-                * avoid exception stack corruption while single-stepping on
-                * the instruction of the new probe.
-                */
-               arch_disarm_kprobe(p);
-               regs->ip = (unsigned long)p->addr;
-               reset_current_kprobe();
-               preempt_enable_no_resched();
-               break;
-#endif
        case KPROBE_HIT_ACTIVE:
-               save_previous_kprobe(kcb);
-               set_current_kprobe(p, regs, kcb);
                kprobes_inc_nmissed_count(p);
-               prepare_singlestep(p, regs);
-               kcb->kprobe_status = KPROBE_REENTER;
+               setup_singlestep(p, regs, kcb, 1);
                break;
        case KPROBE_HIT_SS:
-               if (p == kprobe_running()) {
-                       regs->flags &= ~X86_EFLAGS_TF;
-                       regs->flags |= kcb->kprobe_saved_flags;
-                       return 0;
-               } else {
-                       /* A probe has been hit in the codepath leading up
-                        * to, or just after, single-stepping of a probed
-                        * instruction. This entire codepath should strictly
-                        * reside in .kprobes.text section. Raise a warning
-                        * to highlight this peculiar case.
-                        */
-               }
+               /* A probe has been hit in the codepath leading up to, or just
+                * after, single-stepping of a probed instruction. This entire
+                * codepath should strictly reside in the .kprobes.text section.
+                * Raise a BUG, or we'll continue in an endless reentering loop
+                * and eventually overflow the stack.
+                */
+               printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
+                      p->addr);
+               dump_kprobe(p);
+               BUG();
        default:
                /* impossible cases */
                WARN_ON(1);
@@ -511,7 +525,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
 
 /*
  * Interrupts are disabled on entry as trap3 is an interrupt gate and they
- * remain disabled thorough out this function.
+ * remain disabled throughout this function.
  */
 static int __kprobes kprobe_handler(struct pt_regs *regs)
 {
@@ -520,20 +534,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
        struct kprobe_ctlblk *kcb;
 
        addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
-       if (*addr != BREAKPOINT_INSTRUCTION) {
-               /*
-                * The breakpoint instruction was removed right
-                * after we hit it.  Another cpu has removed
-                * either a probepoint or a debugger breakpoint
-                * at this address.  In either case, no further
-                * handling of this interrupt is appropriate.
-                * Back up over the (now missing) int3 and run
-                * the original instruction.
-                */
-               regs->ip = (unsigned long)addr;
-               return 1;
-       }
-
        /*
         * We don't want to be preempted for the entire
         * duration of kprobe processing. We conditionally
@@ -562,13 +562,26 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
                         * more here.
                         */
                        if (!p->pre_handler || !p->pre_handler(p, regs))
-                               setup_singlestep(p, regs, kcb);
+                               setup_singlestep(p, regs, kcb, 0);
                        return 1;
                }
+       } else if (*addr != BREAKPOINT_INSTRUCTION) {
+               /*
+                * The breakpoint instruction was removed right
+                * after we hit it.  Another cpu has removed
+                * either a probepoint or a debugger breakpoint
+                * at this address.  In either case, no further
+                * handling of this interrupt is appropriate.
+                * Back up over the (now missing) int3 and run
+                * the original instruction.
+                */
+               regs->ip = (unsigned long)addr;
+               preempt_enable_no_resched();
+               return 1;
        } else if (kprobe_running()) {
                p = __get_cpu_var(current_kprobe);
                if (p->break_handler && p->break_handler(p, regs)) {
-                       setup_singlestep(p, regs, kcb);
+                       setup_singlestep(p, regs, kcb, 0);
                        return 1;
                }
        } /* else: not a kprobe fault; let the kernel handle it */
@@ -577,6 +590,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
        return 0;
 }
 
+#ifdef CONFIG_X86_64
+#define SAVE_REGS_STRING               \
+       /* Skip cs, ip, orig_ax. */     \
+       "       subq $24, %rsp\n"       \
+       "       pushq %rdi\n"           \
+       "       pushq %rsi\n"           \
+       "       pushq %rdx\n"           \
+       "       pushq %rcx\n"           \
+       "       pushq %rax\n"           \
+       "       pushq %r8\n"            \
+       "       pushq %r9\n"            \
+       "       pushq %r10\n"           \
+       "       pushq %r11\n"           \
+       "       pushq %rbx\n"           \
+       "       pushq %rbp\n"           \
+       "       pushq %r12\n"           \
+       "       pushq %r13\n"           \
+       "       pushq %r14\n"           \
+       "       pushq %r15\n"
+#define RESTORE_REGS_STRING            \
+       "       popq %r15\n"            \
+       "       popq %r14\n"            \
+       "       popq %r13\n"            \
+       "       popq %r12\n"            \
+       "       popq %rbp\n"            \
+       "       popq %rbx\n"            \
+       "       popq %r11\n"            \
+       "       popq %r10\n"            \
+       "       popq %r9\n"             \
+       "       popq %r8\n"             \
+       "       popq %rax\n"            \
+       "       popq %rcx\n"            \
+       "       popq %rdx\n"            \
+       "       popq %rsi\n"            \
+       "       popq %rdi\n"            \
+       /* Skip orig_ax, ip, cs */      \
+       "       addq $24, %rsp\n"
+#else
+#define SAVE_REGS_STRING               \
+       /* Skip cs, ip, orig_ax and gs. */      \
+       "       subl $16, %esp\n"       \
+       "       pushl %fs\n"            \
+       "       pushl %es\n"            \
+       "       pushl %ds\n"            \
+       "       pushl %eax\n"           \
+       "       pushl %ebp\n"           \
+       "       pushl %edi\n"           \
+       "       pushl %esi\n"           \
+       "       pushl %edx\n"           \
+       "       pushl %ecx\n"           \
+       "       pushl %ebx\n"
+#define RESTORE_REGS_STRING            \
+       "       popl %ebx\n"            \
+       "       popl %ecx\n"            \
+       "       popl %edx\n"            \
+       "       popl %esi\n"            \
+       "       popl %edi\n"            \
+       "       popl %ebp\n"            \
+       "       popl %eax\n"            \
+       /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here */\
+       "       addl $24, %esp\n"
+#endif
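
After "pushq %rsp; pushfq; SAVE_REGS_STRING" the 64-bit stack mirrors
struct pt_regs, which is where the magic offsets used below come from:

	/*
	 * r15 at 0(%rsp) ... rdi at 112, orig_ax at 120, ip at 128,
	 * cs at 136, flags at 144, sp at 152.  The kretprobe trampoline's
	 * "movq %rax, 152(%rsp)" therefore plants the real return address
	 * in the sp slot, the last value left on the stack when "ret" runs.
	 * On 32-bit the matching slots are cs at 52(%esp) and flags at
	 * 56(%esp), which is what the movl offsets below encode.
	 */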
+
 /*
  * When a retprobed function returns, this code saves registers and
 * calls trampoline_handler(), which in turn calls the kretprobe's handler.
@@ -590,81 +666,24 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
                        /* We don't bother saving the ss register */
                        "       pushq %rsp\n"
                        "       pushfq\n"
-                       /*
-                        * Skip cs, ip, orig_ax.
-                        * trampoline_handler() will plug in these values
-                        */
-                       "       subq $24, %rsp\n"
-                       "       pushq %rdi\n"
-                       "       pushq %rsi\n"
-                       "       pushq %rdx\n"
-                       "       pushq %rcx\n"
-                       "       pushq %rax\n"
-                       "       pushq %r8\n"
-                       "       pushq %r9\n"
-                       "       pushq %r10\n"
-                       "       pushq %r11\n"
-                       "       pushq %rbx\n"
-                       "       pushq %rbp\n"
-                       "       pushq %r12\n"
-                       "       pushq %r13\n"
-                       "       pushq %r14\n"
-                       "       pushq %r15\n"
+                       SAVE_REGS_STRING
                        "       movq %rsp, %rdi\n"
                        "       call trampoline_handler\n"
                        /* Replace saved sp with true return address. */
                        "       movq %rax, 152(%rsp)\n"
-                       "       popq %r15\n"
-                       "       popq %r14\n"
-                       "       popq %r13\n"
-                       "       popq %r12\n"
-                       "       popq %rbp\n"
-                       "       popq %rbx\n"
-                       "       popq %r11\n"
-                       "       popq %r10\n"
-                       "       popq %r9\n"
-                       "       popq %r8\n"
-                       "       popq %rax\n"
-                       "       popq %rcx\n"
-                       "       popq %rdx\n"
-                       "       popq %rsi\n"
-                       "       popq %rdi\n"
-                       /* Skip orig_ax, ip, cs */
-                       "       addq $24, %rsp\n"
+                       RESTORE_REGS_STRING
                        "       popfq\n"
 #else
                        "       pushf\n"
-                       /*
-                        * Skip cs, ip, orig_ax.
-                        * trampoline_handler() will plug in these values
-                        */
-                       "       subl $12, %esp\n"
-                       "       pushl %fs\n"
-                       "       pushl %ds\n"
-                       "       pushl %es\n"
-                       "       pushl %eax\n"
-                       "       pushl %ebp\n"
-                       "       pushl %edi\n"
-                       "       pushl %esi\n"
-                       "       pushl %edx\n"
-                       "       pushl %ecx\n"
-                       "       pushl %ebx\n"
+                       SAVE_REGS_STRING
                        "       movl %esp, %eax\n"
                        "       call trampoline_handler\n"
                        /* Move flags to cs */
-                       "       movl 52(%esp), %edx\n"
-                       "       movl %edx, 48(%esp)\n"
+                       "       movl 56(%esp), %edx\n"
+                       "       movl %edx, 52(%esp)\n"
                        /* Replace saved flags with true return address. */
-                       "       movl %eax, 52(%esp)\n"
-                       "       popl %ebx\n"
-                       "       popl %ecx\n"
-                       "       popl %edx\n"
-                       "       popl %esi\n"
-                       "       popl %edi\n"
-                       "       popl %ebp\n"
-                       "       popl %eax\n"
-                       /* Skip ip, orig_ax, es, ds, fs */
-                       "       addl $20, %esp\n"
+                       "       movl %eax, 56(%esp)\n"
+                       RESTORE_REGS_STRING
                        "       popf\n"
 #endif
                        "       ret\n");
@@ -688,6 +707,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
        regs->cs = __KERNEL_CS;
 #else
        regs->cs = __KERNEL_CS | get_kernel_rpl();
+       regs->gs = 0;
 #endif
        regs->ip = trampoline_address;
        regs->orig_ax = ~0UL;
@@ -831,8 +851,8 @@ static void __kprobes resume_execution(struct kprobe *p,
                         * These instructions can be executed directly,
                         * provided the jump goes back to the correct address.
                         */
-                       set_jmp_op((void *)regs->ip,
-                                  (void *)orig_ip + (regs->ip - copy_ip));
+                       synthesize_reljump((void *)regs->ip,
+                               (void *)orig_ip + (regs->ip - copy_ip));
                        p->ainsn.boostable = 1;
                } else {
                        p->ainsn.boostable = -1;
@@ -847,7 +867,7 @@ no_change:
 
 /*
  * Interrupts are disabled on entry as trap1 is an interrupt gate and they
- * remain disabled thoroughout this function.
+ * remain disabled throughout this function.
  */
 static int __kprobes post_kprobe_handler(struct pt_regs *regs)
 {
@@ -963,8 +983,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
                        ret = NOTIFY_STOP;
                break;
        case DIE_DEBUG:
-               if (post_kprobe_handler(args->regs))
+               if (post_kprobe_handler(args->regs)) {
+                       /*
+                        * Reset the BS bit in dr6 (pointed to by args->err)
+                        * to denote completion of processing.
+                        */
+                       (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
                        ret = NOTIFY_STOP;
+               }
                break;
        case DIE_GPF:
                /*
@@ -1053,6 +1079,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
        return 0;
 }
 
+
+#ifdef CONFIG_OPTPROBES
+
+/* Insert a call instruction at address 'from', which calls address 'to'. */
+static void __kprobes synthesize_relcall(void *from, void *to)
+{
+       __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
+}
+
+/* Insert a move instruction which loads a pointer into %eax/%rdi (1st arg). */
+static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
+                                         unsigned long val)
+{
+#ifdef CONFIG_X86_64
+       *addr++ = 0x48;
+       *addr++ = 0xbf;
+#else
+       *addr++ = 0xb8;
+#endif
+       *(unsigned long *)addr = val;
+}
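
At byte level this emits a move of an immediate pointer into the first
argument register; for example, with the hypothetical value
0xffffffff81234567:

	/*
	 * x86-64: 48 bf 67 45 23 81 ff ff ff ff   movabs $val, %rdi
	 * x86-32: b8 67 45 23 81                  mov $val, %eax
	 * (%eax carries arg1 on 32-bit because the kernel is built
	 * with -mregparm=3.)
	 */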
+
+void __kprobes kprobes_optinsn_template_holder(void)
+{
+       asm volatile (
+                       ".global optprobe_template_entry\n"
+                       "optprobe_template_entry: \n"
+#ifdef CONFIG_X86_64
+                       /* We don't bother saving the ss register */
+                       "       pushq %rsp\n"
+                       "       pushfq\n"
+                       SAVE_REGS_STRING
+                       "       movq %rsp, %rsi\n"
+                       ".global optprobe_template_val\n"
+                       "optprobe_template_val: \n"
+                       ASM_NOP5
+                       ASM_NOP5
+                       ".global optprobe_template_call\n"
+                       "optprobe_template_call: \n"
+                       ASM_NOP5
+                       /* Move flags to rsp */
+                       "       movq 144(%rsp), %rdx\n"
+                       "       movq %rdx, 152(%rsp)\n"
+                       RESTORE_REGS_STRING
+                       /* Skip flags entry */
+                       "       addq $8, %rsp\n"
+                       "       popfq\n"
+#else /* CONFIG_X86_32 */
+                       "       pushf\n"
+                       SAVE_REGS_STRING
+                       "       movl %esp, %edx\n"
+                       ".global optprobe_template_val\n"
+                       "optprobe_template_val: \n"
+                       ASM_NOP5
+                       ".global optprobe_template_call\n"
+                       "optprobe_template_call: \n"
+                       ASM_NOP5
+                       RESTORE_REGS_STRING
+                       "       addl $4, %esp\n"        /* skip cs */
+                       "       popf\n"
+#endif
+                       ".global optprobe_template_end\n"
+                       "optprobe_template_end: \n");
+}
+
+#define TMPL_MOVE_IDX \
+       ((long)&optprobe_template_val - (long)&optprobe_template_entry)
+#define TMPL_CALL_IDX \
+       ((long)&optprobe_template_call - (long)&optprobe_template_entry)
+#define TMPL_END_IDX \
+       ((long)&optprobe_template_end - (long)&optprobe_template_entry)
+
+#define INT3_SIZE sizeof(kprobe_opcode_t)
+
+/* Optimized kprobe callback function: called from optinsn */
+static void __kprobes optimized_callback(struct optimized_kprobe *op,
+                                        struct pt_regs *regs)
+{
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       preempt_disable();
+       if (kprobe_running()) {
+               kprobes_inc_nmissed_count(&op->kp);
+       } else {
+               /* Save skipped registers */
+#ifdef CONFIG_X86_64
+               regs->cs = __KERNEL_CS;
+#else
+               regs->cs = __KERNEL_CS | get_kernel_rpl();
+               regs->gs = 0;
+#endif
+               regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+               regs->orig_ax = ~0UL;
+
+               __get_cpu_var(current_kprobe) = &op->kp;
+               kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+               opt_pre_handler(&op->kp, regs);
+               __get_cpu_var(current_kprobe) = NULL;
+       }
+       preempt_enable_no_resched();
+}
+
+static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
+{
+       int len = 0, ret;
+
+       while (len < RELATIVEJUMP_SIZE) {
+               ret = __copy_instruction(dest + len, src + len, 1);
+               if (!ret || !can_boost(dest + len))
+                       return -EINVAL;
+               len += ret;
+       }
+       /* Check whether the address range is reserved */
+       if (ftrace_text_reserved(src, src + len - 1) ||
+           alternatives_text_reserved(src, src + len - 1))
+               return -EBUSY;
+
+       return len;
+}
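
Copying stops only on an instruction boundary, so slightly more than
RELATIVEJUMP_SIZE (5 bytes: opcode plus rel32) may be relocated. For a
function starting with 1-, 3- and 4-byte instructions, for instance:

	/*
	 * push %rbp (1) + mov %rsp,%rbp (3) -> len = 4 < 5, keep going
	 * sub $0x10,%rsp (4)                -> len = 8 >= RELATIVEJUMP_SIZE
	 * All 8 bytes are copied and the detour's jump back is placed
	 * after them, at kp.addr + 8.
	 */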
+
+/* Check whether insn is an indirect jump */
+static int __kprobes insn_is_indirect_jump(struct insn *insn)
+{
+       return ((insn->opcode.bytes[0] == 0xff &&
+               (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
+               insn->opcode.bytes[0] == 0xea); /* Segment based jump */
+}
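
The ModRM reg field selects the operation for opcode 0xff: /4 is the near
indirect jmp and /5 the far indirect jmp, and "& 6 == 4" matches exactly
those two encodings (100b and 101b). Example byte patterns:

	/*
	 * ff e0       jmp *%rax            (modrm 0xe0: reg = 4)
	 * ff 25 ...   jmp *disp32(%rip)    (modrm 0x25: reg = 4)
	 * ea ...      ljmp $seg, $off      (direct far jump)
	 */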
+
+/* Check whether insn jumps into specified address range */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+       unsigned long target = 0;
+
+       switch (insn->opcode.bytes[0]) {
+       case 0xe0:      /* loopne */
+       case 0xe1:      /* loope */
+       case 0xe2:      /* loop */
+       case 0xe3:      /* jcxz */
+       case 0xe9:      /* near relative jump */
+       case 0xeb:      /* short relative jump */
+               break;
+       case 0x0f:
+               if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
+                       break;
+               return 0;
+       default:
+               if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
+                       break;
+               return 0;
+       }
+       target = (unsigned long)insn->next_byte + insn->immediate.value;
+
+       return (start <= target && target <= start + len);
+}
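
A worked example of the range test: for a short jump "eb 03" at address A,
next_byte is A + 2 and immediate.value is 3, so target = A + 5. The caller
below passes start = paddr + INT3_SIZE and len = RELATIVE_ADDR_SIZE, i.e.
the four displacement bytes the planned reljump will occupy:

	/*
	 * Targets in [paddr + 1, paddr + 5] make this return 1 and
	 * veto the optimization, since the branch would land inside
	 * the displacement of the jump that replaces those bytes.
	 */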
+
+/* Decode the whole function to ensure no instruction jumps into the target */
+static int __kprobes can_optimize(unsigned long paddr)
+{
+       int ret;
+       unsigned long addr, size = 0, offset = 0;
+       struct insn insn;
+       kprobe_opcode_t buf[MAX_INSN_SIZE];
+       /* Dummy buffer for kallsyms_lookup */
+       static char __dummy_buf[KSYM_NAME_LEN];
+
+       /* Lookup symbol including addr */
+       if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
+               return 0;
+
+       /* Check there is enough space for a relative jump. */
+       if (size - offset < RELATIVEJUMP_SIZE)
+               return 0;
+
+       /* Decode instructions */
+       addr = paddr - offset;
+       while (addr < paddr - offset + size) { /* Decode until function end */
+               if (search_exception_tables(addr))
+                       /*
+                        * Since some fixup code may jump into this
+                        * function, we can't optimize a kprobe in it.
+                        */
+                       return 0;
+               kernel_insn_init(&insn, (void *)addr);
+               insn_get_opcode(&insn);
+               if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+                       ret = recover_probed_instruction(buf, addr);
+                       if (ret)
+                               return 0;
+                       kernel_insn_init(&insn, buf);
+               }
+               insn_get_length(&insn);
+               /* Recover the real addresses (insn may have been decoded from buf) */
+               insn.kaddr = (void *)addr;
+               insn.next_byte = (void *)(addr + insn.length);
+               /* Check that this instruction doesn't jump into the target */
+               if (insn_is_indirect_jump(&insn) ||
+                   insn_jump_into_range(&insn, paddr + INT3_SIZE,
+                                        RELATIVE_ADDR_SIZE))
+                       return 0;
+               addr += insn.length;
+       }
+
+       return 1;
+}
+
+/* Check optimized_kprobe can actually be optimized. */
+int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+       int i;
+       struct kprobe *p;
+
+       for (i = 1; i < op->optinsn.size; i++) {
+               p = get_kprobe(op->kp.addr + i);
+               if (p && !kprobe_disabled(p))
+                       return -EEXIST;
+       }
+
+       return 0;
+}
+
+/* Check the addr is within the optimized instructions. */
+int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
+                                          unsigned long addr)
+{
+       return ((unsigned long)op->kp.addr <= addr &&
+               (unsigned long)op->kp.addr + op->optinsn.size > addr);
+}
+
+/* Free optimized instruction slot */
+static __kprobes
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+       if (op->optinsn.insn) {
+               free_optinsn_slot(op->optinsn.insn, dirty);
+               op->optinsn.insn = NULL;
+               op->optinsn.size = 0;
+       }
+}
+
+void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+       __arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy the instructions that the detour jump will replace.
+ * Those instructions MUST be relocatable (checked inside).
+ */
+int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+{
+       u8 *buf;
+       int ret;
+       long rel;
+
+       if (!can_optimize((unsigned long)op->kp.addr))
+               return -EILSEQ;
+
+       op->optinsn.insn = get_optinsn_slot();
+       if (!op->optinsn.insn)
+               return -ENOMEM;
+
+       /*
+        * Verify that the address gap is within the +/-2GB range of
+        * a relative jump.
+        */
+       rel = (long)op->optinsn.insn -
+             ((long)op->kp.addr + RELATIVEJUMP_SIZE);
+       if (abs(rel) > 0x7fffffff)
+               return -ERANGE;
+
+       buf = (u8 *)op->optinsn.insn;
+
+       /* Copy instructions into the out-of-line buffer */
+       ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
+       if (ret < 0) {
+               __arch_remove_optimized_kprobe(op, 0);
+               return ret;
+       }
+       op->optinsn.size = ret;
+
+       /* Copy arch-dep-instance from template */
+       memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
+
+       /* Set probe information */
+       synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+       /* Set probe function call */
+       synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
+
+       /* Set returning jmp instruction at the tail of out-of-line buffer */
+       synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
+                          (u8 *)op->kp.addr + op->optinsn.size);
+
+       flush_icache_range((unsigned long) buf,
+                          (unsigned long) buf + TMPL_END_IDX +
+                          op->optinsn.size + RELATIVEJUMP_SIZE);
+       return 0;
+}
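
The resulting out-of-line buffer, in terms of the TMPL_*_IDX offsets
computed above:

	/*
	 * buf + 0               template head: save registers
	 * buf + TMPL_MOVE_IDX   mov $op, %rdi/%eax  (synthesize_set_arg1)
	 * buf + TMPL_CALL_IDX   call optimized_callback (synthesize_relcall)
	 *                       ... restore registers (rest of template) ...
	 * buf + TMPL_END_IDX    copied original instructions (optinsn.size)
	 * buf + TMPL_END_IDX + optinsn.size
	 *                       reljump back to kp.addr + optinsn.size
	 */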
+
+/* Replace a breakpoint (int3) with a relative jump.  */
+int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
+{
+       unsigned char jmp_code[RELATIVEJUMP_SIZE];
+       s32 rel = (s32)((long)op->optinsn.insn -
+                       ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
+       /* Back up the instructions which will be replaced by the jump address */
+       memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
+              RELATIVE_ADDR_SIZE);
+
+       jmp_code[0] = RELATIVEJUMP_OPCODE;
+       *(s32 *)(&jmp_code[1]) = rel;
+
+       /*
+        * text_poke_smp doesn't support modifying code in NMI/MCE
+        * handlers. However, since kprobes itself also doesn't support
+        * probing NMI/MCE code, this is not a problem.
+        */
+       text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
+       return 0;
+}
+
+/* Replace a relative jump with a breakpoint (int3).  */
+void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+       u8 buf[RELATIVEJUMP_SIZE];
+
+       /* Set int3 to first byte for kprobes */
+       buf[0] = BREAKPOINT_INSTRUCTION;
+       memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+       text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
+}
+
+static int  __kprobes setup_detour_execution(struct kprobe *p,
+                                            struct pt_regs *regs,
+                                            int reenter)
+{
+       struct optimized_kprobe *op;
+
+       if (p->flags & KPROBE_FLAG_OPTIMIZED) {
+               /* This kprobe can really run the optimized path. */
+               op = container_of(p, struct optimized_kprobe, kp);
+               /* Detour through copied instructions */
+               regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+               if (!reenter)
+                       reset_current_kprobe();
+               preempt_enable_no_resched();
+               return 1;
+       }
+       return 0;
+}
+#endif
+
 int __init arch_init_kprobes(void)
 {
        return 0;