Merge branch 'for-next' into for-linus
[safe/jmp/linux-2.6] / arch / x86 / kernel / ptrace.c
index 7ec39ab..a503b1f 100644 (file)
@@ -21,6 +21,9 @@
 #include <linux/audit.h>
 #include <linux/seccomp.h>
 #include <linux/signal.h>
+#include <linux/workqueue.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/prctl.h>
 #include <asm/proto.h>
 #include <asm/ds.h>
+#include <asm/hw_breakpoint.h>
 
 #include "tls.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
 enum x86_regset {
        REGSET_GENERAL,
        REGSET_FP,
        REGSET_XFP,
        REGSET_IOPERM64 = REGSET_XFP,
+       REGSET_XSTATE,
        REGSET_TLS,
        REGSET_IOPERM32,
 };
 
+struct pt_regs_offset {
+       const char *name;
+       int offset;
+};
+
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+
+static const struct pt_regs_offset regoffset_table[] = {
+#ifdef CONFIG_X86_64
+       REG_OFFSET_NAME(r15),
+       REG_OFFSET_NAME(r14),
+       REG_OFFSET_NAME(r13),
+       REG_OFFSET_NAME(r12),
+       REG_OFFSET_NAME(r11),
+       REG_OFFSET_NAME(r10),
+       REG_OFFSET_NAME(r9),
+       REG_OFFSET_NAME(r8),
+#endif
+       REG_OFFSET_NAME(bx),
+       REG_OFFSET_NAME(cx),
+       REG_OFFSET_NAME(dx),
+       REG_OFFSET_NAME(si),
+       REG_OFFSET_NAME(di),
+       REG_OFFSET_NAME(bp),
+       REG_OFFSET_NAME(ax),
+#ifdef CONFIG_X86_32
+       REG_OFFSET_NAME(ds),
+       REG_OFFSET_NAME(es),
+       REG_OFFSET_NAME(fs),
+       REG_OFFSET_NAME(gs),
+#endif
+       REG_OFFSET_NAME(orig_ax),
+       REG_OFFSET_NAME(ip),
+       REG_OFFSET_NAME(cs),
+       REG_OFFSET_NAME(flags),
+       REG_OFFSET_NAME(sp),
+       REG_OFFSET_NAME(ss),
+       REG_OFFSET_END,
+};
+
+/**
+ * regs_query_register_offset() - query register offset from its name
+ * @name:      the name of a register
+ *
+ * regs_query_register_offset() returns the offset of a register in struct
+ * pt_regs from its name. If the name is invalid, this returns -EINVAL;
+ */
+int regs_query_register_offset(const char *name)
+{
+       const struct pt_regs_offset *roff;
+       for (roff = regoffset_table; roff->name != NULL; roff++)
+               if (!strcmp(roff->name, name))
+                       return roff->offset;
+       return -EINVAL;
+}
+
+/**
+ * regs_query_register_name() - query register name from its offset
+ * @offset:    the offset of a register in struct pt_regs.
+ *
+ * regs_query_register_name() returns the name of a register from its
+ * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
+ */
+const char *regs_query_register_name(unsigned int offset)
+{
+       const struct pt_regs_offset *roff;
+       for (roff = regoffset_table; roff->name != NULL; roff++)
+               if (roff->offset == offset)
+                       return roff->name;
+       return NULL;
+}
+
+static const int arg_offs_table[] = {
+#ifdef CONFIG_X86_32
+       [0] = offsetof(struct pt_regs, ax),
+       [1] = offsetof(struct pt_regs, dx),
+       [2] = offsetof(struct pt_regs, cx)
+#else /* CONFIG_X86_64 */
+       [0] = offsetof(struct pt_regs, di),
+       [1] = offsetof(struct pt_regs, si),
+       [2] = offsetof(struct pt_regs, dx),
+       [3] = offsetof(struct pt_regs, cx),
+       [4] = offsetof(struct pt_regs, r8),
+       [5] = offsetof(struct pt_regs, r9)
+#endif
+};
+
 /*
  * does not yet catch signals sent when the child dies.
  * in exit.c or in signal.c.
@@ -133,11 +229,6 @@ static int set_segment_reg(struct task_struct *task,
        return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-       return TASK_SIZE - 3;
-}
-
 #else  /* CONFIG_X86_64 */
 
 #define FLAG_MASK              (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -262,15 +353,6 @@ static int set_segment_reg(struct task_struct *task,
        return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-#ifdef CONFIG_IA32_EMULATION
-       if (test_tsk_thread_flag(task, TIF_IA32))
-               return IA32_PAGE_OFFSET - 3;
-#endif
-       return TASK_SIZE64 - 7;
-}
-
 #endif /* CONFIG_X86_32 */
 
 static unsigned long get_flags(struct task_struct *task)
@@ -321,16 +403,6 @@ static int putreg(struct task_struct *child,
                return set_flags(child, value);
 
 #ifdef CONFIG_X86_64
-       /*
-        * Orig_ax is really just a flag with small positive and
-        * negative values, so make sure to always sign-extend it
-        * from 32 bits so that it works correctly regardless of
-        * whether we come from a 32-bit environment or not.
-        */
-       case offsetof(struct user_regs_struct, orig_ax):
-               value = (long) (s32) value;
-               break;
-
        case offsetof(struct user_regs_struct,fs_base):
                if (value >= TASK_SIZE_OF(child))
                        return -EIO;
@@ -414,14 +486,14 @@ static int genregs_get(struct task_struct *target,
 {
        if (kbuf) {
                unsigned long *k = kbuf;
-               while (count > 0) {
+               while (count >= sizeof(*k)) {
                        *k++ = getreg(target, pos);
                        count -= sizeof(*k);
                        pos += sizeof(*k);
                }
        } else {
                unsigned long __user *u = ubuf;
-               while (count > 0) {
+               while (count >= sizeof(*u)) {
                        if (__put_user(getreg(target, pos), u++))
                                return -EFAULT;
                        count -= sizeof(*u);
@@ -440,14 +512,14 @@ static int genregs_set(struct task_struct *target,
        int ret = 0;
        if (kbuf) {
                const unsigned long *k = kbuf;
-               while (count > 0 && !ret) {
+               while (count >= sizeof(*k) && !ret) {
                        ret = putreg(target, pos, *k++);
                        count -= sizeof(*k);
                        pos += sizeof(*k);
                }
        } else {
                const unsigned long  __user *u = ubuf;
-               while (count > 0 && !ret) {
+               while (count >= sizeof(*u) && !ret) {
                        unsigned long word;
                        ret = __get_user(word, u++);
                        if (ret)
@@ -460,99 +532,240 @@ static int genregs_set(struct task_struct *target,
        return ret;
 }
 
+static void ptrace_triggered(struct perf_event *bp, int nmi,
+                            struct perf_sample_data *data,
+                            struct pt_regs *regs)
+{
+       int i;
+       struct thread_struct *thread = &(current->thread);
+
+       /*
+        * Store in the virtual DR6 register the fact that the breakpoint
+        * was hit so the thread's debugger will see it.
+        */
+       for (i = 0; i < HBP_NUM; i++) {
+               if (thread->ptrace_bps[i] == bp)
+                       break;
+       }
+
+       thread->debugreg6 |= (DR_TRAP0 << i);
+}
+
 /*
- * This function is trivial and will be inlined by the compiler.
- * Having it separates the implementation details of debug
- * registers from the interface details of ptrace.
+ * Walk through every ptrace breakpoints for this thread and
+ * build the dr7 value on top of their attributes.
+ *
  */
-static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
+static unsigned long ptrace_get_dr7(struct perf_event *bp[])
 {
-       switch (n) {
-       case 0:         return child->thread.debugreg0;
-       case 1:         return child->thread.debugreg1;
-       case 2:         return child->thread.debugreg2;
-       case 3:         return child->thread.debugreg3;
-       case 6:         return child->thread.debugreg6;
-       case 7:         return child->thread.debugreg7;
+       int i;
+       int dr7 = 0;
+       struct arch_hw_breakpoint *info;
+
+       for (i = 0; i < HBP_NUM; i++) {
+               if (bp[i] && !bp[i]->attr.disabled) {
+                       info = counter_arch_bp(bp[i]);
+                       dr7 |= encode_dr7(i, info->len, info->type);
+               }
        }
-       return 0;
+
+       return dr7;
 }
 
-static int ptrace_set_debugreg(struct task_struct *child,
-                              int n, unsigned long data)
+static int
+ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+                        struct task_struct *tsk, int disabled)
 {
-       int i;
+       int err;
+       int gen_len, gen_type;
+       struct perf_event_attr attr;
 
-       if (unlikely(n == 4 || n == 5))
-               return -EIO;
+       /*
+        * We should have at least an inactive breakpoint at this
+        * slot. It means the user is writing dr7 without having
+        * written the address register first
+        */
+       if (!bp)
+               return -EINVAL;
 
-       if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
-               return -EIO;
+       err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+       if (err)
+               return err;
 
-       switch (n) {
-       case 0:         child->thread.debugreg0 = data; break;
-       case 1:         child->thread.debugreg1 = data; break;
-       case 2:         child->thread.debugreg2 = data; break;
-       case 3:         child->thread.debugreg3 = data; break;
+       attr = bp->attr;
+       attr.bp_len = gen_len;
+       attr.bp_type = gen_type;
+       attr.disabled = disabled;
 
-       case 6:
-               if ((data & ~0xffffffffUL) != 0)
-                       return -EIO;
-               child->thread.debugreg6 = data;
-               break;
+       return modify_user_hw_breakpoint(bp, &attr);
+}
 
-       case 7:
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
+{
+       struct thread_struct *thread = &(tsk->thread);
+       unsigned long old_dr7;
+       int i, orig_ret = 0, rc = 0;
+       int enabled, second_pass = 0;
+       unsigned len, type;
+       struct perf_event *bp;
+
+       data &= ~DR_CONTROL_RESERVED;
+       old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+restore:
+       /*
+        * Loop through all the hardware breakpoints, making the
+        * appropriate changes to each.
+        */
+       for (i = 0; i < HBP_NUM; i++) {
+               enabled = decode_dr7(data, i, &len, &type);
+               bp = thread->ptrace_bps[i];
+
+               if (!enabled) {
+                       if (bp) {
+                               /*
+                                * Don't unregister the breakpoints right-away,
+                                * unless all register_user_hw_breakpoint()
+                                * requests have succeeded. This prevents
+                                * any window of opportunity for debug
+                                * register grabbing by other users.
+                                */
+                               if (!second_pass)
+                                       continue;
+
+                               rc = ptrace_modify_breakpoint(bp, len, type,
+                                                             tsk, 1);
+                               if (rc)
+                                       break;
+                       }
+                       continue;
+               }
+
+               rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
+               if (rc)
+                       break;
+       }
+       /*
+        * Make a second pass to free the remaining unused breakpoints
+        * or to restore the original breakpoints if an error occurred.
+        */
+       if (!second_pass) {
+               second_pass = 1;
+               if (rc < 0) {
+                       orig_ret = rc;
+                       data = old_dr7;
+               }
+               goto restore;
+       }
+       return ((orig_ret < 0) ? orig_ret : rc);
+}
+
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+{
+       struct thread_struct *thread = &(tsk->thread);
+       unsigned long val = 0;
+
+       if (n < HBP_NUM) {
+               struct perf_event *bp;
+               bp = thread->ptrace_bps[n];
+               if (!bp)
+                       return 0;
+               val = bp->hw.info.address;
+       } else if (n == 6) {
+               val = thread->debugreg6;
+        } else if (n == 7) {
+               val = thread->ptrace_dr7;
+       }
+       return val;
+}
+
+static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
+                                     unsigned long addr)
+{
+       struct perf_event *bp;
+       struct thread_struct *t = &tsk->thread;
+       struct perf_event_attr attr;
+
+       if (!t->ptrace_bps[nr]) {
+               hw_breakpoint_init(&attr);
                /*
-                * Sanity-check data. Take one half-byte at once with
-                * check = (val >> (16 + 4*i)) & 0xf. It contains the
-                * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
-                * 2 and 3 are LENi. Given a list of invalid values,
-                * we do mask |= 1 << invalid_value, so that
-                * (mask >> check) & 1 is a correct test for invalid
-                * values.
-                *
-                * R/Wi contains the type of the breakpoint /
-                * watchpoint, LENi contains the length of the watched
-                * data in the watchpoint case.
-                *
-                * The invalid values are:
-                * - LENi == 0x10 (undefined), so mask |= 0x0f00.       [32-bit]
-                * - R/Wi == 0x10 (break on I/O reads or writes), so
-                *   mask |= 0x4444.
-                * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
-                *   0x1110.
-                *
-                * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
-                *
-                * See the Intel Manual "System Programming Guide",
-                * 15.2.4
-                *
-                * Note that LENi == 0x10 is defined on x86_64 in long
-                * mode (i.e. even for 32-bit userspace software, but
-                * 64-bit kernel), so the x86_64 mask value is 0x5454.
-                * See the AMD manual no. 24593 (AMD64 System Programming)
+                * Put stub len and type to register (reserve) an inactive but
+                * correct bp
                 */
-#ifdef CONFIG_X86_32
-#define        DR7_MASK        0x5f54
-#else
-#define        DR7_MASK        0x5554
-#endif
-               data &= ~DR_CONTROL_RESERVED;
-               for (i = 0; i < 4; i++)
-                       if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-                               return -EIO;
-               child->thread.debugreg7 = data;
-               if (data)
-                       set_tsk_thread_flag(child, TIF_DEBUG);
-               else
-                       clear_tsk_thread_flag(child, TIF_DEBUG);
-               break;
+               attr.bp_addr = addr;
+               attr.bp_len = HW_BREAKPOINT_LEN_1;
+               attr.bp_type = HW_BREAKPOINT_W;
+               attr.disabled = 1;
+
+               bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+
+               /*
+                * CHECKME: the previous code returned -EIO if the addr wasn't
+                * a valid task virtual addr. The new one will return -EINVAL in
+                *  this case.
+                * -EINVAL may be what we want for in-kernel breakpoints users,
+                * but -EIO looks better for ptrace, since we refuse a register
+                * writing for the user. And anyway this is the previous
+                * behaviour.
+                */
+               if (IS_ERR(bp))
+                       return PTR_ERR(bp);
+
+               t->ptrace_bps[nr] = bp;
+       } else {
+               int err;
+
+               bp = t->ptrace_bps[nr];
+
+               attr = bp->attr;
+               attr.bp_addr = addr;
+               err = modify_user_hw_breakpoint(bp, &attr);
+               if (err)
+                       return err;
        }
 
+
        return 0;
 }
 
 /*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+{
+       struct thread_struct *thread = &(tsk->thread);
+       int rc = 0;
+
+       /* There are no DR4 or DR5 registers */
+       if (n == 4 || n == 5)
+               return -EIO;
+
+       if (n == 6) {
+               thread->debugreg6 = val;
+               goto ret_path;
+       }
+       if (n < HBP_NUM) {
+               rc = ptrace_set_breakpoint_addr(tsk, n, val);
+               if (rc)
+                       return rc;
+       }
+       /* All that's left is DR7 */
+       if (n == 7) {
+               rc = ptrace_write_dr7(tsk, val);
+               if (!rc)
+                       thread->ptrace_dr7 = val;
+       }
+
+ret_path:
+       return rc;
+}
+
+/*
  * These access the current or another (stopped) task's io permission
  * bitmap for debugging or core dump.
  */
@@ -576,17 +789,130 @@ static int ioperm_get(struct task_struct *target,
 }
 
 #ifdef CONFIG_X86_PTRACE_BTS
+/*
+ * A branch trace store context.
+ *
+ * Contexts may only be installed by ptrace_bts_config() and only for
+ * ptraced tasks.
+ *
+ * Contexts are destroyed when the tracee is detached from the tracer.
+ * The actual destruction work requires interrupts enabled, so the
+ * work is deferred and will be scheduled during __ptrace_unlink().
+ *
+ * Contexts hold an additional task_struct reference on the traced
+ * task, as well as a reference on the tracer's mm.
+ *
+ * Ptrace already holds a task_struct for the duration of ptrace operations,
+ * but since destruction is deferred, it may be executed after both
+ * tracer and tracee exited.
+ */
+struct bts_context {
+       /* The branch trace handle. */
+       struct bts_tracer       *tracer;
+
+       /* The buffer used to store the branch trace and its size. */
+       void                    *buffer;
+       unsigned int            size;
+
+       /* The mm that paid for the above buffer. */
+       struct mm_struct        *mm;
+
+       /* The task this context belongs to. */
+       struct task_struct      *task;
+
+       /* The signal to send on a bts buffer overflow. */
+       unsigned int            bts_ovfl_signal;
+
+       /* The work struct to destroy a context. */
+       struct work_struct      work;
+};
+
+static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
+{
+       void *buffer = NULL;
+       int err = -ENOMEM;
+
+       err = account_locked_memory(current->mm, current->signal->rlim, size);
+       if (err < 0)
+               return err;
+
+       buffer = kzalloc(size, GFP_KERNEL);
+       if (!buffer)
+               goto out_refund;
+
+       context->buffer = buffer;
+       context->size = size;
+       context->mm = get_task_mm(current);
+
+       return 0;
+
+ out_refund:
+       refund_locked_memory(current->mm, size);
+       return err;
+}
+
+static inline void free_bts_buffer(struct bts_context *context)
+{
+       if (!context->buffer)
+               return;
+
+       kfree(context->buffer);
+       context->buffer = NULL;
+
+       refund_locked_memory(context->mm, context->size);
+       context->size = 0;
+
+       mmput(context->mm);
+       context->mm = NULL;
+}
+
+static void free_bts_context_work(struct work_struct *w)
+{
+       struct bts_context *context;
+
+       context = container_of(w, struct bts_context, work);
+
+       ds_release_bts(context->tracer);
+       put_task_struct(context->task);
+       free_bts_buffer(context);
+       kfree(context);
+}
+
+static inline void free_bts_context(struct bts_context *context)
+{
+       INIT_WORK(&context->work, free_bts_context_work);
+       schedule_work(&context->work);
+}
+
+static inline struct bts_context *alloc_bts_context(struct task_struct *task)
+{
+       struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
+       if (context) {
+               context->task = task;
+               task->bts = context;
+
+               get_task_struct(task);
+       }
+
+       return context;
+}
+
 static int ptrace_bts_read_record(struct task_struct *child, size_t index,
                                  struct bts_struct __user *out)
 {
+       struct bts_context *context;
        const struct bts_trace *trace;
        struct bts_struct bts;
        const unsigned char *at;
        int error;
 
-       trace = ds_read_bts(child->bts);
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
+       trace = ds_read_bts(context->tracer);
        if (!trace)
-               return -EPERM;
+               return -ESRCH;
 
        at = trace->ds.top - ((index + 1) * trace->ds.size);
        if ((void *)at < trace->ds.begin)
@@ -595,7 +921,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
        if (!trace->read)
                return -EOPNOTSUPP;
 
-       error = trace->read(child->bts, at, &bts);
+       error = trace->read(context->tracer, at, &bts);
        if (error < 0)
                return error;
 
@@ -609,13 +935,18 @@ static int ptrace_bts_drain(struct task_struct *child,
                            long size,
                            struct bts_struct __user *out)
 {
+       struct bts_context *context;
        const struct bts_trace *trace;
        const unsigned char *at;
        int error, drained = 0;
 
-       trace = ds_read_bts(child->bts);
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
+       trace = ds_read_bts(context->tracer);
        if (!trace)
-               return -EPERM;
+               return -ESRCH;
 
        if (!trace->read)
                return -EOPNOTSUPP;
@@ -626,9 +957,8 @@ static int ptrace_bts_drain(struct task_struct *child,
        for (at = trace->ds.begin; (void *)at < trace->ds.top;
             out++, drained++, at += trace->ds.size) {
                struct bts_struct bts;
-               int error;
 
-               error = trace->read(child->bts, at, &bts);
+               error = trace->read(context->tracer, at, &bts);
                if (error < 0)
                        return error;
 
@@ -638,35 +968,18 @@ static int ptrace_bts_drain(struct task_struct *child,
 
        memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 
-       error = ds_reset_bts(child->bts);
+       error = ds_reset_bts(context->tracer);
        if (error < 0)
                return error;
 
        return drained;
 }
 
-static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
-{
-       child->bts_buffer = alloc_locked_buffer(size);
-       if (!child->bts_buffer)
-               return -ENOMEM;
-
-       child->bts_size = size;
-
-       return 0;
-}
-
-static void ptrace_bts_free_buffer(struct task_struct *child)
-{
-       free_locked_buffer(child->bts_buffer, child->bts_size);
-       child->bts_buffer = NULL;
-       child->bts_size = 0;
-}
-
 static int ptrace_bts_config(struct task_struct *child,
                             long cfg_size,
                             const struct ptrace_bts_config __user *ucfg)
 {
+       struct bts_context *context;
        struct ptrace_bts_config cfg;
        unsigned int flags = 0;
 
@@ -676,29 +989,33 @@ static int ptrace_bts_config(struct task_struct *child,
        if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
                return -EFAULT;
 
-       if (child->bts) {
-               ds_release_bts(child->bts);
-               child->bts = NULL;
-       }
+       context = child->bts;
+       if (!context)
+               context = alloc_bts_context(child);
+       if (!context)
+               return -ENOMEM;
 
        if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
                if (!cfg.signal)
                        return -EINVAL;
 
                return -EOPNOTSUPP;
-
-               child->thread.bts_ovfl_signal = cfg.signal;
+               context->bts_ovfl_signal = cfg.signal;
        }
 
-       if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
-           (cfg.size != child->bts_size)) {
-               int error;
+       ds_release_bts(context->tracer);
+       context->tracer = NULL;
 
-               ptrace_bts_free_buffer(child);
+       if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
+               int err;
 
-               error = ptrace_bts_allocate_buffer(child, cfg.size);
-               if (error < 0)
-                       return error;
+               free_bts_buffer(context);
+               if (!cfg.size)
+                       return 0;
+
+               err = alloc_bts_buffer(context, cfg.size);
+               if (err < 0)
+                       return err;
        }
 
        if (cfg.flags & PTRACE_BTS_O_TRACE)
@@ -707,15 +1024,14 @@ static int ptrace_bts_config(struct task_struct *child,
        if (cfg.flags & PTRACE_BTS_O_SCHED)
                flags |= BTS_TIMESTAMPS;
 
-       child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
-                                   /* ovfl = */ NULL, /* th = */ (size_t)-1,
-                                   flags);
-       if (IS_ERR(child->bts)) {
-               int error = PTR_ERR(child->bts);
-
-               ptrace_bts_free_buffer(child);
-               child->bts = NULL;
+       context->tracer =
+               ds_request_bts_task(child, context->buffer, context->size,
+                                   NULL, (size_t)-1, flags);
+       if (unlikely(IS_ERR(context->tracer))) {
+               int error = PTR_ERR(context->tracer);
 
+               free_bts_buffer(context);
+               context->tracer = NULL;
                return error;
        }
 
@@ -726,20 +1042,25 @@ static int ptrace_bts_status(struct task_struct *child,
                             long cfg_size,
                             struct ptrace_bts_config __user *ucfg)
 {
+       struct bts_context *context;
        const struct bts_trace *trace;
        struct ptrace_bts_config cfg;
 
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
        if (cfg_size < sizeof(cfg))
                return -EIO;
 
-       trace = ds_read_bts(child->bts);
+       trace = ds_read_bts(context->tracer);
        if (!trace)
-               return -EPERM;
+               return -ESRCH;
 
        memset(&cfg, 0, sizeof(cfg));
-       cfg.size = trace->ds.end - trace->ds.begin;
-       cfg.signal = child->thread.bts_ovfl_signal;
-       cfg.bts_size = sizeof(struct bts_struct);
+       cfg.size        = trace->ds.end - trace->ds.begin;
+       cfg.signal      = context->bts_ovfl_signal;
+       cfg.bts_size    = sizeof(struct bts_struct);
 
        if (cfg.signal)
                cfg.flags |= PTRACE_BTS_O_SIGNAL;
@@ -758,76 +1079,51 @@ static int ptrace_bts_status(struct task_struct *child,
 
 static int ptrace_bts_clear(struct task_struct *child)
 {
+       struct bts_context *context;
        const struct bts_trace *trace;
 
-       trace = ds_read_bts(child->bts);
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
+       trace = ds_read_bts(context->tracer);
        if (!trace)
-               return -EPERM;
+               return -ESRCH;
 
        memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 
-       return ds_reset_bts(child->bts);
+       return ds_reset_bts(context->tracer);
 }
 
 static int ptrace_bts_size(struct task_struct *child)
 {
+       struct bts_context *context;
        const struct bts_trace *trace;
 
-       trace = ds_read_bts(child->bts);
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
+       trace = ds_read_bts(context->tracer);
        if (!trace)
-               return -EPERM;
+               return -ESRCH;
 
        return (trace->ds.top - trace->ds.begin) / trace->ds.size;
 }
 
-static void ptrace_bts_fork(struct task_struct *tsk)
-{
-       tsk->bts = NULL;
-       tsk->bts_buffer = NULL;
-       tsk->bts_size = 0;
-       tsk->thread.bts_ovfl_signal = 0;
-}
-
-static void ptrace_bts_untrace(struct task_struct *child)
-{
-       if (unlikely(child->bts)) {
-               ds_release_bts(child->bts);
-               child->bts = NULL;
-
-               /* We cannot update total_vm and locked_vm since
-                  child's mm is already gone. But we can reclaim the
-                  memory. */
-               kfree(child->bts_buffer);
-               child->bts_buffer = NULL;
-               child->bts_size = 0;
-       }
-}
-
-static void ptrace_bts_detach(struct task_struct *child)
+/*
+ * Called from __ptrace_unlink() after the child has been moved back
+ * to its original parent.
+ */
+void ptrace_bts_untrace(struct task_struct *child)
 {
        if (unlikely(child->bts)) {
-               ds_release_bts(child->bts);
+               free_bts_context(child->bts);
                child->bts = NULL;
-
-               ptrace_bts_free_buffer(child);
        }
 }
-#else
-static inline void ptrace_bts_fork(struct task_struct *tsk) {}
-static inline void ptrace_bts_detach(struct task_struct *child) {}
-static inline void ptrace_bts_untrace(struct task_struct *child) {}
 #endif /* CONFIG_X86_PTRACE_BTS */
 
-void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
-       ptrace_bts_fork(child);
-}
-
-void x86_ptrace_untrace(struct task_struct *child)
-{
-       ptrace_bts_untrace(child);
-}
-
 /*
  * Called by kernel/ptrace.c when detaching..
  *
@@ -839,7 +1135,6 @@ void ptrace_disable(struct task_struct *child)
 #ifdef TIF_SYSCALL_EMU
        clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
-       ptrace_bts_detach(child);
 }
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1040,10 +1335,15 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
 
        case offsetof(struct user32, regs.orig_eax):
                /*
-                * Sign-extend the value so that orig_eax = -1
-                * causes (long)orig_ax < 0 tests to fire correctly.
+                * A 32-bit debugger setting orig_eax means to restore
+                * the state of the task restarting a 32-bit syscall.
+                * Make sure we interpret the -ERESTART* codes correctly
+                * in case the task is not actually still sitting at the
+                * exit from a 32-bit syscall with TS_COMPAT still set.
                 */
-               regs->orig_ax = (long) (s32) value;
+               regs->orig_ax = value;
+               if (syscall_get_nr(child, regs) >= 0)
+                       task_thread_info(child)->status |= TS_COMPAT;
                break;
 
        case offsetof(struct user32, regs.eflags):
@@ -1138,14 +1438,14 @@ static int genregs32_get(struct task_struct *target,
 {
        if (kbuf) {
                compat_ulong_t *k = kbuf;
-               while (count > 0) {
+               while (count >= sizeof(*k)) {
                        getreg32(target, pos, k++);
                        count -= sizeof(*k);
                        pos += sizeof(*k);
                }
        } else {
                compat_ulong_t __user *u = ubuf;
-               while (count > 0) {
+               while (count >= sizeof(*u)) {
                        compat_ulong_t word;
                        getreg32(target, pos, &word);
                        if (__put_user(word, u++))
@@ -1166,14 +1466,14 @@ static int genregs32_set(struct task_struct *target,
        int ret = 0;
        if (kbuf) {
                const compat_ulong_t *k = kbuf;
-               while (count > 0 && !ret) {
+               while (count >= sizeof(*k) && !ret) {
                        ret = putreg32(target, pos, *k++);
                        count -= sizeof(*k);
                        pos += sizeof(*k);
                }
        } else {
                const compat_ulong_t __user *u = ubuf;
-               while (count > 0 && !ret) {
+               while (count >= sizeof(*u) && !ret) {
                        compat_ulong_t word;
                        ret = __get_user(word, u++);
                        if (ret)
@@ -1264,7 +1564,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 
 #ifdef CONFIG_X86_64
 
-static const struct user_regset x86_64_regsets[] = {
+static struct user_regset x86_64_regsets[] __read_mostly = {
        [REGSET_GENERAL] = {
                .core_note_type = NT_PRSTATUS,
                .n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1277,6 +1577,12 @@ static const struct user_regset x86_64_regsets[] = {
                .size = sizeof(long), .align = sizeof(long),
                .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
        },
+       [REGSET_XSTATE] = {
+               .core_note_type = NT_X86_XSTATE,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .active = xstateregs_active, .get = xstateregs_get,
+               .set = xstateregs_set
+       },
        [REGSET_IOPERM64] = {
                .core_note_type = NT_386_IOPERM,
                .n = IO_BITMAP_LONGS,
@@ -1302,7 +1608,7 @@ static const struct user_regset_view user_x86_64_view = {
 #endif /* CONFIG_X86_64 */
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-static const struct user_regset x86_32_regsets[] = {
+static struct user_regset x86_32_regsets[] __read_mostly = {
        [REGSET_GENERAL] = {
                .core_note_type = NT_PRSTATUS,
                .n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1321,6 +1627,12 @@ static const struct user_regset x86_32_regsets[] = {
                .size = sizeof(u32), .align = sizeof(u32),
                .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
        },
+       [REGSET_XSTATE] = {
+               .core_note_type = NT_X86_XSTATE,
+               .size = sizeof(u64), .align = sizeof(u64),
+               .active = xstateregs_active, .get = xstateregs_get,
+               .set = xstateregs_set
+       },
        [REGSET_TLS] = {
                .core_note_type = NT_386_TLS,
                .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
@@ -1343,6 +1655,23 @@ static const struct user_regset_view user_x86_32_view = {
 };
 #endif
 
+/*
+ * This represents bytes 464..511 in the memory layout exported through
+ * the REGSET_XSTATE interface.
+ */
+u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
+
+void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
+{
+#ifdef CONFIG_X86_64
+       x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+#endif
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+       x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+#endif
+       xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
+}
+
 const struct user_regset_view *task_user_regset_view(struct task_struct *task)
 {
 #ifdef CONFIG_IA32_EMULATION
@@ -1356,21 +1685,33 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
 #endif
 }
 
-void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
-                                        int error_code, int si_code)
+static void fill_sigtrap_info(struct task_struct *tsk,
+                               struct pt_regs *regs,
+                               int error_code, int si_code,
+                               struct siginfo *info)
 {
-       struct siginfo info;
-
        tsk->thread.trap_no = 1;
        tsk->thread.error_code = error_code;
 
-       memset(&info, 0, sizeof(info));
-       info.si_signo = SIGTRAP;
-       info.si_code = si_code;
+       memset(info, 0, sizeof(*info));
+       info->si_signo = SIGTRAP;
+       info->si_code = si_code;
+       info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
+}
+
+void user_single_step_siginfo(struct task_struct *tsk,
+                               struct pt_regs *regs,
+                               struct siginfo *info)
+{
+       fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info);
+}
 
-       /* User-mode ip? */
-       info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
+                                        int error_code, int si_code)
+{
+       struct siginfo info;
 
+       fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
        /* Send us the fake SIGTRAP */
        force_sig_info(SIGTRAP, &info, tsk);
 }
@@ -1379,7 +1720,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 #ifdef CONFIG_X86_32
 # define IS_IA32       1
 #elif defined CONFIG_IA32_EMULATION
-# define IS_IA32       test_thread_flag(TIF_IA32)
+# define IS_IA32       is_compat_task()
 #else
 # define IS_IA32       0
 #endif
@@ -1412,6 +1753,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
            tracehook_report_syscall_entry(regs))
                ret = -1L;
 
+       if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+               trace_sys_enter(regs, regs->orig_ax);
+
        if (unlikely(current->audit_context)) {
                if (IS_IA32)
                        audit_syscall_entry(AUDIT_ARCH_I386,
@@ -1432,26 +1776,22 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
 
 asmregparm void syscall_trace_leave(struct pt_regs *regs)
 {
+       bool step;
+
        if (unlikely(current->audit_context))
                audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
 
-       if (test_thread_flag(TIF_SYSCALL_TRACE))
-               tracehook_report_syscall_exit(regs, 0);
+       if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+               trace_sys_exit(regs, regs->ax);
 
        /*
         * If TIF_SYSCALL_EMU is set, we only get here because of
         * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
         * We already reported this syscall instruction in
-        * syscall_trace_enter(), so don't do any more now.
-        */
-       if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
-               return;
-
-       /*
-        * If we are single-stepping, synthesize a trap to follow the
-        * system call instruction.
+        * syscall_trace_enter().
         */
-       if (test_thread_flag(TIF_SINGLESTEP) &&
-           tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
-               send_sigtrap(current, regs, 0, TRAP_BRKPT);
+       step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
+                       !test_thread_flag(TIF_SYSCALL_EMU);
+       if (step || test_thread_flag(TIF_SYSCALL_TRACE))
+               tracehook_report_syscall_exit(regs, step);
 }