static atomic_t nr_mmap_counters __read_mostly;
static atomic_t nr_comm_counters __read_mostly;
-int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
+/*
+ * perf counter paranoia level:
+ * 0 - not paranoid
+ * 1 - disallow cpu counters for unprivileged users
+ * 2 - disallow kernel profiling for unprivileged users
+ */
+int sysctl_perf_counter_paranoid __read_mostly;
+
+static inline bool perf_paranoid_cpu(void)
+{
+ return sysctl_perf_counter_paranoid > 0;
+}
+
+static inline bool perf_paranoid_kernel(void)
+{
+ return sysctl_perf_counter_paranoid > 1;
+}
+
int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
-int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */
+
+/*
+ * max perf counter sample rate
+ */
+int sysctl_perf_counter_sample_rate __read_mostly = 100000;
static atomic64_t perf_counter_id;
static void get_ctx(struct perf_counter_context *ctx)
{
- atomic_inc(&ctx->refcount);
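+	/*
+	 * If the refcount is already zero the context is being freed;
+	 * callers must therefore hold a reference before calling get_ctx().
+	 */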
+ WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}
static void free_ctx(struct rcu_head *head)
spin_unlock_irqrestore(&ctx->lock, *flags);
goto retry;
}
+
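+		/*
+		 * A refcount of zero means the context is already being
+		 * freed behind us; behave as if no context was found.
+		 */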
+ if (!atomic_inc_not_zero(&ctx->refcount)) {
+ spin_unlock_irqrestore(&ctx->lock, *flags);
+ ctx = NULL;
+ }
}
rcu_read_unlock();
return ctx;
ctx = perf_lock_task_context(task, &flags);
if (ctx) {
++ctx->pin_count;
- get_ctx(ctx);
spin_unlock_irqrestore(&ctx->lock, flags);
}
return ctx;
int do_switch = 1;
regs = task_pt_regs(task);
- perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
+ perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
if (likely(!ctx || !cpuctx->task_ctx))
return;
static void perf_log_throttle(struct perf_counter *counter, int enable);
static void perf_log_period(struct perf_counter *counter, u64 period);
-static void perf_adjust_freq(struct perf_counter_context *ctx)
+static void perf_adjust_period(struct perf_counter *counter, u64 events)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ u64 period, sample_period;
+ s64 delta;
+
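+	/*
+	 * 'events' is the observed interrupt rate, per second. Scale it
+	 * by the current period to get the event rate, then divide by
+	 * the target frequency for the period that would have matched it:
+	 *
+	 *   period' = events * period / sample_freq
+	 *
+	 * e.g. 2000 interrupts/sec at period 10000 against a 1000Hz
+	 * target yields period' = 20000; the /8 below low-pass filters
+	 * the step.
+	 */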
+ events *= hwc->sample_period;
+ period = div64_u64(events, counter->attr.sample_freq);
+
+ delta = (s64)(period - hwc->sample_period);
+ delta = (delta + 7) / 8; /* low pass filter */
+
+ sample_period = hwc->sample_period + delta;
+
+ if (!sample_period)
+ sample_period = 1;
+
+ perf_log_period(counter, sample_period);
+
+ hwc->sample_period = sample_period;
+}
+
+static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
{
struct perf_counter *counter;
struct hw_perf_counter *hwc;
- u64 interrupts, sample_period;
- u64 events, period, freq;
- s64 delta;
+ u64 interrupts, freq;
spin_lock(&ctx->lock);
list_for_each_entry(counter, &ctx->counter_list, list_entry) {
interrupts = hwc->interrupts;
hwc->interrupts = 0;
+ /*
+ * unthrottle counters on the tick
+ */
if (interrupts == MAX_INTERRUPTS) {
perf_log_throttle(counter, 1);
counter->pmu->unthrottle(counter);
- interrupts = 2*sysctl_perf_counter_limit/HZ;
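+			/*
+			 * Pretend we saw twice the allowed sample rate so
+			 * that the period adjustment below backs the
+			 * counter off the limit.
+			 */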
+ interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
}
if (!counter->attr.freq || !counter->attr.sample_freq)
continue;
+ /*
+ * if the specified freq < HZ then we need to skip ticks
+ */
if (counter->attr.sample_freq < HZ) {
freq = counter->attr.sample_freq;
} else
freq = HZ;
- events = freq * interrupts * hwc->sample_period;
- period = div64_u64(events, counter->attr.sample_freq);
-
- delta = (s64)(1 + period - hwc->sample_period);
- delta >>= 1;
-
- sample_period = hwc->sample_period + delta;
-
- if (!sample_period)
- sample_period = 1;
+ perf_adjust_period(counter, freq * interrupts);
- perf_log_period(counter, sample_period);
-
- hwc->sample_period = sample_period;
+ /*
+ * In order to avoid being stalled by an (accidental) huge
+ * sample period, force reset the sample period if we didn't
+ * get any events in this freq period.
+ */
+ if (!interrupts) {
+ perf_disable();
+ counter->pmu->disable(counter);
+ atomic64_set(&hwc->period_left, 0);
+ counter->pmu->enable(counter);
+ perf_enable();
+ }
}
spin_unlock(&ctx->lock);
}
cpuctx = &per_cpu(perf_cpu_context, cpu);
ctx = curr->perf_counter_ctxp;
- perf_adjust_freq(&cpuctx->ctx);
+ perf_ctx_adjust_freq(&cpuctx->ctx);
if (ctx)
- perf_adjust_freq(ctx);
+ perf_ctx_adjust_freq(ctx);
perf_counter_cpu_sched_out(cpuctx);
if (ctx)
*/
if (cpu != -1) {
/* Must be root to operate on a CPU counter: */
- if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
+ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
if (cpu < 0 || cpu > num_possible_cpus())
put_ctx(parent_ctx);
ctx->parent_ctx = NULL; /* no longer a clone */
}
- /*
- * Get an extra reference before dropping the lock so that
- * this context won't get freed if the task exits.
- */
- get_ctx(ctx);
spin_unlock_irqrestore(&ctx->lock, flags);
}
static ssize_t
perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
{
- u64 values[3];
+ u64 values[4];
int n;
/*
perf_counter_update_userpage(counter);
}
-static void perf_counter_for_each_sibling(struct perf_counter *counter,
- void (*func)(struct perf_counter *))
-{
- struct perf_counter_context *ctx = counter->ctx;
- struct perf_counter *sibling;
-
- WARN_ON_ONCE(ctx->parent_ctx);
- mutex_lock(&ctx->mutex);
- counter = counter->group_leader;
-
- func(counter);
- list_for_each_entry(sibling, &counter->sibling_list, list_entry)
- func(sibling);
- mutex_unlock(&ctx->mutex);
-}
-
/*
* Holding the top-level counter's child_mutex means that any
* descendant process that has inherited this counter will block
static void perf_counter_for_each(struct perf_counter *counter,
void (*func)(struct perf_counter *))
{
- struct perf_counter *child;
+ struct perf_counter_context *ctx = counter->ctx;
+ struct perf_counter *sibling;
- WARN_ON_ONCE(counter->ctx->parent_ctx);
- mutex_lock(&counter->child_mutex);
- perf_counter_for_each_sibling(counter, func);
- list_for_each_entry(child, &counter->child_list, child_list)
- perf_counter_for_each_sibling(child, func);
- mutex_unlock(&counter->child_mutex);
+ WARN_ON_ONCE(ctx->parent_ctx);
+ mutex_lock(&ctx->mutex);
+ counter = counter->group_leader;
+
+	perf_counter_for_each_child(counter, func);
+	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
+		perf_counter_for_each_child(sibling, func);
+ mutex_unlock(&ctx->mutex);
}
static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
spin_lock_irq(&ctx->lock);
if (counter->attr.freq) {
- if (value > sysctl_perf_counter_limit) {
+ if (value > sysctl_perf_counter_sample_rate) {
ret = -EINVAL;
goto unlock;
}
counter->attr.sample_freq = value;
} else {
+ perf_log_period(counter, value);
+
counter->attr.sample_period = value;
counter->hw.sample_period = value;
-
- perf_log_period(counter, value);
}
unlock:
spin_unlock_irq(&ctx->lock);
struct perf_mmap_data *data;
int ret = VM_FAULT_SIGBUS;
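+
+	/*
+	 * ->page_mkwrite(): only the user control page (pgoff 0), which
+	 * carries data_tail, may be made writable.
+	 */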
+ if (vmf->flags & FAULT_FLAG_MKWRITE) {
+ if (vmf->pgoff == 0)
+ ret = 0;
+ return ret;
+ }
+
rcu_read_lock();
data = rcu_dereference(counter->data);
if (!data)
if ((unsigned)nr > data->nr_pages)
goto unlock;
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ goto unlock;
+
vmf->page = virt_to_page(data->data_pages[nr]);
}
+
get_page(vmf->page);
+ vmf->page->mapping = vma->vm_file->f_mapping;
+ vmf->page->index = vmf->pgoff;
+
ret = 0;
unlock:
rcu_read_unlock();
return -ENOMEM;
}
+static void perf_mmap_free_page(unsigned long addr)
+{
+ struct page *page = virt_to_page(addr);
+
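+	/*
+	 * Undo the mapping installed by perf_mmap_fault() before the
+	 * page is freed.
+	 */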
+ page->mapping = NULL;
+ __free_page(page);
+}
+
static void __perf_mmap_data_free(struct rcu_head *rcu_head)
{
struct perf_mmap_data *data;
data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
- free_page((unsigned long)data->user_page);
+ perf_mmap_free_page((unsigned long)data->user_page);
for (i = 0; i < data->nr_pages; i++)
- free_page((unsigned long)data->data_pages[i]);
+ perf_mmap_free_page((unsigned long)data->data_pages[i]);
+
kfree(data);
}
}
static struct vm_operations_struct perf_mmap_vmops = {
- .open = perf_mmap_open,
- .close = perf_mmap_close,
- .fault = perf_mmap_fault,
+ .open = perf_mmap_open,
+ .close = perf_mmap_close,
+ .fault = perf_mmap_fault,
+ .page_mkwrite = perf_mmap_fault,
};
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
long user_extra, extra;
int ret = 0;
- if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
+ if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
vma_size = vma->vm_end - vma->vm_start;
atomic_long_add(user_extra, &user->locked_vm);
vma->vm_mm->locked_vm += extra;
counter->data->nr_locked = extra;
+ if (vma->vm_flags & VM_WRITE)
+ counter->data->writable = 1;
+
unlock:
mutex_unlock(&counter->mmap_mutex);
- vma->vm_flags &= ~VM_MAYWRITE;
vma->vm_flags |= VM_RESERVED;
vma->vm_ops = &perf_mmap_vmops;
unsigned long head;
unsigned long offset;
int nmi;
- int overflow;
+ int sample;
int locked;
unsigned long flags;
};
+static bool perf_output_space(struct perf_mmap_data *data,
+ unsigned int offset, unsigned int head)
+{
+ unsigned long tail;
+ unsigned long mask;
+
+ if (!data->writable)
+ return true;
+
+ mask = (data->nr_pages << PAGE_SHIFT) - 1;
+ /*
+	 * Userspace could choose to issue an mb() before updating the tail
+	 * pointer, so that all reads are completed before the write is
+	 * issued.
+ */
+ tail = ACCESS_ONCE(data->user_page->data_tail);
+ smp_rmb();
+
+ offset = (offset - tail) & mask;
+ head = (head - tail) & mask;
+
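+	/*
+	 * If the reservation would advance past the reader's tail, the
+	 * normalized head wraps below offset and the signed difference
+	 * goes negative: refuse rather than overwrite unread data.
+	 */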
+ if ((int)(head - offset) < 0)
+ return false;
+
+ return true;
+}
+
static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->data->poll, POLL_IN);
local_irq_restore(handle->flags);
}
+static void perf_output_copy(struct perf_output_handle *handle,
+ const void *buf, unsigned int len)
+{
+ unsigned int pages_mask;
+ unsigned int offset;
+ unsigned int size;
+ void **pages;
+
+ offset = handle->offset;
+ pages_mask = handle->data->nr_pages - 1;
+ pages = handle->data->data_pages;
+
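+	/*
+	 * Copy chunk-wise across page boundaries: 'nr' selects the
+	 * backing page (nr_pages is a power of two, so the mask wraps
+	 * the ring) and page_offset the offset within that page.
+	 */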
+ do {
+ unsigned int page_offset;
+ int nr;
+
+ nr = (offset >> PAGE_SHIFT) & pages_mask;
+ page_offset = offset & (PAGE_SIZE - 1);
+ size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
+
+ memcpy(pages[nr] + page_offset, buf, size);
+
+ len -= size;
+ buf += size;
+ offset += size;
+ } while (len);
+
+ handle->offset = offset;
+
+ /*
+ * Check we didn't copy past our reservation window, taking the
+ * possible unsigned int wrap into account.
+ */
+ WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
+}
+
+#define perf_output_put(handle, x) \
+ perf_output_copy((handle), &(x), sizeof(x))
+
static int perf_output_begin(struct perf_output_handle *handle,
struct perf_counter *counter, unsigned int size,
- int nmi, int overflow)
+ int nmi, int sample)
{
struct perf_mmap_data *data;
unsigned int offset, head;
+ int have_lost;
+ struct {
+ struct perf_event_header header;
+ u64 id;
+ u64 lost;
+ } lost_event;
/*
* For inherited counters we send all the output towards the parent.
if (!data)
goto out;
- handle->data = data;
- handle->counter = counter;
- handle->nmi = nmi;
- handle->overflow = overflow;
+ handle->data = data;
+ handle->counter = counter;
+ handle->nmi = nmi;
+ handle->sample = sample;
if (!data->nr_pages)
goto fail;
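+
+	/*
+	 * If records were lost earlier, reserve room up front so a
+	 * PERF_EVENT_LOST record can be emitted before the new data.
+	 */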
+ have_lost = atomic_read(&data->lost);
+ if (have_lost)
+ size += sizeof(lost_event);
+
perf_output_lock(handle);
do {
offset = head = atomic_long_read(&data->head);
head += size;
+ if (unlikely(!perf_output_space(data, offset, head)))
+ goto fail;
} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
handle->offset = offset;
if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
atomic_set(&data->wakeup, 1);
+ if (have_lost) {
+ lost_event.header.type = PERF_EVENT_LOST;
+ lost_event.header.misc = 0;
+ lost_event.header.size = sizeof(lost_event);
+ lost_event.id = counter->id;
+ lost_event.lost = atomic_xchg(&data->lost, 0);
+
+ perf_output_put(handle, lost_event);
+ }
+
return 0;
fail:
- perf_output_wakeup(handle);
+ atomic_inc(&data->lost);
+ perf_output_unlock(handle);
out:
rcu_read_unlock();
return -ENOSPC;
}
-static void perf_output_copy(struct perf_output_handle *handle,
- const void *buf, unsigned int len)
-{
- unsigned int pages_mask;
- unsigned int offset;
- unsigned int size;
- void **pages;
-
- offset = handle->offset;
- pages_mask = handle->data->nr_pages - 1;
- pages = handle->data->data_pages;
-
- do {
- unsigned int page_offset;
- int nr;
-
- nr = (offset >> PAGE_SHIFT) & pages_mask;
- page_offset = offset & (PAGE_SIZE - 1);
- size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
-
- memcpy(pages[nr] + page_offset, buf, size);
-
- len -= size;
- buf += size;
- offset += size;
- } while (len);
-
- handle->offset = offset;
-
- /*
- * Check we didn't copy past our reservation window, taking the
- * possible unsigned int wrap into account.
- */
- WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
-}
-
-#define perf_output_put(handle, x) \
- perf_output_copy((handle), &(x), sizeof(x))
-
static void perf_output_end(struct perf_output_handle *handle)
{
struct perf_counter *counter = handle->counter;
int wakeup_events = counter->attr.wakeup_events;
- if (handle->overflow && wakeup_events) {
+ if (handle->sample && wakeup_events) {
int events = atomic_inc_return(&data->events);
if (events >= wakeup_events) {
atomic_sub(wakeup_events, &data->events);
return task_pid_nr_ns(p, counter->ns);
}
-static void perf_counter_output(struct perf_counter *counter,
- int nmi, struct pt_regs *regs, u64 addr)
+static void perf_counter_output(struct perf_counter *counter, int nmi,
+ struct perf_sample_data *data)
{
int ret;
u64 sample_type = counter->attr.sample_type;
header.size = sizeof(header);
header.misc = PERF_EVENT_MISC_OVERFLOW;
- header.misc |= perf_misc_flags(regs);
+ header.misc |= perf_misc_flags(data->regs);
if (sample_type & PERF_SAMPLE_IP) {
- ip = perf_instruction_pointer(regs);
+ ip = perf_instruction_pointer(data->regs);
header.type |= PERF_SAMPLE_IP;
header.size += sizeof(ip);
}
}
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- callchain = perf_callchain(regs);
+ callchain = perf_callchain(data->regs);
if (callchain) {
callchain_size = (1 + callchain->nr) * sizeof(u64);
perf_output_put(&handle, time);
if (sample_type & PERF_SAMPLE_ADDR)
- perf_output_put(&handle, addr);
+ perf_output_put(&handle, data->addr);
if (sample_type & PERF_SAMPLE_ID)
perf_output_put(&handle, counter->id);
perf_output_put(&handle, cpu_entry);
if (sample_type & PERF_SAMPLE_PERIOD)
- perf_output_put(&handle, counter->hw.sample_period);
+ perf_output_put(&handle, data->period);
/*
* XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
* event flow.
*/
+struct freq_event {
+ struct perf_event_header header;
+ u64 time;
+ u64 id;
+ u64 period;
+};
+
static void perf_log_period(struct perf_counter *counter, u64 period)
{
struct perf_output_handle handle;
+ struct freq_event event;
int ret;
- struct {
- struct perf_event_header header;
- u64 time;
- u64 id;
- u64 period;
- } freq_event = {
+ if (counter->hw.sample_period == period)
+ return;
+
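+	/*
+	 * When PERF_SAMPLE_PERIOD is set each sample already carries
+	 * the period, so a separate PERF_EVENT_PERIOD event is redundant.
+	 */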
+ if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
+ return;
+
+ event = (struct freq_event) {
.header = {
.type = PERF_EVENT_PERIOD,
.misc = 0,
- .size = sizeof(freq_event),
+ .size = sizeof(event),
},
.time = sched_clock(),
.id = counter->id,
.period = period,
};
- if (counter->hw.sample_period == period)
- return;
-
- ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0);
+ ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
if (ret)
return;
- perf_output_put(&handle, freq_event);
+ perf_output_put(&handle, event);
perf_output_end(&handle);
}
struct {
struct perf_event_header header;
u64 time;
+ u64 id;
} throttle_event = {
.header = {
.type = PERF_EVENT_THROTTLE + 1,
.misc = 0,
.size = sizeof(throttle_event),
},
- .time = sched_clock(),
+ .time = sched_clock(),
+ .id = counter->id,
};
ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
}
/*
- * Generic counter overflow handling.
+ * Generic counter overflow handling, sampling.
*/
-int perf_counter_overflow(struct perf_counter *counter,
- int nmi, struct pt_regs *regs, u64 addr)
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+ struct perf_sample_data *data)
{
int events = atomic_read(&counter->event_limit);
int throttle = counter->pmu->unthrottle != NULL;
+ struct hw_perf_counter *hwc = &counter->hw;
int ret = 0;
if (!throttle) {
- counter->hw.interrupts++;
+ hwc->interrupts++;
} else {
- if (counter->hw.interrupts != MAX_INTERRUPTS) {
- counter->hw.interrupts++;
- if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
- counter->hw.interrupts = MAX_INTERRUPTS;
+ if (hwc->interrupts != MAX_INTERRUPTS) {
+ hwc->interrupts++;
+ if (HZ * hwc->interrupts >
+ (u64)sysctl_perf_counter_sample_rate) {
+ hwc->interrupts = MAX_INTERRUPTS;
perf_log_throttle(counter, 0);
ret = 1;
}
}
}
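+	/*
+	 * In freq mode, re-estimate the event rate from the time between
+	 * overflows; deltas of a full tick or more are left to the
+	 * per-tick perf_ctx_adjust_freq() path.
+	 */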
+ if (counter->attr.freq) {
+ u64 now = sched_clock();
+ s64 delta = now - hwc->freq_stamp;
+
+ hwc->freq_stamp = now;
+
+ if (delta > 0 && delta < TICK_NSEC)
+ perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
+ }
+
/*
* XXX event_limit might not quite work as expected on inherited
* counters
perf_counter_disable(counter);
}
- perf_counter_output(counter, nmi, regs, addr);
+ perf_counter_output(counter, nmi, data);
return ret;
}
if (unlikely(left <= -period)) {
left = period;
atomic64_set(&hwc->period_left, left);
+ hwc->last_period = period;
}
if (unlikely(left <= 0)) {
left += period;
atomic64_add(period, &hwc->period_left);
+ hwc->last_period = period;
}
atomic64_set(&hwc->prev_count, -left);
static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
{
enum hrtimer_restart ret = HRTIMER_RESTART;
+ struct perf_sample_data data;
struct perf_counter *counter;
- struct pt_regs *regs;
u64 period;
counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
counter->pmu->read(counter);
- regs = get_irq_regs();
+ data.addr = 0;
+ data.regs = get_irq_regs();
/*
* In case we exclude kernel IPs or are somehow not in interrupt
* context, provide the next best thing, the user IP.
*/
- if ((counter->attr.exclude_kernel || !regs) &&
+ if ((counter->attr.exclude_kernel || !data.regs) &&
!counter->attr.exclude_user)
- regs = task_pt_regs(current);
+ data.regs = task_pt_regs(current);
- if (regs) {
- if (perf_counter_overflow(counter, 0, regs, 0))
+ if (data.regs) {
+ if (perf_counter_overflow(counter, 0, &data))
ret = HRTIMER_NORESTART;
}
}
static void perf_swcounter_overflow(struct perf_counter *counter,
- int nmi, struct pt_regs *regs, u64 addr)
+ int nmi, struct perf_sample_data *data)
{
+ data->period = counter->hw.last_period;
+
perf_swcounter_update(counter);
perf_swcounter_set_period(counter);
- if (perf_counter_overflow(counter, nmi, regs, addr))
+ if (perf_counter_overflow(counter, nmi, data))
/* soft-disable the counter */
;
-
}
static int perf_swcounter_is_counting(struct perf_counter *counter)
}
static int perf_swcounter_match(struct perf_counter *counter,
- enum perf_event_types type,
+ enum perf_type_id type,
u32 event, struct pt_regs *regs)
{
if (!perf_swcounter_is_counting(counter))
}
static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
- int nmi, struct pt_regs *regs, u64 addr)
+ int nmi, struct perf_sample_data *data)
{
int neg = atomic64_add_negative(nr, &counter->hw.count);
- if (counter->hw.sample_period && !neg && regs)
- perf_swcounter_overflow(counter, nmi, regs, addr);
+ if (counter->hw.sample_period && !neg && data->regs)
+ perf_swcounter_overflow(counter, nmi, data);
}
static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
- enum perf_event_types type, u32 event,
- u64 nr, int nmi, struct pt_regs *regs,
- u64 addr)
+ enum perf_type_id type,
+ u32 event, u64 nr, int nmi,
+ struct perf_sample_data *data)
{
struct perf_counter *counter;
rcu_read_lock();
list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
- if (perf_swcounter_match(counter, type, event, regs))
- perf_swcounter_add(counter, nr, nmi, regs, addr);
+ if (perf_swcounter_match(counter, type, event, data->regs))
+ perf_swcounter_add(counter, nr, nmi, data);
}
rcu_read_unlock();
}
return &cpuctx->recursion[0];
}
-static void __perf_swcounter_event(enum perf_event_types type, u32 event,
- u64 nr, int nmi, struct pt_regs *regs,
- u64 addr)
+static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
+ u64 nr, int nmi,
+ struct perf_sample_data *data)
{
struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
int *recursion = perf_swcounter_recursion_context(cpuctx);
barrier();
perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
- nr, nmi, regs, addr);
+ nr, nmi, data);
rcu_read_lock();
/*
* doesn't really matter which of the child contexts the
*/
ctx = rcu_dereference(current->perf_counter_ctxp);
if (ctx)
- perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr);
+ perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
rcu_read_unlock();
barrier();
void
perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
{
- __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
+ struct perf_sample_data data = {
+ .regs = regs,
+ .addr = addr,
+ };
+
+ do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
}
static void perf_swcounter_read(struct perf_counter *counter)
.read = task_clock_perf_counter_read,
};
-/*
- * Software counter: cpu migrations
- */
-void perf_counter_task_migration(struct task_struct *task, int cpu)
-{
- struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
- struct perf_counter_context *ctx;
-
- perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
- PERF_COUNT_CPU_MIGRATIONS,
- 1, 1, NULL, 0);
-
- ctx = perf_pin_task_context(task);
- if (ctx) {
- perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
- PERF_COUNT_CPU_MIGRATIONS,
- 1, 1, NULL, 0);
- perf_unpin_context(ctx);
- }
-}
-
#ifdef CONFIG_EVENT_PROFILE
void perf_tpcounter_event(int event_id)
{
- struct pt_regs *regs = get_irq_regs();
+ struct perf_sample_data data = {
+		.regs = get_irq_regs(),
+ .addr = 0,
+ };
- if (!regs)
- regs = task_pt_regs(current);
+ if (!data.regs)
+ data.regs = task_pt_regs(current);
- __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
+ do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
}
EXPORT_SYMBOL_GPL(perf_tpcounter_event);
return NULL;
counter->destroy = tp_perf_counter_destroy;
- counter->hw.sample_period = counter->attr.sample_period;
return &perf_ops_generic;
}
* events.
*/
switch (counter->attr.config) {
- case PERF_COUNT_CPU_CLOCK:
+ case PERF_COUNT_SW_CPU_CLOCK:
pmu = &perf_ops_cpu_clock;
break;
- case PERF_COUNT_TASK_CLOCK:
+ case PERF_COUNT_SW_TASK_CLOCK:
/*
* If the user instantiates this as a per-cpu counter,
* use the cpu_clock counter instead.
pmu = &perf_ops_cpu_clock;
break;
- case PERF_COUNT_PAGE_FAULTS:
- case PERF_COUNT_PAGE_FAULTS_MIN:
- case PERF_COUNT_PAGE_FAULTS_MAJ:
- case PERF_COUNT_CONTEXT_SWITCHES:
- case PERF_COUNT_CPU_MIGRATIONS:
+ case PERF_COUNT_SW_PAGE_FAULTS:
+ case PERF_COUNT_SW_PAGE_FAULTS_MIN:
+ case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
+ case PERF_COUNT_SW_CONTEXT_SWITCHES:
+ case PERF_COUNT_SW_CPU_MIGRATIONS:
pmu = &perf_ops_generic;
break;
}
pmu = NULL;
hwc = &counter->hw;
+ hwc->sample_period = attr->sample_period;
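+	/*
+	 * Freq-mode counters start at a period of 1; perf_adjust_period()
+	 * then converges on the period that matches sample_freq.
+	 */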
if (attr->freq && attr->sample_freq)
- hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq);
- else
- hwc->sample_period = attr->sample_period;
+ hwc->sample_period = 1;
+
+ atomic64_set(&hwc->period_left, hwc->sample_period);
/*
* we currently do not support PERF_SAMPLE_GROUP on inherited counters
if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
goto done;
- if (attr->type == PERF_TYPE_RAW) {
- pmu = hw_perf_counter_init(counter);
- goto done;
- }
-
switch (attr->type) {
+ case PERF_TYPE_RAW:
case PERF_TYPE_HARDWARE:
case PERF_TYPE_HW_CACHE:
pmu = hw_perf_counter_init(counter);
case PERF_TYPE_TRACEPOINT:
pmu = tp_perf_counter_init(counter);
break;
+
+ default:
+ break;
}
done:
err = 0;
return counter;
}
+static int perf_copy_attr(struct perf_counter_attr __user *uattr,
+ struct perf_counter_attr *attr)
+{
+ int ret;
+ u32 size;
+
+ if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /*
+	 * zero the full structure, so that a short copy leaves the rest zeroed.
+ */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE) /* silly large */
+ goto err_size;
+
+ if (!size) /* abi compat */
+ size = PERF_ATTR_SIZE_VER0;
+
+ if (size < PERF_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned long val;
+ unsigned long __user *addr;
+ unsigned long __user *end;
+
+ addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
+ sizeof(unsigned long));
+ end = PTR_ALIGN((void __user *)uattr + size,
+ sizeof(unsigned long));
+
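+		/*
+		 * addr is an unsigned long pointer, so addr++ advances one
+		 * word at a time over the tail of the user's struct.
+		 */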
+		for (; addr < end; addr++) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+ /*
+	 * If the type is a known one, the type-specific init path will
+	 * verify attr->config.
+ */
+ if (attr->type >= PERF_TYPE_MAX)
+ return -EINVAL;
+
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
+ return -EINVAL;
+
+ if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
+ return -EINVAL;
+
+ if (attr->read_format & ~(PERF_FORMAT_MAX-1))
+ return -EINVAL;
+
+out:
+ return ret;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ ret = -E2BIG;
+ goto out;
+}
+
/**
* sys_perf_counter_open - open a performance counter, associate it to a task/cpu
*
* @group_fd: group leader counter fd
*/
SYSCALL_DEFINE5(perf_counter_open,
- const struct perf_counter_attr __user *, attr_uptr,
+ struct perf_counter_attr __user *, attr_uptr,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
struct perf_counter *counter, *group_leader;
if (flags)
return -EINVAL;
- if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0)
- return -EFAULT;
+ ret = perf_copy_attr(attr_uptr, &attr);
+ if (ret)
+ return ret;
+
+ if (!attr.exclude_kernel) {
+ if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
+ if (attr.freq) {
+ if (attr.sample_freq > sysctl_perf_counter_sample_rate)
+ return -EINVAL;
+ }
/*
* Get the target context (task or percpu):
else
child_counter->state = PERF_COUNTER_STATE_OFF;
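+	/*
+	 * In freq mode, start the child at the parent's already-adjusted
+	 * period rather than re-converging from a period of 1.
+	 */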
+ if (parent_counter->attr.freq)
+ child_counter->hw.sample_period = parent_counter->hw.sample_period;
+
/*
* Link it up in the child's context:
*/
spin_unlock(&child_ctx->lock);
local_irq_restore(flags);
- mutex_lock(&child_ctx->mutex);
+ /*
+ * We can recurse on the same lock type through:
+ *
+ * __perf_counter_exit_task()
+ * sync_child_counter()
+ * fput(parent_counter->filp)
+ * perf_release()
+ * mutex_lock(&ctx->mutex)
+ *
+	 * But since it's the parent context it won't be the same instance.
+ */
+ mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
again:
list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,