X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=tools%2Fperf%2Fbuiltin-sched.c;h=2d542368de3c187e5b0bf29fa9ac17135f918302;hb=c8a37751043427c6e4397a2cbfd617cb5f215c72;hp=61a80e8c9d0dfc52f4b902eae7de6a3f668fe512;hpb=c6ced61112f1e6139914149fab65695801a74f0f;p=safe%2Fjmp%2Flinux-2.6

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 61a80e8..2d54236 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1,4 +1,5 @@
 #include "builtin.h"
+#include "perf.h"
 
 #include "util/util.h"
 #include "util/cache.h"
@@ -7,15 +8,16 @@
 #include "util/header.h"
 #include "util/parse-options.h"
+#include "util/trace-event.h"
 
-#include "perf.h"
 #include "util/debug.h"
 
-#include "util/trace-event.h"
 
 #include <sys/types.h>
+#include <sys/prctl.h>
 
-
-#define MAX_CPUS 4096
+#include <semaphore.h>
+#include <pthread.h>
+#include <math.h>
 
 static char const *input_name = "perf.data";
 static int input;
@@ -30,47 +32,137 @@ static struct thread *last_match;
 static struct perf_header *header;
 static u64 sample_type;
 
-static int replay_mode;
-static int lat_mode;
+static char default_sort_order[] = "avg, max, switch, runtime";
+static char *sort_order = default_sort_order;
 
+#define PR_SET_NAME 15 /* Set process name */
+#define MAX_CPUS 4096
 
-/*
- * Scheduler benchmarks
- */
-#include 
-#include 
-#include 
-#include 
-#include 
+#define BUG_ON(x) assert(!(x))
 
-#include 
+static u64 run_measurement_overhead;
+static u64 sleep_measurement_overhead;
 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
+#define COMM_LEN 20
+#define SYM_LEN 129
+
+#define MAX_PID 65536
+
+static unsigned long nr_tasks;
+
+struct sched_atom;
+
+struct task_desc {
+	unsigned long		nr;
+	unsigned long		pid;
+	char			comm[COMM_LEN];
+
+	unsigned long		nr_events;
+	unsigned long		curr_event;
+	struct sched_atom	**atoms;
+
+	pthread_t		thread;
+	sem_t			sleep_sem;
+
+	sem_t			ready_for_work;
+	sem_t			work_done_sem;
+
+	u64			cpu_usage;
+};
+
+enum sched_event_type {
+	SCHED_EVENT_RUN,
+	SCHED_EVENT_SLEEP,
+	SCHED_EVENT_WAKEUP,
+};
+
+struct sched_atom {
+	enum sched_event_type	type;
+	u64			timestamp;
+	u64			duration;
+	unsigned long		nr;
+	int			specific_wait;
+	sem_t			*wait_sem;
+	struct task_desc	*wakee;
+};
+
+static struct task_desc *pid_to_task[MAX_PID];
+
+static struct task_desc **tasks;
+
+static pthread_mutex_t start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
+static u64 start_time;
+
+static pthread_mutex_t work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-#include 
+static unsigned long nr_run_events;
+static unsigned long nr_sleep_events;
+static unsigned long nr_wakeup_events;
 
-#define PR_SET_NAME 15 /* Set process name */
+static unsigned long nr_sleep_corrections;
+static unsigned long nr_run_events_optimized;
 
-#define BUG_ON(x) assert(!(x))
+static unsigned long targetless_wakeups;
+static unsigned long multitarget_wakeups;
 
-#define DEBUG 0
+static u64 cpu_usage;
+static u64 runavg_cpu_usage;
+static u64 parent_cpu_usage;
+static u64 runavg_parent_cpu_usage;
+
+static unsigned long nr_runs;
+static u64 sum_runtime;
+static u64 sum_fluct;
+static u64 run_avg;
+
+static unsigned long replay_repeat = 10;
+static unsigned long nr_timestamps;
+static unsigned long nr_unordered_timestamps;
+static unsigned long nr_state_machine_bugs;
+static unsigned long nr_context_switch_bugs;
+static unsigned long nr_events;
+static unsigned long nr_lost_chunks;
+static unsigned long nr_lost_events;
 
-typedef unsigned long long nsec_t;
+#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
 
-static nsec_t run_measurement_overhead;
-static nsec_t 
sleep_measurement_overhead; +enum thread_state { + THREAD_SLEEPING = 0, + THREAD_WAIT_CPU, + THREAD_SCHED_IN, + THREAD_IGNORE +}; -static nsec_t get_nsecs(void) +struct work_atom { + struct list_head list; + enum thread_state state; + u64 sched_out_time; + u64 wake_up_time; + u64 sched_in_time; + u64 runtime; +}; + +struct work_atoms { + struct list_head work_list; + struct thread *thread; + struct rb_node node; + u64 max_lat; + u64 total_lat; + u64 nb_atoms; + u64 total_runtime; +}; + +typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *); + +static struct rb_root atom_root, sorted_atom_root; + +static u64 all_runtime; +static u64 all_count; + +static int read_events(void); + + +static u64 get_nsecs(void) { struct timespec ts; @@ -79,16 +171,16 @@ static nsec_t get_nsecs(void) return ts.tv_sec * 1000000000ULL + ts.tv_nsec; } -static void burn_nsecs(nsec_t nsecs) +static void burn_nsecs(u64 nsecs) { - nsec_t T0 = get_nsecs(), T1; + u64 T0 = get_nsecs(), T1; do { T1 = get_nsecs(); } while (T1 + run_measurement_overhead < T0 + nsecs); } -static void sleep_nsecs(nsec_t nsecs) +static void sleep_nsecs(u64 nsecs) { struct timespec ts; @@ -100,7 +192,7 @@ static void sleep_nsecs(nsec_t nsecs) static void calibrate_run_measurement_overhead(void) { - nsec_t T0, T1, delta, min_delta = 1000000000ULL; + u64 T0, T1, delta, min_delta = 1000000000ULL; int i; for (i = 0; i < 10; i++) { @@ -117,7 +209,7 @@ static void calibrate_run_measurement_overhead(void) static void calibrate_sleep_measurement_overhead(void) { - nsec_t T0, T1, delta, min_delta = 1000000000ULL; + u64 T0, T1, delta, min_delta = 1000000000ULL; int i; for (i = 0; i < 10; i++) { @@ -133,69 +225,10 @@ static void calibrate_sleep_measurement_overhead(void) printf("sleep measurement overhead: %Ld nsecs\n", min_delta); } -#define COMM_LEN 20 -#define SYM_LEN 129 - -#define MAX_PID 65536 - -static unsigned long nr_tasks; - -struct sched_event; - -struct task_desc { - unsigned long nr; - unsigned long pid; - char comm[COMM_LEN]; - - unsigned long nr_events; - unsigned long curr_event; - struct sched_event **events; - - pthread_t thread; - sem_t sleep_sem; - - sem_t ready_for_work; - sem_t work_done_sem; - - nsec_t cpu_usage; -}; - -enum sched_event_type { - SCHED_EVENT_RUN, - SCHED_EVENT_SLEEP, - SCHED_EVENT_WAKEUP, -}; - -struct sched_event { - enum sched_event_type type; - nsec_t timestamp; - nsec_t duration; - unsigned long nr; - int specific_wait; - sem_t *wait_sem; - struct task_desc *wakee; -}; - -static struct task_desc *pid_to_task[MAX_PID]; - -static struct task_desc **tasks; - -static pthread_mutex_t start_work_mutex = PTHREAD_MUTEX_INITIALIZER; -static nsec_t start_time; - -static pthread_mutex_t work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER; - -static unsigned long nr_run_events; -static unsigned long nr_sleep_events; -static unsigned long nr_wakeup_events; - -static unsigned long nr_sleep_corrections; -static unsigned long nr_run_events_optimized; - -static struct sched_event * -get_new_event(struct task_desc *task, nsec_t timestamp) +static struct sched_atom * +get_new_event(struct task_desc *task, u64 timestamp) { - struct sched_event *event = calloc(1, sizeof(*event)); + struct sched_atom *event = calloc(1, sizeof(*event)); unsigned long idx = task->nr_events; size_t size; @@ -203,27 +236,27 @@ get_new_event(struct task_desc *task, nsec_t timestamp) event->nr = idx; task->nr_events++; - size = sizeof(struct sched_event *) * task->nr_events; - task->events = realloc(task->events, size); - BUG_ON(!task->events); + 
size = sizeof(struct sched_atom *) * task->nr_events; + task->atoms = realloc(task->atoms, size); + BUG_ON(!task->atoms); - task->events[idx] = event; + task->atoms[idx] = event; return event; } -static struct sched_event *last_event(struct task_desc *task) +static struct sched_atom *last_event(struct task_desc *task) { if (!task->nr_events) return NULL; - return task->events[task->nr_events - 1]; + return task->atoms[task->nr_events - 1]; } static void -add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration) +add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration) { - struct sched_event *event, *curr_event = last_event(task); + struct sched_atom *event, *curr_event = last_event(task); /* * optimize an existing RUN event by merging this one @@ -243,14 +276,11 @@ add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration) nr_run_events++; } -static unsigned long targetless_wakeups; -static unsigned long multitarget_wakeups; - static void -add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp, +add_sched_event_wakeup(struct task_desc *task, u64 timestamp, struct task_desc *wakee) { - struct sched_event *event, *wakee_event; + struct sched_atom *event, *wakee_event; event = get_new_event(task, timestamp); event->type = SCHED_EVENT_WAKEUP; @@ -275,10 +305,10 @@ add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp, } static void -add_sched_event_sleep(struct task_desc *task, nsec_t timestamp, +add_sched_event_sleep(struct task_desc *task, u64 timestamp, u64 task_state __used) { - struct sched_event *event = get_new_event(task, timestamp); + struct sched_atom *event = get_new_event(task, timestamp); event->type = SCHED_EVENT_SLEEP; @@ -347,27 +377,27 @@ static void add_cross_task_wakeups(void) } static void -process_sched_event(struct task_desc *this_task __used, struct sched_event *event) +process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom) { int ret = 0; - nsec_t now; + u64 now; long long delta; now = get_nsecs(); - delta = start_time + event->timestamp - now; + delta = start_time + atom->timestamp - now; - switch (event->type) { + switch (atom->type) { case SCHED_EVENT_RUN: - burn_nsecs(event->duration); + burn_nsecs(atom->duration); break; case SCHED_EVENT_SLEEP: - if (event->wait_sem) - ret = sem_wait(event->wait_sem); + if (atom->wait_sem) + ret = sem_wait(atom->wait_sem); BUG_ON(ret); break; case SCHED_EVENT_WAKEUP: - if (event->wait_sem) - ret = sem_post(event->wait_sem); + if (atom->wait_sem) + ret = sem_post(atom->wait_sem); BUG_ON(ret); break; default: @@ -375,10 +405,10 @@ process_sched_event(struct task_desc *this_task __used, struct sched_event *even } } -static nsec_t get_cpu_usage_nsec_parent(void) +static u64 get_cpu_usage_nsec_parent(void) { struct rusage ru; - nsec_t sum; + u64 sum; int err; err = getrusage(RUSAGE_SELF, &ru); @@ -390,12 +420,12 @@ static nsec_t get_cpu_usage_nsec_parent(void) return sum; } -static nsec_t get_cpu_usage_nsec_self(void) +static u64 get_cpu_usage_nsec_self(void) { char filename [] = "/proc/1234567890/sched"; unsigned long msecs, nsecs; char *line = NULL; - nsec_t total = 0; + u64 total = 0; size_t len = 0; ssize_t chars; FILE *file; @@ -423,7 +453,7 @@ static nsec_t get_cpu_usage_nsec_self(void) static void *thread_func(void *ctx) { struct task_desc *this_task = ctx; - nsec_t cpu_usage_0, cpu_usage_1; + u64 cpu_usage_0, cpu_usage_1; unsigned long i, ret; char comm2[22]; @@ -442,7 +472,7 @@ again: for (i = 0; i < this_task->nr_events; i++) { 
this_task->curr_event = i; - process_sched_event(this_task, this_task->events[i]); + process_sched_event(this_task, this_task->atoms[i]); } cpu_usage_1 = get_cpu_usage_nsec_self(); @@ -485,14 +515,9 @@ static void create_tasks(void) } } -static nsec_t cpu_usage; -static nsec_t runavg_cpu_usage; -static nsec_t parent_cpu_usage; -static nsec_t runavg_parent_cpu_usage; - static void wait_for_tasks(void) { - nsec_t cpu_usage_0, cpu_usage_1; + u64 cpu_usage_0, cpu_usage_1; struct task_desc *task; unsigned long i, ret; @@ -543,16 +568,9 @@ static void wait_for_tasks(void) } } -static int read_events(void); - -static unsigned long nr_runs; -static nsec_t sum_runtime; -static nsec_t sum_fluct; -static nsec_t run_avg; - static void run_one_test(void) { - nsec_t T0, T1, delta, avg_delta, fluct, std_dev; + u64 T0, T1, delta, avg_delta, fluct, std_dev; T0 = get_nsecs(); wait_for_tasks(); @@ -576,10 +594,6 @@ static void run_one_test(void) printf("#%-3ld: %0.3f, ", nr_runs, (double)delta/1000000.0); -#if 0 - printf("%0.2f +- %0.2f, ", - (double)avg_delta/1e6, (double)std_dev/1e6); -#endif printf("ravg: %0.2f, ", (double)run_avg/1e6); @@ -605,7 +619,7 @@ static void run_one_test(void) static void test_calibrations(void) { - nsec_t T0, T1; + u64 T0, T1; T0 = get_nsecs(); burn_nsecs(1e6); @@ -622,7 +636,7 @@ static void test_calibrations(void) static void __cmd_replay(void) { - long nr_iterations = 10, i; + unsigned long i; calibrate_run_measurement_overhead(); calibrate_sleep_measurement_overhead(); @@ -640,7 +654,7 @@ static void __cmd_replay(void) if (multitarget_wakeups) printf("multi-target wakeups: %ld\n", multitarget_wakeups); if (nr_run_events_optimized) - printf("run events optimized: %ld\n", + printf("run atoms optimized: %ld\n", nr_run_events_optimized); print_task_traces(); @@ -648,7 +662,7 @@ static void __cmd_replay(void) create_tasks(); printf("------------------------------------------------------------\n"); - for (i = 0; i < nr_iterations; i++) + for (i = 0; i < replay_repeat; i++) run_one_test(); } @@ -659,14 +673,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->comm.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", + dump_printf("%p [%p]: perf_event_comm: %s:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->comm.comm, event->comm.pid); if (thread == NULL || thread__set_comm(thread, event->comm.comm)) { - dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n"); + dump_printf("problem processing perf_event_comm, skipping event.\n"); return -1; } total_comm++; @@ -718,6 +732,20 @@ struct trace_switch_event { u32 next_prio; }; +struct trace_runtime_event { + u32 size; + + u16 common_type; + u8 common_flags; + u8 common_preempt_count; + u32 common_pid; + u32 common_tgid; + + char comm[16]; + u32 pid; + u64 runtime; + u64 vruntime; +}; struct trace_wakeup_event { u32 size; @@ -758,6 +786,12 @@ struct trace_sched_handler { u64 timestamp, struct thread *thread); + void (*runtime_event)(struct trace_runtime_event *, + struct event *, + int cpu, + u64 timestamp, + struct thread *thread); + void (*wakeup_event)(struct trace_wakeup_event *, struct event *, int cpu, @@ -796,7 +830,7 @@ replay_wakeup_event(struct trace_wakeup_event *wakeup_event, add_sched_event_wakeup(waker, timestamp, wakee); } -static unsigned long cpu_last_switched[MAX_CPUS]; +static u64 cpu_last_switched[MAX_CPUS]; static void replay_switch_event(struct trace_switch_event *switch_event, @@ 
-863,45 +897,51 @@ static struct trace_sched_handler replay_ops = { .fork_event = replay_fork_event, }; -#define TASK_STATE_TO_CHAR_STR "RSDTtZX" - -enum thread_state { - THREAD_SLEEPING = 0, - THREAD_WAIT_CPU, - THREAD_SCHED_IN, - THREAD_IGNORE -}; - -struct work_atom { +struct sort_dimension { + const char *name; + sort_fn_t cmp; struct list_head list; - enum thread_state state; - u64 wake_up_time; - u64 sched_in_time; - u64 runtime; }; -struct task_atoms { - struct list_head snapshot_list; - struct thread *thread; - struct rb_node node; -}; +static LIST_HEAD(cmp_pid); -static struct rb_root lat_snapshot_root; +static int +thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r) +{ + struct sort_dimension *sort; + int ret = 0; + + BUG_ON(list_empty(list)); + + list_for_each_entry(sort, list, list) { + ret = sort->cmp(l, r); + if (ret) + return ret; + } + + return ret; +} -static struct task_atoms * -thread_atom_list_search(struct rb_root *root, struct thread *thread) +static struct work_atoms * +thread_atoms_search(struct rb_root *root, struct thread *thread, + struct list_head *sort_list) { struct rb_node *node = root->rb_node; + struct work_atoms key = { .thread = thread }; while (node) { - struct task_atoms *atoms; + struct work_atoms *atoms; + int cmp; - atoms = container_of(node, struct task_atoms, node); - if (thread->pid < atoms->thread->pid) + atoms = container_of(node, struct work_atoms, node); + + cmp = thread_lat_cmp(sort_list, &key, atoms); + if (cmp > 0) node = node->rb_left; - else if (thread->pid > atoms->thread->pid) + else if (cmp < 0) node = node->rb_right; else { + BUG_ON(thread != atoms->thread); return atoms; } } @@ -909,37 +949,41 @@ thread_atom_list_search(struct rb_root *root, struct thread *thread) } static void -__thread_latency_insert(struct rb_root *root, struct task_atoms *data) +__thread_latency_insert(struct rb_root *root, struct work_atoms *data, + struct list_head *sort_list) { struct rb_node **new = &(root->rb_node), *parent = NULL; while (*new) { - struct task_atoms *this; + struct work_atoms *this; + int cmp; - this = container_of(*new, struct task_atoms, node); + this = container_of(*new, struct work_atoms, node); parent = *new; - if (data->thread->pid < this->thread->pid) + + cmp = thread_lat_cmp(sort_list, data, this); + + if (cmp > 0) new = &((*new)->rb_left); - else if (data->thread->pid > this->thread->pid) - new = &((*new)->rb_right); else - die("Double thread insertion\n"); + new = &((*new)->rb_right); } rb_link_node(&data->node, parent, new); rb_insert_color(&data->node, root); } -static void thread_atom_list_insert(struct thread *thread) +static void thread_atoms_insert(struct thread *thread) { - struct task_atoms *atoms; + struct work_atoms *atoms; + atoms = calloc(sizeof(*atoms), 1); if (!atoms) die("No memory"); atoms->thread = thread; - INIT_LIST_HEAD(&atoms->snapshot_list); - __thread_latency_insert(&lat_snapshot_root, atoms); + INIT_LIST_HEAD(&atoms->work_list); + __thread_latency_insert(&atom_root, atoms, &cmp_pid); } static void @@ -961,47 +1005,66 @@ static char sched_out_state(struct trace_switch_event *switch_event) } static void -lat_sched_out(struct task_atoms *atoms, - struct trace_switch_event *switch_event __used, - u64 delta, - u64 timestamp) +add_sched_out_event(struct work_atoms *atoms, + char run_state, + u64 timestamp) { - struct work_atom *snapshot; + struct work_atom *atom; - snapshot = calloc(sizeof(*snapshot), 1); - if (!snapshot) + atom = calloc(sizeof(*atom), 1); + if (!atom) die("Non 
memory"); - if (sched_out_state(switch_event) == 'R') { - snapshot->state = THREAD_WAIT_CPU; - snapshot->wake_up_time = timestamp; + atom->sched_out_time = timestamp; + + if (run_state == 'R') { + atom->state = THREAD_WAIT_CPU; + atom->wake_up_time = atom->sched_out_time; } - snapshot->runtime = delta; - list_add_tail(&snapshot->list, &atoms->snapshot_list); + list_add_tail(&atom->list, &atoms->work_list); +} + +static void +add_runtime_event(struct work_atoms *atoms, u64 delta, u64 timestamp __used) +{ + struct work_atom *atom; + + BUG_ON(list_empty(&atoms->work_list)); + + atom = list_entry(atoms->work_list.prev, struct work_atom, list); + + atom->runtime += delta; + atoms->total_runtime += delta; } static void -lat_sched_in(struct task_atoms *atoms, u64 timestamp) +add_sched_in_event(struct work_atoms *atoms, u64 timestamp) { - struct work_atom *snapshot; + struct work_atom *atom; + u64 delta; - if (list_empty(&atoms->snapshot_list)) + if (list_empty(&atoms->work_list)) return; - snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom, - list); + atom = list_entry(atoms->work_list.prev, struct work_atom, list); - if (snapshot->state != THREAD_WAIT_CPU) + if (atom->state != THREAD_WAIT_CPU) return; - if (timestamp < snapshot->wake_up_time) { - snapshot->state = THREAD_IGNORE; + if (timestamp < atom->wake_up_time) { + atom->state = THREAD_IGNORE; return; } - snapshot->state = THREAD_SCHED_IN; - snapshot->sched_in_time = timestamp; + atom->state = THREAD_SCHED_IN; + atom->sched_in_time = timestamp; + + delta = atom->sched_in_time - atom->wake_up_time; + atoms->total_lat += delta; + if (delta > atoms->max_lat) + atoms->max_lat = delta; + atoms->nb_atoms++; } static void @@ -1011,13 +1074,12 @@ latency_switch_event(struct trace_switch_event *switch_event, u64 timestamp, struct thread *thread __used) { - struct task_atoms *out_atoms, *in_atoms; + struct work_atoms *out_events, *in_events; struct thread *sched_out, *sched_in; u64 timestamp0; s64 delta; - if (cpu >= MAX_CPUS || cpu < 0) - return; + BUG_ON(cpu >= MAX_CPUS || cpu < 0); timestamp0 = cpu_last_switched[cpu]; cpu_last_switched[cpu] = timestamp; @@ -1033,35 +1095,64 @@ latency_switch_event(struct trace_switch_event *switch_event, sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match); sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match); - in_atoms = thread_atom_list_search(&lat_snapshot_root, sched_in); - if (!in_atoms) { - thread_atom_list_insert(sched_in); - in_atoms = thread_atom_list_search(&lat_snapshot_root, sched_in); - if (!in_atoms) - die("Internal latency tree error"); + out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid); + if (!out_events) { + thread_atoms_insert(sched_out); + out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid); + if (!out_events) + die("out-event: Internal tree error"); + } + add_sched_out_event(out_events, sched_out_state(switch_event), timestamp); + + in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid); + if (!in_events) { + thread_atoms_insert(sched_in); + in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid); + if (!in_events) + die("in-event: Internal tree error"); + /* + * Take came in we have not heard about yet, + * add in an initial atom in runnable state: + */ + add_sched_out_event(in_events, 'R', timestamp); } + add_sched_in_event(in_events, timestamp); +} - out_atoms = thread_atom_list_search(&lat_snapshot_root, sched_out); - if (!out_atoms) { - thread_atom_list_insert(sched_out); - 
out_atoms = thread_atom_list_search(&lat_snapshot_root, sched_out); - if (!out_atoms) - die("Internal latency tree error"); +static void +latency_runtime_event(struct trace_runtime_event *runtime_event, + struct event *event __used, + int cpu, + u64 timestamp, + struct thread *this_thread __used) +{ + struct work_atoms *atoms; + struct thread *thread; + + BUG_ON(cpu >= MAX_CPUS || cpu < 0); + + thread = threads__findnew(runtime_event->pid, &threads, &last_match); + atoms = thread_atoms_search(&atom_root, thread, &cmp_pid); + if (!atoms) { + thread_atoms_insert(thread); + atoms = thread_atoms_search(&atom_root, thread, &cmp_pid); + if (!atoms) + die("in-event: Internal tree error"); + add_sched_out_event(atoms, 'R', timestamp); } - lat_sched_in(in_atoms, timestamp); - lat_sched_out(out_atoms, switch_event, delta, timestamp); + add_runtime_event(atoms, runtime_event->runtime, timestamp); } static void latency_wakeup_event(struct trace_wakeup_event *wakeup_event, - struct event *event __used, + struct event *__event __used, int cpu __used, u64 timestamp, struct thread *thread __used) { - struct task_atoms *atoms; - struct work_atom *snapshot; + struct work_atoms *atoms; + struct work_atom *atom; struct thread *wakee; /* Note for later, it may be interesting to observe the failing cases */ @@ -1069,73 +1160,198 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, return; wakee = threads__findnew(wakeup_event->pid, &threads, &last_match); - atoms = thread_atom_list_search(&lat_snapshot_root, wakee); + atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid); if (!atoms) { - thread_atom_list_insert(wakee); - return; + thread_atoms_insert(wakee); + atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid); + if (!atoms) + die("wakeup-event: Internal tree error"); + add_sched_out_event(atoms, 'S', timestamp); } - if (list_empty(&atoms->snapshot_list)) - return; + BUG_ON(list_empty(&atoms->work_list)); + + atom = list_entry(atoms->work_list.prev, struct work_atom, list); - snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom, - list); + if (atom->state != THREAD_SLEEPING) + nr_state_machine_bugs++; - if (snapshot->state != THREAD_SLEEPING) + nr_timestamps++; + if (atom->sched_out_time > timestamp) { + nr_unordered_timestamps++; return; + } - snapshot->state = THREAD_WAIT_CPU; - snapshot->wake_up_time = timestamp; + atom->state = THREAD_WAIT_CPU; + atom->wake_up_time = timestamp; } static struct trace_sched_handler lat_ops = { .wakeup_event = latency_wakeup_event, .switch_event = latency_switch_event, + .runtime_event = latency_runtime_event, .fork_event = latency_fork_event, }; -static u64 all_runtime; -static u64 all_count; - -static void output_lat_thread(struct task_atoms *atom_list) +static void output_lat_thread(struct work_atoms *work_list) { - struct work_atom *atom; - int count = 0; int i; int ret; - u64 max = 0, avg; - u64 total = 0, delta; - u64 total_runtime = 0; + u64 avg; - list_for_each_entry(atom, &atom_list->snapshot_list, list) { - total_runtime += atom->runtime; + if (!work_list->nb_atoms) + return; + /* + * Ignore idle threads: + */ + if (!work_list->thread->pid) + return; - if (atom->state != THREAD_SCHED_IN) - continue; + all_runtime += work_list->total_runtime; + all_count += work_list->nb_atoms; - count++; + ret = printf(" %s-%d ", work_list->thread->comm, work_list->thread->pid); - delta = atom->sched_in_time - atom->wake_up_time; - if (delta > max) - max = delta; - total += delta; - } + for (i = 0; i < 24 - ret; i++) + printf(" "); - all_runtime += 
total_runtime; - all_count += count; + avg = work_list->total_lat / work_list->nb_atoms; - if (!count) - return; + printf("|%11.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n", + (double)work_list->total_runtime / 1e6, + work_list->nb_atoms, (double)avg / 1e6, + (double)work_list->max_lat / 1e6); +} - ret = printf(" %s ", atom_list->thread->comm); +static int pid_cmp(struct work_atoms *l, struct work_atoms *r) +{ + if (l->thread->pid < r->thread->pid) + return -1; + if (l->thread->pid > r->thread->pid) + return 1; - for (i = 0; i < 19 - ret; i++) - printf(" "); + return 0; +} + +static struct sort_dimension pid_sort_dimension = { + .name = "pid", + .cmp = pid_cmp, +}; + +static int avg_cmp(struct work_atoms *l, struct work_atoms *r) +{ + u64 avgl, avgr; + + if (!l->nb_atoms) + return -1; + + if (!r->nb_atoms) + return 1; + + avgl = l->total_lat / l->nb_atoms; + avgr = r->total_lat / r->nb_atoms; + + if (avgl < avgr) + return -1; + if (avgl > avgr) + return 1; - avg = total / count; + return 0; +} + +static struct sort_dimension avg_sort_dimension = { + .name = "avg", + .cmp = avg_cmp, +}; + +static int max_cmp(struct work_atoms *l, struct work_atoms *r) +{ + if (l->max_lat < r->max_lat) + return -1; + if (l->max_lat > r->max_lat) + return 1; - printf("|%9.3f ms |%9d | avg:%9.3f ms | max:%9.3f ms |\n", - (double)total_runtime/1e9, count, (double)avg/1e9, (double)max/1e9); + return 0; +} + +static struct sort_dimension max_sort_dimension = { + .name = "max", + .cmp = max_cmp, +}; + +static int switch_cmp(struct work_atoms *l, struct work_atoms *r) +{ + if (l->nb_atoms < r->nb_atoms) + return -1; + if (l->nb_atoms > r->nb_atoms) + return 1; + + return 0; +} + +static struct sort_dimension switch_sort_dimension = { + .name = "switch", + .cmp = switch_cmp, +}; + +static int runtime_cmp(struct work_atoms *l, struct work_atoms *r) +{ + if (l->total_runtime < r->total_runtime) + return -1; + if (l->total_runtime > r->total_runtime) + return 1; + + return 0; +} + +static struct sort_dimension runtime_sort_dimension = { + .name = "runtime", + .cmp = runtime_cmp, +}; + +static struct sort_dimension *available_sorts[] = { + &pid_sort_dimension, + &avg_sort_dimension, + &max_sort_dimension, + &switch_sort_dimension, + &runtime_sort_dimension, +}; + +#define NB_AVAILABLE_SORTS (int)(sizeof(available_sorts) / sizeof(struct sort_dimension *)) + +static LIST_HEAD(sort_list); + +static int sort_dimension__add(char *tok, struct list_head *list) +{ + int i; + + for (i = 0; i < NB_AVAILABLE_SORTS; i++) { + if (!strcmp(available_sorts[i]->name, tok)) { + list_add_tail(&available_sorts[i]->list, list); + + return 0; + } + } + + return -1; +} + +static void setup_sorting(void); + +static void sort_lat(void) +{ + struct rb_node *node; + + for (;;) { + struct work_atoms *data; + node = rb_first(&atom_root); + if (!node) + break; + + rb_erase(node, &atom_root); + data = rb_entry(node, struct work_atoms, node); + __thread_latency_insert(&sorted_atom_root, data, &sort_list); + } } static void __cmd_lat(void) @@ -1144,25 +1360,56 @@ static void __cmd_lat(void) setup_pager(); read_events(); + sort_lat(); - printf("-----------------------------------------------------------------------------------\n"); - printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms |\n"); - printf("-----------------------------------------------------------------------------------\n"); + printf("\n -----------------------------------------------------------------------------------------\n"); + printf(" Task | Runtime ms | 
Switches | Average delay ms | Maximum delay ms |\n");
+	printf(" -----------------------------------------------------------------------------------------\n");
 
-	next = rb_first(&lat_snapshot_root);
+	next = rb_first(&sorted_atom_root);
 
 	while (next) {
-		struct task_atoms *atom_list;
+		struct work_atoms *work_list;
 
-		atom_list = rb_entry(next, struct task_atoms, node);
-		output_lat_thread(atom_list);
+		work_list = rb_entry(next, struct work_atoms, node);
+		output_lat_thread(work_list);
 		next = rb_next(next);
 	}
 
-	printf("-----------------------------------------------------------------------------------\n");
-	printf(" TOTAL: |%9.3f ms |%9Ld |\n",
-		(double)all_runtime/1e9, all_count);
-	printf("---------------------------------------------\n");
+	printf(" -----------------------------------------------------------------------------------------\n");
+	printf(" TOTAL: |%11.3f ms |%9Ld |\n",
+		(double)all_runtime/1e6, all_count);
+
+	printf(" ---------------------------------------------------\n");
+	if (nr_unordered_timestamps && nr_timestamps) {
+		printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
+			(double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
+			nr_unordered_timestamps, nr_timestamps);
+	}
+	if (nr_lost_events && nr_events) {
+		printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
+			(double)nr_lost_events/(double)nr_events*100.0,
+			nr_lost_events, nr_events, nr_lost_chunks);
+	}
+	if (nr_state_machine_bugs && nr_timestamps) {
+		printf(" INFO: %.3f%% state machine bugs (%ld out of %ld)",
+			(double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
+			nr_state_machine_bugs, nr_timestamps);
+		if (nr_lost_events)
+			printf(" (due to lost events?)");
+		printf("\n");
+	}
+	if (nr_context_switch_bugs && nr_timestamps) {
+		printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)",
+			(double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
+			nr_context_switch_bugs, nr_timestamps);
+		if (nr_lost_events)
+			printf(" (due to lost events?)");
+		printf("\n");
+	}
+	printf("\n");
+
 }
 
 static struct trace_sched_handler *trace_handler;
 
@@ -1187,10 +1434,16 @@ process_sched_wakeup_event(struct raw_event_sample *raw,
 	trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread);
 }
 
+/*
+ * Track the current task - that way we can know whether there's any
+ * weird events, such as a task being switched away that is not current.
+ */
+static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };
+
 static void
 process_sched_switch_event(struct raw_event_sample *raw,
 			   struct event *event,
-			   int cpu __used,
+			   int cpu,
 			   u64 timestamp __used,
 			   struct thread *thread __used)
 {
@@ -1206,10 +1459,37 @@ process_sched_switch_event(struct raw_event_sample *raw,
 	FILL_FIELD(switch_event, next_pid, event, raw->data);
 	FILL_FIELD(switch_event, next_prio, event, raw->data);
 
+	if (curr_pid[cpu] != (u32)-1) {
+		/*
+		 * Are we trying to switch away a PID that is
+		 * not current?
+		 */
+		if (curr_pid[cpu] != switch_event.prev_pid)
+			nr_context_switch_bugs++;
+	}
+	curr_pid[cpu] = switch_event.next_pid;
+
 	trace_handler->switch_event(&switch_event, event, cpu, timestamp, thread);
 }
 
 static void
+process_sched_runtime_event(struct raw_event_sample *raw,
+			 struct event *event,
+			 int cpu __used,
+			 u64 timestamp __used,
+			 struct thread *thread __used)
+{
+	struct trace_runtime_event runtime_event;
+
+	FILL_ARRAY(runtime_event, comm, event, raw->data);
+	FILL_FIELD(runtime_event, pid, event, raw->data);
+	FILL_FIELD(runtime_event, runtime, event, raw->data);
+	FILL_FIELD(runtime_event, vruntime, event, raw->data);
+
+	trace_handler->runtime_event(&runtime_event, event, cpu, timestamp, thread);
+}
+
+static void
 process_sched_fork_event(struct raw_event_sample *raw,
 			 struct event *event,
 			 int cpu __used,
@@ -1251,6 +1531,8 @@ process_raw_event(event_t *raw_event __used, void *more_data,
 
 	if (!strcmp(event->name, "sched_switch"))
 		process_sched_switch_event(raw, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_stat_runtime"))
+		process_sched_runtime_event(raw, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_wakeup"))
 		process_sched_wakeup_event(raw, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_wakeup_new"))
@@ -1344,8 +1626,13 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 {
 	trace_event(event);
 
+	nr_events++;
 	switch (event->header.type) {
-	case PERF_EVENT_MMAP ... PERF_EVENT_LOST:
+	case PERF_EVENT_MMAP:
+		return 0;
+	case PERF_EVENT_LOST:
+		nr_lost_chunks++;
+		nr_lost_events += event->lost.lost;
 		return 0;
 
 	case PERF_EVENT_COMM:
@@ -1462,48 +1749,140 @@ more:
 }
 
 static const char * const sched_usage[] = {
-	"perf sched [<options>] <command>",
+	"perf sched [<options>] {record|latency|replay|trace}",
 	NULL
 };
 
-static const struct option options[] = {
+static const struct option sched_options[] = {
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
-	OPT_BOOLEAN('r', "replay", &replay_mode,
-		    "replay sched behaviour from traces"),
-	OPT_BOOLEAN('l', "latency", &lat_mode,
-		    "measure various latencies"),
+	OPT_END()
+};
+
+static const char * const latency_usage[] = {
+	"perf sched latency [<options>]",
+	NULL
+};
+
+static const struct option latency_options[] = {
+	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
+		   "sort by key(s): runtime, switch, avg, max"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
 	OPT_END()
 };
 
+static const char * const replay_usage[] = {
+	"perf sched replay [<options>]",
+	NULL
+};
+
+static const struct option replay_options[] = {
+	OPT_INTEGER('r', "repeat", &replay_repeat,
+		    "repeat the workload replay N times (-1: infinite)"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_END()
+};
+
+static void setup_sorting(void)
+{
+	char *tmp, *tok, *str = strdup(sort_order);
+
+	for (tok = strtok_r(str, ", ", &tmp);
+			tok; tok = strtok_r(NULL, ", ", &tmp)) {
+		if (sort_dimension__add(tok, &sort_list) < 0) {
+			error("Unknown --sort key: `%s'", tok);
+			usage_with_options(latency_usage, latency_options);
+		}
+	}
+
+	free(str);
+
+	sort_dimension__add((char *)"pid", &cmp_pid);
+}
+
+static const char *record_args[] = {
+	"record",
+	"-a",
+	"-R",
+	"-M",
+	"-f",
+	"-m", 
"1024", + "-c", "1", + "-e", "sched:sched_switch:r", + "-e", "sched:sched_stat_wait:r", + "-e", "sched:sched_stat_sleep:r", + "-e", "sched:sched_stat_iowait:r", + "-e", "sched:sched_stat_runtime:r", + "-e", "sched:sched_process_exit:r", + "-e", "sched:sched_process_fork:r", + "-e", "sched:sched_wakeup:r", + "-e", "sched:sched_migrate_task:r", +}; + +static int __cmd_record(int argc, const char **argv) +{ + unsigned int rec_argc, i, j; + const char **rec_argv; + + rec_argc = ARRAY_SIZE(record_args) + argc - 1; + rec_argv = calloc(rec_argc + 1, sizeof(char *)); + + for (i = 0; i < ARRAY_SIZE(record_args); i++) + rec_argv[i] = strdup(record_args[i]); + + for (j = 1; j < (unsigned int)argc; j++, i++) + rec_argv[i] = argv[j]; + + BUG_ON(i != rec_argc); + + return cmd_record(i, rec_argv, NULL); +} + int cmd_sched(int argc, const char **argv, const char *prefix __used) { symbol__init(); page_size = getpagesize(); - argc = parse_options(argc, argv, options, sched_usage, 0); - if (argc) { - /* - * Special case: if there's an argument left then assume tha - * it's a symbol filter: - */ - if (argc > 1) - usage_with_options(sched_usage, options); - } + argc = parse_options(argc, argv, sched_options, sched_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (!argc) + usage_with_options(sched_usage, sched_options); - if (replay_mode) - trace_handler = &replay_ops; - else if (lat_mode) + if (!strncmp(argv[0], "rec", 3)) { + return __cmd_record(argc, argv); + } else if (!strncmp(argv[0], "lat", 3)) { trace_handler = &lat_ops; - else - usage_with_options(sched_usage, options); - - if (replay_mode) - __cmd_replay(); - else if (lat_mode) + if (argc > 1) { + argc = parse_options(argc, argv, latency_options, latency_usage, 0); + if (argc) + usage_with_options(latency_usage, latency_options); + } + setup_sorting(); __cmd_lat(); + } else if (!strncmp(argv[0], "rep", 3)) { + trace_handler = &replay_ops; + if (argc) { + argc = parse_options(argc, argv, replay_options, replay_usage, 0); + if (argc) + usage_with_options(replay_usage, replay_options); + } + __cmd_replay(); + } else if (!strcmp(argv[0], "trace")) { + /* + * Aliased to 'perf trace' for now: + */ + return cmd_trace(argc, argv, prefix); + } else { + usage_with_options(sched_usage, sched_options); + } return 0; }