perf tools: Factorize the map helpers
[safe/jmp/linux-2.6] / tools / perf / builtin-record.c
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #include "builtin.h"
9
10 #include "perf.h"
11
12 #include "util/util.h"
13 #include "util/parse-options.h"
14 #include "util/parse-events.h"
15 #include "util/string.h"
16
17 #include "util/header.h"
18 #include "util/event.h"
19
20 #include <unistd.h>
21 #include <sched.h>
22
/*
 * ALIGN(x, a) rounds x up to the next multiple of a; a must be a power
 * of two for the mask arithmetic to be valid.  The mask is cast to the
 * type of x so that the complement is computed at x's width.
 *
 * __typeof__ is used instead of the bare typeof keyword so the macro
 * also builds when the compiler is in a strict ISO mode.
 */
#define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
#define ALIGN(x, a)		__ALIGN_MASK(x, (__typeof__(x))(a)-1)
25
26 static int                      fd[MAX_NR_CPUS][MAX_COUNTERS];
27
28 static long                     default_interval                = 100000;
29
30 static int                      nr_cpus                         = 0;
31 static unsigned int             page_size;
32 static unsigned int             mmap_pages                      = 128;
33 static int                      freq                            = 0;
34 static int                      output;
35 static const char               *output_name                    = "perf.data";
36 static int                      group                           = 0;
37 static unsigned int             realtime_prio                   = 0;
38 static int                      system_wide                     = 0;
39 static pid_t                    target_pid                      = -1;
40 static int                      inherit                         = 1;
41 static int                      force                           = 0;
42 static int                      append_file                     = 0;
43 static int                      call_graph                      = 0;
44 static int                      inherit_stat                    = 0;
45 static int                      no_samples                      = 0;
46 static int                      sample_address                  = 0;
47
48 static long                     samples;
49 static struct timeval           last_read;
50 static struct timeval           this_read;
51
52 static u64                      bytes_written;
53
54 static struct pollfd            event_array[MAX_NR_CPUS * MAX_COUNTERS];
55
56 static int                      nr_poll;
57 static int                      nr_cpu;
58
59 static int                      file_new = 1;
60
61 struct perf_header              *header;
62
63 struct mmap_data {
64         int                     counter;
65         void                    *base;
66         unsigned int            mask;
67         unsigned int            prev;
68 };
69
70 static struct mmap_data         mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
71
72 static unsigned long mmap_read_head(struct mmap_data *md)
73 {
74         struct perf_counter_mmap_page *pc = md->base;
75         long head;
76
77         head = pc->data_head;
78         rmb();
79
80         return head;
81 }
82
83 static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
84 {
85         struct perf_counter_mmap_page *pc = md->base;
86
87         /*
88          * ensure all reads are done before we write the tail out.
89          */
90         /* mb(); */
91         pc->data_tail = tail;
92 }
93
94 static void write_output(void *buf, size_t size)
95 {
96         while (size) {
97                 int ret = write(output, buf, size);
98
99                 if (ret < 0)
100                         die("failed to write");
101
102                 size -= ret;
103                 buf += ret;
104
105                 bytes_written += ret;
106         }
107 }
108
109 static void mmap_read(struct mmap_data *md)
110 {
111         unsigned int head = mmap_read_head(md);
112         unsigned int old = md->prev;
113         unsigned char *data = md->base + page_size;
114         unsigned long size;
115         void *buf;
116         int diff;
117
118         gettimeofday(&this_read, NULL);
119
120         /*
121          * If we're further behind than half the buffer, there's a chance
122          * the writer will bite our tail and mess up the samples under us.
123          *
124          * If we somehow ended up ahead of the head, we got messed up.
125          *
126          * In either case, truncate and restart at head.
127          */
128         diff = head - old;
129         if (diff < 0) {
130                 struct timeval iv;
131                 unsigned long msecs;
132
133                 timersub(&this_read, &last_read, &iv);
134                 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
135
136                 fprintf(stderr, "WARNING: failed to keep up with mmap data."
137                                 "  Last read %lu msecs ago.\n", msecs);
138
139                 /*
140                  * head points to a known good entry, start there.
141                  */
142                 old = head;
143         }
144
145         last_read = this_read;
146
147         if (old != head)
148                 samples++;
149
150         size = head - old;
151
152         if ((old & md->mask) + size != (head & md->mask)) {
153                 buf = &data[old & md->mask];
154                 size = md->mask + 1 - (old & md->mask);
155                 old += size;
156
157                 write_output(buf, size);
158         }
159
160         buf = &data[old & md->mask];
161         size = head - old;
162         old += size;
163
164         write_output(buf, size);
165
166         md->prev = old;
167         mmap_write_tail(md, old);
168 }
169
/*
 * Flags shared between the signal handler and the rest of the program.
 * They are volatile sig_atomic_t, the type the C standard designates
 * for objects written from a signal handler.
 *
 * done:  set to 1 once a handled signal has arrived.
 * signr: number of the signal received, -1 while none has been.
 */
static volatile sig_atomic_t done = 0;
static volatile sig_atomic_t signr = -1;

/*
 * Signal handler: note that a signal arrived and which one it was.
 */
static void sig_handler(int sig)
{
	done = 1;
	signr = sig;
}
178
179 static void sig_atexit(void)
180 {
181         if (signr == -1)
182                 return;
183
184         signal(signr, SIG_DFL);
185         kill(getpid(), signr);
186 }
187
188 static void pid_synthesize_comm_event(pid_t pid, int full)
189 {
190         struct comm_event comm_ev;
191         char filename[PATH_MAX];
192         char bf[BUFSIZ];
193         int fd;
194         size_t size;
195         char *field, *sep;
196         DIR *tasks;
197         struct dirent dirent, *next;
198
199         snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
200
201         fd = open(filename, O_RDONLY);
202         if (fd < 0) {
203                 /*
204                  * We raced with a task exiting - just return:
205                  */
206                 if (verbose)
207                         fprintf(stderr, "couldn't open %s\n", filename);
208                 return;
209         }
210         if (read(fd, bf, sizeof(bf)) < 0) {
211                 fprintf(stderr, "couldn't read %s\n", filename);
212                 exit(EXIT_FAILURE);
213         }
214         close(fd);
215
216         /* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
217         memset(&comm_ev, 0, sizeof(comm_ev));
218         field = strchr(bf, '(');
219         if (field == NULL)
220                 goto out_failure;
221         sep = strchr(++field, ')');
222         if (sep == NULL)
223                 goto out_failure;
224         size = sep - field;
225         memcpy(comm_ev.comm, field, size++);
226
227         comm_ev.pid = pid;
228         comm_ev.header.type = PERF_EVENT_COMM;
229         size = ALIGN(size, sizeof(u64));
230         comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
231
232         if (!full) {
233                 comm_ev.tid = pid;
234
235                 write_output(&comm_ev, comm_ev.header.size);
236                 return;
237         }
238
239         snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
240
241         tasks = opendir(filename);
242         while (!readdir_r(tasks, &dirent, &next) && next) {
243                 char *end;
244                 pid = strtol(dirent.d_name, &end, 10);
245                 if (*end)
246                         continue;
247
248                 comm_ev.tid = pid;
249
250                 write_output(&comm_ev, comm_ev.header.size);
251         }
252         closedir(tasks);
253         return;
254
255 out_failure:
256         fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
257                 filename);
258         exit(EXIT_FAILURE);
259 }
260
261 static void pid_synthesize_mmap_samples(pid_t pid)
262 {
263         char filename[PATH_MAX];
264         FILE *fp;
265
266         snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);
267
268         fp = fopen(filename, "r");
269         if (fp == NULL) {
270                 /*
271                  * We raced with a task exiting - just return:
272                  */
273                 if (verbose)
274                         fprintf(stderr, "couldn't open %s\n", filename);
275                 return;
276         }
277         while (1) {
278                 char bf[BUFSIZ], *pbf = bf;
279                 struct mmap_event mmap_ev = {
280                         .header = { .type = PERF_EVENT_MMAP },
281                 };
282                 int n;
283                 size_t size;
284                 if (fgets(bf, sizeof(bf), fp) == NULL)
285                         break;
286
287                 /* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
288                 n = hex2u64(pbf, &mmap_ev.start);
289                 if (n < 0)
290                         continue;
291                 pbf += n + 1;
292                 n = hex2u64(pbf, &mmap_ev.len);
293                 if (n < 0)
294                         continue;
295                 pbf += n + 3;
296                 if (*pbf == 'x') { /* vm_exec */
297                         char *execname = strchr(bf, '/');
298
299                         /* Catch VDSO */
300                         if (execname == NULL)
301                                 execname = strstr(bf, "[vdso]");
302
303                         if (execname == NULL)
304                                 continue;
305
306                         size = strlen(execname);
307                         execname[size - 1] = '\0'; /* Remove \n */
308                         memcpy(mmap_ev.filename, execname, size);
309                         size = ALIGN(size, sizeof(u64));
310                         mmap_ev.len -= mmap_ev.start;
311                         mmap_ev.header.size = (sizeof(mmap_ev) -
312                                                (sizeof(mmap_ev.filename) - size));
313                         mmap_ev.pid = pid;
314                         mmap_ev.tid = pid;
315
316                         write_output(&mmap_ev, mmap_ev.header.size);
317                 }
318         }
319
320         fclose(fp);
321 }
322
/*
 * Write comm and mmap records for every task that has a purely
 * numeric directory entry in /proc.
 *
 * If /proc cannot be opened a message is printed and nothing is
 * synthesized.
 */
static void synthesize_all(void)
{
	DIR *proc;
	struct dirent dirent, *next;

	proc = opendir("/proc");
	if (proc == NULL) {
		/* readdir_r() must not be handed a NULL stream */
		fprintf(stderr, "couldn't open /proc\n");
		return;
	}

	while (!readdir_r(proc, &dirent, &next) && next) {
		char *end;
		pid_t pid;

		pid = strtol(dirent.d_name, &end, 10);
		if (*end) /* only interested in proper numerical dirents */
			continue;

		pid_synthesize_comm_event(pid, 1);
		pid_synthesize_mmap_samples(pid);
	}

	closedir(proc);
}
344
345 static int group_fd;
346
347 static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int nr)
348 {
349         struct perf_header_attr *h_attr;
350
351         if (nr < header->attrs) {
352                 h_attr = header->attr[nr];
353         } else {
354                 h_attr = perf_header_attr__new(a);
355                 perf_header__add_attr(header, h_attr);
356         }
357
358         return h_attr;
359 }
360
361 static void create_counter(int counter, int cpu, pid_t pid)
362 {
363         struct perf_counter_attr *attr = attrs + counter;
364         struct perf_header_attr *h_attr;
365         int track = !counter; /* only the first counter needs these */
366         struct {
367                 u64 count;
368                 u64 time_enabled;
369                 u64 time_running;
370                 u64 id;
371         } read_data;
372
373         attr->read_format       = PERF_FORMAT_TOTAL_TIME_ENABLED |
374                                   PERF_FORMAT_TOTAL_TIME_RUNNING |
375                                   PERF_FORMAT_ID;
376
377         attr->sample_type       = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
378
379         if (freq) {
380                 attr->sample_type       |= PERF_SAMPLE_PERIOD;
381                 attr->freq              = 1;
382                 attr->sample_freq       = freq;
383         }
384
385         if (no_samples)
386                 attr->sample_freq = 0;
387
388         if (inherit_stat)
389                 attr->inherit_stat = 1;
390
391         if (sample_address)
392                 attr->sample_type       |= PERF_SAMPLE_ADDR;
393
394         if (call_graph)
395                 attr->sample_type       |= PERF_SAMPLE_CALLCHAIN;
396
397
398         attr->mmap              = track;
399         attr->comm              = track;
400         attr->inherit           = (cpu < 0) && inherit;
401         attr->disabled          = 1;
402
403 try_again:
404         fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
405
406         if (fd[nr_cpu][counter] < 0) {
407                 int err = errno;
408
409                 if (err == EPERM)
410                         die("Permission error - are you root?\n");
411
412                 /*
413                  * If it's cycles then fall back to hrtimer
414                  * based cpu-clock-tick sw counter, which
415                  * is always available even if no PMU support:
416                  */
417                 if (attr->type == PERF_TYPE_HARDWARE
418                         && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
419
420                         if (verbose)
421                                 warning(" ... trying to fall back to cpu-clock-ticks\n");
422                         attr->type = PERF_TYPE_SOFTWARE;
423                         attr->config = PERF_COUNT_SW_CPU_CLOCK;
424                         goto try_again;
425                 }
426                 printf("\n");
427                 error("perfcounter syscall returned with %d (%s)\n",
428                         fd[nr_cpu][counter], strerror(err));
429                 die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
430                 exit(-1);
431         }
432
433         h_attr = get_header_attr(attr, counter);
434
435         if (!file_new) {
436                 if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
437                         fprintf(stderr, "incompatible append\n");
438                         exit(-1);
439                 }
440         }
441
442         if (read(fd[nr_cpu][counter], &read_data, sizeof(read_data)) == -1) {
443                 perror("Unable to read perf file descriptor\n");
444                 exit(-1);
445         }
446
447         perf_header_attr__add_id(h_attr, read_data.id);
448
449         assert(fd[nr_cpu][counter] >= 0);
450         fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
451
452         /*
453          * First counter acts as the group leader:
454          */
455         if (group && group_fd == -1)
456                 group_fd = fd[nr_cpu][counter];
457
458         event_array[nr_poll].fd = fd[nr_cpu][counter];
459         event_array[nr_poll].events = POLLIN;
460         nr_poll++;
461
462         mmap_array[nr_cpu][counter].counter = counter;
463         mmap_array[nr_cpu][counter].prev = 0;
464         mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
465         mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
466                         PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
467         if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
468                 error("failed to mmap with %d (%s)\n", errno, strerror(errno));
469                 exit(-1);
470         }
471
472         ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
473 }
474
475 static void open_counters(int cpu, pid_t pid)
476 {
477         int counter;
478
479         group_fd = -1;
480         for (counter = 0; counter < nr_counters; counter++)
481                 create_counter(counter, cpu, pid);
482
483         nr_cpu++;
484 }
485
486 static void atexit_header(void)
487 {
488         header->data_size += bytes_written;
489
490         perf_header__write(header, output);
491 }
492
493 static int __cmd_record(int argc, const char **argv)
494 {
495         int i, counter;
496         struct stat st;
497         pid_t pid = 0;
498         int flags;
499         int ret;
500
501         page_size = sysconf(_SC_PAGE_SIZE);
502         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
503         assert(nr_cpus <= MAX_NR_CPUS);
504         assert(nr_cpus >= 0);
505
506         atexit(sig_atexit);
507         signal(SIGCHLD, sig_handler);
508         signal(SIGINT, sig_handler);
509
510         if (!stat(output_name, &st) && st.st_size) {
511                 if (!force && !append_file) {
512                         fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
513                                         output_name);
514                         exit(-1);
515                 }
516         } else {
517                 append_file = 0;
518         }
519
520         flags = O_CREAT|O_RDWR;
521         if (append_file)
522                 file_new = 0;
523         else
524                 flags |= O_TRUNC;
525
526         output = open(output_name, flags, S_IRUSR|S_IWUSR);
527         if (output < 0) {
528                 perror("failed to create output file");
529                 exit(-1);
530         }
531
532         if (!file_new)
533                 header = perf_header__read(output);
534         else
535                 header = perf_header__new();
536
537         atexit(atexit_header);
538
539         if (!system_wide) {
540                 pid = target_pid;
541                 if (pid == -1)
542                         pid = getpid();
543
544                 open_counters(-1, pid);
545         } else for (i = 0; i < nr_cpus; i++)
546                 open_counters(i, target_pid);
547
548         if (file_new)
549                 perf_header__write(header, output);
550
551         if (!system_wide) {
552                 pid_synthesize_comm_event(pid, 0);
553                 pid_synthesize_mmap_samples(pid);
554         } else
555                 synthesize_all();
556
557         if (target_pid == -1 && argc) {
558                 pid = fork();
559                 if (pid < 0)
560                         perror("failed to fork");
561
562                 if (!pid) {
563                         if (execvp(argv[0], (char **)argv)) {
564                                 perror(argv[0]);
565                                 exit(-1);
566                         }
567                 }
568         }
569
570         if (realtime_prio) {
571                 struct sched_param param;
572
573                 param.sched_priority = realtime_prio;
574                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
575                         printf("Could not set realtime priority.\n");
576                         exit(-1);
577                 }
578         }
579
580         for (;;) {
581                 int hits = samples;
582
583                 for (i = 0; i < nr_cpu; i++) {
584                         for (counter = 0; counter < nr_counters; counter++)
585                                 mmap_read(&mmap_array[i][counter]);
586                 }
587
588                 if (hits == samples) {
589                         if (done)
590                                 break;
591                         ret = poll(event_array, nr_poll, 100);
592                 }
593         }
594
595         /*
596          * Approximate RIP event size: 24 bytes.
597          */
598         fprintf(stderr,
599                 "[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
600                 (double)bytes_written / 1024.0 / 1024.0,
601                 output_name,
602                 bytes_written / 24);
603
604         return 0;
605 }
606
/* Usage lines handed to the option parser; the list is NULL-terminated. */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL,
};
612
613 static const struct option options[] = {
614         OPT_CALLBACK('e', "event", NULL, "event",
615                      "event selector. use 'perf list' to list available events",
616                      parse_events),
617         OPT_INTEGER('p', "pid", &target_pid,
618                     "record events on existing pid"),
619         OPT_INTEGER('r', "realtime", &realtime_prio,
620                     "collect data with this RT SCHED_FIFO priority"),
621         OPT_BOOLEAN('a', "all-cpus", &system_wide,
622                             "system-wide collection from all CPUs"),
623         OPT_BOOLEAN('A', "append", &append_file,
624                             "append to the output file to do incremental profiling"),
625         OPT_BOOLEAN('f', "force", &force,
626                         "overwrite existing data file"),
627         OPT_LONG('c', "count", &default_interval,
628                     "event period to sample"),
629         OPT_STRING('o', "output", &output_name, "file",
630                     "output file name"),
631         OPT_BOOLEAN('i', "inherit", &inherit,
632                     "child tasks inherit counters"),
633         OPT_INTEGER('F', "freq", &freq,
634                     "profile at this frequency"),
635         OPT_INTEGER('m', "mmap-pages", &mmap_pages,
636                     "number of mmap data pages"),
637         OPT_BOOLEAN('g', "call-graph", &call_graph,
638                     "do call-graph (stack chain/backtrace) recording"),
639         OPT_BOOLEAN('v', "verbose", &verbose,
640                     "be more verbose (show counter open errors, etc)"),
641         OPT_BOOLEAN('s', "stat", &inherit_stat,
642                     "per thread counts"),
643         OPT_BOOLEAN('d', "data", &sample_address,
644                     "Sample addresses"),
645         OPT_BOOLEAN('n', "no-samples", &no_samples,
646                     "don't sample"),
647         OPT_END()
648 };
649
650 int cmd_record(int argc, const char **argv, const char *prefix __used)
651 {
652         int counter;
653
654         argc = parse_options(argc, argv, options, record_usage,
655                 PARSE_OPT_STOP_AT_NON_OPTION);
656         if (!argc && target_pid == -1 && !system_wide)
657                 usage_with_options(record_usage, options);
658
659         if (!nr_counters) {
660                 nr_counters     = 1;
661                 attrs[0].type   = PERF_TYPE_HARDWARE;
662                 attrs[0].config = PERF_COUNT_HW_CPU_CYCLES;
663         }
664
665         for (counter = 0; counter < nr_counters; counter++) {
666                 if (attrs[counter].sample_period)
667                         continue;
668
669                 attrs[counter].sample_period = default_interval;
670         }
671
672         return __cmd_record(argc, argv);
673 }