perf record: Disable profiling before draining the buffer
[safe/jmp/linux-2.6] / tools / perf / builtin-record.c
index 0f5771f..2459e5a 100644 (file)
 #include "util/parse-events.h"
 #include "util/string.h"
 
+#include "util/header.h"
+#include "util/event.h"
+#include "util/debug.h"
+#include "util/trace-event.h"
+
 #include <unistd.h>
 #include <sched.h>
 
@@ -32,41 +37,34 @@ static int                  output;
 static const char              *output_name                    = "perf.data";
 static int                     group                           = 0;
 static unsigned int            realtime_prio                   = 0;
+static int                     raw_samples                     = 0;
 static int                     system_wide                     = 0;
+static int                     profile_cpu                     = -1;
 static pid_t                   target_pid                      = -1;
 static int                     inherit                         = 1;
 static int                     force                           = 0;
 static int                     append_file                     = 0;
-static int                     verbose                         = 0;
+static int                     call_graph                      = 0;
+static int                     inherit_stat                    = 0;
+static int                     no_samples                      = 0;
+static int                     sample_address                  = 0;
+static int                     multiplex                       = 0;
+static int                     multiplex_fd                    = -1;
 
 static long                    samples;
 static struct timeval          last_read;
 static struct timeval          this_read;
 
-static __u64                   bytes_written;
+static u64                     bytes_written;
 
 static struct pollfd           event_array[MAX_NR_CPUS * MAX_COUNTERS];
 
 static int                     nr_poll;
 static int                     nr_cpu;
 
-struct mmap_event {
-       struct perf_event_header        header;
-       __u32                           pid;
-       __u32                           tid;
-       __u64                           start;
-       __u64                           len;
-       __u64                           pgoff;
-       char                            filename[PATH_MAX];
-};
-
-struct comm_event {
-       struct perf_event_header        header;
-       __u32                           pid;
-       __u32                           tid;
-       char                            comm[16];
-};
+static int                     file_new = 1;
 
+struct perf_header             *header;
 
 struct mmap_data {
        int                     counter;
@@ -77,10 +75,10 @@ struct mmap_data {
 
 static struct mmap_data                mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 
-static unsigned int mmap_read_head(struct mmap_data *md)
+static unsigned long mmap_read_head(struct mmap_data *md)
 {
        struct perf_counter_mmap_page *pc = md->base;
-       int head;
+       long head;
 
        head = pc->data_head;
        rmb();
@@ -88,6 +86,32 @@ static unsigned int mmap_read_head(struct mmap_data *md)
        return head;
 }
 
+/*
+ * Store @tail into the data_tail field of the control page mapped at
+ * md->base.  mmap_read() calls this with its updated read position once
+ * the data up to that position has been handed to write_output().
+ */
+static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
+{
+       struct perf_counter_mmap_page *pc = md->base;
+
+       /*
+        * ensure all reads are done before we write the tail out.
+        *
+        * This barrier used to be commented out, which left nothing to
+        * order the preceding buffer reads before the store below.  The
+        * compiler's full-barrier builtin is used so that no per-arch
+        * mb() definition is required.
+        */
+       __sync_synchronize();
+       pc->data_tail = tail;
+}
+
+/*
+ * Write @size bytes starting at @buf to the global 'output' descriptor,
+ * looping over short writes until everything has been written, and add
+ * every byte written to 'bytes_written'.
+ *
+ * A write() interrupted by a signal (EINTR) is retried; any other
+ * failure is reported through die() (presumably does not return).
+ */
+static void write_output(void *buf, size_t size)
+{
+       /* Byte cursor: arithmetic on 'void *' is only a GNU extension. */
+       const char *pos = buf;
+
+       while (size) {
+               /* write() returns ssize_t; an int could truncate it. */
+               ssize_t ret = write(output, pos, size);
+
+               if (ret < 0) {
+                       /* SIGINT/SIGCHLD handlers are installed; retry. */
+                       if (errno == EINTR)
+                               continue;
+                       die("failed to write");
+               }
+
+               size -= (size_t)ret;
+               pos += ret;
+
+               bytes_written += ret;
+       }
+}
+
 static void mmap_read(struct mmap_data *md)
 {
        unsigned int head = mmap_read_head(md);
@@ -108,7 +132,7 @@ static void mmap_read(struct mmap_data *md)
         * In either case, truncate and restart at head.
         */
        diff = head - old;
-       if (diff > md->mask / 2 || diff < 0) {
+       if (diff < 0) {
                struct timeval iv;
                unsigned long msecs;
 
@@ -136,36 +160,17 @@ static void mmap_read(struct mmap_data *md)
                size = md->mask + 1 - (old & md->mask);
                old += size;
 
-               while (size) {
-                       int ret = write(output, buf, size);
-
-                       if (ret < 0)
-                               die("failed to write");
-
-                       size -= ret;
-                       buf += ret;
-
-                       bytes_written += ret;
-               }
+               write_output(buf, size);
        }
 
        buf = &data[old & md->mask];
        size = head - old;
        old += size;
 
-       while (size) {
-               int ret = write(output, buf, size);
-
-               if (ret < 0)
-                       die("failed to write");
-
-               size -= ret;
-               buf += ret;
-
-               bytes_written += ret;
-       }
+       write_output(buf, size);
 
        md->prev = old;
+       mmap_write_tail(md, old);
 }
 
 static volatile int done = 0;
@@ -186,55 +191,57 @@ static void sig_atexit(void)
        kill(getpid(), signr);
 }
 
-static void pid_synthesize_comm_event(pid_t pid, int full)
+static pid_t pid_synthesize_comm_event(pid_t pid, int full)
 {
        struct comm_event comm_ev;
        char filename[PATH_MAX];
        char bf[BUFSIZ];
-       int fd, ret;
-       size_t size;
-       char *field, *sep;
+       FILE *fp;
+       size_t size = 0;
        DIR *tasks;
        struct dirent dirent, *next;
+       pid_t tgid = 0;
 
-       snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
+       snprintf(filename, sizeof(filename), "/proc/%d/status", pid);
 
-       fd = open(filename, O_RDONLY);
-       if (fd < 0) {
-               fprintf(stderr, "couldn't open %s\n", filename);
-               exit(EXIT_FAILURE);
-       }
-       if (read(fd, bf, sizeof(bf)) < 0) {
-               fprintf(stderr, "couldn't read %s\n", filename);
-               exit(EXIT_FAILURE);
+       fp = fopen(filename, "r");
+       if (fp == NULL) {
+               /*
+                * We raced with a task exiting - just return:
+                */
+               if (verbose)
+                       fprintf(stderr, "couldn't open %s\n", filename);
+               return 0;
        }
-       close(fd);
 
-       /* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
        memset(&comm_ev, 0, sizeof(comm_ev));
-       field = strchr(bf, '(');
-       if (field == NULL)
-               goto out_failure;
-       sep = strchr(++field, ')');
-       if (sep == NULL)
-               goto out_failure;
-       size = sep - field;
-       memcpy(comm_ev.comm, field, size++);
-
-       comm_ev.pid = pid;
+       while (!comm_ev.comm[0] || !comm_ev.pid) {
+               if (fgets(bf, sizeof(bf), fp) == NULL)
+                       goto out_failure;
+
+               if (memcmp(bf, "Name:", 5) == 0) {
+                       char *name = bf + 5;
+                       while (*name && isspace(*name))
+                               ++name;
+                       size = strlen(name) - 1;
+                       memcpy(comm_ev.comm, name, size++);
+               } else if (memcmp(bf, "Tgid:", 5) == 0) {
+                       char *tgids = bf + 5;
+                       while (*tgids && isspace(*tgids))
+                               ++tgids;
+                       tgid = comm_ev.pid = atoi(tgids);
+               }
+       }
+
        comm_ev.header.type = PERF_EVENT_COMM;
-       size = ALIGN(size, sizeof(__u64));
+       size = ALIGN(size, sizeof(u64));
        comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
 
        if (!full) {
                comm_ev.tid = pid;
 
-               ret = write(output, &comm_ev, comm_ev.header.size);
-               if (ret < 0) {
-                       perror("failed to write");
-                       exit(-1);
-               }
-               return;
+               write_output(&comm_ev, comm_ev.header.size);
+               goto out_fclose;
        }
 
        snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
@@ -248,14 +255,13 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 
                comm_ev.tid = pid;
 
-               ret = write(output, &comm_ev, comm_ev.header.size);
-               if (ret < 0) {
-                       perror("failed to write");
-                       exit(-1);
-               }
+               write_output(&comm_ev, comm_ev.header.size);
        }
        closedir(tasks);
-       return;
+
+out_fclose:
+       fclose(fp);
+       return tgid;
 
 out_failure:
        fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
@@ -263,7 +269,7 @@ out_failure:
        exit(EXIT_FAILURE);
 }
 
-static void pid_synthesize_mmap_samples(pid_t pid)
+static void pid_synthesize_mmap_samples(pid_t pid, pid_t tgid)
 {
        char filename[PATH_MAX];
        FILE *fp;
@@ -272,13 +278,17 @@ static void pid_synthesize_mmap_samples(pid_t pid)
 
        fp = fopen(filename, "r");
        if (fp == NULL) {
-               fprintf(stderr, "couldn't open %s\n", filename);
-               exit(EXIT_FAILURE);
+               /*
+                * We raced with a task exiting - just return:
+                */
+               if (verbose)
+                       fprintf(stderr, "couldn't open %s\n", filename);
+               return;
        }
        while (1) {
                char bf[BUFSIZ], *pbf = bf;
                struct mmap_event mmap_ev = {
-                       .header.type = PERF_EVENT_MMAP,
+                       .header = { .type = PERF_EVENT_MMAP },
                };
                int n;
                size_t size;
@@ -295,33 +305,33 @@ static void pid_synthesize_mmap_samples(pid_t pid)
                        continue;
                pbf += n + 3;
                if (*pbf == 'x') { /* vm_exec */
-                       char *execname = strrchr(bf, ' ');
+                       char *execname = strchr(bf, '/');
 
-                       if (execname == NULL || execname[1] != '/')
+                       /* Catch VDSO */
+                       if (execname == NULL)
+                               execname = strstr(bf, "[vdso]");
+
+                       if (execname == NULL)
                                continue;
 
-                       execname += 1;
                        size = strlen(execname);
                        execname[size - 1] = '\0'; /* Remove \n */
                        memcpy(mmap_ev.filename, execname, size);
-                       size = ALIGN(size, sizeof(__u64));
+                       size = ALIGN(size, sizeof(u64));
                        mmap_ev.len -= mmap_ev.start;
                        mmap_ev.header.size = (sizeof(mmap_ev) -
                                               (sizeof(mmap_ev.filename) - size));
-                       mmap_ev.pid = pid;
+                       mmap_ev.pid = tgid;
                        mmap_ev.tid = pid;
 
-                       if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
-                               perror("failed to write");
-                               exit(-1);
-                       }
+                       write_output(&mmap_ev, mmap_ev.header.size);
                }
        }
 
        fclose(fp);
 }
 
-static void synthesize_samples(void)
+static void synthesize_all(void)
 {
        DIR *proc;
        struct dirent dirent, *next;
@@ -330,14 +340,14 @@ static void synthesize_samples(void)
 
        while (!readdir_r(proc, &dirent, &next) && next) {
                char *end;
-               pid_t pid;
+               pid_t pid, tgid;
 
                pid = strtol(dirent.d_name, &end, 10);
                if (*end) /* only interested in proper numerical dirents */
                        continue;
 
-               pid_synthesize_comm_event(pid, 1);
-               pid_synthesize_mmap_samples(pid);
+               tgid = pid_synthesize_comm_event(pid, 1);
+               pid_synthesize_mmap_samples(pid, tgid);
        }
 
        closedir(proc);
@@ -345,24 +355,67 @@ static void synthesize_samples(void)
 
 static int group_fd;
 
+/*
+ * Return the header attribute for index @nr.  Indexes below
+ * header->attrs are looked up in header->attr[]; for any other index a
+ * new attribute is created from @a with perf_header_attr__new(), passed
+ * to perf_header__add_attr() and then returned.
+ */
+static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int nr)
+{
+       struct perf_header_attr *entry;
+
+       /* Already present in the header: hand back the stored entry. */
+       if (nr < header->attrs)
+               return header->attr[nr];
+
+       entry = perf_header_attr__new(a);
+       perf_header__add_attr(header, entry);
+
+       return entry;
+}
+
+
 static void create_counter(int counter, int cpu, pid_t pid)
 {
        struct perf_counter_attr *attr = attrs + counter;
-       int track = 1;
+       struct perf_header_attr *h_attr;
+       int track = !counter; /* only the first counter needs these */
+       struct {
+               u64 count;
+               u64 time_enabled;
+               u64 time_running;
+               u64 id;
+       } read_data;
+
+       attr->read_format       = PERF_FORMAT_TOTAL_TIME_ENABLED |
+                                 PERF_FORMAT_TOTAL_TIME_RUNNING |
+                                 PERF_FORMAT_ID;
+
+       attr->sample_type       |= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 
-       attr->sample_type       = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
        if (freq) {
                attr->sample_type       |= PERF_SAMPLE_PERIOD;
                attr->freq              = 1;
                attr->sample_freq       = freq;
        }
+
+       if (no_samples)
+               attr->sample_freq = 0;
+
+       if (inherit_stat)
+               attr->inherit_stat = 1;
+
+       if (sample_address)
+               attr->sample_type       |= PERF_SAMPLE_ADDR;
+
+       if (call_graph)
+               attr->sample_type       |= PERF_SAMPLE_CALLCHAIN;
+
+       if (raw_samples) {
+               attr->sample_type       |= PERF_SAMPLE_TIME;
+               attr->sample_type       |= PERF_SAMPLE_RAW;
+               attr->sample_type       |= PERF_SAMPLE_CPU;
+       }
+
        attr->mmap              = track;
        attr->comm              = track;
        attr->inherit           = (cpu < 0) && inherit;
        attr->disabled          = 1;
 
-       track = 0; /* only the first counter needs these */
-
 try_again:
        fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
 
@@ -371,6 +424,8 @@ try_again:
 
                if (err == EPERM)
                        die("Permission error - are you root?\n");
+               else if (err ==  ENODEV && profile_cpu != -1)
+                       die("No such device - did you specify an out-of-range profile CPU?\n");
 
                /*
                 * If it's cycles then fall back to hrtimer
@@ -393,6 +448,22 @@ try_again:
                exit(-1);
        }
 
+       h_attr = get_header_attr(attr, counter);
+
+       if (!file_new) {
+               if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
+                       fprintf(stderr, "incompatible append\n");
+                       exit(-1);
+               }
+       }
+
+       if (read(fd[nr_cpu][counter], &read_data, sizeof(read_data)) == -1) {
+               perror("Unable to read perf file descriptor\n");
+               exit(-1);
+       }
+
+       perf_header_attr__add_id(h_attr, read_data.id);
+
        assert(fd[nr_cpu][counter] >= 0);
        fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
 
@@ -401,19 +472,28 @@ try_again:
         */
        if (group && group_fd == -1)
                group_fd = fd[nr_cpu][counter];
-
-       event_array[nr_poll].fd = fd[nr_cpu][counter];
-       event_array[nr_poll].events = POLLIN;
-       nr_poll++;
-
-       mmap_array[nr_cpu][counter].counter = counter;
-       mmap_array[nr_cpu][counter].prev = 0;
-       mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
-       mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-                       PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
-       if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
-               error("failed to mmap with %d (%s)\n", errno, strerror(errno));
-               exit(-1);
+       if (multiplex && multiplex_fd == -1)
+               multiplex_fd = fd[nr_cpu][counter];
+
+       if (multiplex && fd[nr_cpu][counter] != multiplex_fd) {
+               int ret;
+
+               ret = ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, multiplex_fd);
+               assert(ret != -1);
+       } else {
+               event_array[nr_poll].fd = fd[nr_cpu][counter];
+               event_array[nr_poll].events = POLLIN;
+               nr_poll++;
+
+               mmap_array[nr_cpu][counter].counter = counter;
+               mmap_array[nr_cpu][counter].prev = 0;
+               mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
+               mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+                               PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
+               if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
+                       error("failed to mmap with %d (%s)\n", errno, strerror(errno));
+                       exit(-1);
+               }
        }
 
        ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
@@ -423,11 +503,6 @@ static void open_counters(int cpu, pid_t pid)
 {
        int counter;
 
-       if (pid > 0) {
-               pid_synthesize_comm_event(pid, 0);
-               pid_synthesize_mmap_samples(pid);
-       }
-
        group_fd = -1;
        for (counter = 0; counter < nr_counters; counter++)
                create_counter(counter, cpu, pid);
@@ -435,28 +510,44 @@ static void open_counters(int cpu, pid_t pid)
        nr_cpu++;
 }
 
+/*
+ * Exit-time hook, registered with atexit() in __cmd_record(): add the
+ * bytes written during this run to header->data_size, then write the
+ * header to the output descriptor with perf_header__write().
+ */
+static void atexit_header(void)
+{
+       /* The size must be updated before the header is written out. */
+       header->data_size = header->data_size + bytes_written;
+       perf_header__write(header, output);
+}
+
 static int __cmd_record(int argc, const char **argv)
 {
        int i, counter;
        struct stat st;
-       pid_t pid;
+       pid_t pid = 0;
        int flags;
        int ret;
+       unsigned long waking = 0;
 
        page_size = sysconf(_SC_PAGE_SIZE);
        nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
        assert(nr_cpus <= MAX_NR_CPUS);
        assert(nr_cpus >= 0);
 
-       if (!stat(output_name, &st) && !force && !append_file) {
-               fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
-                               output_name);
-               exit(-1);
+       atexit(sig_atexit);
+       signal(SIGCHLD, sig_handler);
+       signal(SIGINT, sig_handler);
+
+       if (!stat(output_name, &st) && st.st_size) {
+               if (!force && !append_file) {
+                       fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
+                                       output_name);
+                       exit(-1);
+               }
+       } else {
+               append_file = 0;
        }
 
        flags = O_CREAT|O_RDWR;
        if (append_file)
-               flags |= O_APPEND;
+               file_new = 0;
        else
                flags |= O_TRUNC;
 
@@ -466,14 +557,47 @@ static int __cmd_record(int argc, const char **argv)
                exit(-1);
        }
 
+       if (!file_new)
+               header = perf_header__read(output);
+       else
+               header = perf_header__new();
+
+
+       if (raw_samples) {
+               read_tracing_data(attrs, nr_counters);
+       } else {
+               for (i = 0; i < nr_counters; i++) {
+                       if (attrs[i].sample_type & PERF_SAMPLE_RAW) {
+                               read_tracing_data(attrs, nr_counters);
+                               break;
+                       }
+               }
+       }
+       atexit(atexit_header);
+
        if (!system_wide) {
-               open_counters(-1, target_pid != -1 ? target_pid : getpid());
-       } else for (i = 0; i < nr_cpus; i++)
-               open_counters(i, target_pid);
+               pid = target_pid;
+               if (pid == -1)
+                       pid = getpid();
+
+               open_counters(profile_cpu, pid);
+       } else {
+               if (profile_cpu != -1) {
+                       open_counters(profile_cpu, target_pid);
+               } else {
+                       for (i = 0; i < nr_cpus; i++)
+                               open_counters(i, target_pid);
+               }
+       }
 
-       atexit(sig_atexit);
-       signal(SIGCHLD, sig_handler);
-       signal(SIGINT, sig_handler);
+       if (file_new)
+               perf_header__write(header, output);
+
+       if (!system_wide) {
+               pid_t tgid = pid_synthesize_comm_event(pid, 0);
+               pid_synthesize_mmap_samples(pid, tgid);
+       } else
+               synthesize_all();
 
        if (target_pid == -1 && argc) {
                pid = fork();
@@ -498,21 +622,33 @@ static int __cmd_record(int argc, const char **argv)
                }
        }
 
-       if (system_wide)
-               synthesize_samples();
-
-       while (!done) {
+       for (;;) {
                int hits = samples;
 
                for (i = 0; i < nr_cpu; i++) {
-                       for (counter = 0; counter < nr_counters; counter++)
-                               mmap_read(&mmap_array[i][counter]);
+                       for (counter = 0; counter < nr_counters; counter++) {
+                               if (mmap_array[i][counter].base)
+                                       mmap_read(&mmap_array[i][counter]);
+                       }
+               }
+
+               if (hits == samples) {
+                       if (done)
+                               break;
+                       ret = poll(event_array, nr_poll, -1);
+                       waking++;
                }
 
-               if (hits == samples)
-                       ret = poll(event_array, nr_poll, 100);
+               if (done) {
+                       for (i = 0; i < nr_cpu; i++) {
+                               for (counter = 0; counter < nr_counters; counter++)
+                                       ioctl(fd[i][counter], PERF_COUNTER_IOC_DISABLE);
+                       }
+               }
        }
 
+       fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
+
        /*
         * Approximate RIP event size: 24 bytes.
         */
@@ -539,10 +675,14 @@ static const struct option options[] = {
                    "record events on existing pid"),
        OPT_INTEGER('r', "realtime", &realtime_prio,
                    "collect data with this RT SCHED_FIFO priority"),
+       OPT_BOOLEAN('R', "raw-samples", &raw_samples,
+                   "collect raw sample records from all opened counters"),
        OPT_BOOLEAN('a', "all-cpus", &system_wide,
                            "system-wide collection from all CPUs"),
        OPT_BOOLEAN('A', "append", &append_file,
                            "append to the output file to do incremental profiling"),
+       OPT_INTEGER('C', "profile_cpu", &profile_cpu,
+                           "CPU to profile on"),
        OPT_BOOLEAN('f', "force", &force,
                        "overwrite existing data file"),
        OPT_LONG('c', "count", &default_interval,
@@ -555,16 +695,27 @@ static const struct option options[] = {
                    "profile at this frequency"),
        OPT_INTEGER('m', "mmap-pages", &mmap_pages,
                    "number of mmap data pages"),
+       OPT_BOOLEAN('g', "call-graph", &call_graph,
+                   "do call-graph (stack chain/backtrace) recording"),
        OPT_BOOLEAN('v', "verbose", &verbose,
                    "be more verbose (show counter open errors, etc)"),
+       OPT_BOOLEAN('s', "stat", &inherit_stat,
+                   "per thread counts"),
+       OPT_BOOLEAN('d', "data", &sample_address,
+                   "Sample addresses"),
+       OPT_BOOLEAN('n', "no-samples", &no_samples,
+                   "don't sample"),
+       OPT_BOOLEAN('M', "multiplex", &multiplex,
+                   "multiplex counter output in a single channel"),
        OPT_END()
 };
 
-int cmd_record(int argc, const char **argv, const char *prefix)
+int cmd_record(int argc, const char **argv, const char *prefix __used)
 {
        int counter;
 
-       argc = parse_options(argc, argv, options, record_usage, 0);
+       argc = parse_options(argc, argv, options, record_usage,
+               PARSE_OPT_STOP_AT_NON_OPTION);
        if (!argc && target_pid == -1 && !system_wide)
                usage_with_options(record_usage, options);