Merge branch 'linus' into tracing/core
author     Ingo Molnar <mingo@elte.hu>
           Thu, 8 Apr 2010 07:06:12 +0000 (09:06 +0200)
committer  Ingo Molnar <mingo@elte.hu>
           Thu, 8 Apr 2010 08:18:47 +0000 (10:18 +0200)
Conflicts:
include/linux/module.h
kernel/module.c

Semantic conflict:
include/trace/events/module.h

Merge reason: Resolve the conflict with upstream commit 5fbfb18 ("Fix up
              possibly racy module refcounting")
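
The semantic conflict, concretely: upstream 5fbfb18 splits the per-cpu
module reference counter into incs/decs and passes the updated value to
the tracepoint, while tracing/core drops the refcnt argument from
trace_module_get()/trace_module_put() and has the event read the
counters itself.  A sketch of the two sides as they appear in the hunks
below (the resolution keeps the incs/decs split and the two-argument
tracepoint):

    /* upstream (5fbfb18): split counters, value passed explicitly */
    __this_cpu_inc(module->refptr->incs);
    trace_module_get(module, _THIS_IP_,
                     __this_cpu_read(module->refptr->incs));

    /* merged result: counters read in the event's TP_fast_assign() */
    __this_cpu_inc(module->refptr->incs);
    trace_module_get(module, _THIS_IP_);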

Signed-off-by: Ingo Molnar <mingo@elte.hu>
include/linux/module.h
include/trace/events/module.h
kernel/module.c
kernel/trace/ring_buffer.c
kernel/trace/trace.c
kernel/trace/trace_functions_graph.c
kernel/trace/trace_selftest.c

diff --combined include/linux/module.h
@@@ -330,8 -330,11 +330,11 @@@ struct modul
        struct module_notes_attrs *notes_attrs;
  #endif
  
+ #ifdef CONFIG_SMP
        /* Per-cpu data. */
-       void *percpu;
+       void __percpu *percpu;
+       unsigned int percpu_size;
+ #endif
  
        /* The command line arguments (may be mangled).  People like
           keeping pointers to this stuff */
        void (*exit)(void);
  
        struct module_ref {
-               int count;
+               unsigned int incs;
+               unsigned int decs;
        } __percpu *refptr;
  #endif
  
@@@ -392,6 -396,7 +396,7 @@@ static inline int module_is_live(struc
  struct module *__module_text_address(unsigned long addr);
  struct module *__module_address(unsigned long addr);
  bool is_module_address(unsigned long addr);
+ bool is_module_percpu_address(unsigned long addr);
  bool is_module_text_address(unsigned long addr);
  
  static inline int within_module_core(unsigned long addr, struct module *mod)
@@@ -459,8 -464,9 +464,8 @@@ static inline void __module_get(struct 
  {
        if (module) {
                preempt_disable();
-               __this_cpu_inc(module->refptr->count);
+               __this_cpu_inc(module->refptr->incs);
 -              trace_module_get(module, _THIS_IP_,
 -                               __this_cpu_read(module->refptr->incs));
 +              trace_module_get(module, _THIS_IP_);
                preempt_enable();
        }
  }
@@@ -473,10 -479,10 +478,9 @@@ static inline int try_module_get(struc
                preempt_disable();
  
                if (likely(module_is_live(module))) {
-                       __this_cpu_inc(module->refptr->count);
+                       __this_cpu_inc(module->refptr->incs);
 -                      trace_module_get(module, _THIS_IP_,
 -                              __this_cpu_read(module->refptr->incs));
 +                      trace_module_get(module, _THIS_IP_);
-               }
-               else
+               } else
                        ret = 0;
  
                preempt_enable();
@@@ -561,6 -567,11 +565,11 @@@ static inline bool is_module_address(un
        return false;
  }
  
+ static inline bool is_module_percpu_address(unsigned long addr)
+ {
+       return false;
+ }
  static inline bool is_module_text_address(unsigned long addr)
  {
        return false;
@@@ -51,14 -51,11 +51,14 @@@ TRACE_EVENT(module_free
        TP_printk("%s", __get_str(name))
  );
  
 +#ifdef CONFIG_MODULE_UNLOAD
 +/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */
 +
  DECLARE_EVENT_CLASS(module_refcnt,
  
 -      TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
 +      TP_PROTO(struct module *mod, unsigned long ip),
  
 -      TP_ARGS(mod, ip, refcnt),
 +      TP_ARGS(mod, ip),
  
        TP_STRUCT__entry(
                __field(        unsigned long,  ip              )
@@@ -68,7 -65,7 +68,7 @@@
  
        TP_fast_assign(
                __entry->ip     = ip;
-               __entry->refcnt = __this_cpu_read(mod->refptr->count);
 -              __entry->refcnt = refcnt;
++              __entry->refcnt = __this_cpu_read(mod->refptr->incs) + __this_cpu_read(mod->refptr->decs);
                __assign_str(name, mod->name);
        ),
  
  
  DEFINE_EVENT(module_refcnt, module_get,
  
 -      TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
 +      TP_PROTO(struct module *mod, unsigned long ip),
  
 -      TP_ARGS(mod, ip, refcnt)
 +      TP_ARGS(mod, ip)
  );
  
  DEFINE_EVENT(module_refcnt, module_put,
  
 -      TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
 +      TP_PROTO(struct module *mod, unsigned long ip),
  
 -      TP_ARGS(mod, ip, refcnt)
 +      TP_ARGS(mod, ip)
  );
 +#endif /* CONFIG_MODULE_UNLOAD */
  
  TRACE_EVENT(module_request,
  
diff --combined kernel/module.c
@@@ -59,6 -59,8 +59,6 @@@
  #define CREATE_TRACE_POINTS
  #include <trace/events/module.h>
  
 -EXPORT_TRACEPOINT_SYMBOL(module_get);
 -
  #if 0
  #define DEBUGP printk
  #else
@@@ -368,27 -370,33 +368,33 @@@ EXPORT_SYMBOL_GPL(find_module)
  
  #ifdef CONFIG_SMP
  
- static void *percpu_modalloc(unsigned long size, unsigned long align,
-                            const char *name)
+ static inline void __percpu *mod_percpu(struct module *mod)
  {
-       void *ptr;
+       return mod->percpu;
+ }
  
+ static int percpu_modalloc(struct module *mod,
+                          unsigned long size, unsigned long align)
+ {
        if (align > PAGE_SIZE) {
                printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
-                      name, align, PAGE_SIZE);
+                      mod->name, align, PAGE_SIZE);
                align = PAGE_SIZE;
        }
  
-       ptr = __alloc_reserved_percpu(size, align);
-       if (!ptr)
+       mod->percpu = __alloc_reserved_percpu(size, align);
+       if (!mod->percpu) {
                printk(KERN_WARNING
                       "Could not allocate %lu bytes percpu data\n", size);
-       return ptr;
+               return -ENOMEM;
+       }
+       mod->percpu_size = size;
+       return 0;
  }
  
- static void percpu_modfree(void *freeme)
+ static void percpu_modfree(struct module *mod)
  {
-       free_percpu(freeme);
+       free_percpu(mod->percpu);
  }
  
  static unsigned int find_pcpusec(Elf_Ehdr *hdr,
        return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
  }
  
- static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+ static void percpu_modcopy(struct module *mod,
+                          const void *from, unsigned long size)
  {
        int cpu;
  
        for_each_possible_cpu(cpu)
-               memcpy(pcpudest + per_cpu_offset(cpu), from, size);
+               memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
+ }
+ /**
+  * is_module_percpu_address - test whether address is from module static percpu
+  * @addr: address to test
+  *
+  * Test whether @addr belongs to module static percpu area.
+  *
+  * RETURNS:
+  * %true if @addr is from module static percpu area
+  */
+ bool is_module_percpu_address(unsigned long addr)
+ {
+       struct module *mod;
+       unsigned int cpu;
+       preempt_disable();
+       list_for_each_entry_rcu(mod, &modules, list) {
+               if (!mod->percpu_size)
+                       continue;
+               for_each_possible_cpu(cpu) {
+                       void *start = per_cpu_ptr(mod->percpu, cpu);
+                       if ((void *)addr >= start &&
+                           (void *)addr < start + mod->percpu_size) {
+                               preempt_enable();
+                               return true;
+                       }
+               }
+       }
+       preempt_enable();
+       return false;
  }
  
  #else /* ... !CONFIG_SMP */
  
- static inline void *percpu_modalloc(unsigned long size, unsigned long align,
-                                   const char *name)
+ static inline void __percpu *mod_percpu(struct module *mod)
  {
        return NULL;
  }
- static inline void percpu_modfree(void *pcpuptr)
+ static inline int percpu_modalloc(struct module *mod,
+                                 unsigned long size, unsigned long align)
+ {
+       return -ENOMEM;
+ }
+ static inline void percpu_modfree(struct module *mod)
  {
-       BUG();
  }
  static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
                                        Elf_Shdr *sechdrs,
  {
        return 0;
  }
- static inline void percpu_modcopy(void *pcpudst, const void *src,
-                                 unsigned long size)
+ static inline void percpu_modcopy(struct module *mod,
+                                 const void *from, unsigned long size)
  {
        /* pcpusec should be 0, and size of that section should be 0. */
        BUG_ON(size != 0);
  }
+ bool is_module_percpu_address(unsigned long addr)
+ {
+       return false;
+ }
  
  #endif /* CONFIG_SMP */
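
A usage sketch for the new predicate (illustrative only: the wrapper
below and its pairing with is_kernel_percpu_address() from the same
percpu series are assumptions, not part of this merge):

    /* true for any static per-cpu address, core kernel or module */
    static bool addr_is_static_percpu(unsigned long addr)
    {
            return is_kernel_percpu_address(addr) ||
                   is_module_percpu_address(addr);
    }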
  
@@@ -465,20 -515,19 +513,22 @@@ MODINFO_ATTR(srcversion)
  static char last_unloaded_module[MODULE_NAME_LEN+1];
  
  #ifdef CONFIG_MODULE_UNLOAD
 +
 +EXPORT_TRACEPOINT_SYMBOL(module_get);
 +
  /* Init the unload section of the module. */
  static void module_unload_init(struct module *mod)
  {
        int cpu;
  
        INIT_LIST_HEAD(&mod->modules_which_use_me);
-       for_each_possible_cpu(cpu)
-               per_cpu_ptr(mod->refptr, cpu)->count = 0;
+       for_each_possible_cpu(cpu) {
+               per_cpu_ptr(mod->refptr, cpu)->incs = 0;
+               per_cpu_ptr(mod->refptr, cpu)->decs = 0;
+       }
  
        /* Hold reference count during initialization. */
-       __this_cpu_write(mod->refptr->count, 1);
+       __this_cpu_write(mod->refptr->incs, 1);
        /* Backwards compatibility macros put refcount during init. */
        mod->waiter = current;
  }
@@@ -617,12 -666,28 +667,28 @@@ static int try_stop_module(struct modul
  
  unsigned int module_refcount(struct module *mod)
  {
-       unsigned int total = 0;
+       unsigned int incs = 0, decs = 0;
        int cpu;
  
        for_each_possible_cpu(cpu)
-               total += per_cpu_ptr(mod->refptr, cpu)->count;
-       return total;
+               decs += per_cpu_ptr(mod->refptr, cpu)->decs;
+       /*
+        * ensure the incs are added up after the decs.
+        * module_put ensures incs are visible before decs with smp_wmb.
+        *
+        * This 2-count scheme avoids the situation where the refcount
+        * for CPU0 is read, then CPU0 increments the module refcount,
+        * then CPU1 drops that refcount, then the refcount for CPU1 is
+        * read. We would record a decrement but not its corresponding
+        * increment so we would see a low count (disaster).
+        *
+        * Rare situation? But module_refcount can be preempted, and we
+        * might be tallying up 4096+ CPUs. So it is not impossible.
+        */
+       smp_rmb();
+       for_each_possible_cpu(cpu)
+               incs += per_cpu_ptr(mod->refptr, cpu)->incs;
+       return incs - decs;
  }
  EXPORT_SYMBOL(module_refcount);
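
A worked instance of the race the comment above guards against
(hypothetical counts; CPU0 is read before CPU1 by the old reader):

    /*
     * start: CPU0 {incs = 1, decs = 0}, CPU1 {incs = 0, decs = 0}
     *
     * old single-counter reader, racing with a get/put pair:
     *   reader reads CPU0.count          -> 1
     *   CPU0:  __module_get()            -> CPU0.count = 2
     *   CPU1:  module_put()              -> CPU1.count = -1
     *   reader reads CPU1.count          -> -1
     *   total = 1 + (-1) = 0             -> bogus "unused" module
     *
     * split-counter reader: all decs are summed first, smp_rmb()
     * (pairing with the smp_wmb() in module_put()) orders the reads,
     * then all incs are summed, so the reader can never count a
     * decrement whose matching increment it missed.
     */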
  
@@@ -799,9 -864,11 +865,10 @@@ void module_put(struct module *module
  {
        if (module) {
                preempt_disable();
-               __this_cpu_dec(module->refptr->count);
+               smp_wmb(); /* see comment in module_refcount */
+               __this_cpu_inc(module->refptr->decs);
  
 -              trace_module_put(module, _RET_IP_,
 -                               __this_cpu_read(module->refptr->decs));
 +              trace_module_put(module, _RET_IP_);
                /* Maybe they're waiting for us to drop reference? */
                if (unlikely(!module_is_live(module)))
                        wake_up_process(module->waiter);
@@@ -1400,8 -1467,7 +1467,7 @@@ static void free_module(struct module *
        /* This may be NULL, but that's OK */
        module_free(mod, mod->module_init);
        kfree(mod->args);
-       if (mod->percpu)
-               percpu_modfree(mod->percpu);
+       percpu_modfree(mod);
  #if defined(CONFIG_MODULE_UNLOAD)
        if (mod->refptr)
                free_percpu(mod->refptr);
@@@ -1520,7 -1586,7 +1586,7 @@@ static int simplify_symbols(Elf_Shdr *s
                default:
                        /* Divert to percpu allocation if a percpu var. */
                        if (sym[i].st_shndx == pcpuindex)
-                               secbase = (unsigned long)mod->percpu;
+                               secbase = (unsigned long)mod_percpu(mod);
                        else
                                secbase = sechdrs[sym[i].st_shndx].sh_addr;
                        sym[i].st_value += secbase;
@@@ -1954,7 -2020,7 +2020,7 @@@ static noinline struct module *load_mod
        unsigned int modindex, versindex, infoindex, pcpuindex;
        struct module *mod;
        long err = 0;
-       void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
+       void *ptr = NULL; /* Stops spurious gcc warning */
        unsigned long symoffs, stroffs, *strmap;
  
        mm_segment_t old_fs;
  
        if (pcpuindex) {
                /* We have a special allocation for this section. */
-               percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
-                                        sechdrs[pcpuindex].sh_addralign,
-                                        mod->name);
-               if (!percpu) {
-                       err = -ENOMEM;
+               err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size,
+                                     sechdrs[pcpuindex].sh_addralign);
+               if (err)
                        goto free_mod;
-               }
                sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
-               mod->percpu = percpu;
        }
  
        /* Determine total sizes, and put offsets in sh_entsize.  For now
        sort_extable(mod->extable, mod->extable + mod->num_exentries);
  
        /* Finally, copy percpu area over. */
-       percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
+       percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
                       sechdrs[pcpuindex].sh_size);
  
        add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
        module_free(mod, mod->module_core);
        /* mod will be freed with core. Don't access it beyond this line! */
   free_percpu:
-       if (percpu)
-               percpu_modfree(percpu);
+       percpu_modfree(mod);
   free_mod:
        kfree(args);
        kfree(strmap);
@@@ -14,6 -14,7 +14,7 @@@
  #include <linux/module.h>
  #include <linux/percpu.h>
  #include <linux/mutex.h>
+ #include <linux/slab.h>
  #include <linux/init.h>
  #include <linux/hash.h>
  #include <linux/list.h>
@@@ -318,11 -319,6 +319,11 @@@ EXPORT_SYMBOL_GPL(ring_buffer_event_dat
  #define TS_MASK               ((1ULL << TS_SHIFT) - 1)
  #define TS_DELTA_TEST (~TS_MASK)
  
 +/* Flag when events were overwritten */
 +#define RB_MISSED_EVENTS      (1 << 31)
 +/* Missed count stored at end */
 +#define RB_MISSED_STORED      (1 << 30)
 +
  struct buffer_data_page {
        u64              time_stamp;    /* page time stamp */
        local_t          commit;        /* write committed index */
@@@ -342,7 -338,6 +343,7 @@@ struct buffer_page 
        local_t          write;         /* index for next write */
        unsigned         read;          /* index for next read */
        local_t          entries;       /* entries on this page */
 +      unsigned long    real_end;      /* real end of data */
        struct buffer_data_page *page;  /* Actual data page */
  };
  
@@@ -422,12 -417,6 +423,12 @@@ int ring_buffer_print_page_header(struc
                               (unsigned int)sizeof(field.commit),
                               (unsigned int)is_signed_type(long));
  
 +      ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
 +                             "offset:%u;\tsize:%u;\tsigned:%u;\n",
 +                             (unsigned int)offsetof(typeof(field), commit),
 +                             1,
 +                             (unsigned int)is_signed_type(long));
 +
        ret = trace_seq_printf(s, "\tfield: char data;\t"
                               "offset:%u;\tsize:%u;\tsigned:%u;\n",
                               (unsigned int)offsetof(typeof(field), data),
@@@ -451,8 -440,6 +452,8 @@@ struct ring_buffer_per_cpu 
        struct buffer_page              *tail_page;     /* write to tail */
        struct buffer_page              *commit_page;   /* committed pages */
        struct buffer_page              *reader_page;
 +      unsigned long                   lost_events;
 +      unsigned long                   last_overrun;
        local_t                         commit_overrun;
        local_t                         overrun;
        local_t                         entries;
@@@ -1223,18 -1210,19 +1224,19 @@@ rb_remove_pages(struct ring_buffer_per_
  
        for (i = 0; i < nr_pages; i++) {
                if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
-                       return;
+                       goto out;
                p = cpu_buffer->pages->next;
                bpage = list_entry(p, struct buffer_page, list);
                list_del_init(&bpage->list);
                free_buffer_page(bpage);
        }
        if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
-               return;
+               goto out;
  
        rb_reset_cpu(cpu_buffer);
        rb_check_pages(cpu_buffer);
  
+ out:
        spin_unlock_irq(&cpu_buffer->reader_lock);
  }
  
@@@ -1251,7 -1239,7 +1253,7 @@@ rb_insert_pages(struct ring_buffer_per_
  
        for (i = 0; i < nr_pages; i++) {
                if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
-                       return;
+                       goto out;
                p = pages->next;
                bpage = list_entry(p, struct buffer_page, list);
                list_del_init(&bpage->list);
        rb_reset_cpu(cpu_buffer);
        rb_check_pages(cpu_buffer);
  
+ out:
        spin_unlock_irq(&cpu_buffer->reader_lock);
  }
  
@@@ -1773,13 -1762,6 +1776,13 @@@ rb_reset_tail(struct ring_buffer_per_cp
        kmemcheck_annotate_bitfield(event, bitfield);
  
        /*
 +       * Save the original length to the meta data.
 +       * This will be used by the reader to add lost event
 +       * counter.
 +       */
 +      tail_page->real_end = tail;
 +
 +      /*
         * If this event is bigger than the minimum size, then
         * we need to be careful that we don't subtract the
         * write counter enough to allow another writer to slip
@@@ -2856,7 -2838,6 +2859,7 @@@ static struct buffer_page 
  rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
  {
        struct buffer_page *reader = NULL;
 +      unsigned long overwrite;
        unsigned long flags;
        int nr_loops = 0;
        int ret;
        local_set(&cpu_buffer->reader_page->write, 0);
        local_set(&cpu_buffer->reader_page->entries, 0);
        local_set(&cpu_buffer->reader_page->page->commit, 0);
 +      cpu_buffer->reader_page->real_end = 0;
  
   spin:
        /*
        rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
  
        /*
 +       * We want to make sure we read the overruns after we set up our
 +       * pointers to the next object. The writer side does a
 +       * cmpxchg to cross pages which acts as the mb on the writer
 +       * side. Note, the reader will constantly fail the swap
 +       * while the writer is updating the pointers, so this
 +       * guarantees that the overwrite recorded here is the one we
 +       * want to compare with the last_overrun.
 +       */
 +      smp_mb();
 +      overwrite = local_read(&(cpu_buffer->overrun));
 +
 +      /*
         * Here's the tricky part.
         *
         * We need to move the pointer past the header page.
        cpu_buffer->reader_page = reader;
        rb_reset_reader_page(cpu_buffer);
  
 +      if (overwrite != cpu_buffer->last_overrun) {
 +              cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
 +              cpu_buffer->last_overrun = overwrite;
 +      }
 +
        goto again;
  
   out:
@@@ -3042,14 -3005,8 +3045,14 @@@ static void rb_advance_iter(struct ring
                rb_advance_iter(iter);
  }
  
 +static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
 +{
 +      return cpu_buffer->lost_events;
 +}
 +
  static struct ring_buffer_event *
 -rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
 +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
 +             unsigned long *lost_events)
  {
        struct ring_buffer_event *event;
        struct buffer_page *reader;
                        ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
                                                         cpu_buffer->cpu, ts);
                }
 +              if (lost_events)
 +                      *lost_events = rb_lost_events(cpu_buffer);
                return event;
  
        default:
@@@ -3213,14 -3168,12 +3216,14 @@@ static inline int rb_ok_to_lock(void
   * @buffer: The ring buffer to read
   * @cpu: The cpu to peak at
   * @ts: The timestamp counter of this event.
 + * @lost_events: a variable to store if events were lost (may be NULL)
   *
   * This will return the event that will be read next, but does
   * not consume the data.
   */
  struct ring_buffer_event *
 -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
 +               unsigned long *lost_events)
  {
        struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
        struct ring_buffer_event *event;
        local_irq_save(flags);
        if (dolock)
                spin_lock(&cpu_buffer->reader_lock);
 -      event = rb_buffer_peek(cpu_buffer, ts);
 +      event = rb_buffer_peek(cpu_buffer, ts, lost_events);
        if (event && event->type_len == RINGBUF_TYPE_PADDING)
                rb_advance_reader(cpu_buffer);
        if (dolock)
@@@ -3277,17 -3230,13 +3280,17 @@@ ring_buffer_iter_peek(struct ring_buffe
  /**
   * ring_buffer_consume - return an event and consume it
   * @buffer: The ring buffer to get the next event from
 + * @cpu: the cpu to read the buffer from
 + * @ts: a variable to store the timestamp (may be NULL)
 + * @lost_events: a variable to store if events were lost (may be NULL)
   *
   * Returns the next event in the ring buffer, and that event is consumed.
   * Meaning, that sequential reads will keep returning a different event,
   * and eventually empty the ring buffer if the producer is slower.
   */
  struct ring_buffer_event *
 -ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
 +                  unsigned long *lost_events)
  {
        struct ring_buffer_per_cpu *cpu_buffer;
        struct ring_buffer_event *event = NULL;
        if (dolock)
                spin_lock(&cpu_buffer->reader_lock);
  
 -      event = rb_buffer_peek(cpu_buffer, ts);
 -      if (event)
 +      event = rb_buffer_peek(cpu_buffer, ts, lost_events);
 +      if (event) {
 +              cpu_buffer->lost_events = 0;
                rb_advance_reader(cpu_buffer);
 +      }
  
        if (dolock)
                spin_unlock(&cpu_buffer->reader_lock);
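
With the extra out-parameter a consumer can report drops as it drains a
CPU buffer.  A minimal sketch (process() is a hypothetical callback; the
message format mirrors what kernel/trace/trace.c prints further below):

    struct ring_buffer_event *event;
    unsigned long lost = 0;
    u64 ts;

    while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost))) {
            if (lost)
                    pr_info("CPU:%d [LOST %lu EVENTS]\n", cpu, lost);
            process(ring_buffer_event_data(event));  /* hypothetical */
    }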
@@@ -3461,9 -3408,6 +3464,9 @@@ rb_reset_cpu(struct ring_buffer_per_cp
        cpu_buffer->write_stamp = 0;
        cpu_buffer->read_stamp = 0;
  
 +      cpu_buffer->lost_events = 0;
 +      cpu_buffer->last_overrun = 0;
 +
        rb_head_page_activate(cpu_buffer);
  }
  
@@@ -3739,7 -3683,6 +3742,7 @@@ int ring_buffer_read_page(struct ring_b
        struct ring_buffer_event *event;
        struct buffer_data_page *bpage;
        struct buffer_page *reader;
 +      unsigned long missed_events;
        unsigned long flags;
        unsigned int commit;
        unsigned int read;
        read = reader->read;
        commit = rb_page_commit(reader);
  
 +      /* Check if any events were dropped */
 +      missed_events = cpu_buffer->lost_events;
 +
        /*
         * If this page has been partially read or
         * if len is not big enough to read the rest of the page or
                local_set(&reader->entries, 0);
                reader->read = 0;
                *data_page = bpage;
 +
 +              /*
 +               * Use the real_end for the data size,
 +               * This gives us a chance to store the lost events
 +               * on the page.
 +               */
 +              if (reader->real_end)
 +                      local_set(&bpage->commit, reader->real_end);
        }
        ret = read;
  
 +      cpu_buffer->lost_events = 0;
 +      /*
 +       * Set a flag in the commit field if we lost events
 +       */
 +      if (missed_events) {
 +              commit = local_read(&bpage->commit);
 +
 +              /* If there is room at the end of the page to save the
 +               * missed events, then record it there.
 +               */
 +              if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
 +                      memcpy(&bpage->data[commit], &missed_events,
 +                             sizeof(missed_events));
 +                      local_add(RB_MISSED_STORED, &bpage->commit);
 +              }
 +              local_add(RB_MISSED_EVENTS, &bpage->commit);
 +      }
 +
   out_unlock:
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
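
On the splice/read-page path, a consumer of ring_buffer_read_page() can
recover the dropped-event count from the flags set above.  A sketch,
assuming bpage points at the returned buffer_data_page and using the
RB_MISSED_* bits defined earlier in this file:

    unsigned long raw    = local_read(&bpage->commit);
    unsigned long len    = raw & ~(RB_MISSED_EVENTS | RB_MISSED_STORED);
    unsigned long missed = 0;

    if (raw & RB_MISSED_EVENTS) {
            /* events were overwritten while this page was filled */
            if (raw & RB_MISSED_STORED)
                    /* the count was stored right after the data */
                    memcpy(&missed, &bpage->data[len], sizeof(missed));
            /* else: events were lost but the count did not fit */
    }
    /* len is the real data length; missed is 0 or the dropped count */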
  
diff --combined kernel/trace/trace.c
  #include <linux/kdebug.h>
  #include <linux/string.h>
  #include <linux/rwsem.h>
+ #include <linux/slab.h>
  #include <linux/ctype.h>
  #include <linux/init.h>
  #include <linux/poll.h>
- #include <linux/gfp.h>
  #include <linux/fs.h>
  
  #include "trace.h"
@@@ -1545,8 -1545,7 +1545,8 @@@ static void trace_iterator_increment(st
  }
  
  static struct trace_entry *
 -peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
 +peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
 +              unsigned long *lost_events)
  {
        struct ring_buffer_event *event;
        struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
        if (buf_iter)
                event = ring_buffer_iter_peek(buf_iter, ts);
        else
 -              event = ring_buffer_peek(iter->tr->buffer, cpu, ts);
 +              event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
 +                                       lost_events);
  
        ftrace_enable_cpu();
  
  }
  
  static struct trace_entry *
 -__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 +__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
 +                unsigned long *missing_events, u64 *ent_ts)
  {
        struct ring_buffer *buffer = iter->tr->buffer;
        struct trace_entry *ent, *next = NULL;
 +      unsigned long lost_events, next_lost = 0;
        int cpu_file = iter->cpu_file;
        u64 next_ts = 0, ts;
        int next_cpu = -1;
        if (cpu_file > TRACE_PIPE_ALL_CPU) {
                if (ring_buffer_empty_cpu(buffer, cpu_file))
                        return NULL;
 -              ent = peek_next_entry(iter, cpu_file, ent_ts);
 +              ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
                if (ent_cpu)
                        *ent_cpu = cpu_file;
  
                if (ring_buffer_empty_cpu(buffer, cpu))
                        continue;
  
 -              ent = peek_next_entry(iter, cpu, &ts);
 +              ent = peek_next_entry(iter, cpu, &ts, &lost_events);
  
                /*
                 * Pick the entry with the smallest timestamp:
                        next = ent;
                        next_cpu = cpu;
                        next_ts = ts;
 +                      next_lost = lost_events;
                }
        }
  
        if (ent_ts)
                *ent_ts = next_ts;
  
 +      if (missing_events)
 +              *missing_events = next_lost;
 +
        return next;
  }
  
  struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
                                          int *ent_cpu, u64 *ent_ts)
  {
 -      return __find_next_entry(iter, ent_cpu, ent_ts);
 +      return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
  }
  
  /* Find the next real entry, and increment the iterator to the next entry */
  static void *find_next_entry_inc(struct trace_iterator *iter)
  {
 -      iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
 +      iter->ent = __find_next_entry(iter, &iter->cpu,
 +                                    &iter->lost_events, &iter->ts);
  
        if (iter->ent)
                trace_iterator_increment(iter);
@@@ -1644,8 -1635,7 +1644,8 @@@ static void trace_consume(struct trace_
  {
        /* Don't allow ftrace to trace into the ring buffers */
        ftrace_disable_cpu();
 -      ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
 +      ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
 +                          &iter->lost_events);
        ftrace_enable_cpu();
  }
  
@@@ -2040,10 -2030,6 +2040,10 @@@ static enum print_line_t print_trace_li
  {
        enum print_line_t ret;
  
 +      if (iter->lost_events)
 +              trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
 +                               iter->cpu, iter->lost_events);
 +
        if (iter->trace && iter->trace->print_line) {
                ret = iter->trace->print_line(iter);
                if (ret != TRACE_TYPE_UNHANDLED)
@@@ -9,6 -9,7 +9,7 @@@
  #include <linux/debugfs.h>
  #include <linux/uaccess.h>
  #include <linux/ftrace.h>
+ #include <linux/slab.h>
  #include <linux/fs.h>
  
  #include "trace.h"
@@@ -489,10 -490,9 +490,10 @@@ get_return_for_leaf(struct trace_iterat
                         * We need to consume the current entry to see
                         * the next one.
                         */
 -                      ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
 +                      ring_buffer_consume(iter->tr->buffer, iter->cpu,
 +                                          NULL, NULL);
                        event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
 -                                               NULL);
 +                                               NULL, NULL);
                }
  
                if (!event)
@@@ -3,6 -3,7 +3,7 @@@
  #include <linux/stringify.h>
  #include <linux/kthread.h>
  #include <linux/delay.h>
+ #include <linux/slab.h>
  
  static inline int trace_valid_entry(struct trace_entry *entry)
  {
@@@ -29,7 -30,7 +30,7 @@@ static int trace_test_buffer_cpu(struc
        struct trace_entry *entry;
        unsigned int loops = 0;
  
 -      while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
 +      while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
                entry = ring_buffer_event_data(event);
  
                /*