Merge branch 'linus' into perf/core
[safe/jmp/linux-2.6] / mm / percpu.c
index a64133f..6e09741 100644 (file)
@@ -46,8 +46,6 @@
  *
  * To use this allocator, arch code should do the followings.
  *
- * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
- *
  * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
  *   regular address to percpu pointer and back if they need to be
  *   different from the default
@@ -74,6 +72,7 @@
 #include <asm/cacheflush.h>
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
+#include <asm/io.h>
 
 #define PCPU_SLOT_BASE_SHIFT           5       /* 1-31 shares the same slot */
 #define PCPU_DFL_MAP_ALLOC             16      /* start a map with 16 ents */
 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
 #ifndef __addr_to_pcpu_ptr
 #define __addr_to_pcpu_ptr(addr)                                       \
-       (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr  \
-                + (unsigned long)__per_cpu_start)
+       (void __percpu *)((unsigned long)(addr) -                       \
+                         (unsigned long)pcpu_base_addr +               \
+                         (unsigned long)__per_cpu_start)
 #endif
 #ifndef __pcpu_ptr_to_addr
 #define __pcpu_ptr_to_addr(ptr)                                                \
-       (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr   \
-                - (unsigned long)__per_cpu_start)
+       (void __force *)((unsigned long)(ptr) +                         \
+                        (unsigned long)pcpu_base_addr -                \
+                        (unsigned long)__per_cpu_start)
 #endif
 
 struct pcpu_chunk {
@@ -153,7 +154,10 @@ static int pcpu_reserved_chunk_limit;
  *
  * During allocation, pcpu_alloc_mutex is kept locked all the time and
  * pcpu_lock is grabbed and released as necessary.  All actual memory
- * allocations are done using GFP_KERNEL with pcpu_lock released.
+ * allocations are done using GFP_KERNEL with pcpu_lock released.  In
+ * general, percpu memory can't be allocated with irq off but
+ * irqsave/restore are still used in alloc path so that it can be used
+ * from early init path - sched_init() specifically.
  *
  * Free path accesses and alters only the index data structures, so it
  * can be safely called from atomic context.  When memory needs to be
@@ -352,62 +356,86 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 }
 
 /**
- * pcpu_extend_area_map - extend area map for allocation
- * @chunk: target chunk
+ * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+ * @chunk: chunk of interest
  *
- * Extend area map of @chunk so that it can accomodate an allocation.
- * A single allocation can split an area into three areas, so this
- * function makes sure that @chunk->map has at least two extra slots.
+ * Determine whether area map of @chunk needs to be extended to
+ * accommodate a new allocation.
  *
  * CONTEXT:
- * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
- * if area map is extended.
+ * pcpu_lock.
  *
  * RETURNS:
- * 0 if noop, 1 if successfully extended, -errno on failure.
+ * New target map allocation length if extension is necessary, 0
+ * otherwise.
  */
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
+static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
 {
        int new_alloc;
-       int *new;
-       size_t size;
 
-       /* has enough? */
        if (chunk->map_alloc >= chunk->map_used + 2)
                return 0;
 
-       spin_unlock_irq(&pcpu_lock);
-
        new_alloc = PCPU_DFL_MAP_ALLOC;
        while (new_alloc < chunk->map_used + 2)
                new_alloc *= 2;
 
-       new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-       if (!new) {
-               spin_lock_irq(&pcpu_lock);
+       return new_alloc;
+}
+
+/**
+ * pcpu_extend_area_map - extend area map of a chunk
+ * @chunk: chunk of interest
+ * @new_alloc: new target allocation length of the area map
+ *
+ * Extend area map of @chunk to have @new_alloc entries.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+{
+       int *old = NULL, *new = NULL;
+       size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
+       unsigned long flags;
+
+       new = pcpu_mem_alloc(new_size);
+       if (!new)
                return -ENOMEM;
-       }
 
-       /*
-        * Acquire pcpu_lock and switch to new area map.  Only free
-        * could have happened inbetween, so map_used couldn't have
-        * grown.
-        */
-       spin_lock_irq(&pcpu_lock);
-       BUG_ON(new_alloc < chunk->map_used + 2);
+       /* acquire pcpu_lock and switch to new area map */
+       spin_lock_irqsave(&pcpu_lock, flags);
+
+       if (new_alloc <= chunk->map_alloc)
+               goto out_unlock;
 
-       size = chunk->map_alloc * sizeof(chunk->map[0]);
-       memcpy(new, chunk->map, size);
+       old_size = chunk->map_alloc * sizeof(chunk->map[0]);
+       memcpy(new, chunk->map, old_size);
 
        /*
         * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
         * one of the first chunks and still using static map.
         */
        if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
-               pcpu_mem_free(chunk->map, size);
+               old = chunk->map;
 
        chunk->map_alloc = new_alloc;
        chunk->map = new;
+       new = NULL;
+
+out_unlock:
+       spin_unlock_irqrestore(&pcpu_lock, flags);
+
+       /*
+        * pcpu_mem_free() might end up calling vfree() which uses
+        * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+        */
+       pcpu_mem_free(old, old_size);
+       pcpu_mem_free(new, new_size);
+
        return 0;
 }
 
@@ -887,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
        int rs, re;
 
        /* quick path, check whether it's empty already */
-       pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
-               if (rs == page_start && re == page_end)
-                       return;
-               break;
-       }
+       rs = page_start;
+       pcpu_next_unpop(chunk, &rs, &re, page_end);
+       if (rs == page_start && re == page_end)
+               return;
 
        /* immutable chunks can't be depopulated */
        WARN_ON(chunk->immutable);
@@ -942,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
        int rs, re, rc;
 
        /* quick path, check whether all pages are already there */
-       pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
-               if (rs == page_start && re == page_end)
-                       goto clear;
-               break;
-       }
+       rs = page_start;
+       pcpu_next_pop(chunk, &rs, &re, page_end);
+       if (rs == page_start && re == page_end)
+               goto clear;
 
        /* need to allocate and map pages, this chunk can't be immutable */
        WARN_ON(chunk->immutable);
@@ -1041,10 +1067,13 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
-static void *pcpu_alloc(size_t size, size_t align, bool reserved)
+static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
+       static int warn_limit = 10;
        struct pcpu_chunk *chunk;
-       int slot, off;
+       const char *err;
+       int slot, off, new_alloc;
+       unsigned long flags;
 
        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
                WARN(true, "illegal size (%zu) or align (%zu) for "
@@ -1053,17 +1082,31 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
        }
 
        mutex_lock(&pcpu_alloc_mutex);
-       spin_lock_irq(&pcpu_lock);
+       spin_lock_irqsave(&pcpu_lock, flags);
 
        /* serve reserved allocations from the reserved chunk if available */
        if (reserved && pcpu_reserved_chunk) {
                chunk = pcpu_reserved_chunk;
-               if (size > chunk->contig_hint ||
-                   pcpu_extend_area_map(chunk) < 0)
+
+               if (size > chunk->contig_hint) {
+                       err = "alloc from reserved chunk failed";
                        goto fail_unlock;
+               }
+
+               while ((new_alloc = pcpu_need_to_extend(chunk))) {
+                       spin_unlock_irqrestore(&pcpu_lock, flags);
+                       if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
+                               err = "failed to extend area map of reserved chunk";
+                               goto fail_unlock_mutex;
+                       }
+                       spin_lock_irqsave(&pcpu_lock, flags);
+               }
+
                off = pcpu_alloc_area(chunk, size, align);
                if (off >= 0)
                        goto area_found;
+
+               err = "alloc from reserved chunk failed";
                goto fail_unlock;
        }
 
@@ -1074,13 +1117,20 @@ restart:
                        if (size > chunk->contig_hint)
                                continue;
 
-                       switch (pcpu_extend_area_map(chunk)) {
-                       case 0:
-                               break;
-                       case 1:
-                               goto restart;   /* pcpu_lock dropped, restart */
-                       default:
-                               goto fail_unlock;
+                       new_alloc = pcpu_need_to_extend(chunk);
+                       if (new_alloc) {
+                               spin_unlock_irqrestore(&pcpu_lock, flags);
+                               if (pcpu_extend_area_map(chunk,
+                                                        new_alloc) < 0) {
+                                       err = "failed to extend area map";
+                                       goto fail_unlock_mutex;
+                               }
+                               spin_lock_irqsave(&pcpu_lock, flags);
+                               /*
+                                * pcpu_lock has been dropped, need to
+                                * restart cpu_slot list walking.
+                                */
+                               goto restart;
                        }
 
                        off = pcpu_alloc_area(chunk, size, align);
@@ -1090,23 +1140,26 @@ restart:
        }
 
        /* hmmm... no space left, create a new chunk */
-       spin_unlock_irq(&pcpu_lock);
+       spin_unlock_irqrestore(&pcpu_lock, flags);
 
        chunk = alloc_pcpu_chunk();
-       if (!chunk)
+       if (!chunk) {
+               err = "failed to allocate new chunk";
                goto fail_unlock_mutex;
+       }
 
-       spin_lock_irq(&pcpu_lock);
+       spin_lock_irqsave(&pcpu_lock, flags);
        pcpu_chunk_relocate(chunk, -1);
        goto restart;
 
 area_found:
-       spin_unlock_irq(&pcpu_lock);
+       spin_unlock_irqrestore(&pcpu_lock, flags);
 
        /* populate, map and clear the area */
        if (pcpu_populate_chunk(chunk, off, size)) {
-               spin_lock_irq(&pcpu_lock);
+               spin_lock_irqsave(&pcpu_lock, flags);
                pcpu_free_area(chunk, off);
+               err = "failed to populate";
                goto fail_unlock;
        }
 
@@ -1116,9 +1169,16 @@ area_found:
        return __addr_to_pcpu_ptr(chunk->base_addr + off);
 
 fail_unlock:
-       spin_unlock_irq(&pcpu_lock);
+       spin_unlock_irqrestore(&pcpu_lock, flags);
 fail_unlock_mutex:
        mutex_unlock(&pcpu_alloc_mutex);
+       if (warn_limit) {
+               pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "
+                          "%s\n", size, align, err);
+               dump_stack();
+               if (!--warn_limit)
+                       pr_info("PERCPU: limit reached, disable warning\n");
+       }
        return NULL;
 }
 
@@ -1136,7 +1196,7 @@ fail_unlock_mutex:
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
-void *__alloc_percpu(size_t size, size_t align)
+void __percpu *__alloc_percpu(size_t size, size_t align)
 {
        return pcpu_alloc(size, align, false);
 }
@@ -1157,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
-void *__alloc_reserved_percpu(size_t size, size_t align)
+void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 {
        return pcpu_alloc(size, align, true);
 }
@@ -1209,9 +1269,9 @@ static void pcpu_reclaim(struct work_struct *work)
  * CONTEXT:
  * Can be called from atomic context.
  */
-void free_percpu(void *ptr)
+void free_percpu(void __percpu *ptr)
 {
-       void *addr = __pcpu_ptr_to_addr(ptr);
+       void *addr;
        struct pcpu_chunk *chunk;
        unsigned long flags;
        int off;
@@ -1219,6 +1279,8 @@ void free_percpu(void *ptr)
        if (!ptr)
                return;
 
+       addr = __pcpu_ptr_to_addr(ptr);
+
        spin_lock_irqsave(&pcpu_lock, flags);
 
        chunk = pcpu_chunk_addr_search(addr);
@@ -1241,6 +1303,53 @@ void free_percpu(void *ptr)
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
+/**
+ * is_kernel_percpu_address - test whether address is from static percpu area
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to in-kernel static percpu area.  Module
+ * static percpu areas are not considered.  For those, use
+ * is_module_percpu_address().
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+       const size_t static_size = __per_cpu_end - __per_cpu_start;
+       void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
+       unsigned int cpu;
+
+       for_each_possible_cpu(cpu) {
+               void *start = per_cpu_ptr(base, cpu);
+
+               if ((void *)addr >= start && (void *)addr < start + static_size)
+                       return true;
+       }
+       return false;
+}
+
+/**
+ * per_cpu_ptr_to_phys - convert translated percpu address to physical address
+ * @addr: the address to be converted to physical address
+ *
+ * Given @addr which is dereferenceable address obtained via one of
+ * percpu access macros, this function translates it into its physical
+ * address.  For vmalloc addresses the page is looked up and the offset
+ * of @addr within it is added.  The caller must keep @addr valid.
+ *
+ * RETURNS:
+ * The physical address for @addr, in-page offset included.
+ */
+phys_addr_t per_cpu_ptr_to_phys(void *addr)
+{
+       if ((unsigned long)addr < VMALLOC_START ||
+                       (unsigned long)addr >= VMALLOC_END)
+               return __pa(addr);
+       return page_to_phys(vmalloc_to_page(addr)) +
+               ((unsigned long)addr & ~PAGE_MASK);
+}
+
 static inline size_t pcpu_calc_fc_sizes(size_t static_size,
                                        size_t reserved_size,
                                        ssize_t *dyn_sizep)
@@ -1578,6 +1687,7 @@ static void pcpu_dump_alloc_info(const char *lvl,
 int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
                                  void *base_addr)
 {
+       static char cpus_buf[4096] __initdata;
        static int smap[2], dmap[2];
        size_t dyn_size = ai->dyn_size;
        size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
@@ -1589,17 +1699,26 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
        int *unit_map;
        int group, unit, i;
 
+       cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
+
+#define PCPU_SETUP_BUG_ON(cond)        do {                                    \
+       if (unlikely(cond)) {                                           \
+               pr_emerg("PERCPU: failed to initialize, %s\n", #cond);  \
+               pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf);   \
+               pcpu_dump_alloc_info(KERN_EMERG, ai);                   \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+
        /* sanity checks */
        BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
                     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
-       BUG_ON(ai->nr_groups <= 0);
-       BUG_ON(!ai->static_size);
-       BUG_ON(!base_addr);
-       BUG_ON(ai->unit_size < size_sum);
-       BUG_ON(ai->unit_size & ~PAGE_MASK);
-       BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
-
-       pcpu_dump_alloc_info(KERN_DEBUG, ai);
+       PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
+       PCPU_SETUP_BUG_ON(!ai->static_size);
+       PCPU_SETUP_BUG_ON(!base_addr);
+       PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
+       PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
+       PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
 
        /* process group information and build config tables accordingly */
        group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
@@ -1622,8 +1741,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
                        if (cpu == NR_CPUS)
                                continue;
 
-                       BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu));
-                       BUG_ON(unit_map[cpu] != UINT_MAX);
+                       PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
+                       PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
+                       PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
 
                        unit_map[cpu] = unit + i;
                        unit_off[cpu] = gi->base_offset + i * ai->unit_size;
@@ -1636,7 +1756,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
        pcpu_nr_units = unit;
 
        for_each_possible_cpu(cpu)
-               BUG_ON(unit_map[cpu] == UINT_MAX);
+               PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
+
+       /* we're done parsing the input, undefine BUG macro and dump config */
+#undef PCPU_SETUP_BUG_ON
+       pcpu_dump_alloc_info(KERN_INFO, ai);
 
        pcpu_nr_groups = ai->nr_groups;
        pcpu_group_offsets = group_offsets;
@@ -1786,7 +1910,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
        void *base = (void *)ULONG_MAX;
        void **areas = NULL;
        struct pcpu_alloc_info *ai;
-       size_t size_sum, areas_size;
+       size_t size_sum, areas_size, max_distance;
        int group, i, rc;
 
        ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
@@ -1836,8 +1960,25 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
        }
 
        /* base address is now known, determine group base offsets */
-       for (group = 0; group < ai->nr_groups; group++)
+       max_distance = 0;
+       for (group = 0; group < ai->nr_groups; group++) {
                ai->groups[group].base_offset = areas[group] - base;
+               max_distance = max_t(size_t, max_distance,
+                                    ai->groups[group].base_offset);
+       }
+       max_distance += ai->unit_size;
+
+       /* warn if maximum distance is further than 75% of vmalloc space */
+       if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
+               pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
+                          "space 0x%lx\n",
+                          max_distance, VMALLOC_END - VMALLOC_START);
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+               /* and fail if we have fallback */
+               rc = -EINVAL;
+               goto out_free;
+#endif
+       }
 
        pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
                PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,