#include <linux/reciprocal_div.h>
#include <linux/debugobjects.h>
#include <linux/kmemcheck.h>
+#include <linux/memory.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#define BYTES_PER_WORD sizeof(void *)
#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
-#ifndef ARCH_KMALLOC_MINALIGN
-/*
- * Enforce a minimum alignment for the kmalloc caches.
- * Usually, the kmalloc caches are cache_line_size() aligned, except when
- * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
- * Some archs want to perform DMA into kmalloc caches and need a guaranteed
- * alignment larger than the alignment of a 64-bit integer.
- * ARCH_KMALLOC_MINALIGN allows that.
- * Note that increasing this value may disable some debug features.
- */
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-/*
- * Enforce a minimum alignment for all caches.
- * Intended for archs that get misalignment faults even for BYTES_PER_WORD
- * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
- * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
- * some debug features.
- */
-#define ARCH_SLAB_MINALIGN 0
-#endif
-
#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif
#endif
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
size_t slab_buffer_size(struct kmem_cache *cachep)
{
return cachep->buffer_size;
#define BAD_ALIEN_MAGIC 0x01020304ul
+/*
+ * chicken and egg problem: delay the per-cpu array allocation
+ * until the general caches are up.
+ */
+static enum {
+ NONE,
+ PARTIAL_AC,
+ PARTIAL_L3,
+ EARLY,
+ FULL
+} g_cpucache_up;
+
+/*
+ * used by boot code to determine if it can use slab based allocator
+ */
+int slab_is_available(void)
+{
+ return g_cpucache_up >= EARLY;
+}
+
#ifdef CONFIG_LOCKDEP
/*
static struct lock_class_key on_slab_l3_key;
static struct lock_class_key on_slab_alc_key;
-static inline void init_lock_keys(void)
-
+static void init_node_lock_keys(int q)
{
- int q;
struct cache_sizes *s = malloc_sizes;
- while (s->cs_size != ULONG_MAX) {
- for_each_node(q) {
- struct array_cache **alc;
- int r;
- struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
- if (!l3 || OFF_SLAB(s->cs_cachep))
- continue;
- lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
- alc = l3->alien;
- /*
- * FIXME: This check for BAD_ALIEN_MAGIC
- * should go away when common slab code is taught to
- * work even without alien caches.
- * Currently, non NUMA code returns BAD_ALIEN_MAGIC
- * for alloc_alien_cache,
- */
- if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
- continue;
- for_each_node(r) {
- if (alc[r])
- lockdep_set_class(&alc[r]->lock,
- &on_slab_alc_key);
- }
+ if (g_cpucache_up != FULL)
+ return;
+
+ for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
+ struct array_cache **alc;
+ struct kmem_list3 *l3;
+ int r;
+
+ l3 = s->cs_cachep->nodelists[q];
+ if (!l3 || OFF_SLAB(s->cs_cachep))
+ continue;
+ lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
+ alc = l3->alien;
+ /*
+ * FIXME: This check for BAD_ALIEN_MAGIC
+ * should go away when common slab code is taught to
+ * work even without alien caches.
+ * Currently, non NUMA code returns BAD_ALIEN_MAGIC
+ * for alloc_alien_cache,
+ */
+ if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
+ continue;
+ for_each_node(r) {
+ if (alc[r])
+ lockdep_set_class(&alc[r]->lock,
+ &on_slab_alc_key);
}
- s++;
}
}
+
+static inline void init_lock_keys(void)
+{
+ int node;
+
+ for_each_node(node)
+ init_node_lock_keys(node);
+}
#else
+static void init_node_lock_keys(int q)
+{
+}
+
static inline void init_lock_keys(void)
{
}
static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;
-/*
- * chicken and egg problem: delay the per-cpu array allocation
- * until the general caches are up.
- */
-static enum {
- NONE,
- PARTIAL_AC,
- PARTIAL_L3,
- EARLY,
- FULL
-} g_cpucache_up;
-
-/*
- * used by boot code to determine if it can use slab based allocator
- */
-int slab_is_available(void)
-{
- return g_cpucache_up >= EARLY;
-}
-
-static DEFINE_PER_CPU(struct delayed_work, reap_work);
+static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
* objects freed on different nodes from which they were allocated) and the
* flushing of remote pcps by calling drain_node_pages.
*/
-static DEFINE_PER_CPU(unsigned long, reap_node);
+static DEFINE_PER_CPU(unsigned long, slab_reap_node);
static void init_reap_node(int cpu)
{
if (node == MAX_NUMNODES)
node = first_node(node_online_map);
- per_cpu(reap_node, cpu) = node;
+ per_cpu(slab_reap_node, cpu) = node;
}
static void next_reap_node(void)
{
- int node = __get_cpu_var(reap_node);
+ int node = __get_cpu_var(slab_reap_node);
node = next_node(node, node_online_map);
if (unlikely(node >= MAX_NUMNODES))
node = first_node(node_online_map);
- __get_cpu_var(reap_node) = node;
+ __get_cpu_var(slab_reap_node) = node;
}
#else
*/
static void __cpuinit start_cpu_timer(int cpu)
{
- struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
+ struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
/*
* When this gets called from do_initcalls via cpucache_init(),
from->avail -= nr;
to->avail += nr;
- to->touched = 1;
return nr;
}
if (limit > 1)
limit = 12;
- ac_ptr = kmalloc_node(memsize, gfp, node);
+ ac_ptr = kzalloc_node(memsize, gfp, node);
if (ac_ptr) {
for_each_node(i) {
- if (i == node || !node_online(i)) {
- ac_ptr[i] = NULL;
+ if (i == node || !node_online(i))
continue;
- }
ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
if (!ac_ptr[i]) {
for (i--; i >= 0; i--)
*/
static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
{
- int node = __get_cpu_var(reap_node);
+ int node = __get_cpu_var(slab_reap_node);
if (l3->alien) {
struct array_cache *ac = l3->alien[node];
}
#endif
+/*
+ * Allocates and initializes nodelists for a node on each slab cache, used for
+ * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3
+ * will be allocated off-node since memory is not yet online for the new node.
+ * When hotplugging memory or a cpu, existing nodelists are not replaced if
+ * already in use.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int init_cache_nodelists_node(int node)
+{
+ struct kmem_cache *cachep;
+ struct kmem_list3 *l3;
+ const int memsize = sizeof(struct kmem_list3);
+
+ list_for_each_entry(cachep, &cache_chain, next) {
+ /*
+ * Set up the size64 kmemlist for cpu before we can
+ * begin anything. Make sure some other cpu on this
+ * node has not already allocated this
+ */
+ if (!cachep->nodelists[node]) {
+ l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+ if (!l3)
+ return -ENOMEM;
+ kmem_list3_init(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+ /*
+ * The l3s don't come and go as CPUs come and
+ * go. cache_chain_mutex is sufficient
+ * protection here.
+ */
+ cachep->nodelists[node] = l3;
+ }
+
+ spin_lock_irq(&cachep->nodelists[node]->list_lock);
+ cachep->nodelists[node]->free_limit =
+ (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
+ spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+ }
+ return 0;
+}
+
static void __cpuinit cpuup_canceled(long cpu)
{
struct kmem_cache *cachep;
if (nc)
free_block(cachep, nc->entry, nc->avail, node);
- if (!cpus_empty(*mask)) {
+ if (!cpumask_empty(mask)) {
spin_unlock_irq(&l3->list_lock);
goto free_array_cache;
}
struct kmem_cache *cachep;
struct kmem_list3 *l3 = NULL;
int node = cpu_to_node(cpu);
- const int memsize = sizeof(struct kmem_list3);
+ int err;
/*
* We need to do this right in the beginning since
* kmalloc_node allows us to add the slab to the right
* kmem_list3 and not this cpu's kmem_list3
*/
-
- list_for_each_entry(cachep, &cache_chain, next) {
- /*
- * Set up the size64 kmemlist for cpu before we can
- * begin anything. Make sure some other cpu on this
- * node has not already allocated this
- */
- if (!cachep->nodelists[node]) {
- l3 = kmalloc_node(memsize, GFP_KERNEL, node);
- if (!l3)
- goto bad;
- kmem_list3_init(l3);
- l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
- /*
- * The l3s don't come and go as CPUs come and
- * go. cache_chain_mutex is sufficient
- * protection here.
- */
- cachep->nodelists[node] = l3;
- }
-
- spin_lock_irq(&cachep->nodelists[node]->list_lock);
- cachep->nodelists[node]->free_limit =
- (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- spin_unlock_irq(&cachep->nodelists[node]->list_lock);
- }
+ err = init_cache_nodelists_node(node);
+ if (err < 0)
+ goto bad;
/*
* Now we can go ahead with allocating the shared arrays and
kfree(shared);
free_alien_cache(alien);
}
+ init_node_lock_keys(node);
+
return 0;
bad:
cpuup_canceled(cpu);
* anything expensive but will only modify reap_work
* and reschedule the timer.
*/
- cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
+ cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu));
/* Now the cache_reaper is guaranteed to be not running. */
- per_cpu(reap_work, cpu).work.func = NULL;
+ per_cpu(slab_reap_work, cpu).work.func = NULL;
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
&cpuup_callback, NULL, 0
};
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Drains freelist for a node on each slab cache, used for memory hot-remove.
+ * Returns -EBUSY if all objects cannot be drained so that the node is not
+ * removed.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int __meminit drain_cache_nodelists_node(int node)
+{
+ struct kmem_cache *cachep;
+ int ret = 0;
+
+ list_for_each_entry(cachep, &cache_chain, next) {
+ struct kmem_list3 *l3;
+
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+
+ drain_freelist(cachep, l3, l3->free_objects);
+
+ if (!list_empty(&l3->slabs_full) ||
+ !list_empty(&l3->slabs_partial)) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+ return ret;
+}
+
+static int __meminit slab_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mnb = arg;
+ int ret = 0;
+ int nid;
+
+ nid = mnb->status_change_nid;
+ if (nid < 0)
+ goto out;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ mutex_lock(&cache_chain_mutex);
+ ret = init_cache_nodelists_node(nid);
+ mutex_unlock(&cache_chain_mutex);
+ break;
+ case MEM_GOING_OFFLINE:
+ mutex_lock(&cache_chain_mutex);
+ ret = drain_cache_nodelists_node(nid);
+ mutex_unlock(&cache_chain_mutex);
+ break;
+ case MEM_ONLINE:
+ case MEM_OFFLINE:
+ case MEM_CANCEL_ONLINE:
+ case MEM_CANCEL_OFFLINE:
+ break;
+ }
+out:
+ return ret ? notifier_from_errno(ret) : NOTIFY_OK;
+}
+#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
+
/*
* swap the static kmem_list3 with kmalloced memory
*/
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
- int nodeid)
+static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+ int nodeid)
{
struct kmem_list3 *ptr;
*/
register_cpu_notifier(&cpucache_notifier);
+#ifdef CONFIG_NUMA
+ /*
+ * Register a memory hotplug callback that initializes and frees
+ * nodelists.
+ */
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
+
/*
* The reap timers are started later, with a module init call: That part
* of the kernel is not yet operational.
if (ralign < align) {
ralign = align;
}
- /* disable debug if necessary */
- if (ralign > __alignof__(unsigned long long))
+ /* disable debug if not aligning with REDZONE_ALIGN */
+ if (ralign & (__alignof__(unsigned long long) - 1))
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
*/
if (flags & SLAB_RED_ZONE) {
/* add space for red zone words */
- cachep->obj_offset += sizeof(unsigned long long);
- size += 2 * sizeof(unsigned long long);
+ cachep->obj_offset += align;
+ size += align + sizeof(unsigned long long);
}
if (flags & SLAB_STORE_USER) {
/* user store requires one word storage behind the end of
/*
* Determine if the slab management is 'on' or 'off' slab.
* (bootstrapping cannot cope with offslab caches so don't do
- * it too early on.)
+ * it too early on. Always use on-slab management when
+ * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
*/
- if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
+ if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
+ !(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
* kmemleak does not treat the ->s_mem pointer as a reference
* to the object. Otherwise we will not report the leak.
*/
- kmemleak_scan_area(slabp, offsetof(struct slab, list),
- sizeof(struct list_head), local_flags);
+ kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
+ local_flags);
if (!slabp)
return NULL;
} else {
spin_lock(&l3->list_lock);
/* See if we can refill from the shared array */
- if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
+ if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
+ l3->shared->touched = 1;
goto alloc_done;
+ }
while (batchcount > 0) {
struct list_head *entry;
if (cachep == &cache_cache)
return false;
- return should_failslab(obj_size(cachep), flags);
+ return should_failslab(obj_size(cachep), flags, cachep->flags);
}
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
} else {
STATS_INC_ALLOCMISS(cachep);
objp = cache_alloc_refill(cachep, flags);
+ /*
+ * the 'ac' may be updated by cache_alloc_refill(),
+ * and kmemleak_erase() requires its correct value.
+ */
+ ac = cpu_cache_get(cachep);
}
/*
* To avoid a false negative, if an object that is in one of the
* per-CPU caches is leaked, we need to make sure kmemleak doesn't
* treat the array pointers as a reference to the object.
*/
- kmemleak_erase(&ac->entry[ac->avail]);
+ if (objp)
+ kmemleak_erase(&ac->entry[ac->avail]);
return objp;
}
if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
nid_alloc = nid_here = numa_node_id();
+ get_mems_allowed();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_mem_spread_node();
else if (current->mempolicy)
nid_alloc = slab_node(current->mempolicy);
+ put_mems_allowed();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
if (flags & __GFP_THISNODE)
return NULL;
+ get_mems_allowed();
zonelist = node_zonelist(slab_node(current->mempolicy), flags);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
}
}
}
+ put_mems_allowed();
return obj;
}
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
- if (unlikely(nodeid == -1))
+ if (nodeid == -1)
nodeid = numa_node_id();
if (unlikely(!cachep->nodelists[nodeid])) {
}
EXPORT_SYMBOL(kmem_cache_alloc);
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
{
return __cache_alloc(cachep, flags, __builtin_return_address(0));
*/
int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
{
- unsigned long addr = (unsigned long)ptr;
- unsigned long min_addr = PAGE_OFFSET;
- unsigned long align_mask = BYTES_PER_WORD - 1;
unsigned long size = cachep->buffer_size;
struct page *page;
- if (unlikely(addr < min_addr))
- goto out;
- if (unlikely(addr > (unsigned long)high_memory - size))
- goto out;
- if (unlikely(addr & align_mask))
- goto out;
- if (unlikely(!kern_addr_valid(addr)))
- goto out;
- if (unlikely(!kern_addr_valid(addr + size - 1)))
+ if (unlikely(!kern_ptr_validate(ptr, size)))
goto out;
page = virt_to_page(ptr);
if (unlikely(!PageSlab(page)))
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
gfp_t flags,
int nodeid)
return ret;
}
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
return __do_kmalloc_node(size, flags, node,
return __do_kmalloc_node(size, flags, node, NULL);
}
EXPORT_SYMBOL(__kmalloc_node);
-#endif /* CONFIG_DEBUG_SLAB */
+#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
#endif /* CONFIG_NUMA */
/**
}
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc(size_t size, gfp_t flags)
{
return __do_kmalloc(size, flags, __builtin_return_address(0));
unsigned long node_frees = cachep->node_frees;
unsigned long overflows = cachep->node_overflow;
- seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
- %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
- reaped, errors, max_freeable, node_allocs,
- node_frees, overflows);
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
+ "%4lu %4lu %4lu %4lu %4lu",
+ allocs, high, grown,
+ reaped, errors, max_freeable, node_allocs,
+ node_frees, overflows);
}
/* cpu stats */
{