X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fslab.c;h=e76eee46688619bd65c110f934cab134dd698e27;hb=921615f111108258820226a3258a047d9bf1d96a;hp=0d4e57431de41c8d7fccfe9307bfb504087e5c03;hpb=af9997e426f9ddfe7a84cb4cd3c7ff938fabd41a;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/slab.c b/mm/slab.c index 0d4e574..e76eee4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -26,7 +26,7 @@ * initialized objects. * * This means, that your constructor is used only for newly allocated - * slabs and you must pass objects with the same intializations to + * slabs and you must pass objects with the same initializations to * kmem_cache_free. * * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, @@ -110,14 +110,14 @@ #include #include #include +#include #include #include #include /* - * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, - * SLAB_RED_ZONE & SLAB_POISON. + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). * * STATS - 1 to collect stats for /proc/slabinfo. @@ -138,10 +138,7 @@ /* Shouldn't this be in a header file somewhere? */ #define BYTES_PER_WORD sizeof(void *) - -#ifndef cache_line_size -#define cache_line_size() L1_CACHE_BYTES -#endif +#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) #ifndef ARCH_KMALLOC_MINALIGN /* @@ -149,10 +146,11 @@ * Usually, the kmalloc caches are cache_line_size() aligned, except when * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. * Some archs want to perform DMA into kmalloc caches and need a guaranteed - * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that. - * Note that this flag disables some debug features. + * alignment larger than the alignment of a 64-bit integer. + * ARCH_KMALLOC_MINALIGN allows that. + * Note that increasing this value may disable some debug features. */ -#define ARCH_KMALLOC_MINALIGN 0 +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) #endif #ifndef ARCH_SLAB_MINALIGN @@ -172,17 +170,19 @@ /* Legal flag mask for kmem_cache_create(). */ #if DEBUG -# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ +# define CREATE_MASK (SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ - SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ + SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_DEBUG_OBJECTS) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ + SLAB_CACHE_DMA | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ + SLAB_DEBUG_OBJECTS) #endif /* @@ -266,11 +266,10 @@ struct array_cache { unsigned int batchcount; unsigned int touched; spinlock_t lock; - void *entry[0]; /* + void *entry[]; /* * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing * the entries. - * [0] is for gcc 2.95. It should really be []. */ }; @@ -304,11 +303,11 @@ struct kmem_list3 { /* * Need this for bootstrapping a per node allocator. */ -#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1) +#define NUM_INIT_LISTS (3 * MAX_NUMNODES) struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; #define CACHE_CACHE 0 -#define SIZE_AC 1 -#define SIZE_L3 (1 + MAX_NUMNODES) +#define SIZE_AC MAX_NUMNODES +#define SIZE_L3 (2 * MAX_NUMNODES) static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3, int tofree); @@ -333,7 +332,7 @@ static __always_inline int index_of(const size_t size) return i; \ else \ i++; -#include "linux/kmalloc_sizes.h" +#include #undef CACHE __bad_size(); } else @@ -389,7 +388,6 @@ struct kmem_cache { unsigned int buffer_size; u32 reciprocal_buffer_size; /* 3) touched by every alloc & free from the backend */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; unsigned int flags; /* constant flags */ unsigned int num; /* # of objs per slab */ @@ -408,10 +406,7 @@ struct kmem_cache { unsigned int dflags; /* dynamic flags */ /* constructor func */ - void (*ctor) (void *, struct kmem_cache *, unsigned long); - - /* de-constructor func */ - void (*dtor) (void *, struct kmem_cache *, unsigned long); + void (*ctor)(void *obj); /* 5) cache creation/removal */ const char *name; @@ -444,6 +439,17 @@ struct kmem_cache { int obj_offset; int obj_size; #endif + /* + * We put nodelists[] at the end of kmem_cache, because we want to size + * this array to nr_node_ids slots instead of MAX_NUMNODES + * (see kmem_cache_init()) + * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache + * is statically defined, so we reserve the max number of nodes. + */ + struct kmem_list3 *nodelists[MAX_NUMNODES]; + /* + * Do not add fields after nodelists[] + */ }; #define CFLGS_OFF_SLAB (0x80000000UL) @@ -527,19 +533,22 @@ static int obj_size(struct kmem_cache *cachep) return cachep->obj_size; } -static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp) +static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); - return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD); + return (unsigned long long*) (objp + obj_offset(cachep) - + sizeof(unsigned long long)); } -static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp) +static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); if (cachep->flags & SLAB_STORE_USER) - return (unsigned long *)(objp + cachep->buffer_size - - 2 * BYTES_PER_WORD); - return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD); + return (unsigned long long *)(objp + cachep->buffer_size - + sizeof(unsigned long long) - + REDZONE_ALIGN); + return (unsigned long long *) (objp + cachep->buffer_size - + sizeof(unsigned long long)); } static void **dbg_userword(struct kmem_cache *cachep, void *objp) @@ -552,28 +561,13 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #define obj_offset(x) 0 #define obj_size(cachep) (cachep->buffer_size) -#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) -#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) +#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) +#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) #endif /* - * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp - * order. - */ -#if defined(CONFIG_LARGE_ALLOCS) -#define MAX_OBJ_ORDER 13 /* up to 32Mb */ -#define MAX_GFP_ORDER 13 /* up to 32Mb */ -#elif defined(CONFIG_MMU) -#define MAX_OBJ_ORDER 5 /* 32 pages */ -#define MAX_GFP_ORDER 5 /* 32 pages */ -#else -#define MAX_OBJ_ORDER 8 /* up to 1Mb */ -#define MAX_GFP_ORDER 8 /* up to 1Mb */ -#endif - -/* * Do not go above this order unless 0 objects fit into the slab. */ #define BREAK_GFP_ORDER_HI 1 @@ -592,8 +586,7 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) static inline struct kmem_cache *page_get_cache(struct page *page) { - if (unlikely(PageCompound(page))) - page = (struct page *)page_private(page); + page = compound_head(page); BUG_ON(!PageSlab(page)); return (struct kmem_cache *)page->lru.next; } @@ -605,21 +598,19 @@ static inline void page_set_slab(struct page *page, struct slab *slab) static inline struct slab *page_get_slab(struct page *page) { - if (unlikely(PageCompound(page))) - page = (struct page *)page_private(page); BUG_ON(!PageSlab(page)); return (struct slab *)page->lru.prev; } static inline struct kmem_cache *virt_to_cache(const void *obj) { - struct page *page = virt_to_page(obj); + struct page *page = virt_to_head_page(obj); return page_get_cache(page); } static inline struct slab *virt_to_slab(const void *obj) { - struct page *page = virt_to_page(obj); + struct page *page = virt_to_head_page(obj); return page_get_slab(page); } @@ -678,9 +669,6 @@ static struct kmem_cache cache_cache = { .shared = 1, .buffer_size = sizeof(struct kmem_cache), .name = "kmem_cache", -#if DEBUG - .obj_size = sizeof(struct kmem_cache), -#endif }; #define BAD_ALIEN_MAGIC 0x01020304ul @@ -741,8 +729,7 @@ static inline void init_lock_keys(void) #endif /* - * 1. Guard access to the cache-chain. - * 2. Protect sanity of cpu_online_map against cpu hotplug events + * Guard access to the cache-chain. */ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; @@ -785,6 +772,9 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, */ BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); #endif + if (!size) + return ZERO_SIZE_PTR; + while (size > csizep->cs_size) csizep++; @@ -793,8 +783,10 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, * has cs_{dma,}cachep==NULL. Thus no special case * for large kmalloc calls required. */ +#ifdef CONFIG_ZONE_DMA if (unlikely(gfpflags & GFP_DMA)) return csizep->cs_dmacachep; +#endif return csizep->cs_cachep; } @@ -869,7 +861,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, *left_over = slab_size - nr_objs*buffer_size - mgmt_size; } -#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) +#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) @@ -888,6 +880,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, */ static int use_alien_caches __read_mostly = 1; +static int numa_platform __read_mostly = 1; static int __init noaliencache_setup(char *s) { use_alien_caches = 0; @@ -919,12 +912,6 @@ static void next_reap_node(void) { int node = __get_cpu_var(reap_node); - /* - * Also drain per cpu pages on remote zones - */ - if (node != numa_node_id()) - drain_node_pages(node); - node = next_node(node, node_online_map); if (unlikely(node >= MAX_NUMNODES)) node = first_node(node_online_map); @@ -943,7 +930,7 @@ static void next_reap_node(void) * the CPUs getting into lockstep and contending for the global cache chain * lock. */ -static void __devinit start_cpu_timer(int cpu) +static void __cpuinit start_cpu_timer(int cpu) { struct delayed_work *reap_work = &per_cpu(reap_work, cpu); @@ -1040,7 +1027,7 @@ static void *alternate_node_alloc(struct kmem_cache *, gfp_t); static struct array_cache **alloc_alien_cache(int node, int limit) { struct array_cache **ac_ptr; - int memsize = sizeof(void *) * MAX_NUMNODES; + int memsize = sizeof(void *) * nr_node_ids; int i; if (limit > 1) @@ -1054,7 +1041,7 @@ static struct array_cache **alloc_alien_cache(int node, int limit) } ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); if (!ac_ptr[i]) { - for (i--; i <= 0; i--) + for (i--; i >= 0; i--) kfree(ac_ptr[i]); kfree(ac_ptr); return NULL; @@ -1144,7 +1131,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) * Make sure we are not freeing a object from another node to the array * cache on this cpu. */ - if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches)) + if (likely(slabp->nodeid == node)) return 0; l3 = cachep->nodelists[node]; @@ -1167,114 +1154,209 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } #endif -static int __cpuinit cpuup_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static void __cpuinit cpuup_canceled(long cpu) { - long cpu = (long)hcpu; struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); - int memsize = sizeof(struct kmem_list3); + node_to_cpumask_ptr(mask, node); - switch (action) { - case CPU_UP_PREPARE: - mutex_lock(&cache_chain_mutex); - /* - * We need to do this right in the beginning since - * alloc_arraycache's are going to use this list. - * kmalloc_node allows us to add the slab to the right - * kmem_list3 and not this cpu's kmem_list3 - */ + list_for_each_entry(cachep, &cache_chain, next) { + struct array_cache *nc; + struct array_cache *shared; + struct array_cache **alien; - list_for_each_entry(cachep, &cache_chain, next) { - /* - * Set up the size64 kmemlist for cpu before we can - * begin anything. Make sure some other cpu on this - * node has not already allocated this - */ - if (!cachep->nodelists[node]) { - l3 = kmalloc_node(memsize, GFP_KERNEL, node); - if (!l3) - goto bad; - kmem_list3_init(l3); - l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - - /* - * The l3s don't come and go as CPUs come and - * go. cache_chain_mutex is sufficient - * protection here. - */ - cachep->nodelists[node] = l3; - } + /* cpu is dead; no one can alloc from it. */ + nc = cachep->array[cpu]; + cachep->array[cpu] = NULL; + l3 = cachep->nodelists[node]; + + if (!l3) + goto free_array_cache; + + spin_lock_irq(&l3->list_lock); + + /* Free limit for this kmem_list3 */ + l3->free_limit -= cachep->batchcount; + if (nc) + free_block(cachep, nc->entry, nc->avail, node); - spin_lock_irq(&cachep->nodelists[node]->list_lock); - cachep->nodelists[node]->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->nodelists[node]->list_lock); + if (!cpus_empty(*mask)) { + spin_unlock_irq(&l3->list_lock); + goto free_array_cache; } + shared = l3->shared; + if (shared) { + free_block(cachep, shared->entry, + shared->avail, node); + l3->shared = NULL; + } + + alien = l3->alien; + l3->alien = NULL; + + spin_unlock_irq(&l3->list_lock); + + kfree(shared); + if (alien) { + drain_alien_cache(cachep, alien); + free_alien_cache(alien); + } +free_array_cache: + kfree(nc); + } + /* + * In the previous loop, all the objects were freed to + * the respective cache's slabs, now we can go ahead and + * shrink each nodelist to its limit. + */ + list_for_each_entry(cachep, &cache_chain, next) { + l3 = cachep->nodelists[node]; + if (!l3) + continue; + drain_freelist(cachep, l3, l3->free_objects); + } +} + +static int __cpuinit cpuup_prepare(long cpu) +{ + struct kmem_cache *cachep; + struct kmem_list3 *l3 = NULL; + int node = cpu_to_node(cpu); + const int memsize = sizeof(struct kmem_list3); + + /* + * We need to do this right in the beginning since + * alloc_arraycache's are going to use this list. + * kmalloc_node allows us to add the slab to the right + * kmem_list3 and not this cpu's kmem_list3 + */ + + list_for_each_entry(cachep, &cache_chain, next) { /* - * Now we can go ahead with allocating the shared arrays and - * array caches + * Set up the size64 kmemlist for cpu before we can + * begin anything. Make sure some other cpu on this + * node has not already allocated this */ - list_for_each_entry(cachep, &cache_chain, next) { - struct array_cache *nc; - struct array_cache *shared; - struct array_cache **alien = NULL; - - nc = alloc_arraycache(node, cachep->limit, - cachep->batchcount); - if (!nc) - goto bad; - shared = alloc_arraycache(node, - cachep->shared * cachep->batchcount, - 0xbaadf00d); - if (!shared) + if (!cachep->nodelists[node]) { + l3 = kmalloc_node(memsize, GFP_KERNEL, node); + if (!l3) goto bad; + kmem_list3_init(l3); + l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - if (use_alien_caches) { - alien = alloc_alien_cache(node, cachep->limit); - if (!alien) - goto bad; - } - cachep->array[cpu] = nc; - l3 = cachep->nodelists[node]; - BUG_ON(!l3); + /* + * The l3s don't come and go as CPUs come and + * go. cache_chain_mutex is sufficient + * protection here. + */ + cachep->nodelists[node] = l3; + } - spin_lock_irq(&l3->list_lock); - if (!l3->shared) { - /* - * We are serialised from CPU_DEAD or - * CPU_UP_CANCELLED by the cpucontrol lock - */ - l3->shared = shared; - shared = NULL; + spin_lock_irq(&cachep->nodelists[node]->list_lock); + cachep->nodelists[node]->free_limit = + (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; + spin_unlock_irq(&cachep->nodelists[node]->list_lock); + } + + /* + * Now we can go ahead with allocating the shared arrays and + * array caches + */ + list_for_each_entry(cachep, &cache_chain, next) { + struct array_cache *nc; + struct array_cache *shared = NULL; + struct array_cache **alien = NULL; + + nc = alloc_arraycache(node, cachep->limit, + cachep->batchcount); + if (!nc) + goto bad; + if (cachep->shared) { + shared = alloc_arraycache(node, + cachep->shared * cachep->batchcount, + 0xbaadf00d); + if (!shared) { + kfree(nc); + goto bad; } -#ifdef CONFIG_NUMA - if (!l3->alien) { - l3->alien = alien; - alien = NULL; + } + if (use_alien_caches) { + alien = alloc_alien_cache(node, cachep->limit); + if (!alien) { + kfree(shared); + kfree(nc); + goto bad; } -#endif - spin_unlock_irq(&l3->list_lock); - kfree(shared); - free_alien_cache(alien); } + cachep->array[cpu] = nc; + l3 = cachep->nodelists[node]; + BUG_ON(!l3); + + spin_lock_irq(&l3->list_lock); + if (!l3->shared) { + /* + * We are serialised from CPU_DEAD or + * CPU_UP_CANCELLED by the cpucontrol lock + */ + l3->shared = shared; + shared = NULL; + } +#ifdef CONFIG_NUMA + if (!l3->alien) { + l3->alien = alien; + alien = NULL; + } +#endif + spin_unlock_irq(&l3->list_lock); + kfree(shared); + free_alien_cache(alien); + } + return 0; +bad: + cpuup_canceled(cpu); + return -ENOMEM; +} + +static int __cpuinit cpuup_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + int err = 0; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + mutex_lock(&cache_chain_mutex); + err = cpuup_prepare(cpu); + mutex_unlock(&cache_chain_mutex); break; case CPU_ONLINE: - mutex_unlock(&cache_chain_mutex); + case CPU_ONLINE_FROZEN: start_cpu_timer(cpu); break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DOWN_PREPARE: - mutex_lock(&cache_chain_mutex); - break; - case CPU_DOWN_FAILED: - mutex_unlock(&cache_chain_mutex); - break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* + * Shutdown cache reaper. Note that the cache_chain_mutex is + * held so that if cache_reap() is invoked it cannot do + * anything expensive but will only modify reap_work + * and reschedule the timer. + */ + cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); + /* Now the cache_reaper is guaranteed to be not running. */ + per_cpu(reap_work, cpu).work.func = NULL; + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + start_cpu_timer(cpu); + break; case CPU_DEAD: + case CPU_DEAD_FROZEN: /* * Even if all the cpus of a node are down, we don't free the * kmem_list3 of any cache. This to avoid a race between @@ -1283,73 +1365,16 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, * structure is usually allocated from kmem_cache_create() and * gets destroyed at kmem_cache_destroy(). */ - /* fall thru */ + /* fall through */ #endif case CPU_UP_CANCELED: - list_for_each_entry(cachep, &cache_chain, next) { - struct array_cache *nc; - struct array_cache *shared; - struct array_cache **alien; - cpumask_t mask; - - mask = node_to_cpumask(node); - /* cpu is dead; no one can alloc from it. */ - nc = cachep->array[cpu]; - cachep->array[cpu] = NULL; - l3 = cachep->nodelists[node]; - - if (!l3) - goto free_array_cache; - - spin_lock_irq(&l3->list_lock); - - /* Free limit for this kmem_list3 */ - l3->free_limit -= cachep->batchcount; - if (nc) - free_block(cachep, nc->entry, nc->avail, node); - - if (!cpus_empty(mask)) { - spin_unlock_irq(&l3->list_lock); - goto free_array_cache; - } - - shared = l3->shared; - if (shared) { - free_block(cachep, l3->shared->entry, - l3->shared->avail, node); - l3->shared = NULL; - } - - alien = l3->alien; - l3->alien = NULL; - - spin_unlock_irq(&l3->list_lock); - - kfree(shared); - if (alien) { - drain_alien_cache(cachep, alien); - free_alien_cache(alien); - } -free_array_cache: - kfree(nc); - } - /* - * In the previous loop, all the objects were freed to - * the respective cache's slabs, now we can go ahead and - * shrink each nodelist to its limit. - */ - list_for_each_entry(cachep, &cache_chain, next) { - l3 = cachep->nodelists[node]; - if (!l3) - continue; - drain_freelist(cachep, l3, l3->free_objects); - } + case CPU_UP_CANCELED_FROZEN: + mutex_lock(&cache_chain_mutex); + cpuup_canceled(cpu); mutex_unlock(&cache_chain_mutex); break; } - return NOTIFY_OK; -bad: - return NOTIFY_BAD; + return err ? NOTIFY_BAD : NOTIFY_OK; } static struct notifier_block __cpuinitdata cpucache_notifier = { @@ -1380,6 +1405,22 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, } /* + * For setting up all the kmem_list3s for cache whose buffer_size is same as + * size of kmem_list3. + */ +static void __init set_up_list3s(struct kmem_cache *cachep, int index) +{ + int node; + + for_each_online_node(node) { + cachep->nodelists[node] = &initkmem_list3[index + node]; + cachep->nodelists[node]->next_reap = jiffies + + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + } +} + +/* * Initialisation. Called after the page allocator have been initialised and * before smp_init(). */ @@ -1392,11 +1433,17 @@ void __init kmem_cache_init(void) int order; int node; + if (num_possible_nodes() == 1) { + use_alien_caches = 0; + numa_platform = 0; + } + for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); if (i < MAX_NUMNODES) cache_cache.nodelists[i] = NULL; } + set_up_list3s(&cache_cache, CACHE_CACHE); /* * Fragmentation resistance on low memory - only use bigger @@ -1432,8 +1479,17 @@ void __init kmem_cache_init(void) list_add(&cache_cache.next, &cache_chain); cache_cache.colour_off = cache_line_size(); cache_cache.array[smp_processor_id()] = &initarray_cache.cache; - cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE]; + cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; + /* + * struct kmem_cache size depends on nr_node_ids, which + * can be less than MAX_NUMNODES. + */ + cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + + nr_node_ids * sizeof(struct kmem_list3 *); +#if DEBUG + cache_cache.obj_size = cache_cache.buffer_size; +#endif cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); cache_cache.reciprocal_buffer_size = @@ -1465,7 +1521,7 @@ void __init kmem_cache_init(void) sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); if (INDEX_AC != INDEX_L3) { sizes[INDEX_L3].cs_cachep = @@ -1473,7 +1529,7 @@ void __init kmem_cache_init(void) sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); } slab_early_init = 0; @@ -1491,15 +1547,17 @@ void __init kmem_cache_init(void) sizes->cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); } - - sizes->cs_dmacachep = kmem_cache_create(names->name_dma, +#ifdef CONFIG_ZONE_DMA + sizes->cs_dmacachep = kmem_cache_create( + names->name_dma, sizes->cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC, - NULL, NULL); + NULL); +#endif sizes++; names++; } @@ -1541,10 +1599,9 @@ void __init kmem_cache_init(void) { int nid; - /* Replace the static kmem_list3 structures for the boot cpu */ - init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); - for_each_online_node(nid) { + init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); + init_list(malloc_sizes[INDEX_AC].cs_cachep, &initkmem_list3[SIZE_AC + nid], nid); @@ -1619,6 +1676,8 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) #endif flags |= cachep->gfpflags; + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + flags |= __GFP_RECLAIMABLE; page = alloc_pages_node(nodeid, flags, cachep->gfporder); if (!page) @@ -1756,7 +1815,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) char *realobj; if (cachep->flags & SLAB_RED_ZONE) { - printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", + printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -1798,8 +1857,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print header */ if (lines == 0) { printk(KERN_ERR - "Slab corruption: start=%p, len=%d\n", - realobj, size); + "Slab corruption: %s start=%p, len=%d\n", + cachep->name, realobj, size); print_objinfo(cachep, objp, 0); } /* Hexdump the affected line */ @@ -1842,15 +1901,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) #endif #if DEBUG -/** - * slab_destroy_objs - destroy a slab and its objects - * @cachep: cache pointer being destroyed - * @slabp: slab pointer being destroyed - * - * Call the registered destructor for each object in a slab that is being - * destroyed. - */ -static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) { int i; for (i = 0; i < cachep->num; i++) { @@ -1876,20 +1927,11 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) slab_error(cachep, "end of a freed object " "was overwritten"); } - if (cachep->dtor && !(cachep->flags & SLAB_POISON)) - (cachep->dtor) (objp + obj_offset(cachep), cachep, 0); } } #else -static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) { - if (cachep->dtor) { - int i; - for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slabp, i); - (cachep->dtor) (objp, cachep, 0); - } - } } #endif @@ -1906,7 +1948,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) { void *addr = slabp->s_mem - slabp->colouroff; - slab_destroy_objs(cachep, slabp); + slab_destroy_debugcheck(cachep, slabp); if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { struct slab_rcu *slab_rcu; @@ -1921,22 +1963,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) } } -/* - * For setting up all the kmem_list3s for cache whose buffer_size is same as - * size of kmem_list3. - */ -static void set_up_list3s(struct kmem_cache *cachep, int index) -{ - int node; - - for_each_online_node(node) { - cachep->nodelists[node] = &initkmem_list3[index + node]; - cachep->nodelists[node]->next_reap = jiffies + - REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - } -} - static void __kmem_cache_destroy(struct kmem_cache *cachep) { int i; @@ -1978,7 +2004,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, size_t left_over = 0; int gfporder; - for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { + for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { unsigned int num; size_t remainder; @@ -2028,7 +2054,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, return left_over; } -static int setup_cpu_cache(struct kmem_cache *cachep) +static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) { if (g_cpucache_up == FULL) return enable_cpucache(cachep); @@ -2089,12 +2115,10 @@ static int setup_cpu_cache(struct kmem_cache *cachep) * @align: The required alignment for the objects. * @flags: SLAB flags * @ctor: A constructor for the objects. - * @dtor: A destructor for the objects. * * Returns a ptr to the cache on success, NULL on failure. * Cannot be called within a int, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache - * and the @dtor is run before the pages are handed back. + * The @ctor is run when new pages are allocated by the cache. * * @name must be valid until the cache is destroyed. This implies that * the module calling this has to destroy the cache before getting unloaded. @@ -2113,9 +2137,7 @@ static int setup_cpu_cache(struct kmem_cache *cachep) */ struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, - void (*ctor)(void*, struct kmem_cache *, unsigned long), - void (*dtor)(void*, struct kmem_cache *, unsigned long)) + unsigned long flags, void (*ctor)(void *)) { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; @@ -2124,8 +2146,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, * Sanity checks... these are all serious usage bugs. */ if (!name || in_interrupt() || (size < BYTES_PER_WORD) || - (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { - printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, + size > KMALLOC_MAX_SIZE) { + printk(KERN_ERR "%s: Early error in slab %s\n", __func__, name); BUG(); } @@ -2134,6 +2156,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * We use cache_chain_mutex to ensure a consistent view of * cpu_online_map as well. Please see cpuup_callback */ + get_online_cpus(); mutex_lock(&cache_chain_mutex); list_for_each_entry(pc, &cache_chain, next) { @@ -2147,13 +2170,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ res = probe_kernel_address(pc->name, tmp); if (res) { - printk("SLAB: cache with size %d has lost its name\n", + printk(KERN_ERR + "SLAB: cache with size %d has lost its name\n", pc->buffer_size); continue; } if (!strcmp(pc->name, name)) { - printk("kmem_cache_create: duplicate cache %s\n", name); + printk(KERN_ERR + "kmem_cache_create: duplicate cache %s\n", name); dump_stack(); goto oops; } @@ -2161,12 +2186,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, #if DEBUG WARN_ON(strchr(name, ' ')); /* It confuses parsers */ - if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { - /* No constructor, but inital state check requested */ - printk(KERN_ERR "%s: No con, but init state check " - "requested - %s\n", __FUNCTION__, name); - flags &= ~SLAB_DEBUG_INITIAL; - } #if FORCED_DEBUG /* * Enable redzoning and last user accounting, except for caches with @@ -2174,7 +2193,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, * above the next power of two: caches with object sizes just above a * power of two have a significant amount of internal fragmentation. */ - if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) + if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + + 2 * sizeof(unsigned long long))) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; @@ -2182,9 +2202,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (flags & SLAB_DESTROY_BY_RCU) BUG_ON(flags & SLAB_POISON); #endif - if (flags & SLAB_DESTROY_BY_RCU) - BUG_ON(dtor); - /* * Always checks flags, a caller might be expecting debug support which * isn't available. @@ -2218,13 +2235,21 @@ kmem_cache_create (const char *name, size_t size, size_t align, } /* - * Redzoning and user store require word alignment. Note this will be - * overridden by architecture or caller mandated alignment if either - * is greater than BYTES_PER_WORD. + * Redzoning and user store require word alignment or possibly larger. + * Note this will be overridden by architecture or caller mandated + * alignment if either is greater than BYTES_PER_WORD. */ - if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) + if (flags & SLAB_STORE_USER) ralign = BYTES_PER_WORD; + if (flags & SLAB_RED_ZONE) { + ralign = REDZONE_ALIGN; + /* If redzoning, ensure that the second redzone is suitably + * aligned, by adjusting the object size accordingly. */ + size += REDZONE_ALIGN - 1; + size &= ~(REDZONE_ALIGN - 1); + } + /* 2) arch mandated alignment */ if (ralign < ARCH_SLAB_MINALIGN) { ralign = ARCH_SLAB_MINALIGN; @@ -2234,7 +2259,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, ralign = align; } /* disable debug if necessary */ - if (ralign > BYTES_PER_WORD) + if (ralign > __alignof__(unsigned long long)) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); /* * 4) Store it. @@ -2255,14 +2280,18 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ if (flags & SLAB_RED_ZONE) { /* add space for red zone words */ - cachep->obj_offset += BYTES_PER_WORD; - size += 2 * BYTES_PER_WORD; + cachep->obj_offset += sizeof(unsigned long long); + size += 2 * sizeof(unsigned long long); } if (flags & SLAB_STORE_USER) { /* user store requires one word storage behind the end of - * the real object. + * the real object. But if the second red zone needs to be + * aligned to 64 bits, we must allow that much space. */ - size += BYTES_PER_WORD; + if (flags & SLAB_RED_ZONE) + size += REDZONE_ALIGN; + else + size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size @@ -2290,7 +2319,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, left_over = calculate_slab_order(cachep, size, align, flags); if (!cachep->num) { - printk("kmem_cache_create: couldn't create cache %s.\n", name); + printk(KERN_ERR + "kmem_cache_create: couldn't create cache %s.\n", name); kmem_cache_free(&cache_cache, cachep); cachep = NULL; goto oops; @@ -2321,7 +2351,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->slab_size = slab_size; cachep->flags = flags; cachep->gfpflags = 0; - if (flags & SLAB_CACHE_DMA) + if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) cachep->gfpflags |= GFP_DMA; cachep->buffer_size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); @@ -2335,10 +2365,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, * this should not happen at all. * But leave a BUG_ON for some lucky dude. */ - BUG_ON(!cachep->slabp_cache); + BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); } cachep->ctor = ctor; - cachep->dtor = dtor; cachep->name = name; if (setup_cpu_cache(cachep)) { @@ -2354,6 +2383,7 @@ oops: panic("kmem_cache_create(): failed to create slab `%s'\n", name); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); return cachep; } EXPORT_SYMBOL(kmem_cache_create); @@ -2415,7 +2445,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep) struct kmem_list3 *l3; int node; - on_each_cpu(do_drain, cachep, 1, 1); + on_each_cpu(do_drain, cachep, 1); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -2505,9 +2535,11 @@ int kmem_cache_shrink(struct kmem_cache *cachep) int ret; BUG_ON(!cachep || in_interrupt()); + get_online_cpus(); mutex_lock(&cache_chain_mutex); ret = __cache_shrink(cachep); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); return ret; } EXPORT_SYMBOL(kmem_cache_shrink); @@ -2516,7 +2548,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); * kmem_cache_destroy - delete a cache * @cachep: the cache to destroy * - * Remove a struct kmem_cache object from the slab cache. + * Remove a &struct kmem_cache object from the slab cache. * * It is expected this function will be called by a module when it is * unloaded. This will remove the cache completely, and avoid a duplicate @@ -2533,6 +2565,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) BUG_ON(!cachep || in_interrupt()); /* Find the cache in the chain of caches. */ + get_online_cpus(); mutex_lock(&cache_chain_mutex); /* * the chain is never empty, cache_cache is never destroyed @@ -2542,6 +2575,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) slab_error(cachep, "Can't free all objects"); list_add(&cachep->next, &cache_chain); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); return; } @@ -2550,6 +2584,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) __kmem_cache_destroy(cachep); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2584,6 +2619,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, slabp->colouroff = colour_off; slabp->s_mem = objp + colour_off; slabp->nodeid = nodeid; + slabp->free = 0; return slabp; } @@ -2593,7 +2629,7 @@ static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) } static void cache_init_objs(struct kmem_cache *cachep, - struct slab *slabp, unsigned long ctor_flags) + struct slab *slabp) { int i; @@ -2616,8 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep, * They must also be threaded. */ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) - cachep->ctor(objp + obj_offset(cachep), cachep, - ctor_flags); + cachep->ctor(objp + obj_offset(cachep)); if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) @@ -2633,20 +2668,21 @@ static void cache_init_objs(struct kmem_cache *cachep, cachep->buffer_size / PAGE_SIZE, 0); #else if (cachep->ctor) - cachep->ctor(objp, cachep, ctor_flags); + cachep->ctor(objp); #endif slab_bufctl(slabp)[i] = i + 1; } slab_bufctl(slabp)[i - 1] = BUFCTL_END; - slabp->free = 0; } static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) { - if (flags & GFP_DMA) - BUG_ON(!(cachep->gfpflags & GFP_DMA)); - else - BUG_ON(cachep->gfpflags & GFP_DMA); + if (CONFIG_ZONE_DMA_FLAG) { + if (flags & GFP_DMA) + BUG_ON(!(cachep->gfpflags & GFP_DMA)); + else + BUG_ON(cachep->gfpflags & GFP_DMA); + } } static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, @@ -2720,25 +2756,14 @@ static int cache_grow(struct kmem_cache *cachep, struct slab *slabp; size_t offset; gfp_t local_flags; - unsigned long ctor_flags; struct kmem_list3 *l3; /* * Be lazy and only check for valid flags here, keeping it out of the * critical path in kmem_cache_alloc(). */ - BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW)); - if (flags & __GFP_NO_GROW) - return 0; - - ctor_flags = SLAB_CTOR_CONSTRUCTOR; - local_flags = (flags & GFP_LEVEL_MASK); - if (!(local_flags & __GFP_WAIT)) - /* - * Not allowed to sleep. Need to tell a constructor about - * this - it might need to know... - */ - ctor_flags |= SLAB_CTOR_ATOMIC; + BUG_ON(flags & GFP_SLAB_BUG_MASK); + local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); /* Take the l3 list lock to change the colour_next on this node */ check_irq_off(); @@ -2770,20 +2795,19 @@ static int cache_grow(struct kmem_cache *cachep, * 'nodeid'. */ if (!objp) - objp = kmem_getpages(cachep, flags, nodeid); + objp = kmem_getpages(cachep, local_flags, nodeid); if (!objp) goto failed; /* Get slab management. */ slabp = alloc_slabmgmt(cachep, objp, offset, - local_flags & ~GFP_THISNODE, nodeid); + local_flags & ~GFP_CONSTRAINT_MASK, nodeid); if (!slabp) goto opps1; - slabp->nodeid = nodeid; slab_map_pages(cachep, slabp, objp); - cache_init_objs(cachep, slabp, ctor_flags); + cache_init_objs(cachep, slabp); if (local_flags & __GFP_WAIT) local_irq_disable(); @@ -2810,28 +2834,19 @@ failed: * Perform extra freeing checks: * - detect bad pointers. * - POISON/RED_ZONE checking - * - destructor calls, for caches with POISON+dtor */ static void kfree_debugcheck(const void *objp) { - struct page *page; - if (!virt_addr_valid(objp)) { printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", (unsigned long)objp); BUG(); } - page = virt_to_page(objp); - if (!PageSlab(page)) { - printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", - (unsigned long)objp); - BUG(); - } } static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) { - unsigned long redzone1, redzone2; + unsigned long long redzone1, redzone2; redzone1 = *dbg_redzone1(cache, obj); redzone2 = *dbg_redzone2(cache, obj); @@ -2847,7 +2862,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) else slab_error(cache, "memory outside object was overwritten"); - printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n", + printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", obj, redzone1, redzone2); } @@ -2858,9 +2873,11 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, unsigned int objnr; struct slab *slabp; + BUG_ON(virt_to_cache(objp) != cachep); + objp -= obj_offset(cachep); kfree_debugcheck(objp); - page = virt_to_page(objp); + page = virt_to_head_page(objp); slabp = page_get_slab(page); @@ -2877,21 +2894,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); - if (cachep->flags & SLAB_DEBUG_INITIAL) { - /* - * Need to call the slab's constructor so the caller can - * perform a verify of its state (debugging). Called without - * the cache-lock held. - */ - cachep->ctor(objp + obj_offset(cachep), - cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); - } - if (cachep->flags & SLAB_POISON && cachep->dtor) { - /* we want to cache poison the object, - * call the destruction callback - */ - cachep->dtor(objp + obj_offset(cachep), cachep, 0); - } #ifdef CONFIG_DEBUG_SLAB_LEAK slab_bufctl(slabp)[objnr] = BUFCTL_FREE; #endif @@ -2951,11 +2953,10 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) struct array_cache *ac; int node; - node = numa_node_id(); - +retry: check_irq_off(); + node = numa_node_id(); ac = cpu_cache_get(cachep); -retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { /* @@ -2989,6 +2990,14 @@ retry: slabp = list_entry(entry, struct slab, list); check_slabp(cachep, slabp); check_spinlock_acquired(cachep); + + /* + * The slab was either on partial or free list so + * there must be at least one object available for + * allocation. + */ + BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num); + while (slabp->inuse < cachep->num && batchcount--) { STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); @@ -3064,7 +3073,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, slab_error(cachep, "double free, or memory outside" " object was overwritten"); printk(KERN_ERR - "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", + "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -3076,20 +3085,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, struct slab *slabp; unsigned objnr; - slabp = page_get_slab(virt_to_page(objp)); + slabp = page_get_slab(virt_to_head_page(objp)); objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; } #endif objp += obj_offset(cachep); - if (cachep->ctor && cachep->flags & SLAB_POISON) { - unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; - - if (!(flags & __GFP_WAIT)) - ctor_flags |= SLAB_CTOR_ATOMIC; - - cachep->ctor(objp, cachep, ctor_flags); - } + if (cachep->ctor && cachep->flags & SLAB_POISON) + cachep->ctor(objp); #if ARCH_SLAB_MINALIGN if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", @@ -3144,7 +3147,7 @@ static int __init failslab_debugfs(void) struct dentry *dir; int err; - err = init_fault_attr_dentries(&failslab.attr, "failslab"); + err = init_fault_attr_dentries(&failslab.attr, "failslab"); if (err) return err; dir = failslab.attr.dentries.dir; @@ -3182,9 +3185,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) check_irq_off(); - if (should_failslab(cachep, flags)) - return NULL; - ac = cpu_cache_get(cachep); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); @@ -3197,35 +3197,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) return objp; } -static __always_inline void *__cache_alloc(struct kmem_cache *cachep, - gfp_t flags, void *caller) -{ - unsigned long save_flags; - void *objp = NULL; - - cache_alloc_debugcheck_before(cachep, flags); - - local_irq_save(save_flags); - - if (unlikely(NUMA_BUILD && - current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) - objp = alternate_node_alloc(cachep, flags); - - if (!objp) - objp = ____cache_alloc(cachep, flags); - /* - * We may just have run out of memory on the local node. - * ____cache_alloc_node() knows how to locate memory on other nodes - */ - if (NUMA_BUILD && !objp) - objp = ____cache_alloc_node(cachep, flags, numa_node_id()); - local_irq_restore(save_flags); - objp = cache_alloc_debugcheck_after(cachep, flags, objp, - caller); - prefetchw(objp); - return objp; -} - #ifdef CONFIG_NUMA /* * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. @@ -3257,28 +3228,38 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) * allocator to do its reclaim / fallback magic. We then insert the * slab into the proper nodelist and then allocate from it. */ -void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) +static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) { - struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy)) - ->node_zonelists[gfp_zone(flags)]; - struct zone **z; + struct zonelist *zonelist; + gfp_t local_flags; + struct zoneref *z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; int nid; - gfp_t local_flags = (flags & GFP_LEVEL_MASK); + + if (flags & __GFP_THISNODE) + return NULL; + + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); retry: /* * Look through allowed nodes for objects available * from existing per node queues. */ - for (z = zonelist->zones; *z && !obj; z++) { - nid = zone_to_nid(*z); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + nid = zone_to_nid(zone); - if (cpuset_zone_allowed_hardwall(*z, flags) && + if (cpuset_zone_allowed_hardwall(zone, flags) && cache->nodelists[nid] && - cache->nodelists[nid]->free_objects) + cache->nodelists[nid]->free_objects) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); + if (obj) + break; + } } if (!obj) { @@ -3291,7 +3272,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, flags, -1); + obj = kmem_getpages(cache, local_flags, -1); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { @@ -3310,7 +3291,7 @@ retry: */ goto retry; } else { - kmem_freepages(cache, obj); + /* cache_grow already freed obj */ obj = NULL; } } @@ -3374,16 +3355,122 @@ must_grow: if (x) goto retry; - if (!(flags & __GFP_THISNODE)) - /* Unable to grow the cache. Fall back to other nodes. */ - return fallback_alloc(cachep, flags); - - return NULL; + return fallback_alloc(cachep, flags); done: return obj; } -#endif + +/** + * kmem_cache_alloc_node - Allocate an object on the specified node + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * @nodeid: node number of the target node. + * @caller: return address of caller, used for debug information + * + * Identical to kmem_cache_alloc but it will allocate memory on the given + * node, which can improve the performance for cpu bound structures. + * + * Fallback to other node is possible if __GFP_THISNODE is not set. + */ +static __always_inline void * +__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, + void *caller) +{ + unsigned long save_flags; + void *ptr; + + if (should_failslab(cachep, flags)) + return NULL; + + cache_alloc_debugcheck_before(cachep, flags); + local_irq_save(save_flags); + + if (unlikely(nodeid == -1)) + nodeid = numa_node_id(); + + if (unlikely(!cachep->nodelists[nodeid])) { + /* Node not bootstrapped yet */ + ptr = fallback_alloc(cachep, flags); + goto out; + } + + if (nodeid == numa_node_id()) { + /* + * Use the locally cached objects if possible. + * However ____cache_alloc does not allow fallback + * to other nodes. It may fail while we still have + * objects on other nodes available. + */ + ptr = ____cache_alloc(cachep, flags); + if (ptr) + goto out; + } + /* ___cache_alloc_node can fall back to other nodes */ + ptr = ____cache_alloc_node(cachep, flags, nodeid); + out: + local_irq_restore(save_flags); + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); + + if (unlikely((flags & __GFP_ZERO) && ptr)) + memset(ptr, 0, obj_size(cachep)); + + return ptr; +} + +static __always_inline void * +__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) +{ + void *objp; + + if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { + objp = alternate_node_alloc(cache, flags); + if (objp) + goto out; + } + objp = ____cache_alloc(cache, flags); + + /* + * We may just have run out of memory on the local node. + * ____cache_alloc_node() knows how to locate memory on other nodes + */ + if (!objp) + objp = ____cache_alloc_node(cache, flags, numa_node_id()); + + out: + return objp; +} +#else + +static __always_inline void * +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + return ____cache_alloc(cachep, flags); +} + +#endif /* CONFIG_NUMA */ + +static __always_inline void * +__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) +{ + unsigned long save_flags; + void *objp; + + if (should_failslab(cachep, flags)) + return NULL; + + cache_alloc_debugcheck_before(cachep, flags); + local_irq_save(save_flags); + objp = __do_cache_alloc(cachep, flags); + local_irq_restore(save_flags); + objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); + prefetchw(objp); + + if (unlikely((flags & __GFP_ZERO) && objp)) + memset(objp, 0, obj_size(cachep)); + + return objp; +} /* * Caller needs to acquire correct kmem_list's list_lock @@ -3494,7 +3581,14 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); - if (cache_free_alien(cachep, objp)) + /* + * Skip calling cache_free_alien() when the platform is not numa. + * This will avoid cache misses that happen while accessing slabp (which + * is per page memory reference) to get nodeid. Instead use a global + * variable to skip the call, which is mostly likely to be present in + * the cache. + */ + if (numa_platform && cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { @@ -3523,29 +3617,11 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); /** - * kmem_cache_zalloc - Allocate an object. The memory is set to zero. - * @cache: The cache to allocate from. - * @flags: See kmalloc(). - * - * Allocate an object from this cache and set the allocated memory to zero. - * The flags are only relevant if the cache has no available objects. - */ -void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags) -{ - void *ret = __cache_alloc(cache, flags, __builtin_return_address(0)); - if (ret) - memset(ret, 0, obj_size(cache)); - return ret; -} -EXPORT_SYMBOL(kmem_cache_zalloc); - -/** - * kmem_ptr_validate - check if an untrusted pointer might - * be a slab entry. + * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. * @cachep: the cache we're checking against * @ptr: pointer to validate * - * This verifies that the untrusted pointer looks sane: + * This verifies that the untrusted pointer looks sane; * it is _not_ a guarantee that the pointer is actually * part of the slab cache in question, but it at least * validates that the pointer can be dereferenced and @@ -3582,57 +3658,6 @@ out: } #ifdef CONFIG_NUMA -/** - * kmem_cache_alloc_node - Allocate an object on the specified node - * @cachep: The cache to allocate from. - * @flags: See kmalloc(). - * @nodeid: node number of the target node. - * @caller: return address of caller, used for debug information - * - * Identical to kmem_cache_alloc but it will allocate memory on the given - * node, which can improve the performance for cpu bound structures. - * - * Fallback to other node is possible if __GFP_THISNODE is not set. - */ -static __always_inline void * -__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, - int nodeid, void *caller) -{ - unsigned long save_flags; - void *ptr = NULL; - - cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); - - if (unlikely(nodeid == -1)) - nodeid = numa_node_id(); - - if (likely(cachep->nodelists[nodeid])) { - if (nodeid == numa_node_id()) { - /* - * Use the locally cached objects if possible. - * However ____cache_alloc does not allow fallback - * to other nodes. It may fail while we still have - * objects on other nodes available. - */ - ptr = ____cache_alloc(cachep, flags); - } - if (!ptr) { - /* ___cache_alloc_node can fall back to other nodes */ - ptr = ____cache_alloc_node(cachep, flags, nodeid); - } - } else { - /* Node not bootstrapped yet */ - if (!(flags & __GFP_THISNODE)) - ptr = fallback_alloc(cachep, flags); - } - - local_irq_restore(save_flags); - ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - - return ptr; -} - void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { return __cache_alloc_node(cachep, flags, nodeid, @@ -3646,8 +3671,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) struct kmem_cache *cachep; cachep = kmem_find_general_cachep(size, flags); - if (unlikely(cachep == NULL)) - return NULL; + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; return kmem_cache_alloc_node(cachep, flags, node); } @@ -3691,8 +3716,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, * functions. */ cachep = __find_general_cachep(size, flags); - if (unlikely(cachep == NULL)) - return NULL; + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; return __cache_alloc(cachep, flags, caller); } @@ -3730,9 +3755,10 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; - BUG_ON(virt_to_cache(objp) != cachep); - local_irq_save(flags); + debug_check_no_locks_freed(objp, obj_size(cachep)); + if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, obj_size(cachep)); __cache_free(cachep, objp); local_irq_restore(flags); } @@ -3752,12 +3778,13 @@ void kfree(const void *objp) struct kmem_cache *c; unsigned long flags; - if (unlikely(!objp)) + if (unlikely(ZERO_OR_NULL_PTR(objp))) return; local_irq_save(flags); kfree_debugcheck(objp); c = virt_to_cache(objp); debug_check_no_locks_freed(objp, obj_size(c)); + debug_check_no_obj_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); } @@ -3776,7 +3803,7 @@ const char *kmem_cache_name(struct kmem_cache *cachep) EXPORT_SYMBOL_GPL(kmem_cache_name); /* - * This initializes kmem_list3 or resizes varioius caches for all nodes. + * This initializes kmem_list3 or resizes various caches for all nodes. */ static int alloc_kmemlist(struct kmem_cache *cachep) { @@ -3793,12 +3820,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep) goto fail; } - new_shared = alloc_arraycache(node, + new_shared = NULL; + if (cachep->shared) { + new_shared = alloc_arraycache(node, cachep->shared*cachep->batchcount, 0xbaadf00d); - if (!new_shared) { - free_alien_cache(new_alien); - goto fail; + if (!new_shared) { + free_alien_cache(new_alien); + goto fail; + } } l3 = cachep->nodelists[node]; @@ -3900,7 +3930,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, } new->cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); + on_each_cpu(do_ccupdate_local, (void *)new, 1); check_irq_on(); cachep->batchcount = batchcount; @@ -3956,10 +3986,8 @@ static int enable_cpucache(struct kmem_cache *cachep) * to a larger limit. Thus disabled by default. */ shared = 0; -#ifdef CONFIG_SMP - if (cachep->buffer_size <= PAGE_SIZE) + if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) shared = 8; -#endif #if DEBUG /* @@ -4007,7 +4035,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, /** * cache_reap - Reclaim memory from caches. - * @unused: unused parameter + * @w: work descriptor * * Called from workqueue/eventd every few seconds. * Purpose: @@ -4017,18 +4045,17 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, * If we cannot acquire the cache chain mutex then just give up - we'll try * again on the next iteration. */ -static void cache_reap(struct work_struct *unused) +static void cache_reap(struct work_struct *w) { struct kmem_cache *searchp; struct kmem_list3 *l3; int node = numa_node_id(); + struct delayed_work *work = + container_of(w, struct delayed_work, work); - if (!mutex_trylock(&cache_chain_mutex)) { + if (!mutex_trylock(&cache_chain_mutex)) /* Give up. Setup the next iteration. */ - schedule_delayed_work(&__get_cpu_var(reap_work), - round_jiffies_relative(REAPTIMEOUT_CPUC)); - return; - } + goto out; list_for_each_entry(searchp, &cache_chain, next) { check_irq_on(); @@ -4070,13 +4097,12 @@ next: check_irq_on(); mutex_unlock(&cache_chain_mutex); next_reap_node(); - refresh_cpu_vm_stats(smp_processor_id()); +out: /* Set up the next iteration */ - schedule_delayed_work(&__get_cpu_var(reap_work), - round_jiffies_relative(REAPTIMEOUT_CPUC)); + schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); } -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_SLABINFO static void print_slabinfo_header(struct seq_file *m) { @@ -4104,26 +4130,17 @@ static void print_slabinfo_header(struct seq_file *m) static void *s_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; - struct list_head *p; mutex_lock(&cache_chain_mutex); if (!n) print_slabinfo_header(m); - p = cache_chain.next; - while (n--) { - p = p->next; - if (p == &cache_chain) - return NULL; - } - return list_entry(p, struct kmem_cache, next); + + return seq_list_start(&cache_chain, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - struct kmem_cache *cachep = p; - ++*pos; - return cachep->next.next == &cache_chain ? - NULL : list_entry(cachep->next.next, struct kmem_cache, next); + return seq_list_next(p, &cache_chain, pos); } static void s_stop(struct seq_file *m, void *p) @@ -4133,7 +4150,7 @@ static void s_stop(struct seq_file *m, void *p) static int s_show(struct seq_file *m, void *p) { - struct kmem_cache *cachep = p; + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); struct slab *slabp; unsigned long active_objs; unsigned long num_objs; @@ -4302,17 +4319,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, static void *leaks_start(struct seq_file *m, loff_t *pos) { - loff_t n = *pos; - struct list_head *p; - mutex_lock(&cache_chain_mutex); - p = cache_chain.next; - while (n--) { - p = p->next; - if (p == &cache_chain) - return NULL; - } - return list_entry(p, struct kmem_cache, next); + return seq_list_start(&cache_chain, *pos); } static inline int add_caller(unsigned long *n, unsigned long v) @@ -4362,16 +4370,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) static void show_symbol(struct seq_file *m, unsigned long address) { #ifdef CONFIG_KALLSYMS - char *modname; - const char *name; unsigned long offset, size; - char namebuf[KSYM_NAME_LEN+1]; - - name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); + char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN]; - if (name) { + if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { seq_printf(m, "%s+%#lx/%#lx", name, offset, size); - if (modname) + if (modname[0]) seq_printf(m, " [%s]", modname); return; } @@ -4381,7 +4385,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) static int leaks_show(struct seq_file *m, void *p) { - struct kmem_cache *cachep = p; + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); struct slab *slabp; struct kmem_list3 *l3; const char *name; @@ -4460,9 +4464,10 @@ const struct seq_operations slabstats_op = { * allocated with either kmalloc() or kmem_cache_alloc(). The object * must not be freed during the duration of the call. */ -unsigned int ksize(const void *objp) +size_t ksize(const void *objp) { - if (unlikely(objp == NULL)) + BUG_ON(!objp); + if (unlikely(objp == ZERO_SIZE_PTR)) return 0; return obj_size(virt_to_cache(objp));