X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fslab.c;h=6f6abef83a1af82432305245755ab690c8363081;hb=c0e7b4aa1c09ea992808ea8c079141bc8dd0f5bc;hp=c23b99250df264572c5f71c8d61470d549e0469b;hpb=1d2c8eea698514cfaa53fc991b960791d09508e1;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/slab.c b/mm/slab.c index c23b992..6f6abef 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -86,7 +86,6 @@ * All object allocations for a node occur from node specific slab lists. */ -#include #include #include #include @@ -104,19 +103,20 @@ #include #include #include +#include #include #include #include +#include #include +#include -#include #include #include #include /* - * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, - * SLAB_RED_ZONE & SLAB_POISON. + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). * * STATS - 1 to collect stats for /proc/slabinfo. @@ -137,6 +137,7 @@ /* Shouldn't this be in a header file somewhere? */ #define BYTES_PER_WORD sizeof(void *) +#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) #ifndef cache_line_size #define cache_line_size() L1_CACHE_BYTES @@ -148,10 +149,11 @@ * Usually, the kmalloc caches are cache_line_size() aligned, except when * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. * Some archs want to perform DMA into kmalloc caches and need a guaranteed - * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that. - * Note that this flag disables some debug features. + * alignment larger than the alignment of a 64-bit integer. + * ARCH_KMALLOC_MINALIGN allows that. + * Note that increasing this value may disable some debug features. */ -#define ARCH_KMALLOC_MINALIGN 0 +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) #endif #ifndef ARCH_SLAB_MINALIGN @@ -171,15 +173,15 @@ /* Legal flag mask for kmem_cache_create(). */ #if DEBUG -# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ +# define CREATE_MASK (SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ - SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ + SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ + SLAB_CACHE_DMA | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #endif @@ -314,7 +316,7 @@ static int drain_freelist(struct kmem_cache *cache, static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); static int enable_cpucache(struct kmem_cache *cachep); -static void cache_reap(void *unused); +static void cache_reap(struct work_struct *unused); /* * This function must be completely optimized away if a constant is passed to @@ -386,8 +388,8 @@ struct kmem_cache { unsigned int shared; unsigned int buffer_size; + u32 reciprocal_buffer_size; /* 3) touched by every alloc & free from the backend */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; unsigned int flags; /* constant flags */ unsigned int num; /* # of objs per slab */ @@ -408,9 +410,6 @@ struct kmem_cache { /* constructor func */ void (*ctor) (void *, struct kmem_cache *, unsigned long); - /* de-constructor func */ - void (*dtor) (void *, struct kmem_cache *, unsigned long); - /* 5) cache creation/removal */ const char *name; struct list_head next; @@ -442,6 +441,17 @@ struct kmem_cache { int obj_offset; int obj_size; #endif + /* + * We put nodelists[] at the end of kmem_cache, because we want to size + * this array to nr_node_ids slots instead of MAX_NUMNODES + * (see kmem_cache_init()) + * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache + * is statically defined, so we reserve the max number of nodes. + */ + struct kmem_list3 *nodelists[MAX_NUMNODES]; + /* + * Do not add fields after nodelists[] + */ }; #define CFLGS_OFF_SLAB (0x80000000UL) @@ -525,19 +535,22 @@ static int obj_size(struct kmem_cache *cachep) return cachep->obj_size; } -static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp) +static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); - return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD); + return (unsigned long long*) (objp + obj_offset(cachep) - + sizeof(unsigned long long)); } -static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp) +static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); if (cachep->flags & SLAB_STORE_USER) - return (unsigned long *)(objp + cachep->buffer_size - - 2 * BYTES_PER_WORD); - return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD); + return (unsigned long long *)(objp + cachep->buffer_size - + sizeof(unsigned long long) - + REDZONE_ALIGN); + return (unsigned long long *) (objp + cachep->buffer_size - + sizeof(unsigned long long)); } static void **dbg_userword(struct kmem_cache *cachep, void *objp) @@ -550,28 +563,13 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #define obj_offset(x) 0 #define obj_size(cachep) (cachep->buffer_size) -#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) -#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) +#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) +#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) #endif /* - * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp - * order. - */ -#if defined(CONFIG_LARGE_ALLOCS) -#define MAX_OBJ_ORDER 13 /* up to 32Mb */ -#define MAX_GFP_ORDER 13 /* up to 32Mb */ -#elif defined(CONFIG_MMU) -#define MAX_OBJ_ORDER 5 /* 32 pages */ -#define MAX_GFP_ORDER 5 /* 32 pages */ -#else -#define MAX_OBJ_ORDER 8 /* up to 1Mb */ -#define MAX_GFP_ORDER 8 /* up to 1Mb */ -#endif - -/* * Do not go above this order unless 0 objects fit into the slab. */ #define BREAK_GFP_ORDER_HI 1 @@ -590,8 +588,7 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) static inline struct kmem_cache *page_get_cache(struct page *page) { - if (unlikely(PageCompound(page))) - page = (struct page *)page_private(page); + page = compound_head(page); BUG_ON(!PageSlab(page)); return (struct kmem_cache *)page->lru.next; } @@ -603,21 +600,19 @@ static inline void page_set_slab(struct page *page, struct slab *slab) static inline struct slab *page_get_slab(struct page *page) { - if (unlikely(PageCompound(page))) - page = (struct page *)page_private(page); BUG_ON(!PageSlab(page)); return (struct slab *)page->lru.prev; } static inline struct kmem_cache *virt_to_cache(const void *obj) { - struct page *page = virt_to_page(obj); + struct page *page = virt_to_head_page(obj); return page_get_cache(page); } static inline struct slab *virt_to_slab(const void *obj) { - struct page *page = virt_to_page(obj); + struct page *page = virt_to_head_page(obj); return page_get_slab(page); } @@ -627,10 +622,17 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, return slab->s_mem + cache->buffer_size * idx; } -static inline unsigned int obj_to_index(struct kmem_cache *cache, - struct slab *slab, void *obj) +/* + * We want to avoid an expensive divide : (offset / cache->buffer_size) + * Using the fact that buffer_size is a constant for a particular cache, + * we can replace (offset / cache->buffer_size) by + * reciprocal_divide(offset, cache->reciprocal_buffer_size) + */ +static inline unsigned int obj_to_index(const struct kmem_cache *cache, + const struct slab *slab, void *obj) { - return (unsigned)(obj - slab->s_mem) / cache->buffer_size; + u32 offset = (obj - slab->s_mem); + return reciprocal_divide(offset, cache->reciprocal_buffer_size); } /* @@ -669,9 +671,6 @@ static struct kmem_cache cache_cache = { .shared = 1, .buffer_size = sizeof(struct kmem_cache), .name = "kmem_cache", -#if DEBUG - .obj_size = sizeof(struct kmem_cache), -#endif }; #define BAD_ALIEN_MAGIC 0x01020304ul @@ -731,7 +730,10 @@ static inline void init_lock_keys(void) } #endif -/* Guard access to the cache-chain. */ +/* + * 1. Guard access to the cache-chain. + * 2. Protect sanity of cpu_online_map against cpu hotplug events + */ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; @@ -754,7 +756,7 @@ int slab_is_available(void) return g_cpucache_up == FULL; } -static DEFINE_PER_CPU(struct work_struct, reap_work); +static DEFINE_PER_CPU(struct delayed_work, reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { @@ -773,6 +775,9 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, */ BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); #endif + if (!size) + return ZERO_SIZE_PTR; + while (size > csizep->cs_size) csizep++; @@ -781,8 +786,10 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, * has cs_{dma,}cachep==NULL. Thus no special case * for large kmalloc calls required. */ +#ifdef CONFIG_ZONE_DMA if (unlikely(gfpflags & GFP_DMA)) return csizep->cs_dmacachep; +#endif return csizep->cs_cachep; } @@ -867,6 +874,23 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, dump_stack(); } +/* + * By default on NUMA we use alien caches to stage the freeing of + * objects allocated from other nodes. This causes massive memory + * inefficiencies when using fake NUMA setup to split memory into a + * large number of small nodes, so it can be disabled on the command + * line + */ + +static int use_alien_caches __read_mostly = 1; +static int numa_platform __read_mostly = 1; +static int __init noaliencache_setup(char *s) +{ + use_alien_caches = 0; + return 1; +} +__setup("noaliencache", noaliencache_setup); + #ifdef CONFIG_NUMA /* * Special reaping functions for NUMA systems called from cache_reap(). @@ -884,19 +908,13 @@ static void init_reap_node(int cpu) if (node == MAX_NUMNODES) node = first_node(node_online_map); - __get_cpu_var(reap_node) = node; + per_cpu(reap_node, cpu) = node; } static void next_reap_node(void) { int node = __get_cpu_var(reap_node); - /* - * Also drain per cpu pages on remote zones - */ - if (node != numa_node_id()) - drain_node_pages(node); - node = next_node(node, node_online_map); if (unlikely(node >= MAX_NUMNODES)) node = first_node(node_online_map); @@ -915,19 +933,20 @@ static void next_reap_node(void) * the CPUs getting into lockstep and contending for the global cache chain * lock. */ -static void __devinit start_cpu_timer(int cpu) +static void __cpuinit start_cpu_timer(int cpu) { - struct work_struct *reap_work = &per_cpu(reap_work, cpu); + struct delayed_work *reap_work = &per_cpu(reap_work, cpu); /* * When this gets called from do_initcalls via cpucache_init(), * init_workqueues() has already run, so keventd will be setup * at that time. */ - if (keventd_up() && reap_work->func == NULL) { + if (keventd_up() && reap_work->work.func == NULL) { init_reap_node(cpu); - INIT_WORK(reap_work, cache_reap, NULL); - schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); + INIT_DELAYED_WORK(reap_work, cache_reap); + schedule_delayed_work_on(cpu, reap_work, + __round_jiffies_relative(HZ, cpu)); } } @@ -997,7 +1016,7 @@ static inline void *alternate_node_alloc(struct kmem_cache *cachep, return NULL; } -static inline void *__cache_alloc_node(struct kmem_cache *cachep, +static inline void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { return NULL; @@ -1005,13 +1024,13 @@ static inline void *__cache_alloc_node(struct kmem_cache *cachep, #else /* CONFIG_NUMA */ -static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); +static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); static void *alternate_node_alloc(struct kmem_cache *, gfp_t); static struct array_cache **alloc_alien_cache(int node, int limit) { struct array_cache **ac_ptr; - int memsize = sizeof(void *) * MAX_NUMNODES; + int memsize = sizeof(void *) * nr_node_ids; int i; if (limit > 1) @@ -1107,15 +1126,18 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) int nodeid = slabp->nodeid; struct kmem_list3 *l3; struct array_cache *alien = NULL; + int node; + + node = numa_node_id(); /* * Make sure we are not freeing a object from another node to the array * cache on this cpu. */ - if (likely(slabp->nodeid == numa_node_id())) + if (likely(slabp->nodeid == node)) return 0; - l3 = cachep->nodelists[numa_node_id()]; + l3 = cachep->nodelists[node]; STATS_INC_NODEFREES(cachep); if (l3->alien && l3->alien[nodeid]) { alien = l3->alien[nodeid]; @@ -1142,11 +1164,14 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); - int memsize = sizeof(struct kmem_list3); + const int memsize = sizeof(struct kmem_list3); switch (action) { - case CPU_UP_PREPARE: + case CPU_LOCK_ACQUIRE: mutex_lock(&cache_chain_mutex); + break; + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: /* * We need to do this right in the beginning since * alloc_arraycache's are going to use this list. @@ -1189,22 +1214,25 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, */ list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; - struct array_cache *shared; - struct array_cache **alien; + struct array_cache *shared = NULL; + struct array_cache **alien = NULL; nc = alloc_arraycache(node, cachep->limit, cachep->batchcount); if (!nc) goto bad; - shared = alloc_arraycache(node, + if (cachep->shared) { + shared = alloc_arraycache(node, cachep->shared * cachep->batchcount, 0xbaadf00d); - if (!shared) - goto bad; - - alien = alloc_alien_cache(node, cachep->limit); - if (!alien) - goto bad; + if (!shared) + goto bad; + } + if (use_alien_caches) { + alien = alloc_alien_cache(node, cachep->limit); + if (!alien) + goto bad; + } cachep->array[cpu] = nc; l3 = cachep->nodelists[node]; BUG_ON(!l3); @@ -1228,13 +1256,30 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, kfree(shared); free_alien_cache(alien); } - mutex_unlock(&cache_chain_mutex); break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: start_cpu_timer(cpu); break; #ifdef CONFIG_HOTPLUG_CPU + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* + * Shutdown cache reaper. Note that the cache_chain_mutex is + * held so that if cache_reap() is invoked it cannot do + * anything expensive but will only modify reap_work + * and reschedule the timer. + */ + cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); + /* Now the cache_reaper is guaranteed to be not running. */ + per_cpu(reap_work, cpu).work.func = NULL; + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + start_cpu_timer(cpu); + break; case CPU_DEAD: + case CPU_DEAD_FROZEN: /* * Even if all the cpus of a node are down, we don't free the * kmem_list3 of any cache. This to avoid a race between @@ -1244,8 +1289,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, * gets destroyed at kmem_cache_destroy(). */ /* fall thru */ +#endif case CPU_UP_CANCELED: - mutex_lock(&cache_chain_mutex); + case CPU_UP_CANCELED_FROZEN: list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; @@ -1275,8 +1321,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, shared = l3->shared; if (shared) { - free_block(cachep, l3->shared->entry, - l3->shared->avail, node); + free_block(cachep, shared->entry, + shared->avail, node); l3->shared = NULL; } @@ -1304,13 +1350,13 @@ free_array_cache: continue; drain_freelist(cachep, l3, l3->free_objects); } + break; + case CPU_LOCK_RELEASE: mutex_unlock(&cache_chain_mutex); break; -#endif } return NOTIFY_OK; bad: - mutex_unlock(&cache_chain_mutex); return NOTIFY_BAD; } @@ -1326,7 +1372,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, { struct kmem_list3 *ptr; - BUG_ON(cachep->nodelists[nodeid] != list); ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); BUG_ON(!ptr); @@ -1353,6 +1398,12 @@ void __init kmem_cache_init(void) struct cache_names *names; int i; int order; + int node; + + if (num_possible_nodes() == 1) { + use_alien_caches = 0; + numa_platform = 0; + } for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); @@ -1387,15 +1438,28 @@ void __init kmem_cache_init(void) * 6) Resize the head arrays of the kmalloc caches to their final sizes. */ + node = numa_node_id(); + /* 1) create the cache_cache */ INIT_LIST_HEAD(&cache_chain); list_add(&cache_cache.next, &cache_chain); cache_cache.colour_off = cache_line_size(); cache_cache.array[smp_processor_id()] = &initarray_cache.cache; - cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; + cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE]; + /* + * struct kmem_cache size depends on nr_node_ids, which + * can be less than MAX_NUMNODES. + */ + cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + + nr_node_ids * sizeof(struct kmem_list3 *); +#if DEBUG + cache_cache.obj_size = cache_cache.buffer_size; +#endif cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); + cache_cache.reciprocal_buffer_size = + reciprocal_value(cache_cache.buffer_size); for (order = 0; order < MAX_ORDER; order++) { cache_estimate(order, cache_cache.buffer_size, @@ -1423,7 +1487,7 @@ void __init kmem_cache_init(void) sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); if (INDEX_AC != INDEX_L3) { sizes[INDEX_L3].cs_cachep = @@ -1431,7 +1495,7 @@ void __init kmem_cache_init(void) sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); } slab_early_init = 0; @@ -1449,15 +1513,17 @@ void __init kmem_cache_init(void) sizes->cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL, NULL); + NULL); } - - sizes->cs_dmacachep = kmem_cache_create(names->name_dma, +#ifdef CONFIG_ZONE_DMA + sizes->cs_dmacachep = kmem_cache_create( + names->name_dma, sizes->cs_size, ARCH_KMALLOC_MINALIGN, ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC, - NULL, NULL); + NULL); +#endif sizes++; names++; } @@ -1497,19 +1563,18 @@ void __init kmem_cache_init(void) } /* 5) Replace the bootstrap kmem_list3's */ { - int node; + int nid; + /* Replace the static kmem_list3 structures for the boot cpu */ - init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], - numa_node_id()); + init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); - for_each_online_node(node) { + for_each_online_node(nid) { init_list(malloc_sizes[INDEX_AC].cs_cachep, - &initkmem_list3[SIZE_AC + node], node); + &initkmem_list3[SIZE_AC + nid], nid); if (INDEX_AC != INDEX_L3) { init_list(malloc_sizes[INDEX_L3].cs_cachep, - &initkmem_list3[SIZE_L3 + node], - node); + &initkmem_list3[SIZE_L3 + nid], nid); } } } @@ -1577,12 +1642,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) flags |= __GFP_COMP; #endif - /* - * Under NUMA we want memory on the indicated node. We will handle - * the needed fallback ourselves since we want to serve from our - * per node object lists first for other nodes. - */ - flags |= cachep->gfpflags | GFP_THISNODE; + flags |= cachep->gfpflags; page = alloc_pages_node(nodeid, flags, cachep->gfporder); if (!page) @@ -1720,7 +1780,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) char *realobj; if (cachep->flags & SLAB_RED_ZONE) { - printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", + printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -1762,8 +1822,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print header */ if (lines == 0) { printk(KERN_ERR - "Slab corruption: start=%p, len=%d\n", - realobj, size); + "Slab corruption: %s start=%p, len=%d\n", + cachep->name, realobj, size); print_objinfo(cachep, objp, 0); } /* Hexdump the affected line */ @@ -1840,20 +1900,11 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) slab_error(cachep, "end of a freed object " "was overwritten"); } - if (cachep->dtor && !(cachep->flags & SLAB_POISON)) - (cachep->dtor) (objp + obj_offset(cachep), cachep, 0); } } #else static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) { - if (cachep->dtor) { - int i; - for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slabp, i); - (cachep->dtor) (objp, cachep, 0); - } - } } #endif @@ -1889,7 +1940,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) * For setting up all the kmem_list3s for cache whose buffer_size is same as * size of kmem_list3. */ -static void set_up_list3s(struct kmem_cache *cachep, int index) +static void __init set_up_list3s(struct kmem_cache *cachep, int index) { int node; @@ -1942,7 +1993,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, size_t left_over = 0; int gfporder; - for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { + for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { unsigned int num; size_t remainder; @@ -1992,7 +2043,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, return left_over; } -static int setup_cpu_cache(struct kmem_cache *cachep) +static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) { if (g_cpucache_up == FULL) return enable_cpucache(cachep); @@ -2053,12 +2104,10 @@ static int setup_cpu_cache(struct kmem_cache *cachep) * @align: The required alignment for the objects. * @flags: SLAB flags * @ctor: A constructor for the objects. - * @dtor: A destructor for the objects. * * Returns a ptr to the cache on success, NULL on failure. * Cannot be called within a int, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache - * and the @dtor is run before the pages are handed back. + * The @ctor is run when new pages are allocated by the cache. * * @name must be valid until the cache is destroyed. This implies that * the module calling this has to destroy the cache before getting unloaded. @@ -2078,8 +2127,7 @@ static int setup_cpu_cache(struct kmem_cache *cachep) struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void*, struct kmem_cache *, unsigned long), - void (*dtor)(void*, struct kmem_cache *, unsigned long)) + void (*ctor)(void*, struct kmem_cache *, unsigned long)) { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; @@ -2088,22 +2136,19 @@ kmem_cache_create (const char *name, size_t size, size_t align, * Sanity checks... these are all serious usage bugs. */ if (!name || in_interrupt() || (size < BYTES_PER_WORD) || - (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { + size > KMALLOC_MAX_SIZE) { printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, name); BUG(); } /* - * Prevent CPUs from coming and going. - * lock_cpu_hotplug() nests outside cache_chain_mutex + * We use cache_chain_mutex to ensure a consistent view of + * cpu_online_map as well. Please see cpuup_callback */ - lock_cpu_hotplug(); - mutex_lock(&cache_chain_mutex); list_for_each_entry(pc, &cache_chain, next) { - mm_segment_t old_fs = get_fs(); char tmp; int res; @@ -2112,17 +2157,17 @@ kmem_cache_create (const char *name, size_t size, size_t align, * destroy its slab cache and no-one else reuses the vmalloc * area of the module. Print a warning. */ - set_fs(KERNEL_DS); - res = __get_user(tmp, pc->name); - set_fs(old_fs); + res = probe_kernel_address(pc->name, tmp); if (res) { - printk("SLAB: cache with size %d has lost its name\n", + printk(KERN_ERR + "SLAB: cache with size %d has lost its name\n", pc->buffer_size); continue; } if (!strcmp(pc->name, name)) { - printk("kmem_cache_create: duplicate cache %s\n", name); + printk(KERN_ERR + "kmem_cache_create: duplicate cache %s\n", name); dump_stack(); goto oops; } @@ -2130,12 +2175,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, #if DEBUG WARN_ON(strchr(name, ' ')); /* It confuses parsers */ - if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { - /* No constructor, but inital state check requested */ - printk(KERN_ERR "%s: No con, but init state check " - "requested - %s\n", __FUNCTION__, name); - flags &= ~SLAB_DEBUG_INITIAL; - } #if FORCED_DEBUG /* * Enable redzoning and last user accounting, except for caches with @@ -2143,7 +2182,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, * above the next power of two: caches with object sizes just above a * power of two have a significant amount of internal fragmentation. */ - if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) + if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + + 2 * sizeof(unsigned long long))) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; @@ -2151,9 +2191,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (flags & SLAB_DESTROY_BY_RCU) BUG_ON(flags & SLAB_POISON); #endif - if (flags & SLAB_DESTROY_BY_RCU) - BUG_ON(dtor); - /* * Always checks flags, a caller might be expecting debug support which * isn't available. @@ -2187,32 +2224,39 @@ kmem_cache_create (const char *name, size_t size, size_t align, } /* - * Redzoning and user store require word alignment. Note this will be - * overridden by architecture or caller mandated alignment if either - * is greater than BYTES_PER_WORD. + * Redzoning and user store require word alignment or possibly larger. + * Note this will be overridden by architecture or caller mandated + * alignment if either is greater than BYTES_PER_WORD. */ - if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) + if (flags & SLAB_STORE_USER) ralign = BYTES_PER_WORD; - /* 2) arch mandated alignment: disables debug if necessary */ + if (flags & SLAB_RED_ZONE) { + ralign = REDZONE_ALIGN; + /* If redzoning, ensure that the second redzone is suitably + * aligned, by adjusting the object size accordingly. */ + size += REDZONE_ALIGN - 1; + size &= ~(REDZONE_ALIGN - 1); + } + + /* 2) arch mandated alignment */ if (ralign < ARCH_SLAB_MINALIGN) { ralign = ARCH_SLAB_MINALIGN; - if (ralign > BYTES_PER_WORD) - flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); } - /* 3) caller mandated alignment: disables debug if necessary */ + /* 3) caller mandated alignment */ if (ralign < align) { ralign = align; - if (ralign > BYTES_PER_WORD) - flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); } + /* disable debug if necessary */ + if (ralign > __alignof__(unsigned long long)) + flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); /* * 4) Store it. */ align = ralign; /* Get cache's description obj. */ - cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); + cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); if (!cachep) goto oops; @@ -2225,14 +2269,18 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ if (flags & SLAB_RED_ZONE) { /* add space for red zone words */ - cachep->obj_offset += BYTES_PER_WORD; - size += 2 * BYTES_PER_WORD; + cachep->obj_offset += sizeof(unsigned long long); + size += 2 * sizeof(unsigned long long); } if (flags & SLAB_STORE_USER) { /* user store requires one word storage behind the end of - * the real object. + * the real object. But if the second red zone needs to be + * aligned to 64 bits, we must allow that much space. */ - size += BYTES_PER_WORD; + if (flags & SLAB_RED_ZONE) + size += REDZONE_ALIGN; + else + size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size @@ -2260,7 +2308,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, left_over = calculate_slab_order(cachep, size, align, flags); if (!cachep->num) { - printk("kmem_cache_create: couldn't create cache %s.\n", name); + printk(KERN_ERR + "kmem_cache_create: couldn't create cache %s.\n", name); kmem_cache_free(&cache_cache, cachep); cachep = NULL; goto oops; @@ -2291,9 +2340,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->slab_size = slab_size; cachep->flags = flags; cachep->gfpflags = 0; - if (flags & SLAB_CACHE_DMA) + if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) cachep->gfpflags |= GFP_DMA; cachep->buffer_size = size; + cachep->reciprocal_buffer_size = reciprocal_value(size); if (flags & CFLGS_OFF_SLAB) { cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); @@ -2304,10 +2354,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, * this should not happen at all. * But leave a BUG_ON for some lucky dude. */ - BUG_ON(!cachep->slabp_cache); + BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); } cachep->ctor = ctor; - cachep->dtor = dtor; cachep->name = name; if (setup_cpu_cache(cachep)) { @@ -2323,7 +2372,6 @@ oops: panic("kmem_cache_create(): failed to create slab `%s'\n", name); mutex_unlock(&cache_chain_mutex); - unlock_cpu_hotplug(); return cachep; } EXPORT_SYMBOL(kmem_cache_create); @@ -2441,6 +2489,7 @@ out: return nr_freed; } +/* Called with cache_chain_mutex held to protect against cpu hotplug */ static int __cache_shrink(struct kmem_cache *cachep) { int ret = 0, i = 0; @@ -2471,9 +2520,13 @@ static int __cache_shrink(struct kmem_cache *cachep) */ int kmem_cache_shrink(struct kmem_cache *cachep) { + int ret; BUG_ON(!cachep || in_interrupt()); - return __cache_shrink(cachep); + mutex_lock(&cache_chain_mutex); + ret = __cache_shrink(cachep); + mutex_unlock(&cache_chain_mutex); + return ret; } EXPORT_SYMBOL(kmem_cache_shrink); @@ -2481,7 +2534,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); * kmem_cache_destroy - delete a cache * @cachep: the cache to destroy * - * Remove a struct kmem_cache object from the slab cache. + * Remove a &struct kmem_cache object from the slab cache. * * It is expected this function will be called by a module when it is * unloaded. This will remove the cache completely, and avoid a duplicate @@ -2497,23 +2550,16 @@ void kmem_cache_destroy(struct kmem_cache *cachep) { BUG_ON(!cachep || in_interrupt()); - /* Don't let CPUs to come and go */ - lock_cpu_hotplug(); - /* Find the cache in the chain of caches. */ mutex_lock(&cache_chain_mutex); /* * the chain is never empty, cache_cache is never destroyed */ list_del(&cachep->next); - mutex_unlock(&cache_chain_mutex); - if (__cache_shrink(cachep)) { slab_error(cachep, "Can't free all objects"); - mutex_lock(&cache_chain_mutex); list_add(&cachep->next, &cache_chain); mutex_unlock(&cache_chain_mutex); - unlock_cpu_hotplug(); return; } @@ -2521,7 +2567,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) synchronize_rcu(); __kmem_cache_destroy(cachep); - unlock_cpu_hotplug(); + mutex_unlock(&cache_chain_mutex); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2545,7 +2591,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ slabp = kmem_cache_alloc_node(cachep->slabp_cache, - local_flags, nodeid); + local_flags & ~GFP_THISNODE, nodeid); if (!slabp) return NULL; } else { @@ -2565,7 +2611,7 @@ static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) } static void cache_init_objs(struct kmem_cache *cachep, - struct slab *slabp, unsigned long ctor_flags) + struct slab *slabp) { int i; @@ -2589,7 +2635,7 @@ static void cache_init_objs(struct kmem_cache *cachep, */ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) cachep->ctor(objp + obj_offset(cachep), cachep, - ctor_flags); + 0); if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) @@ -2605,7 +2651,7 @@ static void cache_init_objs(struct kmem_cache *cachep, cachep->buffer_size / PAGE_SIZE, 0); #else if (cachep->ctor) - cachep->ctor(objp, cachep, ctor_flags); + cachep->ctor(objp, cachep, 0); #endif slab_bufctl(slabp)[i] = i + 1; } @@ -2615,10 +2661,12 @@ static void cache_init_objs(struct kmem_cache *cachep, static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) { - if (flags & SLAB_DMA) - BUG_ON(!(cachep->gfpflags & GFP_DMA)); - else - BUG_ON(cachep->gfpflags & GFP_DMA); + if (CONFIG_ZONE_DMA_FLAG) { + if (flags & GFP_DMA) + BUG_ON(!(cachep->gfpflags & GFP_DMA)); + else + BUG_ON(cachep->gfpflags & GFP_DMA); + } } static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, @@ -2686,32 +2734,21 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. */ -static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static int cache_grow(struct kmem_cache *cachep, + gfp_t flags, int nodeid, void *objp) { struct slab *slabp; - void *objp; size_t offset; gfp_t local_flags; - unsigned long ctor_flags; struct kmem_list3 *l3; /* * Be lazy and only check for valid flags here, keeping it out of the * critical path in kmem_cache_alloc(). */ - BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)); - if (flags & SLAB_NO_GROW) - return 0; - - ctor_flags = SLAB_CTOR_CONSTRUCTOR; - local_flags = (flags & SLAB_LEVEL_MASK); - if (!(local_flags & __GFP_WAIT)) - /* - * Not allowed to sleep. Need to tell a constructor about - * this - it might need to know... - */ - ctor_flags |= SLAB_CTOR_ATOMIC; + BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); + local_flags = (flags & GFP_LEVEL_MASK); /* Take the l3 list lock to change the colour_next on this node */ check_irq_off(); l3 = cachep->nodelists[nodeid]; @@ -2741,19 +2778,21 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. */ - objp = kmem_getpages(cachep, flags, nodeid); + if (!objp) + objp = kmem_getpages(cachep, local_flags, nodeid); if (!objp) goto failed; /* Get slab management. */ - slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid); + slabp = alloc_slabmgmt(cachep, objp, offset, + local_flags & ~GFP_THISNODE, nodeid); if (!slabp) goto opps1; slabp->nodeid = nodeid; slab_map_pages(cachep, slabp, objp); - cache_init_objs(cachep, slabp, ctor_flags); + cache_init_objs(cachep, slabp); if (local_flags & __GFP_WAIT) local_irq_disable(); @@ -2780,28 +2819,19 @@ failed: * Perform extra freeing checks: * - detect bad pointers. * - POISON/RED_ZONE checking - * - destructor calls, for caches with POISON+dtor */ static void kfree_debugcheck(const void *objp) { - struct page *page; - if (!virt_addr_valid(objp)) { printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", (unsigned long)objp); BUG(); } - page = virt_to_page(objp); - if (!PageSlab(page)) { - printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", - (unsigned long)objp); - BUG(); - } } static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) { - unsigned long redzone1, redzone2; + unsigned long long redzone1, redzone2; redzone1 = *dbg_redzone1(cache, obj); redzone2 = *dbg_redzone2(cache, obj); @@ -2817,7 +2847,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) else slab_error(cache, "memory outside object was overwritten"); - printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n", + printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", obj, redzone1, redzone2); } @@ -2830,7 +2860,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, objp -= obj_offset(cachep); kfree_debugcheck(objp); - page = virt_to_page(objp); + page = virt_to_head_page(objp); slabp = page_get_slab(page); @@ -2847,21 +2877,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); - if (cachep->flags & SLAB_DEBUG_INITIAL) { - /* - * Need to call the slab's constructor so the caller can - * perform a verify of its state (debugging). Called without - * the cache-lock held. - */ - cachep->ctor(objp + obj_offset(cachep), - cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); - } - if (cachep->flags & SLAB_POISON && cachep->dtor) { - /* we want to cache poison the object, - * call the destruction callback - */ - cachep->dtor(objp + obj_offset(cachep), cachep, 0); - } #ifdef CONFIG_DEBUG_SLAB_LEAK slab_bufctl(slabp)[objnr] = BUFCTL_FREE; #endif @@ -2919,6 +2934,9 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) int batchcount; struct kmem_list3 *l3; struct array_cache *ac; + int node; + + node = numa_node_id(); check_irq_off(); ac = cpu_cache_get(cachep); @@ -2932,7 +2950,7 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - l3 = cachep->nodelists[numa_node_id()]; + l3 = cachep->nodelists[node]; BUG_ON(ac->avail > 0 || !l3); spin_lock(&l3->list_lock); @@ -2956,13 +2974,21 @@ retry: slabp = list_entry(entry, struct slab, list); check_slabp(cachep, slabp); check_spinlock_acquired(cachep); + + /* + * The slab was either on partial or free list so + * there must be at least one object available for + * allocation. + */ + BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num); + while (slabp->inuse < cachep->num && batchcount--) { STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, - numa_node_id()); + node); } check_slabp(cachep, slabp); @@ -2981,7 +3007,7 @@ alloc_done: if (unlikely(!ac->avail)) { int x; - x = cache_grow(cachep, flags, numa_node_id()); + x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); @@ -3031,7 +3057,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, slab_error(cachep, "double free, or memory outside" " object was overwritten"); printk(KERN_ERR - "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", + "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -3043,32 +3069,106 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, struct slab *slabp; unsigned objnr; - slabp = page_get_slab(virt_to_page(objp)); + slabp = page_get_slab(virt_to_head_page(objp)); objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; } #endif objp += obj_offset(cachep); - if (cachep->ctor && cachep->flags & SLAB_POISON) { - unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; - - if (!(flags & __GFP_WAIT)) - ctor_flags |= SLAB_CTOR_ATOMIC; - - cachep->ctor(objp, cachep, ctor_flags); + if (cachep->ctor && cachep->flags & SLAB_POISON) + cachep->ctor(objp, cachep, 0); +#if ARCH_SLAB_MINALIGN + if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { + printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + objp, ARCH_SLAB_MINALIGN); } +#endif return objp; } #else #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) #endif +#ifdef CONFIG_FAILSLAB + +static struct failslab_attr { + + struct fault_attr attr; + + u32 ignore_gfp_wait; +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + struct dentry *ignore_gfp_wait_file; +#endif + +} failslab = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_gfp_wait = 1, +}; + +static int __init setup_failslab(char *str) +{ + return setup_fault_attr(&failslab.attr, str); +} +__setup("failslab=", setup_failslab); + +static int should_failslab(struct kmem_cache *cachep, gfp_t flags) +{ + if (cachep == &cache_cache) + return 0; + if (flags & __GFP_NOFAIL) + return 0; + if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT)) + return 0; + + return should_fail(&failslab.attr, obj_size(cachep)); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init failslab_debugfs(void) +{ + mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + int err; + + err = init_fault_attr_dentries(&failslab.attr, "failslab"); + if (err) + return err; + dir = failslab.attr.dentries.dir; + + failslab.ignore_gfp_wait_file = + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &failslab.ignore_gfp_wait); + + if (!failslab.ignore_gfp_wait_file) { + err = -ENOMEM; + debugfs_remove(failslab.ignore_gfp_wait_file); + cleanup_fault_attr_dentries(&failslab.attr); + } + + return err; +} + +late_initcall(failslab_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else /* CONFIG_FAILSLAB */ + +static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags) +{ + return 0; +} + +#endif /* CONFIG_FAILSLAB */ + static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; struct array_cache *ac; check_irq_off(); + ac = cpu_cache_get(cachep); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); @@ -3081,35 +3181,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) return objp; } -static __always_inline void *__cache_alloc(struct kmem_cache *cachep, - gfp_t flags, void *caller) -{ - unsigned long save_flags; - void *objp = NULL; - - cache_alloc_debugcheck_before(cachep, flags); - - local_irq_save(save_flags); - - if (unlikely(NUMA_BUILD && - current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) - objp = alternate_node_alloc(cachep, flags); - - if (!objp) - objp = ____cache_alloc(cachep, flags); - /* - * We may just have run out of memory on the local node. - * __cache_alloc_node() knows how to locate memory on other nodes - */ - if (NUMA_BUILD && !objp) - objp = __cache_alloc_node(cachep, flags, numa_node_id()); - local_irq_restore(save_flags); - objp = cache_alloc_debugcheck_after(cachep, flags, objp, - caller); - prefetchw(objp); - return objp; -} - #ifdef CONFIG_NUMA /* * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. @@ -3129,36 +3200,89 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); if (nid_alloc != nid_here) - return __cache_alloc_node(cachep, flags, nid_alloc); + return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; } /* * Fallback function if there was no memory available and no objects on a - * certain node and we are allowed to fall back. We mimick the behavior of - * the page allocator. We fall back according to a zonelist determined by - * the policy layer while obeying cpuset constraints. + * certain node and fall back is permitted. First we scan all the + * available nodelists for available objects. If that fails then we + * perform an allocation without specifying a node. This allows the page + * allocator to do its reclaim / fallback magic. We then insert the + * slab into the proper nodelist and then allocate from it. */ -void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) +static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) { - struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy)) - ->node_zonelists[gfp_zone(flags)]; + struct zonelist *zonelist; + gfp_t local_flags; struct zone **z; void *obj = NULL; + int nid; + + if (flags & __GFP_THISNODE) + return NULL; + + zonelist = &NODE_DATA(slab_node(current->mempolicy)) + ->node_zonelists[gfp_zone(flags)]; + local_flags = (flags & GFP_LEVEL_MASK); + +retry: + /* + * Look through allowed nodes for objects available + * from existing per node queues. + */ + for (z = zonelist->zones; *z && !obj; z++) { + nid = zone_to_nid(*z); + + if (cpuset_zone_allowed_hardwall(*z, flags) && + cache->nodelists[nid] && + cache->nodelists[nid]->free_objects) + obj = ____cache_alloc_node(cache, + flags | GFP_THISNODE, nid); + } - for (z = zonelist->zones; *z && !obj; z++) - if (zone_idx(*z) <= ZONE_NORMAL && - cpuset_zone_allowed(*z, flags)) - obj = __cache_alloc_node(cache, - flags | __GFP_THISNODE, - zone_to_nid(*z)); + if (!obj) { + /* + * This allocation will be performed within the constraints + * of the current cpuset / memory policy requirements. + * We may trigger various forms of reclaim on the allowed + * set and go into memory reserves if necessary. + */ + if (local_flags & __GFP_WAIT) + local_irq_enable(); + kmem_flagcheck(cache, flags); + obj = kmem_getpages(cache, flags, -1); + if (local_flags & __GFP_WAIT) + local_irq_disable(); + if (obj) { + /* + * Insert into the appropriate per node queues + */ + nid = page_to_nid(virt_to_page(obj)); + if (cache_grow(cache, flags, nid, obj)) { + obj = ____cache_alloc_node(cache, + flags | GFP_THISNODE, nid); + if (!obj) + /* + * Another processor may allocate the + * objects in the slab since we are + * not holding any locks. + */ + goto retry; + } else { + /* cache_grow already freed obj */ + obj = NULL; + } + } + } return obj; } /* * A interface to enable slab creation on nodeid */ -static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, +static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct list_head *entry; @@ -3207,20 +3331,126 @@ retry: must_grow: spin_unlock(&l3->list_lock); - x = cache_grow(cachep, flags, nodeid); + x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); if (x) goto retry; - if (!(flags & __GFP_THISNODE)) - /* Unable to grow the cache. Fall back to other nodes. */ - return fallback_alloc(cachep, flags); - - return NULL; + return fallback_alloc(cachep, flags); done: return obj; } -#endif + +/** + * kmem_cache_alloc_node - Allocate an object on the specified node + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * @nodeid: node number of the target node. + * @caller: return address of caller, used for debug information + * + * Identical to kmem_cache_alloc but it will allocate memory on the given + * node, which can improve the performance for cpu bound structures. + * + * Fallback to other node is possible if __GFP_THISNODE is not set. + */ +static __always_inline void * +__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, + void *caller) +{ + unsigned long save_flags; + void *ptr; + + if (should_failslab(cachep, flags)) + return NULL; + + cache_alloc_debugcheck_before(cachep, flags); + local_irq_save(save_flags); + + if (unlikely(nodeid == -1)) + nodeid = numa_node_id(); + + if (unlikely(!cachep->nodelists[nodeid])) { + /* Node not bootstrapped yet */ + ptr = fallback_alloc(cachep, flags); + goto out; + } + + if (nodeid == numa_node_id()) { + /* + * Use the locally cached objects if possible. + * However ____cache_alloc does not allow fallback + * to other nodes. It may fail while we still have + * objects on other nodes available. + */ + ptr = ____cache_alloc(cachep, flags); + if (ptr) + goto out; + } + /* ___cache_alloc_node can fall back to other nodes */ + ptr = ____cache_alloc_node(cachep, flags, nodeid); + out: + local_irq_restore(save_flags); + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); + + if (unlikely((flags & __GFP_ZERO) && ptr)) + memset(ptr, 0, obj_size(cachep)); + + return ptr; +} + +static __always_inline void * +__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) +{ + void *objp; + + if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { + objp = alternate_node_alloc(cache, flags); + if (objp) + goto out; + } + objp = ____cache_alloc(cache, flags); + + /* + * We may just have run out of memory on the local node. + * ____cache_alloc_node() knows how to locate memory on other nodes + */ + if (!objp) + objp = ____cache_alloc_node(cache, flags, numa_node_id()); + + out: + return objp; +} +#else + +static __always_inline void * +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + return ____cache_alloc(cachep, flags); +} + +#endif /* CONFIG_NUMA */ + +static __always_inline void * +__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) +{ + unsigned long save_flags; + void *objp; + + if (should_failslab(cachep, flags)) + return NULL; + + cache_alloc_debugcheck_before(cachep, flags); + local_irq_save(save_flags); + objp = __do_cache_alloc(cachep, flags); + local_irq_restore(save_flags); + objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); + prefetchw(objp); + + if (unlikely((flags & __GFP_ZERO) && objp)) + memset(objp, 0, obj_size(cachep)); + + return objp; +} /* * Caller needs to acquire correct kmem_list's list_lock @@ -3331,7 +3561,14 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); - if (cache_free_alien(cachep, objp)) + /* + * Skip calling cache_free_alien() when the platform is not numa. + * This will avoid cache misses that happen while accessing slabp (which + * is per page memory reference) to get nodeid. Instead use a global + * variable to skip the call, which is mostly likely to be present in + * the cache. + */ + if (numa_platform && cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { @@ -3360,23 +3597,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); /** - * kmem_cache_zalloc - Allocate an object. The memory is set to zero. - * @cache: The cache to allocate from. - * @flags: See kmalloc(). - * - * Allocate an object from this cache and set the allocated memory to zero. - * The flags are only relevant if the cache has no available objects. - */ -void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags) -{ - void *ret = __cache_alloc(cache, flags, __builtin_return_address(0)); - if (ret) - memset(ret, 0, obj_size(cache)); - return ret; -} -EXPORT_SYMBOL(kmem_cache_zalloc); - -/** * kmem_ptr_validate - check if an untrusted pointer might * be a slab entry. * @cachep: the cache we're checking against @@ -3390,7 +3610,7 @@ EXPORT_SYMBOL(kmem_cache_zalloc); * * Currently only used for dentry validation. */ -int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) +int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) { unsigned long addr = (unsigned long)ptr; unsigned long min_addr = PAGE_OFFSET; @@ -3419,51 +3639,46 @@ out: } #ifdef CONFIG_NUMA -/** - * kmem_cache_alloc_node - Allocate an object on the specified node - * @cachep: The cache to allocate from. - * @flags: See kmalloc(). - * @nodeid: node number of the target node. - * - * Identical to kmem_cache_alloc, except that this function is slow - * and can sleep. And it will allocate memory on the given node, which - * can improve the performance for cpu bound structures. - * New and improved: it will now make sure that the object gets - * put on the correct node list so that there is no false sharing. - */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - unsigned long save_flags; - void *ptr; - - cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); - - if (nodeid == -1 || nodeid == numa_node_id() || - !cachep->nodelists[nodeid]) - ptr = ____cache_alloc(cachep, flags); - else - ptr = __cache_alloc_node(cachep, flags, nodeid); - local_irq_restore(save_flags); - - ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, - __builtin_return_address(0)); - - return ptr; + return __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); } EXPORT_SYMBOL(kmem_cache_alloc_node); -void *__kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline void * +__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; cachep = kmem_find_general_cachep(size, flags); - if (unlikely(cachep == NULL)) - return NULL; + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; return kmem_cache_alloc_node(cachep, flags, node); } + +#ifdef CONFIG_DEBUG_SLAB +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, + __builtin_return_address(0)); +} EXPORT_SYMBOL(__kmalloc_node); -#endif + +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, + int node, void *caller) +{ + return __do_kmalloc_node(size, flags, node, caller); +} +EXPORT_SYMBOL(__kmalloc_node_track_caller); +#else +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, NULL); +} +EXPORT_SYMBOL(__kmalloc_node); +#endif /* CONFIG_DEBUG_SLAB */ +#endif /* CONFIG_NUMA */ /** * __do_kmalloc - allocate memory @@ -3482,8 +3697,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, * functions. */ cachep = __find_general_cachep(size, flags); - if (unlikely(cachep == NULL)) - return NULL; + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; return __cache_alloc(cachep, flags, caller); } @@ -3524,6 +3739,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) BUG_ON(virt_to_cache(objp) != cachep); local_irq_save(flags); + debug_check_no_locks_freed(objp, obj_size(cachep)); __cache_free(cachep, objp); local_irq_restore(flags); } @@ -3543,7 +3759,7 @@ void kfree(const void *objp) struct kmem_cache *c; unsigned long flags; - if (unlikely(!objp)) + if (unlikely(ZERO_OR_NULL_PTR(objp))) return; local_irq_save(flags); kfree_debugcheck(objp); @@ -3574,20 +3790,25 @@ static int alloc_kmemlist(struct kmem_cache *cachep) int node; struct kmem_list3 *l3; struct array_cache *new_shared; - struct array_cache **new_alien; + struct array_cache **new_alien = NULL; for_each_online_node(node) { - new_alien = alloc_alien_cache(node, cachep->limit); - if (!new_alien) - goto fail; + if (use_alien_caches) { + new_alien = alloc_alien_cache(node, cachep->limit); + if (!new_alien) + goto fail; + } - new_shared = alloc_arraycache(node, + new_shared = NULL; + if (cachep->shared) { + new_shared = alloc_arraycache(node, cachep->shared*cachep->batchcount, 0xbaadf00d); - if (!new_shared) { - free_alien_cache(new_alien); - goto fail; + if (!new_shared) { + free_alien_cache(new_alien); + goto fail; + } } l3 = cachep->nodelists[node]; @@ -3745,10 +3966,8 @@ static int enable_cpucache(struct kmem_cache *cachep) * to a larger limit. Thus disabled by default. */ shared = 0; -#ifdef CONFIG_SMP - if (cachep->buffer_size <= PAGE_SIZE) + if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) shared = 8; -#endif #if DEBUG /* @@ -3796,7 +4015,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, /** * cache_reap - Reclaim memory from caches. - * @unused: unused parameter + * @w: work descriptor * * Called from workqueue/eventd every few seconds. * Purpose: @@ -3806,18 +4025,17 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, * If we cannot acquire the cache chain mutex then just give up - we'll try * again on the next iteration. */ -static void cache_reap(void *unused) +static void cache_reap(struct work_struct *w) { struct kmem_cache *searchp; struct kmem_list3 *l3; int node = numa_node_id(); + struct delayed_work *work = + container_of(w, struct delayed_work, work); - if (!mutex_trylock(&cache_chain_mutex)) { + if (!mutex_trylock(&cache_chain_mutex)) /* Give up. Setup the next iteration. */ - schedule_delayed_work(&__get_cpu_var(reap_work), - REAPTIMEOUT_CPUC); - return; - } + goto out; list_for_each_entry(searchp, &cache_chain, next) { check_irq_on(); @@ -3859,9 +4077,9 @@ next: check_irq_on(); mutex_unlock(&cache_chain_mutex); next_reap_node(); - refresh_cpu_vm_stats(smp_processor_id()); +out: /* Set up the next iteration */ - schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); + schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); } #ifdef CONFIG_PROC_FS @@ -3892,26 +4110,17 @@ static void print_slabinfo_header(struct seq_file *m) static void *s_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; - struct list_head *p; mutex_lock(&cache_chain_mutex); if (!n) print_slabinfo_header(m); - p = cache_chain.next; - while (n--) { - p = p->next; - if (p == &cache_chain) - return NULL; - } - return list_entry(p, struct kmem_cache, next); + + return seq_list_start(&cache_chain, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - struct kmem_cache *cachep = p; - ++*pos; - return cachep->next.next == &cache_chain ? - NULL : list_entry(cachep->next.next, struct kmem_cache, next); + return seq_list_next(p, &cache_chain, pos); } static void s_stop(struct seq_file *m, void *p) @@ -3921,7 +4130,7 @@ static void s_stop(struct seq_file *m, void *p) static int s_show(struct seq_file *m, void *p) { - struct kmem_cache *cachep = p; + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); struct slab *slabp; unsigned long active_objs; unsigned long num_objs; @@ -4029,7 +4238,7 @@ static int s_show(struct seq_file *m, void *p) * + further values on SMP and with statistics enabled */ -struct seq_operations slabinfo_op = { +const struct seq_operations slabinfo_op = { .start = s_start, .next = s_next, .stop = s_stop, @@ -4090,17 +4299,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, static void *leaks_start(struct seq_file *m, loff_t *pos) { - loff_t n = *pos; - struct list_head *p; - mutex_lock(&cache_chain_mutex); - p = cache_chain.next; - while (n--) { - p = p->next; - if (p == &cache_chain) - return NULL; - } - return list_entry(p, struct kmem_cache, next); + return seq_list_start(&cache_chain, *pos); } static inline int add_caller(unsigned long *n, unsigned long v) @@ -4150,16 +4350,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) static void show_symbol(struct seq_file *m, unsigned long address) { #ifdef CONFIG_KALLSYMS - char *modname; - const char *name; unsigned long offset, size; - char namebuf[KSYM_NAME_LEN+1]; - - name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); + char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN]; - if (name) { + if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { seq_printf(m, "%s+%#lx/%#lx", name, offset, size); - if (modname) + if (modname[0]) seq_printf(m, " [%s]", modname); return; } @@ -4169,7 +4365,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) static int leaks_show(struct seq_file *m, void *p) { - struct kmem_cache *cachep = p; + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); struct slab *slabp; struct kmem_list3 *l3; const char *name; @@ -4227,7 +4423,7 @@ static int leaks_show(struct seq_file *m, void *p) return 0; } -struct seq_operations slabstats_op = { +const struct seq_operations slabstats_op = { .start = leaks_start, .next = s_next, .stop = s_stop, @@ -4248,9 +4444,9 @@ struct seq_operations slabstats_op = { * allocated with either kmalloc() or kmem_cache_alloc(). The object * must not be freed during the duration of the call. */ -unsigned int ksize(const void *objp) +size_t ksize(const void *objp) { - if (unlikely(objp == NULL)) + if (unlikely(ZERO_OR_NULL_PTR(objp))) return 0; return obj_size(virt_to_cache(objp));