X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fslub.c;h=8d71aaf888d770b27ba26df5d4d9592832a87b2d;hb=e071041be037eca208b62b84469a06bdfc692bea;hp=0863fd38a5ce06a87392f5ba753606a516a5d262;hpb=62e5c4b4d6351707346695fd9e151b6cda85cbe1;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/slub.c b/mm/slub.c index 0863fd3..8d71aaf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5,22 +5,29 @@ * The allocator synchronizes using per slab locks and only * uses a centralized lock to manage a pool of partial slabs. * - * (C) 2007 SGI, Christoph Lameter + * (C) 2007 SGI, Christoph Lameter */ #include +#include /* struct reclaim_state */ #include #include #include #include #include +#include #include +#include +#include #include #include #include #include +#include #include #include +#include +#include /* * Lock order: @@ -100,44 +107,12 @@ * the fast path and disables lockless freelists. */ -#define FROZEN (1 << PG_active) - #ifdef CONFIG_SLUB_DEBUG -#define SLABDEBUG (1 << PG_error) +#define SLABDEBUG 1 #else #define SLABDEBUG 0 #endif -static inline int SlabFrozen(struct page *page) -{ - return page->flags & FROZEN; -} - -static inline void SetSlabFrozen(struct page *page) -{ - page->flags |= FROZEN; -} - -static inline void ClearSlabFrozen(struct page *page) -{ - page->flags &= ~FROZEN; -} - -static inline int SlabDebug(struct page *page) -{ - return page->flags & SLABDEBUG; -} - -static inline void SetSlabDebug(struct page *page) -{ - page->flags |= SLABDEBUG; -} - -static inline void ClearSlabDebug(struct page *page) -{ - page->flags &= ~SLABDEBUG; -} - /* * Issues still to be resolved: * @@ -149,25 +124,6 @@ static inline void ClearSlabDebug(struct page *page) /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST -#if PAGE_SHIFT <= 12 - -/* - * Small page size. Make sure that we do not fragment memory - */ -#define DEFAULT_MAX_ORDER 1 -#define DEFAULT_MIN_OBJECTS 4 - -#else - -/* - * Large page machines are customarily able to handle larger - * page orders. - */ -#define DEFAULT_MAX_ORDER 2 -#define DEFAULT_MIN_OBJECTS 8 - -#endif - /* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -185,13 +141,20 @@ static inline void ClearSlabDebug(struct page *page) SLAB_POISON | SLAB_STORE_USER) /* + * Debugging flags that require metadata to be stored in the slab. These get + * disabled when slub_debug=O is used and a cache's min order increases with + * metadata. + */ +#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) + +/* * Set of flags that will prevent slab merging */ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU) + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA) + SLAB_CACHE_DMA | SLAB_NOTRACK) #ifndef ARCH_KMALLOC_MINALIGN #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) @@ -201,16 +164,13 @@ static inline void ClearSlabDebug(struct page *page) #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif +#define OO_SHIFT 16 +#define OO_MASK ((1 << OO_SHIFT) - 1) +#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ + /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ -#define __KMALLOC_CACHE 0x20000000 /* objects freed using kfree */ -#define __PAGE_ALLOC_FALLBACK 0x10000000 /* Allow fallback to page alloc */ - -/* Not all arches define cache_line_size */ -#ifndef cache_line_size -#define cache_line_size() L1_CACHE_BYTES -#endif static int kmem_size = sizeof(struct kmem_cache); @@ -233,7 +193,7 @@ static LIST_HEAD(slab_caches); * Tracking user of a slab. */ struct track { - void *addr; /* Called from address */ + unsigned long addr; /* Called from address */ int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@ -241,7 +201,7 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; -#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) +#ifdef CONFIG_SLUB_DEBUG static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); @@ -301,7 +261,7 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; base = page_address(page); - if (object < base || object >= base + s->objects * s->size || + if (object < base || object >= base + page->objects * s->size || (object - base) % s->size) { return 0; } @@ -327,8 +287,8 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) } /* Loop over all objects in a slab */ -#define for_each_object(__p, __s, __addr) \ - for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ +#define for_each_object(__p, __s, __addr, __objects) \ + for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) /* Scan freelist */ @@ -341,6 +301,26 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } +static inline struct kmem_cache_order_objects oo_make(int order, + unsigned long size) +{ + struct kmem_cache_order_objects x = { + (order << OO_SHIFT) + (PAGE_SIZE << order) / size + }; + + return x; +} + +static inline int oo_order(struct kmem_cache_order_objects x) +{ + return x.x >> OO_SHIFT; +} + +static inline int oo_objects(struct kmem_cache_order_objects x) +{ + return x.x & OO_MASK; +} + #ifdef CONFIG_SLUB_DEBUG /* * Debug settings: @@ -352,6 +332,7 @@ static int slub_debug; #endif static char *slub_debug_slabs; +static int disable_higher_order_debug; /* * Object debugging @@ -402,20 +383,14 @@ static struct track *get_track(struct kmem_cache *s, void *object, } static void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, void *addr) + enum track_item alloc, unsigned long addr) { - struct track *p; + struct track *p = get_track(s, object, alloc); - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; - - p += alloc; if (addr) { p->addr = addr; p->cpu = smp_processor_id(); - p->pid = current ? current->pid : -1; + p->pid = current->pid; p->when = jiffies; } else memset(p, 0, sizeof(struct track)); @@ -426,8 +401,8 @@ static void init_tracking(struct kmem_cache *s, void *object) if (!(s->flags & SLAB_STORE_USER)) return; - set_track(s, object, TRACK_FREE, NULL); - set_track(s, object, TRACK_ALLOC, NULL); + set_track(s, object, TRACK_FREE, 0UL); + set_track(s, object, TRACK_ALLOC, 0UL); } static void print_track(const char *s, struct track *t) @@ -435,9 +410,8 @@ static void print_track(const char *s, struct track *t) if (!t->addr) return; - printk(KERN_ERR "INFO: %s in ", s); - __print_symbol("%s", (unsigned long)t->addr); - printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); + printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); } static void print_tracking(struct kmem_cache *s, void *object) @@ -451,8 +425,8 @@ static void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n", - page, page->inuse, page->freelist, page->flags); + printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + page, page->objects, page->inuse, page->freelist, page->flags); } @@ -497,7 +471,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (p > addr + 16) print_section("Bytes b4", p - 16, 16); - print_section("Object", p, min(s->objsize, 128)); + print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) print_section("Redzone", p + s->objsize, @@ -521,7 +495,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) static void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason) { - slab_bug(s, reason); + slab_bug(s, "%s", reason); print_trailer(s, page, object); } @@ -533,7 +507,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - slab_bug(s, fmt); + slab_bug(s, "%s", buf); print_page_info(page); dump_stack(); } @@ -652,6 +626,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) p + off, POISON_INUSE, s->size - off); } +/* Check the pad bytes at the end of a slab page */ static int slab_pad_check(struct kmem_cache *s, struct page *page) { u8 *start; @@ -664,22 +639,22 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - end = start + (PAGE_SIZE << s->order); - length = s->objects * s->size; - remainder = end - (start + length); + length = (PAGE_SIZE << compound_order(page)); + end = start + length; + remainder = length % s->size; if (!remainder) return 1; - fault = check_bytes(start + length, POISON_INUSE, remainder); + fault = check_bytes(end - remainder, POISON_INUSE, remainder); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding", start, length); + print_section("Padding", end - remainder, remainder); - restore_bytes(s, "slab padding", POISON_INUSE, start, end); + restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; } @@ -727,7 +702,7 @@ static int check_object(struct kmem_cache *s, struct page *page, if (!check_valid_pointer(s, page, get_freepointer(s, p))) { object_err(s, page, p, "Freepointer corrupt"); /* - * No choice but to zap it and thus loose the remainder + * No choice but to zap it and thus lose the remainder * of the free objects in this slab. May cause * another error because the object count is now wrong. */ @@ -739,15 +714,24 @@ static int check_object(struct kmem_cache *s, struct page *page, static int check_slab(struct kmem_cache *s, struct page *page) { + int maxobj; + VM_BUG_ON(!irqs_disabled()); if (!PageSlab(page)) { slab_err(s, page, "Not a valid slab page"); return 0; } - if (page->inuse > s->objects) { + + maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + if (page->objects > maxobj) { + slab_err(s, page, "objects %u > max %u", + s->name, page->objects, maxobj); + return 0; + } + if (page->inuse > page->objects) { slab_err(s, page, "inuse %u > max %u", - s->name, page->inuse, s->objects); + s->name, page->inuse, page->objects); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -764,8 +748,9 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) int nr = 0; void *fp = page->freelist; void *object = NULL; + unsigned long max_objects; - while (fp && nr <= s->objects) { + while (fp && nr <= page->objects) { if (fp == search) return 1; if (!check_valid_pointer(s, page, fp)) { @@ -777,7 +762,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) } else { slab_err(s, page, "Freepointer corrupt"); page->freelist = NULL; - page->inuse = s->objects; + page->inuse = page->objects; slab_fix(s, "Freelist cleared"); return 0; } @@ -788,16 +773,27 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - if (page->inuse != s->objects - nr) { + max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + if (max_objects > MAX_OBJS_PER_PAGE) + max_objects = MAX_OBJS_PER_PAGE; + + if (page->objects != max_objects) { + slab_err(s, page, "Wrong number of objects. Found %d but " + "should be %d", page->objects, max_objects); + page->objects = max_objects; + slab_fix(s, "Number of objects adjusted."); + } + if (page->inuse != page->objects - nr) { slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", page->inuse, s->objects - nr); - page->inuse = s->objects - nr; + "counted were %d", page->inuse, page->objects - nr); + page->inuse = page->objects - nr; slab_fix(s, "Object count adjusted."); } return search == NULL; } -static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) +static void trace(struct kmem_cache *s, struct page *page, void *object, + int alloc) { if (s->flags & SLAB_TRACE) { printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", @@ -837,6 +833,43 @@ static void remove_full(struct kmem_cache *s, struct page *page) spin_unlock(&n->list_lock); } +/* Tracking of the number of slabs for debugging purposes */ +static inline unsigned long slabs_node(struct kmem_cache *s, int node) +{ + struct kmem_cache_node *n = get_node(s, node); + + return atomic_long_read(&n->nr_slabs); +} + +static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->nr_slabs); +} + +static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + /* + * May be called early in order to allocate a slab for the + * kmem_cache_node structure. Solve the chicken-egg + * dilemma by deferring the increment of the count during + * bootstrap (see early_kmem_cache_node_alloc). + */ + if (!NUMA_BUILD || n) { + atomic_long_inc(&n->nr_slabs); + atomic_long_add(objects, &n->total_objects); + } +} +static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + atomic_long_dec(&n->nr_slabs); + atomic_long_sub(objects, &n->total_objects); +} + +/* Object debug checks for alloc/free paths */ static void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) { @@ -848,7 +881,7 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, } static int alloc_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto bad; @@ -881,14 +914,14 @@ bad: * as used avoids touching the remaining objects. */ slab_fix(s, "Marking all objects used"); - page->inuse = s->objects; + page->inuse = page->objects; page->freelist = NULL; } return 0; } static int free_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto fail; @@ -922,7 +955,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, } /* Special debug activities for freeing objects */ - if (!SlabFrozen(page) && !page->freelist) + if (!PageSlubFrozen(page) && !page->freelist) remove_full(s, page); if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); @@ -951,6 +984,15 @@ static int __init setup_slub_debug(char *str) */ goto check_slabs; + if (tolower(*str) == 'o') { + /* + * Avoid enabling debugging on caches if its minimum order + * would increase as a result. + */ + disable_higher_order_debug = 1; + goto out; + } + slub_debug = 0; if (*str == '-') /* @@ -995,14 +1037,14 @@ __setup("slub_debug", setup_slub_debug); static unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { /* * Enable debugging if selected on the kernel commandline. */ if (slub_debug && (!slub_debug_slabs || - strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) - flags |= slub_debug; + !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) + flags |= slub_debug; return flags; } @@ -1011,10 +1053,10 @@ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@ -1023,34 +1065,89 @@ static inline int check_object(struct kmem_cache *s, struct page *page, static inline void add_full(struct kmem_cache_node *n, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { return flags; } #define slub_debug 0 + +#define disable_higher_order_debug 0 + +static inline unsigned long slabs_node(struct kmem_cache *s, int node) + { return 0; } +static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) + { return 0; } +static inline void inc_slabs_node(struct kmem_cache *s, int node, + int objects) {} +static inline void dec_slabs_node(struct kmem_cache *s, int node, + int objects) {} #endif + /* * Slab allocation and freeing */ +static inline struct page *alloc_slab_page(gfp_t flags, int node, + struct kmem_cache_order_objects oo) +{ + int order = oo_order(oo); + + flags |= __GFP_NOTRACK; + + if (node == -1) + return alloc_pages(flags, order); + else + return alloc_pages_node(node, flags, order); +} + static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; - int pages = 1 << s->order; + struct kmem_cache_order_objects oo = s->oo; + gfp_t alloc_gfp; flags |= s->allocflags; - if (node == -1) - page = alloc_pages(flags, s->order); - else - page = alloc_pages_node(node, flags, s->order); + /* + * Let the initial higher-order allocation fail under memory pressure + * so we fall-back to the minimum order allocation. + */ + alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; - if (!page) - return NULL; + page = alloc_slab_page(alloc_gfp, node, oo); + if (unlikely(!page)) { + oo = s->min; + /* + * Allocation may have failed due to fragmentation. + * Try a lower order alloc if possible + */ + page = alloc_slab_page(flags, node, oo); + if (!page) + return NULL; + + stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); + } + + if (kmemcheck_enabled + && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { + int pages = 1 << oo_order(oo); + + kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + + /* + * Objects from caches that have a constructor don't get + * cleared when they're allocated, so we need to do it here. + */ + if (s->ctor) + kmemcheck_mark_uninitialized_pages(page, pages); + else + kmemcheck_mark_unallocated_pages(page, pages); + } + page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - pages); + 1 << oo_order(oo)); return page; } @@ -1060,13 +1157,12 @@ static void setup_object(struct kmem_cache *s, struct page *page, { setup_object_debug(s, page, object); if (unlikely(s->ctor)) - s->ctor(s, object); + s->ctor(object); } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; - struct kmem_cache_node *n; void *start; void *last; void *p; @@ -1078,22 +1174,20 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; - n = get_node(s, page_to_nid(page)); - if (n) - atomic_long_inc(&n->nr_slabs); + inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; page->flags |= 1 << PG_slab; if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | SLAB_TRACE)) - SetSlabDebug(page); + __SetPageSlubDebug(page); start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << s->order); + memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); last = start; - for_each_object(p, s, start) { + for_each_object(p, s, start, page->objects) { setup_object(s, page, last); set_freepointer(s, last, p); last = p; @@ -1109,23 +1203,31 @@ out: static void __free_slab(struct kmem_cache *s, struct page *page) { - int pages = 1 << s->order; + int order = compound_order(page); + int pages = 1 << order; - if (unlikely(SlabDebug(page))) { + if (unlikely(SLABDEBUG && PageSlubDebug(page))) { void *p; slab_pad_check(s, page); - for_each_object(p, s, page_address(page)) + for_each_object(p, s, page_address(page), + page->objects) check_object(s, page, p, 0); - ClearSlabDebug(page); + __ClearPageSlubDebug(page); } + kmemcheck_free_shadow(page, compound_order(page)); + mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); - __free_pages(page, s->order); + __ClearPageSlab(page); + reset_page_mapcount(page); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += pages; + __free_pages(page, order); } static void rcu_free_slab(struct rcu_head *h) @@ -1151,11 +1253,7 @@ static void free_slab(struct kmem_cache *s, struct page *page) static void discard_slab(struct kmem_cache *s, struct page *page) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - atomic_long_dec(&n->nr_slabs); - reset_page_mapcount(page); - __ClearPageSlab(page); + dec_slabs_node(s, page_to_nid(page), page->objects); free_slab(s, page); } @@ -1195,8 +1293,7 @@ static void add_partial(struct kmem_cache_node *n, spin_unlock(&n->list_lock); } -static void remove_partial(struct kmem_cache *s, - struct page *page) +static void remove_partial(struct kmem_cache *s, struct page *page) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -1211,12 +1308,13 @@ static void remove_partial(struct kmem_cache *s, * * Must hold list_lock. */ -static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) +static inline int lock_and_freeze_slab(struct kmem_cache_node *n, + struct page *page) { if (slab_trylock(page)) { list_del(&page->lru); n->nr_partial--; - SetSlabFrozen(page); + __SetPageSlubFrozen(page); return 1; } return 0; @@ -1255,7 +1353,9 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) { #ifdef CONFIG_NUMA struct zonelist *zonelist; - struct zone **z; + struct zoneref *z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(flags); struct page *page; /* @@ -1280,15 +1380,14 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; - zonelist = &NODE_DATA( - slab_node(current->mempolicy))->node_zonelists[gfp_zone(flags)]; - for (z = zonelist->zones; *z; z++) { + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; - n = get_node(s, zone_to_nid(*z)); + n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed_hardwall(*z, flags) && - n->nr_partial > MIN_PARTIAL) { + if (n && cpuset_zone_allowed_hardwall(zone, flags) && + n->nr_partial > s->min_partial) { page = get_partial_node(n); if (page) return page; @@ -1325,7 +1424,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) struct kmem_cache_node *n = get_node(s, page_to_nid(page)); struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); - ClearSlabFrozen(page); + __ClearPageSlubFrozen(page); if (page->inuse) { if (page->freelist) { @@ -1333,13 +1432,14 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); } else { stat(c, DEACTIVATE_FULL); - if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) + if (SLABDEBUG && PageSlubDebug(page) && + (s->flags & SLAB_STORE_USER)) add_full(n, page); } slab_unlock(page); } else { stat(c, DEACTIVATE_EMPTY); - if (n->nr_partial < MIN_PARTIAL) { + if (n->nr_partial < s->min_partial) { /* * Adding an empty slab to the partial slabs in order * to avoid page allocator overhead. This slab needs @@ -1347,8 +1447,8 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) * so that the others get filled first. That way the * size of the partial list stays small. * - * kmem_cache_shrink can reclaim any empty slabs from the - * partial list. + * kmem_cache_shrink can reclaim any empty slabs from + * the partial list. */ add_partial(n, page, 1); slab_unlock(page); @@ -1368,7 +1468,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) struct page *page = c->page; int tail = 1; - if (c->freelist) + if (page->freelist) stat(c, DEACTIVATE_REMOTE_FREES); /* * Merge cpu freelist into slab freelist. Typically we get here @@ -1422,15 +1522,7 @@ static void flush_cpu_slab(void *d) static void flush_all(struct kmem_cache *s) { -#ifdef CONFIG_SMP - on_each_cpu(flush_cpu_slab, s, 1, 1); -#else - unsigned long flags; - - local_irq_save(flags); - flush_cpu_slab(s); - local_irq_restore(flags); -#endif + on_each_cpu(flush_cpu_slab, s, 1); } /* @@ -1446,6 +1538,69 @@ static inline int node_match(struct kmem_cache_cpu *c, int node) return 1; } +static int count_free(struct page *page) +{ + return page->objects - page->inuse; +} + +static unsigned long count_partial(struct kmem_cache_node *n, + int (*get_count)(struct page *)) +{ + unsigned long flags; + unsigned long x = 0; + struct page *page; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + x += get_count(page); + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ +#ifdef CONFIG_SLUB_DEBUG + return atomic_long_read(&n->total_objects); +#else + return 0; +#endif +} + +static noinline void +slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) +{ + int node; + + printk(KERN_WARNING + "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", + nid, gfpflags); + printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " + "default order: %d, min order: %d\n", s->name, s->objsize, + s->size, oo_order(s->oo), oo_order(s->min)); + + if (oo_order(s->min) > get_order(s->objsize)) + printk(KERN_WARNING " %s debugging increased min order, use " + "slub_debug=O to disable.\n", s->name); + + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + unsigned long nr_slabs; + unsigned long nr_objs; + unsigned long nr_free; + + if (!n) + continue; + + nr_free = count_partial(n, count_free); + nr_slabs = node_nr_slabs(n); + nr_objs = node_nr_objs(n); + + printk(KERN_WARNING + " node %d: slabs: %ld, objs: %ld, free: %ld\n", + node, nr_slabs, nr_objs, nr_free); + } +} + /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. @@ -1464,12 +1619,15 @@ static inline int node_match(struct kmem_cache_cpu *c, int node) * we need to allocate a new slab. This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. */ -static void *__slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c) { void **object; struct page *new; + /* We handle __GFP_ZERO in the caller */ + gfpflags &= ~__GFP_ZERO; + if (!c->page) goto new_slab; @@ -1483,11 +1641,11 @@ load_freelist: object = c->page->freelist; if (unlikely(!object)) goto another_slab; - if (unlikely(SlabDebug(c->page))) + if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) goto debug; c->freelist = object[c->offset]; - c->page->inuse = s->objects; + c->page->inuse = c->page->objects; c->page->freelist = NULL; c->node = page_to_nid(c->page); unlock_out: @@ -1520,25 +1678,12 @@ new_slab: if (c->page) flush_slab(s, c); slab_lock(new); - SetSlabFrozen(new); + __SetPageSlubFrozen(new); c->page = new; goto load_freelist; } - - /* - * No memory available. - * - * If the slab uses higher order allocs but the object is - * smaller than a page size then we can fallback in emergencies - * to the page allocator via kmalloc_large. The page allocator may - * have failed to obtain a higher order page and we can try to - * allocate a single page if the object fits into a single page. - * That is only possible if certain conditions are met that are being - * checked when a slab is created. - */ - if (!(gfpflags & __GFP_NORETRY) && (s->flags & __PAGE_ALLOC_FALLBACK)) - return kmalloc_large(s->objsize, gfpflags); - + if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) + slab_out_of_memory(s, gfpflags, node); return NULL; debug: if (!alloc_debug_processing(s, c->page, object, addr)) @@ -1561,14 +1706,24 @@ debug: * Otherwise we can simply pick the next object from the lockless free list. */ static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr) + gfp_t gfpflags, int node, unsigned long addr) { void **object; struct kmem_cache_cpu *c; unsigned long flags; + unsigned int objsize; + + gfpflags &= gfp_allowed_mask; + + lockdep_trace_alloc(gfpflags); + might_sleep_if(gfpflags & __GFP_WAIT); + + if (should_failslab(s->objsize, gfpflags)) + return NULL; local_irq_save(flags); c = get_cpu_slab(s, smp_processor_id()); + objsize = c->objsize; if (unlikely(!c->freelist || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); @@ -1580,26 +1735,56 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, } local_irq_restore(flags); - if (unlikely((gfpflags & __GFP_ZERO) && object)) - memset(object, 0, c->objsize); + if (unlikely(gfpflags & __GFP_ZERO) && object) + memset(object, 0, objsize); + + kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); + kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); return object; } void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); + void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); + + trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_TRACING +void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return slab_alloc(s, gfpflags, -1, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + trace_kmem_cache_alloc_node(_RET_IP_, ret, + s->objsize, s->size, gfpflags, node); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); #endif +#ifdef CONFIG_TRACING +void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return slab_alloc(s, gfpflags, node, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + /* * Slow patch handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@ -1609,7 +1794,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, void *addr, unsigned int offset) + void *x, unsigned long addr, unsigned int offset) { void *prior; void **object = (void *)x; @@ -1619,7 +1804,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(c, FREE_SLOWPATH); slab_lock(page); - if (unlikely(SlabDebug(page))) + if (unlikely(SLABDEBUG && PageSlubDebug(page))) goto debug; checks_ok: @@ -1627,7 +1812,7 @@ checks_ok: page->freelist = object; page->inuse--; - if (unlikely(SlabFrozen(page))) { + if (unlikely(PageSlubFrozen(page))) { stat(c, FREE_FROZEN); goto out_unlock; } @@ -1679,15 +1864,19 @@ debug: * with all sorts of special processing. */ static __always_inline void slab_free(struct kmem_cache *s, - struct page *page, void *x, void *addr) + struct page *page, void *x, unsigned long addr) { void **object = (void *)x; struct kmem_cache_cpu *c; unsigned long flags; + kmemleak_free_recursive(x, s->flags); local_irq_save(flags); c = get_cpu_slab(s, smp_processor_id()); + kmemcheck_slab_free(s, object, c->objsize); debug_check_no_locks_freed(object, c->objsize); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(object, c->objsize); if (likely(page == c->page && c->node >= 0)) { object[c->offset] = c->freelist; c->freelist = object; @@ -1704,11 +1893,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); - slab_free(s, page, x, __builtin_return_address(0)); + slab_free(s, page, x, _RET_IP_); + + trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); -/* Figure out on which slab object the object resides */ +/* Figure out on which slab page the object resides */ static struct page *get_object_page(const void *x) { struct page *page = virt_to_head_page(x); @@ -1739,8 +1930,8 @@ static struct page *get_object_page(const void *x) * take the list_lock. */ static int slub_min_order; -static int slub_max_order = DEFAULT_MAX_ORDER; -static int slub_min_objects = DEFAULT_MIN_OBJECTS; +static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static int slub_min_objects; /* * Merge control. If this is set then no merging of slab caches will occur. @@ -1755,7 +1946,7 @@ static int slub_nomerge; * system components. Generally order 0 allocations should be preferred since * order 0 does not cause fragmentation in the page allocator. Larger objects * be problematic to put into order 0 slabs because there may be too much - * unused space left. We go to a higher order if more than 1/8th of the slab + * unused space left. We go to a higher order if more than 1/16th of the slab * would be wasted. * * In order to reach satisfactory performance we must ensure that a minimum @@ -1780,6 +1971,9 @@ static inline int slab_order(int size, int min_objects, int rem; int min_order = slub_min_order; + if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + return get_order(size * MAX_OBJS_PER_PAGE) - 1; + for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); order <= max_order; order++) { @@ -1804,6 +1998,7 @@ static inline int calculate_order(int size) int order; int min_objects; int fraction; + int max_objects; /* * Attempt to find best configuration for a slab. This @@ -1814,8 +2009,13 @@ static inline int calculate_order(int size) * we reduce the minimum objects required in a slab. */ min_objects = slub_min_objects; + if (!min_objects) + min_objects = 4 * (fls(nr_cpu_ids) + 1); + max_objects = (PAGE_SIZE << slub_max_order)/size; + min_objects = min(min_objects, max_objects); + while (min_objects > 1) { - fraction = 8; + fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, slub_max_order, fraction); @@ -1823,7 +2023,7 @@ static inline int calculate_order(int size) return order; fraction /= 2; } - min_objects /= 2; + min_objects--; } /* @@ -1838,7 +2038,7 @@ static inline int calculate_order(int size) * Doh this slab cannot be placed using slub_max_order. */ order = slab_order(size, 1, MAX_ORDER, 1); - if (order <= MAX_ORDER) + if (order < MAX_ORDER) return order; return -ENOSYS; } @@ -1856,12 +2056,15 @@ static unsigned long calculate_alignment(unsigned long flags, * The hardware cache alignment cannot override the specified * alignment though. If that is greater then use it. */ - if ((flags & SLAB_HWCACHE_ALIGN) && - size > cache_line_size() / 2) - return max_t(unsigned long, align, cache_line_size()); + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned long ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } if (align < ARCH_SLAB_MINALIGN) - return ARCH_SLAB_MINALIGN; + align = ARCH_SLAB_MINALIGN; return ALIGN(align, sizeof(void *)); } @@ -1874,15 +2077,20 @@ static void init_kmem_cache_cpu(struct kmem_cache *s, c->node = 0; c->offset = s->offset / sizeof(void *); c->objsize = s->objsize; +#ifdef CONFIG_SLUB_STATS + memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); +#endif } -static void init_kmem_cache_node(struct kmem_cache_node *n) +static void +init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { n->nr_partial = 0; - atomic_long_set(&n->nr_slabs, 0); spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG + atomic_long_set(&n->nr_slabs, 0); + atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif } @@ -1905,11 +2113,11 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) */ #define NR_KMEM_CACHE_CPU 100 -static DEFINE_PER_CPU(struct kmem_cache_cpu, - kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; +static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], + kmem_cache_cpu); static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); -static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; +static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, int cpu, gfp_t flags) @@ -1935,7 +2143,7 @@ static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) { if (c < per_cpu(kmem_cache_cpu, cpu) || - c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { + c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { kfree(c); return; } @@ -1984,13 +2192,13 @@ static void init_alloc_cpu_cpu(int cpu) { int i; - if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) + if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) return; for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); - cpu_set(cpu, kmem_cach_cpu_free_init_once); + cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)); } static void __init init_alloc_cpu(void) @@ -2022,8 +2230,7 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) * when allocating for the kmalloc_node_cache. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. */ -static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, - int node) +static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) { struct page *page; struct kmem_cache_node *n; @@ -2050,8 +2257,8 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, init_object(kmalloc_caches, n, 1); init_tracking(kmalloc_caches, n); #endif - init_kmem_cache_node(n); - atomic_long_inc(&n->nr_slabs); + init_kmem_cache_node(n, kmalloc_caches); + inc_slabs_node(kmalloc_caches, node, page->objects); /* * lockdep requires consistent irq usage for each lock @@ -2061,7 +2268,6 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, local_irq_save(flags); add_partial(n, page, 0); local_irq_restore(flags); - return n; } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2093,8 +2299,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) n = &s->local_node; else { if (slab_state == DOWN) { - n = early_kmem_cache_node_alloc(gfpflags, - node); + early_kmem_cache_node_alloc(gfpflags, node); continue; } n = kmem_cache_alloc_node(kmalloc_caches, @@ -2107,7 +2312,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } s->node[node] = n; - init_kmem_cache_node(n); + init_kmem_cache_node(n, s); } return 1; } @@ -2118,20 +2323,30 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) { - init_kmem_cache_node(&s->local_node); + init_kmem_cache_node(&s->local_node, s); return 1; } #endif +static void set_min_partial(struct kmem_cache *s, unsigned long min) +{ + if (min < MIN_PARTIAL) + min = MIN_PARTIAL; + else if (min > MAX_PARTIAL) + min = MAX_PARTIAL; + s->min_partial = min; +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. */ -static int calculate_sizes(struct kmem_cache *s) +static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; unsigned long size = s->objsize; unsigned long align = s->align; + int order; /* * Round up object size to the next word boundary. We can only @@ -2195,7 +2410,7 @@ static int calculate_sizes(struct kmem_cache *s) * Add some empty padding so that we can catch * overwrites from earlier objects rather than let * tracking information or the free pointer be - * corrupted if an user writes before the start + * corrupted if a user writes before the start * of the object. */ size += sizeof(void *); @@ -2207,6 +2422,7 @@ static int calculate_sizes(struct kmem_cache *s) * on bootup. */ align = calculate_alignment(flags, align, s->objsize); + s->align = align; /* * SLUB stores one object immediately after another beginning from @@ -2215,26 +2431,16 @@ static int calculate_sizes(struct kmem_cache *s) */ size = ALIGN(size, align); s->size = size; + if (forced_order >= 0) + order = forced_order; + else + order = calculate_order(size); - if ((flags & __KMALLOC_CACHE) && - PAGE_SIZE / size < slub_min_objects) { - /* - * Kmalloc cache that would not have enough objects in - * an order 0 page. Kmalloc slabs can fallback to - * page allocator order 0 allocs so take a reasonably large - * order that will allows us a good number of objects. - */ - s->order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER); - s->flags |= __PAGE_ALLOC_FALLBACK; - s->allocflags |= __GFP_NOWARN; - } else - s->order = calculate_order(size); - - if (s->order < 0) + if (order < 0) return 0; s->allocflags = 0; - if (s->order) + if (order) s->allocflags |= __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) @@ -2246,16 +2452,19 @@ static int calculate_sizes(struct kmem_cache *s) /* * Determine the number of objects per slab */ - s->objects = (PAGE_SIZE << s->order) / size; + s->oo = oo_make(order, size); + s->min = oo_make(get_order(size), size); + if (oo_objects(s->oo) > oo_objects(s->max)) + s->max = s->oo; - return !!s->objects; + return !!oo_objects(s->oo); } static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { memset(s, 0, kmem_size); s->name = name; @@ -2264,12 +2473,29 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, s->align = align; s->flags = kmem_cache_flags(size, flags, name, ctor); - if (!calculate_sizes(s)) + if (!calculate_sizes(s, -1)) goto error; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. + */ + if (get_order(s->size) > get_order(s->objsize)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(s, -1)) + goto error; + } + } + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. + */ + set_min_partial(s, ilog2(s->size)); s->refcount = 1; #ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 100; + s->remote_node_defrag_ratio = 1000; #endif if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) goto error; @@ -2281,7 +2507,7 @@ error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)size, s->size, s->order, + s->name, (unsigned long)size, s->size, oo_order(s->oo), s->offset, flags); return 0; } @@ -2327,26 +2553,52 @@ const char *kmem_cache_name(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_name); +static void list_slab_objects(struct kmem_cache *s, struct page *page, + const char *text) +{ +#ifdef CONFIG_SLUB_DEBUG + void *addr = page_address(page); + void *p; + DECLARE_BITMAP(map, page->objects); + + bitmap_zero(map, page->objects); + slab_err(s, page, "%s", text); + slab_lock(page); + for_each_free_object(p, s, page->freelist) + set_bit(slab_index(p, s, addr), map); + + for_each_object(p, s, addr, page->objects) { + + if (!test_bit(slab_index(p, s, addr), map)) { + printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", + p, p - addr); + print_tracking(s, p); + } + } + slab_unlock(page); +#endif +} + /* - * Attempt to free all slabs on a node. Return the number of slabs we - * were unable to free. + * Attempt to free all partial slabs on a node. */ -static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, - struct list_head *list) +static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { - int slabs_inuse = 0; unsigned long flags; struct page *page, *h; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry_safe(page, h, list, lru) + list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { list_del(&page->lru); discard_slab(s, page); - } else - slabs_inuse++; + n->nr_partial--; + } else { + list_slab_objects(s, page, + "Objects remaining on kmem_cache_close()"); + } + } spin_unlock_irqrestore(&n->list_lock, flags); - return slabs_inuse; } /* @@ -2363,8 +2615,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); - n->nr_partial -= free_list(s, n, &n->partial); - if (atomic_long_read(&n->nr_slabs)) + free_partial(s, n); + if (n->nr_partial || slabs_node(s, node)) return 1; } free_kmem_cache_nodes(s); @@ -2382,8 +2634,13 @@ void kmem_cache_destroy(struct kmem_cache *s) if (!s->refcount) { list_del(&s->list); up_write(&slub_lock); - if (kmem_cache_close(s)) - WARN_ON(1); + if (kmem_cache_close(s)) { + printk(KERN_ERR "SLUB %s: %s called for cache that " + "still has objects.\n", s->name, __func__); + dump_stack(); + } + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); sysfs_slab_remove(s); } else up_write(&slub_lock); @@ -2394,13 +2651,9 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; +struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); -#ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; -#endif - static int __init setup_slub_min_order(char *str) { get_option(&str, &slub_min_order); @@ -2413,6 +2666,7 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { get_option(&str, &slub_max_order); + slub_max_order = min(slub_max_order, MAX_ORDER - 1); return 1; } @@ -2444,13 +2698,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, if (gfp_flags & SLUB_DMA) flags = SLAB_CACHE_DMA; - down_write(&slub_lock); + /* + * This function is called with IRQs disabled during early-boot on + * single CPU so there's no need to take slub_lock here. + */ if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, - flags | __KMALLOC_CACHE, NULL)) + flags, NULL)) goto panic; list_add(&s->list, &slab_caches); - up_write(&slub_lock); + if (sysfs_slab_add(s)) goto panic; return s; @@ -2460,6 +2717,7 @@ panic: } #ifdef CONFIG_ZONE_DMA +static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; static void sysfs_add_func(struct work_struct *w) { @@ -2482,6 +2740,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) struct kmem_cache *s; char *text; size_t realsize; + unsigned long slabflags; s = kmalloc_caches_dma[index]; if (s) @@ -2503,9 +2762,18 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) (unsigned int)realsize); s = kmalloc(kmem_size, flags & ~SLUB_DMA); + /* + * Must defer sysfs creation to a workqueue because we don't know + * what context we are called from. Before sysfs comes up, we don't + * need to do anything because our sysfs initcall will start by + * adding all existing slabs to sysfs. + */ + slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK; + if (slab_state >= SYSFS) + slabflags |= __SYSFS_ADD_DEFERRED; + if (!s || !text || !kmem_cache_open(s, flags, text, - realsize, ARCH_KMALLOC_MINALIGN, - SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { + realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { kfree(s); kfree(text); goto unlock_out; @@ -2514,7 +2782,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) list_add(&s->list, &slab_caches); kmalloc_caches_dma[index] = s; - schedule_work(&sysfs_add_work); + if (slab_state >= SYSFS) + schedule_work(&sysfs_add_work); unlock_out: up_write(&slub_lock); @@ -2556,6 +2825,11 @@ static s8 size_index[24] = { 2 /* 192 */ }; +static inline int size_index_elem(size_t bytes) +{ + return (bytes - 1) / 8; +} + static struct kmem_cache *get_slab(size_t size, gfp_t flags) { int index; @@ -2564,7 +2838,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) if (!size) return ZERO_SIZE_PTR; - index = size_index[(size - 1) / 8]; + index = size_index[size_index_elem(size)]; } else index = fls(size - 1); @@ -2579,8 +2853,9 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -2588,35 +2863,54 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, -1, __builtin_return_address(0)); + ret = slab_alloc(s, flags, -1, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, size, s->size, flags); + + return ret; } EXPORT_SYMBOL(__kmalloc); static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { - struct page *page = alloc_pages_node(node, flags | __GFP_COMP, - get_order(size)); + struct page *page; + void *ptr = NULL; + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_pages_node(node, flags, get_order(size)); if (page) - return page_address(page); - else - return NULL; + ptr = page_address(page); + + kmemleak_alloc(ptr, size, 1, flags); + return ptr; } #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large_node(size, flags, node); + if (unlikely(size > SLUB_MAX_SIZE)) { + ret = kmalloc_large_node(size, flags, node); + + trace_kmalloc_node(_RET_IP_, ret, + size, PAGE_SIZE << get_order(size), + flags, node); + + return ret; + } s = get_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, node, __builtin_return_address(0)); + ret = slab_alloc(s, flags, node, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); + + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif @@ -2631,9 +2925,10 @@ size_t ksize(const void *object) page = virt_to_head_page(object); - if (unlikely(!PageSlab(page))) + if (unlikely(!PageSlab(page))) { + WARN_ON(!PageCompound(page)); return PAGE_SIZE << compound_order(page); - + } s = page->slab; #ifdef CONFIG_SLUB_DEBUG @@ -2664,31 +2959,22 @@ void kfree(const void *x) struct page *page; void *object = (void *)x; + trace_kfree(_RET_IP_, x); + if (unlikely(ZERO_OR_NULL_PTR(x))) return; page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { + BUG_ON(!PageCompound(page)); + kmemleak_free(x); put_page(page); return; } - slab_free(page->slab, page, object, __builtin_return_address(0)); + slab_free(page->slab, page, object, _RET_IP_); } EXPORT_SYMBOL(kfree); -static unsigned long count_partial(struct kmem_cache_node *n) -{ - unsigned long flags; - unsigned long x = 0; - struct page *page; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) - x += page->inuse; - spin_unlock_irqrestore(&n->list_lock, flags); - return x; -} - /* * kmem_cache_shrink removes empty slabs from the partial lists and sorts * the remaining slabs by the number of items in use. The slabs with the @@ -2706,8 +2992,9 @@ int kmem_cache_shrink(struct kmem_cache *s) struct kmem_cache_node *n; struct page *page; struct page *t; + int objects = oo_objects(s->max); struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); + kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); unsigned long flags; if (!slabs_by_inuse) @@ -2720,7 +3007,7 @@ int kmem_cache_shrink(struct kmem_cache *s) if (!n->nr_partial) continue; - for (i = 0; i < s->objects; i++) + for (i = 0; i < objects; i++) INIT_LIST_HEAD(slabs_by_inuse + i); spin_lock_irqsave(&n->list_lock, flags); @@ -2752,7 +3039,7 @@ int kmem_cache_shrink(struct kmem_cache *s) * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. */ - for (i = s->objects - 1; i >= 0; i--) + for (i = objects - 1; i >= 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); spin_unlock_irqrestore(&n->list_lock, flags); @@ -2802,7 +3089,7 @@ static void slab_mem_offline_callback(void *arg) * and offline_pages() function shoudn't call this * callback. So, we must fail. */ - BUG_ON(atomic_long_read(&n->nr_slabs)); + BUG_ON(slabs_node(s, offline_node)); s->node[offline_node] = NULL; kmem_cache_free(kmalloc_caches, n); @@ -2827,7 +3114,7 @@ static int slab_mem_going_online_callback(void *arg) return 0; /* - * We are bringing a node online. No memory is availabe yet. We must + * We are bringing a node online. No memory is available yet. We must * allocate a kmem_cache_node structure in order to bring the node * online. */ @@ -2843,7 +3130,7 @@ static int slab_mem_going_online_callback(void *arg) ret = -ENOMEM; goto out; } - init_kmem_cache_node(n); + init_kmem_cache_node(n, s); s->node[nid] = n; } out: @@ -2871,8 +3158,10 @@ static int slab_memory_callback(struct notifier_block *self, case MEM_CANCEL_OFFLINE: break; } - - ret = notifier_from_errno(ret); + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; return ret; } @@ -2896,31 +3185,31 @@ void __init kmem_cache_init(void) * kmem_cache_open for slab_state == DOWN. */ create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", - sizeof(struct kmem_cache_node), GFP_KERNEL); + sizeof(struct kmem_cache_node), GFP_NOWAIT); kmalloc_caches[0].refcount = -1; caches++; - hotplug_memory_notifier(slab_memory_callback, 1); + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); #endif /* Able to allocate the per node structures */ slab_state = PARTIAL; /* Caches that are not of the two-to-the-power-of size */ - if (KMALLOC_MIN_SIZE <= 64) { + if (KMALLOC_MIN_SIZE <= 32) { create_kmalloc_cache(&kmalloc_caches[1], - "kmalloc-96", 96, GFP_KERNEL); + "kmalloc-96", 96, GFP_NOWAIT); caches++; } - if (KMALLOC_MIN_SIZE <= 128) { + if (KMALLOC_MIN_SIZE <= 64) { create_kmalloc_cache(&kmalloc_caches[2], - "kmalloc-192", 192, GFP_KERNEL); + "kmalloc-192", 192, GFP_NOWAIT); caches++; } - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], - "kmalloc", 1 << i, GFP_KERNEL); + "kmalloc", 1 << i, GFP_NOWAIT); caches++; } @@ -2939,15 +3228,36 @@ void __init kmem_cache_init(void) BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); - for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) - size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; + for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { + int elem = size_index_elem(i); + if (elem >= ARRAY_SIZE(size_index)) + break; + size_index[elem] = KMALLOC_SHIFT_LOW; + } + + if (KMALLOC_MIN_SIZE == 64) { + /* + * The 96 byte size cache is not used if the alignment + * is 64 byte. + */ + for (i = 64 + 8; i <= 96; i += 8) + size_index[size_index_elem(i)] = 7; + } else if (KMALLOC_MIN_SIZE == 128) { + /* + * The 192 byte sized cache is not used if the alignment + * is 128 byte. Redirect kmalloc to use the 256 byte cache + * instead. + */ + for (i = 128 + 8; i <= 192; i += 8) + size_index[size_index_elem(i)] = 8; + } slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) kmalloc_caches[i]. name = - kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); + kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); @@ -2965,6 +3275,10 @@ void __init kmem_cache_init(void) nr_cpu_ids, nr_node_ids); } +void __init kmem_cache_init_late(void) +{ +} + /* * Find a mergeable slab cache */ @@ -2973,9 +3287,6 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; - if ((s->flags & __PAGE_ALLOC_FALLBACK)) - return 1; - if (s->ctor) return 1; @@ -2990,7 +3301,7 @@ static int slab_unmergeable(struct kmem_cache *s) static struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { struct kmem_cache *s; @@ -3030,11 +3341,13 @@ static struct kmem_cache *find_mergeable(size_t size, } struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; + if (WARN_ON(!name)) + return NULL; + down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { @@ -3057,8 +3370,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); up_write(&slub_lock); - if (sysfs_slab_alias(s, name)) + if (sysfs_slab_alias(s, name)) { + down_write(&slub_lock); + s->refcount--; + up_write(&slub_lock); goto err; + } return s; } @@ -3068,8 +3385,13 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size, align, flags, ctor)) { list_add(&s->list, &slab_caches); up_write(&slub_lock); - if (sysfs_slab_add(s)) + if (sysfs_slab_add(s)) { + down_write(&slub_lock); + list_del(&s->list); + up_write(&slub_lock); + kfree(s); goto err; + } return s; } kfree(s); @@ -3136,11 +3458,12 @@ static struct notifier_block __cpuinitdata slab_notifier = { #endif -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, gfpflags); s = get_slab(size, gfpflags); @@ -3148,15 +3471,21 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, -1, caller); + ret = slab_alloc(s, gfpflags, -1, caller); + + /* Honor the call site pointer we recieved. */ + trace_kmalloc(caller, ret, size, s->size, gfpflags); + + return ret; } void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, void *caller) + int node, unsigned long caller) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, gfpflags, node); s = get_slab(size, gfpflags); @@ -3164,10 +3493,25 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, node, caller); + ret = slab_alloc(s, gfpflags, node, caller); + + /* Honor the call site pointer we recieved. */ + trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); + + return ret; +} + +#ifdef CONFIG_SLUB_DEBUG +static int count_inuse(struct page *page) +{ + return page->inuse; +} + +static int count_total(struct page *page) +{ + return page->objects; } -#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) static int validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { @@ -3179,7 +3523,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page, return 0; /* Now we know that a valid freelist exists */ - bitmap_zero(map, s->objects); + bitmap_zero(map, page->objects); for_each_free_object(p, s, page->freelist) { set_bit(slab_index(p, s, addr), map); @@ -3187,7 +3531,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page, return 0; } - for_each_object(p, s, addr) + for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) if (!check_object(s, page, p, 1)) return 0; @@ -3205,12 +3549,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page, s->name, page); if (s->flags & DEBUG_DEFAULT_FLAGS) { - if (!SlabDebug(page)) - printk(KERN_ERR "SLUB %s: SlabDebug not set " + if (!PageSlubDebug(page)) + printk(KERN_ERR "SLUB %s: SlubDebug not set " "on slab 0x%p\n", s->name, page); } else { - if (SlabDebug(page)) - printk(KERN_ERR "SLUB %s: SlabDebug set on " + if (PageSlubDebug(page)) + printk(KERN_ERR "SLUB %s: SlubDebug set on " "slab 0x%p\n", s->name, page); } } @@ -3253,7 +3597,7 @@ static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) * + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); if (!map) @@ -3334,13 +3678,13 @@ static void resiliency_test(void) {}; struct location { unsigned long count; - void *addr; + unsigned long addr; long long sum_time; long min_time; long max_time; long min_pid; long max_pid; - cpumask_t cpus; + DECLARE_BITMAP(cpus, NR_CPUS); nodemask_t nodes; }; @@ -3382,7 +3726,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, { long start, end, pos; struct location *l; - void *caddr; + unsigned long caddr; unsigned long age = jiffies - track->when; start = -1; @@ -3415,7 +3759,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, if (track->pid > l->max_pid) l->max_pid = track->pid; - cpu_set(track->cpu, l->cpus); + cpumask_set_cpu(track->cpu, + to_cpumask(l->cpus)); } node_set(page_to_nid(virt_to_page(track)), l->nodes); return 1; @@ -3445,8 +3790,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, l->max_time = age; l->min_pid = track->pid; l->max_pid = track->pid; - cpus_clear(l->cpus); - cpu_set(track->cpu, l->cpus); + cpumask_clear(to_cpumask(l->cpus)); + cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); nodes_clear(l->nodes); node_set(page_to_nid(virt_to_page(track)), l->nodes); return 1; @@ -3456,14 +3801,14 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, struct page *page, enum track_item alloc) { void *addr = page_address(page); - DECLARE_BITMAP(map, s->objects); + DECLARE_BITMAP(map, page->objects); void *p; - bitmap_zero(map, s->objects); + bitmap_zero(map, page->objects); for_each_free_object(p, s, page->freelist) set_bit(slab_index(p, s, addr), map); - for_each_object(p, s, addr) + for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) add_location(t, s, get_track(s, p, alloc)); } @@ -3502,7 +3847,7 @@ static int list_locations(struct kmem_cache *s, char *buf, for (i = 0; i < t.count; i++) { struct location *l = &t.loc[i]; - if (len > PAGE_SIZE - 100) + if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) break; len += sprintf(buf + len, "%7ld ", l->count); @@ -3512,12 +3857,10 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf + len, ""); if (l->sum_time != l->min_time) { - unsigned long remainder; - len += sprintf(buf + len, " age=%ld/%ld/%ld", - l->min_time, - div_long_long_rem(l->sum_time, l->count, &remainder), - l->max_time); + l->min_time, + (long)div_u64(l->sum_time, l->count), + l->max_time); } else len += sprintf(buf + len, " age=%ld", l->min_time); @@ -3529,14 +3872,15 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf + len, " pid=%ld", l->min_pid); - if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && + if (num_online_cpus() > 1 && + !cpumask_empty(to_cpumask(l->cpus)) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " cpus="); len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, - l->cpus); + to_cpumask(l->cpus)); } - if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && + if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " nodes="); len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, @@ -3553,22 +3897,23 @@ static int list_locations(struct kmem_cache *s, char *buf, } enum slab_stat_type { - SL_FULL, - SL_PARTIAL, - SL_CPU, - SL_OBJECTS + SL_ALL, /* All slabs */ + SL_PARTIAL, /* Only partially allocated slabs */ + SL_CPU, /* Only slabs used for cpu caches */ + SL_OBJECTS, /* Determine allocated objects not slabs */ + SL_TOTAL /* Determine object capacity not slabs */ }; -#define SO_FULL (1 << SL_FULL) +#define SO_ALL (1 << SL_ALL) #define SO_PARTIAL (1 << SL_PARTIAL) #define SO_CPU (1 << SL_CPU) #define SO_OBJECTS (1 << SL_OBJECTS) +#define SO_TOTAL (1 << SL_TOTAL) static ssize_t show_slab_objects(struct kmem_cache *s, char *buf, unsigned long flags) { unsigned long total = 0; - int cpu; int node; int x; unsigned long *nodes; @@ -3579,56 +3924,60 @@ static ssize_t show_slab_objects(struct kmem_cache *s, return -ENOMEM; per_cpu = nodes + nr_node_ids; - for_each_possible_cpu(cpu) { - struct page *page; - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + if (flags & SO_CPU) { + int cpu; - if (!c) - continue; + for_each_possible_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - page = c->page; - node = c->node; - if (node < 0) - continue; - if (page) { - if (flags & SO_CPU) { - if (flags & SO_OBJECTS) - x = page->inuse; + if (!c || c->node < 0) + continue; + + if (c->page) { + if (flags & SO_TOTAL) + x = c->page->objects; + else if (flags & SO_OBJECTS) + x = c->page->inuse; else x = 1; + total += x; - nodes[node] += x; + nodes[c->node] += x; } - per_cpu[node]++; + per_cpu[c->node]++; } } - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + if (flags & SO_ALL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + + if (flags & SO_TOTAL) + x = atomic_long_read(&n->total_objects); + else if (flags & SO_OBJECTS) + x = atomic_long_read(&n->total_objects) - + count_partial(n, count_free); - if (flags & SO_PARTIAL) { - if (flags & SO_OBJECTS) - x = count_partial(n); else - x = n->nr_partial; + x = atomic_long_read(&n->nr_slabs); total += x; nodes[node] += x; } - if (flags & SO_FULL) { - int full_slabs = atomic_long_read(&n->nr_slabs) - - per_cpu[node] - - n->nr_partial; + } else if (flags & SO_PARTIAL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); - if (flags & SO_OBJECTS) - x = full_slabs * s->objects; + if (flags & SO_TOTAL) + x = count_partial(n, count_total); + else if (flags & SO_OBJECTS) + x = count_partial(n, count_inuse); else - x = full_slabs; + x = n->nr_partial; total += x; nodes[node] += x; } } - x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA for_each_node_state(node, N_NORMAL_MEMORY) @@ -3643,14 +3992,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s, static int any_slab_objects(struct kmem_cache *s) { int node; - int cpu; - - for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c && c->page) - return 1; - } for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -3658,7 +3999,7 @@ static int any_slab_objects(struct kmem_cache *s) if (!n) continue; - if (n->nr_partial || atomic_long_read(&n->nr_slabs)) + if (atomic_long_read(&n->total_objects)) return 1; } return 0; @@ -3700,15 +4041,52 @@ SLAB_ATTR_RO(object_size); static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->objects); + return sprintf(buf, "%d\n", oo_objects(s->oo)); } SLAB_ATTR_RO(objs_per_slab); +static ssize_t order_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + unsigned long order; + int err; + + err = strict_strtoul(buf, 10, &order); + if (err) + return err; + + if (order > slub_max_order || order < slub_min_order) + return -EINVAL; + + calculate_sizes(s, order); + return length; +} + static ssize_t order_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->order); + return sprintf(buf, "%d\n", oo_order(s->oo)); } -SLAB_ATTR_RO(order); +SLAB_ATTR(order); + +static ssize_t min_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%lu\n", s->min_partial); +} + +static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long min; + int err; + + err = strict_strtoul(buf, 10, &min); + if (err) + return err; + + set_min_partial(s, min); + return length; +} +SLAB_ATTR(min_partial); static ssize_t ctor_show(struct kmem_cache *s, char *buf) { @@ -3729,7 +4107,7 @@ SLAB_ATTR_RO(aliases); static ssize_t slabs_show(struct kmem_cache *s, char *buf) { - return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); + return show_slab_objects(s, buf, SO_ALL); } SLAB_ATTR_RO(slabs); @@ -3747,10 +4125,22 @@ SLAB_ATTR_RO(cpu_slabs); static ssize_t objects_show(struct kmem_cache *s, char *buf) { - return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); + return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); } SLAB_ATTR_RO(objects); +static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); +} +SLAB_ATTR_RO(objects_partial); + +static ssize_t total_objects_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); +} +SLAB_ATTR_RO(total_objects); + static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); @@ -3830,7 +4220,7 @@ static ssize_t red_zone_store(struct kmem_cache *s, s->flags &= ~SLAB_RED_ZONE; if (buf[0] == '1') s->flags |= SLAB_RED_ZONE; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(red_zone); @@ -3849,7 +4239,7 @@ static ssize_t poison_store(struct kmem_cache *s, s->flags &= ~SLAB_POISON; if (buf[0] == '1') s->flags |= SLAB_POISON; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(poison); @@ -3868,7 +4258,7 @@ static ssize_t store_user_store(struct kmem_cache *s, s->flags &= ~SLAB_STORE_USER; if (buf[0] == '1') s->flags |= SLAB_STORE_USER; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(store_user); @@ -3936,10 +4326,16 @@ static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, const char *buf, size_t length) { - int n = simple_strtoul(buf, NULL, 10); + unsigned long ratio; + int err; + + err = strict_strtoul(buf, 10, &ratio); + if (err) + return err; + + if (ratio <= 100) + s->remote_node_defrag_ratio = ratio * 10; - if (n < 100) - s->remote_node_defrag_ratio = n * 10; return length; } SLAB_ATTR(remote_node_defrag_ratio); @@ -3965,20 +4361,38 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) len = sprintf(buf, "%lu", sum); +#ifdef CONFIG_SMP for_each_online_cpu(cpu) { if (data[cpu] && len < PAGE_SIZE - 20) - len += sprintf(buf + len, " c%d=%u", cpu, data[cpu]); + len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); } +#endif kfree(data); return len + sprintf(buf + len, "\n"); } +static void clear_stat(struct kmem_cache *s, enum stat_item si) +{ + int cpu; + + for_each_online_cpu(cpu) + get_cpu_slab(s, cpu)->stat[si] = 0; +} + #define STAT_ATTR(si, text) \ static ssize_t text##_show(struct kmem_cache *s, char *buf) \ { \ return show_stat(s, buf, si); \ } \ -SLAB_ATTR_RO(text); \ +static ssize_t text##_store(struct kmem_cache *s, \ + const char *buf, size_t length) \ +{ \ + if (buf[0] != '0') \ + return -EINVAL; \ + clear_stat(s, si); \ + return length; \ +} \ +SLAB_ATTR(text); \ STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); @@ -3997,7 +4411,7 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); - +STAT_ATTR(ORDER_FALLBACK, order_fallback); #endif static struct attribute *slab_attrs[] = { @@ -4005,7 +4419,10 @@ static struct attribute *slab_attrs[] = { &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &min_partial_attr.attr, &objects_attr.attr, + &objects_partial_attr.attr, + &total_objects_attr.attr, &slabs_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, @@ -4048,6 +4465,7 @@ static struct attribute *slab_attrs[] = { &deactivate_to_head_attr.attr, &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, + &order_fallback_attr.attr, #endif NULL }; @@ -4153,6 +4571,8 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'a'; if (s->flags & SLAB_DEBUG_FREE) *p++ = 'F'; + if (!(s->flags & SLAB_NOTRACK)) + *p++ = 't'; if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); @@ -4195,8 +4615,11 @@ static int sysfs_slab_add(struct kmem_cache *s) } err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) + if (err) { + kobject_del(&s->kobj); + kobject_put(&s->kobj); return err; + } kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ @@ -4215,7 +4638,7 @@ static void sysfs_slab_remove(struct kmem_cache *s) /* * Need to buffer aliases during bootup until sysfs becomes - * available lest we loose that information. + * available lest we lose that information. */ struct saved_alias { struct kmem_cache *s; @@ -4290,14 +4713,6 @@ __initcall(slab_sysfs_init); * The /proc/slabinfo ABI */ #ifdef CONFIG_SLABINFO - -ssize_t slabinfo_write(struct file *file, const char __user * buffer, - size_t count, loff_t *ppos) -{ - return -EINVAL; -} - - static void print_slabinfo_header(struct seq_file *m) { seq_puts(m, "slabinfo - version: 2.1\n"); @@ -4334,7 +4749,8 @@ static int s_show(struct seq_file *m, void *p) unsigned long nr_partials = 0; unsigned long nr_slabs = 0; unsigned long nr_inuse = 0; - unsigned long nr_objs; + unsigned long nr_objs = 0; + unsigned long nr_free = 0; struct kmem_cache *s; int node; @@ -4348,14 +4764,15 @@ static int s_show(struct seq_file *m, void *p) nr_partials += n->nr_partial; nr_slabs += atomic_long_read(&n->nr_slabs); - nr_inuse += count_partial(n); + nr_objs += atomic_long_read(&n->total_objects); + nr_free += count_partial(n, count_free); } - nr_objs = nr_slabs * s->objects; - nr_inuse += (nr_slabs - nr_partials) * s->objects; + nr_inuse = nr_objs - nr_free; seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, - nr_objs, s->size, s->objects, (1 << s->order)); + nr_objs, s->size, oo_objects(s->oo), + (1 << oo_order(s->oo))); seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, 0UL); @@ -4363,11 +4780,29 @@ static int s_show(struct seq_file *m, void *p) return 0; } -const struct seq_operations slabinfo_op = { +static const struct seq_operations slabinfo_op = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show, }; +static int slabinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slabinfo_op); +} + +static const struct file_operations proc_slabinfo_operations = { + .open = slabinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init slab_proc_init(void) +{ + proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); + return 0; +} +module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */