X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fslub.c;h=8d71aaf888d770b27ba26df5d4d9592832a87b2d;hb=e071041be037eca208b62b84469a06bdfc692bea;hp=a660834416ac89b3f16279200d45f6d64ecdfe9e;hpb=064287807c9dd64688084d34c6748a326b5f3ec8;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/slub.c b/mm/slub.c index a660834..8d71aaf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5,22 +5,29 @@ * The allocator synchronizes using per slab locks and only * uses a centralized lock to manage a pool of partial slabs. * - * (C) 2007 SGI, Christoph Lameter + * (C) 2007 SGI, Christoph Lameter */ #include +#include /* struct reclaim_state */ #include #include #include #include #include +#include #include +#include +#include #include #include #include #include +#include #include #include +#include +#include /* * Lock order: @@ -100,44 +107,12 @@ * the fast path and disables lockless freelists. */ -#define FROZEN (1 << PG_active) - #ifdef CONFIG_SLUB_DEBUG -#define SLABDEBUG (1 << PG_error) +#define SLABDEBUG 1 #else #define SLABDEBUG 0 #endif -static inline int SlabFrozen(struct page *page) -{ - return page->flags & FROZEN; -} - -static inline void SetSlabFrozen(struct page *page) -{ - page->flags |= FROZEN; -} - -static inline void ClearSlabFrozen(struct page *page) -{ - page->flags &= ~FROZEN; -} - -static inline int SlabDebug(struct page *page) -{ - return page->flags & SLABDEBUG; -} - -static inline void SetSlabDebug(struct page *page) -{ - page->flags |= SLABDEBUG; -} - -static inline void ClearSlabDebug(struct page *page) -{ - page->flags &= ~SLABDEBUG; -} - /* * Issues still to be resolved: * @@ -149,25 +124,6 @@ static inline void ClearSlabDebug(struct page *page) /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST -#if PAGE_SHIFT <= 12 - -/* - * Small page size. Make sure that we do not fragment memory - */ -#define DEFAULT_MAX_ORDER 1 -#define DEFAULT_MIN_OBJECTS 4 - -#else - -/* - * Large page machines are customarily able to handle larger - * page orders. - */ -#define DEFAULT_MAX_ORDER 2 -#define DEFAULT_MIN_OBJECTS 8 - -#endif - /* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -185,13 +141,20 @@ static inline void ClearSlabDebug(struct page *page) SLAB_POISON | SLAB_STORE_USER) /* + * Debugging flags that require metadata to be stored in the slab. These get + * disabled when slub_debug=O is used and a cache's min order increases with + * metadata. + */ +#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) + +/* * Set of flags that will prevent slab merging */ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU) + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA) + SLAB_CACHE_DMA | SLAB_NOTRACK) #ifndef ARCH_KMALLOC_MINALIGN #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) @@ -201,15 +164,14 @@ static inline void ClearSlabDebug(struct page *page) #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif +#define OO_SHIFT 16 +#define OO_MASK ((1 << OO_SHIFT) - 1) +#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ + /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ -/* Not all arches define cache_line_size */ -#ifndef cache_line_size -#define cache_line_size() L1_CACHE_BYTES -#endif - static int kmem_size = sizeof(struct kmem_cache); #ifdef CONFIG_SMP @@ -231,7 +193,7 @@ static LIST_HEAD(slab_caches); * Tracking user of a slab. */ struct track { - void *addr; /* Called from address */ + unsigned long addr; /* Called from address */ int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@ -239,10 +201,11 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; -#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) +#ifdef CONFIG_SLUB_DEBUG static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); + #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) @@ -251,8 +214,16 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) { kfree(s); } + #endif +static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) +{ +#ifdef CONFIG_SLUB_STATS + c->stat[si]++; +#endif +} + /******************************************************************** * Core slab cache functions *******************************************************************/ @@ -280,6 +251,7 @@ static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) #endif } +/* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) { @@ -289,7 +261,7 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; base = page_address(page); - if (object < base || object >= base + s->objects * s->size || + if (object < base || object >= base + page->objects * s->size || (object - base) % s->size) { return 0; } @@ -315,8 +287,8 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) } /* Loop over all objects in a slab */ -#define for_each_object(__p, __s, __addr) \ - for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ +#define for_each_object(__p, __s, __addr, __objects) \ + for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) /* Scan freelist */ @@ -329,6 +301,26 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } +static inline struct kmem_cache_order_objects oo_make(int order, + unsigned long size) +{ + struct kmem_cache_order_objects x = { + (order << OO_SHIFT) + (PAGE_SIZE << order) / size + }; + + return x; +} + +static inline int oo_order(struct kmem_cache_order_objects x) +{ + return x.x >> OO_SHIFT; +} + +static inline int oo_objects(struct kmem_cache_order_objects x) +{ + return x.x & OO_MASK; +} + #ifdef CONFIG_SLUB_DEBUG /* * Debug settings: @@ -340,6 +332,7 @@ static int slub_debug; #endif static char *slub_debug_slabs; +static int disable_higher_order_debug; /* * Object debugging @@ -390,20 +383,14 @@ static struct track *get_track(struct kmem_cache *s, void *object, } static void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, void *addr) + enum track_item alloc, unsigned long addr) { - struct track *p; - - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; + struct track *p = get_track(s, object, alloc); - p += alloc; if (addr) { p->addr = addr; p->cpu = smp_processor_id(); - p->pid = current ? current->pid : -1; + p->pid = current->pid; p->when = jiffies; } else memset(p, 0, sizeof(struct track)); @@ -414,8 +401,8 @@ static void init_tracking(struct kmem_cache *s, void *object) if (!(s->flags & SLAB_STORE_USER)) return; - set_track(s, object, TRACK_FREE, NULL); - set_track(s, object, TRACK_ALLOC, NULL); + set_track(s, object, TRACK_FREE, 0UL); + set_track(s, object, TRACK_ALLOC, 0UL); } static void print_track(const char *s, struct track *t) @@ -423,9 +410,8 @@ static void print_track(const char *s, struct track *t) if (!t->addr) return; - printk(KERN_ERR "INFO: %s in ", s); - __print_symbol("%s", (unsigned long)t->addr); - printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); + printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); } static void print_tracking(struct kmem_cache *s, void *object) @@ -439,8 +425,8 @@ static void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n", - page, page->inuse, page->freelist, page->flags); + printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + page, page->objects, page->inuse, page->freelist, page->flags); } @@ -485,7 +471,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (p > addr + 16) print_section("Bytes b4", p - 16, 16); - print_section("Object", p, min(s->objsize, 128)); + print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) print_section("Redzone", p + s->objsize, @@ -509,7 +495,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) static void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason) { - slab_bug(s, reason); + slab_bug(s, "%s", reason); print_trailer(s, page, object); } @@ -521,7 +507,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - slab_bug(s, fmt); + slab_bug(s, "%s", buf); print_page_info(page); dump_stack(); } @@ -608,7 +594,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * A. Free pointer (if we cannot overwrite object on free) * B. Tracking data for SLAB_STORE_USER * C. Padding to reach required alignment boundary or at mininum - * one word if debuggin is on to be able to detect writes + * one word if debugging is on to be able to detect writes * before the word boundary. * * Padding is done using 0x5a (POISON_INUSE) @@ -640,6 +626,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) p + off, POISON_INUSE, s->size - off); } +/* Check the pad bytes at the end of a slab page */ static int slab_pad_check(struct kmem_cache *s, struct page *page) { u8 *start; @@ -652,22 +639,22 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - end = start + (PAGE_SIZE << s->order); - length = s->objects * s->size; - remainder = end - (start + length); + length = (PAGE_SIZE << compound_order(page)); + end = start + length; + remainder = length % s->size; if (!remainder) return 1; - fault = check_bytes(start + length, POISON_INUSE, remainder); + fault = check_bytes(end - remainder, POISON_INUSE, remainder); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding", start, length); + print_section("Padding", end - remainder, remainder); - restore_bytes(s, "slab padding", POISON_INUSE, start, end); + restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; } @@ -685,9 +672,10 @@ static int check_object(struct kmem_cache *s, struct page *page, endobject, red, s->inuse - s->objsize)) return 0; } else { - if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) - check_bytes_and_report(s, page, p, "Alignment padding", endobject, - POISON_INUSE, s->inuse - s->objsize); + if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { + check_bytes_and_report(s, page, p, "Alignment padding", + endobject, POISON_INUSE, s->inuse - s->objsize); + } } if (s->flags & SLAB_POISON) { @@ -714,7 +702,7 @@ static int check_object(struct kmem_cache *s, struct page *page, if (!check_valid_pointer(s, page, get_freepointer(s, p))) { object_err(s, page, p, "Freepointer corrupt"); /* - * No choice but to zap it and thus loose the remainder + * No choice but to zap it and thus lose the remainder * of the free objects in this slab. May cause * another error because the object count is now wrong. */ @@ -726,15 +714,24 @@ static int check_object(struct kmem_cache *s, struct page *page, static int check_slab(struct kmem_cache *s, struct page *page) { + int maxobj; + VM_BUG_ON(!irqs_disabled()); if (!PageSlab(page)) { slab_err(s, page, "Not a valid slab page"); return 0; } - if (page->inuse > s->objects) { + + maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + if (page->objects > maxobj) { + slab_err(s, page, "objects %u > max %u", + s->name, page->objects, maxobj); + return 0; + } + if (page->inuse > page->objects) { slab_err(s, page, "inuse %u > max %u", - s->name, page->inuse, s->objects); + s->name, page->inuse, page->objects); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -751,8 +748,9 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) int nr = 0; void *fp = page->freelist; void *object = NULL; + unsigned long max_objects; - while (fp && nr <= s->objects) { + while (fp && nr <= page->objects) { if (fp == search) return 1; if (!check_valid_pointer(s, page, fp)) { @@ -764,7 +762,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) } else { slab_err(s, page, "Freepointer corrupt"); page->freelist = NULL; - page->inuse = s->objects; + page->inuse = page->objects; slab_fix(s, "Freelist cleared"); return 0; } @@ -775,16 +773,27 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - if (page->inuse != s->objects - nr) { + max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + if (max_objects > MAX_OBJS_PER_PAGE) + max_objects = MAX_OBJS_PER_PAGE; + + if (page->objects != max_objects) { + slab_err(s, page, "Wrong number of objects. Found %d but " + "should be %d", page->objects, max_objects); + page->objects = max_objects; + slab_fix(s, "Number of objects adjusted."); + } + if (page->inuse != page->objects - nr) { slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", page->inuse, s->objects - nr); - page->inuse = s->objects - nr; + "counted were %d", page->inuse, page->objects - nr); + page->inuse = page->objects - nr; slab_fix(s, "Object count adjusted."); } return search == NULL; } -static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) +static void trace(struct kmem_cache *s, struct page *page, void *object, + int alloc) { if (s->flags & SLAB_TRACE) { printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", @@ -824,6 +833,43 @@ static void remove_full(struct kmem_cache *s, struct page *page) spin_unlock(&n->list_lock); } +/* Tracking of the number of slabs for debugging purposes */ +static inline unsigned long slabs_node(struct kmem_cache *s, int node) +{ + struct kmem_cache_node *n = get_node(s, node); + + return atomic_long_read(&n->nr_slabs); +} + +static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->nr_slabs); +} + +static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + /* + * May be called early in order to allocate a slab for the + * kmem_cache_node structure. Solve the chicken-egg + * dilemma by deferring the increment of the count during + * bootstrap (see early_kmem_cache_node_alloc). + */ + if (!NUMA_BUILD || n) { + atomic_long_inc(&n->nr_slabs); + atomic_long_add(objects, &n->total_objects); + } +} +static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + atomic_long_dec(&n->nr_slabs); + atomic_long_sub(objects, &n->total_objects); +} + +/* Object debug checks for alloc/free paths */ static void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) { @@ -835,12 +881,12 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, } static int alloc_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto bad; - if (object && !on_freelist(s, page, object)) { + if (!on_freelist(s, page, object)) { object_err(s, page, object, "Object already allocated"); goto bad; } @@ -850,7 +896,7 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page, goto bad; } - if (object && !check_object(s, page, object, 0)) + if (!check_object(s, page, object, 0)) goto bad; /* Success perform special debug activities for allocs */ @@ -868,14 +914,14 @@ bad: * as used avoids touching the remaining objects. */ slab_fix(s, "Marking all objects used"); - page->inuse = s->objects; + page->inuse = page->objects; page->freelist = NULL; } return 0; } static int free_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto fail; @@ -894,11 +940,10 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, return 0; if (unlikely(s != page->slab)) { - if (!PageSlab(page)) + if (!PageSlab(page)) { slab_err(s, page, "Attempt to free object(0x%p) " "outside of slab", object); - else - if (!page->slab) { + } else if (!page->slab) { printk(KERN_ERR "SLUB : no slab for object 0x%p.\n", object); @@ -910,7 +955,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, } /* Special debug activities for freeing objects */ - if (!SlabFrozen(page) && !page->freelist) + if (!PageSlubFrozen(page) && !page->freelist) remove_full(s, page); if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); @@ -939,6 +984,15 @@ static int __init setup_slub_debug(char *str) */ goto check_slabs; + if (tolower(*str) == 'o') { + /* + * Avoid enabling debugging on caches if its minimum order + * would increase as a result. + */ + disable_higher_order_debug = 1; + goto out; + } + slub_debug = 0; if (*str == '-') /* @@ -983,33 +1037,14 @@ __setup("slub_debug", setup_slub_debug); static unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { /* - * The page->offset field is only 16 bit wide. This is an offset - * in units of words from the beginning of an object. If the slab - * size is bigger then we cannot move the free pointer behind the - * object anymore. - * - * On 32 bit platforms the limit is 256k. On 64bit platforms - * the limit is 512k. - * - * Debugging or ctor may create a need to move the free - * pointer. Fail if this happens. + * Enable debugging if selected on the kernel commandline. */ - if (objsize >= 65535 * sizeof(void *)) { - BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | - SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); - BUG_ON(ctor); - } else { - /* - * Enable debugging if selected on the kernel commandline. - */ - if (slub_debug && (!slub_debug_slabs || - strncmp(slub_debug_slabs, name, - strlen(slub_debug_slabs)) == 0)) - flags |= slub_debug; - } + if (slub_debug && (!slub_debug_slabs || + !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) + flags |= slub_debug; return flags; } @@ -1018,10 +1053,10 @@ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@ -1030,41 +1065,89 @@ static inline int check_object(struct kmem_cache *s, struct page *page, static inline void add_full(struct kmem_cache_node *n, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { return flags; } #define slub_debug 0 + +#define disable_higher_order_debug 0 + +static inline unsigned long slabs_node(struct kmem_cache *s, int node) + { return 0; } +static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) + { return 0; } +static inline void inc_slabs_node(struct kmem_cache *s, int node, + int objects) {} +static inline void dec_slabs_node(struct kmem_cache *s, int node, + int objects) {} #endif + /* * Slab allocation and freeing */ +static inline struct page *alloc_slab_page(gfp_t flags, int node, + struct kmem_cache_order_objects oo) +{ + int order = oo_order(oo); + + flags |= __GFP_NOTRACK; + + if (node == -1) + return alloc_pages(flags, order); + else + return alloc_pages_node(node, flags, order); +} + static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; - int pages = 1 << s->order; + struct kmem_cache_order_objects oo = s->oo; + gfp_t alloc_gfp; - if (s->order) - flags |= __GFP_COMP; + flags |= s->allocflags; - if (s->flags & SLAB_CACHE_DMA) - flags |= SLUB_DMA; + /* + * Let the initial higher-order allocation fail under memory pressure + * so we fall-back to the minimum order allocation. + */ + alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; - if (s->flags & SLAB_RECLAIM_ACCOUNT) - flags |= __GFP_RECLAIMABLE; + page = alloc_slab_page(alloc_gfp, node, oo); + if (unlikely(!page)) { + oo = s->min; + /* + * Allocation may have failed due to fragmentation. + * Try a lower order alloc if possible + */ + page = alloc_slab_page(flags, node, oo); + if (!page) + return NULL; - if (node == -1) - page = alloc_pages(flags, s->order); - else - page = alloc_pages_node(node, flags, s->order); + stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); + } - if (!page) - return NULL; + if (kmemcheck_enabled + && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { + int pages = 1 << oo_order(oo); + + kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + + /* + * Objects from caches that have a constructor don't get + * cleared when they're allocated, so we need to do it here. + */ + if (s->ctor) + kmemcheck_mark_uninitialized_pages(page, pages); + else + kmemcheck_mark_unallocated_pages(page, pages); + } + page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - pages); + 1 << oo_order(oo)); return page; } @@ -1074,13 +1157,12 @@ static void setup_object(struct kmem_cache *s, struct page *page, { setup_object_debug(s, page, object); if (unlikely(s->ctor)) - s->ctor(s, object); + s->ctor(object); } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; - struct kmem_cache_node *n; void *start; void *last; void *p; @@ -1092,22 +1174,20 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; - n = get_node(s, page_to_nid(page)); - if (n) - atomic_long_inc(&n->nr_slabs); + inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; page->flags |= 1 << PG_slab; if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | SLAB_TRACE)) - SetSlabDebug(page); + __SetPageSlubDebug(page); start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << s->order); + memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); last = start; - for_each_object(p, s, start) { + for_each_object(p, s, start, page->objects) { setup_object(s, page, last); set_freepointer(s, last, p); last = p; @@ -1123,23 +1203,31 @@ out: static void __free_slab(struct kmem_cache *s, struct page *page) { - int pages = 1 << s->order; + int order = compound_order(page); + int pages = 1 << order; - if (unlikely(SlabDebug(page))) { + if (unlikely(SLABDEBUG && PageSlubDebug(page))) { void *p; slab_pad_check(s, page); - for_each_object(p, s, page_address(page)) + for_each_object(p, s, page_address(page), + page->objects) check_object(s, page, p, 0); - ClearSlabDebug(page); + __ClearPageSlubDebug(page); } + kmemcheck_free_shadow(page, compound_order(page)); + mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); - __free_pages(page, s->order); + __ClearPageSlab(page); + reset_page_mapcount(page); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += pages; + __free_pages(page, order); } static void rcu_free_slab(struct rcu_head *h) @@ -1165,11 +1253,7 @@ static void free_slab(struct kmem_cache *s, struct page *page) static void discard_slab(struct kmem_cache *s, struct page *page) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - atomic_long_dec(&n->nr_slabs); - reset_page_mapcount(page); - __ClearPageSlab(page); + dec_slabs_node(s, page_to_nid(page), page->objects); free_slab(s, page); } @@ -1183,7 +1267,7 @@ static __always_inline void slab_lock(struct page *page) static __always_inline void slab_unlock(struct page *page) { - bit_spin_unlock(PG_locked, &page->flags); + __bit_spin_unlock(PG_locked, &page->flags); } static __always_inline int slab_trylock(struct page *page) @@ -1209,8 +1293,7 @@ static void add_partial(struct kmem_cache_node *n, spin_unlock(&n->list_lock); } -static void remove_partial(struct kmem_cache *s, - struct page *page) +static void remove_partial(struct kmem_cache *s, struct page *page) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -1225,12 +1308,13 @@ static void remove_partial(struct kmem_cache *s, * * Must hold list_lock. */ -static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) +static inline int lock_and_freeze_slab(struct kmem_cache_node *n, + struct page *page) { if (slab_trylock(page)) { list_del(&page->lru); n->nr_partial--; - SetSlabFrozen(page); + __SetPageSlubFrozen(page); return 1; } return 0; @@ -1269,7 +1353,9 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) { #ifdef CONFIG_NUMA struct zonelist *zonelist; - struct zone **z; + struct zoneref *z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(flags); struct page *page; /* @@ -1283,7 +1369,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) * may return off node objects because partial slabs are obtained * from other nodes and filled up. * - * If /sys/slab/xx/defrag_ratio is set to 100 (which makes + * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes * defrag_ratio = 1000) then every (well almost) allocation will * first attempt to defrag slab caches on other nodes. This means * scanning over all nodes to look for partial slabs which may be @@ -1294,15 +1380,14 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; - zonelist = &NODE_DATA(slab_node(current->mempolicy)) - ->node_zonelists[gfp_zone(flags)]; - for (z = zonelist->zones; *z; z++) { + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; - n = get_node(s, zone_to_nid(*z)); + n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed_hardwall(*z, flags) && - n->nr_partial > MIN_PARTIAL) { + if (n && cpuset_zone_allowed_hardwall(zone, flags) && + n->nr_partial > s->min_partial) { page = get_partial_node(n); if (page) return page; @@ -1337,30 +1422,39 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); - ClearSlabFrozen(page); + __ClearPageSlubFrozen(page); if (page->inuse) { - if (page->freelist) + if (page->freelist) { add_partial(n, page, tail); - else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) - add_full(n, page); + stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + } else { + stat(c, DEACTIVATE_FULL); + if (SLABDEBUG && PageSlubDebug(page) && + (s->flags & SLAB_STORE_USER)) + add_full(n, page); + } slab_unlock(page); - } else { - if (n->nr_partial < MIN_PARTIAL) { + stat(c, DEACTIVATE_EMPTY); + if (n->nr_partial < s->min_partial) { /* * Adding an empty slab to the partial slabs in order * to avoid page allocator overhead. This slab needs * to come after the other slabs with objects in - * order to fill them up. That way the size of the - * partial list stays small. kmem_cache_shrink can - * reclaim empty slabs from the partial list. + * so that the others get filled first. That way the + * size of the partial list stays small. + * + * kmem_cache_shrink can reclaim any empty slabs from + * the partial list. */ add_partial(n, page, 1); slab_unlock(page); } else { slab_unlock(page); + stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); discard_slab(s, page); } } @@ -1373,8 +1467,11 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { struct page *page = c->page; int tail = 1; + + if (page->freelist) + stat(c, DEACTIVATE_REMOTE_FREES); /* - * Merge cpu freelist into freelist. Typically we get here + * Merge cpu freelist into slab freelist. Typically we get here * because both freelists are empty. So this is unlikely * to occur. */ @@ -1398,12 +1495,14 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { + stat(c, CPUSLAB_FLUSH); slab_lock(c->page); deactivate_slab(s, c); } /* * Flush cpu slab. + * * Called from IPI handler with interrupts disabled. */ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) @@ -1423,15 +1522,7 @@ static void flush_cpu_slab(void *d) static void flush_all(struct kmem_cache *s) { -#ifdef CONFIG_SMP - on_each_cpu(flush_cpu_slab, s, 1, 1); -#else - unsigned long flags; - - local_irq_save(flags); - flush_cpu_slab(s); - local_irq_restore(flags); -#endif + on_each_cpu(flush_cpu_slab, s, 1); } /* @@ -1447,6 +1538,69 @@ static inline int node_match(struct kmem_cache_cpu *c, int node) return 1; } +static int count_free(struct page *page) +{ + return page->objects - page->inuse; +} + +static unsigned long count_partial(struct kmem_cache_node *n, + int (*get_count)(struct page *)) +{ + unsigned long flags; + unsigned long x = 0; + struct page *page; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + x += get_count(page); + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ +#ifdef CONFIG_SLUB_DEBUG + return atomic_long_read(&n->total_objects); +#else + return 0; +#endif +} + +static noinline void +slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) +{ + int node; + + printk(KERN_WARNING + "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", + nid, gfpflags); + printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " + "default order: %d, min order: %d\n", s->name, s->objsize, + s->size, oo_order(s->oo), oo_order(s->min)); + + if (oo_order(s->min) > get_order(s->objsize)) + printk(KERN_WARNING " %s debugging increased min order, use " + "slub_debug=O to disable.\n", s->name); + + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + unsigned long nr_slabs; + unsigned long nr_objs; + unsigned long nr_free; + + if (!n) + continue; + + nr_free = count_partial(n, count_free); + nr_slabs = node_nr_slabs(n); + nr_objs = node_nr_objs(n); + + printk(KERN_WARNING + " node %d: slabs: %ld, objs: %ld, free: %ld\n", + node, nr_slabs, nr_objs, nr_free); + } +} + /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. @@ -1462,33 +1616,41 @@ static inline int node_match(struct kmem_cache_cpu *c, int node) * rest of the freelist to the lockless freelist. * * And if we were unable to get a new slab from the partial slab lists then - * we need to allocate a new slab. This is slowest path since we may sleep. + * we need to allocate a new slab. This is the slowest path since it involves + * a call to the page allocator and the setup of a new slab. */ -static void *__slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c) { void **object; struct page *new; + /* We handle __GFP_ZERO in the caller */ + gfpflags &= ~__GFP_ZERO; + if (!c->page) goto new_slab; slab_lock(c->page); if (unlikely(!node_match(c, node))) goto another_slab; + + stat(c, ALLOC_REFILL); + load_freelist: object = c->page->freelist; if (unlikely(!object)) goto another_slab; - if (unlikely(SlabDebug(c->page))) + if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) goto debug; - object = c->page->freelist; c->freelist = object[c->offset]; - c->page->inuse = s->objects; + c->page->inuse = c->page->objects; c->page->freelist = NULL; c->node = page_to_nid(c->page); +unlock_out: slab_unlock(c->page); + stat(c, ALLOC_SLOWPATH); return object; another_slab: @@ -1498,6 +1660,7 @@ new_slab: new = get_partial(s, gfpflags, node); if (new) { c->page = new; + stat(c, ALLOC_FROM_PARTIAL); goto load_freelist; } @@ -1511,24 +1674,25 @@ new_slab: if (new) { c = get_cpu_slab(s, smp_processor_id()); + stat(c, ALLOC_SLAB); if (c->page) flush_slab(s, c); slab_lock(new); - SetSlabFrozen(new); + __SetPageSlubFrozen(new); c->page = new; goto load_freelist; } + if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) + slab_out_of_memory(s, gfpflags, node); return NULL; debug: - object = c->page->freelist; if (!alloc_debug_processing(s, c->page, object, addr)) goto another_slab; c->page->inuse++; c->page->freelist = object[c->offset]; c->node = -1; - slab_unlock(c->page); - return object; + goto unlock_out; } /* @@ -1542,14 +1706,24 @@ debug: * Otherwise we can simply pick the next object from the lockless free list. */ static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr) + gfp_t gfpflags, int node, unsigned long addr) { void **object; - unsigned long flags; struct kmem_cache_cpu *c; + unsigned long flags; + unsigned int objsize; + + gfpflags &= gfp_allowed_mask; + + lockdep_trace_alloc(gfpflags); + might_sleep_if(gfpflags & __GFP_WAIT); + + if (should_failslab(s->objsize, gfpflags)) + return NULL; local_irq_save(flags); c = get_cpu_slab(s, smp_processor_id()); + objsize = c->objsize; if (unlikely(!c->freelist || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); @@ -1557,29 +1731,60 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, else { object = c->freelist; c->freelist = object[c->offset]; + stat(c, ALLOC_FASTPATH); } local_irq_restore(flags); - if (unlikely((gfpflags & __GFP_ZERO) && object)) - memset(object, 0, c->objsize); + if (unlikely(gfpflags & __GFP_ZERO) && object) + memset(object, 0, objsize); + + kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); + kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); return object; } void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); + void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); + + trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_TRACING +void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return slab_alloc(s, gfpflags, -1, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + trace_kmem_cache_alloc_node(_RET_IP_, ret, + s->objsize, s->size, gfpflags, node); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); #endif +#ifdef CONFIG_TRACING +void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return slab_alloc(s, gfpflags, node, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + /* * Slow patch handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@ -1589,46 +1794,55 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, void *addr, unsigned int offset) + void *x, unsigned long addr, unsigned int offset) { void *prior; void **object = (void *)x; + struct kmem_cache_cpu *c; + c = get_cpu_slab(s, raw_smp_processor_id()); + stat(c, FREE_SLOWPATH); slab_lock(page); - if (unlikely(SlabDebug(page))) + if (unlikely(SLABDEBUG && PageSlubDebug(page))) goto debug; + checks_ok: prior = object[offset] = page->freelist; page->freelist = object; page->inuse--; - if (unlikely(SlabFrozen(page))) + if (unlikely(PageSlubFrozen(page))) { + stat(c, FREE_FROZEN); goto out_unlock; + } if (unlikely(!page->inuse)) goto slab_empty; /* - * Objects left in the slab. If it - * was not on the partial list before + * Objects left in the slab. If it was not on the partial list before * then add it. */ - if (unlikely(!prior)) + if (unlikely(!prior)) { add_partial(get_node(s, page_to_nid(page)), page, 1); + stat(c, FREE_ADD_PARTIAL); + } out_unlock: slab_unlock(page); return; slab_empty: - if (prior) + if (prior) { /* * Slab still on the partial list. */ remove_partial(s, page); - + stat(c, FREE_REMOVE_PARTIAL); + } slab_unlock(page); + stat(c, FREE_SLAB); discard_slab(s, page); return; @@ -1650,18 +1864,23 @@ debug: * with all sorts of special processing. */ static __always_inline void slab_free(struct kmem_cache *s, - struct page *page, void *x, void *addr) + struct page *page, void *x, unsigned long addr) { void **object = (void *)x; - unsigned long flags; struct kmem_cache_cpu *c; + unsigned long flags; + kmemleak_free_recursive(x, s->flags); local_irq_save(flags); - debug_check_no_locks_freed(object, s->objsize); c = get_cpu_slab(s, smp_processor_id()); + kmemcheck_slab_free(s, object, c->objsize); + debug_check_no_locks_freed(object, c->objsize); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(object, c->objsize); if (likely(page == c->page && c->node >= 0)) { object[c->offset] = c->freelist; c->freelist = object; + stat(c, FREE_FASTPATH); } else __slab_free(s, page, x, addr, c->offset); @@ -1674,11 +1893,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); - slab_free(s, page, x, __builtin_return_address(0)); + slab_free(s, page, x, _RET_IP_); + + trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); -/* Figure out on which slab object the object resides */ +/* Figure out on which slab page the object resides */ static struct page *get_object_page(const void *x) { struct page *page = virt_to_head_page(x); @@ -1709,8 +1930,8 @@ static struct page *get_object_page(const void *x) * take the list_lock. */ static int slub_min_order; -static int slub_max_order = DEFAULT_MAX_ORDER; -static int slub_min_objects = DEFAULT_MIN_OBJECTS; +static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static int slub_min_objects; /* * Merge control. If this is set then no merging of slab caches will occur. @@ -1725,7 +1946,7 @@ static int slub_nomerge; * system components. Generally order 0 allocations should be preferred since * order 0 does not cause fragmentation in the page allocator. Larger objects * be problematic to put into order 0 slabs because there may be too much - * unused space left. We go to a higher order if more than 1/8th of the slab + * unused space left. We go to a higher order if more than 1/16th of the slab * would be wasted. * * In order to reach satisfactory performance we must ensure that a minimum @@ -1750,6 +1971,9 @@ static inline int slab_order(int size, int min_objects, int rem; int min_order = slub_min_order; + if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + return get_order(size * MAX_OBJS_PER_PAGE) - 1; + for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); order <= max_order; order++) { @@ -1774,6 +1998,7 @@ static inline int calculate_order(int size) int order; int min_objects; int fraction; + int max_objects; /* * Attempt to find best configuration for a slab. This @@ -1784,8 +2009,13 @@ static inline int calculate_order(int size) * we reduce the minimum objects required in a slab. */ min_objects = slub_min_objects; + if (!min_objects) + min_objects = 4 * (fls(nr_cpu_ids) + 1); + max_objects = (PAGE_SIZE << slub_max_order)/size; + min_objects = min(min_objects, max_objects); + while (min_objects > 1) { - fraction = 8; + fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, slub_max_order, fraction); @@ -1793,7 +2023,7 @@ static inline int calculate_order(int size) return order; fraction /= 2; } - min_objects /= 2; + min_objects--; } /* @@ -1808,7 +2038,7 @@ static inline int calculate_order(int size) * Doh this slab cannot be placed using slub_max_order. */ order = slab_order(size, 1, MAX_ORDER, 1); - if (order <= MAX_ORDER) + if (order < MAX_ORDER) return order; return -ENOSYS; } @@ -1820,20 +2050,21 @@ static unsigned long calculate_alignment(unsigned long flags, unsigned long align, unsigned long size) { /* - * If the user wants hardware cache aligned objects then - * follow that suggestion if the object is sufficiently - * large. + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. * - * The hardware cache alignment cannot override the - * specified alignment though. If that is greater - * then use it. + * The hardware cache alignment cannot override the specified + * alignment though. If that is greater then use it. */ - if ((flags & SLAB_HWCACHE_ALIGN) && - size > cache_line_size() / 2) - return max_t(unsigned long, align, cache_line_size()); + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned long ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } if (align < ARCH_SLAB_MINALIGN) - return ARCH_SLAB_MINALIGN; + align = ARCH_SLAB_MINALIGN; return ALIGN(align, sizeof(void *)); } @@ -1846,15 +2077,20 @@ static void init_kmem_cache_cpu(struct kmem_cache *s, c->node = 0; c->offset = s->offset / sizeof(void *); c->objsize = s->objsize; +#ifdef CONFIG_SLUB_STATS + memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); +#endif } -static void init_kmem_cache_node(struct kmem_cache_node *n) +static void +init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { n->nr_partial = 0; - atomic_long_set(&n->nr_slabs, 0); spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG + atomic_long_set(&n->nr_slabs, 0); + atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif } @@ -1877,11 +2113,11 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) */ #define NR_KMEM_CACHE_CPU 100 -static DEFINE_PER_CPU(struct kmem_cache_cpu, - kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; +static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], + kmem_cache_cpu); static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); -static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; +static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, int cpu, gfp_t flags) @@ -1907,7 +2143,7 @@ static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) { if (c < per_cpu(kmem_cache_cpu, cpu) || - c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { + c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { kfree(c); return; } @@ -1956,13 +2192,13 @@ static void init_alloc_cpu_cpu(int cpu) { int i; - if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) + if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) return; for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); - cpu_set(cpu, kmem_cach_cpu_free_init_once); + cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)); } static void __init init_alloc_cpu(void) @@ -1994,11 +2230,11 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) * when allocating for the kmalloc_node_cache. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. */ -static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, - int node) +static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) { struct page *page; struct kmem_cache_node *n; + unsigned long flags; BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); @@ -2021,10 +2257,17 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, init_object(kmalloc_caches, n, 1); init_tracking(kmalloc_caches, n); #endif - init_kmem_cache_node(n); - atomic_long_inc(&n->nr_slabs); + init_kmem_cache_node(n, kmalloc_caches); + inc_slabs_node(kmalloc_caches, node, page->objects); + + /* + * lockdep requires consistent irq usage for each lock + * so even though there cannot be a race this early in + * the boot sequence, we still disable irqs. + */ + local_irq_save(flags); add_partial(n, page, 0); - return n; + local_irq_restore(flags); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2056,8 +2299,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) n = &s->local_node; else { if (slab_state == DOWN) { - n = early_kmem_cache_node_alloc(gfpflags, - node); + early_kmem_cache_node_alloc(gfpflags, node); continue; } n = kmem_cache_alloc_node(kmalloc_caches, @@ -2070,7 +2312,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } s->node[node] = n; - init_kmem_cache_node(n); + init_kmem_cache_node(n, s); } return 1; } @@ -2081,22 +2323,40 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) { - init_kmem_cache_node(&s->local_node); + init_kmem_cache_node(&s->local_node, s); return 1; } #endif +static void set_min_partial(struct kmem_cache *s, unsigned long min) +{ + if (min < MIN_PARTIAL) + min = MIN_PARTIAL; + else if (min > MAX_PARTIAL) + min = MAX_PARTIAL; + s->min_partial = min; +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. */ -static int calculate_sizes(struct kmem_cache *s) +static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; unsigned long size = s->objsize; unsigned long align = s->align; + int order; /* + * Round up object size to the next word boundary. We can only + * place the free pointer at word boundaries and this determines + * the possible location of the free pointer. + */ + size = ALIGN(size, sizeof(void *)); + +#ifdef CONFIG_SLUB_DEBUG + /* * Determine if we can poison the object itself. If the user of * the slab may touch the object after free or before allocation * then we should never poison the object itself. @@ -2107,14 +2367,7 @@ static int calculate_sizes(struct kmem_cache *s) else s->flags &= ~__OBJECT_POISON; - /* - * Round up object size to the next word boundary. We can only - * place the free pointer at word boundaries and this determines - * the possible location of the free pointer. - */ - size = ALIGN(size, sizeof(void *)); -#ifdef CONFIG_SLUB_DEBUG /* * If we are Redzoning then check if there is some space between the * end of the object and the free pointer. If not then add an @@ -2157,7 +2410,7 @@ static int calculate_sizes(struct kmem_cache *s) * Add some empty padding so that we can catch * overwrites from earlier objects rather than let * tracking information or the free pointer be - * corrupted if an user writes before the start + * corrupted if a user writes before the start * of the object. */ size += sizeof(void *); @@ -2169,6 +2422,7 @@ static int calculate_sizes(struct kmem_cache *s) * on bootup. */ align = calculate_alignment(flags, align, s->objsize); + s->align = align; /* * SLUB stores one object immediately after another beginning from @@ -2177,24 +2431,40 @@ static int calculate_sizes(struct kmem_cache *s) */ size = ALIGN(size, align); s->size = size; + if (forced_order >= 0) + order = forced_order; + else + order = calculate_order(size); - s->order = calculate_order(size); - if (s->order < 0) + if (order < 0) return 0; + s->allocflags = 0; + if (order) + s->allocflags |= __GFP_COMP; + + if (s->flags & SLAB_CACHE_DMA) + s->allocflags |= SLUB_DMA; + + if (s->flags & SLAB_RECLAIM_ACCOUNT) + s->allocflags |= __GFP_RECLAIMABLE; + /* * Determine the number of objects per slab */ - s->objects = (PAGE_SIZE << s->order) / size; + s->oo = oo_make(order, size); + s->min = oo_make(get_order(size), size); + if (oo_objects(s->oo) > oo_objects(s->max)) + s->max = s->oo; - return !!s->objects; + return !!oo_objects(s->oo); } static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { memset(s, 0, kmem_size); s->name = name; @@ -2203,12 +2473,29 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, s->align = align; s->flags = kmem_cache_flags(size, flags, name, ctor); - if (!calculate_sizes(s)) + if (!calculate_sizes(s, -1)) goto error; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. + */ + if (get_order(s->size) > get_order(s->objsize)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(s, -1)) + goto error; + } + } + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. + */ + set_min_partial(s, ilog2(s->size)); s->refcount = 1; #ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 100; + s->remote_node_defrag_ratio = 1000; #endif if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) goto error; @@ -2220,7 +2507,7 @@ error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)size, s->size, s->order, + s->name, (unsigned long)size, s->size, oo_order(s->oo), s->offset, flags); return 0; } @@ -2244,7 +2531,7 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object) /* * We could also check if the object is on the slabs freelist. * But this would be too expensive and it seems that the main - * purpose of kmem_ptr_valid is to check if the object belongs + * purpose of kmem_ptr_valid() is to check if the object belongs * to a certain slab. */ return 1; @@ -2266,26 +2553,52 @@ const char *kmem_cache_name(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_name); +static void list_slab_objects(struct kmem_cache *s, struct page *page, + const char *text) +{ +#ifdef CONFIG_SLUB_DEBUG + void *addr = page_address(page); + void *p; + DECLARE_BITMAP(map, page->objects); + + bitmap_zero(map, page->objects); + slab_err(s, page, "%s", text); + slab_lock(page); + for_each_free_object(p, s, page->freelist) + set_bit(slab_index(p, s, addr), map); + + for_each_object(p, s, addr, page->objects) { + + if (!test_bit(slab_index(p, s, addr), map)) { + printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", + p, p - addr); + print_tracking(s, p); + } + } + slab_unlock(page); +#endif +} + /* - * Attempt to free all slabs on a node. Return the number of slabs we - * were unable to free. + * Attempt to free all partial slabs on a node. */ -static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, - struct list_head *list) +static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { - int slabs_inuse = 0; unsigned long flags; struct page *page, *h; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry_safe(page, h, list, lru) + list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { list_del(&page->lru); discard_slab(s, page); - } else - slabs_inuse++; + n->nr_partial--; + } else { + list_slab_objects(s, page, + "Objects remaining on kmem_cache_close()"); + } + } spin_unlock_irqrestore(&n->list_lock, flags); - return slabs_inuse; } /* @@ -2302,8 +2615,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); - n->nr_partial -= free_list(s, n, &n->partial); - if (atomic_long_read(&n->nr_slabs)) + free_partial(s, n); + if (n->nr_partial || slabs_node(s, node)) return 1; } free_kmem_cache_nodes(s); @@ -2321,8 +2634,13 @@ void kmem_cache_destroy(struct kmem_cache *s) if (!s->refcount) { list_del(&s->list); up_write(&slub_lock); - if (kmem_cache_close(s)) - WARN_ON(1); + if (kmem_cache_close(s)) { + printk(KERN_ERR "SLUB %s: %s called for cache that " + "still has objects.\n", s->name, __func__); + dump_stack(); + } + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); sysfs_slab_remove(s); } else up_write(&slub_lock); @@ -2333,13 +2651,9 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned; +struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); -#ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; -#endif - static int __init setup_slub_min_order(char *str) { get_option(&str, &slub_min_order); @@ -2352,6 +2666,7 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { get_option(&str, &slub_max_order); + slub_max_order = min(slub_max_order, MAX_ORDER - 1); return 1; } @@ -2383,13 +2698,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, if (gfp_flags & SLUB_DMA) flags = SLAB_CACHE_DMA; - down_write(&slub_lock); + /* + * This function is called with IRQs disabled during early-boot on + * single CPU so there's no need to take slub_lock here. + */ if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, - flags, NULL)) + flags, NULL)) goto panic; list_add(&s->list, &slab_caches); - up_write(&slub_lock); + if (sysfs_slab_add(s)) goto panic; return s; @@ -2399,6 +2717,7 @@ panic: } #ifdef CONFIG_ZONE_DMA +static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; static void sysfs_add_func(struct work_struct *w) { @@ -2421,6 +2740,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) struct kmem_cache *s; char *text; size_t realsize; + unsigned long slabflags; s = kmalloc_caches_dma[index]; if (s) @@ -2438,12 +2758,22 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) goto unlock_out; realsize = kmalloc_caches[index].objsize; - text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize), + text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", + (unsigned int)realsize); s = kmalloc(kmem_size, flags & ~SLUB_DMA); + /* + * Must defer sysfs creation to a workqueue because we don't know + * what context we are called from. Before sysfs comes up, we don't + * need to do anything because our sysfs initcall will start by + * adding all existing slabs to sysfs. + */ + slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK; + if (slab_state >= SYSFS) + slabflags |= __SYSFS_ADD_DEFERRED; + if (!s || !text || !kmem_cache_open(s, flags, text, - realsize, ARCH_KMALLOC_MINALIGN, - SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { + realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { kfree(s); kfree(text); goto unlock_out; @@ -2452,7 +2782,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) list_add(&s->list, &slab_caches); kmalloc_caches_dma[index] = s; - schedule_work(&sysfs_add_work); + if (slab_state >= SYSFS) + schedule_work(&sysfs_add_work); unlock_out: up_write(&slub_lock); @@ -2494,6 +2825,11 @@ static s8 size_index[24] = { 2 /* 192 */ }; +static inline int size_index_elem(size_t bytes) +{ + return (bytes - 1) / 8; +} + static struct kmem_cache *get_slab(size_t size, gfp_t flags) { int index; @@ -2502,7 +2838,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) if (!size) return ZERO_SIZE_PTR; - index = size_index[(size - 1) / 8]; + index = size_index[size_index_elem(size)]; } else index = fls(size - 1); @@ -2517,35 +2853,64 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE / 2)) - return (void *)__get_free_pages(flags | __GFP_COMP, - get_order(size)); + if (unlikely(size > SLUB_MAX_SIZE)) + return kmalloc_large(size, flags); s = get_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, -1, __builtin_return_address(0)); + ret = slab_alloc(s, flags, -1, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, size, s->size, flags); + + return ret; } EXPORT_SYMBOL(__kmalloc); +static void *kmalloc_large_node(size_t size, gfp_t flags, int node) +{ + struct page *page; + void *ptr = NULL; + + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_pages_node(node, flags, get_order(size)); + if (page) + ptr = page_address(page); + + kmemleak_alloc(ptr, size, 1, flags); + return ptr; +} + #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; + void *ret; + + if (unlikely(size > SLUB_MAX_SIZE)) { + ret = kmalloc_large_node(size, flags, node); - if (unlikely(size > PAGE_SIZE / 2)) - return (void *)__get_free_pages(flags | __GFP_COMP, - get_order(size)); + trace_kmalloc_node(_RET_IP_, ret, + size, PAGE_SIZE << get_order(size), + flags, node); + + return ret; + } s = get_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, node, __builtin_return_address(0)); + ret = slab_alloc(s, flags, node, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); + + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif @@ -2555,19 +2920,18 @@ size_t ksize(const void *object) struct page *page; struct kmem_cache *s; - BUG_ON(!object); if (unlikely(object == ZERO_SIZE_PTR)) return 0; page = virt_to_head_page(object); - BUG_ON(!page); - if (unlikely(!PageSlab(page))) + if (unlikely(!PageSlab(page))) { + WARN_ON(!PageCompound(page)); return PAGE_SIZE << compound_order(page); - + } s = page->slab; - BUG_ON(!s); +#ifdef CONFIG_SLUB_DEBUG /* * Debugging requires use of the padding between object * and whatever may come after it. @@ -2575,6 +2939,7 @@ size_t ksize(const void *object) if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) return s->objsize; +#endif /* * If we have the need to store the freelist pointer * back there or track user information then we can @@ -2582,7 +2947,6 @@ size_t ksize(const void *object) */ if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) return s->inuse; - /* * Else we can use all the padding etc for the allocation */ @@ -2593,32 +2957,24 @@ EXPORT_SYMBOL(ksize); void kfree(const void *x) { struct page *page; + void *object = (void *)x; + + trace_kfree(_RET_IP_, x); if (unlikely(ZERO_OR_NULL_PTR(x))) return; page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { + BUG_ON(!PageCompound(page)); + kmemleak_free(x); put_page(page); return; } - slab_free(page->slab, page, (void *)x, __builtin_return_address(0)); + slab_free(page->slab, page, object, _RET_IP_); } EXPORT_SYMBOL(kfree); -static unsigned long count_partial(struct kmem_cache_node *n) -{ - unsigned long flags; - unsigned long x = 0; - struct page *page; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) - x += page->inuse; - spin_unlock_irqrestore(&n->list_lock, flags); - return x; -} - /* * kmem_cache_shrink removes empty slabs from the partial lists and sorts * the remaining slabs by the number of items in use. The slabs with the @@ -2636,8 +2992,9 @@ int kmem_cache_shrink(struct kmem_cache *s) struct kmem_cache_node *n; struct page *page; struct page *t; + int objects = oo_objects(s->max); struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); + kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); unsigned long flags; if (!slabs_by_inuse) @@ -2650,7 +3007,7 @@ int kmem_cache_shrink(struct kmem_cache *s) if (!n->nr_partial) continue; - for (i = 0; i < s->objects; i++) + for (i = 0; i < objects; i++) INIT_LIST_HEAD(slabs_by_inuse + i); spin_lock_irqsave(&n->list_lock, flags); @@ -2682,7 +3039,7 @@ int kmem_cache_shrink(struct kmem_cache *s) * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. */ - for (i = s->objects - 1; i >= 0; i--) + for (i = objects - 1; i >= 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); spin_unlock_irqrestore(&n->list_lock, flags); @@ -2732,7 +3089,7 @@ static void slab_mem_offline_callback(void *arg) * and offline_pages() function shoudn't call this * callback. So, we must fail. */ - BUG_ON(atomic_long_read(&n->nr_slabs)); + BUG_ON(slabs_node(s, offline_node)); s->node[offline_node] = NULL; kmem_cache_free(kmalloc_caches, n); @@ -2757,7 +3114,7 @@ static int slab_mem_going_online_callback(void *arg) return 0; /* - * We are bringing a node online. No memory is availabe yet. We must + * We are bringing a node online. No memory is available yet. We must * allocate a kmem_cache_node structure in order to bring the node * online. */ @@ -2773,7 +3130,7 @@ static int slab_mem_going_online_callback(void *arg) ret = -ENOMEM; goto out; } - init_kmem_cache_node(n); + init_kmem_cache_node(n, s); s->node[nid] = n; } out: @@ -2801,8 +3158,10 @@ static int slab_memory_callback(struct notifier_block *self, case MEM_CANCEL_OFFLINE: break; } - - ret = notifier_from_errno(ret); + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; return ret; } @@ -2826,31 +3185,31 @@ void __init kmem_cache_init(void) * kmem_cache_open for slab_state == DOWN. */ create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", - sizeof(struct kmem_cache_node), GFP_KERNEL); + sizeof(struct kmem_cache_node), GFP_NOWAIT); kmalloc_caches[0].refcount = -1; caches++; - hotplug_memory_notifier(slab_memory_callback, 1); + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); #endif /* Able to allocate the per node structures */ slab_state = PARTIAL; /* Caches that are not of the two-to-the-power-of size */ - if (KMALLOC_MIN_SIZE <= 64) { + if (KMALLOC_MIN_SIZE <= 32) { create_kmalloc_cache(&kmalloc_caches[1], - "kmalloc-96", 96, GFP_KERNEL); + "kmalloc-96", 96, GFP_NOWAIT); caches++; } - if (KMALLOC_MIN_SIZE <= 128) { + if (KMALLOC_MIN_SIZE <= 64) { create_kmalloc_cache(&kmalloc_caches[2], - "kmalloc-192", 192, GFP_KERNEL); + "kmalloc-192", 192, GFP_NOWAIT); caches++; } - for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], - "kmalloc", 1 << i, GFP_KERNEL); + "kmalloc", 1 << i, GFP_NOWAIT); caches++; } @@ -2858,7 +3217,7 @@ void __init kmem_cache_init(void) /* * Patch up the size_index table if we have strange large alignment * requirements for the kmalloc array. This is only the case for - * mips it seems. The standard arches will not generate any code here. + * MIPS it seems. The standard arches will not generate any code here. * * Largest permitted alignment is 256 bytes due to the way we * handle the index determination for the smaller caches. @@ -2869,15 +3228,36 @@ void __init kmem_cache_init(void) BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); - for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) - size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; + for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { + int elem = size_index_elem(i); + if (elem >= ARRAY_SIZE(size_index)) + break; + size_index[elem] = KMALLOC_SHIFT_LOW; + } + + if (KMALLOC_MIN_SIZE == 64) { + /* + * The 96 byte size cache is not used if the alignment + * is 64 byte. + */ + for (i = 64 + 8; i <= 96; i += 8) + size_index[size_index_elem(i)] = 7; + } else if (KMALLOC_MIN_SIZE == 128) { + /* + * The 192 byte sized cache is not used if the alignment + * is 128 byte. Redirect kmalloc to use the 256 byte cache + * instead. + */ + for (i = 128 + 8; i <= 192; i += 8) + size_index[size_index_elem(i)] = 8; + } slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) kmalloc_caches[i]. name = - kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); + kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); @@ -2887,14 +3267,18 @@ void __init kmem_cache_init(void) kmem_size = sizeof(struct kmem_cache); #endif - - printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," + printk(KERN_INFO + "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," " CPUs=%d, Nodes=%d\n", caches, cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); } +void __init kmem_cache_init_late(void) +{ +} + /* * Find a mergeable slab cache */ @@ -2917,7 +3301,7 @@ static int slab_unmergeable(struct kmem_cache *s) static struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { struct kmem_cache *s; @@ -2957,11 +3341,13 @@ static struct kmem_cache *find_mergeable(size_t size, } struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; + if (WARN_ON(!name)) + return NULL; + down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { @@ -2980,20 +3366,32 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, */ for_each_online_cpu(cpu) get_cpu_slab(s, cpu)->objsize = s->objsize; + s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); up_write(&slub_lock); - if (sysfs_slab_alias(s, name)) + + if (sysfs_slab_alias(s, name)) { + down_write(&slub_lock); + s->refcount--; + up_write(&slub_lock); goto err; + } return s; } + s = kmalloc(kmem_size, GFP_KERNEL); if (s) { if (kmem_cache_open(s, GFP_KERNEL, name, size, align, flags, ctor)) { list_add(&s->list, &slab_caches); up_write(&slub_lock); - if (sysfs_slab_add(s)) + if (sysfs_slab_add(s)) { + down_write(&slub_lock); + list_del(&s->list); + up_write(&slub_lock); + kfree(s); goto err; + } return s; } kfree(s); @@ -3055,43 +3453,65 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, } static struct notifier_block __cpuinitdata slab_notifier = { - &slab_cpuup_callback, NULL, 0 + .notifier_call = slab_cpuup_callback }; #endif -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; + void *ret; + + if (unlikely(size > SLUB_MAX_SIZE)) + return kmalloc_large(size, gfpflags); - if (unlikely(size > PAGE_SIZE / 2)) - return (void *)__get_free_pages(gfpflags | __GFP_COMP, - get_order(size)); s = get_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, -1, caller); + ret = slab_alloc(s, gfpflags, -1, caller); + + /* Honor the call site pointer we recieved. */ + trace_kmalloc(caller, ret, size, s->size, gfpflags); + + return ret; } void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, void *caller) + int node, unsigned long caller) { struct kmem_cache *s; + void *ret; + + if (unlikely(size > SLUB_MAX_SIZE)) + return kmalloc_large_node(size, gfpflags, node); - if (unlikely(size > PAGE_SIZE / 2)) - return (void *)__get_free_pages(gfpflags | __GFP_COMP, - get_order(size)); s = get_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, node, caller); + ret = slab_alloc(s, gfpflags, node, caller); + + /* Honor the call site pointer we recieved. */ + trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); + + return ret; +} + +#ifdef CONFIG_SLUB_DEBUG +static int count_inuse(struct page *page) +{ + return page->inuse; +} + +static int count_total(struct page *page) +{ + return page->objects; } -#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) static int validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { @@ -3103,7 +3523,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page, return 0; /* Now we know that a valid freelist exists */ - bitmap_zero(map, s->objects); + bitmap_zero(map, page->objects); for_each_free_object(p, s, page->freelist) { set_bit(slab_index(p, s, addr), map); @@ -3111,7 +3531,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page, return 0; } - for_each_object(p, s, addr) + for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) if (!check_object(s, page, p, 1)) return 0; @@ -3129,12 +3549,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page, s->name, page); if (s->flags & DEBUG_DEFAULT_FLAGS) { - if (!SlabDebug(page)) - printk(KERN_ERR "SLUB %s: SlabDebug not set " + if (!PageSlubDebug(page)) + printk(KERN_ERR "SLUB %s: SlubDebug not set " "on slab 0x%p\n", s->name, page); } else { - if (SlabDebug(page)) - printk(KERN_ERR "SLUB %s: SlabDebug set on " + if (PageSlubDebug(page)) + printk(KERN_ERR "SLUB %s: SlubDebug set on " "slab 0x%p\n", s->name, page); } } @@ -3177,7 +3597,7 @@ static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) * + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); if (!map) @@ -3213,8 +3633,9 @@ static void resiliency_test(void) p = kzalloc(32, GFP_KERNEL); p[32 + sizeof(void *)] = 0x34; printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> -0x%p\n", p); - printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); + " 0x34 -> -0x%p\n", p); + printk(KERN_ERR + "If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches + 5); p = kzalloc(64, GFP_KERNEL); @@ -3222,7 +3643,8 @@ static void resiliency_test(void) *p = 0x56; printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", p); - printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); + printk(KERN_ERR + "If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches + 6); printk(KERN_ERR "\nB. Corruption after free\n"); @@ -3235,7 +3657,8 @@ static void resiliency_test(void) p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); + printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", + p); validate_slab_cache(kmalloc_caches + 8); p = kzalloc(512, GFP_KERNEL); @@ -3255,13 +3678,13 @@ static void resiliency_test(void) {}; struct location { unsigned long count; - void *addr; + unsigned long addr; long long sum_time; long min_time; long max_time; long min_pid; long max_pid; - cpumask_t cpus; + DECLARE_BITMAP(cpus, NR_CPUS); nodemask_t nodes; }; @@ -3303,7 +3726,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, { long start, end, pos; struct location *l; - void *caddr; + unsigned long caddr; unsigned long age = jiffies - track->when; start = -1; @@ -3336,7 +3759,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, if (track->pid > l->max_pid) l->max_pid = track->pid; - cpu_set(track->cpu, l->cpus); + cpumask_set_cpu(track->cpu, + to_cpumask(l->cpus)); } node_set(page_to_nid(virt_to_page(track)), l->nodes); return 1; @@ -3366,8 +3790,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, l->max_time = age; l->min_pid = track->pid; l->max_pid = track->pid; - cpus_clear(l->cpus); - cpu_set(track->cpu, l->cpus); + cpumask_clear(to_cpumask(l->cpus)); + cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); nodes_clear(l->nodes); node_set(page_to_nid(virt_to_page(track)), l->nodes); return 1; @@ -3377,14 +3801,14 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, struct page *page, enum track_item alloc) { void *addr = page_address(page); - DECLARE_BITMAP(map, s->objects); + DECLARE_BITMAP(map, page->objects); void *p; - bitmap_zero(map, s->objects); + bitmap_zero(map, page->objects); for_each_free_object(p, s, page->freelist) set_bit(slab_index(p, s, addr), map); - for_each_object(p, s, addr) + for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) add_location(t, s, get_track(s, p, alloc)); } @@ -3423,7 +3847,7 @@ static int list_locations(struct kmem_cache *s, char *buf, for (i = 0; i < t.count; i++) { struct location *l = &t.loc[i]; - if (len > PAGE_SIZE - 100) + if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) break; len += sprintf(buf + len, "%7ld ", l->count); @@ -3433,12 +3857,10 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf + len, ""); if (l->sum_time != l->min_time) { - unsigned long remainder; - len += sprintf(buf + len, " age=%ld/%ld/%ld", - l->min_time, - div_long_long_rem(l->sum_time, l->count, &remainder), - l->max_time); + l->min_time, + (long)div_u64(l->sum_time, l->count), + l->max_time); } else len += sprintf(buf + len, " age=%ld", l->min_time); @@ -3450,14 +3872,15 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf + len, " pid=%ld", l->min_pid); - if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && + if (num_online_cpus() > 1 && + !cpumask_empty(to_cpumask(l->cpus)) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " cpus="); len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, - l->cpus); + to_cpumask(l->cpus)); } - if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && + if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " nodes="); len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, @@ -3474,80 +3897,87 @@ static int list_locations(struct kmem_cache *s, char *buf, } enum slab_stat_type { - SL_FULL, - SL_PARTIAL, - SL_CPU, - SL_OBJECTS + SL_ALL, /* All slabs */ + SL_PARTIAL, /* Only partially allocated slabs */ + SL_CPU, /* Only slabs used for cpu caches */ + SL_OBJECTS, /* Determine allocated objects not slabs */ + SL_TOTAL /* Determine object capacity not slabs */ }; -#define SO_FULL (1 << SL_FULL) +#define SO_ALL (1 << SL_ALL) #define SO_PARTIAL (1 << SL_PARTIAL) #define SO_CPU (1 << SL_CPU) #define SO_OBJECTS (1 << SL_OBJECTS) +#define SO_TOTAL (1 << SL_TOTAL) -static unsigned long slab_objects(struct kmem_cache *s, - char *buf, unsigned long flags) +static ssize_t show_slab_objects(struct kmem_cache *s, + char *buf, unsigned long flags) { unsigned long total = 0; - int cpu; int node; int x; unsigned long *nodes; unsigned long *per_cpu; nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); + if (!nodes) + return -ENOMEM; per_cpu = nodes + nr_node_ids; - for_each_possible_cpu(cpu) { - struct page *page; - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + if (flags & SO_CPU) { + int cpu; - if (!c) - continue; + for_each_possible_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - page = c->page; - node = c->node; - if (node < 0) - continue; - if (page) { - if (flags & SO_CPU) { - if (flags & SO_OBJECTS) - x = page->inuse; + if (!c || c->node < 0) + continue; + + if (c->page) { + if (flags & SO_TOTAL) + x = c->page->objects; + else if (flags & SO_OBJECTS) + x = c->page->inuse; else x = 1; + total += x; - nodes[node] += x; + nodes[c->node] += x; } - per_cpu[node]++; + per_cpu[c->node]++; } } - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + if (flags & SO_ALL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + + if (flags & SO_TOTAL) + x = atomic_long_read(&n->total_objects); + else if (flags & SO_OBJECTS) + x = atomic_long_read(&n->total_objects) - + count_partial(n, count_free); - if (flags & SO_PARTIAL) { - if (flags & SO_OBJECTS) - x = count_partial(n); else - x = n->nr_partial; + x = atomic_long_read(&n->nr_slabs); total += x; nodes[node] += x; } - if (flags & SO_FULL) { - int full_slabs = atomic_long_read(&n->nr_slabs) - - per_cpu[node] - - n->nr_partial; + } else if (flags & SO_PARTIAL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); - if (flags & SO_OBJECTS) - x = full_slabs * s->objects; + if (flags & SO_TOTAL) + x = count_partial(n, count_total); + else if (flags & SO_OBJECTS) + x = count_partial(n, count_inuse); else - x = full_slabs; + x = n->nr_partial; total += x; nodes[node] += x; } } - x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA for_each_node_state(node, N_NORMAL_MEMORY) @@ -3562,14 +3992,6 @@ static unsigned long slab_objects(struct kmem_cache *s, static int any_slab_objects(struct kmem_cache *s) { int node; - int cpu; - - for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c && c->page) - return 1; - } for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -3577,7 +3999,7 @@ static int any_slab_objects(struct kmem_cache *s) if (!n) continue; - if (n->nr_partial || atomic_long_read(&n->nr_slabs)) + if (atomic_long_read(&n->total_objects)) return 1; } return 0; @@ -3619,15 +4041,52 @@ SLAB_ATTR_RO(object_size); static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->objects); + return sprintf(buf, "%d\n", oo_objects(s->oo)); } SLAB_ATTR_RO(objs_per_slab); +static ssize_t order_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + unsigned long order; + int err; + + err = strict_strtoul(buf, 10, &order); + if (err) + return err; + + if (order > slub_max_order || order < slub_min_order) + return -EINVAL; + + calculate_sizes(s, order); + return length; +} + static ssize_t order_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->order); + return sprintf(buf, "%d\n", oo_order(s->oo)); +} +SLAB_ATTR(order); + +static ssize_t min_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%lu\n", s->min_partial); +} + +static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long min; + int err; + + err = strict_strtoul(buf, 10, &min); + if (err) + return err; + + set_min_partial(s, min); + return length; } -SLAB_ATTR_RO(order); +SLAB_ATTR(min_partial); static ssize_t ctor_show(struct kmem_cache *s, char *buf) { @@ -3648,28 +4107,40 @@ SLAB_ATTR_RO(aliases); static ssize_t slabs_show(struct kmem_cache *s, char *buf) { - return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); + return show_slab_objects(s, buf, SO_ALL); } SLAB_ATTR_RO(slabs); static ssize_t partial_show(struct kmem_cache *s, char *buf) { - return slab_objects(s, buf, SO_PARTIAL); + return show_slab_objects(s, buf, SO_PARTIAL); } SLAB_ATTR_RO(partial); static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) { - return slab_objects(s, buf, SO_CPU); + return show_slab_objects(s, buf, SO_CPU); } SLAB_ATTR_RO(cpu_slabs); static ssize_t objects_show(struct kmem_cache *s, char *buf) { - return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); + return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); } SLAB_ATTR_RO(objects); +static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); +} +SLAB_ATTR_RO(objects_partial); + +static ssize_t total_objects_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); +} +SLAB_ATTR_RO(total_objects); + static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); @@ -3749,7 +4220,7 @@ static ssize_t red_zone_store(struct kmem_cache *s, s->flags &= ~SLAB_RED_ZONE; if (buf[0] == '1') s->flags |= SLAB_RED_ZONE; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(red_zone); @@ -3768,7 +4239,7 @@ static ssize_t poison_store(struct kmem_cache *s, s->flags &= ~SLAB_POISON; if (buf[0] == '1') s->flags |= SLAB_POISON; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(poison); @@ -3787,7 +4258,7 @@ static ssize_t store_user_store(struct kmem_cache *s, s->flags &= ~SLAB_STORE_USER; if (buf[0] == '1') s->flags |= SLAB_STORE_USER; - calculate_sizes(s); + calculate_sizes(s, -1); return length; } SLAB_ATTR(store_user); @@ -3855,21 +4326,103 @@ static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, const char *buf, size_t length) { - int n = simple_strtoul(buf, NULL, 10); + unsigned long ratio; + int err; + + err = strict_strtoul(buf, 10, &ratio); + if (err) + return err; + + if (ratio <= 100) + s->remote_node_defrag_ratio = ratio * 10; - if (n < 100) - s->remote_node_defrag_ratio = n * 10; return length; } SLAB_ATTR(remote_node_defrag_ratio); #endif +#ifdef CONFIG_SLUB_STATS +static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) +{ + unsigned long sum = 0; + int cpu; + int len; + int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + for_each_online_cpu(cpu) { + unsigned x = get_cpu_slab(s, cpu)->stat[si]; + + data[cpu] = x; + sum += x; + } + + len = sprintf(buf, "%lu", sum); + +#ifdef CONFIG_SMP + for_each_online_cpu(cpu) { + if (data[cpu] && len < PAGE_SIZE - 20) + len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); + } +#endif + kfree(data); + return len + sprintf(buf + len, "\n"); +} + +static void clear_stat(struct kmem_cache *s, enum stat_item si) +{ + int cpu; + + for_each_online_cpu(cpu) + get_cpu_slab(s, cpu)->stat[si] = 0; +} + +#define STAT_ATTR(si, text) \ +static ssize_t text##_show(struct kmem_cache *s, char *buf) \ +{ \ + return show_stat(s, buf, si); \ +} \ +static ssize_t text##_store(struct kmem_cache *s, \ + const char *buf, size_t length) \ +{ \ + if (buf[0] != '0') \ + return -EINVAL; \ + clear_stat(s, si); \ + return length; \ +} \ +SLAB_ATTR(text); \ + +STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); +STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); +STAT_ATTR(FREE_FASTPATH, free_fastpath); +STAT_ATTR(FREE_SLOWPATH, free_slowpath); +STAT_ATTR(FREE_FROZEN, free_frozen); +STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); +STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); +STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); +STAT_ATTR(ALLOC_SLAB, alloc_slab); +STAT_ATTR(ALLOC_REFILL, alloc_refill); +STAT_ATTR(FREE_SLAB, free_slab); +STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); +STAT_ATTR(DEACTIVATE_FULL, deactivate_full); +STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); +STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); +STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); +STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); +STAT_ATTR(ORDER_FALLBACK, order_fallback); +#endif + static struct attribute *slab_attrs[] = { &slab_size_attr.attr, &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &min_partial_attr.attr, &objects_attr.attr, + &objects_partial_attr.attr, + &total_objects_attr.attr, &slabs_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, @@ -3894,6 +4447,26 @@ static struct attribute *slab_attrs[] = { #ifdef CONFIG_NUMA &remote_node_defrag_ratio_attr.attr, #endif +#ifdef CONFIG_SLUB_STATS + &alloc_fastpath_attr.attr, + &alloc_slowpath_attr.attr, + &free_fastpath_attr.attr, + &free_slowpath_attr.attr, + &free_frozen_attr.attr, + &free_add_partial_attr.attr, + &free_remove_partial_attr.attr, + &alloc_from_partial_attr.attr, + &alloc_slab_attr.attr, + &alloc_refill_attr.attr, + &free_slab_attr.attr, + &cpuslab_flush_attr.attr, + &deactivate_full_attr.attr, + &deactivate_empty_attr.attr, + &deactivate_to_head_attr.attr, + &deactivate_to_tail_attr.attr, + &deactivate_remote_frees_attr.attr, + &order_fallback_attr.attr, +#endif NULL }; @@ -3974,8 +4547,8 @@ static struct kset *slab_kset; #define ID_STR_LENGTH 64 /* Create a unique string id for a slab cache: - * format - * :[flags-]size:[memory address of kmemcache] + * + * Format :[flags-]size */ static char *create_unique_id(struct kmem_cache *s) { @@ -3998,6 +4571,8 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'a'; if (s->flags & SLAB_DEBUG_FREE) *p++ = 'F'; + if (!(s->flags & SLAB_NOTRACK)) + *p++ = 't'; if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); @@ -4040,8 +4615,11 @@ static int sysfs_slab_add(struct kmem_cache *s) } err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) + if (err) { + kobject_del(&s->kobj); + kobject_put(&s->kobj); return err; + } kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ @@ -4060,7 +4638,7 @@ static void sysfs_slab_remove(struct kmem_cache *s) /* * Need to buffer aliases during bootup until sysfs becomes - * available lest we loose that information. + * available lest we lose that information. */ struct saved_alias { struct kmem_cache *s; @@ -4135,14 +4713,6 @@ __initcall(slab_sysfs_init); * The /proc/slabinfo ABI */ #ifdef CONFIG_SLABINFO - -ssize_t slabinfo_write(struct file *file, const char __user * buffer, - size_t count, loff_t *ppos) -{ - return -EINVAL; -} - - static void print_slabinfo_header(struct seq_file *m) { seq_puts(m, "slabinfo - version: 2.1\n"); @@ -4179,7 +4749,8 @@ static int s_show(struct seq_file *m, void *p) unsigned long nr_partials = 0; unsigned long nr_slabs = 0; unsigned long nr_inuse = 0; - unsigned long nr_objs; + unsigned long nr_objs = 0; + unsigned long nr_free = 0; struct kmem_cache *s; int node; @@ -4193,14 +4764,15 @@ static int s_show(struct seq_file *m, void *p) nr_partials += n->nr_partial; nr_slabs += atomic_long_read(&n->nr_slabs); - nr_inuse += count_partial(n); + nr_objs += atomic_long_read(&n->total_objects); + nr_free += count_partial(n, count_free); } - nr_objs = nr_slabs * s->objects; - nr_inuse += (nr_slabs - nr_partials) * s->objects; + nr_inuse = nr_objs - nr_free; seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, - nr_objs, s->size, s->objects, (1 << s->order)); + nr_objs, s->size, oo_objects(s->oo), + (1 << oo_order(s->oo))); seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, 0UL); @@ -4208,11 +4780,29 @@ static int s_show(struct seq_file *m, void *p) return 0; } -const struct seq_operations slabinfo_op = { +static const struct seq_operations slabinfo_op = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show, }; +static int slabinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slabinfo_op); +} + +static const struct file_operations proc_slabinfo_operations = { + .open = slabinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init slab_proc_init(void) +{ + proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); + return 0; +} +module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */