X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fslub.c;h=e29a42988c78ca17d9594c2cb9fdb3cb87c257d0;hb=49d4914fd7c10ae846d0f30d5f6f4732cc68499c;hp=c4f40d373d1e5ec7050671faa8a109031c76f2ee;hpb=643b113849d8faa68c9f01c3c9d929bfbffd50bd;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/slub.c b/mm/slub.c index c4f40d3..e29a429 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -66,11 +66,11 @@ * SLUB assigns one slab for allocation to each processor. * Allocations only occur from these slabs called cpu slabs. * - * Slabs with free elements are kept on a partial list. - * There is no list for full slabs. If an object in a full slab is + * Slabs with free elements are kept on a partial list and during regular + * operations no list for full slabs is used. If an object in a full slab is * freed then the slab will show up again on the partial lists. - * Otherwise there is no need to track full slabs unless we have to - * track full slabs for debugging purposes. + * We track full slabs for debugging purposes though because otherwise we + * cannot scan all objects. * * Slabs are freed when they become empty. Teardown and setup is * minimal so we rely on the page allocators per cpu caches for @@ -78,31 +78,70 @@ * * Overloading of page flags that are otherwise used for LRU management. * - * PageActive The slab is used as a cpu cache. Allocations - * may be performed from the slab. The slab is not - * on any slab list and cannot be moved onto one. + * PageActive The slab is frozen and exempt from list processing. + * This means that the slab is dedicated to a purpose + * such as satisfying allocations for a specific + * processor. Objects may be freed in the slab while + * it is frozen but slab_free will then skip the usual + * list operations. It is up to the processor holding + * the slab to integrate the slab into the slab lists + * when the slab is no longer needed. + * + * One use of this flag is to mark slabs that are + * used for allocations. Then such a slab becomes a cpu + * slab. The cpu slab may be equipped with an additional + * freelist that allows lockless access to + * free objects in addition to the regular freelist + * that requires the slab lock. * * PageError Slab requires special handling due to debug * options set. This moves slab handling out of - * the fast path. + * the fast path and disables lockless freelists. */ +#define FROZEN (1 << PG_active) + +#ifdef CONFIG_SLUB_DEBUG +#define SLABDEBUG (1 << PG_error) +#else +#define SLABDEBUG 0 +#endif + +static inline int SlabFrozen(struct page *page) +{ + return page->flags & FROZEN; +} + +static inline void SetSlabFrozen(struct page *page) +{ + page->flags |= FROZEN; +} + +static inline void ClearSlabFrozen(struct page *page) +{ + page->flags &= ~FROZEN; +} + +static inline int SlabDebug(struct page *page) +{ + return page->flags & SLABDEBUG; +} + +static inline void SetSlabDebug(struct page *page) +{ + page->flags |= SLABDEBUG; +} + +static inline void ClearSlabDebug(struct page *page) +{ + page->flags &= ~SLABDEBUG; +} + /* * Issues still to be resolved: * - * - The per cpu array is updated for each new slab and and is a remote - * cacheline for most nodes. This could become a bouncing cacheline given - * enough frequent updates. There are 16 pointers in a cacheline.so at - * max 16 cpus could compete. Likely okay. - * * - Support PAGE_ALLOC_DEBUG. Should be easy to do. * - * - Support DEBUG_SLAB_LEAK. Trouble is we do not know where the full - * slabs are in SLUB. - * - * - SLAB_DEBUG_INITIAL is not supported but I have never seen a use of - * it. - * * - Variable sizing of the per node arrays */ @@ -129,12 +168,21 @@ #endif /* - * Flags from the regular SLAB that SLUB does not support: + * Mininum number of partial slabs. These will be left on the partial + * lists even if they are empty. kmem_cache_shrink may reclaim them. + */ +#define MIN_PARTIAL 2 + +/* + * Maximum number of desirable partial slabs. + * The existence of more partial slabs makes kmem_cache_shrink + * sort the partial list by the number of objects in the. */ -#define SLUB_UNIMPLEMENTED (SLAB_DEBUG_INITIAL) +#define MAX_PARTIAL 10 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) + /* * Set of flags that will prevent slab merging */ @@ -153,7 +201,13 @@ #endif /* Internal SLUB flags */ -#define __OBJECT_POISON 0x80000000 /* Poison object */ +#define __OBJECT_POISON 0x80000000 /* Poison object */ +#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ + +/* Not all arches define cache_line_size */ +#ifndef cache_line_size +#define cache_line_size() L1_CACHE_BYTES +#endif static int kmem_size = sizeof(struct kmem_cache); @@ -164,22 +218,35 @@ static struct notifier_block slab_notifier; static enum { DOWN, /* No slab functionality available */ PARTIAL, /* kmem_cache_open() works but kmalloc does not */ - UP, /* Everything works */ + UP, /* Everything works but does not show up in sysfs */ SYSFS /* Sysfs up */ } slab_state = DOWN; /* A list of all slab caches on the system */ static DECLARE_RWSEM(slub_lock); -LIST_HEAD(slab_caches); +static LIST_HEAD(slab_caches); + +/* + * Tracking user of a slab. + */ +struct track { + void *addr; /* Called from address */ + int cpu; /* Was running on cpu */ + int pid; /* Pid context */ + unsigned long when; /* When did the operation occur */ +}; + +enum track_item { TRACK_ALLOC, TRACK_FREE }; -#ifdef CONFIG_SYSFS +#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); #else -static int sysfs_slab_add(struct kmem_cache *s) { return 0; } -static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static void sysfs_slab_remove(struct kmem_cache *s) {} +static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } +static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) + { return 0; } +static inline void sysfs_slab_remove(struct kmem_cache *s) {} #endif /******************************************************************** @@ -200,6 +267,76 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) #endif } +static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) +{ +#ifdef CONFIG_SMP + return s->cpu_slab[cpu]; +#else + return &s->cpu_slab; +#endif +} + +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, const void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + if (object < base || object >= base + s->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + +/* + * Slow version of get and set free pointer. + * + * This version requires touching the cache lines of kmem_cache which + * we avoid to do in the fast alloc free paths. There we obtain the offset + * from the page struct. + */ +static inline void *get_freepointer(struct kmem_cache *s, void *object) +{ + return *(void **)(object + s->offset); +} + +static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) +{ + *(void **)(object + s->offset) = fp; +} + +/* Loop over all objects in a slab */ +#define for_each_object(__p, __s, __addr) \ + for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ + __p += (__s)->size) + +/* Scan freelist */ +#define for_each_free_object(__p, __s, __free) \ + for (__p = (__free); __p; __p = get_freepointer((__s), __p)) + +/* Determine object index from a given position */ +static inline int slab_index(void *p, struct kmem_cache *s, void *addr) +{ + return (p - addr) / s->size; +} + +#ifdef CONFIG_SLUB_DEBUG +/* + * Debug settings: + */ +#ifdef CONFIG_SLUB_DEBUG_ON +static int slub_debug = DEBUG_DEFAULT_FLAGS; +#else +static int slub_debug; +#endif + +static char *slub_debug_slabs; + /* * Object debugging */ @@ -213,7 +350,7 @@ static void print_section(char *text, u8 *addr, unsigned int length) for (i = 0; i < length; i++) { if (newline) { - printk(KERN_ERR "%10s 0x%p: ", text, addr + i); + printk(KERN_ERR "%8s 0x%p: ", text, addr + i); newline = 0; } printk(" %02x", addr[i]); @@ -235,35 +372,6 @@ static void print_section(char *text, u8 *addr, unsigned int length) } } -/* - * Slow version of get and set free pointer. - * - * This requires touching the cache lines of kmem_cache. - * The offset can also be obtained from the page. In that - * case it is in the cacheline that we already need to touch. - */ -static void *get_freepointer(struct kmem_cache *s, void *object) -{ - return *(void **)(object + s->offset); -} - -static void set_freepointer(struct kmem_cache *s, void *object, void *fp) -{ - *(void **)(object + s->offset) = fp; -} - -/* - * Tracking user of a slab. - */ -struct track { - void *addr; /* Called from address */ - int cpu; /* Was running on cpu */ - int pid; /* Pid context */ - unsigned long when; /* When did the operation occur */ -}; - -enum track_item { TRACK_ALLOC, TRACK_FREE }; - static struct track *get_track(struct kmem_cache *s, void *object, enum track_item alloc) { @@ -299,10 +407,11 @@ static void set_track(struct kmem_cache *s, void *object, static void init_tracking(struct kmem_cache *s, void *object) { - if (s->flags & SLAB_STORE_USER) { - set_track(s, object, TRACK_FREE, NULL); - set_track(s, object, TRACK_ALLOC, NULL); - } + if (!(s->flags & SLAB_STORE_USER)) + return; + + set_track(s, object, TRACK_FREE, NULL); + set_track(s, object, TRACK_ALLOC, NULL); } static void print_track(const char *s, struct track *t) @@ -310,65 +419,106 @@ static void print_track(const char *s, struct track *t) if (!t->addr) return; - printk(KERN_ERR "%s: ", s); + printk(KERN_ERR "INFO: %s in ", s); __print_symbol("%s", (unsigned long)t->addr); - printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); + printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); +} + +static void print_tracking(struct kmem_cache *s, void *object) +{ + if (!(s->flags & SLAB_STORE_USER)) + return; + + print_track("Allocated", get_track(s, object, TRACK_ALLOC)); + print_track("Freed", get_track(s, object, TRACK_FREE)); +} + +static void print_page_info(struct page *page) +{ + printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n", + page, page->inuse, page->freelist, page->flags); + +} + +static void slab_bug(struct kmem_cache *s, char *fmt, ...) +{ + va_list args; + char buf[100]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + printk(KERN_ERR "========================================" + "=====================================\n"); + printk(KERN_ERR "BUG %s: %s\n", s->name, buf); + printk(KERN_ERR "----------------------------------------" + "-------------------------------------\n\n"); +} + +static void slab_fix(struct kmem_cache *s, char *fmt, ...) +{ + va_list args; + char buf[100]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + printk(KERN_ERR "FIX %s: %s\n", s->name, buf); } -static void print_trailer(struct kmem_cache *s, u8 *p) +static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) { unsigned int off; /* Offset of last byte */ + u8 *addr = page_address(page); + + print_tracking(s, p); + + print_page_info(page); + + printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + p, p - addr, get_freepointer(s, p)); + + if (p > addr + 16) + print_section("Bytes b4", p - 16, 16); + + print_section("Object", p, min(s->objsize, 128)); if (s->flags & SLAB_RED_ZONE) print_section("Redzone", p + s->objsize, s->inuse - s->objsize); - printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n", - p + s->offset, - get_freepointer(s, p)); - if (s->offset) off = s->offset + sizeof(void *); else off = s->inuse; - if (s->flags & SLAB_STORE_USER) { - print_track("Last alloc", get_track(s, p, TRACK_ALLOC)); - print_track("Last free ", get_track(s, p, TRACK_FREE)); + if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - } if (off != s->size) /* Beginning of the filler is the free pointer */ - print_section("Filler", p + off, s->size - off); + print_section("Padding", p + off, s->size - off); + + dump_stack(); } static void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason) { - u8 *addr = page_address(page); - - printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n", - s->name, reason, object, page); - printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n", - object - addr, page->flags, page->inuse, page->freelist); - if (object > addr + 16) - print_section("Bytes b4", object - 16, 16); - print_section("Object", object, min(s->objsize, 128)); - print_trailer(s, object); - dump_stack(); + slab_bug(s, reason); + print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...) +static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) { va_list args; char buf[100]; - va_start(args, reason); - vsnprintf(buf, sizeof(buf), reason, args); + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf, - page); + slab_bug(s, fmt); + print_page_info(page); dump_stack(); } @@ -387,33 +537,46 @@ static void init_object(struct kmem_cache *s, void *object, int active) s->inuse - s->objsize); } -static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) +static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) { while (bytes) { if (*start != (u8)value) - return 0; + return start; start++; bytes--; } - return 1; + return NULL; } +static void restore_bytes(struct kmem_cache *s, char *message, u8 data, + void *from, void *to) +{ + slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); + memset(from, data, to - from); +} -static int check_valid_pointer(struct kmem_cache *s, struct page *page, - void *object) +static int check_bytes_and_report(struct kmem_cache *s, struct page *page, + u8 *object, char *what, + u8* start, unsigned int value, unsigned int bytes) { - void *base; + u8 *fault; + u8 *end; - if (!object) + fault = check_bytes(start, value, bytes); + if (!fault) return 1; - base = page_address(page); - if (object < base || object >= base + s->objects * s->size || - (object - base) % s->size) { - return 0; - } + end = start + bytes; + while (end > fault && end[-1] == value) + end--; - return 1; + slab_bug(s, "%s overwritten", what); + printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", + fault, end - 1, fault[0], value); + print_trailer(s, page, object); + + restore_bytes(s, what, value, fault, end); + return 0; } /* @@ -423,37 +586,37 @@ static int check_valid_pointer(struct kmem_cache *s, struct page *page, * Bytes of the object to be managed. * If the freepointer may overlay the object then the free * pointer is the first word of the object. + * * Poisoning uses 0x6b (POISON_FREE) and the last byte is * 0xa5 (POISON_END) * * object + s->objsize * Padding to reach word boundary. This is also used for Redzoning. - * Padding is extended to word size if Redzoning is enabled - * and objsize == inuse. + * Padding is extended by another word if Redzoning is enabled and + * objsize == inuse. + * * We fill with 0xbb (RED_INACTIVE) for inactive objects and with * 0xcc (RED_ACTIVE) for objects in use. * * object + s->inuse + * Meta data starts here. + * * A. Free pointer (if we cannot overwrite object on free) * B. Tracking data for SLAB_STORE_USER - * C. Padding to reach required alignment boundary - * Padding is done using 0x5a (POISON_INUSE) + * C. Padding to reach required alignment boundary or at mininum + * one word if debuggin is on to be able to detect writes + * before the word boundary. + * + * Padding is done using 0x5a (POISON_INUSE) * * object + s->size + * Nothing is used beyond s->size. * - * If slabcaches are merged then the objsize and inuse boundaries are to - * be ignored. And therefore no slab options that rely on these boundaries + * If slabcaches are merged then the objsize and inuse boundaries are mostly + * ignored. And therefore no slab options that rely on these boundaries * may be used with merged slabcaches. */ -static void restore_bytes(struct kmem_cache *s, char *message, u8 data, - void *from, void *to) -{ - printk(KERN_ERR "@@@ SLUB: %s Restoring %s (0x%x) from 0x%p-0x%p\n", - s->name, message, data, from, to - 1); - memset(from, data, to - from); -} - static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) { unsigned long off = s->inuse; /* The end of info */ @@ -469,41 +632,39 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) if (s->size == off) return 1; - if (check_bytes(p + off, POISON_INUSE, s->size - off)) - return 1; - - object_err(s, page, p, "Object padding check fails"); - - /* - * Restore padding - */ - restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size); - return 0; + return check_bytes_and_report(s, page, p, "Object padding", + p + off, POISON_INUSE, s->size - off); } static int slab_pad_check(struct kmem_cache *s, struct page *page) { - u8 *p; - int length, remainder; + u8 *start; + u8 *fault; + u8 *end; + int length; + int remainder; if (!(s->flags & SLAB_POISON)) return 1; - p = page_address(page); + start = page_address(page); + end = start + (PAGE_SIZE << s->order); length = s->objects * s->size; - remainder = (PAGE_SIZE << s->order) - length; + remainder = end - (start + length); if (!remainder) return 1; - if (!check_bytes(p + length, POISON_INUSE, remainder)) { - printk(KERN_ERR "SLUB: %s slab 0x%p: Padding fails check\n", - s->name, p); - dump_stack(); - restore_bytes(s, "slab padding", POISON_INUSE, p + length, - p + length + remainder); - return 0; - } - return 1; + fault = check_bytes(start + length, POISON_INUSE, remainder); + if (!fault) + return 1; + while (end > fault && end[-1] == POISON_INUSE) + end--; + + slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); + print_section("Padding", start, length); + + restore_bytes(s, "slab padding", POISON_INUSE, start, end); + return 0; } static int check_object(struct kmem_cache *s, struct page *page, @@ -516,41 +677,22 @@ static int check_object(struct kmem_cache *s, struct page *page, unsigned int red = active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; - if (!check_bytes(endobject, red, s->inuse - s->objsize)) { - object_err(s, page, object, - active ? "Redzone Active" : "Redzone Inactive"); - restore_bytes(s, "redzone", red, - endobject, object + s->inuse); + if (!check_bytes_and_report(s, page, object, "Redzone", + endobject, red, s->inuse - s->objsize)) return 0; - } } else { - if ((s->flags & SLAB_POISON) && s->objsize < s->inuse && - !check_bytes(endobject, POISON_INUSE, - s->inuse - s->objsize)) { - object_err(s, page, p, "Alignment padding check fails"); - /* - * Fix it so that there will not be another report. - * - * Hmmm... We may be corrupting an object that now expects - * to be longer than allowed. - */ - restore_bytes(s, "alignment padding", POISON_INUSE, - endobject, object + s->inuse); - } + if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) + check_bytes_and_report(s, page, p, "Alignment padding", endobject, + POISON_INUSE, s->inuse - s->objsize); } if (s->flags & SLAB_POISON) { if (!active && (s->flags & __OBJECT_POISON) && - (!check_bytes(p, POISON_FREE, s->objsize - 1) || - p[s->objsize - 1] != POISON_END)) { - - object_err(s, page, p, "Poison check failed"); - restore_bytes(s, "Poison", POISON_FREE, - p, p + s->objsize -1); - restore_bytes(s, "Poison", POISON_END, - p + s->objsize - 1, p + s->objsize); + (!check_bytes_and_report(s, page, p, "Poison", p, + POISON_FREE, s->objsize - 1) || + !check_bytes_and_report(s, page, p, "Poison", + p + s->objsize -1, POISON_END, 1))) return 0; - } /* * check_pad_bytes cleans up on its own. */ @@ -570,8 +712,7 @@ static int check_object(struct kmem_cache *s, struct page *page, /* * No choice but to zap it and thus loose the remainder * of the free objects in this slab. May cause - * another error because the object count maybe - * wrong now. + * another error because the object count is now wrong. */ set_freepointer(s, p, NULL); return 0; @@ -584,30 +725,12 @@ static int check_slab(struct kmem_cache *s, struct page *page) VM_BUG_ON(!irqs_disabled()); if (!PageSlab(page)) { - printk(KERN_ERR "SLUB: %s Not a valid slab page @0x%p " - "flags=%lx mapping=0x%p count=%d \n", - s->name, page, page->flags, page->mapping, - page_count(page)); - return 0; - } - if (page->offset * sizeof(void *) != s->offset) { - printk(KERN_ERR "SLUB: %s Corrupted offset %lu in slab @0x%p" - " flags=0x%lx mapping=0x%p count=%d\n", - s->name, - (unsigned long)(page->offset * sizeof(void *)), - page, - page->flags, - page->mapping, - page_count(page)); - dump_stack(); + slab_err(s, page, "Not a valid slab page"); return 0; } if (page->inuse > s->objects) { - printk(KERN_ERR "SLUB: %s Inuse %u > max %u in slab " - "page @0x%p flags=%lx mapping=0x%p count=%d\n", - s->name, page->inuse, s->objects, page, page->flags, - page->mapping, page_count(page)); - dump_stack(); + slab_err(s, page, "inuse %u > max %u", + s->name, page->inuse, s->objects); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -616,9 +739,8 @@ static int check_slab(struct kmem_cache *s, struct page *page) } /* - * Determine if a certain object on a page is on the freelist and - * therefore free. Must hold the slab lock for cpu slabs to - * guarantee that the chains are consistent. + * Determine if a certain object on a page is on the freelist. Must hold the + * slab lock to guarantee that the chains are in a consistent state. */ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) { @@ -636,12 +758,10 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) set_freepointer(s, object, NULL); break; } else { - printk(KERN_ERR "SLUB: %s slab 0x%p " - "freepointer 0x%p corrupted.\n", - s->name, page, fp); - dump_stack(); + slab_err(s, page, "Freepointer corrupt"); page->freelist = NULL; page->inuse = s->objects; + slab_fix(s, "Freelist cleared"); return 0; } break; @@ -652,30 +772,35 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) } if (page->inuse != s->objects - nr) { - printk(KERN_ERR "slab %s: page 0x%p wrong object count." - " counter is %d but counted were %d\n", - s->name, page, page->inuse, - s->objects - nr); + slab_err(s, page, "Wrong object count. Counter is %d but " + "counted were %d", page->inuse, s->objects - nr); page->inuse = s->objects - nr; + slab_fix(s, "Object count adjusted."); } return search == NULL; } -/* - * Tracking of fully allocated slabs for debugging - */ -static void add_full(struct kmem_cache *s, struct page *page) +static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) { - struct kmem_cache_node *n; - - VM_BUG_ON(!irqs_disabled()); + if (s->flags & SLAB_TRACE) { + printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + s->name, + alloc ? "alloc" : "free", + object, page->inuse, + page->freelist); - VM_BUG_ON(!irqs_disabled()); + if (!alloc) + print_section("Object", (void *)object, s->objsize); - if (!(s->flags & SLAB_STORE_USER)) - return; + dump_stack(); + } +} - n = get_node(s, page_to_nid(page)); +/* + * Tracking of fully allocated slabs for debugging purposes. + */ +static void add_full(struct kmem_cache_node *n, struct page *page) +{ spin_lock(&n->list_lock); list_add(&page->lru, &n->full); spin_unlock(&n->list_lock); @@ -695,73 +820,69 @@ static void remove_full(struct kmem_cache *s, struct page *page) spin_unlock(&n->list_lock); } -static int alloc_object_checks(struct kmem_cache *s, struct page *page, - void *object) +static void setup_object_debug(struct kmem_cache *s, struct page *page, + void *object) +{ + if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) + return; + + init_object(s, object, 0); + init_tracking(s, object); +} + +static int alloc_debug_processing(struct kmem_cache *s, struct page *page, + void *object, void *addr) { if (!check_slab(s, page)) goto bad; if (object && !on_freelist(s, page, object)) { - printk(KERN_ERR "SLUB: %s Object 0x%p@0x%p " - "already allocated.\n", - s->name, object, page); - goto dump; + object_err(s, page, object, "Object already allocated"); + goto bad; } if (!check_valid_pointer(s, page, object)) { object_err(s, page, object, "Freelist Pointer check fails"); - goto dump; + goto bad; } - if (!object) - return 1; - - if (!check_object(s, page, object, 0)) + if (object && !check_object(s, page, object, 0)) goto bad; - init_object(s, object, 1); - if (s->flags & SLAB_TRACE) { - printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n", - s->name, object, page->inuse, - page->freelist); - dump_stack(); - } + /* Success perform special debug activities for allocs */ + if (s->flags & SLAB_STORE_USER) + set_track(s, object, TRACK_ALLOC, addr); + trace(s, page, object, 1); + init_object(s, object, 1); return 1; -dump: - dump_stack(); + bad: if (PageSlab(page)) { /* * If this is a slab page then lets do the best we can * to avoid issues in the future. Marking all objects - * as used avoids touching the remainder. + * as used avoids touching the remaining objects. */ - printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", - s->name, page); + slab_fix(s, "Marking all objects used"); page->inuse = s->objects; page->freelist = NULL; - /* Fix up fields that may be corrupted */ - page->offset = s->offset / sizeof(void *); } return 0; } -static int free_object_checks(struct kmem_cache *s, struct page *page, - void *object) +static int free_debug_processing(struct kmem_cache *s, struct page *page, + void *object, void *addr) { if (!check_slab(s, page)) goto fail; if (!check_valid_pointer(s, page, object)) { - printk(KERN_ERR "SLUB: %s slab 0x%p invalid " - "object pointer 0x%p\n", - s->name, page, object); + slab_err(s, page, "Invalid object pointer 0x%p", object); goto fail; } if (on_freelist(s, page, object)) { - printk(KERN_ERR "SLUB: %s slab 0x%p object " - "0x%p already free.\n", s->name, page, object); + object_err(s, page, object, "Object already free"); goto fail; } @@ -770,37 +891,148 @@ static int free_object_checks(struct kmem_cache *s, struct page *page, if (unlikely(s != page->slab)) { if (!PageSlab(page)) - printk(KERN_ERR "slab_free %s size %d: attempt to" - "free object(0x%p) outside of slab.\n", - s->name, s->size, object); + slab_err(s, page, "Attempt to free object(0x%p) " + "outside of slab", object); else - if (!page->slab) + if (!page->slab) { printk(KERN_ERR - "slab_free : no slab(NULL) for object 0x%p.\n", + "SLUB : no slab for object 0x%p.\n", object); + dump_stack(); + } else - printk(KERN_ERR "slab_free %s(%d): object at 0x%p" - " belongs to slab %s(%d)\n", - s->name, s->size, object, - page->slab->name, page->slab->size); + object_err(s, page, object, + "page slab pointer corrupt."); goto fail; } - if (s->flags & SLAB_TRACE) { - printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n", - s->name, object, page->inuse, - page->freelist); - print_section("Object", object, s->objsize); - dump_stack(); - } + + /* Special debug activities for freeing objects */ + if (!SlabFrozen(page) && !page->freelist) + remove_full(s, page); + if (s->flags & SLAB_STORE_USER) + set_track(s, object, TRACK_FREE, addr); + trace(s, page, object, 0); init_object(s, object, 0); return 1; + fail: - dump_stack(); - printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", - s->name, page, object); + slab_fix(s, "Object at 0x%p not freed", object); return 0; } +static int __init setup_slub_debug(char *str) +{ + slub_debug = DEBUG_DEFAULT_FLAGS; + if (*str++ != '=' || !*str) + /* + * No options specified. Switch on full debugging. + */ + goto out; + + if (*str == ',') + /* + * No options but restriction on slabs. This means full + * debugging for slabs matching a pattern. + */ + goto check_slabs; + + slub_debug = 0; + if (*str == '-') + /* + * Switch off all debugging measures. + */ + goto out; + + /* + * Determine which debug features should be switched on + */ + for ( ;*str && *str != ','; str++) { + switch (tolower(*str)) { + case 'f': + slub_debug |= SLAB_DEBUG_FREE; + break; + case 'z': + slub_debug |= SLAB_RED_ZONE; + break; + case 'p': + slub_debug |= SLAB_POISON; + break; + case 'u': + slub_debug |= SLAB_STORE_USER; + break; + case 't': + slub_debug |= SLAB_TRACE; + break; + default: + printk(KERN_ERR "slub_debug option '%c' " + "unknown. skipped\n",*str); + } + } + +check_slabs: + if (*str == ',') + slub_debug_slabs = str + 1; +out: + return 1; +} + +__setup("slub_debug", setup_slub_debug); + +static unsigned long kmem_cache_flags(unsigned long objsize, + unsigned long flags, const char *name, + void (*ctor)(struct kmem_cache *, void *)) +{ + /* + * The page->offset field is only 16 bit wide. This is an offset + * in units of words from the beginning of an object. If the slab + * size is bigger then we cannot move the free pointer behind the + * object anymore. + * + * On 32 bit platforms the limit is 256k. On 64bit platforms + * the limit is 512k. + * + * Debugging or ctor may create a need to move the free + * pointer. Fail if this happens. + */ + if (objsize >= 65535 * sizeof(void *)) { + BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | + SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); + BUG_ON(ctor); + } else { + /* + * Enable debugging if selected on the kernel commandline. + */ + if (slub_debug && (!slub_debug_slabs || + strncmp(slub_debug_slabs, name, + strlen(slub_debug_slabs)) == 0)) + flags |= slub_debug; + } + + return flags; +} +#else +static inline void setup_object_debug(struct kmem_cache *s, + struct page *page, void *object) {} + +static inline int alloc_debug_processing(struct kmem_cache *s, + struct page *page, void *object, void *addr) { return 0; } + +static inline int free_debug_processing(struct kmem_cache *s, + struct page *page, void *object, void *addr) { return 0; } + +static inline int slab_pad_check(struct kmem_cache *s, struct page *page) + { return 1; } +static inline int check_object(struct kmem_cache *s, struct page *page, + void *object, int active) { return 1; } +static inline void add_full(struct kmem_cache_node *n, struct page *page) {} +static inline unsigned long kmem_cache_flags(unsigned long objsize, + unsigned long flags, const char *name, + void (*ctor)(struct kmem_cache *, void *)) +{ + return flags; +} +#define slub_debug 0 +#endif /* * Slab allocation and freeing */ @@ -815,6 +1047,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (s->flags & SLAB_CACHE_DMA) flags |= SLUB_DMA; + if (s->flags & SLAB_RECLAIM_ACCOUNT) + flags |= __GFP_RECLAIMABLE; + if (node == -1) page = alloc_pages(flags, s->order); else @@ -834,19 +1069,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) static void setup_object(struct kmem_cache *s, struct page *page, void *object) { - if (PageError(page)) { - init_object(s, object, 0); - init_tracking(s, object); - } - - if (unlikely(s->ctor)) { - int mode = SLAB_CTOR_CONSTRUCTOR; - - if (!(s->flags & __GFP_WAIT)) - mode |= SLAB_CTOR_ATOMIC; - - s->ctor(object, s, mode); - } + setup_object_debug(s, page, object); + if (unlikely(s->ctor)) + s->ctor(s, object); } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -858,27 +1083,21 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) void *last; void *p; - if (flags & __GFP_NO_GROW) - return NULL; - - BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); - - if (flags & __GFP_WAIT) - local_irq_enable(); + BUG_ON(flags & GFP_SLAB_BUG_MASK); - page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); + page = allocate_slab(s, + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); if (!page) goto out; n = get_node(s, page_to_nid(page)); if (n) atomic_long_inc(&n->nr_slabs); - page->offset = s->offset / sizeof(void *); page->slab = s; page->flags |= 1 << PG_slab; if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | SLAB_TRACE)) - page->flags |= 1 << PG_error; + SetSlabDebug(page); start = page_address(page); end = start + s->objects * s->size; @@ -887,7 +1106,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) memset(start, POISON_INUSE, PAGE_SIZE << s->order); last = start; - for (p = start + s->size; p < end; p += s->size) { + for_each_object(p, s, start) { setup_object(s, page, last); set_freepointer(s, last, p); last = p; @@ -898,8 +1117,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) page->freelist = start; page->inuse = 0; out: - if (flags & __GFP_WAIT) - local_irq_disable(); return page; } @@ -907,17 +1124,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page) { int pages = 1 << s->order; - if (unlikely(PageError(page) || s->dtor)) { - void *start = page_address(page); - void *end = start + (pages << PAGE_SHIFT); + if (unlikely(SlabDebug(page))) { void *p; slab_pad_check(s, page); - for (p = start; p <= end - s->size; p += s->size) { - if (s->dtor) - s->dtor(p, s, 0); + for_each_object(p, s, page_address(page)) check_object(s, page, p, 0); - } + ClearSlabDebug(page); } mod_zone_page_state(page_zone(page), @@ -925,7 +1138,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - pages); - page->mapping = NULL; __free_pages(page, s->order); } @@ -956,7 +1168,7 @@ static void discard_slab(struct kmem_cache *s, struct page *page) atomic_long_dec(&n->nr_slabs); reset_page_mapcount(page); - page->flags &= ~(1 << PG_slab | 1 << PG_error); + __ClearPageSlab(page); free_slab(s, page); } @@ -984,10 +1196,16 @@ static __always_inline int slab_trylock(struct page *page) /* * Management of partially allocated slabs */ -static void add_partial(struct kmem_cache *s, struct page *page) +static void add_partial_tail(struct kmem_cache_node *n, struct page *page) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + spin_lock(&n->list_lock); + n->nr_partial++; + list_add_tail(&page->lru, &n->partial); + spin_unlock(&n->list_lock); +} +static void add_partial(struct kmem_cache_node *n, struct page *page) +{ spin_lock(&n->list_lock); n->nr_partial++; list_add(&page->lru, &n->partial); @@ -1006,22 +1224,23 @@ static void remove_partial(struct kmem_cache *s, } /* - * Lock page and remove it from the partial list + * Lock slab and remove from the partial list. * - * Must hold list_lock + * Must hold list_lock. */ -static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) +static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) { if (slab_trylock(page)) { list_del(&page->lru); n->nr_partial--; + SetSlabFrozen(page); return 1; } return 0; } /* - * Try to get a partial slab from a specific node + * Try to allocate a partial slab from a specific node. */ static struct page *get_partial_node(struct kmem_cache_node *n) { @@ -1030,14 +1249,15 @@ static struct page *get_partial_node(struct kmem_cache_node *n) /* * Racy check. If we mistakenly see no partial slabs then we * just allocate an empty slab. If we mistakenly try to get a - * partial slab then get_partials() will return NULL. + * partial slab and there is none available then get_partials() + * will return NULL. */ if (!n || !n->nr_partial) return NULL; spin_lock(&n->list_lock); list_for_each_entry(page, &n->partial, lru) - if (lock_and_del_slab(n, page)) + if (lock_and_freeze_slab(n, page)) goto out; page = NULL; out: @@ -1046,8 +1266,7 @@ out: } /* - * Get a page from somewhere. Search in increasing NUMA - * distances. + * Get a page from somewhere. Search in increasing NUMA distances. */ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) { @@ -1057,24 +1276,22 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) struct page *page; /* - * The defrag ratio allows to configure the tradeoffs between - * inter node defragmentation and node local allocations. - * A lower defrag_ratio increases the tendency to do local - * allocations instead of scanning throught the partial - * lists on other nodes. + * The defrag ratio allows a configuration of the tradeoffs between + * inter node defragmentation and node local allocations. A lower + * defrag_ratio increases the tendency to do local allocations + * instead of attempting to obtain partial slabs from other nodes. * - * If defrag_ratio is set to 0 then kmalloc() always - * returns node local objects. If its higher then kmalloc() - * may return off node objects in order to avoid fragmentation. - * - * A higher ratio means slabs may be taken from other nodes - * thus reducing the number of partial slabs on those nodes. + * If the defrag_ratio is set to 0 then kmalloc() always + * returns node local objects. If the ratio is higher then kmalloc() + * may return off node objects because partial slabs are obtained + * from other nodes and filled up. * * If /sys/slab/xx/defrag_ratio is set to 100 (which makes - * defrag_ratio = 1000) then every (well almost) allocation - * will first attempt to defrag slab caches on other nodes. This - * means scanning over all nodes to look for partial slabs which - * may be a bit expensive to do on every slab allocation. + * defrag_ratio = 1000) then every (well almost) allocation will + * first attempt to defrag slab caches on other nodes. This means + * scanning over all nodes to look for partial slabs which may be + * expensive if we do it every time we are trying to find a slab + * with available objects. */ if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) return NULL; @@ -1087,7 +1304,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) n = get_node(s, zone_to_nid(*z)); if (n && cpuset_zone_allowed_hardwall(*z, flags) && - n->nr_partial > 2) { + n->nr_partial > MIN_PARTIAL) { page = get_partial_node(n); if (page) return page; @@ -1119,55 +1336,88 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) * * On exit the slab lock will have been dropped. */ -static void putback_slab(struct kmem_cache *s, struct page *page) +static void unfreeze_slab(struct kmem_cache *s, struct page *page) { + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + ClearSlabFrozen(page); if (page->inuse) { + if (page->freelist) - add_partial(s, page); - else if (PageError(page)) - add_full(s, page); + add_partial(n, page); + else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) + add_full(n, page); slab_unlock(page); + } else { - slab_unlock(page); - discard_slab(s, page); + if (n->nr_partial < MIN_PARTIAL) { + /* + * Adding an empty slab to the partial slabs in order + * to avoid page allocator overhead. This slab needs + * to come after the other slabs with objects in + * order to fill them up. That way the size of the + * partial list stays small. kmem_cache_shrink can + * reclaim empty slabs from the partial list. + */ + add_partial_tail(n, page); + slab_unlock(page); + } else { + slab_unlock(page); + discard_slab(s, page); + } } } /* * Remove the cpu slab */ -static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) +static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { - s->cpu_slab[cpu] = NULL; - ClearPageActive(page); + struct page *page = c->page; + /* + * Merge cpu freelist into freelist. Typically we get here + * because both freelists are empty. So this is unlikely + * to occur. + */ + while (unlikely(c->freelist)) { + void **object; - putback_slab(s, page); + /* Retrieve object from cpu_freelist */ + object = c->freelist; + c->freelist = c->freelist[c->offset]; + + /* And put onto the regular freelist */ + object[c->offset] = page->freelist; + page->freelist = object; + page->inuse--; + } + c->page = NULL; + unfreeze_slab(s, page); } -static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) +static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { - slab_lock(page); - deactivate_slab(s, page, cpu); + slab_lock(c->page); + deactivate_slab(s, c); } /* * Flush cpu slab. * Called from IPI handler with interrupts disabled. */ -static void __flush_cpu_slab(struct kmem_cache *s, int cpu) +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { - struct page *page = s->cpu_slab[cpu]; + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - if (likely(page)) - flush_slab(s, page, cpu); + if (likely(c && c->page)) + flush_slab(s, c); } static void flush_cpu_slab(void *d) { struct kmem_cache *s = d; - int cpu = smp_processor_id(); - __flush_cpu_slab(s, cpu); + __flush_cpu_slab(s, smp_processor_id()); } static void flush_all(struct kmem_cache *s) @@ -1184,95 +1434,153 @@ static void flush_all(struct kmem_cache *s) } /* - * slab_alloc is optimized to only modify two cachelines on the fast path - * (aside from the stack): + * Check if the objects in a per cpu structure fit numa + * locality expectations. + */ +static inline int node_match(struct kmem_cache_cpu *c, int node) +{ +#ifdef CONFIG_NUMA + if (node != -1 && c->node != node) + return 0; +#endif + return 1; +} + +/* + * Slow path. The lockless freelist is empty or we need to perform + * debugging duties. * - * 1. The page struct - * 2. The first cacheline of the object to be allocated. + * Interrupts are disabled. * - * The only cache lines that are read (apart from code) is the - * per cpu array in the kmem_cache struct. + * Processing is still very fast if new objects have been freed to the + * regular freelist. In that case we simply take over the regular freelist + * as the lockless freelist and zap the regular freelist. * - * Fastpath is not possible if we need to get a new slab or have - * debugging enabled (which means all slabs are marked with PageError) + * If that is not working then we fall back to the partial lists. We take the + * first element of the freelist as the object to allocate now and move the + * rest of the freelist to the lockless freelist. + * + * And if we were unable to get a new slab from the partial slab lists then + * we need to allocate a new slab. This is slowest path since we may sleep. */ -static void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr) +static void *__slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) { - struct page *page; void **object; - unsigned long flags; - int cpu; + struct page *new; - local_irq_save(flags); - cpu = smp_processor_id(); - page = s->cpu_slab[cpu]; - if (!page) + if (!c->page) goto new_slab; - slab_lock(page); - if (unlikely(node != -1 && page_to_nid(page) != node)) + slab_lock(c->page); + if (unlikely(!node_match(c, node))) goto another_slab; -redo: - object = page->freelist; +load_freelist: + object = c->page->freelist; if (unlikely(!object)) goto another_slab; - if (unlikely(PageError(page))) + if (unlikely(SlabDebug(c->page))) goto debug; -have_object: - page->inuse++; - page->freelist = object[page->offset]; - slab_unlock(page); - local_irq_restore(flags); + object = c->page->freelist; + c->freelist = object[c->offset]; + c->page->inuse = s->objects; + c->page->freelist = NULL; + c->node = page_to_nid(c->page); + slab_unlock(c->page); return object; another_slab: - deactivate_slab(s, page, cpu); + deactivate_slab(s, c); new_slab: - page = get_partial(s, gfpflags, node); - if (likely(page)) { -have_slab: - s->cpu_slab[cpu] = page; - SetPageActive(page); - goto redo; - } - - page = new_slab(s, gfpflags, node); - if (page) { - cpu = smp_processor_id(); - if (s->cpu_slab[cpu]) { + new = get_partial(s, gfpflags, node); + if (new) { + c->page = new; + goto load_freelist; + } + + if (gfpflags & __GFP_WAIT) + local_irq_enable(); + + new = new_slab(s, gfpflags, node); + + if (gfpflags & __GFP_WAIT) + local_irq_disable(); + + if (new) { + c = get_cpu_slab(s, smp_processor_id()); + if (c->page) { /* - * Someone else populated the cpu_slab while we enabled - * interrupts, or we have got scheduled on another cpu. - * The page may not be on the requested node. + * Someone else populated the cpu_slab while we + * enabled interrupts, or we have gotten scheduled + * on another cpu. The page may not be on the + * requested node even if __GFP_THISNODE was + * specified. So we need to recheck. */ - if (node == -1 || - page_to_nid(s->cpu_slab[cpu]) == node) { + if (node_match(c, node)) { /* * Current cpuslab is acceptable and we * want the current one since its cache hot */ - discard_slab(s, page); - page = s->cpu_slab[cpu]; - slab_lock(page); - goto redo; + discard_slab(s, new); + slab_lock(c->page); + goto load_freelist; } - /* Dump the current slab */ - flush_slab(s, s->cpu_slab[cpu], cpu); + /* New slab does not fit our expectations */ + flush_slab(s, c); } - slab_lock(page); - goto have_slab; + slab_lock(new); + SetSlabFrozen(new); + c->page = new; + goto load_freelist; } - local_irq_restore(flags); return NULL; debug: - if (!alloc_object_checks(s, page, object)) + object = c->page->freelist; + if (!alloc_debug_processing(s, c->page, object, addr)) goto another_slab; - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_ALLOC, addr); - goto have_object; + + c->page->inuse++; + c->page->freelist = object[c->offset]; + c->node = -1; + slab_unlock(c->page); + return object; +} + +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. + * + * Otherwise we can simply pick the next object from the lockless free list. + */ +static void __always_inline *slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, int node, void *addr) +{ + void **object; + unsigned long flags; + struct kmem_cache_cpu *c; + + local_irq_save(flags); + c = get_cpu_slab(s, smp_processor_id()); + if (unlikely(!c->freelist || !node_match(c, node))) + + object = __slab_alloc(s, gfpflags, node, addr, c); + + else { + object = c->freelist; + c->freelist = object[c->offset]; + } + local_irq_restore(flags); + + if (unlikely((gfpflags & __GFP_ZERO) && object)) + memset(object, 0, c->objsize); + + return object; } void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) @@ -1290,33 +1598,29 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); #endif /* - * The fastpath only writes the cacheline of the page struct and the first - * cacheline of the object. + * Slow patch handling. This may still be called frequently since objects + * have a longer lifetime than the cpu slabs in most processing loads. * - * No special cachelines need to be read + * So we still attempt to reduce cache line usage. Just take the slab + * lock and free the item. If there is no additional partial page + * handling required then we can return immediately. */ -static void slab_free(struct kmem_cache *s, struct page *page, - void *x, void *addr) +static void __slab_free(struct kmem_cache *s, struct page *page, + void *x, void *addr, unsigned int offset) { void *prior; void **object = (void *)x; - unsigned long flags; - local_irq_save(flags); slab_lock(page); - if (unlikely(PageError(page))) + if (unlikely(SlabDebug(page))) goto debug; checks_ok: - prior = object[page->offset] = page->freelist; + prior = object[offset] = page->freelist; page->freelist = object; page->inuse--; - if (unlikely(PageActive(page))) - /* - * Cpu slabs are never on partial lists and are - * never freed. - */ + if (unlikely(SlabFrozen(page))) goto out_unlock; if (unlikely(!page->inuse)) @@ -1328,35 +1632,59 @@ checks_ok: * then add it. */ if (unlikely(!prior)) - add_partial(s, page); + add_partial(get_node(s, page_to_nid(page)), page); out_unlock: slab_unlock(page); - local_irq_restore(flags); return; slab_empty: if (prior) /* - * Slab on the partial list. + * Slab still on the partial list. */ remove_partial(s, page); slab_unlock(page); discard_slab(s, page); - local_irq_restore(flags); return; debug: - if (!free_object_checks(s, page, x)) + if (!free_debug_processing(s, page, x, addr)) goto out_unlock; - if (!PageActive(page) && !page->freelist) - remove_full(s, page); - if (s->flags & SLAB_STORE_USER) - set_track(s, x, TRACK_FREE, addr); goto checks_ok; } +/* + * Fastpath with forced inlining to produce a kfree and kmem_cache_free that + * can perform fastpath freeing without additional function calls. + * + * The fastpath is only possible if we are freeing to the current cpu slab + * of this processor. This typically the case if we have just allocated + * the item before. + * + * If fastpath is not possible then fall back to __slab_free where we deal + * with all sorts of special processing. + */ +static void __always_inline slab_free(struct kmem_cache *s, + struct page *page, void *x, void *addr) +{ + void **object = (void *)x; + unsigned long flags; + struct kmem_cache_cpu *c; + + local_irq_save(flags); + debug_check_no_locks_freed(object, s->objsize); + c = get_cpu_slab(s, smp_processor_id()); + if (likely(page == c->page && c->node >= 0)) { + object[c->offset] = c->freelist; + c->freelist = object; + } else + __slab_free(s, page, x, addr, c->offset); + + local_irq_restore(flags); +} + void kmem_cache_free(struct kmem_cache *s, void *x) { struct page *page; @@ -1379,22 +1707,16 @@ static struct page *get_object_page(const void *x) } /* - * kmem_cache_open produces objects aligned at "size" and the first object - * is placed at offset 0 in the slab (We have no metainformation on the - * slab, all slabs are in essence "off slab"). - * - * In order to get the desired alignment one just needs to align the - * size. + * Object placement in a slab is made very easy because we always start at + * offset 0. If we tune the size of the object to the alignment then we can + * get the required alignment by putting one properly sized object after + * another. * * Notice that the allocation order determines the sizes of the per cpu * caches. Each processor has always one slab available for allocations. * Increasing the allocation order reduces the number of times that slabs - * must be moved on and off the partial lists and therefore may influence + * must be moved on and off the partial lists and is therefore a factor in * locking overhead. - * - * The offset is used to relocate the free list link in each object. It is - * therefore possible to move the free list link behind the object. This - * is necessary for RCU to work properly and also useful for debugging. */ /* @@ -1405,76 +1727,111 @@ static struct page *get_object_page(const void *x) */ static int slub_min_order; static int slub_max_order = DEFAULT_MAX_ORDER; - -/* - * Minimum number of objects per slab. This is necessary in order to - * reduce locking overhead. Similar to the queue size in SLAB. - */ static int slub_min_objects = DEFAULT_MIN_OBJECTS; /* * Merge control. If this is set then no merging of slab caches will occur. + * (Could be removed. This was introduced to pacify the merge skeptics.) */ static int slub_nomerge; /* - * Debug settings: - */ -static int slub_debug; - -static char *slub_debug_slabs; - -/* * Calculate the order of allocation given an slab object size. * - * The order of allocation has significant impact on other elements - * of the system. Generally order 0 allocations should be preferred - * since they do not cause fragmentation in the page allocator. Larger - * objects may have problems with order 0 because there may be too much - * space left unused in a slab. We go to a higher order if more than 1/8th - * of the slab would be wasted. + * The order of allocation has significant impact on performance and other + * system components. Generally order 0 allocations should be preferred since + * order 0 does not cause fragmentation in the page allocator. Larger objects + * be problematic to put into order 0 slabs because there may be too much + * unused space left. We go to a higher order if more than 1/8th of the slab + * would be wasted. + * + * In order to reach satisfactory performance we must ensure that a minimum + * number of objects is in one slab. Otherwise we may generate too much + * activity on the partial lists which requires taking the list_lock. This is + * less a concern for large slabs though which are rarely used. * - * In order to reach satisfactory performance we must ensure that - * a minimum number of objects is in one slab. Otherwise we may - * generate too much activity on the partial lists. This is less a - * concern for large slabs though. slub_max_order specifies the order - * where we begin to stop considering the number of objects in a slab. + * slub_max_order specifies the order where we begin to stop considering the + * number of objects in a slab as critical. If we reach slub_max_order then + * we try to keep the page order as low as possible. So we accept more waste + * of space in favor of a small page order. * - * Higher order allocations also allow the placement of more objects - * in a slab and thereby reduce object handling overhead. If the user - * has requested a higher mininum order then we start with that one - * instead of zero. + * Higher order allocations also allow the placement of more objects in a + * slab and thereby reduce object handling overhead. If the user has + * requested a higher mininum order then we start with that one instead of + * the smallest order which will fit the object. */ -static int calculate_order(int size) +static inline int slab_order(int size, int min_objects, + int max_order, int fract_leftover) { int order; int rem; + int min_order = slub_min_order; - for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT); - order < MAX_ORDER; order++) { - unsigned long slab_size = PAGE_SIZE << order; + for (order = max(min_order, + fls(min_objects * size - 1) - PAGE_SHIFT); + order <= max_order; order++) { - if (slub_max_order > order && - slab_size < slub_min_objects * size) - continue; + unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < size) + if (slab_size < min_objects * size) continue; rem = slab_size % size; - if (rem <= (PAGE_SIZE << order) / 8) + if (rem <= slab_size / fract_leftover) break; } - if (order >= MAX_ORDER) - return -E2BIG; + return order; } +static inline int calculate_order(int size) +{ + int order; + int min_objects; + int fraction; + + /* + * Attempt to find best configuration for a slab. This + * works by first attempting to generate a layout with + * the best configuration and backing off gradually. + * + * First we reduce the acceptable waste in a slab. Then + * we reduce the minimum objects required in a slab. + */ + min_objects = slub_min_objects; + while (min_objects > 1) { + fraction = 8; + while (fraction >= 4) { + order = slab_order(size, min_objects, + slub_max_order, fraction); + if (order <= slub_max_order) + return order; + fraction /= 2; + } + min_objects /= 2; + } + + /* + * We were unable to place multiple objects in a slab. Now + * lets see if we can place a single object there. + */ + order = slab_order(size, 1, slub_max_order, 1); + if (order <= slub_max_order) + return order; + + /* + * Doh this slab cannot be placed using slub_max_order. + */ + order = slab_order(size, 1, MAX_ORDER, 1); + if (order <= MAX_ORDER) + return order; + return -ENOSYS; +} + /* - * Function to figure out which alignment to use from the - * various ways of specifying it. + * Figure out what the alignment of the objects will be. */ static unsigned long calculate_alignment(unsigned long flags, unsigned long align, unsigned long size) @@ -1488,9 +1845,9 @@ static unsigned long calculate_alignment(unsigned long flags, * specified alignment though. If that is greater * then use it. */ - if ((flags & (SLAB_MUST_HWCACHE_ALIGN | SLAB_HWCACHE_ALIGN)) && - size > L1_CACHE_BYTES / 2) - return max_t(unsigned long, align, L1_CACHE_BYTES); + if ((flags & SLAB_HWCACHE_ALIGN) && + size > cache_line_size() / 2) + return max_t(unsigned long, align, cache_line_size()); if (align < ARCH_SLAB_MINALIGN) return ARCH_SLAB_MINALIGN; @@ -1498,14 +1855,151 @@ static unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } +static void init_kmem_cache_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) +{ + c->page = NULL; + c->freelist = NULL; + c->node = 0; + c->offset = s->offset / sizeof(void *); + c->objsize = s->objsize; +} + static void init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; atomic_long_set(&n->nr_slabs, 0); spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); +#ifdef CONFIG_SLUB_DEBUG INIT_LIST_HEAD(&n->full); +#endif +} + +#ifdef CONFIG_SMP +/* + * Per cpu array for per cpu structures. + * + * The per cpu array places all kmem_cache_cpu structures from one processor + * close together meaning that it becomes possible that multiple per cpu + * structures are contained in one cacheline. This may be particularly + * beneficial for the kmalloc caches. + * + * A desktop system typically has around 60-80 slabs. With 100 here we are + * likely able to get per cpu structures for all caches from the array defined + * here. We must be able to cover all kmalloc caches during bootstrap. + * + * If the per cpu array is exhausted then fall back to kmalloc + * of individual cachelines. No sharing is possible then. + */ +#define NR_KMEM_CACHE_CPU 100 + +static DEFINE_PER_CPU(struct kmem_cache_cpu, + kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; + +static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); +static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; + +static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, + int cpu, gfp_t flags) +{ + struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); + + if (c) + per_cpu(kmem_cache_cpu_free, cpu) = + (void *)c->freelist; + else { + /* Table overflow: So allocate ourselves */ + c = kmalloc_node( + ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), + flags, cpu_to_node(cpu)); + if (!c) + return NULL; + } + + init_kmem_cache_cpu(s, c); + return c; +} + +static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) +{ + if (c < per_cpu(kmem_cache_cpu, cpu) || + c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { + kfree(c); + return; + } + c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); + per_cpu(kmem_cache_cpu_free, cpu) = c; +} + +static void free_kmem_cache_cpus(struct kmem_cache *s) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (c) { + s->cpu_slab[cpu] = NULL; + free_kmem_cache_cpu(c, cpu); + } + } +} + +static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (c) + continue; + + c = alloc_kmem_cache_cpu(s, cpu, flags); + if (!c) { + free_kmem_cache_cpus(s); + return 0; + } + s->cpu_slab[cpu] = c; + } + return 1; +} + +/* + * Initialize the per cpu array. + */ +static void init_alloc_cpu_cpu(int cpu) +{ + int i; + + if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) + return; + + for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) + free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); + + cpu_set(cpu, kmem_cach_cpu_free_init_once); +} + +static void __init init_alloc_cpu(void) +{ + int cpu; + + for_each_online_cpu(cpu) + init_alloc_cpu_cpu(cpu); + } + +#else +static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} +static inline void init_alloc_cpu(void) {} + +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) +{ + init_kmem_cache_cpu(s, &s->cpu_slab); + return 1; } +#endif #ifdef CONFIG_NUMA /* @@ -1514,30 +2008,39 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) * possible. * * Note that this function only works on the kmalloc_node_cache - * when allocating for the kmalloc_node_cache. + * when allocating for the kmalloc_node_cache. This is used for bootstrapping + * memory on a fresh node that has no slab structures yet. */ -static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, - int node) +static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, + int node) { struct page *page; struct kmem_cache_node *n; BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); - page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); - /* new_slab() disables interupts */ - local_irq_enable(); + page = new_slab(kmalloc_caches, gfpflags, node); BUG_ON(!page); + if (page_to_nid(page) != node) { + printk(KERN_ERR "SLUB: Unable to allocate memory from " + "node %d\n", node); + printk(KERN_ERR "SLUB: Allocating a useless per node structure " + "in order to be able to continue\n"); + } + n = page->freelist; BUG_ON(!n); page->freelist = get_freepointer(kmalloc_caches, n); page->inuse++; kmalloc_caches->node[node] = n; +#ifdef CONFIG_SLUB_DEBUG init_object(kmalloc_caches, n, 1); + init_tracking(kmalloc_caches, n); +#endif init_kmem_cache_node(n); atomic_long_inc(&n->nr_slabs); - add_partial(kmalloc_caches, page); + add_partial(n, page); return n; } @@ -1545,7 +2048,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) { int node; - for_each_online_node(node) { + for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = s->node[node]; if (n && n != &s->local_node) kmem_cache_free(kmalloc_caches, n); @@ -1563,7 +2066,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) else local_node = 0; - for_each_online_node(node) { + for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n; if (local_node == node) @@ -1616,7 +2119,7 @@ static int calculate_sizes(struct kmem_cache *s) * then we should never poison the object itself. */ if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && - !s->ctor && !s->dtor) + !s->ctor) s->flags |= __OBJECT_POISON; else s->flags &= ~__OBJECT_POISON; @@ -1628,24 +2131,24 @@ static int calculate_sizes(struct kmem_cache *s) */ size = ALIGN(size, sizeof(void *)); +#ifdef CONFIG_SLUB_DEBUG /* - * If we are redzoning then check if there is some space between the + * If we are Redzoning then check if there is some space between the * end of the object and the free pointer. If not then add an - * additional word, so that we can establish a redzone between - * the object and the freepointer to be able to check for overwrites. + * additional word to have some bytes to store Redzone information. */ if ((flags & SLAB_RED_ZONE) && size == s->objsize) size += sizeof(void *); +#endif /* - * With that we have determined how much of the slab is in actual - * use by the object. This is the potential offset to the free - * pointer. + * With that we have determined the number of bytes in actual use + * by the object. This is the potential offset to the free pointer. */ s->inuse = size; if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || - s->ctor || s->dtor)) { + s->ctor)) { /* * Relocate free pointer after the object if it is not * permitted to overwrite the first word of the object on @@ -1658,6 +2161,7 @@ static int calculate_sizes(struct kmem_cache *s) size += sizeof(void *); } +#ifdef CONFIG_SLUB_DEBUG if (flags & SLAB_STORE_USER) /* * Need to store information about allocs and frees after @@ -1665,7 +2169,7 @@ static int calculate_sizes(struct kmem_cache *s) */ size += 2 * sizeof(struct track); - if (flags & DEBUG_DEFAULT_FLAGS) + if (flags & SLAB_RED_ZONE) /* * Add some empty padding so that we can catch * overwrites from earlier objects rather than let @@ -1674,10 +2178,12 @@ static int calculate_sizes(struct kmem_cache *s) * of the object. */ size += sizeof(void *); +#endif + /* * Determine the alignment based on various parameters that the - * user specified (this is unecessarily complex due to the attempt - * to be compatible with SLAB. Should be cleaned up some day). + * user specified and the dynamic determination of cache line size + * on bootup. */ align = calculate_alignment(flags, align, s->objsize); @@ -1698,75 +2204,21 @@ static int calculate_sizes(struct kmem_cache *s) */ s->objects = (PAGE_SIZE << s->order) / size; - /* - * Verify that the number of objects is within permitted limits. - * The page->inuse field is only 16 bit wide! So we cannot have - * more than 64k objects per slab. - */ - if (!s->objects || s->objects > 65535) - return 0; - return 1; - -} - -static int __init finish_bootstrap(void) -{ - struct list_head *h; - int err; - - slab_state = SYSFS; - - list_for_each(h, &slab_caches) { - struct kmem_cache *s = - container_of(h, struct kmem_cache, list); + return !!s->objects; - err = sysfs_slab_add(s); - BUG_ON(err); - } - return 0; } static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void *, struct kmem_cache *, unsigned long), - void (*dtor)(void *, struct kmem_cache *, unsigned long)) + void (*ctor)(struct kmem_cache *, void *)) { memset(s, 0, kmem_size); s->name = name; s->ctor = ctor; - s->dtor = dtor; s->objsize = size; - s->flags = flags; s->align = align; - - BUG_ON(flags & SLUB_UNIMPLEMENTED); - - /* - * The page->offset field is only 16 bit wide. This is an offset - * in units of words from the beginning of an object. If the slab - * size is bigger then we cannot move the free pointer behind the - * object anymore. - * - * On 32 bit platforms the limit is 256k. On 64bit platforms - * the limit is 512k. - * - * Debugging or ctor/dtors may create a need to move the free - * pointer. Fail if this happens. - */ - if (s->size >= 65535 * sizeof(void *)) { - BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | - SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); - BUG_ON(ctor || dtor); - } - else - /* - * Enable debugging if selected on the kernel commandline. - */ - if (slub_debug && (!slub_debug_slabs || - strncmp(slub_debug_slabs, name, - strlen(slub_debug_slabs)) == 0)) - s->flags |= slub_debug; + s->flags = kmem_cache_flags(size, flags, name, ctor); if (!calculate_sizes(s)) goto error; @@ -1775,9 +2227,12 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, #ifdef CONFIG_NUMA s->defrag_ratio = 100; #endif + if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + goto error; - if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) return 1; + free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " @@ -1786,7 +2241,6 @@ error: s->offset, flags); return 0; } -EXPORT_SYMBOL(kmem_cache_open); /* * Check if a given pointer is valid @@ -1794,7 +2248,6 @@ EXPORT_SYMBOL(kmem_cache_open); int kmem_ptr_validate(struct kmem_cache *s, const void *object) { struct page * page; - void *addr; page = get_object_page(object); @@ -1802,13 +2255,7 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object) /* No slab or wrong slab */ return 0; - addr = page_address(page); - if (object < addr || object >= addr + s->objects * s->size) - /* Out of bounds */ - return 0; - - if ((object - addr) % s->size) - /* Improperly aligned */ + if (!check_valid_pointer(s, page, object)) return 0; /* @@ -1837,7 +2284,8 @@ const char *kmem_cache_name(struct kmem_cache *s) EXPORT_SYMBOL(kmem_cache_name); /* - * Attempt to free all slabs on a node + * Attempt to free all slabs on a node. Return the number of slabs we + * were unable to free. */ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, struct list_head *list) @@ -1858,19 +2306,20 @@ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, } /* - * Release all resources used by slab cache + * Release all resources used by a slab cache. */ -static int kmem_cache_close(struct kmem_cache *s) +static inline int kmem_cache_close(struct kmem_cache *s) { int node; flush_all(s); /* Attempt to free all objects */ - for_each_online_node(node) { + free_kmem_cache_cpus(s); + for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); - free_list(s, n, &n->partial); + n->nr_partial -= free_list(s, n, &n->partial); if (atomic_long_read(&n->nr_slabs)) return 1; } @@ -1888,12 +2337,13 @@ void kmem_cache_destroy(struct kmem_cache *s) s->refcount--; if (!s->refcount) { list_del(&s->list); + up_write(&slub_lock); if (kmem_cache_close(s)) WARN_ON(1); sysfs_slab_remove(s); kfree(s); - } - up_write(&slub_lock); + } else + up_write(&slub_lock); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -1901,11 +2351,11 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; +struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; +static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; #endif static int __init setup_slub_min_order(char *str) @@ -1943,45 +2393,6 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -static int __init setup_slub_debug(char *str) -{ - if (!str || *str != '=') - slub_debug = DEBUG_DEFAULT_FLAGS; - else { - str++; - if (*str == 0 || *str == ',') - slub_debug = DEBUG_DEFAULT_FLAGS; - else - for( ;*str && *str != ','; str++) - switch (*str) { - case 'f' : case 'F' : - slub_debug |= SLAB_DEBUG_FREE; - break; - case 'z' : case 'Z' : - slub_debug |= SLAB_RED_ZONE; - break; - case 'p' : case 'P' : - slub_debug |= SLAB_POISON; - break; - case 'u' : case 'U' : - slub_debug |= SLAB_STORE_USER; - break; - case 't' : case 'T' : - slub_debug |= SLAB_TRACE; - break; - default: - printk(KERN_ERR "slub_debug option '%c' " - "unknown. skipped\n",*str); - } - } - - if (*str == ',') - slub_debug_slabs = str + 1; - return 1; -} - -__setup("slub_debug", setup_slub_debug); - static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, const char *name, int size, gfp_t gfp_flags) { @@ -1992,7 +2403,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, down_write(&slub_lock); if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, - flags, NULL, NULL)) + flags, NULL)) goto panic; list_add(&s->list, &slab_caches); @@ -2005,78 +2416,168 @@ panic: panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); } -static struct kmem_cache *get_slab(size_t size, gfp_t flags) +#ifdef CONFIG_ZONE_DMA + +static void sysfs_add_func(struct work_struct *w) { - int index = kmalloc_index(size); + struct kmem_cache *s; - if (!index) - return NULL; + down_write(&slub_lock); + list_for_each_entry(s, &slab_caches, list) { + if (s->flags & __SYSFS_ADD_DEFERRED) { + s->flags &= ~__SYSFS_ADD_DEFERRED; + sysfs_slab_add(s); + } + } + up_write(&slub_lock); +} - /* Allocation too large? */ - BUG_ON(index < 0); +static DECLARE_WORK(sysfs_add_work, sysfs_add_func); -#ifdef CONFIG_ZONE_DMA - if ((flags & SLUB_DMA)) { - struct kmem_cache *s; - struct kmem_cache *x; - char *text; - size_t realsize; - - s = kmalloc_caches_dma[index]; - if (s) - return s; +static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) +{ + struct kmem_cache *s; + char *text; + size_t realsize; + + s = kmalloc_caches_dma[index]; + if (s) + return s; - /* Dynamically create dma cache */ - x = kmalloc(kmem_size, flags & ~SLUB_DMA); - if (!x) - panic("Unable to allocate memory for dma cache\n"); + /* Dynamically create dma cache */ + if (flags & __GFP_WAIT) + down_write(&slub_lock); + else { + if (!down_write_trylock(&slub_lock)) + goto out; + } - if (index <= KMALLOC_SHIFT_HIGH) - realsize = 1 << index; - else { - if (index == 1) - realsize = 96; - else - realsize = 192; - } + if (kmalloc_caches_dma[index]) + goto unlock_out; - text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", - (unsigned int)realsize); - s = create_kmalloc_cache(x, text, realsize, flags); - kmalloc_caches_dma[index] = s; - return s; + realsize = kmalloc_caches[index].objsize; + text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize), + s = kmalloc(kmem_size, flags & ~SLUB_DMA); + + if (!s || !text || !kmem_cache_open(s, flags, text, + realsize, ARCH_KMALLOC_MINALIGN, + SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { + kfree(s); + kfree(text); + goto unlock_out; } + + list_add(&s->list, &slab_caches); + kmalloc_caches_dma[index] = s; + + schedule_work(&sysfs_add_work); + +unlock_out: + up_write(&slub_lock); +out: + return kmalloc_caches_dma[index]; +} +#endif + +/* + * Conversion table for small slabs sizes / 8 to the index in the + * kmalloc array. This is necessary for slabs < 192 since we have non power + * of two cache sizes there. The size of larger slabs can be determined using + * fls. + */ +static s8 size_index[24] = { + 3, /* 8 */ + 4, /* 16 */ + 5, /* 24 */ + 5, /* 32 */ + 6, /* 40 */ + 6, /* 48 */ + 6, /* 56 */ + 6, /* 64 */ + 1, /* 72 */ + 1, /* 80 */ + 1, /* 88 */ + 1, /* 96 */ + 7, /* 104 */ + 7, /* 112 */ + 7, /* 120 */ + 7, /* 128 */ + 2, /* 136 */ + 2, /* 144 */ + 2, /* 152 */ + 2, /* 160 */ + 2, /* 168 */ + 2, /* 176 */ + 2, /* 184 */ + 2 /* 192 */ +}; + +static struct kmem_cache *get_slab(size_t size, gfp_t flags) +{ + int index; + + if (size <= 192) { + if (!size) + return ZERO_SIZE_PTR; + + index = size_index[(size - 1) / 8]; + } else + index = fls(size - 1); + +#ifdef CONFIG_ZONE_DMA + if (unlikely((flags & SLUB_DMA))) + return dma_kmalloc_cache(index, flags); + #endif return &kmalloc_caches[index]; } void *__kmalloc(size_t size, gfp_t flags) { - struct kmem_cache *s = get_slab(size, flags); + struct kmem_cache *s; - if (s) - return slab_alloc(s, flags, -1, __builtin_return_address(0)); - return NULL; + if (unlikely(size > PAGE_SIZE / 2)) + return (void *)__get_free_pages(flags | __GFP_COMP, + get_order(size)); + + s = get_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + return slab_alloc(s, flags, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(__kmalloc); #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) { - struct kmem_cache *s = get_slab(size, flags); + struct kmem_cache *s; - if (s) - return slab_alloc(s, flags, node, __builtin_return_address(0)); - return NULL; + if (unlikely(size > PAGE_SIZE / 2)) + return (void *)__get_free_pages(flags | __GFP_COMP, + get_order(size)); + + s = get_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + return slab_alloc(s, flags, node, __builtin_return_address(0)); } EXPORT_SYMBOL(__kmalloc_node); #endif size_t ksize(const void *object) { - struct page *page = get_object_page(object); + struct page *page; struct kmem_cache *s; + BUG_ON(!object); + if (unlikely(object == ZERO_SIZE_PTR)) + return 0; + + page = get_object_page(object); BUG_ON(!page); s = page->slab; BUG_ON(!s); @@ -2105,63 +2606,93 @@ EXPORT_SYMBOL(ksize); void kfree(const void *x) { - struct kmem_cache *s; struct page *page; - if (!x) + if (unlikely(ZERO_OR_NULL_PTR(x))) return; page = virt_to_head_page(x); - s = page->slab; - - slab_free(s, page, (void *)x, __builtin_return_address(0)); + if (unlikely(!PageSlab(page))) { + put_page(page); + return; + } + slab_free(page->slab, page, (void *)x, __builtin_return_address(0)); } EXPORT_SYMBOL(kfree); -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. +/* + * kmem_cache_shrink removes empty slabs from the partial lists and sorts + * the remaining slabs by the number of items in use. The slabs with the + * most items in use come first. New allocations will then fill those up + * and thus they can be removed from the partial lists. * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. + * The slabs with the least items are placed last. This results in them + * being allocated from last increasing the chance that the last objects + * are freed in them. */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) +int kmem_cache_shrink(struct kmem_cache *s) { - struct kmem_cache *new_cache; - void *ret; + int node; + int i; + struct kmem_cache_node *n; struct page *page; + struct page *t; + struct list_head *slabs_by_inuse = + kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); + unsigned long flags; + + if (!slabs_by_inuse) + return -ENOMEM; - if (unlikely(!p)) - return kmalloc(new_size, flags); + flush_all(s); + for_each_node_state(node, N_NORMAL_MEMORY) { + n = get_node(s, node); - if (unlikely(!new_size)) { - kfree(p); - return NULL; - } + if (!n->nr_partial) + continue; - page = virt_to_head_page(p); + for (i = 0; i < s->objects; i++) + INIT_LIST_HEAD(slabs_by_inuse + i); - new_cache = get_slab(new_size, flags); + spin_lock_irqsave(&n->list_lock, flags); - /* - * If new size fits in the current cache, bail out. - */ - if (likely(page->slab == new_cache)) - return (void *)p; + /* + * Build lists indexed by the items in use in each slab. + * + * Note that concurrent frees may occur while we hold the + * list_lock. page->inuse here is the upper limit. + */ + list_for_each_entry_safe(page, t, &n->partial, lru) { + if (!page->inuse && slab_trylock(page)) { + /* + * Must hold slab lock here because slab_free + * may have freed the last object and be + * waiting to release the slab. + */ + list_del(&page->lru); + n->nr_partial--; + slab_unlock(page); + discard_slab(s, page); + } else { + list_move(&page->lru, + slabs_by_inuse + page->inuse); + } + } + + /* + * Rebuild the partial list with the slabs filled up most + * first and the least used slabs at the end. + */ + for (i = s->objects - 1; i >= 0; i--) + list_splice(slabs_by_inuse + i, n->partial.prev); - ret = kmalloc(new_size, flags); - if (ret) { - memcpy(ret, p, min(new_size, ksize(p))); - kfree(p); + spin_unlock_irqrestore(&n->list_lock, flags); } - return ret; + + kfree(slabs_by_inuse); + return 0; } -EXPORT_SYMBOL(krealloc); +EXPORT_SYMBOL(kmem_cache_shrink); /******************************************************************** * Basic setup of slabs @@ -2170,48 +2701,80 @@ EXPORT_SYMBOL(krealloc); void __init kmem_cache_init(void) { int i; + int caches = 0; + + init_alloc_cpu(); #ifdef CONFIG_NUMA /* * Must first have the slab cache available for the allocations of the - * struct kmalloc_cache_node's. There is special bootstrap code in + * struct kmem_cache_node's. There is special bootstrap code in * kmem_cache_open for slab_state == DOWN. */ create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", sizeof(struct kmem_cache_node), GFP_KERNEL); + kmalloc_caches[0].refcount = -1; + caches++; #endif /* Able to allocate the per node structures */ slab_state = PARTIAL; /* Caches that are not of the two-to-the-power-of size */ - create_kmalloc_cache(&kmalloc_caches[1], + if (KMALLOC_MIN_SIZE <= 64) { + create_kmalloc_cache(&kmalloc_caches[1], "kmalloc-96", 96, GFP_KERNEL); - create_kmalloc_cache(&kmalloc_caches[2], + caches++; + } + if (KMALLOC_MIN_SIZE <= 128) { + create_kmalloc_cache(&kmalloc_caches[2], "kmalloc-192", 192, GFP_KERNEL); + caches++; + } - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); + caches++; + } + + + /* + * Patch up the size_index table if we have strange large alignment + * requirements for the kmalloc array. This is only the case for + * mips it seems. The standard arches will not generate any code here. + * + * Largest permitted alignment is 256 bytes due to the way we + * handle the index determination for the smaller caches. + * + * Make sure that nothing crazy happens if someone starts tinkering + * around with ARCH_KMALLOC_MINALIGN + */ + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || + (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); + + for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) + size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) kmalloc_caches[i]. name = kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); + kmem_size = offsetof(struct kmem_cache, cpu_slab) + + nr_cpu_ids * sizeof(struct kmem_cache_cpu *); +#else + kmem_size = sizeof(struct kmem_cache); #endif - if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */ - kmem_size = offsetof(struct kmem_cache, cpu_slab) - + nr_cpu_ids * sizeof(struct page *); printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," - " Processors=%d, Nodes=%d\n", - KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES, + " CPUs=%d, Nodes=%d\n", + caches, cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); } @@ -2224,41 +2787,43 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; - if (s->ctor || s->dtor) + if (s->ctor) + return 1; + + /* + * We may have set a slab to be unmergeable during bootstrap. + */ + if (s->refcount < 0) return 1; return 0; } static struct kmem_cache *find_mergeable(size_t size, - size_t align, unsigned long flags, - void (*ctor)(void *, struct kmem_cache *, unsigned long), - void (*dtor)(void *, struct kmem_cache *, unsigned long)) + size_t align, unsigned long flags, const char *name, + void (*ctor)(struct kmem_cache *, void *)) { - struct list_head *h; + struct kmem_cache *s; if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) return NULL; - if (ctor || dtor) + if (ctor) return NULL; size = ALIGN(size, sizeof(void *)); align = calculate_alignment(flags, align, size); size = ALIGN(size, align); + flags = kmem_cache_flags(size, flags, name, NULL); - list_for_each(h, &slab_caches) { - struct kmem_cache *s = - container_of(h, struct kmem_cache, list); - + list_for_each_entry(s, &slab_caches, list) { if (slab_unmergeable(s)) continue; if (size > s->size) continue; - if (((flags | slub_debug) & SLUB_MERGE_SAME) != - (s->flags & SLUB_MERGE_SAME)) + if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) continue; /* * Check if alignment is compatible. @@ -2277,40 +2842,49 @@ static struct kmem_cache *find_mergeable(size_t size, struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(void *, struct kmem_cache *, unsigned long), - void (*dtor)(void *, struct kmem_cache *, unsigned long)) + void (*ctor)(struct kmem_cache *, void *)) { struct kmem_cache *s; down_write(&slub_lock); - s = find_mergeable(size, align, flags, dtor, ctor); + s = find_mergeable(size, align, flags, name, ctor); if (s) { + int cpu; + s->refcount++; /* * Adjust the object sizes so that we clear * the complete object on kzalloc. */ s->objsize = max(s->objsize, (int)size); + + /* + * And then we need to update the object size in the + * per cpu structures + */ + for_each_online_cpu(cpu) + get_cpu_slab(s, cpu)->objsize = s->objsize; s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + up_write(&slub_lock); if (sysfs_slab_alias(s, name)) goto err; - } else { - s = kmalloc(kmem_size, GFP_KERNEL); - if (s && kmem_cache_open(s, GFP_KERNEL, name, - size, align, flags, ctor, dtor)) { - if (sysfs_slab_add(s)) { - kfree(s); - goto err; - } + return s; + } + s = kmalloc(kmem_size, GFP_KERNEL); + if (s) { + if (kmem_cache_open(s, GFP_KERNEL, name, + size, align, flags, ctor)) { list_add(&s->list, &slab_caches); - } else - kfree(s); + up_write(&slub_lock); + if (sysfs_slab_add(s)) + goto err; + return s; + } + kfree(s); } up_write(&slub_lock); - return s; err: - up_write(&slub_lock); if (flags & SLAB_PANIC) panic("Cannot create slabcache %s\n", name); else @@ -2319,45 +2893,44 @@ err: } EXPORT_SYMBOL(kmem_cache_create); -void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags) -{ - void *x; - - x = slab_alloc(s, flags, -1, __builtin_return_address(0)); - if (x) - memset(x, 0, s->objsize); - return x; -} -EXPORT_SYMBOL(kmem_cache_zalloc); - #ifdef CONFIG_SMP -static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu) -{ - struct list_head *h; - - down_read(&slub_lock); - list_for_each(h, &slab_caches) { - struct kmem_cache *s = - container_of(h, struct kmem_cache, list); - - func(s, cpu); - } - up_read(&slub_lock); -} - /* - * Use the cpu notifier to insure that the slab are flushed - * when necessary. + * Use the cpu notifier to insure that the cpu slabs are flushed when + * necessary. */ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; + struct kmem_cache *s; + unsigned long flags; switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + init_alloc_cpu_cpu(cpu); + down_read(&slub_lock); + list_for_each_entry(s, &slab_caches, list) + s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, + GFP_KERNEL); + up_read(&slub_lock); + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: - for_all_slabs(__flush_cpu_slab, cpu); + case CPU_DEAD_FROZEN: + down_read(&slub_lock); + list_for_each_entry(s, &slab_caches, list) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + local_irq_save(flags); + __flush_cpu_slab(s, cpu); + local_irq_restore(flags); + free_kmem_cache_cpu(c, cpu); + s->cpu_slab[cpu] = NULL; + } + up_read(&slub_lock); break; default: break; @@ -2370,105 +2943,140 @@ static struct notifier_block __cpuinitdata slab_notifier = #endif -/*************************************************************** - * Compatiblility definitions - **************************************************************/ - -int kmem_cache_shrink(struct kmem_cache *s) +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) { - flush_all(s); - return 0; -} -EXPORT_SYMBOL(kmem_cache_shrink); + struct kmem_cache *s; -#ifdef CONFIG_NUMA + if (unlikely(size > PAGE_SIZE / 2)) + return (void *)__get_free_pages(gfpflags | __GFP_COMP, + get_order(size)); + s = get_slab(size, gfpflags); -/***************************************************************** - * Generic reaper used to support the page allocator - * (the cpu slabs are reaped by a per slab workqueue). - * - * Maybe move this to the page allocator? - ****************************************************************/ + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; -static DEFINE_PER_CPU(unsigned long, reap_node); + return slab_alloc(s, gfpflags, -1, caller); +} -static void init_reap_node(int cpu) +void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, + int node, void *caller) { - int node; + struct kmem_cache *s; + + if (unlikely(size > PAGE_SIZE / 2)) + return (void *)__get_free_pages(gfpflags | __GFP_COMP, + get_order(size)); + s = get_slab(size, gfpflags); - node = next_node(cpu_to_node(cpu), node_online_map); - if (node == MAX_NUMNODES) - node = first_node(node_online_map); + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; - __get_cpu_var(reap_node) = node; + return slab_alloc(s, gfpflags, node, caller); } -static void next_reap_node(void) +#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) +static int validate_slab(struct kmem_cache *s, struct page *page, + unsigned long *map) { - int node = __get_cpu_var(reap_node); + void *p; + void *addr = page_address(page); - /* - * Also drain per cpu pages on remote zones - */ - if (node != numa_node_id()) - drain_node_pages(node); + if (!check_slab(s, page) || + !on_freelist(s, page, NULL)) + return 0; - node = next_node(node, node_online_map); - if (unlikely(node >= MAX_NUMNODES)) - node = first_node(node_online_map); - __get_cpu_var(reap_node) = node; -} -#else -#define init_reap_node(cpu) do { } while (0) -#define next_reap_node(void) do { } while (0) -#endif + /* Now we know that a valid freelist exists */ + bitmap_zero(map, s->objects); -#define REAPTIMEOUT_CPUC (2*HZ) + for_each_free_object(p, s, page->freelist) { + set_bit(slab_index(p, s, addr), map); + if (!check_object(s, page, p, 0)) + return 0; + } -#ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct delayed_work, reap_work); + for_each_object(p, s, addr) + if (!test_bit(slab_index(p, s, addr), map)) + if (!check_object(s, page, p, 1)) + return 0; + return 1; +} -static void cache_reap(struct work_struct *unused) +static void validate_slab_slab(struct kmem_cache *s, struct page *page, + unsigned long *map) { - next_reap_node(); - refresh_cpu_vm_stats(smp_processor_id()); - schedule_delayed_work(&__get_cpu_var(reap_work), - REAPTIMEOUT_CPUC); + if (slab_trylock(page)) { + validate_slab(s, page, map); + slab_unlock(page); + } else + printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", + s->name, page); + + if (s->flags & DEBUG_DEFAULT_FLAGS) { + if (!SlabDebug(page)) + printk(KERN_ERR "SLUB %s: SlabDebug not set " + "on slab 0x%p\n", s->name, page); + } else { + if (SlabDebug(page)) + printk(KERN_ERR "SLUB %s: SlabDebug set on " + "slab 0x%p\n", s->name, page); + } } -static void __devinit start_cpu_timer(int cpu) +static int validate_slab_node(struct kmem_cache *s, + struct kmem_cache_node *n, unsigned long *map) { - struct delayed_work *reap_work = &per_cpu(reap_work, cpu); + unsigned long count = 0; + struct page *page; + unsigned long flags; - /* - * When this gets called from do_initcalls via cpucache_init(), - * init_workqueues() has already run, so keventd will be setup - * at that time. - */ - if (keventd_up() && reap_work->work.func == NULL) { - init_reap_node(cpu); - INIT_DELAYED_WORK(reap_work, cache_reap); - schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); + spin_lock_irqsave(&n->list_lock, flags); + + list_for_each_entry(page, &n->partial, lru) { + validate_slab_slab(s, page, map); + count++; } + if (count != n->nr_partial) + printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " + "counter=%ld\n", s->name, count, n->nr_partial); + + if (!(s->flags & SLAB_STORE_USER)) + goto out; + + list_for_each_entry(page, &n->full, lru) { + validate_slab_slab(s, page, map); + count++; + } + if (count != atomic_long_read(&n->nr_slabs)) + printk(KERN_ERR "SLUB: %s %ld slabs counted but " + "counter=%ld\n", s->name, count, + atomic_long_read(&n->nr_slabs)); + +out: + spin_unlock_irqrestore(&n->list_lock, flags); + return count; } -static int __init cpucache_init(void) +static long validate_slab_cache(struct kmem_cache *s) { - int cpu; + int node; + unsigned long count = 0; + unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) * + sizeof(unsigned long), GFP_KERNEL); - /* - * Register the timers that drain pcp pages and update vm statistics - */ - for_each_online_cpu(cpu) - start_cpu_timer(cpu); - return 0; + if (!map) + return -ENOMEM; + + flush_all(s); + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + + count += validate_slab_node(s, n, map); + } + kfree(map); + return count; } -__initcall(cpucache_init); -#endif #ifdef SLUB_RESILIENCY_TEST -static unsigned long validate_slab_cache(struct kmem_cache *s); - static void resiliency_test(void) { u8 *p; @@ -2524,32 +3132,229 @@ static void resiliency_test(void) {}; #endif /* - * These are not as efficient as kmalloc for the non debug case. - * We do not have the page struct available so we have to touch one - * cacheline in struct kmem_cache to check slab flags. + * Generate lists of code addresses where slabcache objects are allocated + * and freed. */ -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) + +struct location { + unsigned long count; + void *addr; + long long sum_time; + long min_time; + long max_time; + long min_pid; + long max_pid; + cpumask_t cpus; + nodemask_t nodes; +}; + +struct loc_track { + unsigned long max; + unsigned long count; + struct location *loc; +}; + +static void free_loc_track(struct loc_track *t) { - struct kmem_cache *s = get_slab(size, gfpflags); + if (t->max) + free_pages((unsigned long)t->loc, + get_order(sizeof(struct location) * t->max)); +} - if (!s) - return NULL; +static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) +{ + struct location *l; + int order; - return slab_alloc(s, gfpflags, -1, caller); + order = get_order(sizeof(struct location) * max); + + l = (void *)__get_free_pages(flags, order); + if (!l) + return 0; + + if (t->count) { + memcpy(l, t->loc, sizeof(struct location) * t->count); + free_loc_track(t); + } + t->max = max; + t->loc = l; + return 1; } -void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, void *caller) +static int add_location(struct loc_track *t, struct kmem_cache *s, + const struct track *track) { - struct kmem_cache *s = get_slab(size, gfpflags); + long start, end, pos; + struct location *l; + void *caddr; + unsigned long age = jiffies - track->when; - if (!s) - return NULL; + start = -1; + end = t->count; - return slab_alloc(s, gfpflags, node, caller); + for ( ; ; ) { + pos = start + (end - start + 1) / 2; + + /* + * There is nothing at "end". If we end up there + * we need to add something to before end. + */ + if (pos == end) + break; + + caddr = t->loc[pos].addr; + if (track->addr == caddr) { + + l = &t->loc[pos]; + l->count++; + if (track->when) { + l->sum_time += age; + if (age < l->min_time) + l->min_time = age; + if (age > l->max_time) + l->max_time = age; + + if (track->pid < l->min_pid) + l->min_pid = track->pid; + if (track->pid > l->max_pid) + l->max_pid = track->pid; + + cpu_set(track->cpu, l->cpus); + } + node_set(page_to_nid(virt_to_page(track)), l->nodes); + return 1; + } + + if (track->addr < caddr) + end = pos; + else + start = pos; + } + + /* + * Not found. Insert new tracking element. + */ + if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) + return 0; + + l = t->loc + pos; + if (pos < t->count) + memmove(l + 1, l, + (t->count - pos) * sizeof(struct location)); + t->count++; + l->count = 1; + l->addr = track->addr; + l->sum_time = age; + l->min_time = age; + l->max_time = age; + l->min_pid = track->pid; + l->max_pid = track->pid; + cpus_clear(l->cpus); + cpu_set(track->cpu, l->cpus); + nodes_clear(l->nodes); + node_set(page_to_nid(virt_to_page(track)), l->nodes); + return 1; +} + +static void process_slab(struct loc_track *t, struct kmem_cache *s, + struct page *page, enum track_item alloc) +{ + void *addr = page_address(page); + DECLARE_BITMAP(map, s->objects); + void *p; + + bitmap_zero(map, s->objects); + for_each_free_object(p, s, page->freelist) + set_bit(slab_index(p, s, addr), map); + + for_each_object(p, s, addr) + if (!test_bit(slab_index(p, s, addr), map)) + add_location(t, s, get_track(s, p, alloc)); } -#ifdef CONFIG_SYSFS +static int list_locations(struct kmem_cache *s, char *buf, + enum track_item alloc) +{ + int n = 0; + unsigned long i; + struct loc_track t = { 0, 0, NULL }; + int node; + + if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), + GFP_TEMPORARY)) + return sprintf(buf, "Out of memory\n"); + + /* Push back cpu slabs */ + flush_all(s); + + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + unsigned long flags; + struct page *page; + + if (!atomic_long_read(&n->nr_slabs)) + continue; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + process_slab(&t, s, page, alloc); + list_for_each_entry(page, &n->full, lru) + process_slab(&t, s, page, alloc); + spin_unlock_irqrestore(&n->list_lock, flags); + } + + for (i = 0; i < t.count; i++) { + struct location *l = &t.loc[i]; + + if (n > PAGE_SIZE - 100) + break; + n += sprintf(buf + n, "%7ld ", l->count); + + if (l->addr) + n += sprint_symbol(buf + n, (unsigned long)l->addr); + else + n += sprintf(buf + n, ""); + + if (l->sum_time != l->min_time) { + unsigned long remainder; + + n += sprintf(buf + n, " age=%ld/%ld/%ld", + l->min_time, + div_long_long_rem(l->sum_time, l->count, &remainder), + l->max_time); + } else + n += sprintf(buf + n, " age=%ld", + l->min_time); + + if (l->min_pid != l->max_pid) + n += sprintf(buf + n, " pid=%ld-%ld", + l->min_pid, l->max_pid); + else + n += sprintf(buf + n, " pid=%ld", + l->min_pid); + + if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && + n < PAGE_SIZE - 60) { + n += sprintf(buf + n, " cpus="); + n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, + l->cpus); + } + + if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && + n < PAGE_SIZE - 60) { + n += sprintf(buf + n, " nodes="); + n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, + l->nodes); + } + + n += sprintf(buf + n, "\n"); + } + + free_loc_track(&t); + if (!t.count) + n += sprintf(buf, "No data\n"); + return n; +} static unsigned long count_partial(struct kmem_cache_node *n) { @@ -2590,11 +3395,18 @@ static unsigned long slab_objects(struct kmem_cache *s, per_cpu = nodes + nr_node_ids; for_each_possible_cpu(cpu) { - struct page *page = s->cpu_slab[cpu]; + struct page *page; int node; + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (!c) + continue; + page = c->page; + node = c->node; + if (node < 0) + continue; if (page) { - node = page_to_nid(page); if (flags & SO_CPU) { int x = 0; @@ -2609,7 +3421,7 @@ static unsigned long slab_objects(struct kmem_cache *s, } } - for_each_online_node(node) { + for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); if (flags & SO_PARTIAL) { @@ -2622,7 +3434,7 @@ static unsigned long slab_objects(struct kmem_cache *s, } if (flags & SO_FULL) { - int full_slabs = atomic_read(&n->nr_slabs) + int full_slabs = atomic_long_read(&n->nr_slabs) - per_cpu[node] - n->nr_partial; @@ -2637,7 +3449,7 @@ static unsigned long slab_objects(struct kmem_cache *s, x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA - for_each_online_node(node) + for_each_node_state(node, N_NORMAL_MEMORY) if (nodes[node]) x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); @@ -2651,14 +3463,20 @@ static int any_slab_objects(struct kmem_cache *s) int node; int cpu; - for_each_possible_cpu(cpu) - if (s->cpu_slab[cpu]) + for_each_possible_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (c && c->page) return 1; + } - for_each_node(node) { + for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); - if (n->nr_partial || atomic_read(&n->nr_slabs)) + if (!n) + continue; + + if (n->nr_partial || atomic_long_read(&n->nr_slabs)) return 1; } return 0; @@ -2680,7 +3498,6 @@ struct slab_attribute { static struct slab_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) - static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", s->size); @@ -2722,17 +3539,6 @@ static ssize_t ctor_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(ctor); -static ssize_t dtor_show(struct kmem_cache *s, char *buf) -{ - if (s->dtor) { - int n = sprint_symbol(buf, (unsigned long)s->dtor); - - return n + sprintf(buf + n, "\n"); - } - return 0; -} -SLAB_ATTR_RO(dtor); - static ssize_t aliases_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", s->refcount - 1); @@ -2810,8 +3616,7 @@ SLAB_ATTR(reclaim_account); static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & - (SLAB_HWCACHE_ALIGN|SLAB_MUST_HWCACHE_ALIGN))); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); } SLAB_ATTR_RO(hwcache_align); @@ -2886,6 +3691,60 @@ static ssize_t store_user_store(struct kmem_cache *s, } SLAB_ATTR(store_user); +static ssize_t validate_show(struct kmem_cache *s, char *buf) +{ + return 0; +} + +static ssize_t validate_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int ret = -EINVAL; + + if (buf[0] == '1') { + ret = validate_slab_cache(s); + if (ret >= 0) + ret = length; + } + return ret; +} +SLAB_ATTR(validate); + +static ssize_t shrink_show(struct kmem_cache *s, char *buf) +{ + return 0; +} + +static ssize_t shrink_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (buf[0] == '1') { + int rc = kmem_cache_shrink(s); + + if (rc) + return rc; + } else + return -EINVAL; + return length; +} +SLAB_ATTR(shrink); + +static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_ALLOC); +} +SLAB_ATTR_RO(alloc_calls); + +static ssize_t free_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_FREE); +} +SLAB_ATTR_RO(free_calls); + #ifdef CONFIG_NUMA static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) { @@ -2914,7 +3773,6 @@ static struct attribute * slab_attrs[] = { &partial_attr.attr, &cpu_slabs_attr.attr, &ctor_attr.attr, - &dtor_attr.attr, &aliases_attr.attr, &align_attr.attr, &sanity_checks_attr.attr, @@ -2925,6 +3783,10 @@ static struct attribute * slab_attrs[] = { &red_zone_attr.attr, &poison_attr.attr, &store_user_attr.attr, + &validate_attr.attr, + &shrink_attr.attr, + &alloc_calls_attr.attr, + &free_calls_attr.attr, #ifdef CONFIG_ZONE_DMA &cache_dma_attr.attr, #endif @@ -2998,7 +3860,7 @@ static struct kset_uevent_ops slab_uevent_ops = { .filter = uevent_filter, }; -decl_subsys(slab, &slab_ktype, &slab_uevent_ops); +static decl_subsys(slab, &slab_ktype, &slab_uevent_ops); #define ID_STR_LENGTH 64 @@ -3051,7 +3913,7 @@ static int sysfs_slab_add(struct kmem_cache *s) * This is typically the case for debug situations. In that * case we can catch duplicate names easily. */ - sysfs_remove_link(&slab_subsys.kset.kobj, s->name); + sysfs_remove_link(&slab_subsys.kobj, s->name); name = s->name; } else { /* @@ -3096,7 +3958,7 @@ struct saved_alias { struct saved_alias *next; }; -struct saved_alias *alias_list; +static struct saved_alias *alias_list; static int sysfs_slab_alias(struct kmem_cache *s, const char *name) { @@ -3106,8 +3968,8 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) /* * If we have a leftover link then remove it. */ - sysfs_remove_link(&slab_subsys.kset.kobj, name); - return sysfs_create_link(&slab_subsys.kset.kobj, + sysfs_remove_link(&slab_subsys.kobj, name); + return sysfs_create_link(&slab_subsys.kobj, &s->kobj, name); } @@ -3124,6 +3986,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) static int __init slab_sysfs_init(void) { + struct kmem_cache *s; int err; err = subsystem_register(&slab_subsys); @@ -3132,14 +3995,23 @@ static int __init slab_sysfs_init(void) return -ENOSYS; } - finish_bootstrap(); + slab_state = SYSFS; + + list_for_each_entry(s, &slab_caches, list) { + err = sysfs_slab_add(s); + if (err) + printk(KERN_ERR "SLUB: Unable to add boot slab %s" + " to sysfs\n", s->name); + } while (alias_list) { struct saved_alias *al = alias_list; alias_list = alias_list->next; err = sysfs_slab_alias(al->s, al->name); - BUG_ON(err); + if (err) + printk(KERN_ERR "SLUB: Unable to add boot slab alias" + " %s to sysfs\n", s->name); kfree(al); } @@ -3148,6 +4020,4 @@ static int __init slab_sysfs_init(void) } __initcall(slab_sysfs_init); -#else -__initcall(finish_bootstrap); #endif