From: Ravikiran G Thirumalai Date: Sun, 5 Feb 2006 07:27:59 +0000 (-0800) Subject: [PATCH] NUMA slab locking fixes: fix cpu down and up locking X-Git-Tag: v2.6.16-rc3~154 X-Git-Url: http://ftp.safe.ca/?p=safe%2Fjmp%2Flinux-2.6;a=commitdiff_plain;h=4484ebf12bdb0ebcdc6e8951243cbab3d7f6f4c1 [PATCH] NUMA slab locking fixes: fix cpu down and up locking This fixes locking and bugs in cpu_down and cpu_up paths of the NUMA slab allocator. Sonny Rao reported problems sometime back on POWER5 boxes, when the last cpu on the nodes were being offlined. We could not reproduce the same on x86_64 because the cpumask (node_to_cpumask) was not being updated on cpu down. Since that issue is now fixed, we can reproduce Sonny's problems on x86_64 NUMA, and here is the fix. The problem earlier was on CPU_DOWN, if it was the last cpu on the node to go down, the array_caches (shared, alien) and the kmem_list3 of the node were being freed (kfree) with the kmem_list3 lock held. If the l3 or the array_caches were to come from the same cache being cleared, we hit on badness. This patch cleans up the locking in cpu_up and cpu_down path. We cannot really free l3 on cpu down because, there is no node offlining yet and even though a cpu is not yet up, node local memory can be allocated for it. So l3s are usually allocated at keme_cache_create and destroyed at kmem_cache_destroy. Hence, we don't need cachep->spinlock protection to get to the cachep->nodelist[nodeid] either. Patch survived onlining and offlining on a 4 core 2 node Tyan box with a 4 dbench process running all the time. Signed-off-by: Alok N Kataria Signed-off-by: Ravikiran Thirumalai Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- diff --git a/mm/slab.c b/mm/slab.c index d3f6854..9cc049a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -884,14 +884,14 @@ static void __drain_alien_cache(struct kmem_cache *cachep, } } -static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3) +static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) { int i = 0; struct array_cache *ac; unsigned long flags; for_each_online_node(i) { - ac = l3->alien[i]; + ac = alien[i]; if (ac) { spin_lock_irqsave(&ac->lock, flags); __drain_alien_cache(cachep, ac, i); @@ -901,8 +901,11 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3) } #else #define alloc_alien_cache(node, limit) do { } while (0) -#define free_alien_cache(ac_ptr) do { } while (0) -#define drain_alien_cache(cachep, l3) do { } while (0) +#define drain_alien_cache(cachep, alien) do { } while (0) + +static inline void free_alien_cache(struct array_cache **ac_ptr) +{ +} #endif static int __devinit cpuup_callback(struct notifier_block *nfb, @@ -936,6 +939,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + /* + * The l3s don't come and go as CPUs come and + * go. cache_chain_mutex is sufficient + * protection here. + */ cachep->nodelists[node] = l3; } @@ -950,26 +958,47 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, & array cache's */ list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; + struct array_cache *shared; + struct array_cache **alien; nc = alloc_arraycache(node, cachep->limit, - cachep->batchcount); + cachep->batchcount); if (!nc) goto bad; + shared = alloc_arraycache(node, + cachep->shared * cachep->batchcount, + 0xbaadf00d); + if (!shared) + goto bad; +#ifdef CONFIG_NUMA + alien = alloc_alien_cache(node, cachep->limit); + if (!alien) + goto bad; +#endif cachep->array[cpu] = nc; l3 = cachep->nodelists[node]; BUG_ON(!l3); - if (!l3->shared) { - if (!(nc = alloc_arraycache(node, - cachep->shared * - cachep->batchcount, - 0xbaadf00d))) - goto bad; - /* we are serialised from CPU_DEAD or - CPU_UP_CANCELLED by the cpucontrol lock */ - l3->shared = nc; + spin_lock_irq(&l3->list_lock); + if (!l3->shared) { + /* + * We are serialised from CPU_DEAD or + * CPU_UP_CANCELLED by the cpucontrol lock + */ + l3->shared = shared; + shared = NULL; } +#ifdef CONFIG_NUMA + if (!l3->alien) { + l3->alien = alien; + alien = NULL; + } +#endif + spin_unlock_irq(&l3->list_lock); + + kfree(shared); + free_alien_cache(alien); } mutex_unlock(&cache_chain_mutex); break; @@ -978,23 +1007,32 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + /* + * Even if all the cpus of a node are down, we don't free the + * kmem_list3 of any cache. This to avoid a race between + * cpu_down, and a kmalloc allocation from another cpu for + * memory from the node of the cpu going down. The list3 + * structure is usually allocated from kmem_cache_create() and + * gets destroyed at kmem_cache_destroy(). + */ /* fall thru */ case CPU_UP_CANCELED: mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; + struct array_cache *shared; + struct array_cache **alien; cpumask_t mask; mask = node_to_cpumask(node); - spin_lock(&cachep->spinlock); /* cpu is dead; no one can alloc from it. */ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; l3 = cachep->nodelists[node]; if (!l3) - goto unlock_cache; + goto free_array_cache; spin_lock_irq(&l3->list_lock); @@ -1005,33 +1043,43 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, if (!cpus_empty(mask)) { spin_unlock_irq(&l3->list_lock); - goto unlock_cache; + goto free_array_cache; } - if (l3->shared) { + shared = l3->shared; + if (shared) { free_block(cachep, l3->shared->entry, l3->shared->avail, node); - kfree(l3->shared); l3->shared = NULL; } - if (l3->alien) { - drain_alien_cache(cachep, l3); - free_alien_cache(l3->alien); - l3->alien = NULL; - } - /* free slabs belonging to this node */ - if (__node_shrink(cachep, node)) { - cachep->nodelists[node] = NULL; - spin_unlock_irq(&l3->list_lock); - kfree(l3); - } else { - spin_unlock_irq(&l3->list_lock); + alien = l3->alien; + l3->alien = NULL; + + spin_unlock_irq(&l3->list_lock); + + kfree(shared); + if (alien) { + drain_alien_cache(cachep, alien); + free_alien_cache(alien); } - unlock_cache: - spin_unlock(&cachep->spinlock); +free_array_cache: kfree(nc); } + /* + * In the previous loop, all the objects were freed to + * the respective cache's slabs, now we can go ahead and + * shrink each nodelist to its limit. + */ + list_for_each_entry(cachep, &cache_chain, next) { + l3 = cachep->nodelists[node]; + if (!l3) + continue; + spin_lock_irq(&l3->list_lock); + /* free slabs belonging to this node */ + __node_shrink(cachep, node); + spin_unlock_irq(&l3->list_lock); + } mutex_unlock(&cache_chain_mutex); break; #endif @@ -2011,7 +2059,6 @@ static void drain_cpu_caches(struct kmem_cache *cachep) smp_call_function_all_cpus(do_drain, cachep); check_irq_on(); - spin_lock(&cachep->spinlock); for_each_online_node(node) { l3 = cachep->nodelists[node]; if (l3) { @@ -2019,10 +2066,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep) drain_array_locked(cachep, l3->shared, 1, node); spin_unlock_irq(&l3->list_lock); if (l3->alien) - drain_alien_cache(cachep, l3); + drain_alien_cache(cachep, l3->alien); } } - spin_unlock(&cachep->spinlock); } static int __node_shrink(struct kmem_cache *cachep, int node) @@ -3440,7 +3486,7 @@ static void cache_reap(void *unused) l3 = searchp->nodelists[numa_node_id()]; if (l3->alien) - drain_alien_cache(searchp, l3); + drain_alien_cache(searchp, l3->alien); spin_lock_irq(&l3->list_lock); drain_array_locked(searchp, cpu_cache_get(searchp), 0, @@ -3598,7 +3644,8 @@ static int s_show(struct seq_file *m, void *p) num_slabs++; } free_objects += l3->free_objects; - shared_avail += l3->shared->avail; + if (l3->shared) + shared_avail += l3->shared->avail; spin_unlock_irq(&l3->list_lock); }