X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=lib%2Fradix-tree.c;h=23abbd93cae1fd50b87b2ed5f8ffd95d940230b1;hb=a9d9058abab4ac17b79d500506e6c74bd16cecdc;hp=7cbdada51049738f4bd105f88f1ca0956daa390f;hpb=cfd9b7df4abd3257c9e381b0e445817b26a51c0c;p=safe%2Fjmp%2Flinux-2.6 diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 7cbdada..23abbd9 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1,7 +1,8 @@ /* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig - * Copyright (C) 2005 SGI, Christoph Lameter + * Copyright (C) 2005 SGI, Christoph Lameter + * Copyright (C) 2006 Nick Piggin * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -30,6 +31,7 @@ #include #include #include +#include #ifdef __KERNEL__ @@ -45,7 +47,9 @@ ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) struct radix_tree_node { + unsigned int height; /* Height from the bottom */ unsigned int count; + struct rcu_head rcu_head; void *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; @@ -56,14 +60,19 @@ struct radix_tree_path { }; #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2) +#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ + RADIX_TREE_MAP_SHIFT)) -static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly; +/* + * The height_to_maxindex array needs to be one deeper than the maximum + * path as height 0 holds only 1 entry. + */ +static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly; /* * Radix tree node cache. */ -static kmem_cache_t *radix_tree_node_cachep; +static struct kmem_cache *radix_tree_node_cachep; /* * Per-cpu pool of preloaded nodes @@ -72,13 +81,64 @@ struct radix_tree_preload { int nr; struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH]; }; -DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; static inline gfp_t root_gfp_mask(struct radix_tree_root *root) { return root->gfp_mask & __GFP_BITS_MASK; } +static inline void tag_set(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + __set_bit(offset, node->tags[tag]); +} + +static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + __clear_bit(offset, node->tags[tag]); +} + +static inline int tag_get(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + return test_bit(offset, node->tags[tag]); +} + +static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag) +{ + root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT)); +} + +static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag) +{ + root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT)); +} + +static inline void root_tag_clear_all(struct radix_tree_root *root) +{ + root->gfp_mask &= __GFP_BITS_MASK; +} + +static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag) +{ + return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); +} + +/* + * Returns 1 if any slot in the node has this tag set. + * Otherwise returns 0. + */ +static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) +{ + int idx; + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + if (node->tags[tag][idx]) + return 1; + } + return 0; +} /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the current CPU. @@ -86,13 +146,17 @@ static inline gfp_t root_gfp_mask(struct radix_tree_root *root) static struct radix_tree_node * radix_tree_node_alloc(struct radix_tree_root *root) { - struct radix_tree_node *ret; + struct radix_tree_node *ret = NULL; gfp_t gfp_mask = root_gfp_mask(root); - ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); - if (ret == NULL && !(gfp_mask & __GFP_WAIT)) { + if (!(gfp_mask & __GFP_WAIT)) { struct radix_tree_preload *rtp; + /* + * Provided the caller has preloaded here, we will always + * succeed in getting a node here (and never reach + * kmem_cache_alloc) + */ rtp = &__get_cpu_var(radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; @@ -100,13 +164,35 @@ radix_tree_node_alloc(struct radix_tree_root *root) rtp->nr--; } } + if (ret == NULL) + ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); + + BUG_ON(radix_tree_is_indirect_ptr(ret)); return ret; } +static void radix_tree_node_rcu_free(struct rcu_head *head) +{ + struct radix_tree_node *node = + container_of(head, struct radix_tree_node, rcu_head); + + /* + * must only free zeroed nodes into the slab. radix_tree_shrink + * can leave us with a non-NULL entry in the first slot, so clear + * that here to make sure. + */ + tag_clear(node, 0, 0); + tag_clear(node, 1, 0); + node->slots[0] = NULL; + node->count = 0; + + kmem_cache_free(radix_tree_node_cachep, node); +} + static inline void radix_tree_node_free(struct radix_tree_node *node) { - kmem_cache_free(radix_tree_node_cachep, node); + call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } /* @@ -139,59 +225,7 @@ int radix_tree_preload(gfp_t gfp_mask) out: return ret; } - -static inline void tag_set(struct radix_tree_node *node, unsigned int tag, - int offset) -{ - __set_bit(offset, node->tags[tag]); -} - -static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, - int offset) -{ - __clear_bit(offset, node->tags[tag]); -} - -static inline int tag_get(struct radix_tree_node *node, unsigned int tag, - int offset) -{ - return test_bit(offset, node->tags[tag]); -} - -static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag) -{ - root->gfp_mask |= (1 << (tag + __GFP_BITS_SHIFT)); -} - - -static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag) -{ - root->gfp_mask &= ~(1 << (tag + __GFP_BITS_SHIFT)); -} - -static inline void root_tag_clear_all(struct radix_tree_root *root) -{ - root->gfp_mask &= __GFP_BITS_MASK; -} - -static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag) -{ - return root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); -} - -/* - * Returns 1 if any slot in the node has this tag set. - * Otherwise returns 0. - */ -static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) -{ - int idx; - for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { - if (node->tags[tag][idx]) - return 1; - } - return 0; -} +EXPORT_SYMBOL(radix_tree_preload); /* * Return the maximum key which can be store into a @@ -222,11 +256,12 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) } do { + unsigned int newheight; if (!(node = radix_tree_node_alloc(root))) return -ENOMEM; /* Increase the height. */ - node->slots[0] = root->rnode; + node->slots[0] = radix_tree_indirect_to_ptr(root->rnode); /* Propagate the aggregated tag info into the new root */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { @@ -234,9 +269,12 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) tag_set(node, tag, 0); } + newheight = root->height+1; + node->height = newheight; node->count = 1; - root->rnode = node; - root->height++; + node = radix_tree_ptr_to_indirect(node); + rcu_assign_pointer(root->rnode, node); + root->height = newheight; } while (height > root->height); out: return 0; @@ -258,6 +296,8 @@ int radix_tree_insert(struct radix_tree_root *root, int offset; int error; + BUG_ON(radix_tree_is_indirect_ptr(item)); + /* Make sure the tree is high enough. */ if (index > radix_tree_maxindex(root->height)) { error = radix_tree_extend(root, index); @@ -265,7 +305,8 @@ int radix_tree_insert(struct radix_tree_root *root, return error; } - slot = root->rnode; + slot = radix_tree_indirect_to_ptr(root->rnode); + height = root->height; shift = (height-1) * RADIX_TREE_MAP_SHIFT; @@ -275,11 +316,13 @@ int radix_tree_insert(struct radix_tree_root *root, /* Have to add a child node. */ if (!(slot = radix_tree_node_alloc(root))) return -ENOMEM; + slot->height = height; if (node) { - node->slots[offset] = slot; + rcu_assign_pointer(node->slots[offset], slot); node->count++; } else - root->rnode = slot; + rcu_assign_pointer(root->rnode, + radix_tree_ptr_to_indirect(slot)); } /* Go a level down */ @@ -295,11 +338,11 @@ int radix_tree_insert(struct radix_tree_root *root, if (node) { node->count++; - node->slots[offset] = item; + rcu_assign_pointer(node->slots[offset], item); BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); } else { - root->rnode = item; + rcu_assign_pointer(root->rnode, item); BUG_ON(root_tag_get(root, 0)); BUG_ON(root_tag_get(root, 1)); } @@ -308,35 +351,45 @@ int radix_tree_insert(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_insert); -static inline void **__lookup_slot(struct radix_tree_root *root, - unsigned long index) +/* + * is_slot == 1 : search for the slot. + * is_slot == 0 : search for the node. + */ +static void *radix_tree_lookup_element(struct radix_tree_root *root, + unsigned long index, int is_slot) { unsigned int height, shift; - struct radix_tree_node **slot; + struct radix_tree_node *node, **slot; - height = root->height; + node = rcu_dereference(root->rnode); + if (node == NULL) + return NULL; + if (!radix_tree_is_indirect_ptr(node)) { + if (index > 0) + return NULL; + return is_slot ? (void *)&root->rnode : node; + } + node = radix_tree_indirect_to_ptr(node); + + height = node->height; if (index > radix_tree_maxindex(height)) return NULL; - if (height == 0 && root->rnode) - return (void **)&root->rnode; - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - slot = &root->rnode; - while (height > 0) { - if (*slot == NULL) + do { + slot = (struct radix_tree_node **) + (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); + node = rcu_dereference(*slot); + if (node == NULL) return NULL; - slot = (struct radix_tree_node **) - ((*slot)->slots + - ((index >> shift) & RADIX_TREE_MAP_MASK)); shift -= RADIX_TREE_MAP_SHIFT; height--; - } + } while (height > 0); - return (void **)slot; + return is_slot ? (void *)slot:node; } /** @@ -344,12 +397,17 @@ static inline void **__lookup_slot(struct radix_tree_root *root, * @root: radix tree root * @index: index key * - * Lookup the slot corresponding to the position @index in the radix tree - * @root. This is useful for update-if-exists operations. + * Returns: the slot corresponding to the position @index in the + * radix tree @root. This is useful for update-if-exists operations. + * + * This function can be called under rcu_read_lock iff the slot is not + * modified by radix_tree_replace_slot, otherwise it must be called + * exclusive from other writers. Any dereference of the slot must be done + * using radix_tree_deref_slot. */ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) { - return __lookup_slot(root, index); + return (void **)radix_tree_lookup_element(root, index, 1); } EXPORT_SYMBOL(radix_tree_lookup_slot); @@ -359,13 +417,15 @@ EXPORT_SYMBOL(radix_tree_lookup_slot); * @index: index key * * Lookup the item at the position @index in the radix tree @root. + * + * This function can be called under rcu_read_lock, however the caller + * must manage lifetimes of leaf nodes (eg. RCU may also be used to free + * them safely). No RCU barriers are required to access or modify the + * returned item, however. */ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) { - void **slot; - - slot = __lookup_slot(root, index); - return slot != NULL ? *slot : NULL; + return radix_tree_lookup_element(root, index, 0); } EXPORT_SYMBOL(radix_tree_lookup); @@ -389,10 +449,9 @@ void *radix_tree_tag_set(struct radix_tree_root *root, struct radix_tree_node *slot; height = root->height; - if (index > radix_tree_maxindex(height)) - return NULL; + BUG_ON(index > radix_tree_maxindex(height)); - slot = root->rnode; + slot = radix_tree_indirect_to_ptr(root->rnode); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; while (height > 0) { @@ -432,7 +491,11 @@ EXPORT_SYMBOL(radix_tree_tag_set); void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag) { - struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; + /* + * The radix tree path needs to be one longer than the maximum path + * since the "list" is null terminated. + */ + struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; struct radix_tree_node *slot = NULL; unsigned int height, shift; @@ -442,7 +505,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; - slot = root->rnode; + slot = radix_tree_indirect_to_ptr(root->rnode); while (height > 0) { int offset; @@ -496,27 +559,31 @@ int radix_tree_tag_get(struct radix_tree_root *root, unsigned long index, unsigned int tag) { unsigned int height, shift; - struct radix_tree_node *slot; + struct radix_tree_node *node; int saw_unset_tag = 0; - height = root->height; - if (index > radix_tree_maxindex(height)) - return 0; - /* check the root's tag bit */ if (!root_tag_get(root, tag)) return 0; - if (height == 0) - return 1; + node = rcu_dereference(root->rnode); + if (node == NULL) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) + return (index == 0); + node = radix_tree_indirect_to_ptr(node); + + height = node->height; + if (index > radix_tree_maxindex(height)) + return 0; shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; for ( ; ; ) { int offset; - if (slot == NULL) + if (node == NULL) return 0; offset = (index >> shift) & RADIX_TREE_MAP_MASK; @@ -525,15 +592,15 @@ int radix_tree_tag_get(struct radix_tree_root *root, * This is just a debug check. Later, we can bale as soon as * we see an unset tag. */ - if (!tag_get(slot, tag, offset)) + if (!tag_get(node, tag, offset)) saw_unset_tag = 1; if (height == 1) { - int ret = tag_get(slot, tag, offset); + int ret = tag_get(node, tag, offset); BUG_ON(ret && saw_unset_tag); - return ret; + return !!ret; } - slot = slot->slots[offset]; + node = rcu_dereference(node->slots[offset]); shift -= RADIX_TREE_MAP_SHIFT; height--; } @@ -541,48 +608,118 @@ int radix_tree_tag_get(struct radix_tree_root *root, EXPORT_SYMBOL(radix_tree_tag_get); #endif +/** + * radix_tree_next_hole - find the next hole (not-present entry) + * @root: tree root + * @index: index key + * @max_scan: maximum range to search + * + * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest + * indexed hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'return - index >= max_scan' + * will be true). In rare cases of index wrap-around, 0 will be returned. + * + * radix_tree_next_hole may be called under rcu_read_lock. However, like + * radix_tree_gang_lookup, this will not atomically search a snapshot of + * the tree at a single point in time. For example, if a hole is created + * at index 5, then subsequently a hole is created at index 10, + * radix_tree_next_hole covering both indexes may return 10 if called + * under rcu_read_lock. + */ +unsigned long radix_tree_next_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + if (!radix_tree_lookup(root, index)) + break; + index++; + if (index == 0) + break; + } + + return index; +} +EXPORT_SYMBOL(radix_tree_next_hole); + +/** + * radix_tree_prev_hole - find the prev hole (not-present entry) + * @root: tree root + * @index: index key + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] + * for the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= max_scan' + * will be true). In rare cases of wrap-around, LONG_MAX will be returned. + * + * radix_tree_next_hole may be called under rcu_read_lock. However, like + * radix_tree_gang_lookup, this will not atomically search a snapshot of + * the tree at a single point in time. For example, if a hole is created + * at index 10, then subsequently a hole is created at index 5, + * radix_tree_prev_hole covering both indexes may return 5 if called under + * rcu_read_lock. + */ +unsigned long radix_tree_prev_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + if (!radix_tree_lookup(root, index)) + break; + index--; + if (index == LONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(radix_tree_prev_hole); + static unsigned int -__lookup(struct radix_tree_root *root, void **results, unsigned long index, +__lookup(struct radix_tree_node *slot, void ***results, unsigned long index, unsigned int max_items, unsigned long *next_index) { unsigned int nr_found = 0; unsigned int shift, height; - struct radix_tree_node *slot; unsigned long i; - height = root->height; - if (height == 0) { - if (root->rnode && index == 0) - results[nr_found++] = root->rnode; + height = slot->height; + if (height == 0) goto out; - } - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; for ( ; height > 1; height--) { - - for (i = (index >> shift) & RADIX_TREE_MAP_MASK ; - i < RADIX_TREE_MAP_SIZE; i++) { + i = (index >> shift) & RADIX_TREE_MAP_MASK; + for (;;) { if (slot->slots[i] != NULL) break; index &= ~((1UL << shift) - 1); index += 1UL << shift; if (index == 0) goto out; /* 32-bit wraparound */ + i++; + if (i == RADIX_TREE_MAP_SIZE) + goto out; } - if (i == RADIX_TREE_MAP_SIZE) - goto out; shift -= RADIX_TREE_MAP_SHIFT; - slot = slot->slots[i]; + slot = rcu_dereference(slot->slots[i]); + if (slot == NULL) + goto out; } /* Bottom level: grab some items */ for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { index++; if (slot->slots[i]) { - results[nr_found++] = slot->slots[i]; + results[nr_found++] = &(slot->slots[i]); if (nr_found == max_items) goto out; } @@ -604,86 +741,183 @@ out: * *@results. * * The implementation is naive. + * + * Like radix_tree_lookup, radix_tree_gang_lookup may be called under + * rcu_read_lock. In this case, rather than the returned results being + * an atomic snapshot of the tree at a single point in time, the semantics + * of an RCU protected gang lookup are as though multiple radix_tree_lookups + * have been issued in individual locks, and results stored in 'results'. */ unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items) { - const unsigned long max_index = radix_tree_maxindex(root->height); + unsigned long max_index; + struct radix_tree_node *node; unsigned long cur_index = first_index; - unsigned int ret = 0; + unsigned int ret; + node = rcu_dereference(root->rnode); + if (!node) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) { + if (first_index > 0) + return 0; + results[0] = node; + return 1; + } + node = radix_tree_indirect_to_ptr(node); + + max_index = radix_tree_maxindex(node->height); + + ret = 0; while (ret < max_items) { - unsigned int nr_found; + unsigned int nr_found, slots_found, i; unsigned long next_index; /* Index of next search */ if (cur_index > max_index) break; - nr_found = __lookup(root, results + ret, cur_index, + slots_found = __lookup(node, (void ***)results + ret, cur_index, max_items - ret, &next_index); + nr_found = 0; + for (i = 0; i < slots_found; i++) { + struct radix_tree_node *slot; + slot = *(((void ***)results)[ret + i]); + if (!slot) + continue; + results[ret + nr_found] = rcu_dereference(slot); + nr_found++; + } ret += nr_found; if (next_index == 0) break; cur_index = next_index; } + return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup); +/** + * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * + * Performs an index-ascending scan of the tree for present items. Places + * their slots at *@results and returns the number of items which were + * placed at *@results. + * + * The implementation is naive. + * + * Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must + * be dereferenced with radix_tree_deref_slot, and if using only RCU + * protection, radix_tree_deref_slot may fail requiring a retry. + */ +unsigned int +radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items) +{ + unsigned long max_index; + struct radix_tree_node *node; + unsigned long cur_index = first_index; + unsigned int ret; + + node = rcu_dereference(root->rnode); + if (!node) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) { + if (first_index > 0) + return 0; + results[0] = (void **)&root->rnode; + return 1; + } + node = radix_tree_indirect_to_ptr(node); + + max_index = radix_tree_maxindex(node->height); + + ret = 0; + while (ret < max_items) { + unsigned int slots_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + slots_found = __lookup(node, results + ret, cur_index, + max_items - ret, &next_index); + ret += slots_found; + if (next_index == 0) + break; + cur_index = next_index; + } + + return ret; +} +EXPORT_SYMBOL(radix_tree_gang_lookup_slot); + /* * FIXME: the two tag_get()s here should use find_next_bit() instead of * open-coding the search. */ static unsigned int -__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index, +__lookup_tag(struct radix_tree_node *slot, void ***results, unsigned long index, unsigned int max_items, unsigned long *next_index, unsigned int tag) { unsigned int nr_found = 0; - unsigned int shift; - unsigned int height = root->height; - struct radix_tree_node *slot; + unsigned int shift, height; - if (height == 0) { - if (root->rnode && index == 0) - results[nr_found++] = root->rnode; + height = slot->height; + if (height == 0) goto out; - } - - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; + shift = (height-1) * RADIX_TREE_MAP_SHIFT; - do { - unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; + while (height > 0) { + unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ; - for ( ; i < RADIX_TREE_MAP_SIZE; i++) { - if (tag_get(slot, tag, i)) { - BUG_ON(slot->slots[i] == NULL); + for (;;) { + if (tag_get(slot, tag, i)) break; - } index &= ~((1UL << shift) - 1); index += 1UL << shift; if (index == 0) goto out; /* 32-bit wraparound */ + i++; + if (i == RADIX_TREE_MAP_SIZE) + goto out; } - if (i == RADIX_TREE_MAP_SIZE) - goto out; height--; if (height == 0) { /* Bottom level: grab some items */ unsigned long j = index & RADIX_TREE_MAP_MASK; for ( ; j < RADIX_TREE_MAP_SIZE; j++) { index++; - if (tag_get(slot, tag, j)) { - BUG_ON(slot->slots[j] == NULL); - results[nr_found++] = slot->slots[j]; + if (!tag_get(slot, tag, j)) + continue; + /* + * Even though the tag was found set, we need to + * recheck that we have a non-NULL node, because + * if this lookup is lockless, it may have been + * subsequently deleted. + * + * Similar care must be taken in any place that + * lookup ->slots[x] without a lock (ie. can't + * rely on its value remaining the same). + */ + if (slot->slots[j]) { + results[nr_found++] = &(slot->slots[j]); if (nr_found == max_items) goto out; } } } shift -= RADIX_TREE_MAP_SHIFT; - slot = slot->slots[i]; - } while (height > 0); + slot = rcu_dereference(slot->slots[i]); + if (slot == NULL) + break; + } out: *next_index = index; return nr_found; @@ -707,50 +941,153 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items, unsigned int tag) { - const unsigned long max_index = radix_tree_maxindex(root->height); + struct radix_tree_node *node; + unsigned long max_index; unsigned long cur_index = first_index; - unsigned int ret = 0; + unsigned int ret; /* check the root's tag bit */ if (!root_tag_get(root, tag)) return 0; + node = rcu_dereference(root->rnode); + if (!node) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) { + if (first_index > 0) + return 0; + results[0] = node; + return 1; + } + node = radix_tree_indirect_to_ptr(node); + + max_index = radix_tree_maxindex(node->height); + + ret = 0; while (ret < max_items) { - unsigned int nr_found; + unsigned int nr_found, slots_found, i; unsigned long next_index; /* Index of next search */ if (cur_index > max_index) break; - nr_found = __lookup_tag(root, results + ret, cur_index, - max_items - ret, &next_index, tag); + slots_found = __lookup_tag(node, (void ***)results + ret, + cur_index, max_items - ret, &next_index, tag); + nr_found = 0; + for (i = 0; i < slots_found; i++) { + struct radix_tree_node *slot; + slot = *(((void ***)results)[ret + i]); + if (!slot) + continue; + results[ret + nr_found] = rcu_dereference(slot); + nr_found++; + } ret += nr_found; if (next_index == 0) break; cur_index = next_index; } + return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag); /** + * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a + * radix tree based on a tag + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * @tag: the tag index (< RADIX_TREE_MAX_TAGS) + * + * Performs an index-ascending scan of the tree for present items which + * have the tag indexed by @tag set. Places the slots at *@results and + * returns the number of slots which were placed at *@results. + */ +unsigned int +radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items, + unsigned int tag) +{ + struct radix_tree_node *node; + unsigned long max_index; + unsigned long cur_index = first_index; + unsigned int ret; + + /* check the root's tag bit */ + if (!root_tag_get(root, tag)) + return 0; + + node = rcu_dereference(root->rnode); + if (!node) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) { + if (first_index > 0) + return 0; + results[0] = (void **)&root->rnode; + return 1; + } + node = radix_tree_indirect_to_ptr(node); + + max_index = radix_tree_maxindex(node->height); + + ret = 0; + while (ret < max_items) { + unsigned int slots_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + slots_found = __lookup_tag(node, results + ret, + cur_index, max_items - ret, &next_index, tag); + ret += slots_found; + if (next_index == 0) + break; + cur_index = next_index; + } + + return ret; +} +EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); + + +/** * radix_tree_shrink - shrink height of a radix tree to minimal * @root radix tree root */ static inline void radix_tree_shrink(struct radix_tree_root *root) { /* try to shrink tree height */ - while (root->height > 0 && - root->rnode->count == 1 && - root->rnode->slots[0]) { + while (root->height > 0) { struct radix_tree_node *to_free = root->rnode; + void *newptr; - root->rnode = to_free->slots[0]; + BUG_ON(!radix_tree_is_indirect_ptr(to_free)); + to_free = radix_tree_indirect_to_ptr(to_free); + + /* + * The candidate node has more than one child, or its child + * is not at the leftmost slot, we cannot shrink. + */ + if (to_free->count != 1) + break; + if (!to_free->slots[0]) + break; + + /* + * We don't need rcu_assign_pointer(), since we are simply + * moving the node from one part of the tree to another. If + * it was safe to dereference the old pointer to it + * (to_free->slots[0]), it will be safe to dereference the new + * one (root->rnode). + */ + newptr = to_free->slots[0]; + if (root->height > 1) + newptr = radix_tree_ptr_to_indirect(newptr); + root->rnode = newptr; root->height--; - /* must only free zeroed nodes into the slab */ - tag_clear(to_free, 0, 0); - tag_clear(to_free, 1, 0); - to_free->slots[0] = NULL; - to_free->count = 0; radix_tree_node_free(to_free); } } @@ -766,8 +1103,13 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) */ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { - struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; + /* + * The radix tree path needs to be one longer than the maximum path + * since the "list" is null terminated. + */ + struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; struct radix_tree_node *slot = NULL; + struct radix_tree_node *to_free; unsigned int height, shift; int tag; int offset; @@ -777,11 +1119,12 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) goto out; slot = root->rnode; - if (height == 0 && root->rnode) { + if (height == 0) { root_tag_clear_all(root); root->rnode = NULL; goto out; } + slot = radix_tree_indirect_to_ptr(slot); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; @@ -810,25 +1153,35 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) radix_tree_tag_clear(root, index, tag); } + to_free = NULL; /* Now free the nodes we do not need anymore */ while (pathp->node) { pathp->node->slots[pathp->offset] = NULL; pathp->node->count--; + /* + * Queue the node for deferred freeing after the + * last reference to it disappears (set NULL, above). + */ + if (to_free) + radix_tree_node_free(to_free); if (pathp->node->count) { - if (pathp->node == root->rnode) + if (pathp->node == + radix_tree_indirect_to_ptr(root->rnode)) radix_tree_shrink(root); goto out; } /* Node with zero slots in use so free it */ - radix_tree_node_free(pathp->node); - + to_free = pathp->node; pathp--; + } root_tag_clear_all(root); root->height = 0; root->rnode = NULL; + if (to_free) + radix_tree_node_free(to_free); out: return slot; @@ -847,19 +1200,21 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag) EXPORT_SYMBOL(radix_tree_tagged); static void -radix_tree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags) +radix_tree_node_ctor(void *node) { memset(node, 0, sizeof(struct radix_tree_node)); } static __init unsigned long __maxindex(unsigned int height) { - unsigned int tmp = height * RADIX_TREE_MAP_SHIFT; - unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1; - - if (tmp >= RADIX_TREE_INDEX_BITS) - index = ~0UL; - return index; + unsigned int width = height * RADIX_TREE_MAP_SHIFT; + int shift = RADIX_TREE_INDEX_BITS - width; + + if (shift < 0) + return ~0UL; + if (shift >= BITS_PER_LONG) + return 0UL; + return ~0UL >> shift; } static __init void radix_tree_init_maxindex(void) @@ -870,7 +1225,6 @@ static __init void radix_tree_init_maxindex(void) height_to_maxindex[i] = __maxindex(i); } -#ifdef CONFIG_HOTPLUG_CPU static int radix_tree_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -879,7 +1233,7 @@ static int radix_tree_callback(struct notifier_block *nfb, struct radix_tree_preload *rtp; /* Free per-cpu pool of perloaded nodes */ - if (action == CPU_DEAD) { + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { rtp = &per_cpu(radix_tree_preloads, cpu); while (rtp->nr) { kmem_cache_free(radix_tree_node_cachep, @@ -890,13 +1244,13 @@ static int radix_tree_callback(struct notifier_block *nfb, } return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ void __init radix_tree_init(void) { radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, - SLAB_PANIC, radix_tree_node_ctor, NULL); + SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, + radix_tree_node_ctor); radix_tree_init_maxindex(); hotcpu_notifier(radix_tree_callback, 0); }