X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=lib%2Fradix-tree.c;h=2a087e0f98633cf00724ef0ef7b411d9f60b609e;hb=0e950fa686d53a57ee6c47f477ecfc681670c6a9;hp=6a8bc6e06431eec49c1af2f34ec42d7d100e4e0b;hpb=00b61f51922e432fd92a482ba1e0b5f8f326ef46;p=safe%2Fjmp%2Flinux-2.6 diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 6a8bc6e..2a087e0 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1,7 +1,8 @@ /* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig - * Copyright (C) 2005 SGI, Christoph Lameter + * Copyright (C) 2005 SGI, Christoph Lameter + * Copyright (C) 2006 Nick Piggin * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -27,17 +28,16 @@ #include #include #include -#include #include #include +#include #ifdef __KERNEL__ -#define RADIX_TREE_MAP_SHIFT 6 +#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) #else #define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ #endif -#define RADIX_TREE_TAGS 2 #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) #define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) @@ -46,9 +46,11 @@ ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) struct radix_tree_node { + unsigned int height; /* Height from the bottom */ unsigned int count; + struct rcu_head rcu_head; void *slots[RADIX_TREE_MAP_SIZE]; - unsigned long tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS]; + unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; struct radix_tree_path { @@ -57,14 +59,19 @@ struct radix_tree_path { }; #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2) +#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ + RADIX_TREE_MAP_SHIFT)) -static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly; +/* + * The height_to_maxindex array needs to be one deeper than the maximum + * path as height 0 holds only 1 entry. + */ +static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly; /* * Radix tree node cache. */ -static kmem_cache_t *radix_tree_node_cachep; +static struct kmem_cache *radix_tree_node_cachep; /* * Per-cpu pool of preloaded nodes @@ -73,8 +80,64 @@ struct radix_tree_preload { int nr; struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH]; }; -DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; + +static inline gfp_t root_gfp_mask(struct radix_tree_root *root) +{ + return root->gfp_mask & __GFP_BITS_MASK; +} + +static inline void tag_set(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + __set_bit(offset, node->tags[tag]); +} + +static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + __clear_bit(offset, node->tags[tag]); +} + +static inline int tag_get(struct radix_tree_node *node, unsigned int tag, + int offset) +{ + return test_bit(offset, node->tags[tag]); +} + +static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag) +{ + root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT)); +} + +static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag) +{ + root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT)); +} + +static inline void root_tag_clear_all(struct radix_tree_root *root) +{ + root->gfp_mask &= __GFP_BITS_MASK; +} +static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag) +{ + return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); +} + +/* + * Returns 1 if any slot in the node has this tag set. + * Otherwise returns 0. + */ +static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) +{ + int idx; + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + if (node->tags[tag][idx]) + return 1; + } + return 0; +} /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the current CPU. @@ -82,12 +145,17 @@ DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; static struct radix_tree_node * radix_tree_node_alloc(struct radix_tree_root *root) { - struct radix_tree_node *ret; + struct radix_tree_node *ret = NULL; + gfp_t gfp_mask = root_gfp_mask(root); - ret = kmem_cache_alloc(radix_tree_node_cachep, root->gfp_mask); - if (ret == NULL && !(root->gfp_mask & __GFP_WAIT)) { + if (!(gfp_mask & __GFP_WAIT)) { struct radix_tree_preload *rtp; + /* + * Provided the caller has preloaded here, we will always + * succeed in getting a node here (and never reach + * kmem_cache_alloc) + */ rtp = &__get_cpu_var(radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; @@ -95,13 +163,35 @@ radix_tree_node_alloc(struct radix_tree_root *root) rtp->nr--; } } + if (ret == NULL) + ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); + + BUG_ON(radix_tree_is_indirect_ptr(ret)); return ret; } +static void radix_tree_node_rcu_free(struct rcu_head *head) +{ + struct radix_tree_node *node = + container_of(head, struct radix_tree_node, rcu_head); + + /* + * must only free zeroed nodes into the slab. radix_tree_shrink + * can leave us with a non-NULL entry in the first slot, so clear + * that here to make sure. + */ + tag_clear(node, 0, 0); + tag_clear(node, 1, 0); + node->slots[0] = NULL; + node->count = 0; + + kmem_cache_free(radix_tree_node_cachep, node); +} + static inline void radix_tree_node_free(struct radix_tree_node *node) { - kmem_cache_free(radix_tree_node_cachep, node); + call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } /* @@ -109,8 +199,11 @@ radix_tree_node_free(struct radix_tree_node *node) * ensure that the addition of a single element in the tree cannot fail. On * success, return zero, with preemption disabled. On error, return -ENOMEM * with preemption not disabled. + * + * To make use of this facility, the radix tree must be initialised without + * __GFP_WAIT being passed to INIT_RADIX_TREE(). */ -int radix_tree_preload(unsigned int __nocast gfp_mask) +int radix_tree_preload(gfp_t gfp_mask) { struct radix_tree_preload *rtp; struct radix_tree_node *node; @@ -134,22 +227,7 @@ int radix_tree_preload(unsigned int __nocast gfp_mask) out: return ret; } - -static inline void tag_set(struct radix_tree_node *node, int tag, int offset) -{ - if (!test_bit(offset, &node->tags[tag][0])) - __set_bit(offset, &node->tags[tag][0]); -} - -static inline void tag_clear(struct radix_tree_node *node, int tag, int offset) -{ - __clear_bit(offset, &node->tags[tag][0]); -} - -static inline int tag_get(struct radix_tree_node *node, int tag, int offset) -{ - return test_bit(offset, &node->tags[tag][0]); -} +EXPORT_SYMBOL(radix_tree_preload); /* * Return the maximum key which can be store into a @@ -167,7 +245,6 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) { struct radix_tree_node *node; unsigned int height; - char tags[RADIX_TREE_TAGS]; int tag; /* Figure out what the height should be. */ @@ -180,38 +257,26 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) goto out; } - /* - * Prepare the tag status of the top-level node for propagation - * into the newly-pushed top-level node(s) - */ - for (tag = 0; tag < RADIX_TREE_TAGS; tag++) { - int idx; - - tags[tag] = 0; - for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { - if (root->rnode->tags[tag][idx]) { - tags[tag] = 1; - break; - } - } - } - do { + unsigned int newheight; if (!(node = radix_tree_node_alloc(root))) return -ENOMEM; /* Increase the height. */ - node->slots[0] = root->rnode; + node->slots[0] = radix_tree_indirect_to_ptr(root->rnode); /* Propagate the aggregated tag info into the new root */ - for (tag = 0; tag < RADIX_TREE_TAGS; tag++) { - if (tags[tag]) + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (root_tag_get(root, tag)) tag_set(node, tag, 0); } + newheight = root->height+1; + node->height = newheight; node->count = 1; - root->rnode = node; - root->height++; + node = radix_tree_ptr_to_indirect(node); + rcu_assign_pointer(root->rnode, node); + root->height = newheight; } while (height > root->height); out: return 0; @@ -233,15 +298,17 @@ int radix_tree_insert(struct radix_tree_root *root, int offset; int error; + BUG_ON(radix_tree_is_indirect_ptr(item)); + /* Make sure the tree is high enough. */ - if ((!index && !root->rnode) || - index > radix_tree_maxindex(root->height)) { + if (index > radix_tree_maxindex(root->height)) { error = radix_tree_extend(root, index); if (error) return error; } - slot = root->rnode; + slot = radix_tree_indirect_to_ptr(root->rnode); + height = root->height; shift = (height-1) * RADIX_TREE_MAP_SHIFT; @@ -251,11 +318,13 @@ int radix_tree_insert(struct radix_tree_root *root, /* Have to add a child node. */ if (!(slot = radix_tree_node_alloc(root))) return -ENOMEM; + slot->height = height; if (node) { - node->slots[offset] = slot; + rcu_assign_pointer(node->slots[offset], slot); node->count++; } else - root->rnode = slot; + rcu_assign_pointer(root->rnode, + radix_tree_ptr_to_indirect(slot)); } /* Go a level down */ @@ -271,45 +340,94 @@ int radix_tree_insert(struct radix_tree_root *root, if (node) { node->count++; - node->slots[offset] = item; + rcu_assign_pointer(node->slots[offset], item); BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); - } else - root->rnode = item; + } else { + rcu_assign_pointer(root->rnode, item); + BUG_ON(root_tag_get(root, 0)); + BUG_ON(root_tag_get(root, 1)); + } return 0; } EXPORT_SYMBOL(radix_tree_insert); -/** - * radix_tree_lookup - perform lookup operation on a radix tree - * @root: radix tree root - * @index: index key - * - * Lookup the item at the position @index in the radix tree @root. +/* + * is_slot == 1 : search for the slot. + * is_slot == 0 : search for the node. */ -void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) +static void *radix_tree_lookup_element(struct radix_tree_root *root, + unsigned long index, int is_slot) { unsigned int height, shift; - struct radix_tree_node *slot; + struct radix_tree_node *node, **slot; - height = root->height; + node = rcu_dereference_raw(root->rnode); + if (node == NULL) + return NULL; + + if (!radix_tree_is_indirect_ptr(node)) { + if (index > 0) + return NULL; + return is_slot ? (void *)&root->rnode : node; + } + node = radix_tree_indirect_to_ptr(node); + + height = node->height; if (index > radix_tree_maxindex(height)) return NULL; shift = (height-1) * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; - while (height > 0) { - if (slot == NULL) + do { + slot = (struct radix_tree_node **) + (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); + node = rcu_dereference_raw(*slot); + if (node == NULL) return NULL; - slot = slot->slots[(index >> shift) & RADIX_TREE_MAP_MASK]; shift -= RADIX_TREE_MAP_SHIFT; height--; - } + } while (height > 0); - return slot; + return is_slot ? (void *)slot:node; +} + +/** + * radix_tree_lookup_slot - lookup a slot in a radix tree + * @root: radix tree root + * @index: index key + * + * Returns: the slot corresponding to the position @index in the + * radix tree @root. This is useful for update-if-exists operations. + * + * This function can be called under rcu_read_lock iff the slot is not + * modified by radix_tree_replace_slot, otherwise it must be called + * exclusive from other writers. Any dereference of the slot must be done + * using radix_tree_deref_slot. + */ +void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) +{ + return (void **)radix_tree_lookup_element(root, index, 1); +} +EXPORT_SYMBOL(radix_tree_lookup_slot); + +/** + * radix_tree_lookup - perform lookup operation on a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root. + * + * This function can be called under rcu_read_lock, however the caller + * must manage lifetimes of leaf nodes (eg. RCU may also be used to free + * them safely). No RCU barriers are required to access or modify the + * returned item, however. + */ +void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) +{ + return radix_tree_lookup_element(root, index, 0); } EXPORT_SYMBOL(radix_tree_lookup); @@ -319,36 +437,41 @@ EXPORT_SYMBOL(radix_tree_lookup); * @index: index key * @tag: tag index * - * Set the search tag corresponging to @index in the radix tree. From + * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) + * corresponding to @index in the radix tree. From * the root all the way down to the leaf node. * * Returns the address of the tagged item. Setting a tag on a not-present * item is a bug. */ void *radix_tree_tag_set(struct radix_tree_root *root, - unsigned long index, int tag) + unsigned long index, unsigned int tag) { unsigned int height, shift; struct radix_tree_node *slot; height = root->height; - if (index > radix_tree_maxindex(height)) - return NULL; + BUG_ON(index > radix_tree_maxindex(height)); + slot = radix_tree_indirect_to_ptr(root->rnode); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; while (height > 0) { int offset; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - tag_set(slot, tag, offset); + if (!tag_get(slot, tag, offset)) + tag_set(slot, tag, offset); slot = slot->slots[offset]; BUG_ON(slot == NULL); shift -= RADIX_TREE_MAP_SHIFT; height--; } + /* set the root's tag bit */ + if (slot && !root_tag_get(root, tag)) + root_tag_set(root, tag); + return slot; } EXPORT_SYMBOL(radix_tree_tag_set); @@ -359,7 +482,8 @@ EXPORT_SYMBOL(radix_tree_tag_set); * @index: index key * @tag: tag index * - * Clear the search tag corresponging to @index in the radix tree. If + * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS) + * corresponding to @index in the radix tree. If * this causes the leaf node to have no tags set then clear the tag in the * next-to-leaf node, etc. * @@ -367,12 +491,15 @@ EXPORT_SYMBOL(radix_tree_tag_set); * has the same return value and semantics as radix_tree_lookup(). */ void *radix_tree_tag_clear(struct radix_tree_root *root, - unsigned long index, int tag) + unsigned long index, unsigned int tag) { - struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; - struct radix_tree_node *slot; + /* + * The radix tree path needs to be one longer than the maximum path + * since the "list" is null terminated. + */ + struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; + struct radix_tree_node *slot = NULL; unsigned int height, shift; - void *ret = NULL; height = root->height; if (index > radix_tree_maxindex(height)) @@ -380,7 +507,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; - slot = root->rnode; + slot = radix_tree_indirect_to_ptr(root->rnode); while (height > 0) { int offset; @@ -397,56 +524,71 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, height--; } - ret = slot; - if (ret == NULL) + if (slot == NULL) goto out; - do { - int idx; - + while (pathp->node) { + if (!tag_get(pathp->node, tag, pathp->offset)) + goto out; tag_clear(pathp->node, tag, pathp->offset); - for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { - if (pathp->node->tags[tag][idx]) - goto out; - } + if (any_tag_set(pathp->node, tag)) + goto out; pathp--; - } while (pathp->node); + } + + /* clear the root's tag bit */ + if (root_tag_get(root, tag)) + root_tag_clear(root, tag); + out: - return ret; + return slot; } EXPORT_SYMBOL(radix_tree_tag_clear); -#ifndef __KERNEL__ /* Only the test harness uses this at present */ /** * radix_tree_tag_get - get a tag on a radix tree node * @root: radix tree root * @index: index key - * @tag: tag index + * @tag: tag index (< RADIX_TREE_MAX_TAGS) * * Return values: * - * 0: tag not present - * 1: tag present, set - * -1: tag present, unset + * 0: tag not present or not set + * 1: tag set + * + * Note that the return value of this function may not be relied on, even if + * the RCU lock is held, unless tag modification and node deletion are excluded + * from concurrency. */ int radix_tree_tag_get(struct radix_tree_root *root, - unsigned long index, int tag) + unsigned long index, unsigned int tag) { unsigned int height, shift; - struct radix_tree_node *slot; + struct radix_tree_node *node; int saw_unset_tag = 0; - height = root->height; + /* check the root's tag bit */ + if (!root_tag_get(root, tag)) + return 0; + + node = rcu_dereference_raw(root->rnode); + if (node == NULL) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) + return (index == 0); + node = radix_tree_indirect_to_ptr(node); + + height = node->height; if (index > radix_tree_maxindex(height)) return 0; shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; for ( ; ; ) { int offset; - if (slot == NULL) + if (node == NULL) return 0; offset = (index >> shift) & RADIX_TREE_MAP_MASK; @@ -455,61 +597,129 @@ int radix_tree_tag_get(struct radix_tree_root *root, * This is just a debug check. Later, we can bale as soon as * we see an unset tag. */ - if (!tag_get(slot, tag, offset)) + if (!tag_get(node, tag, offset)) saw_unset_tag = 1; - if (height == 1) { - int ret = tag_get(slot, tag, offset); - - BUG_ON(ret && saw_unset_tag); - return ret ? 1 : -1; - } - slot = slot->slots[offset]; + if (height == 1) + return !!tag_get(node, tag, offset); + node = rcu_dereference_raw(node->slots[offset]); shift -= RADIX_TREE_MAP_SHIFT; height--; } } EXPORT_SYMBOL(radix_tree_tag_get); -#endif + +/** + * radix_tree_next_hole - find the next hole (not-present entry) + * @root: tree root + * @index: index key + * @max_scan: maximum range to search + * + * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest + * indexed hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'return - index >= max_scan' + * will be true). In rare cases of index wrap-around, 0 will be returned. + * + * radix_tree_next_hole may be called under rcu_read_lock. However, like + * radix_tree_gang_lookup, this will not atomically search a snapshot of + * the tree at a single point in time. For example, if a hole is created + * at index 5, then subsequently a hole is created at index 10, + * radix_tree_next_hole covering both indexes may return 10 if called + * under rcu_read_lock. + */ +unsigned long radix_tree_next_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + if (!radix_tree_lookup(root, index)) + break; + index++; + if (index == 0) + break; + } + + return index; +} +EXPORT_SYMBOL(radix_tree_next_hole); + +/** + * radix_tree_prev_hole - find the prev hole (not-present entry) + * @root: tree root + * @index: index key + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] + * for the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= max_scan' + * will be true). In rare cases of wrap-around, LONG_MAX will be returned. + * + * radix_tree_next_hole may be called under rcu_read_lock. However, like + * radix_tree_gang_lookup, this will not atomically search a snapshot of + * the tree at a single point in time. For example, if a hole is created + * at index 10, then subsequently a hole is created at index 5, + * radix_tree_prev_hole covering both indexes may return 5 if called under + * rcu_read_lock. + */ +unsigned long radix_tree_prev_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + if (!radix_tree_lookup(root, index)) + break; + index--; + if (index == LONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(radix_tree_prev_hole); static unsigned int -__lookup(struct radix_tree_root *root, void **results, unsigned long index, +__lookup(struct radix_tree_node *slot, void ***results, unsigned long index, unsigned int max_items, unsigned long *next_index) { unsigned int nr_found = 0; unsigned int shift, height; - struct radix_tree_node *slot; unsigned long i; - height = root->height; + height = slot->height; if (height == 0) goto out; - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; for ( ; height > 1; height--) { - - for (i = (index >> shift) & RADIX_TREE_MAP_MASK ; - i < RADIX_TREE_MAP_SIZE; i++) { + i = (index >> shift) & RADIX_TREE_MAP_MASK; + for (;;) { if (slot->slots[i] != NULL) break; index &= ~((1UL << shift) - 1); index += 1UL << shift; if (index == 0) goto out; /* 32-bit wraparound */ + i++; + if (i == RADIX_TREE_MAP_SIZE) + goto out; } - if (i == RADIX_TREE_MAP_SIZE) - goto out; shift -= RADIX_TREE_MAP_SHIFT; - slot = slot->slots[i]; + slot = rcu_dereference_raw(slot->slots[i]); + if (slot == NULL) + goto out; } /* Bottom level: grab some items */ for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { index++; if (slot->slots[i]) { - results[nr_found++] = slot->slots[i]; + results[nr_found++] = &(slot->slots[i]); if (nr_found == max_items) goto out; } @@ -531,79 +741,182 @@ out: * *@results. * * The implementation is naive. + * + * Like radix_tree_lookup, radix_tree_gang_lookup may be called under + * rcu_read_lock. In this case, rather than the returned results being + * an atomic snapshot of the tree at a single point in time, the semantics + * of an RCU protected gang lookup are as though multiple radix_tree_lookups + * have been issued in individual locks, and results stored in 'results'. */ unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items) { - const unsigned long max_index = radix_tree_maxindex(root->height); + unsigned long max_index; + struct radix_tree_node *node; unsigned long cur_index = first_index; - unsigned int ret = 0; + unsigned int ret; + + node = rcu_dereference_raw(root->rnode); + if (!node) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) { + if (first_index > 0) + return 0; + results[0] = node; + return 1; + } + node = radix_tree_indirect_to_ptr(node); + + max_index = radix_tree_maxindex(node->height); + ret = 0; while (ret < max_items) { - unsigned int nr_found; + unsigned int nr_found, slots_found, i; unsigned long next_index; /* Index of next search */ if (cur_index > max_index) break; - nr_found = __lookup(root, results + ret, cur_index, + slots_found = __lookup(node, (void ***)results + ret, cur_index, max_items - ret, &next_index); + nr_found = 0; + for (i = 0; i < slots_found; i++) { + struct radix_tree_node *slot; + slot = *(((void ***)results)[ret + i]); + if (!slot) + continue; + results[ret + nr_found] = rcu_dereference_raw(slot); + nr_found++; + } ret += nr_found; if (next_index == 0) break; cur_index = next_index; } + return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup); +/** + * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * + * Performs an index-ascending scan of the tree for present items. Places + * their slots at *@results and returns the number of items which were + * placed at *@results. + * + * The implementation is naive. + * + * Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must + * be dereferenced with radix_tree_deref_slot, and if using only RCU + * protection, radix_tree_deref_slot may fail requiring a retry. + */ +unsigned int +radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items) +{ + unsigned long max_index; + struct radix_tree_node *node; + unsigned long cur_index = first_index; + unsigned int ret; + + node = rcu_dereference_raw(root->rnode); + if (!node) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) { + if (first_index > 0) + return 0; + results[0] = (void **)&root->rnode; + return 1; + } + node = radix_tree_indirect_to_ptr(node); + + max_index = radix_tree_maxindex(node->height); + + ret = 0; + while (ret < max_items) { + unsigned int slots_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + slots_found = __lookup(node, results + ret, cur_index, + max_items - ret, &next_index); + ret += slots_found; + if (next_index == 0) + break; + cur_index = next_index; + } + + return ret; +} +EXPORT_SYMBOL(radix_tree_gang_lookup_slot); + /* * FIXME: the two tag_get()s here should use find_next_bit() instead of * open-coding the search. */ static unsigned int -__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index, - unsigned int max_items, unsigned long *next_index, int tag) +__lookup_tag(struct radix_tree_node *slot, void ***results, unsigned long index, + unsigned int max_items, unsigned long *next_index, unsigned int tag) { unsigned int nr_found = 0; - unsigned int shift; - unsigned int height = root->height; - struct radix_tree_node *slot; + unsigned int shift, height; - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; + height = slot->height; + if (height == 0) + goto out; + shift = (height-1) * RADIX_TREE_MAP_SHIFT; while (height > 0) { - unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; + unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ; - for ( ; i < RADIX_TREE_MAP_SIZE; i++) { - if (tag_get(slot, tag, i)) { - BUG_ON(slot->slots[i] == NULL); + for (;;) { + if (tag_get(slot, tag, i)) break; - } index &= ~((1UL << shift) - 1); index += 1UL << shift; if (index == 0) goto out; /* 32-bit wraparound */ + i++; + if (i == RADIX_TREE_MAP_SIZE) + goto out; } - if (i == RADIX_TREE_MAP_SIZE) - goto out; height--; if (height == 0) { /* Bottom level: grab some items */ unsigned long j = index & RADIX_TREE_MAP_MASK; for ( ; j < RADIX_TREE_MAP_SIZE; j++) { index++; - if (tag_get(slot, tag, j)) { - BUG_ON(slot->slots[j] == NULL); - results[nr_found++] = slot->slots[j]; + if (!tag_get(slot, tag, j)) + continue; + /* + * Even though the tag was found set, we need to + * recheck that we have a non-NULL node, because + * if this lookup is lockless, it may have been + * subsequently deleted. + * + * Similar care must be taken in any place that + * lookup ->slots[x] without a lock (ie. can't + * rely on its value remaining the same). + */ + if (slot->slots[j]) { + results[nr_found++] = &(slot->slots[j]); if (nr_found == max_items) goto out; } } } shift -= RADIX_TREE_MAP_SHIFT; - slot = slot->slots[i]; + slot = rcu_dereference_raw(slot->slots[i]); + if (slot == NULL) + break; } out: *next_index = index; @@ -617,7 +930,7 @@ out: * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results - * @tag: the tag index + * @tag: the tag index (< RADIX_TREE_MAX_TAGS) * * Performs an index-ascending scan of the tree for present items which * have the tag indexed by @tag set. Places the items at *@results and @@ -625,30 +938,161 @@ out: */ unsigned int radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, - unsigned long first_index, unsigned int max_items, int tag) + unsigned long first_index, unsigned int max_items, + unsigned int tag) { - const unsigned long max_index = radix_tree_maxindex(root->height); + struct radix_tree_node *node; + unsigned long max_index; unsigned long cur_index = first_index; - unsigned int ret = 0; + unsigned int ret; + /* check the root's tag bit */ + if (!root_tag_get(root, tag)) + return 0; + + node = rcu_dereference_raw(root->rnode); + if (!node) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) { + if (first_index > 0) + return 0; + results[0] = node; + return 1; + } + node = radix_tree_indirect_to_ptr(node); + + max_index = radix_tree_maxindex(node->height); + + ret = 0; while (ret < max_items) { - unsigned int nr_found; + unsigned int nr_found, slots_found, i; unsigned long next_index; /* Index of next search */ if (cur_index > max_index) break; - nr_found = __lookup_tag(root, results + ret, cur_index, - max_items - ret, &next_index, tag); + slots_found = __lookup_tag(node, (void ***)results + ret, + cur_index, max_items - ret, &next_index, tag); + nr_found = 0; + for (i = 0; i < slots_found; i++) { + struct radix_tree_node *slot; + slot = *(((void ***)results)[ret + i]); + if (!slot) + continue; + results[ret + nr_found] = rcu_dereference_raw(slot); + nr_found++; + } ret += nr_found; if (next_index == 0) break; cur_index = next_index; } + return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag); /** + * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a + * radix tree based on a tag + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * @tag: the tag index (< RADIX_TREE_MAX_TAGS) + * + * Performs an index-ascending scan of the tree for present items which + * have the tag indexed by @tag set. Places the slots at *@results and + * returns the number of slots which were placed at *@results. + */ +unsigned int +radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items, + unsigned int tag) +{ + struct radix_tree_node *node; + unsigned long max_index; + unsigned long cur_index = first_index; + unsigned int ret; + + /* check the root's tag bit */ + if (!root_tag_get(root, tag)) + return 0; + + node = rcu_dereference_raw(root->rnode); + if (!node) + return 0; + + if (!radix_tree_is_indirect_ptr(node)) { + if (first_index > 0) + return 0; + results[0] = (void **)&root->rnode; + return 1; + } + node = radix_tree_indirect_to_ptr(node); + + max_index = radix_tree_maxindex(node->height); + + ret = 0; + while (ret < max_items) { + unsigned int slots_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + slots_found = __lookup_tag(node, results + ret, + cur_index, max_items - ret, &next_index, tag); + ret += slots_found; + if (next_index == 0) + break; + cur_index = next_index; + } + + return ret; +} +EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); + + +/** + * radix_tree_shrink - shrink height of a radix tree to minimal + * @root radix tree root + */ +static inline void radix_tree_shrink(struct radix_tree_root *root) +{ + /* try to shrink tree height */ + while (root->height > 0) { + struct radix_tree_node *to_free = root->rnode; + void *newptr; + + BUG_ON(!radix_tree_is_indirect_ptr(to_free)); + to_free = radix_tree_indirect_to_ptr(to_free); + + /* + * The candidate node has more than one child, or its child + * is not at the leftmost slot, we cannot shrink. + */ + if (to_free->count != 1) + break; + if (!to_free->slots[0]) + break; + + /* + * We don't need rcu_assign_pointer(), since we are simply + * moving the node from one part of the tree to another. If + * it was safe to dereference the old pointer to it + * (to_free->slots[0]), it will be safe to dereference the new + * one (root->rnode). + */ + newptr = to_free->slots[0]; + if (root->height > 1) + newptr = radix_tree_ptr_to_indirect(newptr); + root->rnode = newptr; + root->height--; + radix_tree_node_free(to_free); + } +} + +/** * radix_tree_delete - delete an item from a radix tree * @root: radix tree root * @index: index key @@ -659,82 +1103,88 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag); */ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { - struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; - struct radix_tree_path *orig_pathp; - struct radix_tree_node *slot; + /* + * The radix tree path needs to be one longer than the maximum path + * since the "list" is null terminated. + */ + struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; + struct radix_tree_node *slot = NULL; + struct radix_tree_node *to_free; unsigned int height, shift; - void *ret = NULL; - char tags[RADIX_TREE_TAGS]; - int nr_cleared_tags; + int tag; + int offset; height = root->height; if (index > radix_tree_maxindex(height)) goto out; - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - pathp->node = NULL; slot = root->rnode; + if (height == 0) { + root_tag_clear_all(root); + root->rnode = NULL; + goto out; + } + slot = radix_tree_indirect_to_ptr(slot); - for ( ; height > 0; height--) { - int offset; + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + pathp->node = NULL; + do { if (slot == NULL) goto out; + pathp++; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - pathp[1].offset = offset; - pathp[1].node = slot; + pathp->offset = offset; + pathp->node = slot; slot = slot->slots[offset]; - pathp++; shift -= RADIX_TREE_MAP_SHIFT; - } + height--; + } while (height > 0); - ret = slot; - if (ret == NULL) + if (slot == NULL) goto out; - orig_pathp = pathp; - /* * Clear all tags associated with the just-deleted item */ - memset(tags, 0, sizeof(tags)); - do { - int tag; - - nr_cleared_tags = RADIX_TREE_TAGS; - for (tag = 0; tag < RADIX_TREE_TAGS; tag++) { - int idx; - - if (tags[tag]) - continue; - - tag_clear(pathp->node, tag, pathp->offset); - - for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { - if (pathp->node->tags[tag][idx]) { - tags[tag] = 1; - nr_cleared_tags--; - break; - } - } - } - pathp--; - } while (pathp->node && nr_cleared_tags); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (tag_get(pathp->node, tag, pathp->offset)) + radix_tree_tag_clear(root, index, tag); + } + to_free = NULL; /* Now free the nodes we do not need anymore */ - for (pathp = orig_pathp; pathp->node; pathp--) { + while (pathp->node) { pathp->node->slots[pathp->offset] = NULL; - if (--pathp->node->count) + pathp->node->count--; + /* + * Queue the node for deferred freeing after the + * last reference to it disappears (set NULL, above). + */ + if (to_free) + radix_tree_node_free(to_free); + + if (pathp->node->count) { + if (pathp->node == + radix_tree_indirect_to_ptr(root->rnode)) + radix_tree_shrink(root); goto out; + } /* Node with zero slots in use so free it */ - radix_tree_node_free(pathp->node); + to_free = pathp->node; + pathp--; + } - root->rnode = NULL; + root_tag_clear_all(root); root->height = 0; + root->rnode = NULL; + if (to_free) + radix_tree_node_free(to_free); + out: - return ret; + return slot; } EXPORT_SYMBOL(radix_tree_delete); @@ -743,34 +1193,28 @@ EXPORT_SYMBOL(radix_tree_delete); * @root: radix tree root * @tag: tag to test */ -int radix_tree_tagged(struct radix_tree_root *root, int tag) +int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag) { - int idx; - - if (!root->rnode) - return 0; - for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { - if (root->rnode->tags[tag][idx]) - return 1; - } - return 0; + return root_tag_get(root, tag); } EXPORT_SYMBOL(radix_tree_tagged); static void -radix_tree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags) +radix_tree_node_ctor(void *node) { memset(node, 0, sizeof(struct radix_tree_node)); } static __init unsigned long __maxindex(unsigned int height) { - unsigned int tmp = height * RADIX_TREE_MAP_SHIFT; - unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1; - - if (tmp >= RADIX_TREE_INDEX_BITS) - index = ~0UL; - return index; + unsigned int width = height * RADIX_TREE_MAP_SHIFT; + int shift = RADIX_TREE_INDEX_BITS - width; + + if (shift < 0) + return ~0UL; + if (shift >= BITS_PER_LONG) + return 0UL; + return ~0UL >> shift; } static __init void radix_tree_init_maxindex(void) @@ -781,7 +1225,6 @@ static __init void radix_tree_init_maxindex(void) height_to_maxindex[i] = __maxindex(i); } -#ifdef CONFIG_HOTPLUG_CPU static int radix_tree_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -790,7 +1233,7 @@ static int radix_tree_callback(struct notifier_block *nfb, struct radix_tree_preload *rtp; /* Free per-cpu pool of perloaded nodes */ - if (action == CPU_DEAD) { + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { rtp = &per_cpu(radix_tree_preloads, cpu); while (rtp->nr) { kmem_cache_free(radix_tree_node_cachep, @@ -801,13 +1244,13 @@ static int radix_tree_callback(struct notifier_block *nfb, } return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ void __init radix_tree_init(void) { radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, - SLAB_PANIC, radix_tree_node_ctor, NULL); + SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, + radix_tree_node_ctor); radix_tree_init_maxindex(); hotcpu_notifier(radix_tree_callback, 0); }