2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally described in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
28 * Code from fib_hash has been reused which includes the following header:
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
35 * IPv4 FIB: lookup engine and maintenance routines.
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
45 * Substantial contributions to this work comes from:
47 * David S. Miller, <davem@davemloft.net>
48 * Stephen Hemminger <shemminger@osdl.org>
49 * Paul E. McKenney <paulmck@us.ibm.com>
50 * Patrick McHardy <kaber@trash.net>
53 #define VERSION "0.408"
55 #include <asm/uaccess.h>
56 #include <asm/system.h>
57 #include <asm/bitops.h>
58 #include <linux/types.h>
59 #include <linux/kernel.h>
61 #include <linux/string.h>
62 #include <linux/socket.h>
63 #include <linux/sockios.h>
64 #include <linux/errno.h>
66 #include <linux/inet.h>
67 #include <linux/inetdevice.h>
68 #include <linux/netdevice.h>
69 #include <linux/if_arp.h>
70 #include <linux/proc_fs.h>
71 #include <linux/rcupdate.h>
72 #include <linux/skbuff.h>
73 #include <linux/netlink.h>
74 #include <linux/init.h>
75 #include <linux/list.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
84 #undef CONFIG_IP_FIB_TRIE_STATS
#define MAX_STAT_DEPTH 32

/* Number of bits in a key (t_key is 32 bits wide, so this is 32). */
#define KEYLENGTH (8*sizeof(t_key))

/*
 * Keep the top 'l' bits of key 'k', zeroing the rest.
 * Fix: every macro argument is parenthesized in the expansion so that
 * compound arguments (e.g. MASK_PFX(k, pos + bits)) expand correctly;
 * the original left 'k' and 'l' bare, giving wrong precedence and a
 * potential out-of-range shift.
 */
#define MASK_PFX(k, l) (((l) == 0) ? 0 : (((k) >> (KEYLENGTH-(l))) << (KEYLENGTH-(l))))

/*
 * Build a mask of 'bits' one-bits starting 'offset' bits from the MSB.
 * Same parenthesization fix applied to 'offset' and 'bits'.
 */
#define TKEY_GET_MASK(offset, bits) (((bits) == 0) ? 0 : (((t_key)(-1) << (KEYLENGTH - (bits))) >> (offset)))

typedef unsigned int t_key;
/*
 * The low bit of a node's 'parent' field encodes the node type
 * (leaf vs. internal tnode); the remaining bits hold the parent pointer.
 */
#define NODE_TYPE_MASK 0x1UL
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

/*
 * Fix: parenthesize the macro argument 'n' so that cast or compound
 * arguments (e.g. IS_LEAF((struct node *)p)) dereference correctly;
 * the original expanded to a bare n->parent.
 */
#define IS_TNODE(n) (!((n)->parent & T_LEAF))
#define IS_LEAF(n) ((n)->parent & T_LEAF)
103 unsigned long parent;
108 unsigned long parent;
109 struct hlist_head list;
114 struct hlist_node hlist;
117 struct list_head falh;
122 unsigned long parent;
123 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
124 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
125 unsigned short full_children; /* KEYLENGTH bits needed */
126 unsigned short empty_children; /* KEYLENGTH bits needed */
128 struct node *child[0];
131 #ifdef CONFIG_IP_FIB_TRIE_STATS
132 struct trie_use_stats {
134 unsigned int backtrack;
135 unsigned int semantic_match_passed;
136 unsigned int semantic_match_miss;
137 unsigned int null_node_hit;
138 unsigned int resize_node_skipped;
143 unsigned int totdepth;
144 unsigned int maxdepth;
147 unsigned int nullpointers;
148 unsigned int nodesizes[MAX_STAT_DEPTH];
153 #ifdef CONFIG_IP_FIB_TRIE_STATS
154 struct trie_use_stats stats;
157 unsigned int revision;
160 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
161 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
162 static struct node *resize(struct trie *t, struct tnode *tn);
163 static struct tnode *inflate(struct trie *t, struct tnode *tn);
164 static struct tnode *halve(struct trie *t, struct tnode *tn);
165 static void tnode_free(struct tnode *tn);
167 static struct kmem_cache *fn_alias_kmem __read_mostly;
168 static struct trie *trie_local = NULL, *trie_main = NULL;
170 static inline struct tnode *node_parent(struct node *node)
174 ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
175 return rcu_dereference(ret);
178 static inline void node_set_parent(struct node *node, struct tnode *ptr)
180 rcu_assign_pointer(node->parent,
181 (unsigned long)ptr | NODE_TYPE(node));
184 /* rcu_read_lock needs to be held by the caller on the read side */
186 static inline struct node *tnode_get_child(struct tnode *tn, int i)
188 BUG_ON(i >= 1 << tn->bits);
190 return rcu_dereference(tn->child[i]);
193 static inline int tnode_child_length(const struct tnode *tn)
195 return 1 << tn->bits;
198 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
200 if (offset < KEYLENGTH)
201 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
206 static inline int tkey_equals(t_key a, t_key b)
211 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
213 if (bits == 0 || offset >= KEYLENGTH)
215 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
216 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
219 static inline int tkey_mismatch(t_key a, int offset, t_key b)
226 while ((diff << i) >> (KEYLENGTH-1) == 0)
232 To understand this stuff, an understanding of keys and all their bits is
233 necessary. Every node in the trie has a key associated with it, but not
234 all of the bits in that key are significant.
236 Consider a node 'n' and its parent 'tp'.
238 If n is a leaf, every bit in its key is significant. Its presence is
239 necessitated by path compression, since during a tree traversal (when
240 searching for a leaf - unless we are doing an insertion) we will completely
241 ignore all skipped bits we encounter. Thus we need to verify, at the end of
242 a potentially successful search, that we have indeed been walking the
245 Note that we can never "miss" the correct key in the tree if present by
246 following the wrong path. Path compression ensures that segments of the key
247 that are the same for all keys with a given prefix are skipped, but the
248 skipped part *is* identical for each node in the subtrie below the skipped
249 bit! trie_insert() in this implementation takes care of that - note the
250 call to tkey_sub_equals() in trie_insert().
252 if n is an internal node - a 'tnode' here, the various parts of its key
253 have many different meanings.
256 _________________________________________________________________
257 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
258 -----------------------------------------------------------------
259 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
261 _________________________________________________________________
262 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
263 -----------------------------------------------------------------
264 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
271 First, let's just ignore the bits that come before the parent tp, that is
272 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
273 not use them for anything.
275 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
276 index into the parent's child array. That is, they will be used to find
277 'n' among tp's children.
279 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
282 All the bits we have seen so far are significant to the node n. The rest
283 of the bits are really not needed or indeed known in n->key.
285 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
286 n's child array, and will of course be different for each child.
289 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
294 static inline void check_tnode(const struct tnode *tn)
296 WARN_ON(tn && tn->pos+tn->bits > 32);
299 static int halve_threshold = 25;
300 static int inflate_threshold = 50;
301 static int halve_threshold_root = 8;
302 static int inflate_threshold_root = 15;
305 static void __alias_free_mem(struct rcu_head *head)
307 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
308 kmem_cache_free(fn_alias_kmem, fa);
311 static inline void alias_free_mem_rcu(struct fib_alias *fa)
313 call_rcu(&fa->rcu, __alias_free_mem);
316 static void __leaf_free_rcu(struct rcu_head *head)
318 kfree(container_of(head, struct leaf, rcu));
321 static void __leaf_info_free_rcu(struct rcu_head *head)
323 kfree(container_of(head, struct leaf_info, rcu));
326 static inline void free_leaf_info(struct leaf_info *leaf)
328 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
331 static struct tnode *tnode_alloc(unsigned int size)
335 if (size <= PAGE_SIZE)
336 return kcalloc(size, 1, GFP_KERNEL);
338 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
342 return page_address(pages);
345 static void __tnode_free_rcu(struct rcu_head *head)
347 struct tnode *tn = container_of(head, struct tnode, rcu);
348 unsigned int size = sizeof(struct tnode) +
349 (1 << tn->bits) * sizeof(struct node *);
351 if (size <= PAGE_SIZE)
354 free_pages((unsigned long)tn, get_order(size));
357 static inline void tnode_free(struct tnode *tn)
360 struct leaf *l = (struct leaf *) tn;
361 call_rcu_bh(&l->rcu, __leaf_free_rcu);
363 call_rcu(&tn->rcu, __tnode_free_rcu);
366 static struct leaf *leaf_new(void)
368 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
371 INIT_HLIST_HEAD(&l->list);
376 static struct leaf_info *leaf_info_new(int plen)
378 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
381 INIT_LIST_HEAD(&li->falh);
386 static struct tnode* tnode_new(t_key key, int pos, int bits)
388 int nchildren = 1<<bits;
389 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
390 struct tnode *tn = tnode_alloc(sz);
394 tn->parent = T_TNODE;
398 tn->full_children = 0;
399 tn->empty_children = 1<<bits;
402 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
403 (unsigned int) (sizeof(struct node) * 1<<bits));
408 * Check whether a tnode 'n' is "full", i.e. it is an internal node
409 * and no bits are skipped. See discussion in dyntree paper p. 6
412 static inline int tnode_full(const struct tnode *tn, const struct node *n)
414 if (n == NULL || IS_LEAF(n))
417 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
420 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
422 tnode_put_child_reorg(tn, i, n, -1);
426 * Add a child at position i overwriting the old value.
427 * Update the value of full_children and empty_children.
430 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
432 struct node *chi = tn->child[i];
435 BUG_ON(i >= 1<<tn->bits);
438 /* update emptyChildren */
439 if (n == NULL && chi != NULL)
440 tn->empty_children++;
441 else if (n != NULL && chi == NULL)
442 tn->empty_children--;
444 /* update fullChildren */
446 wasfull = tnode_full(tn, chi);
448 isfull = tnode_full(tn, n);
449 if (wasfull && !isfull)
451 else if (!wasfull && isfull)
455 node_set_parent(n, tn);
457 rcu_assign_pointer(tn->child[i], n);
460 static struct node *resize(struct trie *t, struct tnode *tn)
464 struct tnode *old_tn;
465 int inflate_threshold_use;
466 int halve_threshold_use;
472 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
473 tn, inflate_threshold, halve_threshold);
476 if (tn->empty_children == tnode_child_length(tn)) {
481 if (tn->empty_children == tnode_child_length(tn) - 1)
482 for (i = 0; i < tnode_child_length(tn); i++) {
489 /* compress one level */
490 node_set_parent(n, NULL);
495 * Double as long as the resulting node has a number of
496 * nonempty nodes that are above the threshold.
500 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
501 * the Helsinki University of Technology and Matti Tikkanen of Nokia
502 * Telecommunications, page 6:
503 * "A node is doubled if the ratio of non-empty children to all
504 * children in the *doubled* node is at least 'high'."
506 * 'high' in this instance is the variable 'inflate_threshold'. It
507 * is expressed as a percentage, so we multiply it with
508 * tnode_child_length() and instead of multiplying by 2 (since the
509 * child array will be doubled by inflate()) and multiplying
510 * the left-hand side by 100 (to handle the percentage thing) we
511 * multiply the left-hand side by 50.
513 * The left-hand side may look a bit weird: tnode_child_length(tn)
514 * - tn->empty_children is of course the number of non-null children
515 * in the current node. tn->full_children is the number of "full"
516 * children, that is non-null tnodes with a skip value of 0.
517 * All of those will be doubled in the resulting inflated tnode, so
518 * we just count them one extra time here.
520 * A clearer way to write this would be:
522 * to_be_doubled = tn->full_children;
523 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
526 * new_child_length = tnode_child_length(tn) * 2;
528 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
530 * if (new_fill_factor >= inflate_threshold)
532 * ...and so on, tho it would mess up the while () loop.
535 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
539 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
540 * inflate_threshold * new_child_length
542 * expand not_to_be_doubled and to_be_doubled, and shorten:
543 * 100 * (tnode_child_length(tn) - tn->empty_children +
544 * tn->full_children) >= inflate_threshold * new_child_length
546 * expand new_child_length:
547 * 100 * (tnode_child_length(tn) - tn->empty_children +
548 * tn->full_children) >=
549 * inflate_threshold * tnode_child_length(tn) * 2
552 * 50 * (tn->full_children + tnode_child_length(tn) -
553 * tn->empty_children) >= inflate_threshold *
554 * tnode_child_length(tn)
560 /* Keep root node larger */
563 inflate_threshold_use = inflate_threshold_root;
565 inflate_threshold_use = inflate_threshold;
569 while ((tn->full_children > 0 && max_resize-- &&
570 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
571 inflate_threshold_use * tnode_child_length(tn))) {
577 #ifdef CONFIG_IP_FIB_TRIE_STATS
578 t->stats.resize_node_skipped++;
584 if (max_resize < 0) {
586 printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
587 inflate_threshold_root, tn->bits);
589 printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
590 inflate_threshold, tn->bits);
596 * Halve as long as the number of empty children in this
597 * node is above threshold.
601 /* Keep root node larger */
604 halve_threshold_use = halve_threshold_root;
606 halve_threshold_use = halve_threshold;
610 while (tn->bits > 1 && max_resize-- &&
611 100 * (tnode_child_length(tn) - tn->empty_children) <
612 halve_threshold_use * tnode_child_length(tn)) {
618 #ifdef CONFIG_IP_FIB_TRIE_STATS
619 t->stats.resize_node_skipped++;
625 if (max_resize < 0) {
627 printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
628 halve_threshold_root, tn->bits);
630 printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
631 halve_threshold, tn->bits);
634 /* Only one child remains */
635 if (tn->empty_children == tnode_child_length(tn) - 1)
636 for (i = 0; i < tnode_child_length(tn); i++) {
643 /* compress one level */
645 node_set_parent(n, NULL);
650 return (struct node *) tn;
653 static struct tnode *inflate(struct trie *t, struct tnode *tn)
656 struct tnode *oldtnode = tn;
657 int olen = tnode_child_length(tn);
660 pr_debug("In inflate\n");
662 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
665 return ERR_PTR(-ENOMEM);
668 * Preallocate and store tnodes before the actual work so we
669 * don't get into an inconsistent state if memory allocation
670 * fails. In case of failure we return the oldnode and inflate
671 * of tnode is ignored.
674 for (i = 0; i < olen; i++) {
675 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
679 inode->pos == oldtnode->pos + oldtnode->bits &&
681 struct tnode *left, *right;
682 t_key m = TKEY_GET_MASK(inode->pos, 1);
684 left = tnode_new(inode->key&(~m), inode->pos + 1,
689 right = tnode_new(inode->key|m, inode->pos + 1,
697 put_child(t, tn, 2*i, (struct node *) left);
698 put_child(t, tn, 2*i+1, (struct node *) right);
702 for (i = 0; i < olen; i++) {
703 struct node *node = tnode_get_child(oldtnode, i);
704 struct tnode *left, *right;
711 /* A leaf or an internal node with skipped bits */
713 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
714 tn->pos + tn->bits - 1) {
715 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
717 put_child(t, tn, 2*i, node);
719 put_child(t, tn, 2*i+1, node);
723 /* An internal node with two children */
724 inode = (struct tnode *) node;
726 if (inode->bits == 1) {
727 put_child(t, tn, 2*i, inode->child[0]);
728 put_child(t, tn, 2*i+1, inode->child[1]);
734 /* An internal node with more than two children */
736 /* We will replace this node 'inode' with two new
737 * ones, 'left' and 'right', each with half of the
738 * original children. The two new nodes will have
739 * a position one bit further down the key and this
740 * means that the "significant" part of their keys
741 * (see the discussion near the top of this file)
742 * will differ by one bit, which will be "0" in
743 * left's key and "1" in right's key. Since we are
744 * moving the key position by one step, the bit that
745 * we are moving away from - the bit at position
746 * (inode->pos) - is the one that will differ between
747 * left and right. So... we synthesize that bit in the
749 * The mask 'm' below will be a single "one" bit at
750 * the position (inode->pos)
753 /* Use the old key, but set the new significant
757 left = (struct tnode *) tnode_get_child(tn, 2*i);
758 put_child(t, tn, 2*i, NULL);
762 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
763 put_child(t, tn, 2*i+1, NULL);
767 size = tnode_child_length(left);
768 for (j = 0; j < size; j++) {
769 put_child(t, left, j, inode->child[j]);
770 put_child(t, right, j, inode->child[j + size]);
772 put_child(t, tn, 2*i, resize(t, left));
773 put_child(t, tn, 2*i+1, resize(t, right));
777 tnode_free(oldtnode);
781 int size = tnode_child_length(tn);
784 for (j = 0; j < size; j++)
786 tnode_free((struct tnode *)tn->child[j]);
790 return ERR_PTR(-ENOMEM);
794 static struct tnode *halve(struct trie *t, struct tnode *tn)
796 struct tnode *oldtnode = tn;
797 struct node *left, *right;
799 int olen = tnode_child_length(tn);
801 pr_debug("In halve\n");
803 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
806 return ERR_PTR(-ENOMEM);
809 * Preallocate and store tnodes before the actual work so we
810 * don't get into an inconsistent state if memory allocation
811 * fails. In case of failure we return the oldnode and halve
812 * of tnode is ignored.
815 for (i = 0; i < olen; i += 2) {
816 left = tnode_get_child(oldtnode, i);
817 right = tnode_get_child(oldtnode, i+1);
819 /* Two nonempty children */
823 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
828 put_child(t, tn, i/2, (struct node *)newn);
833 for (i = 0; i < olen; i += 2) {
834 struct tnode *newBinNode;
836 left = tnode_get_child(oldtnode, i);
837 right = tnode_get_child(oldtnode, i+1);
839 /* At least one of the children is empty */
841 if (right == NULL) /* Both are empty */
843 put_child(t, tn, i/2, right);
848 put_child(t, tn, i/2, left);
852 /* Two nonempty children */
853 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
854 put_child(t, tn, i/2, NULL);
855 put_child(t, newBinNode, 0, left);
856 put_child(t, newBinNode, 1, right);
857 put_child(t, tn, i/2, resize(t, newBinNode));
859 tnode_free(oldtnode);
863 int size = tnode_child_length(tn);
866 for (j = 0; j < size; j++)
868 tnode_free((struct tnode *)tn->child[j]);
872 return ERR_PTR(-ENOMEM);
876 static void trie_init(struct trie *t)
882 rcu_assign_pointer(t->trie, NULL);
884 #ifdef CONFIG_IP_FIB_TRIE_STATS
885 memset(&t->stats, 0, sizeof(struct trie_use_stats));
889 /* The read side must hold rcu_read_lock; currently this applies to the
890 dump routines via get_fa_head and dump */
892 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
894 struct hlist_head *head = &l->list;
895 struct hlist_node *node;
896 struct leaf_info *li;
898 hlist_for_each_entry_rcu(li, node, head, hlist)
899 if (li->plen == plen)
905 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
907 struct leaf_info *li = find_leaf_info(l, plen);
915 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
917 struct leaf_info *li = NULL, *last = NULL;
918 struct hlist_node *node;
920 if (hlist_empty(head)) {
921 hlist_add_head_rcu(&new->hlist, head);
923 hlist_for_each_entry(li, node, head, hlist) {
924 if (new->plen > li->plen)
930 hlist_add_after_rcu(&last->hlist, &new->hlist);
932 hlist_add_before_rcu(&new->hlist, &li->hlist);
936 /* rcu_read_lock needs to be held by the caller on the read side */
939 fib_find_node(struct trie *t, u32 key)
946 n = rcu_dereference(t->trie);
948 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
949 tn = (struct tnode *) n;
953 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
954 pos = tn->pos + tn->bits;
955 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
959 /* Case we have found a leaf. Compare prefixes */
961 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
962 return (struct leaf *)n;
967 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
970 t_key cindex, key = tn->key;
973 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
974 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
975 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
976 tn = (struct tnode *) resize (t, (struct tnode *)tn);
977 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
979 tp = node_parent((struct node *) tn);
985 /* Handle last (top) tnode */
987 tn = (struct tnode*) resize(t, (struct tnode *)tn);
989 return (struct node*) tn;
992 /* only used from updater-side */
994 static struct list_head *
995 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
998 struct tnode *tp = NULL, *tn = NULL;
1002 struct list_head *fa_head = NULL;
1003 struct leaf_info *li;
1009 /* If we point to NULL, stop. Either the tree is empty and we should
1010 * just put a new leaf in it, or we have reached an empty child slot,
1011 * and we should just put our new leaf in that.
1012 * If we point to a T_TNODE, check if it matches our key. Note that
1013 * a T_TNODE might be skipping any number of bits - its 'pos' need
1014 * not be the parent's 'pos'+'bits'!
1016 * If it does match the current key, get pos/bits from it, extract
1017 * the index from our key, push the T_TNODE and walk the tree.
1019 * If it doesn't, we have to replace it with a new T_TNODE.
1021 * If we point to a T_LEAF, it might or might not have the same key
1022 * as we do. If it does, just change the value, update the T_LEAF's
1023 * value, and return it.
1024 * If it doesn't, we need to replace it with a T_TNODE.
1027 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1028 tn = (struct tnode *) n;
1032 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1034 pos = tn->pos + tn->bits;
1035 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1037 BUG_ON(n && node_parent(n) != tn);
1043 * n ----> NULL, LEAF or TNODE
1045 * tp is n's (parent) ----> NULL or TNODE
1048 BUG_ON(tp && IS_LEAF(tp));
1050 /* Case 1: n is a leaf. Compare prefixes */
1052 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1053 struct leaf *l = (struct leaf *) n;
1055 li = leaf_info_new(plen);
1062 fa_head = &li->falh;
1063 insert_leaf_info(&l->list, li);
1075 li = leaf_info_new(plen);
1078 tnode_free((struct tnode *) l);
1083 fa_head = &li->falh;
1084 insert_leaf_info(&l->list, li);
1086 if (t->trie && n == NULL) {
1087 /* Case 2: n is NULL, and will just insert a new leaf */
1089 node_set_parent((struct node *)l, tp);
1091 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1092 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1094 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1096 * Add a new tnode here
1097 * first tnode need some special handling
1101 pos = tp->pos+tp->bits;
1106 newpos = tkey_mismatch(key, pos, n->key);
1107 tn = tnode_new(n->key, newpos, 1);
1110 tn = tnode_new(key, newpos, 1); /* First tnode */
1115 tnode_free((struct tnode *) l);
1120 node_set_parent((struct node *)tn, tp);
1122 missbit = tkey_extract_bits(key, newpos, 1);
1123 put_child(t, tn, missbit, (struct node *)l);
1124 put_child(t, tn, 1-missbit, n);
1127 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1128 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1130 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1135 if (tp && tp->pos + tp->bits > 32)
1136 printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1137 tp, tp->pos, tp->bits, key, plen);
1139 /* Rebalance the trie */
1141 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1149 * Caller must hold RTNL.
1151 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1153 struct trie *t = (struct trie *) tb->tb_data;
1154 struct fib_alias *fa, *new_fa;
1155 struct list_head *fa_head = NULL;
1156 struct fib_info *fi;
1157 int plen = cfg->fc_dst_len;
1158 u8 tos = cfg->fc_tos;
1166 key = ntohl(cfg->fc_dst);
1168 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1170 mask = ntohl(inet_make_mask(plen));
1177 fi = fib_create_info(cfg);
1183 l = fib_find_node(t, key);
1187 fa_head = get_fa_head(l, plen);
1188 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1191 /* Now fa, if non-NULL, points to the first fib alias
1192 * with the same keys [prefix,tos,priority], if such key already
1193 * exists or to the node before which we will insert new one.
1195 * If fa is NULL, we will need to allocate a new one and
1196 * insert to the head of f.
1198 * If f is NULL, no fib node matched the destination key
1199 * and we need to allocate a new one of those as well.
1202 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1203 struct fib_alias *fa_orig;
1206 if (cfg->fc_nlflags & NLM_F_EXCL)
1209 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1210 struct fib_info *fi_drop;
1214 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1218 fi_drop = fa->fa_info;
1219 new_fa->fa_tos = fa->fa_tos;
1220 new_fa->fa_info = fi;
1221 new_fa->fa_type = cfg->fc_type;
1222 new_fa->fa_scope = cfg->fc_scope;
1223 state = fa->fa_state;
1224 new_fa->fa_state &= ~FA_S_ACCESSED;
1226 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1227 alias_free_mem_rcu(fa);
1229 fib_release_info(fi_drop);
1230 if (state & FA_S_ACCESSED)
1232 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1233 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1237 /* Error if we find a perfect match which
1238 * uses the same scope, type, and nexthop
1242 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1243 if (fa->fa_tos != tos)
1245 if (fa->fa_info->fib_priority != fi->fib_priority)
1247 if (fa->fa_type == cfg->fc_type &&
1248 fa->fa_scope == cfg->fc_scope &&
1249 fa->fa_info == fi) {
1253 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1257 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1261 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1265 new_fa->fa_info = fi;
1266 new_fa->fa_tos = tos;
1267 new_fa->fa_type = cfg->fc_type;
1268 new_fa->fa_scope = cfg->fc_scope;
1269 new_fa->fa_state = 0;
1271 * Insert new entry to the list.
1276 fa_head = fib_insert_node(t, &err, key, plen);
1278 goto out_free_new_fa;
1281 list_add_tail_rcu(&new_fa->fa_list,
1282 (fa ? &fa->fa_list : fa_head));
1285 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1286 &cfg->fc_nlinfo, 0);
1291 kmem_cache_free(fn_alias_kmem, new_fa);
1293 fib_release_info(fi);
1299 /* should be called with rcu_read_lock */
1300 static inline int check_leaf(struct trie *t, struct leaf *l,
1301 t_key key, int *plen, const struct flowi *flp,
1302 struct fib_result *res)
1306 struct leaf_info *li;
1307 struct hlist_head *hhead = &l->list;
1308 struct hlist_node *node;
1310 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1312 mask = inet_make_mask(i);
1313 if (l->key != (key & ntohl(mask)))
1316 if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
1318 #ifdef CONFIG_IP_FIB_TRIE_STATS
1319 t->stats.semantic_match_passed++;
1323 #ifdef CONFIG_IP_FIB_TRIE_STATS
1324 t->stats.semantic_match_miss++;
1331 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1333 struct trie *t = (struct trie *) tb->tb_data;
1338 t_key key = ntohl(flp->fl4_dst);
1341 int current_prefix_length = KEYLENGTH;
1343 t_key node_prefix, key_prefix, pref_mismatch;
1348 n = rcu_dereference(t->trie);
1352 #ifdef CONFIG_IP_FIB_TRIE_STATS
1358 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1362 pn = (struct tnode *) n;
1370 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1372 n = tnode_get_child(pn, cindex);
1375 #ifdef CONFIG_IP_FIB_TRIE_STATS
1376 t->stats.null_node_hit++;
1382 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1390 cn = (struct tnode *)n;
1393 * It's a tnode, and we can do some extra checks here if we
1394 * like, to avoid descending into a dead-end branch.
1395 * This tnode is in the parent's child array at index
1396 * key[p_pos..p_pos+p_bits] but potentially with some bits
1397 * chopped off, so in reality the index may be just a
1398 * subprefix, padded with zero at the end.
1399 * We can also take a look at any skipped bits in this
1400 * tnode - everything up to p_pos is supposed to be ok,
1401 * and the non-chopped bits of the index (see previous
1402 * paragraph) are also guaranteed ok, but the rest is
1403 * considered unknown.
1405 * The skipped bits are key[pos+bits..cn->pos].
1408 /* If current_prefix_length < pos+bits, we are already doing
1409 * actual prefix matching, which means everything from
1410 * pos+(bits-chopped_off) onward must be zero along some
1411 * branch of this subtree - otherwise there is *no* valid
1412 * prefix present. Here we can only check the skipped
1413 * bits. Remember, since we have already indexed into the
1414 * parent's child array, we know that the bits we chopped off
1418 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1420 if (current_prefix_length < pos+bits) {
1421 if (tkey_extract_bits(cn->key, current_prefix_length,
1422 cn->pos - current_prefix_length) != 0 ||
1428 * If chopped_off=0, the index is fully validated and we
1429 * only need to look at the skipped bits for this, the new,
1430 * tnode. What we actually want to do is to find out if
1431 * these skipped bits match our key perfectly, or if we will
1432 * have to count on finding a matching prefix further down,
1433 * because if we do, we would like to have some way of
1434 * verifying the existence of such a prefix at this point.
1437 /* The only thing we can do at this point is to verify that
1438 * any such matching prefix can indeed be a prefix to our
1439 * key, and if the bits in the node we are inspecting that
1440 * do not match our key are not ZERO, this cannot be true.
1441 * Thus, find out where there is a mismatch (before cn->pos)
1442 * and verify that all the mismatching bits are zero in the
1446 /* Note: We aren't very concerned about the piece of the key
1447 * that precede pn->pos+pn->bits, since these have already been
1448 * checked. The bits after cn->pos aren't checked since these are
1449 * by definition "unknown" at this point. Thus, what we want to
1450 * see is if we are about to enter the "prefix matching" state,
1451 * and in that case verify that the skipped bits that will prevail
1452 * throughout this subtree are zero, as they have to be if we are
1453 * to find a matching prefix.
1456 node_prefix = MASK_PFX(cn->key, cn->pos);
1457 key_prefix = MASK_PFX(key, cn->pos);
1458 pref_mismatch = key_prefix^node_prefix;
1461 /* In short: If skipped bits in this node do not match the search
1462 * key, enter the "prefix matching" state directly.
1464 if (pref_mismatch) {
1465 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1467 pref_mismatch = pref_mismatch <<1;
1469 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1471 if (key_prefix != 0)
1474 if (current_prefix_length >= cn->pos)
1475 current_prefix_length = mp;
1478 pn = (struct tnode *)n; /* Descend */
1485 /* As zero don't change the child key (cindex) */
1486 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1489 /* Decrease current_... with bits chopped off */
1490 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1491 current_prefix_length = pn->pos + pn->bits - chopped_off;
1494 * Either we do the actual chop off according or if we have
1495 * chopped off all bits in this tnode walk up to our parent.
1498 if (chopped_off <= pn->bits) {
1499 cindex &= ~(1 << (chopped_off-1));
1501 struct tnode *parent = node_parent((struct node *) pn);
1505 /* Get Child's index */
1506 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1510 #ifdef CONFIG_IP_FIB_TRIE_STATS
1511 t->stats.backtrack++;
1523 /* only called from updater side */
/*
 * trie_leaf_remove() - find the leaf indexed by @key, unlink it and
 * rebalance the trie.  Skipped bits are NOT verified during the
 * descent, so the reached leaf must still be compared against @key.
 * NOTE(review): this is a sampled extract; some interior lines of the
 * function are not visible here.
 */
1524 static int trie_leaf_remove(struct trie *t, t_key key)
1527 struct tnode *tp = NULL;
1528 struct node *n = t->trie;
1531 pr_debug("entering trie_leaf_remove(%p)\n", n);
1533 /* Note that in the case of skipped bits, those bits are *not* checked!
1534 * When we finish this, we will have NULL or a T_LEAF, and the
1535 * T_LEAF may or may not match our key.
/* Descend: at each tnode use the key bits [pos, pos+bits) as the
 * child index; the parent back-pointer is sanity-checked. */
1538 while (n != NULL && IS_TNODE(n)) {
1539 struct tnode *tn = (struct tnode *) n;
1541 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1543 BUG_ON(n && node_parent(n) != tn);
1545 l = (struct leaf *) n;
/* Not found, or the leaf reached does not actually carry @key. */
1547 if (!n || !tkey_equals(l->key, key))
1552 * Remove the leaf and rebalance the tree
1558 tp = node_parent(n);
1559 tnode_free((struct tnode *) n);
/* Non-root leaf: clear the parent's child slot and publish the
 * rebalanced subtree via RCU; otherwise the trie becomes empty. */
1562 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1563 put_child(t, (struct tnode *)tp, cindex, NULL);
1564 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1566 rcu_assign_pointer(t->trie, NULL);
1572 * Caller must hold RTNL.
/*
 * fn_trie_delete() - delete the fib_alias matching @cfg (tos, type,
 * scope, protocol, nexthop) from the /plen leaf for cfg->fc_dst.
 * Empty leaf_info and leaf structures are torn down afterwards.
 */
1574 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1576 struct trie *t = (struct trie *) tb->tb_data;
1578 int plen = cfg->fc_dst_len;
1579 u8 tos = cfg->fc_tos;
1580 struct fib_alias *fa, *fa_to_delete;
1581 struct list_head *fa_head;
1583 struct leaf_info *li;
1588 key = ntohl(cfg->fc_dst);
1589 mask = ntohl(inet_make_mask(plen));
1595 l = fib_find_node(t, key);
1600 fa_head = get_fa_head(l, plen);
1601 fa = fib_find_alias(fa_head, tos, 0);
1606 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1608 fa_to_delete = NULL;
/* Step back one entry so the walk below starts at the alias that
 * fib_find_alias() returned (list_for_each_entry begins at ->next). */
1609 fa_head = fa->fa_list.prev;
1611 list_for_each_entry(fa, fa_head, fa_list) {
1612 struct fib_info *fi = fa->fa_info;
1614 if (fa->fa_tos != tos)
/* Unspecified (zero/NOWHERE) selectors in @cfg act as wildcards. */
1617 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1618 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1619 fa->fa_scope == cfg->fc_scope) &&
1620 (!cfg->fc_protocol ||
1621 fi->fib_protocol == cfg->fc_protocol) &&
1622 fib_nh_match(cfg, fi) == 0) {
/* Notify listeners, then unlink the alias under RCU. */
1632 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1633 &cfg->fc_nlinfo, 0);
1635 l = fib_find_node(t, key);
1636 li = find_leaf_info(l, plen);
1638 list_del_rcu(&fa->fa_list);
/* Drop the per-prefix-length leaf_info when its list empties,
 * and the whole leaf when no leaf_info remains. */
1640 if (list_empty(fa_head)) {
1641 hlist_del_rcu(&li->hlist);
1645 if (hlist_empty(&l->list))
1646 trie_leaf_remove(t, key);
1648 if (fa->fa_state & FA_S_ACCESSED)
1651 fib_release_info(fa->fa_info);
1652 alias_free_mem_rcu(fa);
/*
 * trie_flush_list() - remove all aliases on @head whose fib_info is
 * marked RTNH_F_DEAD.  Returns the number flushed (counting lines not
 * visible in this extract).
 */
1656 static int trie_flush_list(struct trie *t, struct list_head *head)
1658 struct fib_alias *fa, *fa_node;
1661 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1662 struct fib_info *fi = fa->fa_info;
1664 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
/* RCU-unlink, then release the fib_info and the alias memory. */
1665 list_del_rcu(&fa->fa_list);
1666 fib_release_info(fa->fa_info);
1667 alias_free_mem_rcu(fa);
/*
 * trie_flush_leaf() - flush dead aliases from every leaf_info hanging
 * off leaf @l; empty leaf_infos are unlinked.  Returns the total count
 * flushed across all prefix lengths.
 */
1674 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1677 struct hlist_head *lih = &l->list;
1678 struct hlist_node *node, *tmp;
1679 struct leaf_info *li = NULL;
1681 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1682 found += trie_flush_list(t, &li->falh);
1684 if (list_empty(&li->falh)) {
1685 hlist_del_rcu(&li->hlist);
1692 /* rcu_read_lock must be held by the caller on the read side */
/*
 * nextleaf() - return the leaf following @thisleaf in a depth-first
 * walk of the trie, or the first leaf when @thisleaf is NULL, or NULL
 * when the walk is complete.  Read side; pointers are fetched with
 * rcu_dereference().
 */
1694 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1696 struct node *c = (struct node *) thisleaf;
1699 struct node *trie = rcu_dereference(t->trie);
1705 if (IS_LEAF(trie)) /* trie w. just a leaf */
1706 return (struct leaf *) trie;
1708 p = (struct tnode*) trie; /* Start */
1715 /* Find the next child of the parent */
1717 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1721 last = 1 << p->bits;
1722 for (idx = pos; idx < last ; idx++) {
1723 c = rcu_dereference(p->child[idx]);
1728 /* Descend if tnode */
1729 while (IS_TNODE(c)) {
1730 p = (struct tnode *) c;
/* Find the leftmost non-NULL child of this tnode (skipping NULL
 * slots); the comment below refers to branch selection order. */
1733 /* Rightmost non-NULL branch */
1734 if (p && IS_TNODE(p))
1735 while (!(c = rcu_dereference(p->child[idx]))
1736 && idx < (1<<p->bits)) idx++;
1738 /* Done with this tnode? */
1739 if (idx >= (1 << p->bits) || !c)
1742 return (struct leaf *) c;
1745 /* No more children go up one step */
1746 c = (struct node *) p;
1749 return NULL; /* Ready. Root of trie */
1753 * Caller must hold RTNL.
/*
 * fn_trie_flush() - walk every leaf, flush dead aliases, and remove
 * leaves that become empty.  Leaf removal is deferred by one iteration
 * (via @ll) so the nextleaf() walk is not disturbed; the last leaf is
 * handled after the loop.
 */
1755 static int fn_trie_flush(struct fib_table *tb)
1757 struct trie *t = (struct trie *) tb->tb_data;
1758 struct leaf *ll = NULL, *l = NULL;
1763 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1764 found += trie_flush_leaf(t, l);
1766 if (ll && hlist_empty(&ll->list))
1767 trie_leaf_remove(t, ll->key);
/* Handle the final leaf, which the loop body never revisits. */
1771 if (ll && hlist_empty(&ll->list))
1772 trie_leaf_remove(t, ll->key);
1774 pr_debug("trie_flush found=%d\n", found);
/* Order index of the default route last chosen by
 * fn_trie_select_default(); -1 means none selected yet. */
1778 static int trie_last_dflt = -1;
/*
 * fn_trie_select_default() - choose among the UNICAST aliases of the
 * default (/0) route at res->scope, preferring a live nexthop; dead
 * routes are detected via fib_detect_death() and the choice is cached
 * in trie_last_dflt.
 */
1781 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1783 struct trie *t = (struct trie *) tb->tb_data;
1784 int order, last_idx;
1785 struct fib_info *fi = NULL;
1786 struct fib_info *last_resort;
1787 struct fib_alias *fa = NULL;
1788 struct list_head *fa_head;
/* key 0 == the default route leaf. */
1797 l = fib_find_node(t, 0);
1801 fa_head = get_fa_head(l, 0);
1805 if (list_empty(fa_head))
1808 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1809 struct fib_info *next_fi = fa->fa_info;
/* Only unicast aliases at the requested scope are candidates. */
1811 if (fa->fa_scope != res->scope ||
1812 fa->fa_type != RTN_UNICAST)
1815 if (next_fi->fib_priority > res->fi->fib_priority)
/* Require a gateway with link scope to act as a default route. */
1817 if (!next_fi->fib_nh[0].nh_gw ||
1818 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1820 fa->fa_state |= FA_S_ACCESSED;
1823 if (next_fi != res->fi)
1825 } else if (!fib_detect_death(fi, order, &last_resort,
1826 &last_idx, &trie_last_dflt)) {
1828 fib_info_put(res->fi);
1830 atomic_inc(&fi->fib_clntref);
1831 trie_last_dflt = order;
/* No usable candidate found during the walk. */
1837 if (order <= 0 || fi == NULL) {
1838 trie_last_dflt = -1;
1842 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1844 fib_info_put(res->fi);
1846 atomic_inc(&fi->fib_clntref);
1847 trie_last_dflt = order;
/* Fall back to the last resort entry remembered by
 * fib_detect_death(), if any. */
1850 if (last_idx >= 0) {
1852 fib_info_put(res->fi);
1853 res->fi = last_resort;
1855 atomic_inc(&last_resort->fib_clntref);
1857 trie_last_dflt = last_idx;
/*
 * fn_trie_dump_fa() - netlink-dump every alias on @fah for prefix
 * @key/@plen into @skb; returns < 0 when the skb fills up so the dump
 * can resume later from cb state.
 */
1862 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1863 struct sk_buff *skb, struct netlink_callback *cb)
1866 struct fib_alias *fa;
1868 __be32 xkey = htonl(key);
1873 /* rcu_read_lock is held by the caller */
1875 list_for_each_entry_rcu(fa, fah, fa_list) {
1880 BUG_ON(!fa->fa_info);
1882 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1891 fa->fa_info, 0) < 0) {
/*
 * fn_trie_dump_plen() - dump all routes of one prefix length @plen,
 * iterating leaves with nextleaf(); cb->args[4..] track resume state
 * and are reset when a new leaf is entered.
 */
1901 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1902 struct netlink_callback *cb)
1905 struct list_head *fa_head;
1906 struct leaf *l = NULL;
1910 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
/* New leaf: clear deeper resume counters (args[4] onward). */
1914 memset(&cb->args[4], 0,
1915 sizeof(cb->args) - 4*sizeof(cb->args[0]));
1917 fa_head = get_fa_head(l, plen);
1922 if (list_empty(fa_head))
1925 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
/*
 * fn_trie_dump() - dump the whole table, longest prefixes first
 * (plen = 32 down to 0); cb->args[3..] hold resume state per prefix
 * length.
 */
1934 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1937 struct trie *t = (struct trie *) tb->tb_data;
1942 for (m = 0; m <= 32; m++) {
/* Starting a new prefix length: reset deeper resume counters. */
1946 memset(&cb->args[3], 0,
1947 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1949 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1962 /* Fix more generic FIB names for init later */
/*
 * fib_hash_init() - allocate and initialize a fib_table whose tb_data
 * embeds a struct trie.  Despite the name, this is the LC-trie
 * implementation standing in for fib_hash (see the comment above).
 * The __init annotation applies only in the single-table build.
 */
1964 #ifdef CONFIG_IP_MULTIPLE_TABLES
1965 struct fib_table * fib_hash_init(u32 id)
1967 struct fib_table * __init fib_hash_init(u32 id)
1970 struct fib_table *tb;
/* Lazily create the shared fib_alias slab cache. */
1973 if (fn_alias_kmem == NULL)
1974 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1975 sizeof(struct fib_alias),
1976 0, SLAB_HWCACHE_ALIGN,
1979 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
/* Wire up the trie implementations of the fib_table operations. */
1985 tb->tb_lookup = fn_trie_lookup;
1986 tb->tb_insert = fn_trie_insert;
1987 tb->tb_delete = fn_trie_delete;
1988 tb->tb_flush = fn_trie_flush;
1989 tb->tb_select_default = fn_trie_select_default;
1990 tb->tb_dump = fn_trie_dump;
1991 memset(tb->tb_data, 0, sizeof(struct trie));
1993 t = (struct trie *) tb->tb_data;
/* Remember the local/main tries for the /proc code below. */
1997 if (id == RT_TABLE_LOCAL)
1999 else if (id == RT_TABLE_MAIN)
2002 if (id == RT_TABLE_LOCAL)
2003 printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
2008 #ifdef CONFIG_PROC_FS
2009 /* Depth first Trie walk iterator */
/* Iterator state for the /proc seq_file walks; further members
 * (index, depth, trie) exist but are not visible in this extract. */
2010 struct fib_trie_iter {
2011 struct tnode *tnode;
/*
 * fib_trie_get_next() - advance the depth-first iterator one node:
 * descend into the next non-NULL child, or pop to the parent and
 * resume at the following sibling index.
 */
2017 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2019 struct tnode *tn = iter->tnode;
2020 unsigned cindex = iter->index;
2023 /* A single entry routing table */
2027 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2028 iter->tnode, iter->index, iter->depth);
2030 while (cindex < (1<<tn->bits)) {
2031 struct node *n = tnode_get_child(tn, cindex);
/* Remember the sibling to visit when we come back up. */
2036 iter->index = cindex + 1;
2038 /* push down one level */
2039 iter->tnode = (struct tnode *) n;
2049 /* Current node exhausted, pop back up */
2050 p = node_parent((struct node *)tn);
/* Resume at the slot just after the one we descended through. */
2052 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
/*
 * fib_trie_get_first() - position @iter at the root of trie @t (RCU
 * read side) and return the first node, or NULL for an empty trie.
 */
2062 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2070 n = rcu_dereference(t->trie);
2077 iter->tnode = (struct tnode *) n;
/*
 * trie_collect_stats() - walk trie @t and accumulate structural
 * statistics (depth, node size histogram, null pointer count) into @s.
 */
2092 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2095 struct fib_trie_iter iter;
2097 memset(s, 0, sizeof(*s));
2100 for (n = fib_trie_get_first(&iter, t); n;
2101 n = fib_trie_get_next(&iter)) {
/* Leaf bookkeeping: total and maximum depth. */
2104 s->totdepth += iter.depth;
2105 if (iter.depth > s->maxdepth)
2106 s->maxdepth = iter.depth;
2108 const struct tnode *tn = (const struct tnode *) n;
/* Histogram of tnode sizes, capped at MAX_STAT_DEPTH buckets. */
2112 if (tn->bits < MAX_STAT_DEPTH)
2113 s->nodesizes[tn->bits]++;
2115 for (i = 0; i < (1<<tn->bits); i++)
2124 * This outputs /proc/net/fib_triestats
/*
 * trie_show_stats() - pretty-print the statistics gathered by
 * trie_collect_stats() into @seq, including an estimated memory
 * footprint derived from node counts and pointer-array sizes.
 */
2126 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2128 unsigned i, max, pointers, bytes, avdepth;
/* Average depth scaled by 100 to print two decimal places. */
2131 avdepth = stat->totdepth*100 / stat->leaves;
2135 seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2136 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2138 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2140 bytes = sizeof(struct leaf) * stat->leaves;
2141 seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2142 bytes += sizeof(struct tnode) * stat->tnodes;
/* Trim trailing empty buckets from the size histogram. */
2144 max = MAX_STAT_DEPTH;
2145 while (max > 0 && stat->nodesizes[max-1] == 0)
2149 for (i = 1; i <= max; i++)
2150 if (stat->nodesizes[i] != 0) {
2151 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2152 pointers += (1<<i) * stat->nodesizes[i];
2154 seq_putc(seq, '\n');
2155 seq_printf(seq, "\tPointers: %d\n", pointers);
2157 bytes += sizeof(struct node *) * pointers;
2158 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2159 seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024);
2161 #ifdef CONFIG_IP_FIB_TRIE_STATS
/* NOTE(review): 't' is not a parameter of this function as shown;
 * verify against the full file that this #ifdef block has 't' in
 * scope when CONFIG_IP_FIB_TRIE_STATS is enabled. */
2162 seq_printf(seq, "Counters:\n---------\n");
2163 seq_printf(seq,"gets = %d\n", t->stats.gets);
2164 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2165 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2166 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2167 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2168 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
/* Counters are reset after every read. */
2170 memset(&(t->stats), 0, sizeof(t->stats));
2172 #endif /* CONFIG_IP_FIB_TRIE_STATS */
/*
 * fib_triestat_seq_show() - single_open show callback: collect and
 * print stats for the local and main tries.  The trie_stat scratch
 * buffer is heap-allocated because it is large.
 */
2175 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2177 struct trie_stat *stat;
2179 stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2183 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2184 sizeof(struct leaf), sizeof(struct tnode));
2187 seq_printf(seq, "Local:\n");
2188 trie_collect_stats(trie_local, stat);
2189 trie_show_stats(seq, stat);
2193 seq_printf(seq, "Main:\n");
2194 trie_collect_stats(trie_main, stat);
2195 trie_show_stats(seq, stat);
/* open() for /proc/net/fib_triestat: whole output in one show call. */
2202 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2204 return single_open(file, fib_triestat_seq_show, NULL);
/* file_operations for /proc/net/fib_triestat (single_open based). */
2207 static const struct file_operations fib_triestat_fops = {
2208 .owner = THIS_MODULE,
2209 .open = fib_triestat_seq_open,
2211 .llseek = seq_lseek,
2212 .release = single_release,
/*
 * fib_trie_get_idx() - seek the iterator to the pos'th node, walking
 * the local trie first and then the main trie; returns NULL when the
 * position is past the end.
 */
2215 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2221 for (n = fib_trie_get_first(iter, trie_local);
2222 n; ++idx, n = fib_trie_get_next(iter)) {
2227 for (n = fib_trie_get_first(iter, trie_main);
2228 n; ++idx, n = fib_trie_get_next(iter)) {
/* seq_file start: emit a header token at pos 0, else seek to pos-1. */
2235 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2239 return SEQ_START_TOKEN;
2240 return fib_trie_get_idx(seq->private, *pos - 1);
/*
 * seq_file next: advance the iterator; when the local trie is
 * exhausted, continue transparently into the main trie.
 */
2243 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2245 struct fib_trie_iter *iter = seq->private;
2249 if (v == SEQ_START_TOKEN)
2250 return fib_trie_get_idx(iter, 0);
2252 v = fib_trie_get_next(iter);
2257 /* continue scan in next trie */
2258 if (iter->trie == trie_local)
2259 return fib_trie_get_first(iter, trie_main);
/* seq_file stop callback (body not visible in this extract). */
2264 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
/* Emit @n levels of three-space indentation into @seq. */
2269 static void seq_indent(struct seq_file *seq, int n)
2271 while (n-- > 0) seq_puts(seq, "   ");
/*
 * rtn_scope() - human-readable name for a route scope; unknown values
 * are formatted into a static buffer (not thread/reader safe, matching
 * the original code).
 */
2274 static inline const char *rtn_scope(enum rt_scope_t s)
2276 static char buf[32];
2279 case RT_SCOPE_UNIVERSE: return "universe";
2280 case RT_SCOPE_SITE: return "site";
2281 case RT_SCOPE_LINK: return "link";
2282 case RT_SCOPE_HOST: return "host";
2283 case RT_SCOPE_NOWHERE: return "nowhere";
2285 snprintf(buf, sizeof(buf), "scope=%d", s);
/* Printable names for the RTN_* route types, indexed by type value. */
2290 static const char *rtn_type_names[__RTN_MAX] = {
2291 [RTN_UNSPEC] = "UNSPEC",
2292 [RTN_UNICAST] = "UNICAST",
2293 [RTN_LOCAL] = "LOCAL",
2294 [RTN_BROADCAST] = "BROADCAST",
2295 [RTN_ANYCAST] = "ANYCAST",
2296 [RTN_MULTICAST] = "MULTICAST",
2297 [RTN_BLACKHOLE] = "BLACKHOLE",
2298 [RTN_UNREACHABLE] = "UNREACHABLE",
2299 [RTN_PROHIBIT] = "PROHIBIT",
2300 [RTN_THROW] = "THROW",
2302 [RTN_XRESOLVE] = "XRESOLVE",
/*
 * rtn_type() - name for a route type, falling back to "type %d" in a
 * static buffer for unknown values (same static-buffer caveat as
 * rtn_scope()).
 */
2305 static inline const char *rtn_type(unsigned t)
2307 static char buf[32];
2309 if (t < __RTN_MAX && rtn_type_names[t])
2310 return rtn_type_names[t];
2311 snprintf(buf, sizeof(buf), "type %d", t);
2315 /* Pretty print the trie */
/*
 * fib_trie_seq_show() - show callback for /proc/net/fib_trie: prints
 * an indented tree with tnode summaries ("+--") and leaves ("|--")
 * followed by their per-prefix-length alias entries.
 */
2316 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2318 const struct fib_trie_iter *iter = seq->private;
2321 if (v == SEQ_START_TOKEN)
/* A root node (no parent) starts a new <local>/<main> section. */
2324 if (!node_parent(n)) {
2325 if (iter->trie == trie_local)
2326 seq_puts(seq, "<local>:\n");
2328 seq_puts(seq, "<main>:\n");
2332 struct tnode *tn = (struct tnode *) n;
2333 __be32 prf = htonl(MASK_PFX(tn->key, tn->pos));
2335 seq_indent(seq, iter->depth-1);
2336 seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
2337 NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2338 tn->empty_children);
2341 struct leaf *l = (struct leaf *) n;
2343 __be32 val = htonl(l->key);
2345 seq_indent(seq, iter->depth);
2346 seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
/* Aliases, longest prefix length first. */
2347 for (i = 32; i >= 0; i--) {
2348 struct leaf_info *li = find_leaf_info(l, i);
2350 struct fib_alias *fa;
2351 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2352 seq_indent(seq, iter->depth+1);
2353 seq_printf(seq, " /%d %s %s", i,
2354 rtn_scope(fa->fa_scope),
2355 rtn_type(fa->fa_type));
2357 seq_printf(seq, "tos =%d\n",
2359 seq_putc(seq, '\n');
/* seq_operations for /proc/net/fib_trie. */
2368 static const struct seq_operations fib_trie_seq_ops = {
2369 .start = fib_trie_seq_start,
2370 .next = fib_trie_seq_next,
2371 .stop = fib_trie_seq_stop,
2372 .show = fib_trie_seq_show,
/*
 * open() for /proc/net/fib_trie: allocate a zeroed fib_trie_iter and
 * attach it as the seq_file's private iterator state.
 */
2375 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2377 struct seq_file *seq;
2379 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2384 rc = seq_open(file, &fib_trie_seq_ops);
2388 seq = file->private_data;
2390 memset(s, 0, sizeof(*s));
/* file_operations for /proc/net/fib_trie; iterator freed on release. */
2398 static const struct file_operations fib_trie_fops = {
2399 .owner = THIS_MODULE,
2400 .open = fib_trie_seq_open,
2402 .llseek = seq_lseek,
2403 .release = seq_release_private,
/*
 * fib_flag_trans() - translate a route type/mask/nexthop into the RTF_*
 * flags used by the legacy /proc/net/route format.  Indices 7 and 8 in
 * the table correspond to RTN_BLACKHOLE and RTN_UNREACHABLE.
 */
2406 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2408 static unsigned type2flags[RTN_MAX + 1] = {
2409 [7] = RTF_REJECT, [8] = RTF_REJECT,
2411 unsigned flags = type2flags[type];
2413 if (fi && fi->fib_nh->nh_gw)
2414 flags |= RTF_GATEWAY;
/* A /32 mask marks a host route. */
2415 if (mask == htonl(0xFFFFFFFF))
2422 * This outputs /proc/net/route.
2423 * The format of the file is not supposed to be changed
2424 * and needs to be same as fib_hash output to avoid breaking
/*
 * fib_route_seq_show() - show callback for the legacy /proc/net/route
 * table.  Only main-table entries are printed; broadcast and multicast
 * routes are skipped to match the historic fib_hash output.
 */
2427 static int fib_route_seq_show(struct seq_file *seq, void *v)
2429 const struct fib_trie_iter *iter = seq->private;
2434 if (v == SEQ_START_TOKEN) {
2435 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2436 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
/* Entries from the local trie are not shown in this file. */
2441 if (iter->trie == trie_local)
2446 for (i=32; i>=0; i--) {
2447 struct leaf_info *li = find_leaf_info(l, i);
2448 struct fib_alias *fa;
2449 __be32 mask, prefix;
2454 mask = inet_make_mask(li->plen);
2455 prefix = htonl(l->key);
2457 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2458 const struct fib_info *fi = fa->fa_info;
2459 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2461 if (fa->fa_type == RTN_BROADCAST
2462 || fa->fa_type == RTN_MULTICAST)
/* One fixed-width line per route, fib_hash-compatible. */
2466 snprintf(bf, sizeof(bf),
2467 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2468 fi->fib_dev ? fi->fib_dev->name : "*",
2470 fi->fib_nh->nh_gw, flags, 0, 0,
2473 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
/* Route with no fib_info: print placeholders. */
2477 snprintf(bf, sizeof(bf),
2478 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2479 prefix, 0, flags, 0, 0, 0,
2482 seq_printf(seq, "%-127s\n", bf);
/* seq_operations for /proc/net/route (reuses the trie iterator). */
2489 static const struct seq_operations fib_route_seq_ops = {
2490 .start = fib_trie_seq_start,
2491 .next = fib_trie_seq_next,
2492 .stop = fib_trie_seq_stop,
2493 .show = fib_route_seq_show,
/*
 * open() for /proc/net/route: same pattern as fib_trie_seq_open() —
 * allocate a zeroed iterator and store it as seq private data.
 */
2496 static int fib_route_seq_open(struct inode *inode, struct file *file)
2498 struct seq_file *seq;
2500 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2505 rc = seq_open(file, &fib_route_seq_ops);
2509 seq = file->private_data;
2511 memset(s, 0, sizeof(*s));
/* file_operations for /proc/net/route. */
2519 static const struct file_operations fib_route_fops = {
2520 .owner = THIS_MODULE,
2521 .open = fib_route_seq_open,
2523 .llseek = seq_lseek,
2524 .release = seq_release_private,
/*
 * fib_proc_init() - register the three /proc/net entries; on failure
 * the previously created entries are removed (unwind labels below).
 */
2527 int __init fib_proc_init(void)
2529 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2532 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2535 if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
/* Error unwind: remove entries created so far. */
2541 proc_net_remove("fib_triestat");
2543 proc_net_remove("fib_trie");
/* Unregister all three /proc/net entries created by fib_proc_init(). */
2548 void __init fib_proc_exit(void)
2550 proc_net_remove("fib_trie");
2551 proc_net_remove("fib_triestat");
2552 proc_net_remove("route");
2555 #endif /* CONFIG_PROC_FS */