2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally described in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
26 * Code from fib_hash has been reused which includes the following header:
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
33 * IPv4 FIB: lookup engine and maintenance routines.
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
43 * Substantial contributions to this work comes from:
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
51 #define VERSION "0.408"
53 #include <asm/uaccess.h>
54 #include <asm/system.h>
55 #include <linux/bitops.h>
56 #include <linux/types.h>
57 #include <linux/kernel.h>
59 #include <linux/string.h>
60 #include <linux/socket.h>
61 #include <linux/sockios.h>
62 #include <linux/errno.h>
64 #include <linux/inet.h>
65 #include <linux/inetdevice.h>
66 #include <linux/netdevice.h>
67 #include <linux/if_arp.h>
68 #include <linux/proc_fs.h>
69 #include <linux/rcupdate.h>
70 #include <linux/skbuff.h>
71 #include <linux/netlink.h>
72 #include <linux/init.h>
73 #include <linux/list.h>
74 #include <net/net_namespace.h>
76 #include <net/protocol.h>
77 #include <net/route.h>
80 #include <net/ip_fib.h>
81 #include "fib_lookup.h"
83 #define MAX_STAT_DEPTH 32
85 #define KEYLENGTH (8*sizeof(t_key))
87 typedef unsigned int t_key;
91 #define NODE_TYPE_MASK 0x1UL
92 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94 #define IS_TNODE(n) (!(n->parent & T_LEAF))
95 #define IS_LEAF(n) (n->parent & T_LEAF)
103 unsigned long parent;
105 struct hlist_head list;
110 struct hlist_node hlist;
113 struct list_head falh;
117 unsigned long parent;
119 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
120 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
121 unsigned int full_children; /* KEYLENGTH bits needed */
122 unsigned int empty_children; /* KEYLENGTH bits needed */
125 struct work_struct work;
126 struct tnode *tnode_free;
128 struct node *child[0];
131 #ifdef CONFIG_IP_FIB_TRIE_STATS
132 struct trie_use_stats {
134 unsigned int backtrack;
135 unsigned int semantic_match_passed;
136 unsigned int semantic_match_miss;
137 unsigned int null_node_hit;
138 unsigned int resize_node_skipped;
143 unsigned int totdepth;
144 unsigned int maxdepth;
147 unsigned int nullpointers;
148 unsigned int prefixes;
149 unsigned int nodesizes[MAX_STAT_DEPTH];
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
159 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
160 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
162 static struct node *resize(struct trie *t, struct tnode *tn);
163 static struct tnode *inflate(struct trie *t, struct tnode *tn);
164 static struct tnode *halve(struct trie *t, struct tnode *tn);
165 /* tnodes to free after resize(); protected by RTNL */
166 static struct tnode *tnode_free_head;
167 static size_t tnode_free_size;
170 * synchronize_rcu after call_rcu for that many pages; it should be especially
171 * useful before resizing the root node with PREEMPT_NONE configs; the value was
172 * obtained experimentally, aiming to avoid visible slowdown.
174 static const int sync_pages = 128;
176 static struct kmem_cache *fn_alias_kmem __read_mostly;
177 static struct kmem_cache *trie_leaf_kmem __read_mostly;
179 static inline struct tnode *node_parent(struct node *node)
181 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
184 static inline struct tnode *node_parent_rcu(struct node *node)
186 struct tnode *ret = node_parent(node);
188 return rcu_dereference(ret);
191 /* Same as rcu_assign_pointer
192 * but that macro() assumes that value is a pointer.
194 static inline void node_set_parent(struct node *node, struct tnode *ptr)
197 node->parent = (unsigned long)ptr | NODE_TYPE(node);
200 static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
202 BUG_ON(i >= 1U << tn->bits);
207 static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
209 struct node *ret = tnode_get_child(tn, i);
211 return rcu_dereference(ret);
214 static inline int tnode_child_length(const struct tnode *tn)
216 return 1 << tn->bits;
219 static inline t_key mask_pfx(t_key k, unsigned short l)
221 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
224 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
226 if (offset < KEYLENGTH)
227 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
232 static inline int tkey_equals(t_key a, t_key b)
237 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
239 if (bits == 0 || offset >= KEYLENGTH)
241 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
242 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
245 static inline int tkey_mismatch(t_key a, int offset, t_key b)
252 while ((diff << i) >> (KEYLENGTH-1) == 0)
258 To understand this stuff, an understanding of keys and all their bits is
259 necessary. Every node in the trie has a key associated with it, but not
260 all of the bits in that key are significant.
262 Consider a node 'n' and its parent 'tp'.
264 If n is a leaf, every bit in its key is significant. Its presence is
265 necessitated by path compression, since during a tree traversal (when
266 searching for a leaf - unless we are doing an insertion) we will completely
267 ignore all skipped bits we encounter. Thus we need to verify, at the end of
268 a potentially successful search, that we have indeed been walking the
271 Note that we can never "miss" the correct key in the tree if present by
272 following the wrong path. Path compression ensures that segments of the key
273 that are the same for all keys with a given prefix are skipped, but the
274 skipped part *is* identical for each node in the subtrie below the skipped
275 bit! trie_insert() in this implementation takes care of that - note the
276 call to tkey_sub_equals() in trie_insert().
278 if n is an internal node - a 'tnode' here, the various parts of its key
279 have many different meanings.
282 _________________________________________________________________
283 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
284 -----------------------------------------------------------------
285 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
287 _________________________________________________________________
288 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
289 -----------------------------------------------------------------
290 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
297 First, let's just ignore the bits that come before the parent tp, that is
298 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
299 not use them for anything.
301 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
302 index into the parent's child array. That is, they will be used to find
303 'n' among tp's children.
305 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
308 All the bits we have seen so far are significant to the node n. The rest
309 of the bits are really not needed or indeed known in n->key.
311 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
312 n's child array, and will of course be different for each child.
315 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
320 static inline void check_tnode(const struct tnode *tn)
322 WARN_ON(tn && tn->pos+tn->bits > 32);
325 static const int halve_threshold = 25;
326 static const int inflate_threshold = 50;
327 static const int halve_threshold_root = 15;
328 static const int inflate_threshold_root = 25;
331 static void __alias_free_mem(struct rcu_head *head)
333 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
334 kmem_cache_free(fn_alias_kmem, fa);
337 static inline void alias_free_mem_rcu(struct fib_alias *fa)
339 call_rcu(&fa->rcu, __alias_free_mem);
342 static void __leaf_free_rcu(struct rcu_head *head)
344 struct leaf *l = container_of(head, struct leaf, rcu);
345 kmem_cache_free(trie_leaf_kmem, l);
348 static inline void free_leaf(struct leaf *l)
350 call_rcu_bh(&l->rcu, __leaf_free_rcu);
353 static void __leaf_info_free_rcu(struct rcu_head *head)
355 kfree(container_of(head, struct leaf_info, rcu));
358 static inline void free_leaf_info(struct leaf_info *leaf)
360 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
363 static struct tnode *tnode_alloc(size_t size)
365 if (size <= PAGE_SIZE)
366 return kzalloc(size, GFP_KERNEL);
368 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
371 static void __tnode_vfree(struct work_struct *arg)
373 struct tnode *tn = container_of(arg, struct tnode, work);
377 static void __tnode_free_rcu(struct rcu_head *head)
379 struct tnode *tn = container_of(head, struct tnode, rcu);
380 size_t size = sizeof(struct tnode) +
381 (sizeof(struct node *) << tn->bits);
383 if (size <= PAGE_SIZE)
386 INIT_WORK(&tn->work, __tnode_vfree);
387 schedule_work(&tn->work);
391 static inline void tnode_free(struct tnode *tn)
394 free_leaf((struct leaf *) tn);
396 call_rcu(&tn->rcu, __tnode_free_rcu);
399 static void tnode_free_safe(struct tnode *tn)
402 tn->tnode_free = tnode_free_head;
403 tnode_free_head = tn;
404 tnode_free_size += sizeof(struct tnode) +
405 (sizeof(struct node *) << tn->bits);
408 static void tnode_free_flush(void)
412 while ((tn = tnode_free_head)) {
413 tnode_free_head = tn->tnode_free;
414 tn->tnode_free = NULL;
418 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
424 static struct leaf *leaf_new(void)
426 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
429 INIT_HLIST_HEAD(&l->list);
434 static struct leaf_info *leaf_info_new(int plen)
436 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
439 INIT_LIST_HEAD(&li->falh);
444 static struct tnode *tnode_new(t_key key, int pos, int bits)
446 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
447 struct tnode *tn = tnode_alloc(sz);
450 tn->parent = T_TNODE;
454 tn->full_children = 0;
455 tn->empty_children = 1<<bits;
458 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
459 (unsigned long) (sizeof(struct node) << bits));
464 * Check whether a tnode 'n' is "full", i.e. it is an internal node
465 * and no bits are skipped. See discussion in dyntree paper p. 6
468 static inline int tnode_full(const struct tnode *tn, const struct node *n)
470 if (n == NULL || IS_LEAF(n))
473 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
476 static inline void put_child(struct trie *t, struct tnode *tn, int i,
479 tnode_put_child_reorg(tn, i, n, -1);
483 * Add a child at position i overwriting the old value.
484 * Update the value of full_children and empty_children.
487 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
490 struct node *chi = tn->child[i];
493 BUG_ON(i >= 1<<tn->bits);
495 /* update emptyChildren */
496 if (n == NULL && chi != NULL)
497 tn->empty_children++;
498 else if (n != NULL && chi == NULL)
499 tn->empty_children--;
501 /* update fullChildren */
503 wasfull = tnode_full(tn, chi);
505 isfull = tnode_full(tn, n);
506 if (wasfull && !isfull)
508 else if (!wasfull && isfull)
512 node_set_parent(n, tn);
514 rcu_assign_pointer(tn->child[i], n);
517 static struct node *resize(struct trie *t, struct tnode *tn)
521 struct tnode *old_tn;
522 int inflate_threshold_use;
523 int halve_threshold_use;
529 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
530 tn, inflate_threshold, halve_threshold);
533 if (tn->empty_children == tnode_child_length(tn)) {
538 if (tn->empty_children == tnode_child_length(tn) - 1)
539 for (i = 0; i < tnode_child_length(tn); i++) {
546 /* compress one level */
547 node_set_parent(n, NULL);
552 * Double as long as the resulting node has a number of
553 * nonempty nodes that are above the threshold.
557 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
558 * the Helsinki University of Technology and Matti Tikkanen of Nokia
559 * Telecommunications, page 6:
560 * "A node is doubled if the ratio of non-empty children to all
561 * children in the *doubled* node is at least 'high'."
563 * 'high' in this instance is the variable 'inflate_threshold'. It
564 * is expressed as a percentage, so we multiply it with
565 * tnode_child_length() and instead of multiplying by 2 (since the
566 * child array will be doubled by inflate()) and multiplying
567 * the left-hand side by 100 (to handle the percentage thing) we
568 * multiply the left-hand side by 50.
570 * The left-hand side may look a bit weird: tnode_child_length(tn)
571 * - tn->empty_children is of course the number of non-null children
572 * in the current node. tn->full_children is the number of "full"
573 * children, that is non-null tnodes with a skip value of 0.
574 * All of those will be doubled in the resulting inflated tnode, so
575 * we just count them one extra time here.
577 * A clearer way to write this would be:
579 * to_be_doubled = tn->full_children;
580 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
583 * new_child_length = tnode_child_length(tn) * 2;
585 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
587 * if (new_fill_factor >= inflate_threshold)
589 * ...and so on, tho it would mess up the while () loop.
592 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
596 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
597 * inflate_threshold * new_child_length
599 * expand not_to_be_doubled and to_be_doubled, and shorten:
600 * 100 * (tnode_child_length(tn) - tn->empty_children +
601 * tn->full_children) >= inflate_threshold * new_child_length
603 * expand new_child_length:
604 * 100 * (tnode_child_length(tn) - tn->empty_children +
605 * tn->full_children) >=
606 * inflate_threshold * tnode_child_length(tn) * 2
609 * 50 * (tn->full_children + tnode_child_length(tn) -
610 * tn->empty_children) >= inflate_threshold *
611 * tnode_child_length(tn)
617 /* Keep root node larger */
620 inflate_threshold_use = inflate_threshold_root;
622 inflate_threshold_use = inflate_threshold;
626 while ((tn->full_children > 0 && max_resize-- &&
627 50 * (tn->full_children + tnode_child_length(tn)
628 - tn->empty_children)
629 >= inflate_threshold_use * tnode_child_length(tn))) {
636 #ifdef CONFIG_IP_FIB_TRIE_STATS
637 t->stats.resize_node_skipped++;
643 if (max_resize < 0) {
645 pr_warning("Fix inflate_threshold_root."
646 " Now=%d size=%d bits\n",
647 inflate_threshold_root, tn->bits);
649 pr_warning("Fix inflate_threshold."
650 " Now=%d size=%d bits\n",
651 inflate_threshold, tn->bits);
657 * Halve as long as the number of empty children in this
658 * node is above threshold.
662 /* Keep root node larger */
665 halve_threshold_use = halve_threshold_root;
667 halve_threshold_use = halve_threshold;
671 while (tn->bits > 1 && max_resize-- &&
672 100 * (tnode_child_length(tn) - tn->empty_children) <
673 halve_threshold_use * tnode_child_length(tn)) {
679 #ifdef CONFIG_IP_FIB_TRIE_STATS
680 t->stats.resize_node_skipped++;
686 if (max_resize < 0) {
688 pr_warning("Fix halve_threshold_root."
689 " Now=%d size=%d bits\n",
690 halve_threshold_root, tn->bits);
692 pr_warning("Fix halve_threshold."
693 " Now=%d size=%d bits\n",
694 halve_threshold, tn->bits);
697 /* Only one child remains */
698 if (tn->empty_children == tnode_child_length(tn) - 1)
699 for (i = 0; i < tnode_child_length(tn); i++) {
706 /* compress one level */
708 node_set_parent(n, NULL);
713 return (struct node *) tn;
716 static struct tnode *inflate(struct trie *t, struct tnode *tn)
718 struct tnode *oldtnode = tn;
719 int olen = tnode_child_length(tn);
722 pr_debug("In inflate\n");
724 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
727 return ERR_PTR(-ENOMEM);
730 * Preallocate and store tnodes before the actual work so we
731 * don't get into an inconsistent state if memory allocation
732 * fails. In case of failure we return the oldnode and inflate
733 * of tnode is ignored.
736 for (i = 0; i < olen; i++) {
739 inode = (struct tnode *) tnode_get_child(oldtnode, i);
742 inode->pos == oldtnode->pos + oldtnode->bits &&
744 struct tnode *left, *right;
745 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
747 left = tnode_new(inode->key&(~m), inode->pos + 1,
752 right = tnode_new(inode->key|m, inode->pos + 1,
760 put_child(t, tn, 2*i, (struct node *) left);
761 put_child(t, tn, 2*i+1, (struct node *) right);
765 for (i = 0; i < olen; i++) {
767 struct node *node = tnode_get_child(oldtnode, i);
768 struct tnode *left, *right;
775 /* A leaf or an internal node with skipped bits */
777 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
778 tn->pos + tn->bits - 1) {
779 if (tkey_extract_bits(node->key,
780 oldtnode->pos + oldtnode->bits,
782 put_child(t, tn, 2*i, node);
784 put_child(t, tn, 2*i+1, node);
788 /* An internal node with two children */
789 inode = (struct tnode *) node;
791 if (inode->bits == 1) {
792 put_child(t, tn, 2*i, inode->child[0]);
793 put_child(t, tn, 2*i+1, inode->child[1]);
795 tnode_free_safe(inode);
799 /* An internal node with more than two children */
801 /* We will replace this node 'inode' with two new
802 * ones, 'left' and 'right', each with half of the
803 * original children. The two new nodes will have
804 * a position one bit further down the key and this
805 * means that the "significant" part of their keys
806 * (see the discussion near the top of this file)
807 * will differ by one bit, which will be "0" in
808 * left's key and "1" in right's key. Since we are
809 * moving the key position by one step, the bit that
810 * we are moving away from - the bit at position
811 * (inode->pos) - is the one that will differ between
812 * left and right. So... we synthesize that bit in the
814 * The mask 'm' below will be a single "one" bit at
815 * the position (inode->pos)
818 /* Use the old key, but set the new significant
822 left = (struct tnode *) tnode_get_child(tn, 2*i);
823 put_child(t, tn, 2*i, NULL);
827 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
828 put_child(t, tn, 2*i+1, NULL);
832 size = tnode_child_length(left);
833 for (j = 0; j < size; j++) {
834 put_child(t, left, j, inode->child[j]);
835 put_child(t, right, j, inode->child[j + size]);
837 put_child(t, tn, 2*i, resize(t, left));
838 put_child(t, tn, 2*i+1, resize(t, right));
840 tnode_free_safe(inode);
842 tnode_free_safe(oldtnode);
846 int size = tnode_child_length(tn);
849 for (j = 0; j < size; j++)
851 tnode_free((struct tnode *)tn->child[j]);
855 return ERR_PTR(-ENOMEM);
859 static struct tnode *halve(struct trie *t, struct tnode *tn)
861 struct tnode *oldtnode = tn;
862 struct node *left, *right;
864 int olen = tnode_child_length(tn);
866 pr_debug("In halve\n");
868 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
871 return ERR_PTR(-ENOMEM);
874 * Preallocate and store tnodes before the actual work so we
875 * don't get into an inconsistent state if memory allocation
876 * fails. In case of failure we return the oldnode and halve
877 * of tnode is ignored.
880 for (i = 0; i < olen; i += 2) {
881 left = tnode_get_child(oldtnode, i);
882 right = tnode_get_child(oldtnode, i+1);
884 /* Two nonempty children */
888 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
893 put_child(t, tn, i/2, (struct node *)newn);
898 for (i = 0; i < olen; i += 2) {
899 struct tnode *newBinNode;
901 left = tnode_get_child(oldtnode, i);
902 right = tnode_get_child(oldtnode, i+1);
904 /* At least one of the children is empty */
906 if (right == NULL) /* Both are empty */
908 put_child(t, tn, i/2, right);
913 put_child(t, tn, i/2, left);
917 /* Two nonempty children */
918 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
919 put_child(t, tn, i/2, NULL);
920 put_child(t, newBinNode, 0, left);
921 put_child(t, newBinNode, 1, right);
922 put_child(t, tn, i/2, resize(t, newBinNode));
924 tnode_free_safe(oldtnode);
928 int size = tnode_child_length(tn);
931 for (j = 0; j < size; j++)
933 tnode_free((struct tnode *)tn->child[j]);
937 return ERR_PTR(-ENOMEM);
941 /* readside must use rcu_read_lock currently dump routines
942 via get_fa_head and dump */
944 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
946 struct hlist_head *head = &l->list;
947 struct hlist_node *node;
948 struct leaf_info *li;
950 hlist_for_each_entry_rcu(li, node, head, hlist)
951 if (li->plen == plen)
957 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
959 struct leaf_info *li = find_leaf_info(l, plen);
967 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
969 struct leaf_info *li = NULL, *last = NULL;
970 struct hlist_node *node;
972 if (hlist_empty(head)) {
973 hlist_add_head_rcu(&new->hlist, head);
975 hlist_for_each_entry(li, node, head, hlist) {
976 if (new->plen > li->plen)
982 hlist_add_after_rcu(&last->hlist, &new->hlist);
984 hlist_add_before_rcu(&new->hlist, &li->hlist);
988 /* rcu_read_lock needs to be hold by caller from readside */
991 fib_find_node(struct trie *t, u32 key)
998 n = rcu_dereference(t->trie);
1000 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1001 tn = (struct tnode *) n;
1005 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1006 pos = tn->pos + tn->bits;
1007 n = tnode_get_child_rcu(tn,
1008 tkey_extract_bits(key,
1014 /* Case we have found a leaf. Compare prefixes */
1016 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
1017 return (struct leaf *)n;
1022 static void trie_rebalance(struct trie *t, struct tnode *tn)
1030 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
1031 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1032 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1033 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1035 tnode_put_child_reorg((struct tnode *)tp, cindex,
1036 (struct node *)tn, wasfull);
1038 tp = node_parent((struct node *) tn);
1040 rcu_assign_pointer(t->trie, (struct node *)tn);
1048 /* Handle last (top) tnode */
1050 tn = (struct tnode *)resize(t, (struct tnode *)tn);
1052 rcu_assign_pointer(t->trie, (struct node *)tn);
1058 /* only used from updater-side */
1060 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1063 struct tnode *tp = NULL, *tn = NULL;
1067 struct list_head *fa_head = NULL;
1068 struct leaf_info *li;
1074 /* If we point to NULL, stop. Either the tree is empty and we should
1075 * just put a new leaf in if, or we have reached an empty child slot,
1076 * and we should just put our new leaf in that.
1077 * If we point to a T_TNODE, check if it matches our key. Note that
1078 * a T_TNODE might be skipping any number of bits - its 'pos' need
1079 * not be the parent's 'pos'+'bits'!
1081 * If it does match the current key, get pos/bits from it, extract
1082 * the index from our key, push the T_TNODE and walk the tree.
1084 * If it doesn't, we have to replace it with a new T_TNODE.
1086 * If we point to a T_LEAF, it might or might not have the same key
1087 * as we do. If it does, just change the value, update the T_LEAF's
1088 * value, and return it.
1089 * If it doesn't, we need to replace it with a T_TNODE.
1092 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1093 tn = (struct tnode *) n;
1097 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1099 pos = tn->pos + tn->bits;
1100 n = tnode_get_child(tn,
1101 tkey_extract_bits(key,
1105 BUG_ON(n && node_parent(n) != tn);
1111 * n ----> NULL, LEAF or TNODE
1113 * tp is n's (parent) ----> NULL or TNODE
1116 BUG_ON(tp && IS_LEAF(tp));
1118 /* Case 1: n is a leaf. Compare prefixes */
1120 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1121 l = (struct leaf *) n;
1122 li = leaf_info_new(plen);
1127 fa_head = &li->falh;
1128 insert_leaf_info(&l->list, li);
1137 li = leaf_info_new(plen);
1144 fa_head = &li->falh;
1145 insert_leaf_info(&l->list, li);
1147 if (t->trie && n == NULL) {
1148 /* Case 2: n is NULL, and will just insert a new leaf */
1150 node_set_parent((struct node *)l, tp);
1152 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1153 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1155 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1157 * Add a new tnode here
1158 * first tnode need some special handling
1162 pos = tp->pos+tp->bits;
1167 newpos = tkey_mismatch(key, pos, n->key);
1168 tn = tnode_new(n->key, newpos, 1);
1171 tn = tnode_new(key, newpos, 1); /* First tnode */
1180 node_set_parent((struct node *)tn, tp);
1182 missbit = tkey_extract_bits(key, newpos, 1);
1183 put_child(t, tn, missbit, (struct node *)l);
1184 put_child(t, tn, 1-missbit, n);
1187 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1188 put_child(t, (struct tnode *)tp, cindex,
1191 rcu_assign_pointer(t->trie, (struct node *)tn);
1196 if (tp && tp->pos + tp->bits > 32)
1197 pr_warning("fib_trie"
1198 " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1199 tp, tp->pos, tp->bits, key, plen);
1201 /* Rebalance the trie */
1203 trie_rebalance(t, tp);
1209 * Caller must hold RTNL.
1211 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1213 struct trie *t = (struct trie *) tb->tb_data;
1214 struct fib_alias *fa, *new_fa;
1215 struct list_head *fa_head = NULL;
1216 struct fib_info *fi;
1217 int plen = cfg->fc_dst_len;
1218 u8 tos = cfg->fc_tos;
1226 key = ntohl(cfg->fc_dst);
1228 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1230 mask = ntohl(inet_make_mask(plen));
1237 fi = fib_create_info(cfg);
1243 l = fib_find_node(t, key);
1247 fa_head = get_fa_head(l, plen);
1248 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1251 /* Now fa, if non-NULL, points to the first fib alias
1252 * with the same keys [prefix,tos,priority], if such key already
1253 * exists or to the node before which we will insert new one.
1255 * If fa is NULL, we will need to allocate a new one and
1256 * insert to the head of f.
1258 * If f is NULL, no fib node matched the destination key
1259 * and we need to allocate a new one of those as well.
1262 if (fa && fa->fa_tos == tos &&
1263 fa->fa_info->fib_priority == fi->fib_priority) {
1264 struct fib_alias *fa_first, *fa_match;
1267 if (cfg->fc_nlflags & NLM_F_EXCL)
1271 * 1. Find exact match for type, scope, fib_info to avoid
1273 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1277 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1278 list_for_each_entry_continue(fa, fa_head, fa_list) {
1279 if (fa->fa_tos != tos)
1281 if (fa->fa_info->fib_priority != fi->fib_priority)
1283 if (fa->fa_type == cfg->fc_type &&
1284 fa->fa_scope == cfg->fc_scope &&
1285 fa->fa_info == fi) {
1291 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1292 struct fib_info *fi_drop;
1302 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1306 fi_drop = fa->fa_info;
1307 new_fa->fa_tos = fa->fa_tos;
1308 new_fa->fa_info = fi;
1309 new_fa->fa_type = cfg->fc_type;
1310 new_fa->fa_scope = cfg->fc_scope;
1311 state = fa->fa_state;
1312 new_fa->fa_state = state & ~FA_S_ACCESSED;
1314 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1315 alias_free_mem_rcu(fa);
1317 fib_release_info(fi_drop);
1318 if (state & FA_S_ACCESSED)
1319 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1320 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1321 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1325 /* Error if we find a perfect match which
1326 * uses the same scope, type, and nexthop
1332 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1336 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1340 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1344 new_fa->fa_info = fi;
1345 new_fa->fa_tos = tos;
1346 new_fa->fa_type = cfg->fc_type;
1347 new_fa->fa_scope = cfg->fc_scope;
1348 new_fa->fa_state = 0;
1350 * Insert new entry to the list.
1354 fa_head = fib_insert_node(t, key, plen);
1355 if (unlikely(!fa_head)) {
1357 goto out_free_new_fa;
1361 list_add_tail_rcu(&new_fa->fa_list,
1362 (fa ? &fa->fa_list : fa_head));
1364 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1365 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1366 &cfg->fc_nlinfo, 0);
1371 kmem_cache_free(fn_alias_kmem, new_fa);
1373 fib_release_info(fi);
1378 /* should be called with rcu_read_lock */
1379 static int check_leaf(struct trie *t, struct leaf *l,
1380 t_key key, const struct flowi *flp,
1381 struct fib_result *res)
1383 struct leaf_info *li;
1384 struct hlist_head *hhead = &l->list;
1385 struct hlist_node *node;
1387 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1389 int plen = li->plen;
1390 __be32 mask = inet_make_mask(plen);
1392 if (l->key != (key & ntohl(mask)))
1395 err = fib_semantic_match(&li->falh, flp, res, plen);
1397 #ifdef CONFIG_IP_FIB_TRIE_STATS
1399 t->stats.semantic_match_passed++;
1401 t->stats.semantic_match_miss++;
/*
 * fn_trie_lookup() - longest-prefix-match route lookup in the LC-trie.
 * NOTE(review): this is a sampled listing; the embedded original line
 * numbers (1410, 1411, ...) show that intermediate lines and braces were
 * elided, so the fragments below are not contiguous code.
 */
1410 static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1411 struct fib_result *res)
1413 struct trie *t = (struct trie *) tb->tb_data;
/* Destination address converted to host order for trie key arithmetic. */
1418 t_key key = ntohl(flp->fl4_dst);
/* Start with a full-length prefix; shortened as we backtrack. */
1421 int current_prefix_length = KEYLENGTH;
1423 t_key node_prefix, key_prefix, pref_mismatch;
/* RCU read-side access to the trie root. */
1428 n = rcu_dereference(t->trie);
1432 #ifdef CONFIG_IP_FIB_TRIE_STATS
/* Root may itself be a leaf: match it directly. */
1438 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1442 pn = (struct tnode *) n;
/* Descend: child index comes from the key bits at this node's position. */
1450 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1453 n = tnode_get_child(pn, cindex);
1456 #ifdef CONFIG_IP_FIB_TRIE_STATS
1457 t->stats.null_node_hit++;
1463 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1469 cn = (struct tnode *)n;
1472 * It's a tnode, and we can do some extra checks here if we
1473 * like, to avoid descending into a dead-end branch.
1474 * This tnode is in the parent's child array at index
1475 * key[p_pos..p_pos+p_bits] but potentially with some bits
1476 * chopped off, so in reality the index may be just a
1477 * subprefix, padded with zero at the end.
1478 * We can also take a look at any skipped bits in this
1479 * tnode - everything up to p_pos is supposed to be ok,
1480 * and the non-chopped bits of the index (see previous
1481 * paragraph) are also guaranteed ok, but the rest is
1482 * considered unknown.
1484 * The skipped bits are key[pos+bits..cn->pos].
1487 /* If current_prefix_length < pos+bits, we are already doing
1488 * actual prefix matching, which means everything from
1489 * pos+(bits-chopped_off) onward must be zero along some
1490 * branch of this subtree - otherwise there is *no* valid
1491 * prefix present. Here we can only check the skipped
1492 * bits. Remember, since we have already indexed into the
1493 * parent's child array, we know that the bits we chopped off
1497 /* NOTA BENE: Checking only skipped bits
1498 for the new node here */
1500 if (current_prefix_length < pos+bits) {
1501 if (tkey_extract_bits(cn->key, current_prefix_length,
1502 cn->pos - current_prefix_length)
1508 * If chopped_off=0, the index is fully validated and we
1509 * only need to look at the skipped bits for this, the new,
1510 * tnode. What we actually want to do is to find out if
1511 * these skipped bits match our key perfectly, or if we will
1512 * have to count on finding a matching prefix further down,
1513 * because if we do, we would like to have some way of
1514 * verifying the existence of such a prefix at this point.
1517 /* The only thing we can do at this point is to verify that
1518 * any such matching prefix can indeed be a prefix to our
1519 * key, and if the bits in the node we are inspecting that
1520 * do not match our key are not ZERO, this cannot be true.
1521 * Thus, find out where there is a mismatch (before cn->pos)
1522 * and verify that all the mismatching bits are zero in the
1527 * Note: We aren't very concerned about the piece of
1528 * the key that precede pn->pos+pn->bits, since these
1529 * have already been checked. The bits after cn->pos
1530 * aren't checked since these are by definition
1531 * "unknown" at this point. Thus, what we want to see
1532 * is if we are about to enter the "prefix matching"
1533 * state, and in that case verify that the skipped
1534 * bits that will prevail throughout this subtree are
1535 * zero, as they have to be if we are to find a
/* Compare the node's skipped-bit prefix against the key's prefix. */
1539 node_prefix = mask_pfx(cn->key, cn->pos);
1540 key_prefix = mask_pfx(key, cn->pos);
1541 pref_mismatch = key_prefix^node_prefix;
1545 * In short: If skipped bits in this node do not match
1546 * the search key, enter the "prefix matching"
/* Locate the highest-order mismatching bit by shifting left. */
1549 if (pref_mismatch) {
1550 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1552 pref_mismatch = pref_mismatch << 1;
1554 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1556 if (key_prefix != 0)
1559 if (current_prefix_length >= cn->pos)
1560 current_prefix_length = mp;
1563 pn = (struct tnode *)n; /* Descend */
/* Backtracking: chop trailing zero bits off the child index. */
1570 /* As zero don't change the child key (cindex) */
1571 while ((chopped_off <= pn->bits)
1572 && !(cindex & (1<<(chopped_off-1))))
1575 /* Decrease current_... with bits chopped off */
1576 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1577 current_prefix_length = pn->pos + pn->bits
1581 * Either we do the actual chop off, or if we have
1582 * chopped off all bits in this tnode walk up to our parent.
1585 if (chopped_off <= pn->bits) {
1586 cindex &= ~(1 << (chopped_off-1));
1588 struct tnode *parent = node_parent((struct node *) pn);
1592 /* Get Child's index */
1593 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1597 #ifdef CONFIG_IP_FIB_TRIE_STATS
1598 t->stats.backtrack++;
1611 * Remove the leaf and return parent.
/*
 * Unlink leaf l from its parent and rebalance the subtree; if l has no
 * parent it was the root, so the whole trie is cleared (line 1624).
 */
1613 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1615 struct tnode *tp = node_parent((struct node *) l);
1617 pr_debug("entering trie_leaf_remove(%p)\n", l);
1620 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1621 put_child(t, (struct tnode *)tp, cindex, NULL);
1622 trie_rebalance(t, tp);
/* Leaf was the root: publish an empty trie to RCU readers. */
1624 rcu_assign_pointer(t->trie, NULL);
1630 * Caller must hold RTNL.
/*
 * Delete a route (fib_alias) matching cfg from table tb.  Finds the leaf
 * for the destination key, scans its alias list for a full match on
 * tos/type/scope/protocol/nexthop, notifies via netlink, then removes
 * the alias and any now-empty leaf_info / leaf containers.
 */
1632 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1634 struct trie *t = (struct trie *) tb->tb_data;
1636 int plen = cfg->fc_dst_len;
1637 u8 tos = cfg->fc_tos;
1638 struct fib_alias *fa, *fa_to_delete;
1639 struct list_head *fa_head;
1641 struct leaf_info *li;
1646 key = ntohl(cfg->fc_dst);
1647 mask = ntohl(inet_make_mask(plen));
1653 l = fib_find_node(t, key);
1658 fa_head = get_fa_head(l, plen);
1659 fa = fib_find_alias(fa_head, tos, 0);
1664 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1666 fa_to_delete = NULL;
/* Back up one entry so list_for_each_entry_continue starts at fa. */
1667 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1668 list_for_each_entry_continue(fa, fa_head, fa_list) {
1669 struct fib_info *fi = fa->fa_info;
1671 if (fa->fa_tos != tos)
/* Unspecified (zero/NOWHERE) selectors act as wildcards. */
1674 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1675 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1676 fa->fa_scope == cfg->fc_scope) &&
1677 (!cfg->fc_protocol ||
1678 fi->fib_protocol == cfg->fc_protocol) &&
1679 fib_nh_match(cfg, fi) == 0) {
1689 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1690 &cfg->fc_nlinfo, 0);
1692 l = fib_find_node(t, key);
1693 li = find_leaf_info(l, plen);
1695 list_del_rcu(&fa->fa_list);
/* Last alias for this prefix length: drop its leaf_info too. */
1697 if (list_empty(fa_head)) {
1698 hlist_del_rcu(&li->hlist);
/* Leaf has no prefixes left: remove it from the trie. */
1702 if (hlist_empty(&l->list))
1703 trie_leaf_remove(t, l);
1705 if (fa->fa_state & FA_S_ACCESSED)
1706 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1708 fib_release_info(fa->fa_info);
1709 alias_free_mem_rcu(fa);
/*
 * Remove all aliases on head whose fib_info is marked dead
 * (RTNH_F_DEAD); returns a count (accumulator elided in this listing).
 */
1713 static int trie_flush_list(struct list_head *head)
1715 struct fib_alias *fa, *fa_node;
/* _safe variant: entries are deleted while iterating. */
1718 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1719 struct fib_info *fi = fa->fa_info;
1721 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1722 list_del_rcu(&fa->fa_list);
1723 fib_release_info(fa->fa_info);
1724 alias_free_mem_rcu(fa);
/*
 * Flush dead aliases from every leaf_info hanging off leaf l, removing
 * leaf_infos whose alias list becomes empty; returns total flushed.
 */
1731 static int trie_flush_leaf(struct leaf *l)
1734 struct hlist_head *lih = &l->list;
1735 struct hlist_node *node, *tmp;
1736 struct leaf_info *li = NULL;
1738 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1739 found += trie_flush_list(&li->falh);
1741 if (list_empty(&li->falh)) {
1742 hlist_del_rcu(&li->hlist);
1750 * Scan for the next right leaf starting at node p->child[idx]
1751 * Since we have back pointer, no recursion necessary.
/* Iterative depth-first walk to the next leaf to the "right" of c. */
1753 static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
/* Resume one past c's slot in its parent's child array. */
1759 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1763 while (idx < 1u << p->bits) {
1764 c = tnode_get_child_rcu(p, idx++);
/* Prefetch the next sibling before returning this leaf. */
1769 prefetch(p->child[idx]);
1770 return (struct leaf *) c;
1773 /* Rescan start scanning in new node */
1774 p = (struct tnode *) c;
1778 /* Node empty, walk back up to parent */
1779 c = (struct node *) p;
1780 } while ( (p = node_parent_rcu(c)) != NULL);
1782 return NULL; /* Root of trie */
/* Return the leftmost (first) leaf of the trie, or handle the
 * degenerate single-leaf case directly. */
1785 static struct leaf *trie_firstleaf(struct trie *t)
1787 struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1792 if (IS_LEAF(n)) /* trie is just a leaf */
1793 return (struct leaf *) n;
1795 return leaf_walk_rcu(n, NULL);
/* Return the leaf following l in walk order, or NULL at the end. */
1798 static struct leaf *trie_nextleaf(struct leaf *l)
1800 struct node *c = (struct node *) l;
1801 struct tnode *p = node_parent(c);
/* No parent: l is the whole trie, nothing follows it. */
1804 return NULL; /* trie with just one leaf */
1806 return leaf_walk_rcu(p, c);
/* Return the index-th leaf (0-based) by linear walk from the first. */
1809 static struct leaf *trie_leafindex(struct trie *t, int index)
1811 struct leaf *l = trie_firstleaf(t);
1813 while (l && index-- > 0)
1814 l = trie_nextleaf(l);
1821 * Caller must hold RTNL.
/*
 * Flush dead routes from every leaf.  Removal of an emptied leaf is
 * deferred one iteration (done on ll, the previous leaf) so the walk
 * pointer l stays valid; the final leaf is handled after the loop.
 */
1823 static int fn_trie_flush(struct fib_table *tb)
1825 struct trie *t = (struct trie *) tb->tb_data;
1826 struct leaf *l, *ll = NULL;
1829 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1830 found += trie_flush_leaf(l);
1832 if (ll && hlist_empty(&ll->list))
1833 trie_leaf_remove(t, ll);
/* Handle the last visited leaf after the loop ends. */
1837 if (ll && hlist_empty(&ll->list))
1838 trie_leaf_remove(t, ll);
1840 pr_debug("trie_flush found=%d\n", found);
/*
 * Pick a default route (0.0.0.0/0) for res, preferring live unicast
 * gateways; falls back to the last resort found by fib_detect_death().
 */
1844 static void fn_trie_select_default(struct fib_table *tb,
1845 const struct flowi *flp,
1846 struct fib_result *res)
1848 struct trie *t = (struct trie *) tb->tb_data;
1849 int order, last_idx;
1850 struct fib_info *fi = NULL;
1851 struct fib_info *last_resort;
1852 struct fib_alias *fa = NULL;
1853 struct list_head *fa_head;
/* Default routes live at key 0, prefix length 0. */
1862 l = fib_find_node(t, 0);
1866 fa_head = get_fa_head(l, 0);
1870 if (list_empty(fa_head))
1873 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1874 struct fib_info *next_fi = fa->fa_info;
/* Only unicast aliases in the requested scope qualify. */
1876 if (fa->fa_scope != res->scope ||
1877 fa->fa_type != RTN_UNICAST)
1880 if (next_fi->fib_priority > res->fi->fib_priority)
/* Candidates must have an on-link gateway. */
1882 if (!next_fi->fib_nh[0].nh_gw ||
1883 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1885 fa->fa_state |= FA_S_ACCESSED;
1888 if (next_fi != res->fi)
1890 } else if (!fib_detect_death(fi, order, &last_resort,
1891 &last_idx, tb->tb_default)) {
1892 fib_result_assign(res, fi);
1893 tb->tb_default = order;
1899 if (order <= 0 || fi == NULL) {
1900 tb->tb_default = -1;
1904 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1906 fib_result_assign(res, fi);
1907 tb->tb_default = order;
/* Everything else looked dead: take the last resort entry. */
1911 fib_result_assign(res, last_resort);
1912 tb->tb_default = last_idx;
/*
 * Dump the alias list for one prefix (key/plen) into a netlink skb;
 * returns < 0 when the skb fills so the dump can resume later.
 */
1917 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1918 struct fib_table *tb,
1919 struct sk_buff *skb, struct netlink_callback *cb)
1922 struct fib_alias *fa;
/* Key goes back to network byte order for the netlink message. */
1923 __be32 xkey = htonl(key);
1928 /* rcu_read_lock is held by caller */
1930 list_for_each_entry_rcu(fa, fah, fa_list) {
1936 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1945 fa->fa_info, NLM_F_MULTI) < 0) {
/* Dump every non-empty prefix (leaf_info) hanging off leaf l. */
1955 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1956 struct sk_buff *skb, struct netlink_callback *cb)
1958 struct leaf_info *li;
1959 struct hlist_node *node;
1965 /* rcu_read_lock is held by caller */
1966 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1975 if (list_empty(&li->falh))
1978 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
/*
 * Netlink dump of the whole table.  Resumable: cb->args[2] remembers the
 * last dumped key and cb->args[3] the leaf count, so a partial dump can
 * continue where it stopped (or fall back to a positional rescan).
 */
1989 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1990 struct netlink_callback *cb)
1993 struct trie *t = (struct trie *) tb->tb_data;
1994 t_key key = cb->args[2];
1995 int count = cb->args[3];
1998 /* Dump starting at last key.
1999 * Note: 0.0.0.0/0 (ie default) is first key.
2002 l = trie_firstleaf(t);
2004 /* Normally, continue from last key, but if that is missing
2005 * fallback to using slow rescan
2007 l = fib_find_node(t, key);
2009 l = trie_leafindex(t, count);
2013 cb->args[2] = l->key;
2014 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
2015 cb->args[3] = count;
2021 l = trie_nextleaf(l);
/* Clear deeper continuation state once a leaf completes. */
2022 memset(&cb->args[4], 0,
2023 sizeof(cb->args) - 4*sizeof(cb->args[0]));
2025 cb->args[3] = count;
/*
 * Boot-time setup: slab caches for fib_alias and leaf objects.
 * SLAB_PANIC means creation failure panics rather than returns.
 */
2031 void __init fib_hash_init(void)
2033 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2034 sizeof(struct fib_alias),
2035 0, SLAB_PANIC, NULL);
/* One cache serves both leaf and leaf_info: sized for the larger. */
2037 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2038 max(sizeof(struct leaf),
2039 sizeof(struct leaf_info)),
2040 0, SLAB_PANIC, NULL);
2044 /* Fix more generic FIB names for init later */
/*
 * Allocate and initialize a fib_table whose private data is a trie,
 * wiring the fn_trie_* operations into the table's function pointers.
 */
2045 struct fib_table *fib_hash_table(u32 id)
2047 struct fib_table *tb;
/* Table and its trie are one allocation; trie lives in tb_data. */
2050 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2056 tb->tb_default = -1;
2057 tb->tb_lookup = fn_trie_lookup;
2058 tb->tb_insert = fn_trie_insert;
2059 tb->tb_delete = fn_trie_delete;
2060 tb->tb_flush = fn_trie_flush;
2061 tb->tb_select_default = fn_trie_select_default;
2062 tb->tb_dump = fn_trie_dump;
2064 t = (struct trie *) tb->tb_data;
2065 memset(t, 0, sizeof(*t));
/* Print the banner only once, when the LOCAL table is created. */
2067 if (id == RT_TABLE_LOCAL)
2068 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2073 #ifdef CONFIG_PROC_FS
2074 /* Depth first Trie walk iterator */
/* State for /proc trie walks.  NOTE(review): trailing members (the
 * index/depth fields referenced by fib_trie_get_next) are elided in
 * this listing. */
2075 struct fib_trie_iter {
2076 struct seq_net_private p;
2077 struct fib_table *tb;
2078 struct tnode *tnode;
/*
 * Advance the depth-first iterator: try remaining children of the
 * current tnode, descend into subtrees, and pop back to the parent when
 * a node's children are exhausted.
 */
2083 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2085 struct tnode *tn = iter->tnode;
2086 unsigned cindex = iter->index;
2089 /* A single entry routing table */
2093 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2094 iter->tnode, iter->index, iter->depth);
2096 while (cindex < (1<<tn->bits)) {
2097 struct node *n = tnode_get_child_rcu(tn, cindex);
2102 iter->index = cindex + 1;
2104 /* push down one level */
2105 iter->tnode = (struct tnode *) n;
2115 /* Current node exhausted, pop back up */
2116 p = node_parent_rcu((struct node *)tn);
/* Resume the parent just past this node's slot. */
2118 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
/* Position the iterator at the trie root (RCU-protected load);
 * remaining setup lines are elided in this listing. */
2128 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2136 n = rcu_dereference(t->trie);
2141 iter->tnode = (struct tnode *) n;
/*
 * Walk the whole trie accumulating structural statistics (depths,
 * leaf/prefix counts, tnode sizes) into *s for /proc/net/fib_triestats.
 */
2153 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2156 struct fib_trie_iter iter;
2158 memset(s, 0, sizeof(*s));
2161 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2163 struct leaf *l = (struct leaf *)n;
2164 struct leaf_info *li;
2165 struct hlist_node *tmp;
2168 s->totdepth += iter.depth;
2169 if (iter.depth > s->maxdepth)
2170 s->maxdepth = iter.depth;
/* Each leaf_info on the leaf is one stored prefix. */
2172 hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2175 const struct tnode *tn = (const struct tnode *) n;
/* Histogram tnodes by their bits value, capped at MAX_STAT_DEPTH. */
2179 if (tn->bits < MAX_STAT_DEPTH)
2180 s->nodesizes[tn->bits]++;
2182 for (i = 0; i < (1<<tn->bits); i++)
2191 * This outputs /proc/net/fib_triestats
/* Pretty-print the collected trie statistics, including an estimate of
 * total memory consumed by leaves, leaf_infos, tnodes and pointers. */
2193 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2195 unsigned i, max, pointers, bytes, avdepth;
/* Average depth kept as a fixed-point value scaled by 100. */
2198 avdepth = stat->totdepth*100 / stat->leaves;
2202 seq_printf(seq, "\tAver depth: %u.%02d\n",
2203 avdepth / 100, avdepth % 100);
2204 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2206 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2207 bytes = sizeof(struct leaf) * stat->leaves;
2209 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2210 bytes += sizeof(struct leaf_info) * stat->prefixes;
2212 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2213 bytes += sizeof(struct tnode) * stat->tnodes;
/* Trim trailing zero buckets from the size histogram. */
2215 max = MAX_STAT_DEPTH;
2216 while (max > 0 && stat->nodesizes[max-1] == 0)
2220 for (i = 1; i <= max; i++)
2221 if (stat->nodesizes[i] != 0) {
2222 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
2223 pointers += (1<<i) * stat->nodesizes[i];
2225 seq_putc(seq, '\n');
2226 seq_printf(seq, "\tPointers: %u\n", pointers);
2228 bytes += sizeof(struct node *) * pointers;
2229 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2230 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2233 #ifdef CONFIG_IP_FIB_TRIE_STATS
/* Print the runtime lookup counters gathered under
 * CONFIG_IP_FIB_TRIE_STATS. */
2234 static void trie_show_usage(struct seq_file *seq,
2235 const struct trie_use_stats *stats)
2237 seq_printf(seq, "\nCounters:\n---------\n");
2238 seq_printf(seq, "gets = %u\n", stats->gets);
2239 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2240 seq_printf(seq, "semantic match passed = %u\n",
2241 stats->semantic_match_passed);
2242 seq_printf(seq, "semantic match miss = %u\n",
2243 stats->semantic_match_miss);
2244 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2245 seq_printf(seq, "skipped node resize = %u\n\n",
2246 stats->resize_node_skipped);
2248 #endif /* CONFIG_IP_FIB_TRIE_STATS */
/* Print a human-readable heading for a table: the well-known LOCAL and
 * MAIN tables get names, anything else its numeric id. */
2250 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2252 if (tb->tb_id == RT_TABLE_LOCAL)
2253 seq_puts(seq, "Local:\n")
2254 else if (tb->tb_id == RT_TABLE_MAIN)
2255 seq_puts(seq, "Main:\n");
2257 seq_printf(seq, "Id %d:\n", tb->tb_id);
/*
 * seq_file show for /proc/net/fib_triestats: iterate every table in the
 * netns hash, collect its trie stats and print them.
 */
2261 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2263 struct net *net = (struct net *)seq->private;
2267 "Basic info: size of leaf:"
2268 " %Zd bytes, size of tnode: %Zd bytes.\n",
2269 sizeof(struct leaf), sizeof(struct tnode));
2271 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2272 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2273 struct hlist_node *node;
2274 struct fib_table *tb;
2276 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2277 struct trie *t = (struct trie *) tb->tb_data;
2278 struct trie_stat stat;
2283 fib_table_print(seq, tb);
2285 trie_collect_stats(t, &stat);
2286 trie_show_stats(seq, &stat);
2287 #ifdef CONFIG_IP_FIB_TRIE_STATS
2288 trie_show_usage(seq, &t->stats);
/* open() handler: single-record seq_file bound to the netns. */
2296 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2298 return single_open_net(inode, file, fib_triestat_seq_show);
/* file_operations for /proc/net/fib_triestat (read elided in listing). */
2301 static const struct file_operations fib_triestat_fops = {
2302 .owner = THIS_MODULE,
2303 .open = fib_triestat_seq_open,
2305 .llseek = seq_lseek,
2306 .release = single_release_net,
/*
 * Find the pos-th node across all tables in the netns by walking each
 * table's trie in turn (used by seq_file start()).
 */
2309 static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2311 struct fib_trie_iter *iter = seq->private;
2312 struct net *net = seq_file_net(seq);
2316 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2317 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2318 struct hlist_node *node;
2319 struct fib_table *tb;
2321 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2324 for (n = fib_trie_get_first(iter,
2325 (struct trie *) tb->tb_data);
2326 n; n = fib_trie_get_next(iter))
/* seq_file start(): position at element *pos. */
2337 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2341 return fib_trie_get_idx(seq, *pos);
/*
 * seq_file next(): advance within the current trie; when it is
 * exhausted, continue with the rest of the hash chain, then move on to
 * the following hash buckets.
 */
2344 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2346 struct fib_trie_iter *iter = seq->private;
2347 struct net *net = seq_file_net(seq);
2348 struct fib_table *tb = iter->tb;
2349 struct hlist_node *tb_node;
2354 /* next node in same table */
2355 n = fib_trie_get_next(iter);
2359 /* walk rest of this hash chain */
2360 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2361 while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2362 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2363 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2368 /* new hash chain */
2369 while (++h < FIB_TABLE_HASHSZ) {
2370 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2371 hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2372 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
/* seq_file stop() (body elided in this listing). */
2384 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
/* Emit n levels of three-space indentation. */
2390 static void seq_indent(struct seq_file *seq, int n)
2392 while (n-- > 0) seq_puts(seq, "   ");
/* Name for a route scope; unknown values are formatted into buf. */
2395 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2398 case RT_SCOPE_UNIVERSE: return "universe";
2399 case RT_SCOPE_SITE: return "site";
2400 case RT_SCOPE_LINK: return "link";
2401 case RT_SCOPE_HOST: return "host";
2402 case RT_SCOPE_NOWHERE: return "nowhere";
/* Fallback: render the numeric scope into the caller's buffer. */
2404 snprintf(buf, len, "scope=%d", s);
/* Printable names for route types, indexed by RTN_* value. */
2409 static const char *rtn_type_names[__RTN_MAX] = {
2410 [RTN_UNSPEC] = "UNSPEC",
2411 [RTN_UNICAST] = "UNICAST",
2412 [RTN_LOCAL] = "LOCAL",
2413 [RTN_BROADCAST] = "BROADCAST",
2414 [RTN_ANYCAST] = "ANYCAST",
2415 [RTN_MULTICAST] = "MULTICAST",
2416 [RTN_BLACKHOLE] = "BLACKHOLE",
2417 [RTN_UNREACHABLE] = "UNREACHABLE",
2418 [RTN_PROHIBIT] = "PROHIBIT",
2419 [RTN_THROW] = "THROW",
2421 [RTN_XRESOLVE] = "XRESOLVE",
/* Name for a route type; unknown values are formatted into buf. */
2424 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2426 if (t < __RTN_MAX && rtn_type_names[t])
2427 return rtn_type_names[t];
2428 snprintf(buf, len, "type %u", t);
2432 /* Pretty print the trie */
/*
 * seq_file show for /proc/net/fib_trie: tnodes print as "+--" branches
 * with prefix/pos/bits/child counts; leaves print as "|--" with their
 * address, followed by each prefix's scope, type and optional tos.
 */
2433 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2435 const struct fib_trie_iter *iter = seq->private;
/* A node with no parent is a trie root: print the table heading. */
2438 if (!node_parent_rcu(n))
2439 fib_table_print(seq, iter->tb);
2442 struct tnode *tn = (struct tnode *) n;
2443 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2445 seq_indent(seq, iter->depth-1);
2446 seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2447 &prf, tn->pos, tn->bits, tn->full_children,
2448 tn->empty_children);
2451 struct leaf *l = (struct leaf *) n;
2452 struct leaf_info *li;
2453 struct hlist_node *node;
2454 __be32 val = htonl(l->key);
2456 seq_indent(seq, iter->depth);
2457 seq_printf(seq, "  |-- %pI4\n", &val);
2459 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2460 struct fib_alias *fa;
2462 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2463 char buf1[32], buf2[32];
2465 seq_indent(seq, iter->depth+1);
2466 seq_printf(seq, "  /%d %s %s", li->plen,
2467 rtn_scope(buf1, sizeof(buf1),
2469 rtn_type(buf2, sizeof(buf2),
2472 seq_printf(seq, " tos=%d", fa->fa_tos);
2473 seq_putc(seq, '\n');
/* seq_file operations for /proc/net/fib_trie. */
2481 static const struct seq_operations fib_trie_seq_ops = {
2482 .start = fib_trie_seq_start,
2483 .next = fib_trie_seq_next,
2484 .stop = fib_trie_seq_stop,
2485 .show = fib_trie_seq_show,
/* open() handler: per-netns seq_file with a fib_trie_iter as state. */
2488 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2490 return seq_open_net(inode, file, &fib_trie_seq_ops,
2491 sizeof(struct fib_trie_iter));
/* file_operations for /proc/net/fib_trie (read elided in listing). */
2494 static const struct file_operations fib_trie_fops = {
2495 .owner = THIS_MODULE,
2496 .open = fib_trie_seq_open,
2498 .llseek = seq_lseek,
2499 .release = seq_release_net,
/* Iterator for /proc/net/route over the MAIN table's trie.
 * NOTE(review): trailing members (the pos/key cache used by
 * fib_route_get_idx) are elided in this listing. */
2502 struct fib_route_iter {
2503 struct seq_net_private p;
2504 struct trie *main_trie;
/*
 * Find the pos-th leaf of the main trie.  A cache of the last found
 * key/position lets sequential reads resume without rewalking from the
 * first leaf each time.
 */
2509 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2511 struct leaf *l = NULL;
2512 struct trie *t = iter->main_trie;
2514 /* use cache location of last found key */
2515 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
/* Cache miss or backwards seek: restart from the first leaf. */
2519 l = trie_firstleaf(t);
2522 while (l && pos-- > 0) {
2524 l = trie_nextleaf(l);
2528 iter->key = pos;	/* remember it */
2530 iter->pos = 0;		/* forget it */
/* seq_file start(): bind to the MAIN table's trie; position 0 is the
 * header row (SEQ_START_TOKEN), data rows begin at *pos - 1. */
2535 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2538 struct fib_route_iter *iter = seq->private;
2539 struct fib_table *tb;
2542 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2546 iter->main_trie = (struct trie *) tb->tb_data;
2548 return SEQ_START_TOKEN;
2550 return fib_route_get_idx(iter, *pos - 1);
/* seq_file next(): first leaf after the header token, otherwise the
 * following leaf. */
2553 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2555 struct fib_route_iter *iter = seq->private;
2559 if (v == SEQ_START_TOKEN) {
2561 l = trie_firstleaf(iter->main_trie);
2564 l = trie_nextleaf(l);
/* seq_file stop() (body elided in this listing). */
2574 static void fib_route_seq_stop(struct seq_file *seq, void *v)
/* Translate a route type/mask/nexthop into the RTF_* flag bits shown in
 * /proc/net/route. */
2580 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
/* Indices 7 and 8 are RTN_UNREACHABLE and RTN_PROHIBIT. */
2582 static unsigned type2flags[RTN_MAX + 1] = {
2583 [7] = RTF_REJECT, [8] = RTF_REJECT,
2585 unsigned flags = type2flags[type];
2587 if (fi && fi->fib_nh->nh_gw)
2588 flags |= RTF_GATEWAY;
/* A /32 mask marks a host route. */
2589 if (mask == htonl(0xFFFFFFFF))
2596 * This outputs /proc/net/route.
2597 * The format of the file is not supposed to be changed
2598 * and needs to be same as fib_hash output to avoid breaking
/*
 * One row per (prefix, alias) pair, padded to a fixed 127-column width
 * so legacy readers of /proc/net/route keep working.
 */
2601 static int fib_route_seq_show(struct seq_file *seq, void *v)
2604 struct leaf_info *li;
2605 struct hlist_node *node;
2607 if (v == SEQ_START_TOKEN) {
2608 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2609 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2614 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2615 struct fib_alias *fa;
2616 __be32 mask, prefix;
2618 mask = inet_make_mask(li->plen);
2619 prefix = htonl(l->key);
2621 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2622 const struct fib_info *fi = fa->fa_info;
2623 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
/* Broadcast/multicast routes are not shown in this file. */
2626 if (fa->fa_type == RTN_BROADCAST
2627 || fa->fa_type == RTN_MULTICAST)
2632 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2633 "%d\t%08X\t%d\t%u\t%u%n",
2634 fi->fib_dev ? fi->fib_dev->name : "*",
2636 fi->fib_nh->nh_gw, flags, 0, 0,
2640 fi->fib_advmss + 40 : 0),
2642 fi->fib_rtt >> 3, &len);
/* No fib_info: print a placeholder row with zeroed fields. */
2645 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2646 "%d\t%08X\t%d\t%u\t%u%n",
2647 prefix, 0, flags, 0, 0, 0,
2648 mask, 0, 0, 0, &len);
/* %n captured the row width; pad out to 127 columns. */
2650 seq_printf(seq, "%*s\n", 127 - len, "");
/* seq_file operations for /proc/net/route. */
2657 static const struct seq_operations fib_route_seq_ops = {
2658 .start = fib_route_seq_start,
2659 .next = fib_route_seq_next,
2660 .stop = fib_route_seq_stop,
2661 .show = fib_route_seq_show,
/* open() handler: per-netns seq_file with a fib_route_iter as state. */
2664 static int fib_route_seq_open(struct inode *inode, struct file *file)
2666 return seq_open_net(inode, file, &fib_route_seq_ops,
2667 sizeof(struct fib_route_iter));
/* file_operations for /proc/net/route (read elided in listing). */
2670 static const struct file_operations fib_route_fops = {
2671 .owner = THIS_MODULE,
2672 .open = fib_route_seq_open,
2674 .llseek = seq_lseek,
2675 .release = seq_release_net,
/*
 * Create the three proc entries for a netns, unwinding the earlier
 * ones (lines 2693/2695) if a later creation fails.
 */
2678 int __net_init fib_proc_init(struct net *net)
2680 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2683 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2684 &fib_triestat_fops))
2687 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
/* Error unwind labels (goto targets elided in this listing). */
2693 proc_net_remove(net, "fib_triestat");
2695 proc_net_remove(net, "fib_trie");
/* Remove the three proc entries when the netns goes away. */
2700 void __net_exit fib_proc_exit(struct net *net)
2702 proc_net_remove(net, "fib_trie");
2703 proc_net_remove(net, "fib_triestat");
2704 proc_net_remove(net, "route");
2707 #endif /* CONFIG_PROC_FS */