2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally descibed in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
28 * Code from fib_hash has been reused which includes the following header:
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
35 * IPv4 FIB: lookup engine and maintenance routines.
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
46 #define VERSION "0.325"
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/skbuff.h>
66 #include <linux/netlink.h>
67 #include <linux/init.h>
68 #include <linux/list.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
74 #include <net/ip_fib.h>
75 #include "fib_lookup.h"
77 #undef CONFIG_IP_FIB_TRIE_STATS
78 #define MAX_CHILDS 16384
80 #define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81 #define KEYLENGTH (8*sizeof(t_key))
82 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
85 static DEFINE_RWLOCK(fib_lock
);
87 typedef unsigned int t_key
;
91 #define NODE_TYPE_MASK 0x1UL
92 #define NODE_PARENT(node) \
93 ((struct tnode *)((node)->parent & ~NODE_TYPE_MASK))
94 #define NODE_SET_PARENT(node, ptr) \
95 ((node)->parent = (((unsigned long)(ptr)) | \
96 ((node)->parent & NODE_TYPE_MASK)))
97 #define NODE_INIT_PARENT(node, type) \
98 ((node)->parent = (type))
99 #define NODE_TYPE(node) \
100 ((node)->parent & NODE_TYPE_MASK)
102 #define IS_TNODE(n) (!(n->parent & T_LEAF))
103 #define IS_LEAF(n) (n->parent & T_LEAF)
107 unsigned long parent
;
112 unsigned long parent
;
113 struct hlist_head list
;
117 struct hlist_node hlist
;
119 struct list_head falh
;
124 unsigned long parent
;
125 unsigned short pos
:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits
:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children
; /* KEYLENGTH bits needed */
128 unsigned short empty_children
; /* KEYLENGTH bits needed */
129 struct node
*child
[0];
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats
{
135 unsigned int backtrack
;
136 unsigned int semantic_match_passed
;
137 unsigned int semantic_match_miss
;
138 unsigned int null_node_hit
;
139 unsigned int resize_node_skipped
;
144 unsigned int totdepth
;
145 unsigned int maxdepth
;
148 unsigned int nullpointers
;
149 unsigned int nodesizes
[MAX_CHILDS
];
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats
;
158 unsigned int revision
;
161 static int trie_debug
= 0;
163 #define DBG(x...) do { if (trie_debug) printk(x); } while (0)
165 static int tnode_full(struct tnode
*tn
, struct node
*n
);
166 static void put_child(struct trie
*t
, struct tnode
*tn
, int i
, struct node
*n
);
167 static void tnode_put_child_reorg(struct tnode
*tn
, int i
, struct node
*n
, int wasfull
);
168 static int tnode_child_length(struct tnode
*tn
);
169 static struct node
*resize(struct trie
*t
, struct tnode
*tn
);
170 static struct tnode
*inflate(struct trie
*t
, struct tnode
*tn
);
171 static struct tnode
*halve(struct trie
*t
, struct tnode
*tn
);
172 static void tnode_free(struct tnode
*tn
);
173 static void trie_dump_seq(struct seq_file
*seq
, struct trie
*t
);
174 extern struct fib_alias
*fib_find_alias(struct list_head
*fah
, u8 tos
, u32 prio
);
175 extern int fib_detect_death(struct fib_info
*fi
, int order
,
176 struct fib_info
**last_resort
, int *last_idx
, int *dflt
);
178 extern void rtmsg_fib(int event
, u32 key
, struct fib_alias
*fa
, int z
, int tb_id
,
179 struct nlmsghdr
*n
, struct netlink_skb_parms
*req
);
181 static kmem_cache_t
*fn_alias_kmem
;
182 static struct trie
*trie_local
= NULL
, *trie_main
= NULL
;
184 static inline struct node
*tnode_get_child(struct tnode
*tn
, int i
)
186 BUG_ON(i
>= 1 << tn
->bits
);
191 static inline int tnode_child_length(struct tnode
*tn
)
193 return 1 << tn
->bits
;
196 static inline t_key
tkey_extract_bits(t_key a
, int offset
, int bits
)
198 if (offset
< KEYLENGTH
)
199 return ((t_key
)(a
<< offset
)) >> (KEYLENGTH
- bits
);
204 static inline int tkey_equals(t_key a
, t_key b
)
209 static inline int tkey_sub_equals(t_key a
, int offset
, int bits
, t_key b
)
211 if (bits
== 0 || offset
>= KEYLENGTH
)
213 bits
= bits
> KEYLENGTH
? KEYLENGTH
: bits
;
214 return ((a
^ b
) << offset
) >> (KEYLENGTH
- bits
) == 0;
217 static inline int tkey_mismatch(t_key a
, int offset
, t_key b
)
224 while ((diff
<< i
) >> (KEYLENGTH
-1) == 0)
229 /* Candidate for fib_semantics */
231 static void fn_free_alias(struct fib_alias
*fa
)
233 fib_release_info(fa
->fa_info
);
234 kmem_cache_free(fn_alias_kmem
, fa
);
238 To understand this stuff, an understanding of keys and all their bits is
239 necessary. Every node in the trie has a key associated with it, but not
240 all of the bits in that key are significant.
242 Consider a node 'n' and its parent 'tp'.
244 If n is a leaf, every bit in its key is significant. Its presence is
245 necessitaded by path compression, since during a tree traversal (when
246 searching for a leaf - unless we are doing an insertion) we will completely
247 ignore all skipped bits we encounter. Thus we need to verify, at the end of
248 a potentially successful search, that we have indeed been walking the
251 Note that we can never "miss" the correct key in the tree if present by
252 following the wrong path. Path compression ensures that segments of the key
253 that are the same for all keys with a given prefix are skipped, but the
254 skipped part *is* identical for each node in the subtrie below the skipped
255 bit! trie_insert() in this implementation takes care of that - note the
256 call to tkey_sub_equals() in trie_insert().
258 if n is an internal node - a 'tnode' here, the various parts of its key
259 have many different meanings.
262 _________________________________________________________________
263 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
264 -----------------------------------------------------------------
265 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
267 _________________________________________________________________
268 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
269 -----------------------------------------------------------------
270 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
277 First, let's just ignore the bits that come before the parent tp, that is
278 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
279 not use them for anything.
281 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
282 index into the parent's child array. That is, they will be used to find
283 'n' among tp's children.
285 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
288 All the bits we have seen so far are significant to the node n. The rest
289 of the bits are really not needed or indeed known in n->key.
291 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
292 n's child array, and will of course be different for each child.
295 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
300 static void check_tnode(struct tnode
*tn
)
302 if (tn
&& tn
->pos
+tn
->bits
> 32) {
303 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn
, tn
->pos
, tn
->bits
);
307 static int halve_threshold
= 25;
308 static int inflate_threshold
= 50;
310 static struct leaf
*leaf_new(void)
312 struct leaf
*l
= kmalloc(sizeof(struct leaf
), GFP_KERNEL
);
314 NODE_INIT_PARENT(l
, T_LEAF
);
315 INIT_HLIST_HEAD(&l
->list
);
320 static struct leaf_info
*leaf_info_new(int plen
)
322 struct leaf_info
*li
= kmalloc(sizeof(struct leaf_info
), GFP_KERNEL
);
328 INIT_LIST_HEAD(&li
->falh
);
333 static inline void free_leaf(struct leaf
*l
)
338 static inline void free_leaf_info(struct leaf_info
*li
)
343 static struct tnode
*tnode_alloc(unsigned int size
)
345 if (size
<= PAGE_SIZE
) {
346 return kmalloc(size
, GFP_KERNEL
);
348 return (struct tnode
*)
349 __get_free_pages(GFP_KERNEL
, get_order(size
));
353 static void __tnode_free(struct tnode
*tn
)
355 unsigned int size
= sizeof(struct tnode
) +
356 (1 << tn
->bits
) * sizeof(struct node
*);
358 if (size
<= PAGE_SIZE
)
361 free_pages((unsigned long)tn
, get_order(size
));
364 static struct tnode
* tnode_new(t_key key
, int pos
, int bits
)
366 int nchildren
= 1<<bits
;
367 int sz
= sizeof(struct tnode
) + nchildren
* sizeof(struct node
*);
368 struct tnode
*tn
= tnode_alloc(sz
);
372 NODE_INIT_PARENT(tn
, T_TNODE
);
376 tn
->full_children
= 0;
377 tn
->empty_children
= 1<<bits
;
380 DBG("AT %p s=%u %u\n", tn
, (unsigned int) sizeof(struct tnode
),
381 (unsigned int) (sizeof(struct node
) * 1<<bits
));
385 static void tnode_free(struct tnode
*tn
)
390 free_leaf((struct leaf
*)tn
);
399 * Check whether a tnode 'n' is "full", i.e. it is an internal node
400 * and no bits are skipped. See discussion in dyntree paper p. 6
403 static inline int tnode_full(struct tnode
*tn
, struct node
*n
)
405 if (n
== NULL
|| IS_LEAF(n
))
408 return ((struct tnode
*) n
)->pos
== tn
->pos
+ tn
->bits
;
411 static inline void put_child(struct trie
*t
, struct tnode
*tn
, int i
, struct node
*n
)
413 tnode_put_child_reorg(tn
, i
, n
, -1);
417 * Add a child at position i overwriting the old value.
418 * Update the value of full_children and empty_children.
421 static void tnode_put_child_reorg(struct tnode
*tn
, int i
, struct node
*n
, int wasfull
)
426 if (i
>= 1<<tn
->bits
) {
427 printk("bits=%d, i=%d\n", tn
->bits
, i
);
430 write_lock_bh(&fib_lock
);
433 /* update emptyChildren */
434 if (n
== NULL
&& chi
!= NULL
)
435 tn
->empty_children
++;
436 else if (n
!= NULL
&& chi
== NULL
)
437 tn
->empty_children
--;
439 /* update fullChildren */
441 wasfull
= tnode_full(tn
, chi
);
443 isfull
= tnode_full(tn
, n
);
444 if (wasfull
&& !isfull
)
446 else if (!wasfull
&& isfull
)
450 NODE_SET_PARENT(n
, tn
);
453 write_unlock_bh(&fib_lock
);
456 static struct node
*resize(struct trie
*t
, struct tnode
*tn
)
460 struct tnode
*old_tn
;
465 DBG("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
466 tn
, inflate_threshold
, halve_threshold
);
469 if (tn
->empty_children
== tnode_child_length(tn
)) {
474 if (tn
->empty_children
== tnode_child_length(tn
) - 1)
475 for (i
= 0; i
< tnode_child_length(tn
); i
++) {
478 write_lock_bh(&fib_lock
);
481 write_unlock_bh(&fib_lock
);
485 /* compress one level */
486 NODE_INIT_PARENT(n
, NODE_TYPE(n
));
488 write_unlock_bh(&fib_lock
);
493 * Double as long as the resulting node has a number of
494 * nonempty nodes that are above the threshold.
498 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
499 * the Helsinki University of Technology and Matti Tikkanen of Nokia
500 * Telecommunications, page 6:
501 * "A node is doubled if the ratio of non-empty children to all
502 * children in the *doubled* node is at least 'high'."
504 * 'high' in this instance is the variable 'inflate_threshold'. It
505 * is expressed as a percentage, so we multiply it with
506 * tnode_child_length() and instead of multiplying by 2 (since the
507 * child array will be doubled by inflate()) and multiplying
508 * the left-hand side by 100 (to handle the percentage thing) we
509 * multiply the left-hand side by 50.
511 * The left-hand side may look a bit weird: tnode_child_length(tn)
512 * - tn->empty_children is of course the number of non-null children
513 * in the current node. tn->full_children is the number of "full"
514 * children, that is non-null tnodes with a skip value of 0.
515 * All of those will be doubled in the resulting inflated tnode, so
516 * we just count them one extra time here.
518 * A clearer way to write this would be:
520 * to_be_doubled = tn->full_children;
521 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
524 * new_child_length = tnode_child_length(tn) * 2;
526 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
528 * if (new_fill_factor >= inflate_threshold)
530 * ...and so on, tho it would mess up the while () loop.
533 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
537 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
538 * inflate_threshold * new_child_length
540 * expand not_to_be_doubled and to_be_doubled, and shorten:
541 * 100 * (tnode_child_length(tn) - tn->empty_children +
542 * tn->full_children) >= inflate_threshold * new_child_length
544 * expand new_child_length:
545 * 100 * (tnode_child_length(tn) - tn->empty_children +
546 * tn->full_children) >=
547 * inflate_threshold * tnode_child_length(tn) * 2
550 * 50 * (tn->full_children + tnode_child_length(tn) -
551 * tn->empty_children) >= inflate_threshold *
552 * tnode_child_length(tn)
559 while ((tn
->full_children
> 0 &&
560 50 * (tn
->full_children
+ tnode_child_length(tn
) - tn
->empty_children
) >=
561 inflate_threshold
* tnode_child_length(tn
))) {
567 #ifdef CONFIG_IP_FIB_TRIE_STATS
568 t
->stats
.resize_node_skipped
++;
577 * Halve as long as the number of empty children in this
578 * node is above threshold.
582 while (tn
->bits
> 1 &&
583 100 * (tnode_child_length(tn
) - tn
->empty_children
) <
584 halve_threshold
* tnode_child_length(tn
)) {
590 #ifdef CONFIG_IP_FIB_TRIE_STATS
591 t
->stats
.resize_node_skipped
++;
598 /* Only one child remains */
600 if (tn
->empty_children
== tnode_child_length(tn
) - 1)
601 for (i
= 0; i
< tnode_child_length(tn
); i
++) {
604 write_lock_bh(&fib_lock
);
608 write_unlock_bh(&fib_lock
);
612 /* compress one level */
614 NODE_INIT_PARENT(n
, NODE_TYPE(n
));
616 write_unlock_bh(&fib_lock
);
621 return (struct node
*) tn
;
624 static struct tnode
*inflate(struct trie
*t
, struct tnode
*tn
)
627 struct tnode
*oldtnode
= tn
;
628 int olen
= tnode_child_length(tn
);
633 tn
= tnode_new(oldtnode
->key
, oldtnode
->pos
, oldtnode
->bits
+ 1);
636 return ERR_PTR(-ENOMEM
);
639 * Preallocate and store tnodes before the actual work so we
640 * don't get into an inconsistent state if memory allocation
641 * fails. In case of failure we return the oldnode and inflate
642 * of tnode is ignored.
645 for (i
= 0; i
< olen
; i
++) {
646 struct tnode
*inode
= (struct tnode
*) tnode_get_child(oldtnode
, i
);
650 inode
->pos
== oldtnode
->pos
+ oldtnode
->bits
&&
652 struct tnode
*left
, *right
;
653 t_key m
= TKEY_GET_MASK(inode
->pos
, 1);
655 left
= tnode_new(inode
->key
&(~m
), inode
->pos
+ 1,
660 right
= tnode_new(inode
->key
|m
, inode
->pos
+ 1,
668 put_child(t
, tn
, 2*i
, (struct node
*) left
);
669 put_child(t
, tn
, 2*i
+1, (struct node
*) right
);
673 for (i
= 0; i
< olen
; i
++) {
674 struct node
*node
= tnode_get_child(oldtnode
, i
);
675 struct tnode
*left
, *right
;
682 /* A leaf or an internal node with skipped bits */
684 if (IS_LEAF(node
) || ((struct tnode
*) node
)->pos
>
685 tn
->pos
+ tn
->bits
- 1) {
686 if (tkey_extract_bits(node
->key
, oldtnode
->pos
+ oldtnode
->bits
,
688 put_child(t
, tn
, 2*i
, node
);
690 put_child(t
, tn
, 2*i
+1, node
);
694 /* An internal node with two children */
695 inode
= (struct tnode
*) node
;
697 if (inode
->bits
== 1) {
698 put_child(t
, tn
, 2*i
, inode
->child
[0]);
699 put_child(t
, tn
, 2*i
+1, inode
->child
[1]);
705 /* An internal node with more than two children */
707 /* We will replace this node 'inode' with two new
708 * ones, 'left' and 'right', each with half of the
709 * original children. The two new nodes will have
710 * a position one bit further down the key and this
711 * means that the "significant" part of their keys
712 * (see the discussion near the top of this file)
713 * will differ by one bit, which will be "0" in
714 * left's key and "1" in right's key. Since we are
715 * moving the key position by one step, the bit that
716 * we are moving away from - the bit at position
717 * (inode->pos) - is the one that will differ between
718 * left and right. So... we synthesize that bit in the
720 * The mask 'm' below will be a single "one" bit at
721 * the position (inode->pos)
724 /* Use the old key, but set the new significant
728 left
= (struct tnode
*) tnode_get_child(tn
, 2*i
);
729 put_child(t
, tn
, 2*i
, NULL
);
733 right
= (struct tnode
*) tnode_get_child(tn
, 2*i
+1);
734 put_child(t
, tn
, 2*i
+1, NULL
);
738 size
= tnode_child_length(left
);
739 for (j
= 0; j
< size
; j
++) {
740 put_child(t
, left
, j
, inode
->child
[j
]);
741 put_child(t
, right
, j
, inode
->child
[j
+ size
]);
743 put_child(t
, tn
, 2*i
, resize(t
, left
));
744 put_child(t
, tn
, 2*i
+1, resize(t
, right
));
748 tnode_free(oldtnode
);
752 int size
= tnode_child_length(tn
);
755 for(j
= 0; j
< size
; j
++)
757 tnode_free((struct tnode
*)tn
->child
[j
]);
761 return ERR_PTR(-ENOMEM
);
765 static struct tnode
*halve(struct trie
*t
, struct tnode
*tn
)
767 struct tnode
*oldtnode
= tn
;
768 struct node
*left
, *right
;
770 int olen
= tnode_child_length(tn
);
774 tn
= tnode_new(oldtnode
->key
, oldtnode
->pos
, oldtnode
->bits
- 1);
777 return ERR_PTR(-ENOMEM
);
780 * Preallocate and store tnodes before the actual work so we
781 * don't get into an inconsistent state if memory allocation
782 * fails. In case of failure we return the oldnode and halve
783 * of tnode is ignored.
786 for (i
= 0; i
< olen
; i
+= 2) {
787 left
= tnode_get_child(oldtnode
, i
);
788 right
= tnode_get_child(oldtnode
, i
+1);
790 /* Two nonempty children */
794 newn
= tnode_new(left
->key
, tn
->pos
+ tn
->bits
, 1);
799 put_child(t
, tn
, i
/2, (struct node
*)newn
);
804 for (i
= 0; i
< olen
; i
+= 2) {
805 struct tnode
*newBinNode
;
807 left
= tnode_get_child(oldtnode
, i
);
808 right
= tnode_get_child(oldtnode
, i
+1);
810 /* At least one of the children is empty */
812 if (right
== NULL
) /* Both are empty */
814 put_child(t
, tn
, i
/2, right
);
819 put_child(t
, tn
, i
/2, left
);
823 /* Two nonempty children */
824 newBinNode
= (struct tnode
*) tnode_get_child(tn
, i
/2);
825 put_child(t
, tn
, i
/2, NULL
);
829 put_child(t
, newBinNode
, 0, left
);
830 put_child(t
, newBinNode
, 1, right
);
831 put_child(t
, tn
, i
/2, resize(t
, newBinNode
));
833 tnode_free(oldtnode
);
837 int size
= tnode_child_length(tn
);
840 for(j
= 0; j
< size
; j
++)
842 tnode_free((struct tnode
*)tn
->child
[j
]);
846 return ERR_PTR(-ENOMEM
);
850 static void trie_init(struct trie
*t
)
858 #ifdef CONFIG_IP_FIB_TRIE_STATS
859 memset(&t
->stats
, 0, sizeof(struct trie_use_stats
));
863 static struct leaf_info
*find_leaf_info(struct hlist_head
*head
, int plen
)
865 struct hlist_node
*node
;
866 struct leaf_info
*li
;
868 hlist_for_each_entry(li
, node
, head
, hlist
)
869 if (li
->plen
== plen
)
875 static inline struct list_head
* get_fa_head(struct leaf
*l
, int plen
)
877 struct leaf_info
*li
= find_leaf_info(&l
->list
, plen
);
885 static void insert_leaf_info(struct hlist_head
*head
, struct leaf_info
*new)
887 struct leaf_info
*li
= NULL
, *last
= NULL
;
888 struct hlist_node
*node
;
890 write_lock_bh(&fib_lock
);
892 if (hlist_empty(head
)) {
893 hlist_add_head(&new->hlist
, head
);
895 hlist_for_each_entry(li
, node
, head
, hlist
) {
896 if (new->plen
> li
->plen
)
902 hlist_add_after(&last
->hlist
, &new->hlist
);
904 hlist_add_before(&new->hlist
, &li
->hlist
);
906 write_unlock_bh(&fib_lock
);
910 fib_find_node(struct trie
*t
, u32 key
)
919 while (n
!= NULL
&& NODE_TYPE(n
) == T_TNODE
) {
920 tn
= (struct tnode
*) n
;
924 if (tkey_sub_equals(tn
->key
, pos
, tn
->pos
-pos
, key
)) {
925 pos
= tn
->pos
+ tn
->bits
;
926 n
= tnode_get_child(tn
, tkey_extract_bits(key
, tn
->pos
, tn
->bits
));
930 /* Case we have found a leaf. Compare prefixes */
932 if (n
!= NULL
&& IS_LEAF(n
) && tkey_equals(key
, n
->key
))
933 return (struct leaf
*)n
;
938 static struct node
*trie_rebalance(struct trie
*t
, struct tnode
*tn
)
943 struct tnode
*tp
= NULL
;
950 while (tn
!= NULL
&& NODE_PARENT(tn
) != NULL
) {
952 printk("Rebalance tn=%p \n", tn
);
954 printk("tn->parent=%p \n", NODE_PARENT(tn
));
956 printk("Rebalance tp=%p \n", tp
);
958 printk("tp->parent=%p \n", NODE_PARENT(tp
));
961 BUG_ON(i
> 12); /* Why is this a bug? -ojn */
964 tp
= NODE_PARENT(tn
);
965 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
966 wasfull
= tnode_full(tp
, tnode_get_child(tp
, cindex
));
967 tn
= (struct tnode
*) resize (t
, (struct tnode
*)tn
);
968 tnode_put_child_reorg((struct tnode
*)tp
, cindex
,(struct node
*)tn
, wasfull
);
970 if (!NODE_PARENT(tn
))
973 tn
= NODE_PARENT(tn
);
975 /* Handle last (top) tnode */
977 tn
= (struct tnode
*) resize(t
, (struct tnode
*)tn
);
979 return (struct node
*) tn
;
982 static struct list_head
*
983 fib_insert_node(struct trie
*t
, int *err
, u32 key
, int plen
)
986 struct tnode
*tp
= NULL
, *tn
= NULL
;
990 struct list_head
*fa_head
= NULL
;
991 struct leaf_info
*li
;
997 /* If we point to NULL, stop. Either the tree is empty and we should
998 * just put a new leaf in if, or we have reached an empty child slot,
999 * and we should just put our new leaf in that.
1000 * If we point to a T_TNODE, check if it matches our key. Note that
1001 * a T_TNODE might be skipping any number of bits - its 'pos' need
1002 * not be the parent's 'pos'+'bits'!
1004 * If it does match the current key, get pos/bits from it, extract
1005 * the index from our key, push the T_TNODE and walk the tree.
1007 * If it doesn't, we have to replace it with a new T_TNODE.
1009 * If we point to a T_LEAF, it might or might not have the same key
1010 * as we do. If it does, just change the value, update the T_LEAF's
1011 * value, and return it.
1012 * If it doesn't, we need to replace it with a T_TNODE.
1015 while (n
!= NULL
&& NODE_TYPE(n
) == T_TNODE
) {
1016 tn
= (struct tnode
*) n
;
1020 if (tkey_sub_equals(tn
->key
, pos
, tn
->pos
-pos
, key
)) {
1022 pos
= tn
->pos
+ tn
->bits
;
1023 n
= tnode_get_child(tn
, tkey_extract_bits(key
, tn
->pos
, tn
->bits
));
1025 if (n
&& NODE_PARENT(n
) != tn
) {
1026 printk("BUG tn=%p, n->parent=%p\n", tn
, NODE_PARENT(n
));
1034 * n ----> NULL, LEAF or TNODE
1036 * tp is n's (parent) ----> NULL or TNODE
1039 BUG_ON(tp
&& IS_LEAF(tp
));
1041 /* Case 1: n is a leaf. Compare prefixes */
1043 if (n
!= NULL
&& IS_LEAF(n
) && tkey_equals(key
, n
->key
)) {
1044 struct leaf
*l
= (struct leaf
*) n
;
1046 li
= leaf_info_new(plen
);
1053 fa_head
= &li
->falh
;
1054 insert_leaf_info(&l
->list
, li
);
1066 li
= leaf_info_new(plen
);
1069 tnode_free((struct tnode
*) l
);
1074 fa_head
= &li
->falh
;
1075 insert_leaf_info(&l
->list
, li
);
1077 if (t
->trie
&& n
== NULL
) {
1078 /* Case 2: n is NULL, and will just insert a new leaf */
1080 NODE_SET_PARENT(l
, tp
);
1084 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1085 put_child(t
, (struct tnode
*)tp
, cindex
, (struct node
*)l
);
1087 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1089 * Add a new tnode here
1090 * first tnode need some special handling
1094 pos
= tp
->pos
+tp
->bits
;
1099 newpos
= tkey_mismatch(key
, pos
, n
->key
);
1100 tn
= tnode_new(n
->key
, newpos
, 1);
1103 tn
= tnode_new(key
, newpos
, 1); /* First tnode */
1108 tnode_free((struct tnode
*) l
);
1113 NODE_SET_PARENT(tn
, tp
);
1115 missbit
= tkey_extract_bits(key
, newpos
, 1);
1116 put_child(t
, tn
, missbit
, (struct node
*)l
);
1117 put_child(t
, tn
, 1-missbit
, n
);
1120 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1121 put_child(t
, (struct tnode
*)tp
, cindex
, (struct node
*)tn
);
1123 t
->trie
= (struct node
*) tn
; /* First tnode */
1128 if (tp
&& tp
->pos
+ tp
->bits
> 32)
1129 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1130 tp
, tp
->pos
, tp
->bits
, key
, plen
);
1132 /* Rebalance the trie */
1133 t
->trie
= trie_rebalance(t
, tp
);
1141 fn_trie_insert(struct fib_table
*tb
, struct rtmsg
*r
, struct kern_rta
*rta
,
1142 struct nlmsghdr
*nlhdr
, struct netlink_skb_parms
*req
)
1144 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1145 struct fib_alias
*fa
, *new_fa
;
1146 struct list_head
*fa_head
= NULL
;
1147 struct fib_info
*fi
;
1148 int plen
= r
->rtm_dst_len
;
1149 int type
= r
->rtm_type
;
1150 u8 tos
= r
->rtm_tos
;
1160 memcpy(&key
, rta
->rta_dst
, 4);
1164 DBG("Insert table=%d %08x/%d\n", tb
->tb_id
, key
, plen
);
1166 mask
= ntohl(inet_make_mask(plen
));
1173 fi
= fib_create_info(r
, rta
, nlhdr
, &err
);
1178 l
= fib_find_node(t
, key
);
1182 fa_head
= get_fa_head(l
, plen
);
1183 fa
= fib_find_alias(fa_head
, tos
, fi
->fib_priority
);
1186 /* Now fa, if non-NULL, points to the first fib alias
1187 * with the same keys [prefix,tos,priority], if such key already
1188 * exists or to the node before which we will insert new one.
1190 * If fa is NULL, we will need to allocate a new one and
1191 * insert to the head of f.
1193 * If f is NULL, no fib node matched the destination key
1194 * and we need to allocate a new one of those as well.
1197 if (fa
&& fa
->fa_info
->fib_priority
== fi
->fib_priority
) {
1198 struct fib_alias
*fa_orig
;
1201 if (nlhdr
->nlmsg_flags
& NLM_F_EXCL
)
1204 if (nlhdr
->nlmsg_flags
& NLM_F_REPLACE
) {
1205 struct fib_info
*fi_drop
;
1208 write_lock_bh(&fib_lock
);
1210 fi_drop
= fa
->fa_info
;
1213 fa
->fa_scope
= r
->rtm_scope
;
1214 state
= fa
->fa_state
;
1215 fa
->fa_state
&= ~FA_S_ACCESSED
;
1217 write_unlock_bh(&fib_lock
);
1219 fib_release_info(fi_drop
);
1220 if (state
& FA_S_ACCESSED
)
1225 /* Error if we find a perfect match which
1226 * uses the same scope, type, and nexthop
1230 list_for_each_entry(fa
, fa_orig
->fa_list
.prev
, fa_list
) {
1231 if (fa
->fa_tos
!= tos
)
1233 if (fa
->fa_info
->fib_priority
!= fi
->fib_priority
)
1235 if (fa
->fa_type
== type
&&
1236 fa
->fa_scope
== r
->rtm_scope
&&
1237 fa
->fa_info
== fi
) {
1241 if (!(nlhdr
->nlmsg_flags
& NLM_F_APPEND
))
1245 if (!(nlhdr
->nlmsg_flags
& NLM_F_CREATE
))
1249 new_fa
= kmem_cache_alloc(fn_alias_kmem
, SLAB_KERNEL
);
1253 new_fa
->fa_info
= fi
;
1254 new_fa
->fa_tos
= tos
;
1255 new_fa
->fa_type
= type
;
1256 new_fa
->fa_scope
= r
->rtm_scope
;
1257 new_fa
->fa_state
= 0;
1259 * Insert new entry to the list.
1263 fa_head
= fib_insert_node(t
, &err
, key
, plen
);
1266 goto out_free_new_fa
;
1269 write_lock_bh(&fib_lock
);
1271 list_add_tail(&new_fa
->fa_list
, (fa
? &fa
->fa_list
: fa_head
));
1273 write_unlock_bh(&fib_lock
);
1276 rtmsg_fib(RTM_NEWROUTE
, htonl(key
), new_fa
, plen
, tb
->tb_id
, nlhdr
, req
);
1281 kmem_cache_free(fn_alias_kmem
, new_fa
);
1283 fib_release_info(fi
);
1288 static inline int check_leaf(struct trie
*t
, struct leaf
*l
, t_key key
, int *plen
, const struct flowi
*flp
,
1289 struct fib_result
*res
)
1293 struct leaf_info
*li
;
1294 struct hlist_head
*hhead
= &l
->list
;
1295 struct hlist_node
*node
;
1297 hlist_for_each_entry(li
, node
, hhead
, hlist
) {
1299 mask
= ntohl(inet_make_mask(i
));
1300 if (l
->key
!= (key
& mask
))
1303 if ((err
= fib_semantic_match(&li
->falh
, flp
, res
, l
->key
, mask
, i
)) <= 0) {
1305 #ifdef CONFIG_IP_FIB_TRIE_STATS
1306 t
->stats
.semantic_match_passed
++;
1310 #ifdef CONFIG_IP_FIB_TRIE_STATS
1311 t
->stats
.semantic_match_miss
++;
1318 fn_trie_lookup(struct fib_table
*tb
, const struct flowi
*flp
, struct fib_result
*res
)
1320 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1325 t_key key
= ntohl(flp
->fl4_dst
);
1328 int current_prefix_length
= KEYLENGTH
;
1330 t_key node_prefix
, key_prefix
, pref_mismatch
;
1335 read_lock(&fib_lock
);
1340 #ifdef CONFIG_IP_FIB_TRIE_STATS
1346 if ((ret
= check_leaf(t
, (struct leaf
*)n
, key
, &plen
, flp
, res
)) <= 0)
1350 pn
= (struct tnode
*) n
;
1358 cindex
= tkey_extract_bits(MASK_PFX(key
, current_prefix_length
), pos
, bits
);
1360 n
= tnode_get_child(pn
, cindex
);
1363 #ifdef CONFIG_IP_FIB_TRIE_STATS
1364 t
->stats
.null_node_hit
++;
1370 if ((ret
= check_leaf(t
, (struct leaf
*)n
, key
, &plen
, flp
, res
)) <= 0)
1378 cn
= (struct tnode
*)n
;
1381 * It's a tnode, and we can do some extra checks here if we
1382 * like, to avoid descending into a dead-end branch.
1383 * This tnode is in the parent's child array at index
1384 * key[p_pos..p_pos+p_bits] but potentially with some bits
1385 * chopped off, so in reality the index may be just a
1386 * subprefix, padded with zero at the end.
1387 * We can also take a look at any skipped bits in this
1388 * tnode - everything up to p_pos is supposed to be ok,
1389 * and the non-chopped bits of the index (se previous
1390 * paragraph) are also guaranteed ok, but the rest is
1391 * considered unknown.
1393 * The skipped bits are key[pos+bits..cn->pos].
1396 /* If current_prefix_length < pos+bits, we are already doing
1397 * actual prefix matching, which means everything from
1398 * pos+(bits-chopped_off) onward must be zero along some
1399 * branch of this subtree - otherwise there is *no* valid
1400 * prefix present. Here we can only check the skipped
1401 * bits. Remember, since we have already indexed into the
1402 * parent's child array, we know that the bits we chopped of
1406 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1408 if (current_prefix_length
< pos
+bits
) {
1409 if (tkey_extract_bits(cn
->key
, current_prefix_length
,
1410 cn
->pos
- current_prefix_length
) != 0 ||
1416 * If chopped_off=0, the index is fully validated and we
1417 * only need to look at the skipped bits for this, the new,
1418 * tnode. What we actually want to do is to find out if
1419 * these skipped bits match our key perfectly, or if we will
1420 * have to count on finding a matching prefix further down,
1421 * because if we do, we would like to have some way of
1422 * verifying the existence of such a prefix at this point.
1425 /* The only thing we can do at this point is to verify that
1426 * any such matching prefix can indeed be a prefix to our
1427 * key, and if the bits in the node we are inspecting that
1428 * do not match our key are not ZERO, this cannot be true.
1429 * Thus, find out where there is a mismatch (before cn->pos)
1430 * and verify that all the mismatching bits are zero in the
1434 /* Note: We aren't very concerned about the piece of the key
1435 * that precede pn->pos+pn->bits, since these have already been
1436 * checked. The bits after cn->pos aren't checked since these are
1437 * by definition "unknown" at this point. Thus, what we want to
1438 * see is if we are about to enter the "prefix matching" state,
1439 * and in that case verify that the skipped bits that will prevail
1440 * throughout this subtree are zero, as they have to be if we are
1441 * to find a matching prefix.
1444 node_prefix
= MASK_PFX(cn
->key
, cn
->pos
);
1445 key_prefix
= MASK_PFX(key
, cn
->pos
);
1446 pref_mismatch
= key_prefix
^node_prefix
;
1449 /* In short: If skipped bits in this node do not match the search
1450 * key, enter the "prefix matching" state.directly.
1452 if (pref_mismatch
) {
1453 while (!(pref_mismatch
& (1<<(KEYLENGTH
-1)))) {
1455 pref_mismatch
= pref_mismatch
<<1;
1457 key_prefix
= tkey_extract_bits(cn
->key
, mp
, cn
->pos
-mp
);
1459 if (key_prefix
!= 0)
1462 if (current_prefix_length
>= cn
->pos
)
1463 current_prefix_length
= mp
;
1466 pn
= (struct tnode
*)n
; /* Descend */
1473 /* As zero don't change the child key (cindex) */
1474 while ((chopped_off
<= pn
->bits
) && !(cindex
& (1<<(chopped_off
-1))))
1477 /* Decrease current_... with bits chopped off */
1478 if (current_prefix_length
> pn
->pos
+ pn
->bits
- chopped_off
)
1479 current_prefix_length
= pn
->pos
+ pn
->bits
- chopped_off
;
1482 * Either we do the actual chop off according or if we have
1483 * chopped off all bits in this tnode walk up to our parent.
1486 if (chopped_off
<= pn
->bits
) {
1487 cindex
&= ~(1 << (chopped_off
-1));
1489 if (NODE_PARENT(pn
) == NULL
)
1492 /* Get Child's index */
1493 cindex
= tkey_extract_bits(pn
->key
, NODE_PARENT(pn
)->pos
, NODE_PARENT(pn
)->bits
);
1494 pn
= NODE_PARENT(pn
);
1497 #ifdef CONFIG_IP_FIB_TRIE_STATS
1498 t
->stats
.backtrack
++;
1506 read_unlock(&fib_lock
);
1510 static int trie_leaf_remove(struct trie
*t
, t_key key
)
1513 struct tnode
*tp
= NULL
;
1514 struct node
*n
= t
->trie
;
1517 DBG("entering trie_leaf_remove(%p)\n", n
);
1519 /* Note that in the case skipped bits, those bits are *not* checked!
1520 * When we finish this, we will have NULL or a T_LEAF, and the
1521 * T_LEAF may or may not match our key.
1524 while (n
!= NULL
&& IS_TNODE(n
)) {
1525 struct tnode
*tn
= (struct tnode
*) n
;
1527 n
= tnode_get_child(tn
,tkey_extract_bits(key
, tn
->pos
, tn
->bits
));
1529 if (n
&& NODE_PARENT(n
) != tn
) {
1530 printk("BUG tn=%p, n->parent=%p\n", tn
, NODE_PARENT(n
));
1534 l
= (struct leaf
*) n
;
1536 if (!n
|| !tkey_equals(l
->key
, key
))
1541 * Remove the leaf and rebalance the tree
1547 tp
= NODE_PARENT(n
);
1548 tnode_free((struct tnode
*) n
);
1551 cindex
= tkey_extract_bits(key
, tp
->pos
, tp
->bits
);
1552 put_child(t
, (struct tnode
*)tp
, cindex
, NULL
);
1553 t
->trie
= trie_rebalance(t
, tp
);
1561 fn_trie_delete(struct fib_table
*tb
, struct rtmsg
*r
, struct kern_rta
*rta
,
1562 struct nlmsghdr
*nlhdr
, struct netlink_skb_parms
*req
)
1564 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1566 int plen
= r
->rtm_dst_len
;
1567 u8 tos
= r
->rtm_tos
;
1568 struct fib_alias
*fa
, *fa_to_delete
;
1569 struct list_head
*fa_head
;
1572 struct leaf_info
*li
;
1580 memcpy(&key
, rta
->rta_dst
, 4);
1583 mask
= ntohl(inet_make_mask(plen
));
1589 l
= fib_find_node(t
, key
);
1594 fa_head
= get_fa_head(l
, plen
);
1595 fa
= fib_find_alias(fa_head
, tos
, 0);
1600 DBG("Deleting %08x/%d tos=%d t=%p\n", key
, plen
, tos
, t
);
1602 fa_to_delete
= NULL
;
1603 fa_head
= fa
->fa_list
.prev
;
1604 list_for_each_entry(fa
, fa_head
, fa_list
) {
1605 struct fib_info
*fi
= fa
->fa_info
;
1607 if (fa
->fa_tos
!= tos
)
1610 if ((!r
->rtm_type
||
1611 fa
->fa_type
== r
->rtm_type
) &&
1612 (r
->rtm_scope
== RT_SCOPE_NOWHERE
||
1613 fa
->fa_scope
== r
->rtm_scope
) &&
1614 (!r
->rtm_protocol
||
1615 fi
->fib_protocol
== r
->rtm_protocol
) &&
1616 fib_nh_match(r
, nlhdr
, rta
, fi
) == 0) {
1626 rtmsg_fib(RTM_DELROUTE
, htonl(key
), fa
, plen
, tb
->tb_id
, nlhdr
, req
);
1628 l
= fib_find_node(t
, key
);
1629 li
= find_leaf_info(&l
->list
, plen
);
1631 write_lock_bh(&fib_lock
);
1633 list_del(&fa
->fa_list
);
1635 if (list_empty(fa_head
)) {
1636 hlist_del(&li
->hlist
);
1639 write_unlock_bh(&fib_lock
);
1644 if (hlist_empty(&l
->list
))
1645 trie_leaf_remove(t
, key
);
1647 if (fa
->fa_state
& FA_S_ACCESSED
)
1654 static int trie_flush_list(struct trie
*t
, struct list_head
*head
)
1656 struct fib_alias
*fa
, *fa_node
;
1659 list_for_each_entry_safe(fa
, fa_node
, head
, fa_list
) {
1660 struct fib_info
*fi
= fa
->fa_info
;
1662 if (fi
&& (fi
->fib_flags
&RTNH_F_DEAD
)) {
1663 write_lock_bh(&fib_lock
);
1664 list_del(&fa
->fa_list
);
1665 write_unlock_bh(&fib_lock
);
1674 static int trie_flush_leaf(struct trie
*t
, struct leaf
*l
)
1677 struct hlist_head
*lih
= &l
->list
;
1678 struct hlist_node
*node
, *tmp
;
1679 struct leaf_info
*li
= NULL
;
1681 hlist_for_each_entry_safe(li
, node
, tmp
, lih
, hlist
) {
1682 found
+= trie_flush_list(t
, &li
->falh
);
1684 if (list_empty(&li
->falh
)) {
1685 write_lock_bh(&fib_lock
);
1686 hlist_del(&li
->hlist
);
1687 write_unlock_bh(&fib_lock
);
1695 static struct leaf
*nextleaf(struct trie
*t
, struct leaf
*thisleaf
)
1697 struct node
*c
= (struct node
*) thisleaf
;
1702 if (t
->trie
== NULL
)
1705 if (IS_LEAF(t
->trie
)) /* trie w. just a leaf */
1706 return (struct leaf
*) t
->trie
;
1708 p
= (struct tnode
*) t
->trie
; /* Start */
1710 p
= (struct tnode
*) NODE_PARENT(c
);
1715 /* Find the next child of the parent */
1717 pos
= 1 + tkey_extract_bits(c
->key
, p
->pos
, p
->bits
);
1721 last
= 1 << p
->bits
;
1722 for (idx
= pos
; idx
< last
; idx
++) {
1726 /* Decend if tnode */
1727 while (IS_TNODE(p
->child
[idx
])) {
1728 p
= (struct tnode
*) p
->child
[idx
];
1731 /* Rightmost non-NULL branch */
1732 if (p
&& IS_TNODE(p
))
1733 while (p
->child
[idx
] == NULL
&& idx
< (1 << p
->bits
)) idx
++;
1735 /* Done with this tnode? */
1736 if (idx
>= (1 << p
->bits
) || p
->child
[idx
] == NULL
)
1739 return (struct leaf
*) p
->child
[idx
];
1742 /* No more children go up one step */
1743 c
= (struct node
*) p
;
1744 p
= (struct tnode
*) NODE_PARENT(p
);
1746 return NULL
; /* Ready. Root of trie */
1749 static int fn_trie_flush(struct fib_table
*tb
)
1751 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1752 struct leaf
*ll
= NULL
, *l
= NULL
;
1757 for (h
= 0; (l
= nextleaf(t
, l
)) != NULL
; h
++) {
1758 found
+= trie_flush_leaf(t
, l
);
1760 if (ll
&& hlist_empty(&ll
->list
))
1761 trie_leaf_remove(t
, ll
->key
);
1765 if (ll
&& hlist_empty(&ll
->list
))
1766 trie_leaf_remove(t
, ll
->key
);
1768 DBG("trie_flush found=%d\n", found
);
1772 static int trie_last_dflt
= -1;
1775 fn_trie_select_default(struct fib_table
*tb
, const struct flowi
*flp
, struct fib_result
*res
)
1777 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1778 int order
, last_idx
;
1779 struct fib_info
*fi
= NULL
;
1780 struct fib_info
*last_resort
;
1781 struct fib_alias
*fa
= NULL
;
1782 struct list_head
*fa_head
;
1789 read_lock(&fib_lock
);
1791 l
= fib_find_node(t
, 0);
1795 fa_head
= get_fa_head(l
, 0);
1799 if (list_empty(fa_head
))
1802 list_for_each_entry(fa
, fa_head
, fa_list
) {
1803 struct fib_info
*next_fi
= fa
->fa_info
;
1805 if (fa
->fa_scope
!= res
->scope
||
1806 fa
->fa_type
!= RTN_UNICAST
)
1809 if (next_fi
->fib_priority
> res
->fi
->fib_priority
)
1811 if (!next_fi
->fib_nh
[0].nh_gw
||
1812 next_fi
->fib_nh
[0].nh_scope
!= RT_SCOPE_LINK
)
1814 fa
->fa_state
|= FA_S_ACCESSED
;
1817 if (next_fi
!= res
->fi
)
1819 } else if (!fib_detect_death(fi
, order
, &last_resort
,
1820 &last_idx
, &trie_last_dflt
)) {
1822 fib_info_put(res
->fi
);
1824 atomic_inc(&fi
->fib_clntref
);
1825 trie_last_dflt
= order
;
1831 if (order
<= 0 || fi
== NULL
) {
1832 trie_last_dflt
= -1;
1836 if (!fib_detect_death(fi
, order
, &last_resort
, &last_idx
, &trie_last_dflt
)) {
1838 fib_info_put(res
->fi
);
1840 atomic_inc(&fi
->fib_clntref
);
1841 trie_last_dflt
= order
;
1844 if (last_idx
>= 0) {
1846 fib_info_put(res
->fi
);
1847 res
->fi
= last_resort
;
1849 atomic_inc(&last_resort
->fib_clntref
);
1851 trie_last_dflt
= last_idx
;
1853 read_unlock(&fib_lock
);
1856 static int fn_trie_dump_fa(t_key key
, int plen
, struct list_head
*fah
, struct fib_table
*tb
,
1857 struct sk_buff
*skb
, struct netlink_callback
*cb
)
1860 struct fib_alias
*fa
;
1862 u32 xkey
= htonl(key
);
1867 list_for_each_entry(fa
, fah
, fa_list
) {
1872 if (fa
->fa_info
->fib_nh
== NULL
) {
1873 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i
, key
, plen
);
1877 if (fa
->fa_info
== NULL
) {
1878 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i
, key
, plen
);
1883 if (fib_dump_info(skb
, NETLINK_CB(cb
->skb
).pid
,
1892 fa
->fa_info
, 0) < 0) {
1902 static int fn_trie_dump_plen(struct trie
*t
, int plen
, struct fib_table
*tb
, struct sk_buff
*skb
,
1903 struct netlink_callback
*cb
)
1906 struct list_head
*fa_head
;
1907 struct leaf
*l
= NULL
;
1911 for (h
= 0; (l
= nextleaf(t
, l
)) != NULL
; h
++) {
1915 memset(&cb
->args
[3], 0,
1916 sizeof(cb
->args
) - 3*sizeof(cb
->args
[0]));
1918 fa_head
= get_fa_head(l
, plen
);
1923 if (list_empty(fa_head
))
1926 if (fn_trie_dump_fa(l
->key
, plen
, fa_head
, tb
, skb
, cb
)<0) {
1935 static int fn_trie_dump(struct fib_table
*tb
, struct sk_buff
*skb
, struct netlink_callback
*cb
)
1938 struct trie
*t
= (struct trie
*) tb
->tb_data
;
1942 read_lock(&fib_lock
);
1943 for (m
= 0; m
<= 32; m
++) {
1947 memset(&cb
->args
[2], 0,
1948 sizeof(cb
->args
) - 2*sizeof(cb
->args
[0]));
1950 if (fn_trie_dump_plen(t
, 32-m
, tb
, skb
, cb
)<0) {
1955 read_unlock(&fib_lock
);
1959 read_unlock(&fib_lock
);
1963 /* Fix more generic FIB names for init later */
1965 #ifdef CONFIG_IP_MULTIPLE_TABLES
1966 struct fib_table
* fib_hash_init(int id
)
1968 struct fib_table
* __init
fib_hash_init(int id
)
1971 struct fib_table
*tb
;
1974 if (fn_alias_kmem
== NULL
)
1975 fn_alias_kmem
= kmem_cache_create("ip_fib_alias",
1976 sizeof(struct fib_alias
),
1977 0, SLAB_HWCACHE_ALIGN
,
1980 tb
= kmalloc(sizeof(struct fib_table
) + sizeof(struct trie
),
1986 tb
->tb_lookup
= fn_trie_lookup
;
1987 tb
->tb_insert
= fn_trie_insert
;
1988 tb
->tb_delete
= fn_trie_delete
;
1989 tb
->tb_flush
= fn_trie_flush
;
1990 tb
->tb_select_default
= fn_trie_select_default
;
1991 tb
->tb_dump
= fn_trie_dump
;
1992 memset(tb
->tb_data
, 0, sizeof(struct trie
));
1994 t
= (struct trie
*) tb
->tb_data
;
1998 if (id
== RT_TABLE_LOCAL
)
2000 else if (id
== RT_TABLE_MAIN
)
2003 if (id
== RT_TABLE_LOCAL
)
2004 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION
);
2009 /* Trie dump functions */
2011 static void putspace_seq(struct seq_file
*seq
, int n
)
2014 seq_printf(seq
, " ");
2017 static void printbin_seq(struct seq_file
*seq
, unsigned int v
, int bits
)
2020 seq_printf(seq
, "%s", (v
& (1<<bits
))?"1":"0");
2023 static void printnode_seq(struct seq_file
*seq
, int indent
, struct node
*n
,
2024 int pend
, int cindex
, int bits
)
2026 putspace_seq(seq
, indent
);
2028 seq_printf(seq
, "|");
2030 seq_printf(seq
, "+");
2032 seq_printf(seq
, "%d/", cindex
);
2033 printbin_seq(seq
, cindex
, bits
);
2034 seq_printf(seq
, ": ");
2036 seq_printf(seq
, "<root>: ");
2037 seq_printf(seq
, "%s:%p ", IS_LEAF(n
)?"Leaf":"Internal node", n
);
2040 struct leaf
*l
= (struct leaf
*)n
;
2041 struct fib_alias
*fa
;
2044 seq_printf(seq
, "key=%d.%d.%d.%d\n",
2045 n
->key
>> 24, (n
->key
>> 16) % 256, (n
->key
>> 8) % 256, n
->key
% 256);
2047 for (i
= 32; i
>= 0; i
--)
2048 if (find_leaf_info(&l
->list
, i
)) {
2049 struct list_head
*fa_head
= get_fa_head(l
, i
);
2054 if (list_empty(fa_head
))
2057 putspace_seq(seq
, indent
+2);
2058 seq_printf(seq
, "{/%d...dumping}\n", i
);
2060 list_for_each_entry(fa
, fa_head
, fa_list
) {
2061 putspace_seq(seq
, indent
+2);
2062 if (fa
->fa_info
== NULL
) {
2063 seq_printf(seq
, "Error fa_info=NULL\n");
2066 if (fa
->fa_info
->fib_nh
== NULL
) {
2067 seq_printf(seq
, "Error _fib_nh=NULL\n");
2071 seq_printf(seq
, "{type=%d scope=%d TOS=%d}\n",
2078 struct tnode
*tn
= (struct tnode
*)n
;
2079 int plen
= ((struct tnode
*)n
)->pos
;
2080 t_key prf
= MASK_PFX(n
->key
, plen
);
2082 seq_printf(seq
, "key=%d.%d.%d.%d/%d\n",
2083 prf
>> 24, (prf
>> 16) % 256, (prf
>> 8) % 256, prf
% 256, plen
);
2085 putspace_seq(seq
, indent
); seq_printf(seq
, "| ");
2086 seq_printf(seq
, "{key prefix=%08x/", tn
->key
& TKEY_GET_MASK(0, tn
->pos
));
2087 printbin_seq(seq
, tkey_extract_bits(tn
->key
, 0, tn
->pos
), tn
->pos
);
2088 seq_printf(seq
, "}\n");
2089 putspace_seq(seq
, indent
); seq_printf(seq
, "| ");
2090 seq_printf(seq
, "{pos=%d", tn
->pos
);
2091 seq_printf(seq
, " (skip=%d bits)", tn
->pos
- pend
);
2092 seq_printf(seq
, " bits=%d (%u children)}\n", tn
->bits
, (1 << tn
->bits
));
2093 putspace_seq(seq
, indent
); seq_printf(seq
, "| ");
2094 seq_printf(seq
, "{empty=%d full=%d}\n", tn
->empty_children
, tn
->full_children
);
2098 static void trie_dump_seq(struct seq_file
*seq
, struct trie
*t
)
2100 struct node
*n
= t
->trie
;
2107 read_lock(&fib_lock
);
2109 seq_printf(seq
, "------ trie_dump of t=%p ------\n", t
);
2112 seq_printf(seq
, "------ trie is empty\n");
2114 read_unlock(&fib_lock
);
2118 printnode_seq(seq
, indent
, n
, pend
, cindex
, 0);
2121 read_unlock(&fib_lock
);
2125 tn
= (struct tnode
*)n
;
2126 pend
= tn
->pos
+tn
->bits
;
2127 putspace_seq(seq
, indent
); seq_printf(seq
, "\\--\n");
2131 while (tn
&& cindex
< (1 << tn
->bits
)) {
2132 if (tn
->child
[cindex
]) {
2135 printnode_seq(seq
, indent
, tn
->child
[cindex
], pend
, cindex
, tn
->bits
);
2136 if (IS_LEAF(tn
->child
[cindex
])) {
2140 * New tnode. Decend one level
2144 tn
= (struct tnode
*)tn
->child
[cindex
];
2145 pend
= tn
->pos
+ tn
->bits
;
2146 putspace_seq(seq
, indent
); seq_printf(seq
, "\\--\n");
2154 * Test if we are done
2157 while (cindex
>= (1 << tn
->bits
)) {
2159 * Move upwards and test for root
2160 * pop off all traversed nodes
2163 if (NODE_PARENT(tn
) == NULL
) {
2168 cindex
= tkey_extract_bits(tn
->key
, NODE_PARENT(tn
)->pos
, NODE_PARENT(tn
)->bits
);
2170 tn
= NODE_PARENT(tn
);
2171 pend
= tn
->pos
+ tn
->bits
;
2177 read_unlock(&fib_lock
);
2180 static struct trie_stat
*trie_stat_new(void)
2182 struct trie_stat
*s
;
2185 s
= kmalloc(sizeof(struct trie_stat
), GFP_KERNEL
);
2193 s
->nullpointers
= 0;
2195 for (i
= 0; i
< MAX_CHILDS
; i
++)
2196 s
->nodesizes
[i
] = 0;
2201 static struct trie_stat
*trie_collect_stats(struct trie
*t
)
2203 struct node
*n
= t
->trie
;
2204 struct trie_stat
*s
= trie_stat_new();
2214 read_lock(&fib_lock
);
2217 struct tnode
*tn
= (struct tnode
*)n
;
2218 pend
= tn
->pos
+tn
->bits
;
2219 s
->nodesizes
[tn
->bits
]++;
2222 while (tn
&& cindex
< (1 << tn
->bits
)) {
2223 if (tn
->child
[cindex
]) {
2226 if (IS_LEAF(tn
->child
[cindex
])) {
2230 if (depth
> s
->maxdepth
)
2231 s
->maxdepth
= depth
;
2232 s
->totdepth
+= depth
;
2236 * New tnode. Decend one level
2240 s
->nodesizes
[tn
->bits
]++;
2243 n
= tn
->child
[cindex
];
2244 tn
= (struct tnode
*)n
;
2245 pend
= tn
->pos
+tn
->bits
;
2255 * Test if we are done
2258 while (cindex
>= (1 << tn
->bits
)) {
2260 * Move upwards and test for root
2261 * pop off all traversed nodes
2264 if (NODE_PARENT(tn
) == NULL
) {
2270 cindex
= tkey_extract_bits(tn
->key
, NODE_PARENT(tn
)->pos
, NODE_PARENT(tn
)->bits
);
2271 tn
= NODE_PARENT(tn
);
2273 n
= (struct node
*)tn
;
2274 pend
= tn
->pos
+tn
->bits
;
2280 read_unlock(&fib_lock
);
2284 #ifdef CONFIG_PROC_FS
2286 static struct fib_alias
*fib_triestat_get_first(struct seq_file
*seq
)
2291 static struct fib_alias
*fib_triestat_get_next(struct seq_file
*seq
)
2296 static void *fib_triestat_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2298 if (!ip_fib_main_table
)
2302 return fib_triestat_get_next(seq
);
2304 return SEQ_START_TOKEN
;
2307 static void *fib_triestat_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2310 if (v
== SEQ_START_TOKEN
)
2311 return fib_triestat_get_first(seq
);
2313 return fib_triestat_get_next(seq
);
2316 static void fib_triestat_seq_stop(struct seq_file
*seq
, void *v
)
2322 * This outputs /proc/net/fib_triestats
2324 * It always works in backward compatibility mode.
2325 * The format of the file is not supposed to be changed.
2328 static void collect_and_show(struct trie
*t
, struct seq_file
*seq
)
2330 int bytes
= 0; /* How many bytes are used, a ref is 4 bytes */
2331 int i
, max
, pointers
;
2332 struct trie_stat
*stat
;
2335 stat
= trie_collect_stats(t
);
2338 seq_printf(seq
, "trie=%p\n", t
);
2342 avdepth
= stat
->totdepth
*100 / stat
->leaves
;
2345 seq_printf(seq
, "Aver depth: %d.%02d\n", avdepth
/ 100, avdepth
% 100);
2346 seq_printf(seq
, "Max depth: %4d\n", stat
->maxdepth
);
2348 seq_printf(seq
, "Leaves: %d\n", stat
->leaves
);
2349 bytes
+= sizeof(struct leaf
) * stat
->leaves
;
2350 seq_printf(seq
, "Internal nodes: %d\n", stat
->tnodes
);
2351 bytes
+= sizeof(struct tnode
) * stat
->tnodes
;
2355 while (max
>= 0 && stat
->nodesizes
[max
] == 0)
2359 for (i
= 1; i
<= max
; i
++)
2360 if (stat
->nodesizes
[i
] != 0) {
2361 seq_printf(seq
, " %d: %d", i
, stat
->nodesizes
[i
]);
2362 pointers
+= (1<<i
) * stat
->nodesizes
[i
];
2364 seq_printf(seq
, "\n");
2365 seq_printf(seq
, "Pointers: %d\n", pointers
);
2366 bytes
+= sizeof(struct node
*) * pointers
;
2367 seq_printf(seq
, "Null ptrs: %d\n", stat
->nullpointers
);
2368 seq_printf(seq
, "Total size: %d kB\n", bytes
/ 1024);
2373 #ifdef CONFIG_IP_FIB_TRIE_STATS
2374 seq_printf(seq
, "Counters:\n---------\n");
2375 seq_printf(seq
,"gets = %d\n", t
->stats
.gets
);
2376 seq_printf(seq
,"backtracks = %d\n", t
->stats
.backtrack
);
2377 seq_printf(seq
,"semantic match passed = %d\n", t
->stats
.semantic_match_passed
);
2378 seq_printf(seq
,"semantic match miss = %d\n", t
->stats
.semantic_match_miss
);
2379 seq_printf(seq
,"null node hit= %d\n", t
->stats
.null_node_hit
);
2380 seq_printf(seq
,"skipped node resize = %d\n", t
->stats
.resize_node_skipped
);
2382 memset(&(t
->stats
), 0, sizeof(t
->stats
));
2384 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2387 static int fib_triestat_seq_show(struct seq_file
*seq
, void *v
)
2391 if (v
== SEQ_START_TOKEN
) {
2392 seq_printf(seq
, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2393 sizeof(struct leaf
), sizeof(struct tnode
));
2395 collect_and_show(trie_local
, seq
);
2398 collect_and_show(trie_main
, seq
);
2400 snprintf(bf
, sizeof(bf
), "*\t%08X\t%08X", 200, 400);
2402 seq_printf(seq
, "%-127s\n", bf
);
2407 static struct seq_operations fib_triestat_seq_ops
= {
2408 .start
= fib_triestat_seq_start
,
2409 .next
= fib_triestat_seq_next
,
2410 .stop
= fib_triestat_seq_stop
,
2411 .show
= fib_triestat_seq_show
,
2414 static int fib_triestat_seq_open(struct inode
*inode
, struct file
*file
)
2416 struct seq_file
*seq
;
2419 rc
= seq_open(file
, &fib_triestat_seq_ops
);
2423 seq
= file
->private_data
;
2430 static struct file_operations fib_triestat_seq_fops
= {
2431 .owner
= THIS_MODULE
,
2432 .open
= fib_triestat_seq_open
,
2434 .llseek
= seq_lseek
,
2435 .release
= seq_release_private
,
2438 int __init
fib_stat_proc_init(void)
2440 if (!proc_net_fops_create("fib_triestat", S_IRUGO
, &fib_triestat_seq_fops
))
2445 void __init
fib_stat_proc_exit(void)
2447 proc_net_remove("fib_triestat");
2450 static struct fib_alias
*fib_trie_get_first(struct seq_file
*seq
)
2455 static struct fib_alias
*fib_trie_get_next(struct seq_file
*seq
)
2460 static void *fib_trie_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2462 if (!ip_fib_main_table
)
2466 return fib_trie_get_next(seq
);
2468 return SEQ_START_TOKEN
;
2471 static void *fib_trie_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2474 if (v
== SEQ_START_TOKEN
)
2475 return fib_trie_get_first(seq
);
2477 return fib_trie_get_next(seq
);
2481 static void fib_trie_seq_stop(struct seq_file
*seq
, void *v
)
2486 * This outputs /proc/net/fib_trie.
2488 * It always works in backward compatibility mode.
2489 * The format of the file is not supposed to be changed.
2492 static int fib_trie_seq_show(struct seq_file
*seq
, void *v
)
2496 if (v
== SEQ_START_TOKEN
) {
2498 trie_dump_seq(seq
, trie_local
);
2501 trie_dump_seq(seq
, trie_main
);
2503 snprintf(bf
, sizeof(bf
),
2504 "*\t%08X\t%08X", 200, 400);
2505 seq_printf(seq
, "%-127s\n", bf
);
2511 static struct seq_operations fib_trie_seq_ops
= {
2512 .start
= fib_trie_seq_start
,
2513 .next
= fib_trie_seq_next
,
2514 .stop
= fib_trie_seq_stop
,
2515 .show
= fib_trie_seq_show
,
2518 static int fib_trie_seq_open(struct inode
*inode
, struct file
*file
)
2520 struct seq_file
*seq
;
2523 rc
= seq_open(file
, &fib_trie_seq_ops
);
2527 seq
= file
->private_data
;
2534 static struct file_operations fib_trie_seq_fops
= {
2535 .owner
= THIS_MODULE
,
2536 .open
= fib_trie_seq_open
,
2538 .llseek
= seq_lseek
,
2539 .release
= seq_release_private
,
2542 int __init
fib_proc_init(void)
2544 if (!proc_net_fops_create("fib_trie", S_IRUGO
, &fib_trie_seq_fops
))
2549 void __init
fib_proc_exit(void)
2551 proc_net_remove("fib_trie");
2554 #endif /* CONFIG_PROC_FS */