914a4c2aae423199f404188b5e1fc4e3ad1203e9
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / ipv4 / fib_trie.c
1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 */
45
46 #define VERSION "0.325"
47
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
55 #include <linux/mm.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/skbuff.h>
66 #include <linux/netlink.h>
67 #include <linux/init.h>
68 #include <linux/list.h>
69 #include <net/ip.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <net/tcp.h>
73 #include <net/sock.h>
74 #include <net/ip_fib.h>
75 #include "fib_lookup.h"
76
77 #undef CONFIG_IP_FIB_TRIE_STATS
78 #define MAX_CHILDS 16384
79
80 #define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81 #define KEYLENGTH (8*sizeof(t_key))
82 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84
85 static DEFINE_RWLOCK(fib_lock);
86
87 typedef unsigned int t_key;
88
89 #define T_TNODE 0
90 #define T_LEAF 1
91 #define NODE_TYPE_MASK 0x1UL
92 #define NODE_PARENT(node) \
93 ((struct tnode *)((node)->parent & ~NODE_TYPE_MASK))
94 #define NODE_SET_PARENT(node, ptr) \
95 ((node)->parent = (((unsigned long)(ptr)) | \
96 ((node)->parent & NODE_TYPE_MASK)))
97 #define NODE_INIT_PARENT(node, type) \
98 ((node)->parent = (type))
99 #define NODE_TYPE(node) \
100 ((node)->parent & NODE_TYPE_MASK)
101
102 #define IS_TNODE(n) (!(n->parent & T_LEAF))
103 #define IS_LEAF(n) (n->parent & T_LEAF)
104
105 struct node {
106 t_key key;
107 unsigned long parent;
108 };
109
110 struct leaf {
111 t_key key;
112 unsigned long parent;
113 struct hlist_head list;
114 };
115
116 struct leaf_info {
117 struct hlist_node hlist;
118 int plen;
119 struct list_head falh;
120 };
121
122 struct tnode {
123 t_key key;
124 unsigned long parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children; /* KEYLENGTH bits needed */
128 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0];
130 };
131
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
140 };
141 #endif
142
143 struct trie_stat {
144 unsigned int totdepth;
145 unsigned int maxdepth;
146 unsigned int tnodes;
147 unsigned int leaves;
148 unsigned int nullpointers;
149 unsigned int nodesizes[MAX_CHILDS];
150 };
151
152 struct trie {
153 struct node *trie;
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
156 #endif
157 int size;
158 unsigned int revision;
159 };
160
161 static int trie_debug = 0;
162
163 #define DBG(x...) do { if (trie_debug) printk(x); } while (0)
164
165 static int tnode_full(struct tnode *tn, struct node *n);
166 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
167 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
168 static int tnode_child_length(struct tnode *tn);
169 static struct node *resize(struct trie *t, struct tnode *tn);
170 static struct tnode *inflate(struct trie *t, struct tnode *tn);
171 static struct tnode *halve(struct trie *t, struct tnode *tn);
172 static void tnode_free(struct tnode *tn);
173 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
174 extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
175 extern int fib_detect_death(struct fib_info *fi, int order,
176 struct fib_info **last_resort, int *last_idx, int *dflt);
177
178 extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
179 struct nlmsghdr *n, struct netlink_skb_parms *req);
180
181 static kmem_cache_t *fn_alias_kmem;
182 static struct trie *trie_local = NULL, *trie_main = NULL;
183
184 static inline struct node *tnode_get_child(struct tnode *tn, int i)
185 {
186 BUG_ON(i >= 1 << tn->bits);
187
188 return tn->child[i];
189 }
190
191 static inline int tnode_child_length(struct tnode *tn)
192 {
193 return 1 << tn->bits;
194 }
195
196 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
197 {
198 if (offset < KEYLENGTH)
199 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
200 else
201 return 0;
202 }
203
204 static inline int tkey_equals(t_key a, t_key b)
205 {
206 return a == b;
207 }
208
209 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
210 {
211 if (bits == 0 || offset >= KEYLENGTH)
212 return 1;
213 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
214 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
215 }
216
217 static inline int tkey_mismatch(t_key a, int offset, t_key b)
218 {
219 t_key diff = a ^ b;
220 int i = offset;
221
222 if (!diff)
223 return 0;
224 while ((diff << i) >> (KEYLENGTH-1) == 0)
225 i++;
226 return i;
227 }
228
229 /* Candidate for fib_semantics */
230
231 static void fn_free_alias(struct fib_alias *fa)
232 {
233 fib_release_info(fa->fa_info);
234 kmem_cache_free(fn_alias_kmem, fa);
235 }
236
237 /*
238 To understand this stuff, an understanding of keys and all their bits is
239 necessary. Every node in the trie has a key associated with it, but not
240 all of the bits in that key are significant.
241
242 Consider a node 'n' and its parent 'tp'.
243
244 If n is a leaf, every bit in its key is significant. Its presence is
245 necessitaded by path compression, since during a tree traversal (when
246 searching for a leaf - unless we are doing an insertion) we will completely
247 ignore all skipped bits we encounter. Thus we need to verify, at the end of
248 a potentially successful search, that we have indeed been walking the
249 correct key path.
250
251 Note that we can never "miss" the correct key in the tree if present by
252 following the wrong path. Path compression ensures that segments of the key
253 that are the same for all keys with a given prefix are skipped, but the
254 skipped part *is* identical for each node in the subtrie below the skipped
255 bit! trie_insert() in this implementation takes care of that - note the
256 call to tkey_sub_equals() in trie_insert().
257
258 if n is an internal node - a 'tnode' here, the various parts of its key
259 have many different meanings.
260
261 Example:
262 _________________________________________________________________
263 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
264 -----------------------------------------------------------------
265 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
266
267 _________________________________________________________________
268 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
269 -----------------------------------------------------------------
270 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
271
272 tp->pos = 7
273 tp->bits = 3
274 n->pos = 15
275 n->bits = 4
276
277 First, let's just ignore the bits that come before the parent tp, that is
278 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
279 not use them for anything.
280
281 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
282 index into the parent's child array. That is, they will be used to find
283 'n' among tp's children.
284
285 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
286 for the node n.
287
288 All the bits we have seen so far are significant to the node n. The rest
289 of the bits are really not needed or indeed known in n->key.
290
291 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
292 n's child array, and will of course be different for each child.
293
294
295 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
296 at this point.
297
298 */
299
300 static void check_tnode(struct tnode *tn)
301 {
302 if (tn && tn->pos+tn->bits > 32) {
303 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
304 }
305 }
306
307 static int halve_threshold = 25;
308 static int inflate_threshold = 50;
309
310 static struct leaf *leaf_new(void)
311 {
312 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
313 if (l) {
314 NODE_INIT_PARENT(l, T_LEAF);
315 INIT_HLIST_HEAD(&l->list);
316 }
317 return l;
318 }
319
320 static struct leaf_info *leaf_info_new(int plen)
321 {
322 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
323
324 if (!li)
325 return NULL;
326
327 li->plen = plen;
328 INIT_LIST_HEAD(&li->falh);
329
330 return li;
331 }
332
333 static inline void free_leaf(struct leaf *l)
334 {
335 kfree(l);
336 }
337
338 static inline void free_leaf_info(struct leaf_info *li)
339 {
340 kfree(li);
341 }
342
343 static struct tnode *tnode_alloc(unsigned int size)
344 {
345 if (size <= PAGE_SIZE) {
346 return kmalloc(size, GFP_KERNEL);
347 } else {
348 return (struct tnode *)
349 __get_free_pages(GFP_KERNEL, get_order(size));
350 }
351 }
352
353 static void __tnode_free(struct tnode *tn)
354 {
355 unsigned int size = sizeof(struct tnode) +
356 (1 << tn->bits) * sizeof(struct node *);
357
358 if (size <= PAGE_SIZE)
359 kfree(tn);
360 else
361 free_pages((unsigned long)tn, get_order(size));
362 }
363
364 static struct tnode* tnode_new(t_key key, int pos, int bits)
365 {
366 int nchildren = 1<<bits;
367 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
368 struct tnode *tn = tnode_alloc(sz);
369
370 if (tn) {
371 memset(tn, 0, sz);
372 NODE_INIT_PARENT(tn, T_TNODE);
373 tn->pos = pos;
374 tn->bits = bits;
375 tn->key = key;
376 tn->full_children = 0;
377 tn->empty_children = 1<<bits;
378 }
379
380 DBG("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
381 (unsigned int) (sizeof(struct node) * 1<<bits));
382 return tn;
383 }
384
385 static void tnode_free(struct tnode *tn)
386 {
387 BUG_ON(!tn);
388
389 if (IS_LEAF(tn)) {
390 free_leaf((struct leaf *)tn);
391 DBG("FL %p \n", tn);
392 } else {
393 __tnode_free(tn);
394 DBG("FT %p \n", tn);
395 }
396 }
397
398 /*
399 * Check whether a tnode 'n' is "full", i.e. it is an internal node
400 * and no bits are skipped. See discussion in dyntree paper p. 6
401 */
402
403 static inline int tnode_full(struct tnode *tn, struct node *n)
404 {
405 if (n == NULL || IS_LEAF(n))
406 return 0;
407
408 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
409 }
410
411 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
412 {
413 tnode_put_child_reorg(tn, i, n, -1);
414 }
415
416 /*
417 * Add a child at position i overwriting the old value.
418 * Update the value of full_children and empty_children.
419 */
420
421 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
422 {
423 struct node *chi;
424 int isfull;
425
426 if (i >= 1<<tn->bits) {
427 printk("bits=%d, i=%d\n", tn->bits, i);
428 BUG();
429 }
430 write_lock_bh(&fib_lock);
431 chi = tn->child[i];
432
433 /* update emptyChildren */
434 if (n == NULL && chi != NULL)
435 tn->empty_children++;
436 else if (n != NULL && chi == NULL)
437 tn->empty_children--;
438
439 /* update fullChildren */
440 if (wasfull == -1)
441 wasfull = tnode_full(tn, chi);
442
443 isfull = tnode_full(tn, n);
444 if (wasfull && !isfull)
445 tn->full_children--;
446 else if (!wasfull && isfull)
447 tn->full_children++;
448
449 if (n)
450 NODE_SET_PARENT(n, tn);
451
452 tn->child[i] = n;
453 write_unlock_bh(&fib_lock);
454 }
455
456 static struct node *resize(struct trie *t, struct tnode *tn)
457 {
458 int i;
459 int err = 0;
460 struct tnode *old_tn;
461
462 if (!tn)
463 return NULL;
464
465 DBG("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
466 tn, inflate_threshold, halve_threshold);
467
468 /* No children */
469 if (tn->empty_children == tnode_child_length(tn)) {
470 tnode_free(tn);
471 return NULL;
472 }
473 /* One child */
474 if (tn->empty_children == tnode_child_length(tn) - 1)
475 for (i = 0; i < tnode_child_length(tn); i++) {
476 struct node *n;
477
478 write_lock_bh(&fib_lock);
479 n = tn->child[i];
480 if (!n) {
481 write_unlock_bh(&fib_lock);
482 continue;
483 }
484
485 /* compress one level */
486 NODE_INIT_PARENT(n, NODE_TYPE(n));
487
488 write_unlock_bh(&fib_lock);
489 tnode_free(tn);
490 return n;
491 }
492 /*
493 * Double as long as the resulting node has a number of
494 * nonempty nodes that are above the threshold.
495 */
496
497 /*
498 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
499 * the Helsinki University of Technology and Matti Tikkanen of Nokia
500 * Telecommunications, page 6:
501 * "A node is doubled if the ratio of non-empty children to all
502 * children in the *doubled* node is at least 'high'."
503 *
504 * 'high' in this instance is the variable 'inflate_threshold'. It
505 * is expressed as a percentage, so we multiply it with
506 * tnode_child_length() and instead of multiplying by 2 (since the
507 * child array will be doubled by inflate()) and multiplying
508 * the left-hand side by 100 (to handle the percentage thing) we
509 * multiply the left-hand side by 50.
510 *
511 * The left-hand side may look a bit weird: tnode_child_length(tn)
512 * - tn->empty_children is of course the number of non-null children
513 * in the current node. tn->full_children is the number of "full"
514 * children, that is non-null tnodes with a skip value of 0.
515 * All of those will be doubled in the resulting inflated tnode, so
516 * we just count them one extra time here.
517 *
518 * A clearer way to write this would be:
519 *
520 * to_be_doubled = tn->full_children;
521 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
522 * tn->full_children;
523 *
524 * new_child_length = tnode_child_length(tn) * 2;
525 *
526 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
527 * new_child_length;
528 * if (new_fill_factor >= inflate_threshold)
529 *
530 * ...and so on, tho it would mess up the while () loop.
531 *
532 * anyway,
533 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
534 * inflate_threshold
535 *
536 * avoid a division:
537 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
538 * inflate_threshold * new_child_length
539 *
540 * expand not_to_be_doubled and to_be_doubled, and shorten:
541 * 100 * (tnode_child_length(tn) - tn->empty_children +
542 * tn->full_children) >= inflate_threshold * new_child_length
543 *
544 * expand new_child_length:
545 * 100 * (tnode_child_length(tn) - tn->empty_children +
546 * tn->full_children) >=
547 * inflate_threshold * tnode_child_length(tn) * 2
548 *
549 * shorten again:
550 * 50 * (tn->full_children + tnode_child_length(tn) -
551 * tn->empty_children) >= inflate_threshold *
552 * tnode_child_length(tn)
553 *
554 */
555
556 check_tnode(tn);
557
558 err = 0;
559 while ((tn->full_children > 0 &&
560 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
561 inflate_threshold * tnode_child_length(tn))) {
562
563 old_tn = tn;
564 tn = inflate(t, tn);
565 if (IS_ERR(tn)) {
566 tn = old_tn;
567 #ifdef CONFIG_IP_FIB_TRIE_STATS
568 t->stats.resize_node_skipped++;
569 #endif
570 break;
571 }
572 }
573
574 check_tnode(tn);
575
576 /*
577 * Halve as long as the number of empty children in this
578 * node is above threshold.
579 */
580
581 err = 0;
582 while (tn->bits > 1 &&
583 100 * (tnode_child_length(tn) - tn->empty_children) <
584 halve_threshold * tnode_child_length(tn)) {
585
586 old_tn = tn;
587 tn = halve(t, tn);
588 if (IS_ERR(tn)) {
589 tn = old_tn;
590 #ifdef CONFIG_IP_FIB_TRIE_STATS
591 t->stats.resize_node_skipped++;
592 #endif
593 break;
594 }
595 }
596
597
598 /* Only one child remains */
599
600 if (tn->empty_children == tnode_child_length(tn) - 1)
601 for (i = 0; i < tnode_child_length(tn); i++) {
602 struct node *n;
603
604 write_lock_bh(&fib_lock);
605
606 n = tn->child[i];
607 if (!n) {
608 write_unlock_bh(&fib_lock);
609 continue;
610 }
611
612 /* compress one level */
613
614 NODE_INIT_PARENT(n, NODE_TYPE(n));
615
616 write_unlock_bh(&fib_lock);
617 tnode_free(tn);
618 return n;
619 }
620
621 return (struct node *) tn;
622 }
623
624 static struct tnode *inflate(struct trie *t, struct tnode *tn)
625 {
626 struct tnode *inode;
627 struct tnode *oldtnode = tn;
628 int olen = tnode_child_length(tn);
629 int i;
630
631 DBG("In inflate\n");
632
633 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
634
635 if (!tn)
636 return ERR_PTR(-ENOMEM);
637
638 /*
639 * Preallocate and store tnodes before the actual work so we
640 * don't get into an inconsistent state if memory allocation
641 * fails. In case of failure we return the oldnode and inflate
642 * of tnode is ignored.
643 */
644
645 for (i = 0; i < olen; i++) {
646 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
647
648 if (inode &&
649 IS_TNODE(inode) &&
650 inode->pos == oldtnode->pos + oldtnode->bits &&
651 inode->bits > 1) {
652 struct tnode *left, *right;
653 t_key m = TKEY_GET_MASK(inode->pos, 1);
654
655 left = tnode_new(inode->key&(~m), inode->pos + 1,
656 inode->bits - 1);
657 if (!left)
658 goto nomem;
659
660 right = tnode_new(inode->key|m, inode->pos + 1,
661 inode->bits - 1);
662
663 if (!right) {
664 tnode_free(left);
665 goto nomem;
666 }
667
668 put_child(t, tn, 2*i, (struct node *) left);
669 put_child(t, tn, 2*i+1, (struct node *) right);
670 }
671 }
672
673 for (i = 0; i < olen; i++) {
674 struct node *node = tnode_get_child(oldtnode, i);
675 struct tnode *left, *right;
676 int size, j;
677
678 /* An empty child */
679 if (node == NULL)
680 continue;
681
682 /* A leaf or an internal node with skipped bits */
683
684 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
685 tn->pos + tn->bits - 1) {
686 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
687 1) == 0)
688 put_child(t, tn, 2*i, node);
689 else
690 put_child(t, tn, 2*i+1, node);
691 continue;
692 }
693
694 /* An internal node with two children */
695 inode = (struct tnode *) node;
696
697 if (inode->bits == 1) {
698 put_child(t, tn, 2*i, inode->child[0]);
699 put_child(t, tn, 2*i+1, inode->child[1]);
700
701 tnode_free(inode);
702 continue;
703 }
704
705 /* An internal node with more than two children */
706
707 /* We will replace this node 'inode' with two new
708 * ones, 'left' and 'right', each with half of the
709 * original children. The two new nodes will have
710 * a position one bit further down the key and this
711 * means that the "significant" part of their keys
712 * (see the discussion near the top of this file)
713 * will differ by one bit, which will be "0" in
714 * left's key and "1" in right's key. Since we are
715 * moving the key position by one step, the bit that
716 * we are moving away from - the bit at position
717 * (inode->pos) - is the one that will differ between
718 * left and right. So... we synthesize that bit in the
719 * two new keys.
720 * The mask 'm' below will be a single "one" bit at
721 * the position (inode->pos)
722 */
723
724 /* Use the old key, but set the new significant
725 * bit to zero.
726 */
727
728 left = (struct tnode *) tnode_get_child(tn, 2*i);
729 put_child(t, tn, 2*i, NULL);
730
731 BUG_ON(!left);
732
733 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
734 put_child(t, tn, 2*i+1, NULL);
735
736 BUG_ON(!right);
737
738 size = tnode_child_length(left);
739 for (j = 0; j < size; j++) {
740 put_child(t, left, j, inode->child[j]);
741 put_child(t, right, j, inode->child[j + size]);
742 }
743 put_child(t, tn, 2*i, resize(t, left));
744 put_child(t, tn, 2*i+1, resize(t, right));
745
746 tnode_free(inode);
747 }
748 tnode_free(oldtnode);
749 return tn;
750 nomem:
751 {
752 int size = tnode_child_length(tn);
753 int j;
754
755 for(j = 0; j < size; j++)
756 if (tn->child[j])
757 tnode_free((struct tnode *)tn->child[j]);
758
759 tnode_free(tn);
760
761 return ERR_PTR(-ENOMEM);
762 }
763 }
764
765 static struct tnode *halve(struct trie *t, struct tnode *tn)
766 {
767 struct tnode *oldtnode = tn;
768 struct node *left, *right;
769 int i;
770 int olen = tnode_child_length(tn);
771
772 DBG("In halve\n");
773
774 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
775
776 if (!tn)
777 return ERR_PTR(-ENOMEM);
778
779 /*
780 * Preallocate and store tnodes before the actual work so we
781 * don't get into an inconsistent state if memory allocation
782 * fails. In case of failure we return the oldnode and halve
783 * of tnode is ignored.
784 */
785
786 for (i = 0; i < olen; i += 2) {
787 left = tnode_get_child(oldtnode, i);
788 right = tnode_get_child(oldtnode, i+1);
789
790 /* Two nonempty children */
791 if (left && right) {
792 struct tnode *newn;
793
794 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
795
796 if (!newn)
797 goto nomem;
798
799 put_child(t, tn, i/2, (struct node *)newn);
800 }
801
802 }
803
804 for (i = 0; i < olen; i += 2) {
805 struct tnode *newBinNode;
806
807 left = tnode_get_child(oldtnode, i);
808 right = tnode_get_child(oldtnode, i+1);
809
810 /* At least one of the children is empty */
811 if (left == NULL) {
812 if (right == NULL) /* Both are empty */
813 continue;
814 put_child(t, tn, i/2, right);
815 continue;
816 }
817
818 if (right == NULL) {
819 put_child(t, tn, i/2, left);
820 continue;
821 }
822
823 /* Two nonempty children */
824 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
825 put_child(t, tn, i/2, NULL);
826
827 BUG_ON(!newBinNode);
828
829 put_child(t, newBinNode, 0, left);
830 put_child(t, newBinNode, 1, right);
831 put_child(t, tn, i/2, resize(t, newBinNode));
832 }
833 tnode_free(oldtnode);
834 return tn;
835 nomem:
836 {
837 int size = tnode_child_length(tn);
838 int j;
839
840 for(j = 0; j < size; j++)
841 if (tn->child[j])
842 tnode_free((struct tnode *)tn->child[j]);
843
844 tnode_free(tn);
845
846 return ERR_PTR(-ENOMEM);
847 }
848 }
849
850 static void trie_init(struct trie *t)
851 {
852 if (!t)
853 return;
854
855 t->size = 0;
856 t->trie = NULL;
857 t->revision = 0;
858 #ifdef CONFIG_IP_FIB_TRIE_STATS
859 memset(&t->stats, 0, sizeof(struct trie_use_stats));
860 #endif
861 }
862
863 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
864 {
865 struct hlist_node *node;
866 struct leaf_info *li;
867
868 hlist_for_each_entry(li, node, head, hlist)
869 if (li->plen == plen)
870 return li;
871
872 return NULL;
873 }
874
875 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
876 {
877 struct leaf_info *li = find_leaf_info(&l->list, plen);
878
879 if (!li)
880 return NULL;
881
882 return &li->falh;
883 }
884
885 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
886 {
887 struct leaf_info *li = NULL, *last = NULL;
888 struct hlist_node *node;
889
890 write_lock_bh(&fib_lock);
891
892 if (hlist_empty(head)) {
893 hlist_add_head(&new->hlist, head);
894 } else {
895 hlist_for_each_entry(li, node, head, hlist) {
896 if (new->plen > li->plen)
897 break;
898
899 last = li;
900 }
901 if (last)
902 hlist_add_after(&last->hlist, &new->hlist);
903 else
904 hlist_add_before(&new->hlist, &li->hlist);
905 }
906 write_unlock_bh(&fib_lock);
907 }
908
909 static struct leaf *
910 fib_find_node(struct trie *t, u32 key)
911 {
912 int pos;
913 struct tnode *tn;
914 struct node *n;
915
916 pos = 0;
917 n = t->trie;
918
919 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
920 tn = (struct tnode *) n;
921
922 check_tnode(tn);
923
924 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
925 pos = tn->pos + tn->bits;
926 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
927 } else
928 break;
929 }
930 /* Case we have found a leaf. Compare prefixes */
931
932 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
933 return (struct leaf *)n;
934
935 return NULL;
936 }
937
938 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
939 {
940 int i;
941 int wasfull;
942 t_key cindex, key;
943 struct tnode *tp = NULL;
944
945 BUG_ON(!tn);
946
947 key = tn->key;
948 i = 0;
949
950 while (tn != NULL && NODE_PARENT(tn) != NULL) {
951 if (i > 10) {
952 printk("Rebalance tn=%p \n", tn);
953 if (tn)
954 printk("tn->parent=%p \n", NODE_PARENT(tn));
955
956 printk("Rebalance tp=%p \n", tp);
957 if (tp)
958 printk("tp->parent=%p \n", NODE_PARENT(tp));
959 }
960
961 BUG_ON(i > 12); /* Why is this a bug? -ojn */
962 i++;
963
964 tp = NODE_PARENT(tn);
965 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
966 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
967 tn = (struct tnode *) resize (t, (struct tnode *)tn);
968 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
969
970 if (!NODE_PARENT(tn))
971 break;
972
973 tn = NODE_PARENT(tn);
974 }
975 /* Handle last (top) tnode */
976 if (IS_TNODE(tn))
977 tn = (struct tnode*) resize(t, (struct tnode *)tn);
978
979 return (struct node*) tn;
980 }
981
982 static struct list_head *
983 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
984 {
985 int pos, newpos;
986 struct tnode *tp = NULL, *tn = NULL;
987 struct node *n;
988 struct leaf *l;
989 int missbit;
990 struct list_head *fa_head = NULL;
991 struct leaf_info *li;
992 t_key cindex;
993
994 pos = 0;
995 n = t->trie;
996
997 /* If we point to NULL, stop. Either the tree is empty and we should
998 * just put a new leaf in if, or we have reached an empty child slot,
999 * and we should just put our new leaf in that.
1000 * If we point to a T_TNODE, check if it matches our key. Note that
1001 * a T_TNODE might be skipping any number of bits - its 'pos' need
1002 * not be the parent's 'pos'+'bits'!
1003 *
1004 * If it does match the current key, get pos/bits from it, extract
1005 * the index from our key, push the T_TNODE and walk the tree.
1006 *
1007 * If it doesn't, we have to replace it with a new T_TNODE.
1008 *
1009 * If we point to a T_LEAF, it might or might not have the same key
1010 * as we do. If it does, just change the value, update the T_LEAF's
1011 * value, and return it.
1012 * If it doesn't, we need to replace it with a T_TNODE.
1013 */
1014
1015 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1016 tn = (struct tnode *) n;
1017
1018 check_tnode(tn);
1019
1020 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1021 tp = tn;
1022 pos = tn->pos + tn->bits;
1023 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1024
1025 if (n && NODE_PARENT(n) != tn) {
1026 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1027 BUG();
1028 }
1029 } else
1030 break;
1031 }
1032
1033 /*
1034 * n ----> NULL, LEAF or TNODE
1035 *
1036 * tp is n's (parent) ----> NULL or TNODE
1037 */
1038
1039 BUG_ON(tp && IS_LEAF(tp));
1040
1041 /* Case 1: n is a leaf. Compare prefixes */
1042
1043 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1044 struct leaf *l = (struct leaf *) n;
1045
1046 li = leaf_info_new(plen);
1047
1048 if (!li) {
1049 *err = -ENOMEM;
1050 goto err;
1051 }
1052
1053 fa_head = &li->falh;
1054 insert_leaf_info(&l->list, li);
1055 goto done;
1056 }
1057 t->size++;
1058 l = leaf_new();
1059
1060 if (!l) {
1061 *err = -ENOMEM;
1062 goto err;
1063 }
1064
1065 l->key = key;
1066 li = leaf_info_new(plen);
1067
1068 if (!li) {
1069 tnode_free((struct tnode *) l);
1070 *err = -ENOMEM;
1071 goto err;
1072 }
1073
1074 fa_head = &li->falh;
1075 insert_leaf_info(&l->list, li);
1076
1077 if (t->trie && n == NULL) {
1078 /* Case 2: n is NULL, and will just insert a new leaf */
1079
1080 NODE_SET_PARENT(l, tp);
1081
1082 BUG_ON(!tp);
1083
1084 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1085 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1086 } else {
1087 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1088 /*
1089 * Add a new tnode here
1090 * first tnode need some special handling
1091 */
1092
1093 if (tp)
1094 pos = tp->pos+tp->bits;
1095 else
1096 pos = 0;
1097
1098 if (n) {
1099 newpos = tkey_mismatch(key, pos, n->key);
1100 tn = tnode_new(n->key, newpos, 1);
1101 } else {
1102 newpos = 0;
1103 tn = tnode_new(key, newpos, 1); /* First tnode */
1104 }
1105
1106 if (!tn) {
1107 free_leaf_info(li);
1108 tnode_free((struct tnode *) l);
1109 *err = -ENOMEM;
1110 goto err;
1111 }
1112
1113 NODE_SET_PARENT(tn, tp);
1114
1115 missbit = tkey_extract_bits(key, newpos, 1);
1116 put_child(t, tn, missbit, (struct node *)l);
1117 put_child(t, tn, 1-missbit, n);
1118
1119 if (tp) {
1120 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1121 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1122 } else {
1123 t->trie = (struct node*) tn; /* First tnode */
1124 tp = tn;
1125 }
1126 }
1127
1128 if (tp && tp->pos + tp->bits > 32)
1129 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1130 tp, tp->pos, tp->bits, key, plen);
1131
1132 /* Rebalance the trie */
1133 t->trie = trie_rebalance(t, tp);
1134 done:
1135 t->revision++;
1136 err:
1137 return fa_head;
1138 }
1139
1140 static int
1141 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1142 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1143 {
1144 struct trie *t = (struct trie *) tb->tb_data;
1145 struct fib_alias *fa, *new_fa;
1146 struct list_head *fa_head = NULL;
1147 struct fib_info *fi;
1148 int plen = r->rtm_dst_len;
1149 int type = r->rtm_type;
1150 u8 tos = r->rtm_tos;
1151 u32 key, mask;
1152 int err;
1153 struct leaf *l;
1154
1155 if (plen > 32)
1156 return -EINVAL;
1157
1158 key = 0;
1159 if (rta->rta_dst)
1160 memcpy(&key, rta->rta_dst, 4);
1161
1162 key = ntohl(key);
1163
1164 DBG("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1165
1166 mask = ntohl(inet_make_mask(plen));
1167
1168 if (key & ~mask)
1169 return -EINVAL;
1170
1171 key = key & mask;
1172
1173 fi = fib_create_info(r, rta, nlhdr, &err);
1174
1175 if (!fi)
1176 goto err;
1177
1178 l = fib_find_node(t, key);
1179 fa = NULL;
1180
1181 if (l) {
1182 fa_head = get_fa_head(l, plen);
1183 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1184 }
1185
1186 /* Now fa, if non-NULL, points to the first fib alias
1187 * with the same keys [prefix,tos,priority], if such key already
1188 * exists or to the node before which we will insert new one.
1189 *
1190 * If fa is NULL, we will need to allocate a new one and
1191 * insert to the head of f.
1192 *
1193 * If f is NULL, no fib node matched the destination key
1194 * and we need to allocate a new one of those as well.
1195 */
1196
1197 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1198 struct fib_alias *fa_orig;
1199
1200 err = -EEXIST;
1201 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1202 goto out;
1203
1204 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1205 struct fib_info *fi_drop;
1206 u8 state;
1207
1208 write_lock_bh(&fib_lock);
1209
1210 fi_drop = fa->fa_info;
1211 fa->fa_info = fi;
1212 fa->fa_type = type;
1213 fa->fa_scope = r->rtm_scope;
1214 state = fa->fa_state;
1215 fa->fa_state &= ~FA_S_ACCESSED;
1216
1217 write_unlock_bh(&fib_lock);
1218
1219 fib_release_info(fi_drop);
1220 if (state & FA_S_ACCESSED)
1221 rt_cache_flush(-1);
1222
1223 goto succeeded;
1224 }
1225 /* Error if we find a perfect match which
1226 * uses the same scope, type, and nexthop
1227 * information.
1228 */
1229 fa_orig = fa;
1230 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1231 if (fa->fa_tos != tos)
1232 break;
1233 if (fa->fa_info->fib_priority != fi->fib_priority)
1234 break;
1235 if (fa->fa_type == type &&
1236 fa->fa_scope == r->rtm_scope &&
1237 fa->fa_info == fi) {
1238 goto out;
1239 }
1240 }
1241 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1242 fa = fa_orig;
1243 }
1244 err = -ENOENT;
1245 if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1246 goto out;
1247
1248 err = -ENOBUFS;
1249 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1250 if (new_fa == NULL)
1251 goto out;
1252
1253 new_fa->fa_info = fi;
1254 new_fa->fa_tos = tos;
1255 new_fa->fa_type = type;
1256 new_fa->fa_scope = r->rtm_scope;
1257 new_fa->fa_state = 0;
1258 /*
1259 * Insert new entry to the list.
1260 */
1261
1262 if (!fa_head) {
1263 fa_head = fib_insert_node(t, &err, key, plen);
1264 err = 0;
1265 if (err)
1266 goto out_free_new_fa;
1267 }
1268
1269 write_lock_bh(&fib_lock);
1270
1271 list_add_tail(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head));
1272
1273 write_unlock_bh(&fib_lock);
1274
1275 rt_cache_flush(-1);
1276 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1277 succeeded:
1278 return 0;
1279
1280 out_free_new_fa:
1281 kmem_cache_free(fn_alias_kmem, new_fa);
1282 out:
1283 fib_release_info(fi);
1284 err:
1285 return err;
1286 }
1287
1288 static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
1289 struct fib_result *res)
1290 {
1291 int err, i;
1292 t_key mask;
1293 struct leaf_info *li;
1294 struct hlist_head *hhead = &l->list;
1295 struct hlist_node *node;
1296
1297 hlist_for_each_entry(li, node, hhead, hlist) {
1298 i = li->plen;
1299 mask = ntohl(inet_make_mask(i));
1300 if (l->key != (key & mask))
1301 continue;
1302
1303 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1304 *plen = i;
1305 #ifdef CONFIG_IP_FIB_TRIE_STATS
1306 t->stats.semantic_match_passed++;
1307 #endif
1308 return err;
1309 }
1310 #ifdef CONFIG_IP_FIB_TRIE_STATS
1311 t->stats.semantic_match_miss++;
1312 #endif
1313 }
1314 return 1;
1315 }
1316
1317 static int
1318 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1319 {
1320 struct trie *t = (struct trie *) tb->tb_data;
1321 int plen, ret = 0;
1322 struct node *n;
1323 struct tnode *pn;
1324 int pos, bits;
1325 t_key key = ntohl(flp->fl4_dst);
1326 int chopped_off;
1327 t_key cindex = 0;
1328 int current_prefix_length = KEYLENGTH;
1329 struct tnode *cn;
1330 t_key node_prefix, key_prefix, pref_mismatch;
1331 int mp;
1332
1333 n = t->trie;
1334
1335 read_lock(&fib_lock);
1336
1337 if (!n)
1338 goto failed;
1339
1340 #ifdef CONFIG_IP_FIB_TRIE_STATS
1341 t->stats.gets++;
1342 #endif
1343
1344 /* Just a leaf? */
1345 if (IS_LEAF(n)) {
1346 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1347 goto found;
1348 goto failed;
1349 }
1350 pn = (struct tnode *) n;
1351 chopped_off = 0;
1352
1353 while (pn) {
1354 pos = pn->pos;
1355 bits = pn->bits;
1356
1357 if (!chopped_off)
1358 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1359
1360 n = tnode_get_child(pn, cindex);
1361
1362 if (n == NULL) {
1363 #ifdef CONFIG_IP_FIB_TRIE_STATS
1364 t->stats.null_node_hit++;
1365 #endif
1366 goto backtrace;
1367 }
1368
1369 if (IS_LEAF(n)) {
1370 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1371 goto found;
1372 else
1373 goto backtrace;
1374 }
1375
1376 #define HL_OPTIMIZE
1377 #ifdef HL_OPTIMIZE
1378 cn = (struct tnode *)n;
1379
1380 /*
1381 * It's a tnode, and we can do some extra checks here if we
1382 * like, to avoid descending into a dead-end branch.
1383 * This tnode is in the parent's child array at index
1384 * key[p_pos..p_pos+p_bits] but potentially with some bits
1385 * chopped off, so in reality the index may be just a
1386 * subprefix, padded with zero at the end.
1387 * We can also take a look at any skipped bits in this
1388 * tnode - everything up to p_pos is supposed to be ok,
1389 * and the non-chopped bits of the index (se previous
1390 * paragraph) are also guaranteed ok, but the rest is
1391 * considered unknown.
1392 *
1393 * The skipped bits are key[pos+bits..cn->pos].
1394 */
1395
1396 /* If current_prefix_length < pos+bits, we are already doing
1397 * actual prefix matching, which means everything from
1398 * pos+(bits-chopped_off) onward must be zero along some
1399 * branch of this subtree - otherwise there is *no* valid
1400 * prefix present. Here we can only check the skipped
1401 * bits. Remember, since we have already indexed into the
1402 * parent's child array, we know that the bits we chopped of
1403 * *are* zero.
1404 */
1405
1406 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1407
1408 if (current_prefix_length < pos+bits) {
1409 if (tkey_extract_bits(cn->key, current_prefix_length,
1410 cn->pos - current_prefix_length) != 0 ||
1411 !(cn->child[0]))
1412 goto backtrace;
1413 }
1414
1415 /*
1416 * If chopped_off=0, the index is fully validated and we
1417 * only need to look at the skipped bits for this, the new,
1418 * tnode. What we actually want to do is to find out if
1419 * these skipped bits match our key perfectly, or if we will
1420 * have to count on finding a matching prefix further down,
1421 * because if we do, we would like to have some way of
1422 * verifying the existence of such a prefix at this point.
1423 */
1424
1425 /* The only thing we can do at this point is to verify that
1426 * any such matching prefix can indeed be a prefix to our
1427 * key, and if the bits in the node we are inspecting that
1428 * do not match our key are not ZERO, this cannot be true.
1429 * Thus, find out where there is a mismatch (before cn->pos)
1430 * and verify that all the mismatching bits are zero in the
1431 * new tnode's key.
1432 */
1433
1434 /* Note: We aren't very concerned about the piece of the key
1435 * that precede pn->pos+pn->bits, since these have already been
1436 * checked. The bits after cn->pos aren't checked since these are
1437 * by definition "unknown" at this point. Thus, what we want to
1438 * see is if we are about to enter the "prefix matching" state,
1439 * and in that case verify that the skipped bits that will prevail
1440 * throughout this subtree are zero, as they have to be if we are
1441 * to find a matching prefix.
1442 */
1443
1444 node_prefix = MASK_PFX(cn->key, cn->pos);
1445 key_prefix = MASK_PFX(key, cn->pos);
1446 pref_mismatch = key_prefix^node_prefix;
1447 mp = 0;
1448
1449 /* In short: If skipped bits in this node do not match the search
1450 * key, enter the "prefix matching" state.directly.
1451 */
1452 if (pref_mismatch) {
1453 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1454 mp++;
1455 pref_mismatch = pref_mismatch <<1;
1456 }
1457 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1458
1459 if (key_prefix != 0)
1460 goto backtrace;
1461
1462 if (current_prefix_length >= cn->pos)
1463 current_prefix_length = mp;
1464 }
1465 #endif
1466 pn = (struct tnode *)n; /* Descend */
1467 chopped_off = 0;
1468 continue;
1469
1470 backtrace:
1471 chopped_off++;
1472
1473 /* As zero don't change the child key (cindex) */
1474 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1475 chopped_off++;
1476
1477 /* Decrease current_... with bits chopped off */
1478 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1479 current_prefix_length = pn->pos + pn->bits - chopped_off;
1480
1481 /*
1482 * Either we do the actual chop off according or if we have
1483 * chopped off all bits in this tnode walk up to our parent.
1484 */
1485
1486 if (chopped_off <= pn->bits) {
1487 cindex &= ~(1 << (chopped_off-1));
1488 } else {
1489 if (NODE_PARENT(pn) == NULL)
1490 goto failed;
1491
1492 /* Get Child's index */
1493 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1494 pn = NODE_PARENT(pn);
1495 chopped_off = 0;
1496
1497 #ifdef CONFIG_IP_FIB_TRIE_STATS
1498 t->stats.backtrack++;
1499 #endif
1500 goto backtrace;
1501 }
1502 }
1503 failed:
1504 ret = 1;
1505 found:
1506 read_unlock(&fib_lock);
1507 return ret;
1508 }
1509
1510 static int trie_leaf_remove(struct trie *t, t_key key)
1511 {
1512 t_key cindex;
1513 struct tnode *tp = NULL;
1514 struct node *n = t->trie;
1515 struct leaf *l;
1516
1517 DBG("entering trie_leaf_remove(%p)\n", n);
1518
1519 /* Note that in the case skipped bits, those bits are *not* checked!
1520 * When we finish this, we will have NULL or a T_LEAF, and the
1521 * T_LEAF may or may not match our key.
1522 */
1523
1524 while (n != NULL && IS_TNODE(n)) {
1525 struct tnode *tn = (struct tnode *) n;
1526 check_tnode(tn);
1527 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1528
1529 if (n && NODE_PARENT(n) != tn) {
1530 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1531 BUG();
1532 }
1533 }
1534 l = (struct leaf *) n;
1535
1536 if (!n || !tkey_equals(l->key, key))
1537 return 0;
1538
1539 /*
1540 * Key found.
1541 * Remove the leaf and rebalance the tree
1542 */
1543
1544 t->revision++;
1545 t->size--;
1546
1547 tp = NODE_PARENT(n);
1548 tnode_free((struct tnode *) n);
1549
1550 if (tp) {
1551 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1552 put_child(t, (struct tnode *)tp, cindex, NULL);
1553 t->trie = trie_rebalance(t, tp);
1554 } else
1555 t->trie = NULL;
1556
1557 return 1;
1558 }
1559
1560 static int
1561 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1562 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1563 {
1564 struct trie *t = (struct trie *) tb->tb_data;
1565 u32 key, mask;
1566 int plen = r->rtm_dst_len;
1567 u8 tos = r->rtm_tos;
1568 struct fib_alias *fa, *fa_to_delete;
1569 struct list_head *fa_head;
1570 struct leaf *l;
1571 int kill_li = 0;
1572 struct leaf_info *li;
1573
1574
1575 if (plen > 32)
1576 return -EINVAL;
1577
1578 key = 0;
1579 if (rta->rta_dst)
1580 memcpy(&key, rta->rta_dst, 4);
1581
1582 key = ntohl(key);
1583 mask = ntohl(inet_make_mask(plen));
1584
1585 if (key & ~mask)
1586 return -EINVAL;
1587
1588 key = key & mask;
1589 l = fib_find_node(t, key);
1590
1591 if (!l)
1592 return -ESRCH;
1593
1594 fa_head = get_fa_head(l, plen);
1595 fa = fib_find_alias(fa_head, tos, 0);
1596
1597 if (!fa)
1598 return -ESRCH;
1599
1600 DBG("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1601
1602 fa_to_delete = NULL;
1603 fa_head = fa->fa_list.prev;
1604 list_for_each_entry(fa, fa_head, fa_list) {
1605 struct fib_info *fi = fa->fa_info;
1606
1607 if (fa->fa_tos != tos)
1608 break;
1609
1610 if ((!r->rtm_type ||
1611 fa->fa_type == r->rtm_type) &&
1612 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1613 fa->fa_scope == r->rtm_scope) &&
1614 (!r->rtm_protocol ||
1615 fi->fib_protocol == r->rtm_protocol) &&
1616 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1617 fa_to_delete = fa;
1618 break;
1619 }
1620 }
1621
1622 if (!fa_to_delete)
1623 return -ESRCH;
1624
1625 fa = fa_to_delete;
1626 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1627
1628 l = fib_find_node(t, key);
1629 li = find_leaf_info(&l->list, plen);
1630
1631 write_lock_bh(&fib_lock);
1632
1633 list_del(&fa->fa_list);
1634
1635 if (list_empty(fa_head)) {
1636 hlist_del(&li->hlist);
1637 kill_li = 1;
1638 }
1639 write_unlock_bh(&fib_lock);
1640
1641 if (kill_li)
1642 free_leaf_info(li);
1643
1644 if (hlist_empty(&l->list))
1645 trie_leaf_remove(t, key);
1646
1647 if (fa->fa_state & FA_S_ACCESSED)
1648 rt_cache_flush(-1);
1649
1650 fn_free_alias(fa);
1651 return 0;
1652 }
1653
1654 static int trie_flush_list(struct trie *t, struct list_head *head)
1655 {
1656 struct fib_alias *fa, *fa_node;
1657 int found = 0;
1658
1659 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1660 struct fib_info *fi = fa->fa_info;
1661
1662 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
1663 write_lock_bh(&fib_lock);
1664 list_del(&fa->fa_list);
1665 write_unlock_bh(&fib_lock);
1666
1667 fn_free_alias(fa);
1668 found++;
1669 }
1670 }
1671 return found;
1672 }
1673
1674 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1675 {
1676 int found = 0;
1677 struct hlist_head *lih = &l->list;
1678 struct hlist_node *node, *tmp;
1679 struct leaf_info *li = NULL;
1680
1681 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1682 found += trie_flush_list(t, &li->falh);
1683
1684 if (list_empty(&li->falh)) {
1685 write_lock_bh(&fib_lock);
1686 hlist_del(&li->hlist);
1687 write_unlock_bh(&fib_lock);
1688
1689 free_leaf_info(li);
1690 }
1691 }
1692 return found;
1693 }
1694
1695 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1696 {
1697 struct node *c = (struct node *) thisleaf;
1698 struct tnode *p;
1699 int idx;
1700
1701 if (c == NULL) {
1702 if (t->trie == NULL)
1703 return NULL;
1704
1705 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
1706 return (struct leaf *) t->trie;
1707
1708 p = (struct tnode*) t->trie; /* Start */
1709 } else
1710 p = (struct tnode *) NODE_PARENT(c);
1711
1712 while (p) {
1713 int pos, last;
1714
1715 /* Find the next child of the parent */
1716 if (c)
1717 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1718 else
1719 pos = 0;
1720
1721 last = 1 << p->bits;
1722 for (idx = pos; idx < last ; idx++) {
1723 if (!p->child[idx])
1724 continue;
1725
1726 /* Decend if tnode */
1727 while (IS_TNODE(p->child[idx])) {
1728 p = (struct tnode*) p->child[idx];
1729 idx = 0;
1730
1731 /* Rightmost non-NULL branch */
1732 if (p && IS_TNODE(p))
1733 while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
1734
1735 /* Done with this tnode? */
1736 if (idx >= (1 << p->bits) || p->child[idx] == NULL)
1737 goto up;
1738 }
1739 return (struct leaf*) p->child[idx];
1740 }
1741 up:
1742 /* No more children go up one step */
1743 c = (struct node *) p;
1744 p = (struct tnode *) NODE_PARENT(p);
1745 }
1746 return NULL; /* Ready. Root of trie */
1747 }
1748
1749 static int fn_trie_flush(struct fib_table *tb)
1750 {
1751 struct trie *t = (struct trie *) tb->tb_data;
1752 struct leaf *ll = NULL, *l = NULL;
1753 int found = 0, h;
1754
1755 t->revision++;
1756
1757 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1758 found += trie_flush_leaf(t, l);
1759
1760 if (ll && hlist_empty(&ll->list))
1761 trie_leaf_remove(t, ll->key);
1762 ll = l;
1763 }
1764
1765 if (ll && hlist_empty(&ll->list))
1766 trie_leaf_remove(t, ll->key);
1767
1768 DBG("trie_flush found=%d\n", found);
1769 return found;
1770 }
1771
1772 static int trie_last_dflt = -1;
1773
1774 static void
1775 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1776 {
1777 struct trie *t = (struct trie *) tb->tb_data;
1778 int order, last_idx;
1779 struct fib_info *fi = NULL;
1780 struct fib_info *last_resort;
1781 struct fib_alias *fa = NULL;
1782 struct list_head *fa_head;
1783 struct leaf *l;
1784
1785 last_idx = -1;
1786 last_resort = NULL;
1787 order = -1;
1788
1789 read_lock(&fib_lock);
1790
1791 l = fib_find_node(t, 0);
1792 if (!l)
1793 goto out;
1794
1795 fa_head = get_fa_head(l, 0);
1796 if (!fa_head)
1797 goto out;
1798
1799 if (list_empty(fa_head))
1800 goto out;
1801
1802 list_for_each_entry(fa, fa_head, fa_list) {
1803 struct fib_info *next_fi = fa->fa_info;
1804
1805 if (fa->fa_scope != res->scope ||
1806 fa->fa_type != RTN_UNICAST)
1807 continue;
1808
1809 if (next_fi->fib_priority > res->fi->fib_priority)
1810 break;
1811 if (!next_fi->fib_nh[0].nh_gw ||
1812 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1813 continue;
1814 fa->fa_state |= FA_S_ACCESSED;
1815
1816 if (fi == NULL) {
1817 if (next_fi != res->fi)
1818 break;
1819 } else if (!fib_detect_death(fi, order, &last_resort,
1820 &last_idx, &trie_last_dflt)) {
1821 if (res->fi)
1822 fib_info_put(res->fi);
1823 res->fi = fi;
1824 atomic_inc(&fi->fib_clntref);
1825 trie_last_dflt = order;
1826 goto out;
1827 }
1828 fi = next_fi;
1829 order++;
1830 }
1831 if (order <= 0 || fi == NULL) {
1832 trie_last_dflt = -1;
1833 goto out;
1834 }
1835
1836 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1837 if (res->fi)
1838 fib_info_put(res->fi);
1839 res->fi = fi;
1840 atomic_inc(&fi->fib_clntref);
1841 trie_last_dflt = order;
1842 goto out;
1843 }
1844 if (last_idx >= 0) {
1845 if (res->fi)
1846 fib_info_put(res->fi);
1847 res->fi = last_resort;
1848 if (last_resort)
1849 atomic_inc(&last_resort->fib_clntref);
1850 }
1851 trie_last_dflt = last_idx;
1852 out:;
1853 read_unlock(&fib_lock);
1854 }
1855
1856 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1857 struct sk_buff *skb, struct netlink_callback *cb)
1858 {
1859 int i, s_i;
1860 struct fib_alias *fa;
1861
1862 u32 xkey = htonl(key);
1863
1864 s_i = cb->args[3];
1865 i = 0;
1866
1867 list_for_each_entry(fa, fah, fa_list) {
1868 if (i < s_i) {
1869 i++;
1870 continue;
1871 }
1872 if (fa->fa_info->fib_nh == NULL) {
1873 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1874 i++;
1875 continue;
1876 }
1877 if (fa->fa_info == NULL) {
1878 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1879 i++;
1880 continue;
1881 }
1882
1883 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1884 cb->nlh->nlmsg_seq,
1885 RTM_NEWROUTE,
1886 tb->tb_id,
1887 fa->fa_type,
1888 fa->fa_scope,
1889 &xkey,
1890 plen,
1891 fa->fa_tos,
1892 fa->fa_info, 0) < 0) {
1893 cb->args[3] = i;
1894 return -1;
1895 }
1896 i++;
1897 }
1898 cb->args[3] = i;
1899 return skb->len;
1900 }
1901
1902 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1903 struct netlink_callback *cb)
1904 {
1905 int h, s_h;
1906 struct list_head *fa_head;
1907 struct leaf *l = NULL;
1908
1909 s_h = cb->args[2];
1910
1911 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1912 if (h < s_h)
1913 continue;
1914 if (h > s_h)
1915 memset(&cb->args[3], 0,
1916 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1917
1918 fa_head = get_fa_head(l, plen);
1919
1920 if (!fa_head)
1921 continue;
1922
1923 if (list_empty(fa_head))
1924 continue;
1925
1926 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1927 cb->args[2] = h;
1928 return -1;
1929 }
1930 }
1931 cb->args[2] = h;
1932 return skb->len;
1933 }
1934
1935 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1936 {
1937 int m, s_m;
1938 struct trie *t = (struct trie *) tb->tb_data;
1939
1940 s_m = cb->args[1];
1941
1942 read_lock(&fib_lock);
1943 for (m = 0; m <= 32; m++) {
1944 if (m < s_m)
1945 continue;
1946 if (m > s_m)
1947 memset(&cb->args[2], 0,
1948 sizeof(cb->args) - 2*sizeof(cb->args[0]));
1949
1950 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1951 cb->args[1] = m;
1952 goto out;
1953 }
1954 }
1955 read_unlock(&fib_lock);
1956 cb->args[1] = m;
1957 return skb->len;
1958 out:
1959 read_unlock(&fib_lock);
1960 return -1;
1961 }
1962
1963 /* Fix more generic FIB names for init later */
1964
1965 #ifdef CONFIG_IP_MULTIPLE_TABLES
1966 struct fib_table * fib_hash_init(int id)
1967 #else
1968 struct fib_table * __init fib_hash_init(int id)
1969 #endif
1970 {
1971 struct fib_table *tb;
1972 struct trie *t;
1973
1974 if (fn_alias_kmem == NULL)
1975 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1976 sizeof(struct fib_alias),
1977 0, SLAB_HWCACHE_ALIGN,
1978 NULL, NULL);
1979
1980 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1981 GFP_KERNEL);
1982 if (tb == NULL)
1983 return NULL;
1984
1985 tb->tb_id = id;
1986 tb->tb_lookup = fn_trie_lookup;
1987 tb->tb_insert = fn_trie_insert;
1988 tb->tb_delete = fn_trie_delete;
1989 tb->tb_flush = fn_trie_flush;
1990 tb->tb_select_default = fn_trie_select_default;
1991 tb->tb_dump = fn_trie_dump;
1992 memset(tb->tb_data, 0, sizeof(struct trie));
1993
1994 t = (struct trie *) tb->tb_data;
1995
1996 trie_init(t);
1997
1998 if (id == RT_TABLE_LOCAL)
1999 trie_local = t;
2000 else if (id == RT_TABLE_MAIN)
2001 trie_main = t;
2002
2003 if (id == RT_TABLE_LOCAL)
2004 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2005
2006 return tb;
2007 }
2008
2009 /* Trie dump functions */
2010
2011 static void putspace_seq(struct seq_file *seq, int n)
2012 {
2013 while (n--)
2014 seq_printf(seq, " ");
2015 }
2016
2017 static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2018 {
2019 while (bits--)
2020 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
2021 }
2022
2023 static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
2024 int pend, int cindex, int bits)
2025 {
2026 putspace_seq(seq, indent);
2027 if (IS_LEAF(n))
2028 seq_printf(seq, "|");
2029 else
2030 seq_printf(seq, "+");
2031 if (bits) {
2032 seq_printf(seq, "%d/", cindex);
2033 printbin_seq(seq, cindex, bits);
2034 seq_printf(seq, ": ");
2035 } else
2036 seq_printf(seq, "<root>: ");
2037 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2038
2039 if (IS_LEAF(n)) {
2040 struct leaf *l = (struct leaf *)n;
2041 struct fib_alias *fa;
2042 int i;
2043
2044 seq_printf(seq, "key=%d.%d.%d.%d\n",
2045 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2046
2047 for (i = 32; i >= 0; i--)
2048 if (find_leaf_info(&l->list, i)) {
2049 struct list_head *fa_head = get_fa_head(l, i);
2050
2051 if (!fa_head)
2052 continue;
2053
2054 if (list_empty(fa_head))
2055 continue;
2056
2057 putspace_seq(seq, indent+2);
2058 seq_printf(seq, "{/%d...dumping}\n", i);
2059
2060 list_for_each_entry(fa, fa_head, fa_list) {
2061 putspace_seq(seq, indent+2);
2062 if (fa->fa_info == NULL) {
2063 seq_printf(seq, "Error fa_info=NULL\n");
2064 continue;
2065 }
2066 if (fa->fa_info->fib_nh == NULL) {
2067 seq_printf(seq, "Error _fib_nh=NULL\n");
2068 continue;
2069 }
2070
2071 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2072 fa->fa_type,
2073 fa->fa_scope,
2074 fa->fa_tos);
2075 }
2076 }
2077 } else {
2078 struct tnode *tn = (struct tnode *)n;
2079 int plen = ((struct tnode *)n)->pos;
2080 t_key prf = MASK_PFX(n->key, plen);
2081
2082 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2083 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2084
2085 putspace_seq(seq, indent); seq_printf(seq, "| ");
2086 seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
2087 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2088 seq_printf(seq, "}\n");
2089 putspace_seq(seq, indent); seq_printf(seq, "| ");
2090 seq_printf(seq, "{pos=%d", tn->pos);
2091 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2092 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2093 putspace_seq(seq, indent); seq_printf(seq, "| ");
2094 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2095 }
2096 }
2097
2098 static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2099 {
2100 struct node *n = t->trie;
2101 int cindex = 0;
2102 int indent = 1;
2103 int pend = 0;
2104 int depth = 0;
2105 struct tnode *tn;
2106
2107 read_lock(&fib_lock);
2108
2109 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
2110
2111 if (!n) {
2112 seq_printf(seq, "------ trie is empty\n");
2113
2114 read_unlock(&fib_lock);
2115 return;
2116 }
2117
2118 printnode_seq(seq, indent, n, pend, cindex, 0);
2119
2120 if (!IS_TNODE(n)) {
2121 read_unlock(&fib_lock);
2122 return;
2123 }
2124
2125 tn = (struct tnode *)n;
2126 pend = tn->pos+tn->bits;
2127 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2128 indent += 3;
2129 depth++;
2130
2131 while (tn && cindex < (1 << tn->bits)) {
2132 if (tn->child[cindex]) {
2133 /* Got a child */
2134
2135 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2136 if (IS_LEAF(tn->child[cindex])) {
2137 cindex++;
2138 } else {
2139 /*
2140 * New tnode. Decend one level
2141 */
2142
2143 depth++;
2144 tn = (struct tnode *)tn->child[cindex];
2145 pend = tn->pos + tn->bits;
2146 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2147 indent += 3;
2148 cindex = 0;
2149 }
2150 } else
2151 cindex++;
2152
2153 /*
2154 * Test if we are done
2155 */
2156
2157 while (cindex >= (1 << tn->bits)) {
2158 /*
2159 * Move upwards and test for root
2160 * pop off all traversed nodes
2161 */
2162
2163 if (NODE_PARENT(tn) == NULL) {
2164 tn = NULL;
2165 break;
2166 }
2167
2168 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2169 cindex++;
2170 tn = NODE_PARENT(tn);
2171 pend = tn->pos + tn->bits;
2172 indent -= 3;
2173 depth--;
2174 }
2175 }
2176
2177 read_unlock(&fib_lock);
2178 }
2179
2180 static struct trie_stat *trie_stat_new(void)
2181 {
2182 struct trie_stat *s;
2183 int i;
2184
2185 s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2186 if (!s)
2187 return NULL;
2188
2189 s->totdepth = 0;
2190 s->maxdepth = 0;
2191 s->tnodes = 0;
2192 s->leaves = 0;
2193 s->nullpointers = 0;
2194
2195 for (i = 0; i < MAX_CHILDS; i++)
2196 s->nodesizes[i] = 0;
2197
2198 return s;
2199 }
2200
2201 static struct trie_stat *trie_collect_stats(struct trie *t)
2202 {
2203 struct node *n = t->trie;
2204 struct trie_stat *s = trie_stat_new();
2205 int cindex = 0;
2206 int pend = 0;
2207 int depth = 0;
2208
2209 if (!s)
2210 return NULL;
2211 if (!n)
2212 return s;
2213
2214 read_lock(&fib_lock);
2215
2216 if (IS_TNODE(n)) {
2217 struct tnode *tn = (struct tnode *)n;
2218 pend = tn->pos+tn->bits;
2219 s->nodesizes[tn->bits]++;
2220 depth++;
2221
2222 while (tn && cindex < (1 << tn->bits)) {
2223 if (tn->child[cindex]) {
2224 /* Got a child */
2225
2226 if (IS_LEAF(tn->child[cindex])) {
2227 cindex++;
2228
2229 /* stats */
2230 if (depth > s->maxdepth)
2231 s->maxdepth = depth;
2232 s->totdepth += depth;
2233 s->leaves++;
2234 } else {
2235 /*
2236 * New tnode. Decend one level
2237 */
2238
2239 s->tnodes++;
2240 s->nodesizes[tn->bits]++;
2241 depth++;
2242
2243 n = tn->child[cindex];
2244 tn = (struct tnode *)n;
2245 pend = tn->pos+tn->bits;
2246
2247 cindex = 0;
2248 }
2249 } else {
2250 cindex++;
2251 s->nullpointers++;
2252 }
2253
2254 /*
2255 * Test if we are done
2256 */
2257
2258 while (cindex >= (1 << tn->bits)) {
2259 /*
2260 * Move upwards and test for root
2261 * pop off all traversed nodes
2262 */
2263
2264 if (NODE_PARENT(tn) == NULL) {
2265 tn = NULL;
2266 n = NULL;
2267 break;
2268 }
2269
2270 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2271 tn = NODE_PARENT(tn);
2272 cindex++;
2273 n = (struct node *)tn;
2274 pend = tn->pos+tn->bits;
2275 depth--;
2276 }
2277 }
2278 }
2279
2280 read_unlock(&fib_lock);
2281 return s;
2282 }
2283
2284 #ifdef CONFIG_PROC_FS
2285
2286 static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2287 {
2288 return NULL;
2289 }
2290
2291 static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2292 {
2293 return NULL;
2294 }
2295
2296 static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2297 {
2298 if (!ip_fib_main_table)
2299 return NULL;
2300
2301 if (*pos)
2302 return fib_triestat_get_next(seq);
2303 else
2304 return SEQ_START_TOKEN;
2305 }
2306
2307 static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2308 {
2309 ++*pos;
2310 if (v == SEQ_START_TOKEN)
2311 return fib_triestat_get_first(seq);
2312 else
2313 return fib_triestat_get_next(seq);
2314 }
2315
2316 static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2317 {
2318
2319 }
2320
2321 /*
2322 * This outputs /proc/net/fib_triestats
2323 *
2324 * It always works in backward compatibility mode.
2325 * The format of the file is not supposed to be changed.
2326 */
2327
2328 static void collect_and_show(struct trie *t, struct seq_file *seq)
2329 {
2330 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2331 int i, max, pointers;
2332 struct trie_stat *stat;
2333 int avdepth;
2334
2335 stat = trie_collect_stats(t);
2336
2337 bytes = 0;
2338 seq_printf(seq, "trie=%p\n", t);
2339
2340 if (stat) {
2341 if (stat->leaves)
2342 avdepth = stat->totdepth*100 / stat->leaves;
2343 else
2344 avdepth = 0;
2345 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
2346 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
2347
2348 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2349 bytes += sizeof(struct leaf) * stat->leaves;
2350 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2351 bytes += sizeof(struct tnode) * stat->tnodes;
2352
2353 max = MAX_CHILDS-1;
2354
2355 while (max >= 0 && stat->nodesizes[max] == 0)
2356 max--;
2357 pointers = 0;
2358
2359 for (i = 1; i <= max; i++)
2360 if (stat->nodesizes[i] != 0) {
2361 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2362 pointers += (1<<i) * stat->nodesizes[i];
2363 }
2364 seq_printf(seq, "\n");
2365 seq_printf(seq, "Pointers: %d\n", pointers);
2366 bytes += sizeof(struct node *) * pointers;
2367 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2368 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2369
2370 kfree(stat);
2371 }
2372
2373 #ifdef CONFIG_IP_FIB_TRIE_STATS
2374 seq_printf(seq, "Counters:\n---------\n");
2375 seq_printf(seq,"gets = %d\n", t->stats.gets);
2376 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2377 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2378 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2379 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2380 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2381 #ifdef CLEAR_STATS
2382 memset(&(t->stats), 0, sizeof(t->stats));
2383 #endif
2384 #endif /* CONFIG_IP_FIB_TRIE_STATS */
2385 }
2386
2387 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2388 {
2389 char bf[128];
2390
2391 if (v == SEQ_START_TOKEN) {
2392 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2393 sizeof(struct leaf), sizeof(struct tnode));
2394 if (trie_local)
2395 collect_and_show(trie_local, seq);
2396
2397 if (trie_main)
2398 collect_and_show(trie_main, seq);
2399 } else {
2400 snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
2401
2402 seq_printf(seq, "%-127s\n", bf);
2403 }
2404 return 0;
2405 }
2406
2407 static struct seq_operations fib_triestat_seq_ops = {
2408 .start = fib_triestat_seq_start,
2409 .next = fib_triestat_seq_next,
2410 .stop = fib_triestat_seq_stop,
2411 .show = fib_triestat_seq_show,
2412 };
2413
2414 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2415 {
2416 struct seq_file *seq;
2417 int rc = -ENOMEM;
2418
2419 rc = seq_open(file, &fib_triestat_seq_ops);
2420 if (rc)
2421 goto out_kfree;
2422
2423 seq = file->private_data;
2424 out:
2425 return rc;
2426 out_kfree:
2427 goto out;
2428 }
2429
2430 static struct file_operations fib_triestat_seq_fops = {
2431 .owner = THIS_MODULE,
2432 .open = fib_triestat_seq_open,
2433 .read = seq_read,
2434 .llseek = seq_lseek,
2435 .release = seq_release_private,
2436 };
2437
2438 int __init fib_stat_proc_init(void)
2439 {
2440 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2441 return -ENOMEM;
2442 return 0;
2443 }
2444
2445 void __init fib_stat_proc_exit(void)
2446 {
2447 proc_net_remove("fib_triestat");
2448 }
2449
2450 static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2451 {
2452 return NULL;
2453 }
2454
2455 static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2456 {
2457 return NULL;
2458 }
2459
2460 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2461 {
2462 if (!ip_fib_main_table)
2463 return NULL;
2464
2465 if (*pos)
2466 return fib_trie_get_next(seq);
2467 else
2468 return SEQ_START_TOKEN;
2469 }
2470
2471 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2472 {
2473 ++*pos;
2474 if (v == SEQ_START_TOKEN)
2475 return fib_trie_get_first(seq);
2476 else
2477 return fib_trie_get_next(seq);
2478
2479 }
2480
2481 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2482 {
2483 }
2484
2485 /*
2486 * This outputs /proc/net/fib_trie.
2487 *
2488 * It always works in backward compatibility mode.
2489 * The format of the file is not supposed to be changed.
2490 */
2491
2492 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2493 {
2494 char bf[128];
2495
2496 if (v == SEQ_START_TOKEN) {
2497 if (trie_local)
2498 trie_dump_seq(seq, trie_local);
2499
2500 if (trie_main)
2501 trie_dump_seq(seq, trie_main);
2502 } else {
2503 snprintf(bf, sizeof(bf),
2504 "*\t%08X\t%08X", 200, 400);
2505 seq_printf(seq, "%-127s\n", bf);
2506 }
2507
2508 return 0;
2509 }
2510
2511 static struct seq_operations fib_trie_seq_ops = {
2512 .start = fib_trie_seq_start,
2513 .next = fib_trie_seq_next,
2514 .stop = fib_trie_seq_stop,
2515 .show = fib_trie_seq_show,
2516 };
2517
2518 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2519 {
2520 struct seq_file *seq;
2521 int rc = -ENOMEM;
2522
2523 rc = seq_open(file, &fib_trie_seq_ops);
2524 if (rc)
2525 goto out_kfree;
2526
2527 seq = file->private_data;
2528 out:
2529 return rc;
2530 out_kfree:
2531 goto out;
2532 }
2533
2534 static struct file_operations fib_trie_seq_fops = {
2535 .owner = THIS_MODULE,
2536 .open = fib_trie_seq_open,
2537 .read = seq_read,
2538 .llseek = seq_lseek,
2539 .release= seq_release_private,
2540 };
2541
2542 int __init fib_proc_init(void)
2543 {
2544 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2545 return -ENOMEM;
2546 return 0;
2547 }
2548
2549 void __init fib_proc_exit(void)
2550 {
2551 proc_net_remove("fib_trie");
2552 }
2553
2554 #endif /* CONFIG_PROC_FS */