Merge tag 'v3.10.69' into update
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
25985edc 15 * This work is based on the LPC-trie which is originally described in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
631dd1a8 19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
19baf839
RO
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
19baf839
RO
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
fd966255
RO
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
19baf839
RO
49 */
50
80b71b80 51#define VERSION "0.409"
19baf839 52
19baf839 53#include <asm/uaccess.h>
1977f032 54#include <linux/bitops.h>
19baf839
RO
55#include <linux/types.h>
56#include <linux/kernel.h>
19baf839
RO
57#include <linux/mm.h>
58#include <linux/string.h>
59#include <linux/socket.h>
60#include <linux/sockios.h>
61#include <linux/errno.h>
62#include <linux/in.h>
63#include <linux/inet.h>
cd8787ab 64#include <linux/inetdevice.h>
19baf839
RO
65#include <linux/netdevice.h>
66#include <linux/if_arp.h>
67#include <linux/proc_fs.h>
2373ce1c 68#include <linux/rcupdate.h>
19baf839
RO
69#include <linux/skbuff.h>
70#include <linux/netlink.h>
71#include <linux/init.h>
72#include <linux/list.h>
5a0e3ad6 73#include <linux/slab.h>
bc3b2d7f 74#include <linux/export.h>
457c4cbc 75#include <net/net_namespace.h>
19baf839
RO
76#include <net/ip.h>
77#include <net/protocol.h>
78#include <net/route.h>
79#include <net/tcp.h>
80#include <net/sock.h>
81#include <net/ip_fib.h>
82#include "fib_lookup.h"
83
06ef921d 84#define MAX_STAT_DEPTH 32
19baf839 85
19baf839 86#define KEYLENGTH (8*sizeof(t_key))
19baf839 87
19baf839
RO
88typedef unsigned int t_key;
89
90#define T_TNODE 0
91#define T_LEAF 1
92#define NODE_TYPE_MASK 0x1UL
2373ce1c
RO
93#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94
91b9a277
OJ
95#define IS_TNODE(n) (!(n->parent & T_LEAF))
96#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839 97
b299e4f0 98struct rt_trie_node {
91b9a277 99 unsigned long parent;
8d965444 100 t_key key;
19baf839
RO
101};
102
103struct leaf {
91b9a277 104 unsigned long parent;
8d965444 105 t_key key;
19baf839 106 struct hlist_head list;
2373ce1c 107 struct rcu_head rcu;
19baf839
RO
108};
109
110struct leaf_info {
111 struct hlist_node hlist;
112 int plen;
5c74501f 113 u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
19baf839 114 struct list_head falh;
5c74501f 115 struct rcu_head rcu;
19baf839
RO
116};
117
118struct tnode {
91b9a277 119 unsigned long parent;
8d965444 120 t_key key;
112d8cfc
ED
121 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
122 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
8d965444
ED
123 unsigned int full_children; /* KEYLENGTH bits needed */
124 unsigned int empty_children; /* KEYLENGTH bits needed */
15be75cd
SH
125 union {
126 struct rcu_head rcu;
e0f7cb8c 127 struct tnode *tnode_free;
15be75cd 128 };
0a5c0475 129 struct rt_trie_node __rcu *child[0];
19baf839
RO
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
2f36895a 139 unsigned int resize_node_skipped;
19baf839
RO
140};
141#endif
142
143struct trie_stat {
144 unsigned int totdepth;
145 unsigned int maxdepth;
146 unsigned int tnodes;
147 unsigned int leaves;
148 unsigned int nullpointers;
93672292 149 unsigned int prefixes;
06ef921d 150 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 151};
19baf839
RO
152
153struct trie {
0a5c0475 154 struct rt_trie_node __rcu *trie;
19baf839
RO
155#ifdef CONFIG_IP_FIB_TRIE_STATS
156 struct trie_use_stats stats;
157#endif
19baf839
RO
158};
159
b299e4f0 160static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
a07f5f50 161 int wasfull);
b299e4f0 162static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
163static struct tnode *inflate(struct trie *t, struct tnode *tn);
164static struct tnode *halve(struct trie *t, struct tnode *tn);
e0f7cb8c
JP
165/* tnodes to free after resize(); protected by RTNL */
166static struct tnode *tnode_free_head;
c3059477
JP
167static size_t tnode_free_size;
168
169/*
170 * synchronize_rcu after call_rcu for that many pages; it should be especially
171 * useful before resizing the root node with PREEMPT_NONE configs; the value was
172 * obtained experimentally, aiming to avoid visible slowdown.
173 */
174static const int sync_pages = 128;
19baf839 175
e18b890b 176static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 177static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 178
0a5c0475
ED
179/*
180 * caller must hold RTNL
181 */
182static inline struct tnode *node_parent(const struct rt_trie_node *node)
06801916 183{
0a5c0475
ED
184 unsigned long parent;
185
186 parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
187
188 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
b59cfbf7
ED
189}
190
0a5c0475
ED
191/*
192 * caller must hold RCU read lock or RTNL
193 */
194static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
b59cfbf7 195{
0a5c0475
ED
196 unsigned long parent;
197
198 parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
199 lockdep_rtnl_is_held());
06801916 200
0a5c0475 201 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
06801916
SH
202}
203
cf778b00 204/* Same as rcu_assign_pointer
6440cc9e
SH
205 * but that macro() assumes that value is a pointer.
206 */
b299e4f0 207static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
06801916 208{
6440cc9e
SH
209 smp_wmb();
210 node->parent = (unsigned long)ptr | NODE_TYPE(node);
06801916 211}
2373ce1c 212
0a5c0475
ED
213/*
214 * caller must hold RTNL
215 */
216static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
b59cfbf7
ED
217{
218 BUG_ON(i >= 1U << tn->bits);
2373ce1c 219
0a5c0475 220 return rtnl_dereference(tn->child[i]);
b59cfbf7
ED
221}
222
0a5c0475
ED
223/*
224 * caller must hold RCU read lock or RTNL
225 */
226static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
19baf839 227{
0a5c0475 228 BUG_ON(i >= 1U << tn->bits);
19baf839 229
0a5c0475 230 return rcu_dereference_rtnl(tn->child[i]);
19baf839
RO
231}
232
bb435b8d 233static inline int tnode_child_length(const struct tnode *tn)
19baf839 234{
91b9a277 235 return 1 << tn->bits;
19baf839
RO
236}
237
3b004569 238static inline t_key mask_pfx(t_key k, unsigned int l)
ab66b4a7
SH
239{
240 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
241}
242
3b004569 243static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
19baf839 244{
91b9a277 245 if (offset < KEYLENGTH)
19baf839 246 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 247 else
19baf839
RO
248 return 0;
249}
250
251static inline int tkey_equals(t_key a, t_key b)
252{
c877efb2 253 return a == b;
19baf839
RO
254}
255
256static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
257{
c877efb2
SH
258 if (bits == 0 || offset >= KEYLENGTH)
259 return 1;
91b9a277
OJ
260 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
261 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 262}
19baf839
RO
263
264static inline int tkey_mismatch(t_key a, int offset, t_key b)
265{
266 t_key diff = a ^ b;
267 int i = offset;
268
c877efb2
SH
269 if (!diff)
270 return 0;
271 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
272 i++;
273 return i;
274}
275
19baf839 276/*
e905a9ed
YH
277 To understand this stuff, an understanding of keys and all their bits is
278 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
279 all of the bits in that key are significant.
280
281 Consider a node 'n' and its parent 'tp'.
282
e905a9ed
YH
283 If n is a leaf, every bit in its key is significant. Its presence is
284 necessitated by path compression, since during a tree traversal (when
285 searching for a leaf - unless we are doing an insertion) we will completely
286 ignore all skipped bits we encounter. Thus we need to verify, at the end of
287 a potentially successful search, that we have indeed been walking the
19baf839
RO
288 correct key path.
289
e905a9ed
YH
290 Note that we can never "miss" the correct key in the tree if present by
291 following the wrong path. Path compression ensures that segments of the key
292 that are the same for all keys with a given prefix are skipped, but the
293 skipped part *is* identical for each node in the subtrie below the skipped
294 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
295 call to tkey_sub_equals() in trie_insert().
296
e905a9ed 297 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
298 have many different meanings.
299
e905a9ed 300 Example:
19baf839
RO
301 _________________________________________________________________
302 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
303 -----------------------------------------------------------------
e905a9ed 304 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
305
306 _________________________________________________________________
307 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
308 -----------------------------------------------------------------
309 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
310
311 tp->pos = 7
312 tp->bits = 3
313 n->pos = 15
91b9a277 314 n->bits = 4
19baf839 315
e905a9ed
YH
316 First, let's just ignore the bits that come before the parent tp, that is
317 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
318 not use them for anything.
319
320 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 321 index into the parent's child array. That is, they will be used to find
19baf839
RO
322 'n' among tp's children.
323
324 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
325 for the node n.
326
e905a9ed 327 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
328 of the bits are really not needed or indeed known in n->key.
329
e905a9ed 330 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 331 n's child array, and will of course be different for each child.
e905a9ed 332
c877efb2 333
19baf839
RO
334 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
335 at this point.
336
337*/
338
0c7770c7 339static inline void check_tnode(const struct tnode *tn)
19baf839 340{
0c7770c7 341 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
342}
343
f5026fab
DL
344static const int halve_threshold = 25;
345static const int inflate_threshold = 50;
345aa031 346static const int halve_threshold_root = 15;
80b71b80 347static const int inflate_threshold_root = 30;
2373ce1c
RO
348
349static void __alias_free_mem(struct rcu_head *head)
19baf839 350{
2373ce1c
RO
351 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
352 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
353}
354
2373ce1c 355static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 356{
2373ce1c
RO
357 call_rcu(&fa->rcu, __alias_free_mem);
358}
91b9a277 359
2373ce1c
RO
360static void __leaf_free_rcu(struct rcu_head *head)
361{
bc3c8c1e
SH
362 struct leaf *l = container_of(head, struct leaf, rcu);
363 kmem_cache_free(trie_leaf_kmem, l);
2373ce1c 364}
91b9a277 365
387a5487
SH
366static inline void free_leaf(struct leaf *l)
367{
0c03eca3 368 call_rcu(&l->rcu, __leaf_free_rcu);
387a5487
SH
369}
370
2373ce1c 371static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 372{
bceb0f45 373 kfree_rcu(leaf, rcu);
19baf839
RO
374}
375
8d965444 376static struct tnode *tnode_alloc(size_t size)
f0e36f8c 377{
2373ce1c 378 if (size <= PAGE_SIZE)
8d965444 379 return kzalloc(size, GFP_KERNEL);
15be75cd 380 else
7a1c8e5a 381 return vzalloc(size);
15be75cd 382}
2373ce1c 383
2373ce1c 384static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 385{
2373ce1c 386 struct tnode *tn = container_of(head, struct tnode, rcu);
8d965444 387 size_t size = sizeof(struct tnode) +
b299e4f0 388 (sizeof(struct rt_trie_node *) << tn->bits);
f0e36f8c
PM
389
390 if (size <= PAGE_SIZE)
391 kfree(tn);
00203563
AV
392 else
393 vfree(tn);
f0e36f8c
PM
394}
395
2373ce1c
RO
396static inline void tnode_free(struct tnode *tn)
397{
387a5487
SH
398 if (IS_LEAF(tn))
399 free_leaf((struct leaf *) tn);
400 else
550e29bc 401 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
402}
403
e0f7cb8c
JP
404static void tnode_free_safe(struct tnode *tn)
405{
406 BUG_ON(IS_LEAF(tn));
7b85576d
JP
407 tn->tnode_free = tnode_free_head;
408 tnode_free_head = tn;
c3059477 409 tnode_free_size += sizeof(struct tnode) +
b299e4f0 410 (sizeof(struct rt_trie_node *) << tn->bits);
e0f7cb8c
JP
411}
412
413static void tnode_free_flush(void)
414{
415 struct tnode *tn;
416
417 while ((tn = tnode_free_head)) {
418 tnode_free_head = tn->tnode_free;
419 tn->tnode_free = NULL;
420 tnode_free(tn);
421 }
c3059477
JP
422
423 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
424 tnode_free_size = 0;
425 synchronize_rcu();
426 }
e0f7cb8c
JP
427}
428
2373ce1c
RO
429static struct leaf *leaf_new(void)
430{
bc3c8c1e 431 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c
RO
432 if (l) {
433 l->parent = T_LEAF;
434 INIT_HLIST_HEAD(&l->list);
435 }
436 return l;
437}
438
439static struct leaf_info *leaf_info_new(int plen)
440{
441 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
442 if (li) {
443 li->plen = plen;
5c74501f 444 li->mask_plen = ntohl(inet_make_mask(plen));
2373ce1c
RO
445 INIT_LIST_HEAD(&li->falh);
446 }
447 return li;
448}
449
a07f5f50 450static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 451{
b299e4f0 452 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
f0e36f8c 453 struct tnode *tn = tnode_alloc(sz);
19baf839 454
91b9a277 455 if (tn) {
2373ce1c 456 tn->parent = T_TNODE;
19baf839
RO
457 tn->pos = pos;
458 tn->bits = bits;
459 tn->key = key;
460 tn->full_children = 0;
461 tn->empty_children = 1<<bits;
462 }
c877efb2 463
a034ee3c 464 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
4ea4bf7e 465 sizeof(struct rt_trie_node *) << bits);
19baf839
RO
466 return tn;
467}
468
19baf839
RO
469/*
470 * Check whether a tnode 'n' is "full", i.e. it is an internal node
471 * and no bits are skipped. See discussion in dyntree paper p. 6
472 */
473
b299e4f0 474static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
19baf839 475{
c877efb2 476 if (n == NULL || IS_LEAF(n))
19baf839
RO
477 return 0;
478
479 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
480}
481
61648d91 482static inline void put_child(struct tnode *tn, int i,
b299e4f0 483 struct rt_trie_node *n)
19baf839
RO
484{
485 tnode_put_child_reorg(tn, i, n, -1);
486}
487
c877efb2 488 /*
19baf839
RO
489 * Add a child at position i overwriting the old value.
490 * Update the value of full_children and empty_children.
491 */
492
b299e4f0 493static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
a07f5f50 494 int wasfull)
19baf839 495{
0a5c0475 496 struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
19baf839
RO
497 int isfull;
498
0c7770c7
SH
499 BUG_ON(i >= 1<<tn->bits);
500
19baf839
RO
501 /* update emptyChildren */
502 if (n == NULL && chi != NULL)
503 tn->empty_children++;
504 else if (n != NULL && chi == NULL)
505 tn->empty_children--;
c877efb2 506
19baf839 507 /* update fullChildren */
91b9a277 508 if (wasfull == -1)
19baf839
RO
509 wasfull = tnode_full(tn, chi);
510
511 isfull = tnode_full(tn, n);
c877efb2 512 if (wasfull && !isfull)
19baf839 513 tn->full_children--;
c877efb2 514 else if (!wasfull && isfull)
19baf839 515 tn->full_children++;
91b9a277 516
c877efb2 517 if (n)
06801916 518 node_set_parent(n, tn);
19baf839 519
cf778b00 520 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
521}
522
80b71b80 523#define MAX_WORK 10
b299e4f0 524static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
525{
526 int i;
2f80b3c8 527 struct tnode *old_tn;
e6308be8
RO
528 int inflate_threshold_use;
529 int halve_threshold_use;
80b71b80 530 int max_work;
19baf839 531
e905a9ed 532 if (!tn)
19baf839
RO
533 return NULL;
534
0c7770c7
SH
535 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
536 tn, inflate_threshold, halve_threshold);
19baf839
RO
537
538 /* No children */
539 if (tn->empty_children == tnode_child_length(tn)) {
e0f7cb8c 540 tnode_free_safe(tn);
19baf839
RO
541 return NULL;
542 }
543 /* One child */
544 if (tn->empty_children == tnode_child_length(tn) - 1)
80b71b80 545 goto one_child;
c877efb2 546 /*
19baf839
RO
547 * Double as long as the resulting node has a number of
548 * nonempty nodes that are above the threshold.
549 */
550
551 /*
c877efb2
SH
552 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
553 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 554 * Telecommunications, page 6:
c877efb2 555 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
556 * children in the *doubled* node is at least 'high'."
557 *
c877efb2
SH
558 * 'high' in this instance is the variable 'inflate_threshold'. It
559 * is expressed as a percentage, so we multiply it with
560 * tnode_child_length() and instead of multiplying by 2 (since the
561 * child array will be doubled by inflate()) and multiplying
562 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 563 * multiply the left-hand side by 50.
c877efb2
SH
564 *
565 * The left-hand side may look a bit weird: tnode_child_length(tn)
566 * - tn->empty_children is of course the number of non-null children
567 * in the current node. tn->full_children is the number of "full"
19baf839 568 * children, that is non-null tnodes with a skip value of 0.
c877efb2 569 * All of those will be doubled in the resulting inflated tnode, so
19baf839 570 * we just count them one extra time here.
c877efb2 571 *
19baf839 572 * A clearer way to write this would be:
c877efb2 573 *
19baf839 574 * to_be_doubled = tn->full_children;
c877efb2 575 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
576 * tn->full_children;
577 *
578 * new_child_length = tnode_child_length(tn) * 2;
579 *
c877efb2 580 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
581 * new_child_length;
582 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
583 *
584 * ...and so on, tho it would mess up the while () loop.
585 *
19baf839
RO
586 * anyway,
587 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
588 * inflate_threshold
c877efb2 589 *
19baf839
RO
590 * avoid a division:
591 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
592 * inflate_threshold * new_child_length
c877efb2 593 *
19baf839 594 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 595 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 596 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 597 *
19baf839 598 * expand new_child_length:
c877efb2 599 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 600 * tn->full_children) >=
19baf839 601 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 602 *
19baf839 603 * shorten again:
c877efb2 604 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 605 * tn->empty_children) >= inflate_threshold *
19baf839 606 * tnode_child_length(tn)
c877efb2 607 *
19baf839
RO
608 */
609
610 check_tnode(tn);
c877efb2 611
e6308be8
RO
612 /* Keep root node larger */
613
b299e4f0 614 if (!node_parent((struct rt_trie_node *)tn)) {
80b71b80
JL
615 inflate_threshold_use = inflate_threshold_root;
616 halve_threshold_use = halve_threshold_root;
a034ee3c 617 } else {
e6308be8 618 inflate_threshold_use = inflate_threshold;
80b71b80
JL
619 halve_threshold_use = halve_threshold;
620 }
e6308be8 621
80b71b80
JL
622 max_work = MAX_WORK;
623 while ((tn->full_children > 0 && max_work-- &&
a07f5f50
SH
624 50 * (tn->full_children + tnode_child_length(tn)
625 - tn->empty_children)
626 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 627
2f80b3c8
RO
628 old_tn = tn;
629 tn = inflate(t, tn);
a07f5f50 630
2f80b3c8
RO
631 if (IS_ERR(tn)) {
632 tn = old_tn;
2f36895a
RO
633#ifdef CONFIG_IP_FIB_TRIE_STATS
634 t->stats.resize_node_skipped++;
635#endif
636 break;
637 }
19baf839
RO
638 }
639
640 check_tnode(tn);
641
80b71b80 642 /* Return if at least one inflate is run */
a034ee3c 643 if (max_work != MAX_WORK)
b299e4f0 644 return (struct rt_trie_node *) tn;
80b71b80 645
19baf839
RO
646 /*
647 * Halve as long as the number of empty children in this
648 * node is above threshold.
649 */
2f36895a 650
80b71b80
JL
651 max_work = MAX_WORK;
652 while (tn->bits > 1 && max_work-- &&
19baf839 653 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 654 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 655
2f80b3c8
RO
656 old_tn = tn;
657 tn = halve(t, tn);
658 if (IS_ERR(tn)) {
659 tn = old_tn;
2f36895a
RO
660#ifdef CONFIG_IP_FIB_TRIE_STATS
661 t->stats.resize_node_skipped++;
662#endif
663 break;
664 }
665 }
19baf839 666
c877efb2 667
19baf839 668 /* Only one child remains */
80b71b80
JL
669 if (tn->empty_children == tnode_child_length(tn) - 1) {
670one_child:
19baf839 671 for (i = 0; i < tnode_child_length(tn); i++) {
b299e4f0 672 struct rt_trie_node *n;
19baf839 673
0a5c0475 674 n = rtnl_dereference(tn->child[i]);
2373ce1c 675 if (!n)
91b9a277 676 continue;
91b9a277
OJ
677
678 /* compress one level */
679
06801916 680 node_set_parent(n, NULL);
e0f7cb8c 681 tnode_free_safe(tn);
91b9a277 682 return n;
19baf839 683 }
80b71b80 684 }
b299e4f0 685 return (struct rt_trie_node *) tn;
19baf839
RO
686}
687
0a5c0475
ED
688
689static void tnode_clean_free(struct tnode *tn)
690{
691 int i;
692 struct tnode *tofree;
693
694 for (i = 0; i < tnode_child_length(tn); i++) {
695 tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
696 if (tofree)
697 tnode_free(tofree);
698 }
699 tnode_free(tn);
700}
701
2f80b3c8 702static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839 703{
19baf839
RO
704 struct tnode *oldtnode = tn;
705 int olen = tnode_child_length(tn);
706 int i;
707
0c7770c7 708 pr_debug("In inflate\n");
19baf839
RO
709
710 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
711
0c7770c7 712 if (!tn)
2f80b3c8 713 return ERR_PTR(-ENOMEM);
2f36895a
RO
714
715 /*
c877efb2
SH
716 * Preallocate and store tnodes before the actual work so we
717 * don't get into an inconsistent state if memory allocation
718 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
719 * of tnode is ignored.
720 */
91b9a277
OJ
721
722 for (i = 0; i < olen; i++) {
a07f5f50 723 struct tnode *inode;
2f36895a 724
a07f5f50 725 inode = (struct tnode *) tnode_get_child(oldtnode, i);
2f36895a
RO
726 if (inode &&
727 IS_TNODE(inode) &&
728 inode->pos == oldtnode->pos + oldtnode->bits &&
729 inode->bits > 1) {
730 struct tnode *left, *right;
ab66b4a7 731 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 732
2f36895a
RO
733 left = tnode_new(inode->key&(~m), inode->pos + 1,
734 inode->bits - 1);
2f80b3c8
RO
735 if (!left)
736 goto nomem;
91b9a277 737
2f36895a
RO
738 right = tnode_new(inode->key|m, inode->pos + 1,
739 inode->bits - 1);
740
e905a9ed 741 if (!right) {
2f80b3c8
RO
742 tnode_free(left);
743 goto nomem;
e905a9ed 744 }
2f36895a 745
61648d91
LM
746 put_child(tn, 2*i, (struct rt_trie_node *) left);
747 put_child(tn, 2*i+1, (struct rt_trie_node *) right);
2f36895a
RO
748 }
749 }
750
91b9a277 751 for (i = 0; i < olen; i++) {
c95aaf9a 752 struct tnode *inode;
b299e4f0 753 struct rt_trie_node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
754 struct tnode *left, *right;
755 int size, j;
c877efb2 756
19baf839
RO
757 /* An empty child */
758 if (node == NULL)
759 continue;
760
761 /* A leaf or an internal node with skipped bits */
762
c877efb2 763 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 764 tn->pos + tn->bits - 1) {
a07f5f50
SH
765 if (tkey_extract_bits(node->key,
766 oldtnode->pos + oldtnode->bits,
767 1) == 0)
61648d91 768 put_child(tn, 2*i, node);
19baf839 769 else
61648d91 770 put_child(tn, 2*i+1, node);
19baf839
RO
771 continue;
772 }
773
774 /* An internal node with two children */
775 inode = (struct tnode *) node;
776
777 if (inode->bits == 1) {
61648d91
LM
778 put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
779 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
19baf839 780
e0f7cb8c 781 tnode_free_safe(inode);
91b9a277 782 continue;
19baf839
RO
783 }
784
91b9a277
OJ
785 /* An internal node with more than two children */
786
787 /* We will replace this node 'inode' with two new
788 * ones, 'left' and 'right', each with half of the
789 * original children. The two new nodes will have
790 * a position one bit further down the key and this
791 * means that the "significant" part of their keys
792 * (see the discussion near the top of this file)
793 * will differ by one bit, which will be "0" in
794 * left's key and "1" in right's key. Since we are
795 * moving the key position by one step, the bit that
796 * we are moving away from - the bit at position
797 * (inode->pos) - is the one that will differ between
798 * left and right. So... we synthesize that bit in the
799 * two new keys.
800 * The mask 'm' below will be a single "one" bit at
801 * the position (inode->pos)
802 */
19baf839 803
91b9a277
OJ
804 /* Use the old key, but set the new significant
805 * bit to zero.
806 */
2f36895a 807
91b9a277 808 left = (struct tnode *) tnode_get_child(tn, 2*i);
61648d91 809 put_child(tn, 2*i, NULL);
2f36895a 810
91b9a277 811 BUG_ON(!left);
2f36895a 812
91b9a277 813 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
61648d91 814 put_child(tn, 2*i+1, NULL);
19baf839 815
91b9a277 816 BUG_ON(!right);
19baf839 817
91b9a277
OJ
818 size = tnode_child_length(left);
819 for (j = 0; j < size; j++) {
61648d91
LM
820 put_child(left, j, rtnl_dereference(inode->child[j]));
821 put_child(right, j, rtnl_dereference(inode->child[j + size]));
19baf839 822 }
61648d91
LM
823 put_child(tn, 2*i, resize(t, left));
824 put_child(tn, 2*i+1, resize(t, right));
91b9a277 825
e0f7cb8c 826 tnode_free_safe(inode);
19baf839 827 }
e0f7cb8c 828 tnode_free_safe(oldtnode);
19baf839 829 return tn;
2f80b3c8 830nomem:
0a5c0475
ED
831 tnode_clean_free(tn);
832 return ERR_PTR(-ENOMEM);
19baf839
RO
833}
834
2f80b3c8 835static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
836{
837 struct tnode *oldtnode = tn;
b299e4f0 838 struct rt_trie_node *left, *right;
19baf839
RO
839 int i;
840 int olen = tnode_child_length(tn);
841
0c7770c7 842 pr_debug("In halve\n");
c877efb2
SH
843
844 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 845
2f80b3c8
RO
846 if (!tn)
847 return ERR_PTR(-ENOMEM);
2f36895a
RO
848
849 /*
c877efb2
SH
850 * Preallocate and store tnodes before the actual work so we
851 * don't get into an inconsistent state if memory allocation
852 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
853 * of tnode is ignored.
854 */
855
91b9a277 856 for (i = 0; i < olen; i += 2) {
2f36895a
RO
857 left = tnode_get_child(oldtnode, i);
858 right = tnode_get_child(oldtnode, i+1);
c877efb2 859
2f36895a 860 /* Two nonempty children */
0c7770c7 861 if (left && right) {
2f80b3c8 862 struct tnode *newn;
0c7770c7 863
2f80b3c8 864 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
865
866 if (!newn)
2f80b3c8 867 goto nomem;
0c7770c7 868
61648d91 869 put_child(tn, i/2, (struct rt_trie_node *)newn);
2f36895a 870 }
2f36895a 871
2f36895a 872 }
19baf839 873
91b9a277
OJ
874 for (i = 0; i < olen; i += 2) {
875 struct tnode *newBinNode;
876
19baf839
RO
877 left = tnode_get_child(oldtnode, i);
878 right = tnode_get_child(oldtnode, i+1);
c877efb2 879
19baf839
RO
880 /* At least one of the children is empty */
881 if (left == NULL) {
882 if (right == NULL) /* Both are empty */
883 continue;
61648d91 884 put_child(tn, i/2, right);
91b9a277 885 continue;
0c7770c7 886 }
91b9a277
OJ
887
888 if (right == NULL) {
61648d91 889 put_child(tn, i/2, left);
91b9a277
OJ
890 continue;
891 }
c877efb2 892
19baf839 893 /* Two nonempty children */
91b9a277 894 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
61648d91
LM
895 put_child(tn, i/2, NULL);
896 put_child(newBinNode, 0, left);
897 put_child(newBinNode, 1, right);
898 put_child(tn, i/2, resize(t, newBinNode));
19baf839 899 }
e0f7cb8c 900 tnode_free_safe(oldtnode);
19baf839 901 return tn;
2f80b3c8 902nomem:
0a5c0475
ED
903 tnode_clean_free(tn);
904 return ERR_PTR(-ENOMEM);
19baf839
RO
905}
906
772cb712 907/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
908 via get_fa_head and dump */
909
772cb712 910static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 911{
772cb712 912 struct hlist_head *head = &l->list;
19baf839
RO
913 struct leaf_info *li;
914
b67bfe0d 915 hlist_for_each_entry_rcu(li, head, hlist)
c877efb2 916 if (li->plen == plen)
19baf839 917 return li;
91b9a277 918
19baf839
RO
919 return NULL;
920}
921
a07f5f50 922static inline struct list_head *get_fa_head(struct leaf *l, int plen)
19baf839 923{
772cb712 924 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 925
91b9a277
OJ
926 if (!li)
927 return NULL;
c877efb2 928
91b9a277 929 return &li->falh;
19baf839
RO
930}
931
932static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
933{
e905a9ed 934 struct leaf_info *li = NULL, *last = NULL;
e905a9ed
YH
935
936 if (hlist_empty(head)) {
937 hlist_add_head_rcu(&new->hlist, head);
938 } else {
b67bfe0d 939 hlist_for_each_entry(li, head, hlist) {
e905a9ed
YH
940 if (new->plen > li->plen)
941 break;
942
943 last = li;
944 }
945 if (last)
946 hlist_add_after_rcu(&last->hlist, &new->hlist);
947 else
948 hlist_add_before_rcu(&new->hlist, &li->hlist);
949 }
19baf839
RO
950}
951
2373ce1c
RO
952/* rcu_read_lock needs to be hold by caller from readside */
953
19baf839
RO
954static struct leaf *
955fib_find_node(struct trie *t, u32 key)
956{
957 int pos;
958 struct tnode *tn;
b299e4f0 959 struct rt_trie_node *n;
19baf839
RO
960
961 pos = 0;
a034ee3c 962 n = rcu_dereference_rtnl(t->trie);
19baf839
RO
963
964 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
965 tn = (struct tnode *) n;
91b9a277 966
19baf839 967 check_tnode(tn);
91b9a277 968
c877efb2 969 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 970 pos = tn->pos + tn->bits;
a07f5f50
SH
971 n = tnode_get_child_rcu(tn,
972 tkey_extract_bits(key,
973 tn->pos,
974 tn->bits));
91b9a277 975 } else
19baf839
RO
976 break;
977 }
978 /* Case we have found a leaf. Compare prefixes */
979
91b9a277
OJ
980 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
981 return (struct leaf *)n;
982
19baf839
RO
983 return NULL;
984}
985
7b85576d 986static void trie_rebalance(struct trie *t, struct tnode *tn)
19baf839 987{
19baf839 988 int wasfull;
3ed18d76 989 t_key cindex, key;
06801916 990 struct tnode *tp;
19baf839 991
3ed18d76
RO
992 key = tn->key;
993
b299e4f0 994 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
19baf839
RO
995 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
996 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
e3192690 997 tn = (struct tnode *)resize(t, tn);
a07f5f50 998
e3192690 999 tnode_put_child_reorg(tp, cindex,
b299e4f0 1000 (struct rt_trie_node *)tn, wasfull);
91b9a277 1001
b299e4f0 1002 tp = node_parent((struct rt_trie_node *) tn);
008440e3 1003 if (!tp)
cf778b00 1004 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
008440e3 1005
e0f7cb8c 1006 tnode_free_flush();
06801916 1007 if (!tp)
19baf839 1008 break;
06801916 1009 tn = tp;
19baf839 1010 }
06801916 1011
19baf839 1012 /* Handle last (top) tnode */
7b85576d 1013 if (IS_TNODE(tn))
e3192690 1014 tn = (struct tnode *)resize(t, tn);
19baf839 1015
cf778b00 1016 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
7b85576d 1017 tnode_free_flush();
19baf839
RO
1018}
1019
2373ce1c
RO
1020/* only used from updater-side */
1021
fea86ad8 1022static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
1023{
1024 int pos, newpos;
1025 struct tnode *tp = NULL, *tn = NULL;
b299e4f0 1026 struct rt_trie_node *n;
19baf839
RO
1027 struct leaf *l;
1028 int missbit;
c877efb2 1029 struct list_head *fa_head = NULL;
19baf839
RO
1030 struct leaf_info *li;
1031 t_key cindex;
1032
1033 pos = 0;
0a5c0475 1034 n = rtnl_dereference(t->trie);
19baf839 1035
c877efb2
SH
1036 /* If we point to NULL, stop. Either the tree is empty and we should
1037 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1038 * and we should just put our new leaf in that.
c877efb2
SH
1039 * If we point to a T_TNODE, check if it matches our key. Note that
1040 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1041 * not be the parent's 'pos'+'bits'!
1042 *
c877efb2 1043 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1044 * the index from our key, push the T_TNODE and walk the tree.
1045 *
1046 * If it doesn't, we have to replace it with a new T_TNODE.
1047 *
c877efb2
SH
1048 * If we point to a T_LEAF, it might or might not have the same key
1049 * as we do. If it does, just change the value, update the T_LEAF's
1050 * value, and return it.
19baf839
RO
1051 * If it doesn't, we need to replace it with a T_TNODE.
1052 */
1053
1054 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1055 tn = (struct tnode *) n;
91b9a277 1056
c877efb2 1057 check_tnode(tn);
91b9a277 1058
c877efb2 1059 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1060 tp = tn;
91b9a277 1061 pos = tn->pos + tn->bits;
a07f5f50
SH
1062 n = tnode_get_child(tn,
1063 tkey_extract_bits(key,
1064 tn->pos,
1065 tn->bits));
19baf839 1066
06801916 1067 BUG_ON(n && node_parent(n) != tn);
91b9a277 1068 } else
19baf839
RO
1069 break;
1070 }
1071
1072 /*
1073 * n ----> NULL, LEAF or TNODE
1074 *
c877efb2 1075 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1076 */
1077
91b9a277 1078 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1079
1080 /* Case 1: n is a leaf. Compare prefixes */
1081
c877efb2 1082 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
c95aaf9a 1083 l = (struct leaf *) n;
19baf839 1084 li = leaf_info_new(plen);
91b9a277 1085
fea86ad8
SH
1086 if (!li)
1087 return NULL;
19baf839
RO
1088
1089 fa_head = &li->falh;
1090 insert_leaf_info(&l->list, li);
1091 goto done;
1092 }
19baf839
RO
1093 l = leaf_new();
1094
fea86ad8
SH
1095 if (!l)
1096 return NULL;
19baf839
RO
1097
1098 l->key = key;
1099 li = leaf_info_new(plen);
1100
c877efb2 1101 if (!li) {
387a5487 1102 free_leaf(l);
fea86ad8 1103 return NULL;
f835e471 1104 }
19baf839
RO
1105
1106 fa_head = &li->falh;
1107 insert_leaf_info(&l->list, li);
1108
19baf839 1109 if (t->trie && n == NULL) {
91b9a277 1110 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1111
b299e4f0 1112 node_set_parent((struct rt_trie_node *)l, tp);
19baf839 1113
91b9a277 1114 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
61648d91 1115 put_child(tp, cindex, (struct rt_trie_node *)l);
91b9a277
OJ
1116 } else {
1117 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1118 /*
1119 * Add a new tnode here
19baf839
RO
1120 * first tnode need some special handling
1121 */
1122
1123 if (tp)
91b9a277 1124 pos = tp->pos+tp->bits;
19baf839 1125 else
91b9a277
OJ
1126 pos = 0;
1127
c877efb2 1128 if (n) {
19baf839
RO
1129 newpos = tkey_mismatch(key, pos, n->key);
1130 tn = tnode_new(n->key, newpos, 1);
91b9a277 1131 } else {
19baf839 1132 newpos = 0;
c877efb2 1133 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1134 }
19baf839 1135
c877efb2 1136 if (!tn) {
f835e471 1137 free_leaf_info(li);
387a5487 1138 free_leaf(l);
fea86ad8 1139 return NULL;
91b9a277
OJ
1140 }
1141
b299e4f0 1142 node_set_parent((struct rt_trie_node *)tn, tp);
19baf839 1143
91b9a277 1144 missbit = tkey_extract_bits(key, newpos, 1);
61648d91
LM
1145 put_child(tn, missbit, (struct rt_trie_node *)l);
1146 put_child(tn, 1-missbit, n);
19baf839 1147
c877efb2 1148 if (tp) {
19baf839 1149 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
61648d91 1150 put_child(tp, cindex, (struct rt_trie_node *)tn);
91b9a277 1151 } else {
cf778b00 1152 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
19baf839
RO
1153 tp = tn;
1154 }
1155 }
91b9a277
OJ
1156
1157 if (tp && tp->pos + tp->bits > 32)
058bd4d2
JP
1158 pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1159 tp, tp->pos, tp->bits, key, plen);
91b9a277 1160
19baf839 1161 /* Rebalance the trie */
2373ce1c 1162
7b85576d 1163 trie_rebalance(t, tp);
f835e471 1164done:
19baf839
RO
1165 return fa_head;
1166}
1167
d562f1f8
RO
1168/*
1169 * Caller must hold RTNL.
1170 */
16c6cf8b 1171int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1172{
1173 struct trie *t = (struct trie *) tb->tb_data;
1174 struct fib_alias *fa, *new_fa;
c877efb2 1175 struct list_head *fa_head = NULL;
19baf839 1176 struct fib_info *fi;
4e902c57
TG
1177 int plen = cfg->fc_dst_len;
1178 u8 tos = cfg->fc_tos;
19baf839
RO
1179 u32 key, mask;
1180 int err;
1181 struct leaf *l;
1182
1183 if (plen > 32)
1184 return -EINVAL;
1185
4e902c57 1186 key = ntohl(cfg->fc_dst);
19baf839 1187
2dfe55b4 1188 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1189
91b9a277 1190 mask = ntohl(inet_make_mask(plen));
19baf839 1191
c877efb2 1192 if (key & ~mask)
19baf839
RO
1193 return -EINVAL;
1194
1195 key = key & mask;
1196
4e902c57
TG
1197 fi = fib_create_info(cfg);
1198 if (IS_ERR(fi)) {
1199 err = PTR_ERR(fi);
19baf839 1200 goto err;
4e902c57 1201 }
19baf839
RO
1202
1203 l = fib_find_node(t, key);
c877efb2 1204 fa = NULL;
19baf839 1205
c877efb2 1206 if (l) {
19baf839
RO
1207 fa_head = get_fa_head(l, plen);
1208 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1209 }
1210
1211 /* Now fa, if non-NULL, points to the first fib alias
1212 * with the same keys [prefix,tos,priority], if such key already
1213 * exists or to the node before which we will insert new one.
1214 *
1215 * If fa is NULL, we will need to allocate a new one and
1216 * insert to the head of f.
1217 *
1218 * If f is NULL, no fib node matched the destination key
1219 * and we need to allocate a new one of those as well.
1220 */
1221
936f6f8e
JA
1222 if (fa && fa->fa_tos == tos &&
1223 fa->fa_info->fib_priority == fi->fib_priority) {
1224 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1225
1226 err = -EEXIST;
4e902c57 1227 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1228 goto out;
1229
936f6f8e
JA
1230 /* We have 2 goals:
1231 * 1. Find exact match for type, scope, fib_info to avoid
1232 * duplicate routes
1233 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1234 */
1235 fa_match = NULL;
1236 fa_first = fa;
1237 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1238 list_for_each_entry_continue(fa, fa_head, fa_list) {
1239 if (fa->fa_tos != tos)
1240 break;
1241 if (fa->fa_info->fib_priority != fi->fib_priority)
1242 break;
1243 if (fa->fa_type == cfg->fc_type &&
936f6f8e
JA
1244 fa->fa_info == fi) {
1245 fa_match = fa;
1246 break;
1247 }
1248 }
1249
4e902c57 1250 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1251 struct fib_info *fi_drop;
1252 u8 state;
1253
936f6f8e
JA
1254 fa = fa_first;
1255 if (fa_match) {
1256 if (fa == fa_match)
1257 err = 0;
6725033f 1258 goto out;
936f6f8e 1259 }
2373ce1c 1260 err = -ENOBUFS;
e94b1766 1261 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1262 if (new_fa == NULL)
1263 goto out;
19baf839
RO
1264
1265 fi_drop = fa->fa_info;
2373ce1c
RO
1266 new_fa->fa_tos = fa->fa_tos;
1267 new_fa->fa_info = fi;
4e902c57 1268 new_fa->fa_type = cfg->fc_type;
19baf839 1269 state = fa->fa_state;
936f6f8e 1270 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1271
2373ce1c
RO
1272 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1273 alias_free_mem_rcu(fa);
19baf839
RO
1274
1275 fib_release_info(fi_drop);
1276 if (state & FA_S_ACCESSED)
4ccfe6d4 1277 rt_cache_flush(cfg->fc_nlinfo.nl_net);
b8f55831
MK
1278 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1279 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1280
91b9a277 1281 goto succeeded;
19baf839
RO
1282 }
1283 /* Error if we find a perfect match which
1284 * uses the same scope, type, and nexthop
1285 * information.
1286 */
936f6f8e
JA
1287 if (fa_match)
1288 goto out;
a07f5f50 1289
4e902c57 1290 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1291 fa = fa_first;
19baf839
RO
1292 }
1293 err = -ENOENT;
4e902c57 1294 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1295 goto out;
1296
1297 err = -ENOBUFS;
e94b1766 1298 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1299 if (new_fa == NULL)
1300 goto out;
1301
1302 new_fa->fa_info = fi;
1303 new_fa->fa_tos = tos;
4e902c57 1304 new_fa->fa_type = cfg->fc_type;
19baf839 1305 new_fa->fa_state = 0;
19baf839
RO
1306 /*
1307 * Insert new entry to the list.
1308 */
1309
c877efb2 1310 if (!fa_head) {
fea86ad8
SH
1311 fa_head = fib_insert_node(t, key, plen);
1312 if (unlikely(!fa_head)) {
1313 err = -ENOMEM;
f835e471 1314 goto out_free_new_fa;
fea86ad8 1315 }
f835e471 1316 }
19baf839 1317
21d8c49e
DM
1318 if (!plen)
1319 tb->tb_num_default++;
1320
2373ce1c
RO
1321 list_add_tail_rcu(&new_fa->fa_list,
1322 (fa ? &fa->fa_list : fa_head));
19baf839 1323
4ccfe6d4 1324 rt_cache_flush(cfg->fc_nlinfo.nl_net);
4e902c57 1325 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1326 &cfg->fc_nlinfo, 0);
19baf839
RO
1327succeeded:
1328 return 0;
f835e471
RO
1329
1330out_free_new_fa:
1331 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1332out:
1333 fib_release_info(fi);
91b9a277 1334err:
19baf839
RO
1335 return err;
1336}
1337
772cb712 1338/* should be called with rcu_read_lock */
5b470441 1339static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
22bd5b9b 1340 t_key key, const struct flowi4 *flp,
ebc0ffae 1341 struct fib_result *res, int fib_flags)
19baf839 1342{
19baf839
RO
1343 struct leaf_info *li;
1344 struct hlist_head *hhead = &l->list;
c877efb2 1345
b67bfe0d 1346 hlist_for_each_entry_rcu(li, hhead, hlist) {
3be0686b 1347 struct fib_alias *fa;
a07f5f50 1348
5c74501f 1349 if (l->key != (key & li->mask_plen))
19baf839
RO
1350 continue;
1351
3be0686b
DM
1352 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1353 struct fib_info *fi = fa->fa_info;
1354 int nhsel, err;
a07f5f50 1355
22bd5b9b 1356 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
3be0686b 1357 continue;
dccd9ecc
DM
1358 if (fi->fib_dead)
1359 continue;
37e826c5 1360 if (fa->fa_info->fib_scope < flp->flowi4_scope)
3be0686b
DM
1361 continue;
1362 fib_alias_accessed(fa);
1363 err = fib_props[fa->fa_type].error;
1364 if (err) {
19baf839 1365#ifdef CONFIG_IP_FIB_TRIE_STATS
1fbc7843 1366 t->stats.semantic_match_passed++;
3be0686b 1367#endif
1fbc7843 1368 return err;
3be0686b
DM
1369 }
1370 if (fi->fib_flags & RTNH_F_DEAD)
1371 continue;
1372 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1373 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1374
1375 if (nh->nh_flags & RTNH_F_DEAD)
1376 continue;
22bd5b9b 1377 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
3be0686b
DM
1378 continue;
1379
1380#ifdef CONFIG_IP_FIB_TRIE_STATS
1381 t->stats.semantic_match_passed++;
1382#endif
5c74501f 1383 res->prefixlen = li->plen;
3be0686b
DM
1384 res->nh_sel = nhsel;
1385 res->type = fa->fa_type;
37e826c5 1386 res->scope = fa->fa_info->fib_scope;
3be0686b
DM
1387 res->fi = fi;
1388 res->table = tb;
1389 res->fa_head = &li->falh;
1390 if (!(fib_flags & FIB_LOOKUP_NOREF))
5c74501f 1391 atomic_inc(&fi->fib_clntref);
3be0686b
DM
1392 return 0;
1393 }
1394 }
1395
1396#ifdef CONFIG_IP_FIB_TRIE_STATS
1397 t->stats.semantic_match_miss++;
19baf839 1398#endif
19baf839 1399 }
a07f5f50 1400
2e655571 1401 return 1;
19baf839
RO
1402}
1403
22bd5b9b 1404int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
ebc0ffae 1405 struct fib_result *res, int fib_flags)
19baf839
RO
1406{
1407 struct trie *t = (struct trie *) tb->tb_data;
2e655571 1408 int ret;
b299e4f0 1409 struct rt_trie_node *n;
19baf839 1410 struct tnode *pn;
3b004569 1411 unsigned int pos, bits;
22bd5b9b 1412 t_key key = ntohl(flp->daddr);
3b004569 1413 unsigned int chopped_off;
19baf839 1414 t_key cindex = 0;
3b004569 1415 unsigned int current_prefix_length = KEYLENGTH;
91b9a277 1416 struct tnode *cn;
874ffa8f 1417 t_key pref_mismatch;
91b9a277 1418
2373ce1c 1419 rcu_read_lock();
91b9a277 1420
2373ce1c 1421 n = rcu_dereference(t->trie);
c877efb2 1422 if (!n)
19baf839
RO
1423 goto failed;
1424
1425#ifdef CONFIG_IP_FIB_TRIE_STATS
1426 t->stats.gets++;
1427#endif
1428
1429 /* Just a leaf? */
1430 if (IS_LEAF(n)) {
5b470441 1431 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
a07f5f50 1432 goto found;
19baf839 1433 }
a07f5f50 1434
19baf839
RO
1435 pn = (struct tnode *) n;
1436 chopped_off = 0;
c877efb2 1437
91b9a277 1438 while (pn) {
19baf839
RO
1439 pos = pn->pos;
1440 bits = pn->bits;
1441
c877efb2 1442 if (!chopped_off)
ab66b4a7
SH
1443 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1444 pos, bits);
19baf839 1445
b902e573 1446 n = tnode_get_child_rcu(pn, cindex);
19baf839
RO
1447
1448 if (n == NULL) {
1449#ifdef CONFIG_IP_FIB_TRIE_STATS
1450 t->stats.null_node_hit++;
1451#endif
1452 goto backtrace;
1453 }
1454
91b9a277 1455 if (IS_LEAF(n)) {
5b470441 1456 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
2e655571 1457 if (ret > 0)
91b9a277 1458 goto backtrace;
a07f5f50 1459 goto found;
91b9a277
OJ
1460 }
1461
91b9a277 1462 cn = (struct tnode *)n;
19baf839 1463
91b9a277
OJ
1464 /*
1465 * It's a tnode, and we can do some extra checks here if we
1466 * like, to avoid descending into a dead-end branch.
1467 * This tnode is in the parent's child array at index
1468 * key[p_pos..p_pos+p_bits] but potentially with some bits
1469 * chopped off, so in reality the index may be just a
1470 * subprefix, padded with zero at the end.
1471 * We can also take a look at any skipped bits in this
1472 * tnode - everything up to p_pos is supposed to be ok,
1473 * and the non-chopped bits of the index (se previous
1474 * paragraph) are also guaranteed ok, but the rest is
1475 * considered unknown.
1476 *
1477 * The skipped bits are key[pos+bits..cn->pos].
1478 */
19baf839 1479
91b9a277
OJ
1480 /* If current_prefix_length < pos+bits, we are already doing
1481 * actual prefix matching, which means everything from
1482 * pos+(bits-chopped_off) onward must be zero along some
1483 * branch of this subtree - otherwise there is *no* valid
1484 * prefix present. Here we can only check the skipped
1485 * bits. Remember, since we have already indexed into the
1486 * parent's child array, we know that the bits we chopped of
1487 * *are* zero.
1488 */
19baf839 1489
a07f5f50
SH
1490 /* NOTA BENE: Checking only skipped bits
1491 for the new node here */
19baf839 1492
91b9a277
OJ
1493 if (current_prefix_length < pos+bits) {
1494 if (tkey_extract_bits(cn->key, current_prefix_length,
a07f5f50
SH
1495 cn->pos - current_prefix_length)
1496 || !(cn->child[0]))
91b9a277
OJ
1497 goto backtrace;
1498 }
19baf839 1499
91b9a277
OJ
1500 /*
1501 * If chopped_off=0, the index is fully validated and we
1502 * only need to look at the skipped bits for this, the new,
1503 * tnode. What we actually want to do is to find out if
1504 * these skipped bits match our key perfectly, or if we will
1505 * have to count on finding a matching prefix further down,
1506 * because if we do, we would like to have some way of
1507 * verifying the existence of such a prefix at this point.
1508 */
19baf839 1509
91b9a277
OJ
1510 /* The only thing we can do at this point is to verify that
1511 * any such matching prefix can indeed be a prefix to our
1512 * key, and if the bits in the node we are inspecting that
1513 * do not match our key are not ZERO, this cannot be true.
1514 * Thus, find out where there is a mismatch (before cn->pos)
1515 * and verify that all the mismatching bits are zero in the
1516 * new tnode's key.
1517 */
19baf839 1518
a07f5f50
SH
1519 /*
1520 * Note: We aren't very concerned about the piece of
1521 * the key that precede pn->pos+pn->bits, since these
1522 * have already been checked. The bits after cn->pos
1523 * aren't checked since these are by definition
1524 * "unknown" at this point. Thus, what we want to see
1525 * is if we are about to enter the "prefix matching"
1526 * state, and in that case verify that the skipped
1527 * bits that will prevail throughout this subtree are
1528 * zero, as they have to be if we are to find a
1529 * matching prefix.
91b9a277
OJ
1530 */
1531
874ffa8f 1532 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
91b9a277 1533
a07f5f50
SH
1534 /*
1535 * In short: If skipped bits in this node do not match
1536 * the search key, enter the "prefix matching"
1537 * state.directly.
91b9a277
OJ
1538 */
1539 if (pref_mismatch) {
79cda75a
ED
1540 /* fls(x) = __fls(x) + 1 */
1541 int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
91b9a277 1542
874ffa8f 1543 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
91b9a277
OJ
1544 goto backtrace;
1545
1546 if (current_prefix_length >= cn->pos)
1547 current_prefix_length = mp;
c877efb2 1548 }
a07f5f50 1549
91b9a277
OJ
1550 pn = (struct tnode *)n; /* Descend */
1551 chopped_off = 0;
1552 continue;
1553
19baf839
RO
1554backtrace:
1555 chopped_off++;
1556
1557 /* As zero don't change the child key (cindex) */
a07f5f50
SH
1558 while ((chopped_off <= pn->bits)
1559 && !(cindex & (1<<(chopped_off-1))))
19baf839 1560 chopped_off++;
19baf839
RO
1561
1562 /* Decrease current_... with bits chopped off */
1563 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
a07f5f50
SH
1564 current_prefix_length = pn->pos + pn->bits
1565 - chopped_off;
91b9a277 1566
19baf839 1567 /*
c877efb2 1568 * Either we do the actual chop off according or if we have
19baf839
RO
1569 * chopped off all bits in this tnode walk up to our parent.
1570 */
1571
91b9a277 1572 if (chopped_off <= pn->bits) {
19baf839 1573 cindex &= ~(1 << (chopped_off-1));
91b9a277 1574 } else {
b299e4f0 1575 struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
06801916 1576 if (!parent)
19baf839 1577 goto failed;
91b9a277 1578
19baf839 1579 /* Get Child's index */
06801916
SH
1580 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1581 pn = parent;
19baf839
RO
1582 chopped_off = 0;
1583
1584#ifdef CONFIG_IP_FIB_TRIE_STATS
1585 t->stats.backtrack++;
1586#endif
1587 goto backtrace;
c877efb2 1588 }
19baf839
RO
1589 }
1590failed:
c877efb2 1591 ret = 1;
19baf839 1592found:
2373ce1c 1593 rcu_read_unlock();
19baf839
RO
1594 return ret;
1595}
6fc01438 1596EXPORT_SYMBOL_GPL(fib_table_lookup);
19baf839 1597
9195bef7
SH
1598/*
1599 * Remove the leaf and return parent.
1600 */
1601static void trie_leaf_remove(struct trie *t, struct leaf *l)
19baf839 1602{
b299e4f0 1603 struct tnode *tp = node_parent((struct rt_trie_node *) l);
c877efb2 1604
9195bef7 1605 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1606
c877efb2 1607 if (tp) {
9195bef7 1608 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
61648d91 1609 put_child(tp, cindex, NULL);
7b85576d 1610 trie_rebalance(t, tp);
91b9a277 1611 } else
a9b3cd7f 1612 RCU_INIT_POINTER(t->trie, NULL);
19baf839 1613
387a5487 1614 free_leaf(l);
19baf839
RO
1615}
1616
d562f1f8
RO
1617/*
1618 * Caller must hold RTNL.
1619 */
16c6cf8b 1620int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1621{
1622 struct trie *t = (struct trie *) tb->tb_data;
1623 u32 key, mask;
4e902c57
TG
1624 int plen = cfg->fc_dst_len;
1625 u8 tos = cfg->fc_tos;
19baf839
RO
1626 struct fib_alias *fa, *fa_to_delete;
1627 struct list_head *fa_head;
1628 struct leaf *l;
91b9a277
OJ
1629 struct leaf_info *li;
1630
c877efb2 1631 if (plen > 32)
19baf839
RO
1632 return -EINVAL;
1633
4e902c57 1634 key = ntohl(cfg->fc_dst);
91b9a277 1635 mask = ntohl(inet_make_mask(plen));
19baf839 1636
c877efb2 1637 if (key & ~mask)
19baf839
RO
1638 return -EINVAL;
1639
1640 key = key & mask;
1641 l = fib_find_node(t, key);
1642
c877efb2 1643 if (!l)
19baf839
RO
1644 return -ESRCH;
1645
ad5b3102
IM
1646 li = find_leaf_info(l, plen);
1647
1648 if (!li)
1649 return -ESRCH;
1650
1651 fa_head = &li->falh;
19baf839
RO
1652 fa = fib_find_alias(fa_head, tos, 0);
1653
1654 if (!fa)
1655 return -ESRCH;
1656
0c7770c7 1657 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1658
1659 fa_to_delete = NULL;
936f6f8e
JA
1660 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1661 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1662 struct fib_info *fi = fa->fa_info;
1663
1664 if (fa->fa_tos != tos)
1665 break;
1666
4e902c57
TG
1667 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1668 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
37e826c5 1669 fa->fa_info->fib_scope == cfg->fc_scope) &&
74cb3c10
JA
1670 (!cfg->fc_prefsrc ||
1671 fi->fib_prefsrc == cfg->fc_prefsrc) &&
4e902c57
TG
1672 (!cfg->fc_protocol ||
1673 fi->fib_protocol == cfg->fc_protocol) &&
1674 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1675 fa_to_delete = fa;
1676 break;
1677 }
1678 }
1679
91b9a277
OJ
1680 if (!fa_to_delete)
1681 return -ESRCH;
19baf839 1682
91b9a277 1683 fa = fa_to_delete;
4e902c57 1684 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1685 &cfg->fc_nlinfo, 0);
91b9a277 1686
2373ce1c 1687 list_del_rcu(&fa->fa_list);
19baf839 1688
21d8c49e
DM
1689 if (!plen)
1690 tb->tb_num_default--;
1691
91b9a277 1692 if (list_empty(fa_head)) {
2373ce1c 1693 hlist_del_rcu(&li->hlist);
91b9a277 1694 free_leaf_info(li);
2373ce1c 1695 }
19baf839 1696
91b9a277 1697 if (hlist_empty(&l->list))
9195bef7 1698 trie_leaf_remove(t, l);
19baf839 1699
91b9a277 1700 if (fa->fa_state & FA_S_ACCESSED)
4ccfe6d4 1701 rt_cache_flush(cfg->fc_nlinfo.nl_net);
19baf839 1702
2373ce1c
RO
1703 fib_release_info(fa->fa_info);
1704 alias_free_mem_rcu(fa);
91b9a277 1705 return 0;
19baf839
RO
1706}
1707
ef3660ce 1708static int trie_flush_list(struct list_head *head)
19baf839
RO
1709{
1710 struct fib_alias *fa, *fa_node;
1711 int found = 0;
1712
1713 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1714 struct fib_info *fi = fa->fa_info;
19baf839 1715
2373ce1c
RO
1716 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1717 list_del_rcu(&fa->fa_list);
1718 fib_release_info(fa->fa_info);
1719 alias_free_mem_rcu(fa);
19baf839
RO
1720 found++;
1721 }
1722 }
1723 return found;
1724}
1725
ef3660ce 1726static int trie_flush_leaf(struct leaf *l)
19baf839
RO
1727{
1728 int found = 0;
1729 struct hlist_head *lih = &l->list;
b67bfe0d 1730 struct hlist_node *tmp;
19baf839
RO
1731 struct leaf_info *li = NULL;
1732
b67bfe0d 1733 hlist_for_each_entry_safe(li, tmp, lih, hlist) {
ef3660ce 1734 found += trie_flush_list(&li->falh);
19baf839
RO
1735
1736 if (list_empty(&li->falh)) {
2373ce1c 1737 hlist_del_rcu(&li->hlist);
19baf839
RO
1738 free_leaf_info(li);
1739 }
1740 }
1741 return found;
1742}
1743
82cfbb00
SH
1744/*
1745 * Scan for the next right leaf starting at node p->child[idx]
1746 * Since we have back pointer, no recursion necessary.
1747 */
b299e4f0 1748static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
19baf839 1749{
82cfbb00
SH
1750 do {
1751 t_key idx;
c877efb2 1752
c877efb2 1753 if (c)
82cfbb00 1754 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1755 else
82cfbb00 1756 idx = 0;
2373ce1c 1757
82cfbb00
SH
1758 while (idx < 1u << p->bits) {
1759 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1760 if (!c)
91b9a277
OJ
1761 continue;
1762
8b2b5e27 1763 if (IS_LEAF(c))
82cfbb00 1764 return (struct leaf *) c;
82cfbb00
SH
1765
1766 /* Rescan start scanning in new node */
1767 p = (struct tnode *) c;
1768 idx = 0;
19baf839 1769 }
82cfbb00
SH
1770
1771 /* Node empty, walk back up to parent */
b299e4f0 1772 c = (struct rt_trie_node *) p;
a034ee3c 1773 } while ((p = node_parent_rcu(c)) != NULL);
82cfbb00
SH
1774
1775 return NULL; /* Root of trie */
1776}
1777
82cfbb00
SH
1778static struct leaf *trie_firstleaf(struct trie *t)
1779{
a034ee3c 1780 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
82cfbb00
SH
1781
1782 if (!n)
1783 return NULL;
1784
1785 if (IS_LEAF(n)) /* trie is just a leaf */
1786 return (struct leaf *) n;
1787
1788 return leaf_walk_rcu(n, NULL);
1789}
1790
1791static struct leaf *trie_nextleaf(struct leaf *l)
1792{
b299e4f0 1793 struct rt_trie_node *c = (struct rt_trie_node *) l;
b902e573 1794 struct tnode *p = node_parent_rcu(c);
82cfbb00
SH
1795
1796 if (!p)
1797 return NULL; /* trie with just one leaf */
1798
1799 return leaf_walk_rcu(p, c);
19baf839
RO
1800}
1801
71d67e66
SH
1802static struct leaf *trie_leafindex(struct trie *t, int index)
1803{
1804 struct leaf *l = trie_firstleaf(t);
1805
ec28cf73 1806 while (l && index-- > 0)
71d67e66 1807 l = trie_nextleaf(l);
ec28cf73 1808
71d67e66
SH
1809 return l;
1810}
1811
1812
d562f1f8
RO
1813/*
1814 * Caller must hold RTNL.
1815 */
16c6cf8b 1816int fib_table_flush(struct fib_table *tb)
19baf839
RO
1817{
1818 struct trie *t = (struct trie *) tb->tb_data;
9195bef7 1819 struct leaf *l, *ll = NULL;
82cfbb00 1820 int found = 0;
19baf839 1821
82cfbb00 1822 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
ef3660ce 1823 found += trie_flush_leaf(l);
19baf839
RO
1824
1825 if (ll && hlist_empty(&ll->list))
9195bef7 1826 trie_leaf_remove(t, ll);
19baf839
RO
1827 ll = l;
1828 }
1829
1830 if (ll && hlist_empty(&ll->list))
9195bef7 1831 trie_leaf_remove(t, ll);
19baf839 1832
0c7770c7 1833 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1834 return found;
1835}
1836
4aa2c466
PE
1837void fib_free_table(struct fib_table *tb)
1838{
1839 kfree(tb);
1840}
1841
a07f5f50
SH
1842static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1843 struct fib_table *tb,
19baf839
RO
1844 struct sk_buff *skb, struct netlink_callback *cb)
1845{
1846 int i, s_i;
1847 struct fib_alias *fa;
32ab5f80 1848 __be32 xkey = htonl(key);
19baf839 1849
71d67e66 1850 s_i = cb->args[5];
19baf839
RO
1851 i = 0;
1852
2373ce1c
RO
1853 /* rcu_read_lock is hold by caller */
1854
1855 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1856 if (i < s_i) {
1857 i++;
1858 continue;
1859 }
19baf839 1860
15e47304 1861 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
19baf839
RO
1862 cb->nlh->nlmsg_seq,
1863 RTM_NEWROUTE,
1864 tb->tb_id,
1865 fa->fa_type,
be403ea1 1866 xkey,
19baf839
RO
1867 plen,
1868 fa->fa_tos,
64347f78 1869 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1870 cb->args[5] = i;
19baf839 1871 return -1;
91b9a277 1872 }
19baf839
RO
1873 i++;
1874 }
71d67e66 1875 cb->args[5] = i;
19baf839
RO
1876 return skb->len;
1877}
1878
a88ee229
SH
1879static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1880 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1881{
a88ee229 1882 struct leaf_info *li;
a88ee229 1883 int i, s_i;
19baf839 1884
71d67e66 1885 s_i = cb->args[4];
a88ee229 1886 i = 0;
19baf839 1887
a88ee229 1888 /* rcu_read_lock is hold by caller */
b67bfe0d 1889 hlist_for_each_entry_rcu(li, &l->list, hlist) {
a88ee229
SH
1890 if (i < s_i) {
1891 i++;
19baf839 1892 continue;
a88ee229 1893 }
91b9a277 1894
a88ee229 1895 if (i > s_i)
71d67e66 1896 cb->args[5] = 0;
19baf839 1897
a88ee229 1898 if (list_empty(&li->falh))
19baf839
RO
1899 continue;
1900
a88ee229 1901 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1902 cb->args[4] = i;
19baf839
RO
1903 return -1;
1904 }
a88ee229 1905 i++;
19baf839 1906 }
a88ee229 1907
71d67e66 1908 cb->args[4] = i;
19baf839
RO
1909 return skb->len;
1910}
1911
16c6cf8b
SH
1912int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1913 struct netlink_callback *cb)
19baf839 1914{
a88ee229 1915 struct leaf *l;
19baf839 1916 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 1917 t_key key = cb->args[2];
71d67e66 1918 int count = cb->args[3];
19baf839 1919
2373ce1c 1920 rcu_read_lock();
d5ce8a0e
SH
1921 /* Dump starting at last key.
1922 * Note: 0.0.0.0/0 (ie default) is first key.
1923 */
71d67e66 1924 if (count == 0)
d5ce8a0e
SH
1925 l = trie_firstleaf(t);
1926 else {
71d67e66
SH
1927 /* Normally, continue from last key, but if that is missing
1928 * fallback to using slow rescan
1929 */
d5ce8a0e 1930 l = fib_find_node(t, key);
71d67e66
SH
1931 if (!l)
1932 l = trie_leafindex(t, count);
d5ce8a0e 1933 }
a88ee229 1934
d5ce8a0e
SH
1935 while (l) {
1936 cb->args[2] = l->key;
a88ee229 1937 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 1938 cb->args[3] = count;
a88ee229 1939 rcu_read_unlock();
a88ee229 1940 return -1;
19baf839 1941 }
d5ce8a0e 1942
71d67e66 1943 ++count;
d5ce8a0e 1944 l = trie_nextleaf(l);
71d67e66
SH
1945 memset(&cb->args[4], 0,
1946 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 1947 }
71d67e66 1948 cb->args[3] = count;
2373ce1c 1949 rcu_read_unlock();
a88ee229 1950
19baf839 1951 return skb->len;
19baf839
RO
1952}
1953
5348ba85 1954void __init fib_trie_init(void)
7f9b8052 1955{
a07f5f50
SH
1956 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1957 sizeof(struct fib_alias),
bc3c8c1e
SH
1958 0, SLAB_PANIC, NULL);
1959
1960 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1961 max(sizeof(struct leaf),
1962 sizeof(struct leaf_info)),
1963 0, SLAB_PANIC, NULL);
7f9b8052 1964}
19baf839 1965
7f9b8052 1966
5348ba85 1967struct fib_table *fib_trie_table(u32 id)
19baf839
RO
1968{
1969 struct fib_table *tb;
1970 struct trie *t;
1971
19baf839
RO
1972 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1973 GFP_KERNEL);
1974 if (tb == NULL)
1975 return NULL;
1976
1977 tb->tb_id = id;
971b893e 1978 tb->tb_default = -1;
21d8c49e 1979 tb->tb_num_default = 0;
19baf839
RO
1980
1981 t = (struct trie *) tb->tb_data;
c28a1cf4 1982 memset(t, 0, sizeof(*t));
19baf839 1983
19baf839
RO
1984 return tb;
1985}
1986
cb7b593c
SH
1987#ifdef CONFIG_PROC_FS
1988/* Depth first Trie walk iterator */
1989struct fib_trie_iter {
1c340b2f 1990 struct seq_net_private p;
3d3b2d25 1991 struct fib_table *tb;
cb7b593c 1992 struct tnode *tnode;
a034ee3c
ED
1993 unsigned int index;
1994 unsigned int depth;
cb7b593c 1995};
19baf839 1996
b299e4f0 1997static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 1998{
cb7b593c 1999 struct tnode *tn = iter->tnode;
a034ee3c 2000 unsigned int cindex = iter->index;
cb7b593c 2001 struct tnode *p;
19baf839 2002
6640e697
EB
2003 /* A single entry routing table */
2004 if (!tn)
2005 return NULL;
2006
cb7b593c
SH
2007 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2008 iter->tnode, iter->index, iter->depth);
2009rescan:
2010 while (cindex < (1<<tn->bits)) {
b299e4f0 2011 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
19baf839 2012
cb7b593c
SH
2013 if (n) {
2014 if (IS_LEAF(n)) {
2015 iter->tnode = tn;
2016 iter->index = cindex + 1;
2017 } else {
2018 /* push down one level */
2019 iter->tnode = (struct tnode *) n;
2020 iter->index = 0;
2021 ++iter->depth;
2022 }
2023 return n;
2024 }
19baf839 2025
cb7b593c
SH
2026 ++cindex;
2027 }
91b9a277 2028
cb7b593c 2029 /* Current node exhausted, pop back up */
b299e4f0 2030 p = node_parent_rcu((struct rt_trie_node *)tn);
cb7b593c
SH
2031 if (p) {
2032 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2033 tn = p;
2034 --iter->depth;
2035 goto rescan;
19baf839 2036 }
cb7b593c
SH
2037
2038 /* got root? */
2039 return NULL;
19baf839
RO
2040}
2041
b299e4f0 2042static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
cb7b593c 2043 struct trie *t)
19baf839 2044{
b299e4f0 2045 struct rt_trie_node *n;
5ddf0eb2 2046
132adf54 2047 if (!t)
5ddf0eb2
RO
2048 return NULL;
2049
2050 n = rcu_dereference(t->trie);
3d3b2d25 2051 if (!n)
5ddf0eb2 2052 return NULL;
19baf839 2053
3d3b2d25
SH
2054 if (IS_TNODE(n)) {
2055 iter->tnode = (struct tnode *) n;
2056 iter->index = 0;
2057 iter->depth = 1;
2058 } else {
2059 iter->tnode = NULL;
2060 iter->index = 0;
2061 iter->depth = 0;
91b9a277 2062 }
3d3b2d25
SH
2063
2064 return n;
cb7b593c 2065}
91b9a277 2066
cb7b593c
SH
2067static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2068{
b299e4f0 2069 struct rt_trie_node *n;
cb7b593c 2070 struct fib_trie_iter iter;
91b9a277 2071
cb7b593c 2072 memset(s, 0, sizeof(*s));
91b9a277 2073
cb7b593c 2074 rcu_read_lock();
3d3b2d25 2075 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 2076 if (IS_LEAF(n)) {
93672292
SH
2077 struct leaf *l = (struct leaf *)n;
2078 struct leaf_info *li;
93672292 2079
cb7b593c
SH
2080 s->leaves++;
2081 s->totdepth += iter.depth;
2082 if (iter.depth > s->maxdepth)
2083 s->maxdepth = iter.depth;
93672292 2084
b67bfe0d 2085 hlist_for_each_entry_rcu(li, &l->list, hlist)
93672292 2086 ++s->prefixes;
cb7b593c
SH
2087 } else {
2088 const struct tnode *tn = (const struct tnode *) n;
2089 int i;
2090
2091 s->tnodes++;
132adf54 2092 if (tn->bits < MAX_STAT_DEPTH)
06ef921d
RO
2093 s->nodesizes[tn->bits]++;
2094
cb7b593c
SH
2095 for (i = 0; i < (1<<tn->bits); i++)
2096 if (!tn->child[i])
2097 s->nullpointers++;
19baf839 2098 }
19baf839 2099 }
2373ce1c 2100 rcu_read_unlock();
19baf839
RO
2101}
2102
cb7b593c
SH
2103/*
2104 * This outputs /proc/net/fib_triestats
2105 */
2106static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2107{
a034ee3c 2108 unsigned int i, max, pointers, bytes, avdepth;
c877efb2 2109
cb7b593c
SH
2110 if (stat->leaves)
2111 avdepth = stat->totdepth*100 / stat->leaves;
2112 else
2113 avdepth = 0;
91b9a277 2114
a07f5f50
SH
2115 seq_printf(seq, "\tAver depth: %u.%02d\n",
2116 avdepth / 100, avdepth % 100);
cb7b593c 2117 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2118
cb7b593c 2119 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
cb7b593c 2120 bytes = sizeof(struct leaf) * stat->leaves;
93672292
SH
2121
2122 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2123 bytes += sizeof(struct leaf_info) * stat->prefixes;
2124
187b5188 2125 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2126 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2127
06ef921d
RO
2128 max = MAX_STAT_DEPTH;
2129 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2130 max--;
19baf839 2131
cb7b593c
SH
2132 pointers = 0;
2133 for (i = 1; i <= max; i++)
2134 if (stat->nodesizes[i] != 0) {
187b5188 2135 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2136 pointers += (1<<i) * stat->nodesizes[i];
2137 }
2138 seq_putc(seq, '\n');
187b5188 2139 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2140
b299e4f0 2141 bytes += sizeof(struct rt_trie_node *) * pointers;
187b5188
SH
2142 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2143 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2144}
2373ce1c 2145
cb7b593c 2146#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd
SH
2147static void trie_show_usage(struct seq_file *seq,
2148 const struct trie_use_stats *stats)
2149{
2150 seq_printf(seq, "\nCounters:\n---------\n");
a07f5f50
SH
2151 seq_printf(seq, "gets = %u\n", stats->gets);
2152 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2153 seq_printf(seq, "semantic match passed = %u\n",
2154 stats->semantic_match_passed);
2155 seq_printf(seq, "semantic match miss = %u\n",
2156 stats->semantic_match_miss);
2157 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2158 seq_printf(seq, "skipped node resize = %u\n\n",
2159 stats->resize_node_skipped);
cb7b593c 2160}
66a2f7fd
SH
2161#endif /* CONFIG_IP_FIB_TRIE_STATS */
2162
3d3b2d25 2163static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 2164{
3d3b2d25
SH
2165 if (tb->tb_id == RT_TABLE_LOCAL)
2166 seq_puts(seq, "Local:\n");
2167 else if (tb->tb_id == RT_TABLE_MAIN)
2168 seq_puts(seq, "Main:\n");
2169 else
2170 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2171}
19baf839 2172
3d3b2d25 2173
cb7b593c
SH
2174static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2175{
1c340b2f 2176 struct net *net = (struct net *)seq->private;
3d3b2d25 2177 unsigned int h;
877a9bff 2178
d717a9a6 2179 seq_printf(seq,
a07f5f50
SH
2180 "Basic info: size of leaf:"
2181 " %Zd bytes, size of tnode: %Zd bytes.\n",
d717a9a6
SH
2182 sizeof(struct leaf), sizeof(struct tnode));
2183
3d3b2d25
SH
2184 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2185 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25
SH
2186 struct fib_table *tb;
2187
b67bfe0d 2188 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2189 struct trie *t = (struct trie *) tb->tb_data;
2190 struct trie_stat stat;
877a9bff 2191
3d3b2d25
SH
2192 if (!t)
2193 continue;
2194
2195 fib_table_print(seq, tb);
2196
2197 trie_collect_stats(t, &stat);
2198 trie_show_stats(seq, &stat);
2199#ifdef CONFIG_IP_FIB_TRIE_STATS
2200 trie_show_usage(seq, &t->stats);
2201#endif
2202 }
2203 }
19baf839 2204
cb7b593c 2205 return 0;
19baf839
RO
2206}
2207
cb7b593c 2208static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2209{
de05c557 2210 return single_open_net(inode, file, fib_triestat_seq_show);
1c340b2f
DL
2211}
2212
9a32144e 2213static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2214 .owner = THIS_MODULE,
2215 .open = fib_triestat_seq_open,
2216 .read = seq_read,
2217 .llseek = seq_lseek,
b6fcbdb4 2218 .release = single_release_net,
cb7b593c
SH
2219};
2220
b299e4f0 2221static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2222{
1218854a
YH
2223 struct fib_trie_iter *iter = seq->private;
2224 struct net *net = seq_file_net(seq);
cb7b593c 2225 loff_t idx = 0;
3d3b2d25 2226 unsigned int h;
cb7b593c 2227
3d3b2d25
SH
2228 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2229 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
3d3b2d25 2230 struct fib_table *tb;
cb7b593c 2231
b67bfe0d 2232 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
b299e4f0 2233 struct rt_trie_node *n;
3d3b2d25
SH
2234
2235 for (n = fib_trie_get_first(iter,
2236 (struct trie *) tb->tb_data);
2237 n; n = fib_trie_get_next(iter))
2238 if (pos == idx++) {
2239 iter->tb = tb;
2240 return n;
2241 }
2242 }
cb7b593c 2243 }
3d3b2d25 2244
19baf839
RO
2245 return NULL;
2246}
2247
cb7b593c 2248static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2249 __acquires(RCU)
19baf839 2250{
cb7b593c 2251 rcu_read_lock();
1218854a 2252 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2253}
2254
cb7b593c 2255static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2256{
cb7b593c 2257 struct fib_trie_iter *iter = seq->private;
1218854a 2258 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2259 struct fib_table *tb = iter->tb;
2260 struct hlist_node *tb_node;
2261 unsigned int h;
b299e4f0 2262 struct rt_trie_node *n;
cb7b593c 2263
19baf839 2264 ++*pos;
3d3b2d25
SH
2265 /* next node in same table */
2266 n = fib_trie_get_next(iter);
2267 if (n)
2268 return n;
19baf839 2269
3d3b2d25
SH
2270 /* walk rest of this hash chain */
2271 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
0a5c0475 2272 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
3d3b2d25
SH
2273 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2274 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2275 if (n)
2276 goto found;
2277 }
19baf839 2278
3d3b2d25
SH
2279 /* new hash chain */
2280 while (++h < FIB_TABLE_HASHSZ) {
2281 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
b67bfe0d 2282 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
3d3b2d25
SH
2283 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2284 if (n)
2285 goto found;
2286 }
2287 }
cb7b593c 2288 return NULL;
3d3b2d25
SH
2289
2290found:
2291 iter->tb = tb;
2292 return n;
cb7b593c 2293}
19baf839 2294
cb7b593c 2295static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2296 __releases(RCU)
19baf839 2297{
cb7b593c
SH
2298 rcu_read_unlock();
2299}
91b9a277 2300
cb7b593c
SH
2301static void seq_indent(struct seq_file *seq, int n)
2302{
a034ee3c
ED
2303 while (n-- > 0)
2304 seq_puts(seq, " ");
cb7b593c 2305}
19baf839 2306
28d36e37 2307static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2308{
132adf54 2309 switch (s) {
cb7b593c
SH
2310 case RT_SCOPE_UNIVERSE: return "universe";
2311 case RT_SCOPE_SITE: return "site";
2312 case RT_SCOPE_LINK: return "link";
2313 case RT_SCOPE_HOST: return "host";
2314 case RT_SCOPE_NOWHERE: return "nowhere";
2315 default:
28d36e37 2316 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2317 return buf;
2318 }
2319}
19baf839 2320
36cbd3dc 2321static const char *const rtn_type_names[__RTN_MAX] = {
cb7b593c
SH
2322 [RTN_UNSPEC] = "UNSPEC",
2323 [RTN_UNICAST] = "UNICAST",
2324 [RTN_LOCAL] = "LOCAL",
2325 [RTN_BROADCAST] = "BROADCAST",
2326 [RTN_ANYCAST] = "ANYCAST",
2327 [RTN_MULTICAST] = "MULTICAST",
2328 [RTN_BLACKHOLE] = "BLACKHOLE",
2329 [RTN_UNREACHABLE] = "UNREACHABLE",
2330 [RTN_PROHIBIT] = "PROHIBIT",
2331 [RTN_THROW] = "THROW",
2332 [RTN_NAT] = "NAT",
2333 [RTN_XRESOLVE] = "XRESOLVE",
2334};
19baf839 2335
a034ee3c 2336static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
cb7b593c 2337{
cb7b593c
SH
2338 if (t < __RTN_MAX && rtn_type_names[t])
2339 return rtn_type_names[t];
28d36e37 2340 snprintf(buf, len, "type %u", t);
cb7b593c 2341 return buf;
19baf839
RO
2342}
2343
cb7b593c
SH
2344/* Pretty print the trie */
2345static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2346{
cb7b593c 2347 const struct fib_trie_iter *iter = seq->private;
b299e4f0 2348 struct rt_trie_node *n = v;
c877efb2 2349
3d3b2d25
SH
2350 if (!node_parent_rcu(n))
2351 fib_table_print(seq, iter->tb);
095b8501 2352
cb7b593c
SH
2353 if (IS_TNODE(n)) {
2354 struct tnode *tn = (struct tnode *) n;
ab66b4a7 2355 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
91b9a277 2356
1d25cd6c 2357 seq_indent(seq, iter->depth-1);
673d57e7
HH
2358 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2359 &prf, tn->pos, tn->bits, tn->full_children,
1d25cd6c 2360 tn->empty_children);
e905a9ed 2361
cb7b593c
SH
2362 } else {
2363 struct leaf *l = (struct leaf *) n;
1328042e 2364 struct leaf_info *li;
32ab5f80 2365 __be32 val = htonl(l->key);
cb7b593c
SH
2366
2367 seq_indent(seq, iter->depth);
673d57e7 2368 seq_printf(seq, " |-- %pI4\n", &val);
1328042e 2369
b67bfe0d 2370 hlist_for_each_entry_rcu(li, &l->list, hlist) {
1328042e
SH
2371 struct fib_alias *fa;
2372
2373 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2374 char buf1[32], buf2[32];
2375
2376 seq_indent(seq, iter->depth+1);
2377 seq_printf(seq, " /%d %s %s", li->plen,
2378 rtn_scope(buf1, sizeof(buf1),
37e826c5 2379 fa->fa_info->fib_scope),
1328042e
SH
2380 rtn_type(buf2, sizeof(buf2),
2381 fa->fa_type));
2382 if (fa->fa_tos)
b9c4d82a 2383 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2384 seq_putc(seq, '\n');
cb7b593c
SH
2385 }
2386 }
19baf839 2387 }
cb7b593c 2388
19baf839
RO
2389 return 0;
2390}
2391
f690808e 2392static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2393 .start = fib_trie_seq_start,
2394 .next = fib_trie_seq_next,
2395 .stop = fib_trie_seq_stop,
2396 .show = fib_trie_seq_show,
19baf839
RO
2397};
2398
cb7b593c 2399static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2400{
1c340b2f
DL
2401 return seq_open_net(inode, file, &fib_trie_seq_ops,
2402 sizeof(struct fib_trie_iter));
19baf839
RO
2403}
2404
9a32144e 2405static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2406 .owner = THIS_MODULE,
2407 .open = fib_trie_seq_open,
2408 .read = seq_read,
2409 .llseek = seq_lseek,
1c340b2f 2410 .release = seq_release_net,
19baf839
RO
2411};
2412
8315f5d8
SH
2413struct fib_route_iter {
2414 struct seq_net_private p;
2415 struct trie *main_trie;
2416 loff_t pos;
2417 t_key key;
2418};
2419
2420static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2421{
2422 struct leaf *l = NULL;
2423 struct trie *t = iter->main_trie;
2424
2425 /* use cache location of last found key */
2426 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2427 pos -= iter->pos;
2428 else {
2429 iter->pos = 0;
2430 l = trie_firstleaf(t);
2431 }
2432
2433 while (l && pos-- > 0) {
2434 iter->pos++;
2435 l = trie_nextleaf(l);
2436 }
2437
2438 if (l)
2439 iter->key = pos; /* remember it */
2440 else
2441 iter->pos = 0; /* forget it */
2442
2443 return l;
2444}
2445
2446static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2447 __acquires(RCU)
2448{
2449 struct fib_route_iter *iter = seq->private;
2450 struct fib_table *tb;
2451
2452 rcu_read_lock();
1218854a 2453 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2454 if (!tb)
2455 return NULL;
2456
2457 iter->main_trie = (struct trie *) tb->tb_data;
2458 if (*pos == 0)
2459 return SEQ_START_TOKEN;
2460 else
2461 return fib_route_get_idx(iter, *pos - 1);
2462}
2463
2464static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2465{
2466 struct fib_route_iter *iter = seq->private;
2467 struct leaf *l = v;
2468
2469 ++*pos;
2470 if (v == SEQ_START_TOKEN) {
2471 iter->pos = 0;
2472 l = trie_firstleaf(iter->main_trie);
2473 } else {
2474 iter->pos++;
2475 l = trie_nextleaf(l);
2476 }
2477
2478 if (l)
2479 iter->key = l->key;
2480 else
2481 iter->pos = 0;
2482 return l;
2483}
2484
2485static void fib_route_seq_stop(struct seq_file *seq, void *v)
2486 __releases(RCU)
2487{
2488 rcu_read_unlock();
2489}
2490
a034ee3c 2491static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2492{
a034ee3c 2493 unsigned int flags = 0;
19baf839 2494
a034ee3c
ED
2495 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2496 flags = RTF_REJECT;
cb7b593c
SH
2497 if (fi && fi->fib_nh->nh_gw)
2498 flags |= RTF_GATEWAY;
32ab5f80 2499 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2500 flags |= RTF_HOST;
2501 flags |= RTF_UP;
2502 return flags;
19baf839
RO
2503}
2504
cb7b593c
SH
2505/*
2506 * This outputs /proc/net/route.
2507 * The format of the file is not supposed to be changed
a034ee3c 2508 * and needs to be same as fib_hash output to avoid breaking
cb7b593c
SH
2509 * legacy utilities
2510 */
2511static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2512{
cb7b593c 2513 struct leaf *l = v;
1328042e 2514 struct leaf_info *li;
19baf839 2515
cb7b593c
SH
2516 if (v == SEQ_START_TOKEN) {
2517 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2518 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2519 "\tWindow\tIRTT");
2520 return 0;
2521 }
19baf839 2522
b67bfe0d 2523 hlist_for_each_entry_rcu(li, &l->list, hlist) {
cb7b593c 2524 struct fib_alias *fa;
32ab5f80 2525 __be32 mask, prefix;
91b9a277 2526
cb7b593c
SH
2527 mask = inet_make_mask(li->plen);
2528 prefix = htonl(l->key);
19baf839 2529
cb7b593c 2530 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2531 const struct fib_info *fi = fa->fa_info;
a034ee3c 2532 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
5e659e4c 2533 int len;
19baf839 2534
cb7b593c
SH
2535 if (fa->fa_type == RTN_BROADCAST
2536 || fa->fa_type == RTN_MULTICAST)
2537 continue;
19baf839 2538
cb7b593c 2539 if (fi)
5e659e4c
PE
2540 seq_printf(seq,
2541 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2542 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c
SH
2543 fi->fib_dev ? fi->fib_dev->name : "*",
2544 prefix,
2545 fi->fib_nh->nh_gw, flags, 0, 0,
2546 fi->fib_priority,
2547 mask,
a07f5f50
SH
2548 (fi->fib_advmss ?
2549 fi->fib_advmss + 40 : 0),
cb7b593c 2550 fi->fib_window,
5e659e4c 2551 fi->fib_rtt >> 3, &len);
cb7b593c 2552 else
5e659e4c
PE
2553 seq_printf(seq,
2554 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2555 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c 2556 prefix, 0, flags, 0, 0, 0,
5e659e4c 2557 mask, 0, 0, 0, &len);
19baf839 2558
5e659e4c 2559 seq_printf(seq, "%*s\n", 127 - len, "");
cb7b593c 2560 }
19baf839
RO
2561 }
2562
2563 return 0;
2564}
2565
f690808e 2566static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2567 .start = fib_route_seq_start,
2568 .next = fib_route_seq_next,
2569 .stop = fib_route_seq_stop,
cb7b593c 2570 .show = fib_route_seq_show,
19baf839
RO
2571};
2572
cb7b593c 2573static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2574{
1c340b2f 2575 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2576 sizeof(struct fib_route_iter));
19baf839
RO
2577}
2578
9a32144e 2579static const struct file_operations fib_route_fops = {
cb7b593c
SH
2580 .owner = THIS_MODULE,
2581 .open = fib_route_seq_open,
2582 .read = seq_read,
2583 .llseek = seq_lseek,
1c340b2f 2584 .release = seq_release_net,
19baf839
RO
2585};
2586
61a02653 2587int __net_init fib_proc_init(struct net *net)
19baf839 2588{
d4beaa66 2589 if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
cb7b593c
SH
2590 goto out1;
2591
d4beaa66
G
2592 if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2593 &fib_triestat_fops))
cb7b593c
SH
2594 goto out2;
2595
d4beaa66 2596 if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
cb7b593c
SH
2597 goto out3;
2598
19baf839 2599 return 0;
cb7b593c
SH
2600
2601out3:
ece31ffd 2602 remove_proc_entry("fib_triestat", net->proc_net);
cb7b593c 2603out2:
ece31ffd 2604 remove_proc_entry("fib_trie", net->proc_net);
cb7b593c
SH
2605out1:
2606 return -ENOMEM;
19baf839
RO
2607}
2608
61a02653 2609void __net_exit fib_proc_exit(struct net *net)
19baf839 2610{
ece31ffd
G
2611 remove_proc_entry("fib_trie", net->proc_net);
2612 remove_proc_entry("fib_triestat", net->proc_net);
2613 remove_proc_entry("route", net->proc_net);
19baf839
RO
2614}
2615
2616#endif /* CONFIG_PROC_FS */