/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.default_mtu =		ipv4_default_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
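/*
 * Added note (not in the original file): this table is indexed by the
 * TOS bits shifted right by one.  A minimal usage sketch, assuming the
 * rt_tos2priority() helper declared in include/net/route.h:
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * e.g. tos == IPTOS_LOWDELAY (0x10) indexes slot 8, TC_PRIO_INTERACTIVE.
 */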


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
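/*
 * Added note: an illustrative reader-side sketch implied by the scheme
 * above; keys_match() is a hypothetical stand-in for the compare_keys()
 * test used by the real lookup paths later in this file:
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
 *		if (keys_match(rth)) {
 *			dst_use(&rth->dst, jiffies);	// takes a reference
 *			break;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 */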

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks.  The size of this table is a power of two and depends on
 * the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
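/* Added note: e.g. with RT_HASH_LOCK_SZ == 256, hash slots 5, 261 and
 * 517 all share rt_hash_locks[5], so one lock guards many buckets.
 */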

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
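/*
 * Added note: a cached input route lives in bucket
 * rt_hash(daddr, saddr, iif, rt_genid(net)).  Because the per-netns
 * generation number seeds the hash, bumping it in rt_cache_invalidate()
 * below both moves new lookups to fresh buckets and makes stale entries
 * fail the rt_genid checks, so no synchronous flush is required.
 */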

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->fl.fl4_tos,
			   r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
			   r->dst.hh ? (r->dst.hh->hh_output ==
					dev_queue_xmit) : 0,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	  = THIS_MODULE,
	.open	  = rt_acct_proc_open,
	.read	  = seq_read,
	.llseek	  = seq_lseek,
	.release  = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in the hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->dst.expires &&
	    time_after_eq(jiffies, rth->dst.expires))
		goto out;

	age = jiffies - rth->dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
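/*
 * Added note, worked example: a just-used, "valuable" output route gets
 * bits 31 and 30 set on top of a large inverted-age counter, while an
 * old input-path broadcast entry scores near zero; rt_intern_hash()
 * below evicts the lowest-scoring unreferenced entry first.
 */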

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct flowi *fl1,
				       const struct flowi *fl2)
{
	return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
		(fl1->iif ^ fl2->iif)) == 0);
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This gives an estimation of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
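/*
 * Added note, worked example: with FRACT_BITS == 3, an average chain
 * length of 2.5 is stored as 2.5 * ONE == 20.  For avg == 20 and
 * sd == 4 (i.e. 0.5), (avg + 4*sd) >> FRACT_BITS == 36 >> 3 == 4, so
 * rt_chain_length_max becomes max(ip_rt_gc_elasticity, 4) == 8 with
 * the default elasticity of 8.
 */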

/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(&aux->fl, &rth->fl))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without reusing a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}
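/*
 * Added note, example: if rt_genid reads 1000 and get_random_bytes()
 * yields 41, the generation becomes 1000 + 42 == 1042.  Every cached
 * entry still carrying rt_genid == 1000 now fails rt_is_expired() and
 * is reaped lazily by the scan loops above instead of being freed
 * synchronously.
 */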

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously cache-invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit cache size.
 */
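/*
 * Added note, worked example with assumed values: for rt_hash_log == 10
 * and the default ip_rt_gc_elasticity of 8, a cache of 10000 entries
 * gives goal = 10000 - (8 << 10) = 1808 entries to expire in one
 * rt_garbage_collect() pass; below 8192 entries goal <= 0 and the pass
 * merely adjusts the equilibrium.
 */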

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire is reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
#endif

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
#endif
out:	return 0;
}

/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = arp_bind_neighbour(&rt->dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return err;
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->dst);
			return 0;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain
		 * length; when exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = arp_bind_neighbour(&rt->dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->dst);
	return 0;
}

static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(rt->rt_dst, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct rtable *rth;
	struct rtable __rcu **rthp;
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rt_is_input_route(rth) ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->dst.dev), net)) {
					rthp = &rth->dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->dst.dev != dev)
					break;

				dst_hold(&rth->dst);

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->dst.__use		= 1;
				atomic_set(&rt->dst.__refcnt, 1);
				rt->dst.child		= NULL;
				if (rt->dst.dev)
					dev_hold(rt->dst.dev);
				rt->dst.obsolete	= -1;
				rt->dst.lastuse		= jiffies;
				rt->dst.path		= &rt->dst;
				rt->dst.neighbour	= NULL;
				rt->dst.hh		= NULL;
#ifdef CONFIG_XFRM
				rt->dst.xfrm		= NULL;
#endif
				rt->rt_genid		= rt_genid(net);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);
				if (rt->fi)
					atomic_inc(&rt->fi->fib_clntref);

				if (arp_bind_neighbour(&rt->dst) ||
				    !(rt->dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->dst.neighbour)
						neigh_event_send(rt->dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->dst;
				netevent.new = &rt->dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
					ip_rt_put(rt);
				goto do_next;
			}
		do_next:
			;
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   (rt->dst.expires &&
			    time_after_eq(jiffies, rt->dst.expires))) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
				&rt->rt_dst, rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
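/*
 * Added note, worked example with the defaults above: the minimum gap
 * before the k-th repeated redirect is ip_rt_redirect_load << k, i.e.
 * HZ/50, 2*HZ/50, 4*HZ/50, ... doubling each time.  After
 * ip_rt_redirect_number (9) redirects the host is silenced until
 * ip_rt_redirect_silence == (HZ/50) << 10, roughly 20 seconds, passes.
 */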

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
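/*
 * Added note, example: a 1500-byte packet that triggers a "frag needed"
 * ICMP with an unusable MTU field gives guess_mtu(1500) == 1492, the
 * first plateau below 1500 (the classic Ethernet-behind-PPPoE case).
 */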
1695
b5921910 1696unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
0010e465
TT
1697 unsigned short new_mtu,
1698 struct net_device *dev)
1da177e4 1699{
0010e465 1700 int i, k;
1da177e4
LT
1701 unsigned short old_mtu = ntohs(iph->tot_len);
1702 struct rtable *rth;
0010e465 1703 int ikeys[2] = { dev->ifindex, 0 };
e448515c
AV
1704 __be32 skeys[2] = { iph->saddr, 0, };
1705 __be32 daddr = iph->daddr;
1da177e4
LT
1706 unsigned short est_mtu = 0;
1707
0010e465
TT
1708 for (k = 0; k < 2; k++) {
1709 for (i = 0; i < 2; i++) {
b00180de 1710 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
e84f84f2 1711 rt_genid(net));
0010e465
TT
1712
1713 rcu_read_lock();
1714 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
d8d1f30b 1715 rth = rcu_dereference(rth->dst.rt_next)) {
1da177e4
LT
1716 unsigned short mtu = new_mtu;
1717
0010e465
TT
1718 if (rth->fl.fl4_dst != daddr ||
1719 rth->fl.fl4_src != skeys[i] ||
1720 rth->rt_dst != daddr ||
1721 rth->rt_src != iph->saddr ||
1722 rth->fl.oif != ikeys[k] ||
c7537967 1723 rt_is_input_route(rth) ||
d8d1f30b
CG
1724 dst_metric_locked(&rth->dst, RTAX_MTU) ||
1725 !net_eq(dev_net(rth->dst.dev), net) ||
6c3b8fc6 1726 rt_is_expired(rth))
0010e465
TT
1727 continue;
1728
1da177e4
LT
1729 if (new_mtu < 68 || new_mtu >= old_mtu) {
1730
1731 /* BSD 4.2 compatibility hack :-( */
1732 if (mtu == 0 &&
d8d1f30b 1733 old_mtu >= dst_mtu(&rth->dst) &&
1da177e4
LT
1734 old_mtu >= 68 + (iph->ihl << 2))
1735 old_mtu -= iph->ihl << 2;
1736
1737 mtu = guess_mtu(old_mtu);
1738 }
d8d1f30b
CG
1739 if (mtu <= dst_mtu(&rth->dst)) {
1740 if (mtu < dst_mtu(&rth->dst)) {
1741 dst_confirm(&rth->dst);
1da177e4 1742 if (mtu < ip_rt_min_pmtu) {
defb3519
DM
1743 u32 lock = dst_metric(&rth->dst,
1744 RTAX_LOCK);
1da177e4 1745 mtu = ip_rt_min_pmtu;
defb3519
DM
1746 lock |= (1 << RTAX_MTU);
1747 dst_metric_set(&rth->dst, RTAX_LOCK,
1748 lock);
1da177e4 1749 }
defb3519 1750 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
d8d1f30b 1751 dst_set_expires(&rth->dst,
1da177e4
LT
1752 ip_rt_mtu_expires);
1753 }
1754 est_mtu = mtu;
1755 }
1756 }
0010e465 1757 rcu_read_unlock();
1da177e4 1758 }
1da177e4
LT
1759 }
1760 return est_mtu ? : new_mtu;
1761}
1762
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst_mtu(dst) > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			u32 lock = dst_metric(dst, RTAX_LOCK);
			mtu = ip_rt_min_pmtu;
			dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
		}
		dst_metric_set(dst, RTAX_MTU, mtu);
		dst_set_expires(dst, ip_rt_mtu_expires);
	}
}

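/*
 * In both PMTU paths above, an advertised MTU below ip_rt_min_pmtu is
 * clamped to that floor and the MTU metric is locked via the RTAX_LOCK
 * bit, so later (possibly bogus) Fragmentation Needed messages cannot
 * shrink it further; the entry still times out after ip_rt_mtu_expires.
 */
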
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	if (rt->rt_peer_genid != rt_peer_genid()) {
		if (!rt->peer)
			rt_bind_peer(rt, 0);

		rt->rt_peer_genid = rt_peer_genid();
	}
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it stays out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt_is_output_route(rt))
		src = rt->rt_src;
	else {
		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
			src = FIB_RES_PREFSRC(res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}

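/*
 * The 40 above is the 20-byte IPv4 header plus the 20-byte TCP header
 * (both without options), so the default advertised MSS is whatever
 * payload fits in one device-MTU-sized segment, floored at
 * ip_rt_min_advmss and capped to fit the 16-bit MSS field.
 */
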
static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		const struct rtable *rt = (const struct rtable *) dst;

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}

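/*
 * The 576 above appears to be the traditional conservative datagram
 * size for non-local (gatewayed) destinations, in the spirit of
 * RFC 1122; it is applied only when the MTU metric is locked.
 */
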
static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (rt->fl.flags & FLOWI_FLAG_PRECOW_METRICS)
		create = 1;

	rt_bind_peer(rt, create);
	peer = rt->peer;
	if (peer) {
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}

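/*
 * Metrics placement above, as far as the code shows: with a peer, the
 * fib metrics are copied once into the shared inet_peer
 * (inet_metrics_new() seems to return true only for a fresh peer) and
 * the dst uses that writable copy; without one, the dst points straight
 * at the fib_info metrics, marked read-only, and takes a clntref so the
 * fib_info cannot vanish underneath it.
 */
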
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, 0);
		if (err < 0)
			goto e_err;
	}
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
	rth->dst.obsolete = -1;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->dst.dev	= init_net.loopback_dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	if (our) {
		rth->dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}

static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag, skb->mark);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif	=
	rth->fl.iif	= in_dev->dev->ifindex;
	rth->dst.dev	= (out_dev)->dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif	= 0;
	rth->rt_spec_dst = spec_dst;

	rth->dst.obsolete = -1;
	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif,
		       rt_genid(dev_net(rth->dst.dev)));
	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
}

/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must already have the correct destination attached by the
 *	output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi fl = { .fl4_dst = daddr,
			    .fl4_src = saddr,
			    .fl4_tos = tos,
			    .fl4_scope = RT_SCOPE_UNIVERSE,
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net	*net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether to fix it. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	err = fib_lookup(net, &fl, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->dst.dev	= net->loopback_dev;
	dev_hold(rth->dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->dst.input = ip_local_deliver;
	rth->rt_flags	= flags | RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type = res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

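	/*
	 * Branch-free key compare below: the bitwise OR of the XORed
	 * fields is zero only when daddr, saddr, iif and tos all match
	 * and oif is zero, so the common miss path costs a single test.
	 */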
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result a host on a multicast
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    ||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);

/* called with rcu_read_lock() */
static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);

	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (ipv4_is_lbcast(fl->fl4_dst))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		res->fi = NULL;
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use
		 * the default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4)
			res->fi = NULL;
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark	= oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the routing
	   cache entry */
	rth->dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst = fl->fl4_src;

	rth->dst.output = ip_output;
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;
	*result = rth;
	return 0;
}

/* called with rcu_read_lock() */
static int ip_mkroute_output(struct rtable **rp,
			     struct fib_result *res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
			       rt_genid(dev_net(dev_out)));
		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
	}

	return err;
}

/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
			    .fl4_src = oldflp->fl4_src,
			    .fl4_tos = tos & IPTOS_RT_MASK,
			    .fl4_scope = ((tos & RTO_ONLINK) ?
					  RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned int flags = 0;
	struct net_device *dev_out = NULL;
	int err;


	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0 &&
		    (ipv4_is_multicast(oldflp->fl4_dst) ||
		     ipv4_is_lbcast(oldflp->fl4_dst))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero, so
			   that the packet will not leave this host and the
			   route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, oldflp->fl4_src, false))
				goto out;
		}
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index_rcu(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			err = -ENETUNREACH;
			goto out;
		}
		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    ipv4_is_lbcast(oldflp->fl4_dst)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather
			   than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		err = -ENETUNREACH;
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src) {
			if (res.fi->fib_prefsrc)
				fl.fl4_src = res.fi->fib_prefsrc;
			else
				fl.fl4_src = fl.fl4_dst;
		}
		dev_out = net->loopback_dev;
		fl.oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	dev_out = FIB_RES_DEV(res);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

out:	return err;
}

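/*
 * Resolver order above, for reference: validate any requested source
 * address, honour an explicit output interface, fall back to loopback
 * when no destination is given, then consult the FIB and finish with
 * multipath/default-route selection before building the cache entry.
 */
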
int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned int hash;
	int res;
	struct rtable *rth;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rt_is_output_route(rth) &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	rcu_read_lock();
	res = ip_route_output_slow(net, rp, flp);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family		= AF_INET,
	.protocol	= cpu_to_be16(ETH_P_IP),
	.destroy	= ipv4_dst_destroy,
	.check		= ipv4_blackhole_dst_check,
	.default_mtu	= ipv4_blackhole_default_mtu,
	.update_pmtu	= ipv4_rt_blackhole_update_pmtu,
};

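/*
 * A blackhole entry clones an existing route but discards everything
 * sent through it and ignores PMTU updates; ip_route_output_flow()
 * below swaps one in when the xfrm lookup returns -EREMOTE.
 */
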
static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(&(*rp)->dst);
	*rp = rt;
	return rt ? 0 : -ENOMEM;
}

2877
f1b050bf
DL
2878int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2879 struct sock *sk, int flags)
1da177e4
LT
2880{
2881 int err;
2882
f1b050bf 2883 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
1da177e4
LT
2884 return err;
2885
2886 if (flp->proto) {
2887 if (!flp->fl4_src)
2888 flp->fl4_src = (*rp)->rt_src;
2889 if (!flp->fl4_dst)
2890 flp->fl4_dst = (*rp)->rt_dst;
52479b62 2891 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
bb72845e 2892 flags ? XFRM_LOOKUP_WAIT : 0);
14e50e57 2893 if (err == -EREMOTE)
e84f84f2 2894 err = ipv4_dst_blackhole(net, rp, flp);
14e50e57
DM
2895
2896 return err;
1da177e4
LT
2897 }
2898
2899 return 0;
2900}
d8c97a94
ACM
2901EXPORT_SYMBOL_GPL(ip_route_output_flow);
2902
f206351a 2903int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
1da177e4 2904{
f206351a 2905 return ip_route_output_flow(net, rp, flp, NULL, 0);
1da177e4 2906}
4bc2f18b 2907EXPORT_SYMBOL(ip_route_output_key);
1da177e4 2908
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol	= RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->fl.mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);

	error = rt->dst.error;
	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
	if (rt->peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi fl = {
			.fl4_dst = dst,
			.fl4_src = src,
			.fl4_tos = rtm->rtm_tos,
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.mark = mark,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}

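/*
 * Usage sketch (assuming the usual sysctl mount point): the handler
 * above parses the written value as a flush delay, so e.g.
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * requests an immediate flush of the routing cache.
 */
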
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

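/*
 * Boot-time usage: passing e.g. "rhash_entries=262144" on the kernel
 * command line overrides the route cache hash size that ip_rt_init()
 * below would otherwise compute from available memory.
 */
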
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers, started at system startup, tend
	   to synchronize. Perturb it a bit.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif