inetpeer: get rid of ip_id_count
[GitHub/LineageOS/android_kernel_samsung_universal7580.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4
LT
67#include <linux/module.h>
68#include <asm/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4
LT
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
1da177e4
LT
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
1da177e4
LT
89#include <linux/rcupdate.h>
90#include <linux/times.h>
5a0e3ad6 91#include <linux/slab.h>
ff1f69a8 92#include <linux/jhash.h>
352e512c 93#include <net/dst.h>
457c4cbc 94#include <net/net_namespace.h>
1da177e4
LT
95#include <net/protocol.h>
96#include <net/ip.h>
97#include <net/route.h>
98#include <net/inetpeer.h>
99#include <net/sock.h>
100#include <net/ip_fib.h>
101#include <net/arp.h>
102#include <net/tcp.h>
103#include <net/icmp.h>
104#include <net/xfrm.h>
8d71740c 105#include <net/netevent.h>
63f3444f 106#include <net/rtnetlink.h>
1da177e4
LT
107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
7426a564 109#include <linux/kmemleak.h>
1da177e4 110#endif
6e5714ea 111#include <net/secure_seq.h>
1da177e4 112
68a5e3dd 113#define RT_FL_TOS(oldflp4) \
f61759e6 114 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4
LT
115
116#define IP_MAX_MTU 0xFFF0
117
118#define RT_GC_TIMEOUT (300*HZ)
119
1da177e4 120static int ip_rt_max_size;
817bc4db
SH
121static int ip_rt_redirect_number __read_mostly = 9;
122static int ip_rt_redirect_load __read_mostly = HZ / 50;
123static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
124static int ip_rt_error_cost __read_mostly = HZ;
125static int ip_rt_error_burst __read_mostly = 5 * HZ;
817bc4db
SH
126static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
127static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
128static int ip_rt_min_advmss __read_mostly = 256;
9f28a2fc 129
1da177e4
LT
130/*
131 * Interface to generic destination cache.
132 */
133
134static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 135static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 136static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4
LT
137static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
138static void ipv4_link_failure(struct sk_buff *skb);
6700c270
DM
139static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
140 struct sk_buff *skb, u32 mtu);
141static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
142 struct sk_buff *skb);
caacf05e 143static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4 144
72cdd1d9
ED
/* dst_ops ->ifdown callback: there is no per-route state tied to the
 * device here, so nothing needs tearing down; the dst core does the
 * generic cleanup.  Intentionally empty.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
1da177e4 149
62fa8a84
DM
/* dst_ops ->cow_metrics callback.  IPv4 routes get their metrics when
 * they are created and never copy-on-write them afterwards, so reaching
 * this function indicates a bug — hence the WARN_ON and NULL return.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
155
f894cbf8
DM
156static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 struct sk_buff *skb,
158 const void *daddr);
d3aaeb38 159
1da177e4
LT
/* Generic destination-cache operations for IPv4 unicast routes. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
176
177#define ECN_OR_COST(class) TC_PRIO_##class
178
4839c52b 179const __u8 ip_tos2prio[16] = {
1da177e4 180 TC_PRIO_BESTEFFORT,
4a2b9c37 181 ECN_OR_COST(BESTEFFORT),
1da177e4
LT
182 TC_PRIO_BESTEFFORT,
183 ECN_OR_COST(BESTEFFORT),
184 TC_PRIO_BULK,
185 ECN_OR_COST(BULK),
186 TC_PRIO_BULK,
187 ECN_OR_COST(BULK),
188 TC_PRIO_INTERACTIVE,
189 ECN_OR_COST(INTERACTIVE),
190 TC_PRIO_INTERACTIVE,
191 ECN_OR_COST(INTERACTIVE),
192 TC_PRIO_INTERACTIVE_BULK,
193 ECN_OR_COST(INTERACTIVE_BULK),
194 TC_PRIO_INTERACTIVE_BULK,
195 ECN_OR_COST(INTERACTIVE_BULK)
196};
d4a96865 197EXPORT_SYMBOL(ip_tos2prio);
1da177e4 198
2f970d83 199static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
27f39c73 200#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
1da177e4 201
1da177e4 202#ifdef CONFIG_PROC_FS
1da177e4
LT
203static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204{
29e75252 205 if (*pos)
89aef892 206 return NULL;
29e75252 207 return SEQ_START_TOKEN;
1da177e4
LT
208}
209
210static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211{
1da177e4 212 ++*pos;
89aef892 213 return NULL;
1da177e4
LT
214}
215
/* seq_file ->stop: ->start takes no locks and allocates nothing, so
 * there is nothing to release here.
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
219
220static int rt_cache_seq_show(struct seq_file *seq, void *v)
221{
222 if (v == SEQ_START_TOKEN)
223 seq_printf(seq, "%-127s\n",
224 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 "HHUptod\tSpecDst");
e905a9ed 227 return 0;
1da177e4
LT
228}
229
f690808e 230static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
231 .start = rt_cache_seq_start,
232 .next = rt_cache_seq_next,
233 .stop = rt_cache_seq_stop,
234 .show = rt_cache_seq_show,
235};
236
/* Open /proc/net/rt_cache: plain seq_open, no per-open private state. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}
241
9a32144e 242static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
243 .owner = THIS_MODULE,
244 .open = rt_cache_seq_open,
245 .read = seq_read,
246 .llseek = seq_lseek,
89aef892 247 .release = seq_release,
1da177e4
LT
248};
249
250
251static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
252{
253 int cpu;
254
255 if (*pos == 0)
256 return SEQ_START_TOKEN;
257
0f23174a 258 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
259 if (!cpu_possible(cpu))
260 continue;
261 *pos = cpu+1;
2f970d83 262 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
263 }
264 return NULL;
265}
266
267static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
268{
269 int cpu;
270
0f23174a 271 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
272 if (!cpu_possible(cpu))
273 continue;
274 *pos = cpu+1;
2f970d83 275 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
276 }
277 return NULL;
e905a9ed 278
1da177e4
LT
279}
280
/* seq_file ->stop: iteration holds no locks, nothing to undo. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
285
/* seq_file ->show: one line of per-cpu routing statistics, preceded by
 * a header line for the SEQ_START_TOKEN position.  The first column is
 * the global dst entry count, not a per-cpu value.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
319
f690808e 320static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
321 .start = rt_cpu_seq_start,
322 .next = rt_cpu_seq_next,
323 .stop = rt_cpu_seq_stop,
324 .show = rt_cpu_seq_show,
325};
326
327
/* Open /proc/net/stat/rt_cache for the per-cpu statistics iterator. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
332
9a32144e 333static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
334 .owner = THIS_MODULE,
335 .open = rt_cpu_seq_open,
336 .read = seq_read,
337 .llseek = seq_lseek,
338 .release = seq_release,
339};
340
c7066f70 341#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 342static int rt_acct_proc_show(struct seq_file *m, void *v)
78c686e9 343{
a661c419
AD
344 struct ip_rt_acct *dst, *src;
345 unsigned int i, j;
346
347 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
348 if (!dst)
349 return -ENOMEM;
350
351 for_each_possible_cpu(i) {
352 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
353 for (j = 0; j < 256; j++) {
354 dst[j].o_bytes += src[j].o_bytes;
355 dst[j].o_packets += src[j].o_packets;
356 dst[j].i_bytes += src[j].i_bytes;
357 dst[j].i_packets += src[j].i_packets;
358 }
78c686e9
PE
359 }
360
a661c419
AD
361 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
362 kfree(dst);
363 return 0;
364}
78c686e9 365
a661c419
AD
/* Open /proc/net/rt_acct as a single-shot seq_file. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
a661c419
AD
370
371static const struct file_operations rt_acct_proc_fops = {
372 .owner = THIS_MODULE,
373 .open = rt_acct_proc_open,
374 .read = seq_read,
375 .llseek = seq_lseek,
376 .release = single_release,
377};
78c686e9 378#endif
107f1634 379
73b38711 380static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
381{
382 struct proc_dir_entry *pde;
383
d4beaa66
G
384 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
385 &rt_cache_seq_fops);
107f1634
PE
386 if (!pde)
387 goto err1;
388
77020720
WC
389 pde = proc_create("rt_cache", S_IRUGO,
390 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
391 if (!pde)
392 goto err2;
393
c7066f70 394#ifdef CONFIG_IP_ROUTE_CLASSID
a661c419 395 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
107f1634
PE
396 if (!pde)
397 goto err3;
398#endif
399 return 0;
400
c7066f70 401#ifdef CONFIG_IP_ROUTE_CLASSID
107f1634
PE
402err3:
403 remove_proc_entry("rt_cache", net->proc_net_stat);
404#endif
405err2:
406 remove_proc_entry("rt_cache", net->proc_net);
407err1:
408 return -ENOMEM;
409}
73b38711
DL
410
/* Per-netns exit: remove the /proc entries created by
 * ip_rt_do_proc_init(), mirroring its #ifdef structure.
 */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
419
420static struct pernet_operations ip_rt_proc_ops __net_initdata = {
421 .init = ip_rt_do_proc_init,
422 .exit = ip_rt_do_proc_exit,
423};
424
/* Boot-time registration of the per-netns /proc route entries. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
429
107f1634 430#else
73b38711 431static inline int ip_rt_proc_init(void)
107f1634
PE
432{
433 return 0;
434}
1da177e4 435#endif /* CONFIG_PROC_FS */
e905a9ed 436
4331debc 437static inline bool rt_is_expired(const struct rtable *rth)
e84f84f2 438{
d8d1f30b 439 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
e84f84f2
DL
440}
441
4ccfe6d4 442void rt_cache_flush(struct net *net)
1da177e4 443{
b42664f8 444 rt_genid_bump(net);
98376387
ED
445}
446
f894cbf8
DM
447static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
448 struct sk_buff *skb,
449 const void *daddr)
3769cffb 450{
d3aaeb38
DM
451 struct net_device *dev = dst->dev;
452 const __be32 *pkey = daddr;
39232973 453 const struct rtable *rt;
3769cffb
DM
454 struct neighbour *n;
455
39232973 456 rt = (const struct rtable *) dst;
a263b309 457 if (rt->rt_gateway)
39232973 458 pkey = (const __be32 *) &rt->rt_gateway;
f894cbf8
DM
459 else if (skb)
460 pkey = &ip_hdr(skb)->daddr;
d3aaeb38 461
80703d26 462 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
d3aaeb38
DM
463 if (n)
464 return n;
32092ecf 465 return neigh_create(&arp_tbl, pkey, dev);
d3aaeb38
DM
466}
467
ff1f69a8
ED
468atomic_t *ip_idents __read_mostly;
469EXPORT_SYMBOL(ip_idents);
1da177e4 470
ff1f69a8 471void __ip_select_ident(struct iphdr *iph, int segs)
1da177e4 472{
ff1f69a8
ED
473 static u32 ip_idents_hashrnd __read_mostly;
474 static bool hashrnd_initialized = false;
475 u32 hash, id;
1da177e4 476
ff1f69a8
ED
477 if (unlikely(!hashrnd_initialized)) {
478 hashrnd_initialized = true;
479 get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
1d861aa4 480 }
1da177e4 481
ff1f69a8
ED
482 hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd);
483 id = ip_idents_reserve(hash, segs);
484 iph->id = htons(id);
1da177e4 485}
4bc2f18b 486EXPORT_SYMBOL(__ip_select_ident);
1da177e4 487
5abf7f7e 488static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
4895c771
DM
489 const struct iphdr *iph,
490 int oif, u8 tos,
491 u8 prot, u32 mark, int flow_flags)
492{
493 if (sk) {
494 const struct inet_sock *inet = inet_sk(sk);
495
496 oif = sk->sk_bound_dev_if;
497 mark = sk->sk_mark;
498 tos = RT_CONN_FLAGS(sk);
499 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
500 }
501 flowi4_init_output(fl4, oif, mark, tos,
502 RT_SCOPE_UNIVERSE, prot,
503 flow_flags,
504 iph->daddr, iph->saddr, 0, 0);
505}
506
5abf7f7e
ED
507static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
508 const struct sock *sk)
4895c771
DM
509{
510 const struct iphdr *iph = ip_hdr(skb);
511 int oif = skb->dev->ifindex;
512 u8 tos = RT_TOS(iph->tos);
513 u8 prot = iph->protocol;
514 u32 mark = skb->mark;
515
516 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
517}
518
5abf7f7e 519static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
4895c771
DM
520{
521 const struct inet_sock *inet = inet_sk(sk);
5abf7f7e 522 const struct ip_options_rcu *inet_opt;
4895c771
DM
523 __be32 daddr = inet->inet_daddr;
524
525 rcu_read_lock();
526 inet_opt = rcu_dereference(inet->inet_opt);
527 if (inet_opt && inet_opt->opt.srr)
528 daddr = inet_opt->opt.faddr;
529 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
530 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
531 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
532 inet_sk_flowi_flags(sk),
533 daddr, inet->inet_saddr, 0, 0);
534 rcu_read_unlock();
535}
536
5abf7f7e
ED
/* Build a flow key from the skb when one is available, otherwise fall
 * back to the socket's connection state.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb)
		build_sk_flow_key(fl4, sk);
	else
		build_skb_flow_key(fl4, skb, sk);
}
545
c5038a83
DM
/* Free a route after an RCU grace period so readers that found it
 * under rcu_read_lock() remain safe.
 */
static inline void rt_free(struct rtable *rt)
{
	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}
550
551static DEFINE_SPINLOCK(fnhe_lock);
4895c771 552
aee06da6 553static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
4895c771
DM
554{
555 struct fib_nh_exception *fnhe, *oldest;
c5038a83 556 struct rtable *orig;
4895c771
DM
557
558 oldest = rcu_dereference(hash->chain);
559 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
560 fnhe = rcu_dereference(fnhe->fnhe_next)) {
561 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
562 oldest = fnhe;
563 }
c5038a83
DM
564 orig = rcu_dereference(oldest->fnhe_rth);
565 if (orig) {
566 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
567 rt_free(orig);
568 }
4895c771
DM
569 return oldest;
570}
571
d3a25c98
DM
572static inline u32 fnhe_hashfun(__be32 daddr)
573{
574 u32 hval;
575
576 hval = (__force u32) daddr;
577 hval ^= (hval >> 11) ^ (hval >> 22);
578
579 return hval & (FNHE_HASH_SIZE - 1);
580}
581
aee06da6
JA
/* Record (or refresh) a per-nexthop exception learned from ICMP:
 * a redirect gateway (@gw) and/or a path MTU (@pmtu with @expires).
 *
 * All mutation happens under fnhe_lock; readers walk the chains under
 * RCU, so new entries are published with rcu_assign_pointer().  The
 * exception table itself is allocated lazily on first use.
 */
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = nh->nh_exceptions;
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		nh->nh_exceptions = hash;
	}

	hash += hval;

	/* Look for an existing entry for this destination, counting
	 * chain depth as we go.
	 */
	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		/* Update in place; only overwrite fields the caller
		 * actually supplied (gw == 0 / pmtu == 0 mean "no change").
		 */
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = expires;
		}
	} else {
		/* Recycle the oldest entry when the chain is too deep,
		 * otherwise allocate and publish a fresh one.
		 */
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
	return;
}
640
ceb33206
DM
/* Process an ICMP redirect for @rt.
 *
 * Validates the advertised gateway (sane redirect code, packet really
 * came from our current gateway, new gateway is a plausible unicast
 * on-link address), then records it as a per-nexthop exception and,
 * if @kill_route, marks the dst obsolete so callers re-look it up.
 * Bogus redirects are dropped and optionally logged.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	/* Only the four genuine redirect codes are acted upon. */
	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* A redirect is only believable from the gateway we use now. */
	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		/* Non-shared media: new gateway must be directly on-link,
		 * and secure-redirect policy may require it to be a known
		 * default gateway.
		 */
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (n) {
		if (!(n->nud_state & NUD_VALID)) {
			/* Kick off resolution; act on the redirect later. */
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, 0);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     " Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
720
4895c771
DM
721static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
722{
723 struct rtable *rt;
724 struct flowi4 fl4;
f96ef988
MK
725 const struct iphdr *iph = (const struct iphdr *) skb->data;
726 int oif = skb->dev->ifindex;
727 u8 tos = RT_TOS(iph->tos);
728 u8 prot = iph->protocol;
729 u32 mark = skb->mark;
4895c771
DM
730
731 rt = (struct rtable *) dst;
732
f96ef988 733 __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
ceb33206 734 __ip_do_redirect(rt, skb, &fl4, true);
4895c771
DM
735}
736
1da177e4
LT
737static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
738{
ee6b9673 739 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
740 struct dst_entry *ret = dst;
741
742 if (rt) {
d11a4dc1 743 if (dst->obsolete > 0) {
1da177e4
LT
744 ip_rt_put(rt);
745 ret = NULL;
5943634f
DM
746 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
747 rt->dst.expires) {
89aef892 748 ip_rt_put(rt);
1da177e4
LT
749 ret = NULL;
750 }
751 }
752 return ret;
753}
754
755/*
756 * Algorithm:
757 * 1. The first ip_rt_redirect_number redirects are sent
758 * with exponential backoff, then we stop sending them at all,
759 * assuming that the host ignores our redirects.
760 * 2. If we did not see packets requiring redirects
761 * during ip_rt_redirect_silence, we assume that the host
762 * forgot redirected route and start to send redirects again.
763 *
764 * This algorithm is much cheaper and more intelligent than dumb load limiting
765 * in icmp.c.
766 *
767 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
768 * and "frag. need" (breaks PMTU discovery) in icmp.c.
769 */
770
/* Send an ICMP redirect for a packet we are about to forward back out
 * the interface it arrived on.
 *
 * Rate limiting uses the inetpeer entry for the packet's source: an
 * exponential backoff over ip_rt_redirect_number redirects, reset once
 * ip_rt_redirect_silence elapses without triggering packets.  With no
 * peer entry available the redirect is sent unthrottled.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		/* No peer state to throttle with: send unconditionally. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
833
834static int ip_error(struct sk_buff *skb)
835{
251da413 836 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
511c3f92 837 struct rtable *rt = skb_rtable(skb);
92d86829 838 struct inet_peer *peer;
1da177e4 839 unsigned long now;
251da413 840 struct net *net;
92d86829 841 bool send;
1da177e4
LT
842 int code;
843
251da413
DM
844 net = dev_net(rt->dst.dev);
845 if (!IN_DEV_FORWARD(in_dev)) {
846 switch (rt->dst.error) {
847 case EHOSTUNREACH:
848 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
849 break;
850
851 case ENETUNREACH:
852 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
853 break;
854 }
855 goto out;
856 }
857
d8d1f30b 858 switch (rt->dst.error) {
4500ebf8
JP
859 case EINVAL:
860 default:
861 goto out;
862 case EHOSTUNREACH:
863 code = ICMP_HOST_UNREACH;
864 break;
865 case ENETUNREACH:
866 code = ICMP_NET_UNREACH;
251da413 867 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
4500ebf8
JP
868 break;
869 case EACCES:
870 code = ICMP_PKT_FILTERED;
871 break;
1da177e4
LT
872 }
873
1d861aa4 874 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
92d86829
DM
875
876 send = true;
877 if (peer) {
878 now = jiffies;
879 peer->rate_tokens += now - peer->rate_last;
880 if (peer->rate_tokens > ip_rt_error_burst)
881 peer->rate_tokens = ip_rt_error_burst;
882 peer->rate_last = now;
883 if (peer->rate_tokens >= ip_rt_error_cost)
884 peer->rate_tokens -= ip_rt_error_cost;
885 else
886 send = false;
1d861aa4 887 inet_putpeer(peer);
1da177e4 888 }
92d86829
DM
889 if (send)
890 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1da177e4
LT
891
892out: kfree_skb(skb);
893 return 0;
e905a9ed 894}
1da177e4 895
d851c12b 896static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1da177e4 897{
d851c12b 898 struct dst_entry *dst = &rt->dst;
4895c771 899 struct fib_result res;
2c8cec5c 900
fa1e492a
SK
901 if (dst_metric_locked(dst, RTAX_MTU))
902 return;
903
7f92d334
SK
904 if (dst->dev->mtu < mtu)
905 return;
906
5943634f
DM
907 if (mtu < ip_rt_min_pmtu)
908 mtu = ip_rt_min_pmtu;
2c8cec5c 909
d851c12b
SK
910 if (!rt->rt_pmtu) {
911 dst->obsolete = DST_OBSOLETE_KILL;
912 } else {
913 rt->rt_pmtu = mtu;
914 dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
915 }
916
c5ae7d41 917 rcu_read_lock();
d851c12b 918 if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
4895c771 919 struct fib_nh *nh = &FIB_RES_NH(res);
4895c771 920
aee06da6
JA
921 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
922 jiffies + ip_rt_mtu_expires);
4895c771 923 }
c5ae7d41 924 rcu_read_unlock();
1da177e4
LT
925}
926
4895c771
DM
927static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
928 struct sk_buff *skb, u32 mtu)
929{
930 struct rtable *rt = (struct rtable *) dst;
931 struct flowi4 fl4;
932
933 ip_rt_build_flow_key(&fl4, sk, skb);
d851c12b 934 __ip_rt_update_pmtu(rt, &fl4, mtu);
4895c771
DM
935}
936
36393395
DM
937void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
938 int oif, u32 mark, u8 protocol, int flow_flags)
939{
4895c771 940 const struct iphdr *iph = (const struct iphdr *) skb->data;
36393395
DM
941 struct flowi4 fl4;
942 struct rtable *rt;
943
4895c771
DM
944 __build_flow_key(&fl4, NULL, iph, oif,
945 RT_TOS(iph->tos), protocol, mark, flow_flags);
36393395
DM
946 rt = __ip_route_output_key(net, &fl4);
947 if (!IS_ERR(rt)) {
4895c771 948 __ip_rt_update_pmtu(rt, &fl4, mtu);
36393395
DM
949 ip_rt_put(rt);
950 }
951}
952EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
953
9cb3a50c 954static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
36393395 955{
4895c771
DM
956 const struct iphdr *iph = (const struct iphdr *) skb->data;
957 struct flowi4 fl4;
958 struct rtable *rt;
36393395 959
4895c771
DM
960 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
961 rt = __ip_route_output_key(sock_net(sk), &fl4);
962 if (!IS_ERR(rt)) {
963 __ip_rt_update_pmtu(rt, &fl4, mtu);
964 ip_rt_put(rt);
965 }
36393395 966}
9cb3a50c
SK
967
/* Update the PMTU for a socket's cached route.
 *
 * If the socket is owned by user context (or has no cached dst), fall
 * back to the lookup-based slow path.  Otherwise operate on the cached
 * dst directly, re-resolving it when it has gone stale — both before
 * the update and again afterwards, since __ip_rt_update_pmtu() may
 * kill the route.  Any freshly-resolved route replaces the socket's
 * cached dst.  The reference from sk_dst_get() is dropped on exit.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;		/* true once rt is a fresh lookup, not odst */

	bh_lock_sock(sk);
	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
		/* Cached dst is stale: resolve a fresh route. */
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	/* Update the real (xfrm-innermost) route via dst.path. */
	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		/* The update invalidated the route; resolve again. */
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1016
b42597e2
DM
1017void ipv4_redirect(struct sk_buff *skb, struct net *net,
1018 int oif, u32 mark, u8 protocol, int flow_flags)
1019{
4895c771 1020 const struct iphdr *iph = (const struct iphdr *) skb->data;
b42597e2
DM
1021 struct flowi4 fl4;
1022 struct rtable *rt;
1023
4895c771
DM
1024 __build_flow_key(&fl4, NULL, iph, oif,
1025 RT_TOS(iph->tos), protocol, mark, flow_flags);
b42597e2
DM
1026 rt = __ip_route_output_key(net, &fl4);
1027 if (!IS_ERR(rt)) {
ceb33206 1028 __ip_do_redirect(rt, skb, &fl4, false);
b42597e2
DM
1029 ip_rt_put(rt);
1030 }
1031}
1032EXPORT_SYMBOL_GPL(ipv4_redirect);
1033
1034void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1035{
4895c771
DM
1036 const struct iphdr *iph = (const struct iphdr *) skb->data;
1037 struct flowi4 fl4;
1038 struct rtable *rt;
b42597e2 1039
4895c771
DM
1040 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1041 rt = __ip_route_output_key(sock_net(sk), &fl4);
1042 if (!IS_ERR(rt)) {
ceb33206 1043 __ip_do_redirect(rt, skb, &fl4, false);
4895c771
DM
1044 ip_rt_put(rt);
1045 }
b42597e2
DM
1046}
1047EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1048
efbc368d
DM
1049static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1050{
1051 struct rtable *rt = (struct rtable *) dst;
1052
ceb33206
DM
1053 /* All IPV4 dsts are created with ->obsolete set to the value
1054 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1055 * into this function always.
1056 *
1057 * When a PMTU/redirect information update invalidates a
1058 * route, this is indicated by setting obsolete to
1059 * DST_OBSOLETE_KILL.
1060 */
1061 if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
efbc368d 1062 return NULL;
d11a4dc1 1063 return dst;
1da177e4
LT
1064}
1065
1da177e4
LT
1066static void ipv4_link_failure(struct sk_buff *skb)
1067{
1068 struct rtable *rt;
1069
1070 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1071
511c3f92 1072 rt = skb_rtable(skb);
5943634f
DM
1073 if (rt)
1074 dst_set_expires(&rt->dst, 0);
1da177e4
LT
1075}
1076
1077static int ip_rt_bug(struct sk_buff *skb)
1078{
91df42be
JP
1079 pr_debug("%s: %pI4 -> %pI4, %s\n",
1080 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1081 skb->dev ? skb->dev->name : "?");
1da177e4 1082 kfree_skb(skb);
c378a9c0 1083 WARN_ON(1);
1da177e4
LT
1084 return 0;
1085}
1086
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */
1095
/* Fill @addr (4 bytes, possibly unaligned — it points into IP options)
 * with the source address we would use for this route.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		/* Output route: the source was already chosen by us. */
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* Input route: ask the FIB which source address we would
		 * prefer when replying toward iph->saddr.
		 */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			/* No route back: fall back to any suitable address
			 * configured on the output device.
			 */
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	/* memcpy because @addr may be unaligned inside IP options. */
	memcpy(addr, &src, 4);
}
1128
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Merge a routing-classifier tag into the dst's tclassid: the low and
 * high 16-bit halves are each taken from @tag only when not already
 * set, so a realm assigned earlier takes precedence.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1138
0dbaee3b
DM
1139static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1140{
1141 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1142
1143 if (advmss == 0) {
1144 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1145 ip_rt_min_advmss);
1146 if (advmss > 65535 - 40)
1147 advmss = 65535 - 40;
1148 }
1149 return advmss;
1150}
1151
ebb762f2 1152static unsigned int ipv4_mtu(const struct dst_entry *dst)
d33e4553 1153{
261663b0 1154 const struct rtable *rt = (const struct rtable *) dst;
5943634f
DM
1155 unsigned int mtu = rt->rt_pmtu;
1156
98d75c37 1157 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
5943634f 1158 mtu = dst_metric_raw(dst, RTAX_MTU);
618f9bc7 1159
38d523e2 1160 if (mtu)
618f9bc7
SK
1161 return mtu;
1162
1163 mtu = dst->dev->mtu;
d33e4553
DM
1164
1165 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
155e8336 1166 if (rt->rt_uses_gateway && mtu > 576)
d33e4553
DM
1167 mtu = 576;
1168 }
1169
1170 if (mtu > IP_MAX_MTU)
1171 mtu = IP_MAX_MTU;
1172
1173 return mtu;
1174}
1175
f2bb4bed 1176static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
4895c771
DM
1177{
1178 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1179 struct fib_nh_exception *fnhe;
1180 u32 hval;
1181
f2bb4bed
DM
1182 if (!hash)
1183 return NULL;
1184
d3a25c98 1185 hval = fnhe_hashfun(daddr);
4895c771
DM
1186
1187 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1188 fnhe = rcu_dereference(fnhe->fnhe_next)) {
f2bb4bed
DM
1189 if (fnhe->fnhe_daddr == daddr)
1190 return fnhe;
1191 }
1192 return NULL;
1193}
aee06da6 1194
/* Bind @rt into nexthop exception @fnhe (under fnhe_lock) and apply the
 * exception's learned PMTU/redirect data to it.  Returns true when the
 * route was stored; false when @daddr no longer matches (the exception
 * slot was recycled), in which case the caller keeps @rt uncached.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
		/* A stale cached route invalidates the learned data too. */
		if (orig && rt_is_expired(orig)) {
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
		}
		if (fnhe->fnhe_pmtu) {
			unsigned long expires = fnhe->fnhe_expires;
			unsigned long diff = expires - jiffies;

			/* Only honour a PMTU that has not yet expired. */
			if (time_before(jiffies, expires)) {
				rt->rt_pmtu = fnhe->fnhe_pmtu;
				dst_set_expires(&rt->dst, diff);
			}
		}
		if (fnhe->fnhe_gw) {
			/* A learned redirect overrides the FIB gateway. */
			rt->rt_flags |= RTCF_REDIRECTED;
			rt->rt_gateway = fnhe->fnhe_gw;
			rt->rt_uses_gateway = 1;
		} else if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		rcu_assign_pointer(fnhe->fnhe_rth, rt);
		if (orig)
			rt_free(orig);	/* RCU-deferred free of the old route */

		fnhe->fnhe_stamp = jiffies;
		ret = true;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
1236
/* Try to park @rt in the nexthop's cache slot: the single nh_rth_input
 * slot for input routes, or the per-CPU output slot otherwise.  The
 * slot is claimed with cmpxchg so a concurrent writer loses cleanly;
 * on success the previous occupant is freed.  Returns false when we
 * lost the race, in which case the caller keeps @rt uncached.
 */
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig)
			rt_free(orig);	/* RCU-deferred free of the old route */
	} else
		ret = false;

	return ret;
}
1258
/* Global list of uncached (DST_NOCACHE) routes.  rt_flush_dev() walks
 * it on device unregister to retarget their device references, and
 * ipv4_dst_destroy() unlinks entries when the route dies.
 */
static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

/* Track an uncached route so the two walkers above can find it. */
static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}
1268
1269static void ipv4_dst_destroy(struct dst_entry *dst)
1270{
1271 struct rtable *rt = (struct rtable *) dst;
1272
78df76a0 1273 if (!list_empty(&rt->rt_uncached)) {
caacf05e
DM
1274 spin_lock_bh(&rt_uncached_lock);
1275 list_del(&rt->rt_uncached);
1276 spin_unlock_bh(&rt_uncached_lock);
1277 }
1278}
1279
1280void rt_flush_dev(struct net_device *dev)
1281{
1282 if (!list_empty(&rt_uncached_list)) {
1283 struct net *net = dev_net(dev);
1284 struct rtable *rt;
1285
1286 spin_lock_bh(&rt_uncached_lock);
1287 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1288 if (rt->dst.dev != dev)
1289 continue;
1290 rt->dst.dev = net->loopback_dev;
1291 dev_hold(rt->dst.dev);
1292 dev_put(dev);
1293 }
1294 spin_unlock_bh(&rt_uncached_lock);
4895c771
DM
1295 }
1296}
1297
4331debc 1298static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1299{
4331debc
ED
1300 return rt &&
1301 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1302 !rt_is_expired(rt);
d2d68ba9
DM
1303}
1304
/* Finish constructing @rt from a FIB lookup result: copy gateway,
 * metrics and classid from the nexthop, then try to cache the route in
 * the nexthop exception (@fnhe) or in the nexthop itself.  A route that
 * could not be cached is flagged DST_NOCACHE and put on the uncached
 * list so device unregister can still find it.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Only link-scope nexthops are real gateways. */
		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			rt->dst.flags |= DST_NOCACHE;
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1348
5c1e6aa3 1349static struct rtable *rt_dst_alloc(struct net_device *dev,
f2bb4bed 1350 bool nopolicy, bool noxfrm, bool will_cache)
0c4dcd58 1351{
f5b0a874 1352 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
c6cffba4 1353 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
5c1e6aa3
DM
1354 (nopolicy ? DST_NOPOLICY : 0) |
1355 (noxfrm ? DST_NOXFRM : 0));
0c4dcd58
DM
1356}
1357
/* called in rcu_read_lock() section */
/* Build an input route for a multicast destination.  @our is non-zero
 * when the local host is a member of the group, i.e. the packet must
 * (also) be delivered locally.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* A zero source is acceptable only for link-local-scope
		 * multicast groups.
		 */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* An input multicast route must never be used for output. */
	rth->dst.output = ip_rt_bug;

	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_is_input= 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
1429
1430
1431static void ip_handle_martian_source(struct net_device *dev,
1432 struct in_device *in_dev,
1433 struct sk_buff *skb,
9e12bb22
AV
1434 __be32 daddr,
1435 __be32 saddr)
1da177e4
LT
1436{
1437 RT_CACHE_STAT_INC(in_martian_src);
1438#ifdef CONFIG_IP_ROUTE_VERBOSE
1439 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1440 /*
1441 * RFC1812 recommendation, if source is martian,
1442 * the only hint is MAC header.
1443 */
058bd4d2 1444 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
673d57e7 1445 &daddr, &saddr, dev->name);
98e399f8 1446 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
058bd4d2
JP
1447 print_hex_dump(KERN_WARNING, "ll header: ",
1448 DUMP_PREFIX_OFFSET, 16, 1,
1449 skb_mac_header(skb),
1450 dev->hard_header_len, true);
1da177e4
LT
1451 }
1452 }
1453#endif
1454}
1455
/* called in rcu_read_lock() section */
/* Build (or reuse from the nexthop cache) the forwarding route for a
 * received unicast packet that matched a RTN_UNICAST FIB result.
 * Attaches the dst to @skb on success; returns 0 or a negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	/* err > 0 here means fib_validate_source() saw a source shared
	 * with us; hairpin traffic on shared media earns a redirect and
	 * must not be cached.
	 */
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
		flags |= RTCF_DOREDIRECT;
		do_cache = false;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	if (do_cache) {
		/* Fast path: reuse the route already cached on the nexthop. */
		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif 	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1da177e4 1545
5969f71d
SH
/* Input-path route construction: select among multipath nexthops when
 * configured, then build/attach the forwarding route.
 * NOTE(review): @fl4 is currently unused here; it appears to be kept
 * in the signature for the caller's convenience — confirm before
 * removing.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
1560
1da177e4
LT
/*
 *	NOTE. We drop all packets that have a local source address,
 *	because every properly looped-back packet must have the correct
 *	destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *	called with rcu_read_lock()
 */
1571
/* Slow-path input route resolution: validate the addresses, consult
 * the FIB, and build a local/broadcast/forwarding route for @skb.
 * Runs under rcu_read_lock() (see the comment above).  Returns 0 with
 * a dst attached to @skb, or a negative errno.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4 fl4;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	int err = -EINVAL;
	struct net *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and call it once if daddr or/and saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  LOOPBACK_IFINDEX,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	/* Unicast forwarding: hand off to the mkroute path. */
	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Local delivery (also reached from no_route with
	 * res.type == RTN_UNREACHABLE).  Reuse the cached input route on
	 * the nexthop when possible.
	 */
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	/* A local input route must never be used for output. */
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	RT_CACHE_STAT_INC(in_slow_tot);
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	if (do_cache) {
		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
			rth->dst.flags |= DST_NOCACHE;
			rt_add_uncached_list(rth);
		}
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:	/* keeps err from fib_validate_source() */
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
1751
c6cffba4
DM
/* Public entry point for input route resolution.  The resulting dst is
 * attached to @skb without taking a reference (noref), which is safe
 * because the whole lookup runs under rcu_read_lock().
 */
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	int res;

	rcu_read_lock();

	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			/* Route the packet only if we are a member of the
			 * group, or (with MROUTE) if the device forwards
			 * non-link-local multicast.
			 */
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_noref);
1da177e4 1797
/* called with rcu_read_lock() */
/* Build (or reuse from the nexthop/exception cache) an output route for
 * the flow @fl4 leaving via @dev_out, based on FIB result @res.
 * Returns the route (with a reference held) or an ERR_PTR().
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	/* A loopback source may only leave via a loopback device unless
	 * route_localnet is enabled.
	 */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Prefer the route slot on a matching nexthop exception,
		 * else the per-CPU output slot on the nexthop itself.
		 */
		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe)
			prth = &fnhe->fnhe_rth;
		else {
			/* KNOWN_NH flows (e.g. raw sockets with a fixed
			 * nexthop) on a non-gatewayed route must not share
			 * the per-CPU cache slot.
			 */
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_is_input = 0;
	rth->rt_iif	= orig_oif ? : 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

	return rth;
}
1919
1da177e4
LT
1920/*
1921 * Major route resolver routine.
1922 */
1923
89aef892 1924struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1da177e4 1925{
1da177e4 1926 struct net_device *dev_out = NULL;
f61759e6 1927 __u8 tos = RT_FL_TOS(fl4);
813b3b5d
DM
1928 unsigned int flags = 0;
1929 struct fib_result res;
5ada5527 1930 struct rtable *rth;
813b3b5d 1931 int orig_oif;
1da177e4 1932
85b91b03 1933 res.tclassid = 0;
1da177e4 1934 res.fi = NULL;
8b96d22d 1935 res.table = NULL;
1da177e4 1936
813b3b5d
DM
1937 orig_oif = fl4->flowi4_oif;
1938
1fb9489b 1939 fl4->flowi4_iif = LOOPBACK_IFINDEX;
813b3b5d
DM
1940 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1941 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1942 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 1943
010c2708 1944 rcu_read_lock();
813b3b5d 1945 if (fl4->saddr) {
b23dd4fe 1946 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
1947 if (ipv4_is_multicast(fl4->saddr) ||
1948 ipv4_is_lbcast(fl4->saddr) ||
1949 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
1950 goto out;
1951
1da177e4
LT
1952 /* I removed check for oif == dev_out->oif here.
1953 It was wrong for two reasons:
1ab35276
DL
1954 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1955 is assigned to multiple interfaces.
1da177e4
LT
1956 2. Moreover, we are allowed to send packets with saddr
1957 of another iface. --ANK
1958 */
1959
813b3b5d
DM
1960 if (fl4->flowi4_oif == 0 &&
1961 (ipv4_is_multicast(fl4->daddr) ||
1962 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 1963 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 1964 dev_out = __ip_dev_find(net, fl4->saddr, false);
a210d01a
JA
1965 if (dev_out == NULL)
1966 goto out;
1967
1da177e4
LT
1968 /* Special hack: user can direct multicasts
1969 and limited broadcast via necessary interface
1970 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1971 This hack is not just for fun, it allows
1972 vic,vat and friends to work.
1973 They bind socket to loopback, set ttl to zero
1974 and expect that it will work.
1975 From the viewpoint of routing cache they are broken,
1976 because we are not allowed to build multicast path
1977 with loopback source addr (look, routing cache
1978 cannot know, that ttl is zero, so that packet
1979 will not leave this host and route is valid).
1980 Luckily, this hack is good workaround.
1981 */
1982
813b3b5d 1983 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
1984 goto make_route;
1985 }
a210d01a 1986
813b3b5d 1987 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 1988 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 1989 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 1990 goto out;
a210d01a 1991 }
1da177e4
LT
1992 }
1993
1994
813b3b5d
DM
1995 if (fl4->flowi4_oif) {
1996 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 1997 rth = ERR_PTR(-ENODEV);
1da177e4
LT
1998 if (dev_out == NULL)
1999 goto out;
e5ed6399
HX
2000
2001 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2002 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2003 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2004 goto out;
2005 }
813b3b5d
DM
2006 if (ipv4_is_local_multicast(fl4->daddr) ||
2007 ipv4_is_lbcast(fl4->daddr)) {
2008 if (!fl4->saddr)
2009 fl4->saddr = inet_select_addr(dev_out, 0,
2010 RT_SCOPE_LINK);
1da177e4
LT
2011 goto make_route;
2012 }
b15e22da 2013 if (!fl4->saddr) {
813b3b5d
DM
2014 if (ipv4_is_multicast(fl4->daddr))
2015 fl4->saddr = inet_select_addr(dev_out, 0,
2016 fl4->flowi4_scope);
2017 else if (!fl4->daddr)
2018 fl4->saddr = inet_select_addr(dev_out, 0,
2019 RT_SCOPE_HOST);
1da177e4
LT
2020 }
2021 }
2022
813b3b5d
DM
2023 if (!fl4->daddr) {
2024 fl4->daddr = fl4->saddr;
2025 if (!fl4->daddr)
2026 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2027 dev_out = net->loopback_dev;
1fb9489b 2028 fl4->flowi4_oif = LOOPBACK_IFINDEX;
1da177e4
LT
2029 res.type = RTN_LOCAL;
2030 flags |= RTCF_LOCAL;
2031 goto make_route;
2032 }
2033
813b3b5d 2034 if (fib_lookup(net, fl4, &res)) {
1da177e4 2035 res.fi = NULL;
8b96d22d 2036 res.table = NULL;
813b3b5d 2037 if (fl4->flowi4_oif) {
1da177e4
LT
2038 /* Apparently, routing tables are wrong. Assume,
2039 that the destination is on link.
2040
2041 WHY? DW.
2042 Because we are allowed to send to iface
2043 even if it has NO routes and NO assigned
2044 addresses. When oif is specified, routing
2045 tables are looked up with only one purpose:
2046 to catch if destination is gatewayed, rather than
2047 direct. Moreover, if MSG_DONTROUTE is set,
2048 we send packet, ignoring both routing tables
2049 and ifaddr state. --ANK
2050
2051
2052 We could make it even if oif is unknown,
2053 likely IPv6, but we do not.
2054 */
2055
813b3b5d
DM
2056 if (fl4->saddr == 0)
2057 fl4->saddr = inet_select_addr(dev_out, 0,
2058 RT_SCOPE_LINK);
1da177e4
LT
2059 res.type = RTN_UNICAST;
2060 goto make_route;
2061 }
b23dd4fe 2062 rth = ERR_PTR(-ENETUNREACH);
1da177e4
LT
2063 goto out;
2064 }
1da177e4
LT
2065
2066 if (res.type == RTN_LOCAL) {
813b3b5d 2067 if (!fl4->saddr) {
9fc3bbb4 2068 if (res.fi->fib_prefsrc)
813b3b5d 2069 fl4->saddr = res.fi->fib_prefsrc;
9fc3bbb4 2070 else
813b3b5d 2071 fl4->saddr = fl4->daddr;
9fc3bbb4 2072 }
b40afd0e 2073 dev_out = net->loopback_dev;
813b3b5d 2074 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2075 flags |= RTCF_LOCAL;
2076 goto make_route;
2077 }
2078
2079#ifdef CONFIG_IP_ROUTE_MULTIPATH
813b3b5d 2080 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1b7fe593 2081 fib_select_multipath(&res);
1da177e4
LT
2082 else
2083#endif
21d8c49e
DM
2084 if (!res.prefixlen &&
2085 res.table->tb_num_default > 1 &&
813b3b5d 2086 res.type == RTN_UNICAST && !fl4->flowi4_oif)
0c838ff1 2087 fib_select_default(&res);
1da177e4 2088
813b3b5d
DM
2089 if (!fl4->saddr)
2090 fl4->saddr = FIB_RES_PREFSRC(net, res);
1da177e4 2091
1da177e4 2092 dev_out = FIB_RES_DEV(res);
813b3b5d 2093 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2094
2095
2096make_route:
1a00fee4 2097 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
1da177e4 2098
010c2708
DM
2099out:
2100 rcu_read_unlock();
b23dd4fe 2101 return rth;
1da177e4 2102}
d8c97a94
ACM
2103EXPORT_SYMBOL_GPL(__ip_route_output_key);
2104
ae2688d5
JW
2105static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2106{
2107 return NULL;
2108}
2109
ebb762f2 2110static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2111{
618f9bc7
SK
2112 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2113
2114 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2115}
2116
6700c270
DM
2117static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2118 struct sk_buff *skb, u32 mtu)
14e50e57
DM
2119{
2120}
2121
/* Blackhole dst_ops ->redirect: intentionally a no-op; ICMP redirects
 * are ignored on blackhole routes.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2126
0972ddb2
HB
2127static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2128 unsigned long old)
2129{
2130 return NULL;
2131}
2132
14e50e57
DM
2133static struct dst_ops ipv4_dst_blackhole_ops = {
2134 .family = AF_INET,
09640e63 2135 .protocol = cpu_to_be16(ETH_P_IP),
ae2688d5 2136 .check = ipv4_blackhole_dst_check,
ebb762f2 2137 .mtu = ipv4_blackhole_mtu,
214f45c9 2138 .default_advmss = ipv4_default_advmss,
14e50e57 2139 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
b587ee3b 2140 .redirect = ipv4_rt_blackhole_redirect,
0972ddb2 2141 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2142 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2143};
2144
2774c131 2145struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2146{
2774c131 2147 struct rtable *ort = (struct rtable *) dst_orig;
f5b0a874 2148 struct rtable *rt;
14e50e57 2149
f5b0a874 2150 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
14e50e57 2151 if (rt) {
d8d1f30b 2152 struct dst_entry *new = &rt->dst;
14e50e57 2153
14e50e57 2154 new->__use = 1;
352e512c
HX
2155 new->input = dst_discard;
2156 new->output = dst_discard;
14e50e57 2157
d8d1f30b 2158 new->dev = ort->dst.dev;
14e50e57
DM
2159 if (new->dev)
2160 dev_hold(new->dev);
2161
9917e1e8 2162 rt->rt_is_input = ort->rt_is_input;
5e2b61f7 2163 rt->rt_iif = ort->rt_iif;
5943634f 2164 rt->rt_pmtu = ort->rt_pmtu;
14e50e57 2165
e84f84f2 2166 rt->rt_genid = rt_genid(net);
14e50e57
DM
2167 rt->rt_flags = ort->rt_flags;
2168 rt->rt_type = ort->rt_type;
14e50e57 2169 rt->rt_gateway = ort->rt_gateway;
155e8336 2170 rt->rt_uses_gateway = ort->rt_uses_gateway;
14e50e57 2171
caacf05e
DM
2172 INIT_LIST_HEAD(&rt->rt_uncached);
2173
14e50e57
DM
2174 dst_free(new);
2175 }
2176
2774c131
DM
2177 dst_release(dst_orig);
2178
2179 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2180}
2181
9d6ec938 2182struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
b23dd4fe 2183 struct sock *sk)
1da177e4 2184{
9d6ec938 2185 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2186
b23dd4fe
DM
2187 if (IS_ERR(rt))
2188 return rt;
1da177e4 2189
56157872 2190 if (flp4->flowi4_proto)
9d6ec938
DM
2191 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2192 flowi4_to_flowi(flp4),
2193 sk, 0);
1da177e4 2194
b23dd4fe 2195 return rt;
1da177e4 2196}
d8c97a94
ACM
2197EXPORT_SYMBOL_GPL(ip_route_output_flow);
2198
f1ce3062 2199static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
15e47304 2200 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
f1ce3062 2201 u32 seq, int event, int nowait, unsigned int flags)
1da177e4 2202{
511c3f92 2203 struct rtable *rt = skb_rtable(skb);
1da177e4 2204 struct rtmsg *r;
be403ea1 2205 struct nlmsghdr *nlh;
2bc8ca40 2206 unsigned long expires = 0;
f185071d 2207 u32 error;
521f5490 2208 u32 metrics[RTAX_MAX];
be403ea1 2209
15e47304 2210 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
be403ea1 2211 if (nlh == NULL)
26932566 2212 return -EMSGSIZE;
be403ea1
TG
2213
2214 r = nlmsg_data(nlh);
1da177e4
LT
2215 r->rtm_family = AF_INET;
2216 r->rtm_dst_len = 32;
2217 r->rtm_src_len = 0;
d6c0a4f6 2218 r->rtm_tos = fl4->flowi4_tos;
1da177e4 2219 r->rtm_table = RT_TABLE_MAIN;
f3756b79
DM
2220 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2221 goto nla_put_failure;
1da177e4
LT
2222 r->rtm_type = rt->rt_type;
2223 r->rtm_scope = RT_SCOPE_UNIVERSE;
2224 r->rtm_protocol = RTPROT_UNSPEC;
2225 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2226 if (rt->rt_flags & RTCF_NOTIFY)
2227 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2228
f1ce3062 2229 if (nla_put_be32(skb, RTA_DST, dst))
f3756b79 2230 goto nla_put_failure;
1a00fee4 2231 if (src) {
1da177e4 2232 r->rtm_src_len = 32;
1a00fee4 2233 if (nla_put_be32(skb, RTA_SRC, src))
f3756b79 2234 goto nla_put_failure;
1da177e4 2235 }
f3756b79
DM
2236 if (rt->dst.dev &&
2237 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2238 goto nla_put_failure;
c7066f70 2239#ifdef CONFIG_IP_ROUTE_CLASSID
f3756b79
DM
2240 if (rt->dst.tclassid &&
2241 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2242 goto nla_put_failure;
1da177e4 2243#endif
41347dcd 2244 if (!rt_is_input_route(rt) &&
d6c0a4f6
DM
2245 fl4->saddr != src) {
2246 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
f3756b79
DM
2247 goto nla_put_failure;
2248 }
155e8336 2249 if (rt->rt_uses_gateway &&
f3756b79
DM
2250 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2251 goto nla_put_failure;
be403ea1 2252
ee9a8f7a
SK
2253 expires = rt->dst.expires;
2254 if (expires) {
2255 unsigned long now = jiffies;
2256
2257 if (time_before(now, expires))
2258 expires -= now;
2259 else
2260 expires = 0;
2261 }
2262
521f5490 2263 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
ee9a8f7a 2264 if (rt->rt_pmtu && expires)
521f5490
JA
2265 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2266 if (rtnetlink_put_metrics(skb, metrics) < 0)
be403ea1
TG
2267 goto nla_put_failure;
2268
b4869889 2269 if (fl4->flowi4_mark &&
68aaed54 2270 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
f3756b79 2271 goto nla_put_failure;
963bfeee 2272
d8d1f30b 2273 error = rt->dst.error;
be403ea1 2274
c7537967 2275 if (rt_is_input_route(rt)) {
8caaf7b6
ND
2276#ifdef CONFIG_IP_MROUTE
2277 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2278 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2279 int err = ipmr_get_route(net, skb,
2280 fl4->saddr, fl4->daddr,
2281 r, nowait);
2282 if (err <= 0) {
2283 if (!nowait) {
2284 if (err == 0)
2285 return 0;
2286 goto nla_put_failure;
2287 } else {
2288 if (err == -EMSGSIZE)
2289 goto nla_put_failure;
2290 error = err;
2291 }
2292 }
2293 } else
2294#endif
da1bba1f 2295 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
8caaf7b6 2296 goto nla_put_failure;
1da177e4
LT
2297 }
2298
f185071d 2299 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
e3703b3d 2300 goto nla_put_failure;
be403ea1
TG
2301
2302 return nlmsg_end(skb, nlh);
1da177e4 2303
be403ea1 2304nla_put_failure:
26932566
PM
2305 nlmsg_cancel(skb, nlh);
2306 return -EMSGSIZE;
1da177e4
LT
2307}
2308
661d2967 2309static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
1da177e4 2310{
3b1e0a65 2311 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2312 struct rtmsg *rtm;
2313 struct nlattr *tb[RTA_MAX+1];
1da177e4 2314 struct rtable *rt = NULL;
d6c0a4f6 2315 struct flowi4 fl4;
9e12bb22
AV
2316 __be32 dst = 0;
2317 __be32 src = 0;
2318 u32 iif;
d889ce3b 2319 int err;
963bfeee 2320 int mark;
1da177e4
LT
2321 struct sk_buff *skb;
2322
d889ce3b
TG
2323 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2324 if (err < 0)
2325 goto errout;
2326
2327 rtm = nlmsg_data(nlh);
2328
1da177e4 2329 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2330 if (skb == NULL) {
2331 err = -ENOBUFS;
2332 goto errout;
2333 }
1da177e4
LT
2334
2335 /* Reserve room for dummy headers, this skb can pass
2336 through good chunk of routing engine.
2337 */
459a98ed 2338 skb_reset_mac_header(skb);
c1d2bbe1 2339 skb_reset_network_header(skb);
d2c962b8
SH
2340
2341 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2342 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
2343 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2344
17fb2c64
AV
2345 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2346 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2347 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2348 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
1da177e4 2349
d6c0a4f6
DM
2350 memset(&fl4, 0, sizeof(fl4));
2351 fl4.daddr = dst;
2352 fl4.saddr = src;
2353 fl4.flowi4_tos = rtm->rtm_tos;
2354 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2355 fl4.flowi4_mark = mark;
2356
1da177e4 2357 if (iif) {
d889ce3b
TG
2358 struct net_device *dev;
2359
1937504d 2360 dev = __dev_get_by_index(net, iif);
d889ce3b
TG
2361 if (dev == NULL) {
2362 err = -ENODEV;
2363 goto errout_free;
2364 }
2365
1da177e4
LT
2366 skb->protocol = htons(ETH_P_IP);
2367 skb->dev = dev;
963bfeee 2368 skb->mark = mark;
1da177e4
LT
2369 local_bh_disable();
2370 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2371 local_bh_enable();
d889ce3b 2372
511c3f92 2373 rt = skb_rtable(skb);
d8d1f30b
CG
2374 if (err == 0 && rt->dst.error)
2375 err = -rt->dst.error;
1da177e4 2376 } else {
9d6ec938 2377 rt = ip_route_output_key(net, &fl4);
b23dd4fe
DM
2378
2379 err = 0;
2380 if (IS_ERR(rt))
2381 err = PTR_ERR(rt);
1da177e4 2382 }
d889ce3b 2383
1da177e4 2384 if (err)
d889ce3b 2385 goto errout_free;
1da177e4 2386
d8d1f30b 2387 skb_dst_set(skb, &rt->dst);
1da177e4
LT
2388 if (rtm->rtm_flags & RTM_F_NOTIFY)
2389 rt->rt_flags |= RTCF_NOTIFY;
2390
f1ce3062 2391 err = rt_fill_info(net, dst, src, &fl4, skb,
15e47304 2392 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
1937504d 2393 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
2394 if (err <= 0)
2395 goto errout_free;
1da177e4 2396
15e47304 2397 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
d889ce3b 2398errout:
2942e900 2399 return err;
1da177e4 2400
d889ce3b 2401errout_free:
1da177e4 2402 kfree_skb(skb);
d889ce3b 2403 goto errout;
1da177e4
LT
2404}
2405
2406int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2407{
1da177e4
LT
2408 return skb->len;
2409}
2410
2411void ip_rt_multicast_event(struct in_device *in_dev)
2412{
4ccfe6d4 2413 rt_cache_flush(dev_net(in_dev->dev));
1da177e4
LT
2414}
2415
2416#ifdef CONFIG_SYSCTL
082c7ca4
G
2417static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
2418static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2419static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2420static int ip_rt_gc_elasticity __read_mostly = 8;
2421
81c684d1 2422static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
8d65af78 2423 void __user *buffer,
1da177e4
LT
2424 size_t *lenp, loff_t *ppos)
2425{
2426 if (write) {
4ccfe6d4 2427 rt_cache_flush((struct net *)__ctl->extra1);
1da177e4 2428 return 0;
e905a9ed 2429 }
1da177e4
LT
2430
2431 return -EINVAL;
2432}
2433
eeb61f71 2434static ctl_table ipv4_route_table[] = {
1da177e4 2435 {
1da177e4
LT
2436 .procname = "gc_thresh",
2437 .data = &ipv4_dst_ops.gc_thresh,
2438 .maxlen = sizeof(int),
2439 .mode = 0644,
6d9f239a 2440 .proc_handler = proc_dointvec,
1da177e4
LT
2441 },
2442 {
1da177e4
LT
2443 .procname = "max_size",
2444 .data = &ip_rt_max_size,
2445 .maxlen = sizeof(int),
2446 .mode = 0644,
6d9f239a 2447 .proc_handler = proc_dointvec,
1da177e4
LT
2448 },
2449 {
2450 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 2451
1da177e4
LT
2452 .procname = "gc_min_interval",
2453 .data = &ip_rt_gc_min_interval,
2454 .maxlen = sizeof(int),
2455 .mode = 0644,
6d9f239a 2456 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2457 },
2458 {
1da177e4
LT
2459 .procname = "gc_min_interval_ms",
2460 .data = &ip_rt_gc_min_interval,
2461 .maxlen = sizeof(int),
2462 .mode = 0644,
6d9f239a 2463 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
2464 },
2465 {
1da177e4
LT
2466 .procname = "gc_timeout",
2467 .data = &ip_rt_gc_timeout,
2468 .maxlen = sizeof(int),
2469 .mode = 0644,
6d9f239a 2470 .proc_handler = proc_dointvec_jiffies,
1da177e4 2471 },
9f28a2fc
ED
2472 {
2473 .procname = "gc_interval",
2474 .data = &ip_rt_gc_interval,
2475 .maxlen = sizeof(int),
2476 .mode = 0644,
2477 .proc_handler = proc_dointvec_jiffies,
2478 },
1da177e4 2479 {
1da177e4
LT
2480 .procname = "redirect_load",
2481 .data = &ip_rt_redirect_load,
2482 .maxlen = sizeof(int),
2483 .mode = 0644,
6d9f239a 2484 .proc_handler = proc_dointvec,
1da177e4
LT
2485 },
2486 {
1da177e4
LT
2487 .procname = "redirect_number",
2488 .data = &ip_rt_redirect_number,
2489 .maxlen = sizeof(int),
2490 .mode = 0644,
6d9f239a 2491 .proc_handler = proc_dointvec,
1da177e4
LT
2492 },
2493 {
1da177e4
LT
2494 .procname = "redirect_silence",
2495 .data = &ip_rt_redirect_silence,
2496 .maxlen = sizeof(int),
2497 .mode = 0644,
6d9f239a 2498 .proc_handler = proc_dointvec,
1da177e4
LT
2499 },
2500 {
1da177e4
LT
2501 .procname = "error_cost",
2502 .data = &ip_rt_error_cost,
2503 .maxlen = sizeof(int),
2504 .mode = 0644,
6d9f239a 2505 .proc_handler = proc_dointvec,
1da177e4
LT
2506 },
2507 {
1da177e4
LT
2508 .procname = "error_burst",
2509 .data = &ip_rt_error_burst,
2510 .maxlen = sizeof(int),
2511 .mode = 0644,
6d9f239a 2512 .proc_handler = proc_dointvec,
1da177e4
LT
2513 },
2514 {
1da177e4
LT
2515 .procname = "gc_elasticity",
2516 .data = &ip_rt_gc_elasticity,
2517 .maxlen = sizeof(int),
2518 .mode = 0644,
6d9f239a 2519 .proc_handler = proc_dointvec,
1da177e4
LT
2520 },
2521 {
1da177e4
LT
2522 .procname = "mtu_expires",
2523 .data = &ip_rt_mtu_expires,
2524 .maxlen = sizeof(int),
2525 .mode = 0644,
6d9f239a 2526 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2527 },
2528 {
1da177e4
LT
2529 .procname = "min_pmtu",
2530 .data = &ip_rt_min_pmtu,
2531 .maxlen = sizeof(int),
2532 .mode = 0644,
6d9f239a 2533 .proc_handler = proc_dointvec,
1da177e4
LT
2534 },
2535 {
1da177e4
LT
2536 .procname = "min_adv_mss",
2537 .data = &ip_rt_min_advmss,
2538 .maxlen = sizeof(int),
2539 .mode = 0644,
6d9f239a 2540 .proc_handler = proc_dointvec,
1da177e4 2541 },
f8572d8f 2542 { }
1da177e4 2543};
39a23e75 2544
39a23e75
DL
2545static struct ctl_table ipv4_route_flush_table[] = {
2546 {
39a23e75
DL
2547 .procname = "flush",
2548 .maxlen = sizeof(int),
2549 .mode = 0200,
6d9f239a 2550 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 2551 },
f8572d8f 2552 { },
39a23e75
DL
2553};
2554
2555static __net_init int sysctl_route_net_init(struct net *net)
2556{
2557 struct ctl_table *tbl;
2558
2559 tbl = ipv4_route_flush_table;
09ad9bc7 2560 if (!net_eq(net, &init_net)) {
39a23e75
DL
2561 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2562 if (tbl == NULL)
2563 goto err_dup;
464dc801
EB
2564
2565 /* Don't export sysctls to unprivileged users */
2566 if (net->user_ns != &init_user_ns)
2567 tbl[0].procname = NULL;
39a23e75
DL
2568 }
2569 tbl[0].extra1 = net;
2570
ec8f23ce 2571 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
39a23e75
DL
2572 if (net->ipv4.route_hdr == NULL)
2573 goto err_reg;
2574 return 0;
2575
2576err_reg:
2577 if (tbl != ipv4_route_flush_table)
2578 kfree(tbl);
2579err_dup:
2580 return -ENOMEM;
2581}
2582
2583static __net_exit void sysctl_route_net_exit(struct net *net)
2584{
2585 struct ctl_table *tbl;
2586
2587 tbl = net->ipv4.route_hdr->ctl_table_arg;
2588 unregister_net_sysctl_table(net->ipv4.route_hdr);
2589 BUG_ON(tbl == ipv4_route_flush_table);
2590 kfree(tbl);
2591}
2592
2593static __net_initdata struct pernet_operations sysctl_route_ops = {
2594 .init = sysctl_route_net_init,
2595 .exit = sysctl_route_net_exit,
2596};
1da177e4
LT
2597#endif
2598
3ee94372 2599static __net_init int rt_genid_init(struct net *net)
9f5e97e5 2600{
b42664f8 2601 atomic_set(&net->rt_genid, 0);
436c3b66
DM
2602 get_random_bytes(&net->ipv4.dev_addr_genid,
2603 sizeof(net->ipv4.dev_addr_genid));
9f5e97e5
DL
2604 return 0;
2605}
2606
3ee94372
NH
2607static __net_initdata struct pernet_operations rt_genid_ops = {
2608 .init = rt_genid_init,
9f5e97e5
DL
2609};
2610
c3426b47
DM
2611static int __net_init ipv4_inetpeer_init(struct net *net)
2612{
2613 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2614
2615 if (!bp)
2616 return -ENOMEM;
2617 inet_peer_base_init(bp);
2618 net->ipv4.peers = bp;
2619 return 0;
2620}
2621
2622static void __net_exit ipv4_inetpeer_exit(struct net *net)
2623{
2624 struct inet_peer_base *bp = net->ipv4.peers;
2625
2626 net->ipv4.peers = NULL;
56a6b248 2627 inetpeer_invalidate_tree(bp);
c3426b47
DM
2628 kfree(bp);
2629}
2630
2631static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2632 .init = ipv4_inetpeer_init,
2633 .exit = ipv4_inetpeer_exit,
2634};
9f5e97e5 2635
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU route-classification accounting; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 2639
1da177e4
LT
2640int __init ip_rt_init(void)
2641{
424c4b70 2642 int rc = 0;
1da177e4 2643
ff1f69a8
ED
2644 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2645 if (!ip_idents)
2646 panic("IP: failed to allocate ip_idents\n");
2647
2648 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2649
c7066f70 2650#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 2651 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
2652 if (!ip_rt_acct)
2653 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
2654#endif
2655
e5d679f3
AD
2656 ipv4_dst_ops.kmem_cachep =
2657 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 2658 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 2659
14e50e57
DM
2660 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2661
fc66f95c
ED
2662 if (dst_entries_init(&ipv4_dst_ops) < 0)
2663 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2664
2665 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2666 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2667
89aef892
DM
2668 ipv4_dst_ops.gc_thresh = ~0;
2669 ip_rt_max_size = INT_MAX;
1da177e4 2670
1da177e4
LT
2671 devinet_init();
2672 ip_fib_init();
2673
73b38711 2674 if (ip_rt_proc_init())
058bd4d2 2675 pr_err("Unable to create route proc files\n");
1da177e4
LT
2676#ifdef CONFIG_XFRM
2677 xfrm_init();
703fb94e 2678 xfrm4_init();
1da177e4 2679#endif
c7ac8679 2680 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
63f3444f 2681
39a23e75
DL
2682#ifdef CONFIG_SYSCTL
2683 register_pernet_subsys(&sysctl_route_ops);
2684#endif
3ee94372 2685 register_pernet_subsys(&rt_genid_ops);
c3426b47 2686 register_pernet_subsys(&ipv4_inetpeer_ops);
1da177e4
LT
2687 return rc;
2688}
2689
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif