ipv4: Adjust semantics of rt->rt_gateway.
net/ipv4/route.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD,
35 * though our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115
116 #define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define IP_MAX_MTU 0xFFF0
120
121 #define RT_GC_TIMEOUT (300*HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
127 static int ip_rt_redirect_number __read_mostly = 9;
128 static int ip_rt_redirect_load __read_mostly = HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly = HZ;
131 static int ip_rt_error_burst __read_mostly = 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly = 8;
133 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly = 256;
136
137 /*
138 * Interface to generic destination cache.
139 */
140
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
143 static unsigned int ipv4_mtu(const struct dst_entry *dst);
144 static void ipv4_dst_destroy(struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void ipv4_link_failure(struct sk_buff *skb);
147 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
148 struct sk_buff *skb, u32 mtu);
149 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
150 struct sk_buff *skb);
151
152 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
153 int how)
154 {
155 }
156
157 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
158 {
159 WARN_ON(1);
160 return NULL;
161 }
162
163 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
164 struct sk_buff *skb,
165 const void *daddr);
166
167 static struct dst_ops ipv4_dst_ops = {
168 .family = AF_INET,
169 .protocol = cpu_to_be16(ETH_P_IP),
170 .check = ipv4_dst_check,
171 .default_advmss = ipv4_default_advmss,
172 .mtu = ipv4_mtu,
173 .cow_metrics = ipv4_cow_metrics,
174 .destroy = ipv4_dst_destroy,
175 .ifdown = ipv4_dst_ifdown,
176 .negative_advice = ipv4_negative_advice,
177 .link_failure = ipv4_link_failure,
178 .update_pmtu = ip_rt_update_pmtu,
179 .redirect = ip_do_redirect,
180 .local_out = __ip_local_out,
181 .neigh_lookup = ipv4_neigh_lookup,
182 };
183
184 #define ECN_OR_COST(class) TC_PRIO_##class
185
186 const __u8 ip_tos2prio[16] = {
187 TC_PRIO_BESTEFFORT,
188 ECN_OR_COST(BESTEFFORT),
189 TC_PRIO_BESTEFFORT,
190 ECN_OR_COST(BESTEFFORT),
191 TC_PRIO_BULK,
192 ECN_OR_COST(BULK),
193 TC_PRIO_BULK,
194 ECN_OR_COST(BULK),
195 TC_PRIO_INTERACTIVE,
196 ECN_OR_COST(INTERACTIVE),
197 TC_PRIO_INTERACTIVE,
198 ECN_OR_COST(INTERACTIVE),
199 TC_PRIO_INTERACTIVE_BULK,
200 ECN_OR_COST(INTERACTIVE_BULK),
201 TC_PRIO_INTERACTIVE_BULK,
202 ECN_OR_COST(INTERACTIVE_BULK)
203 };
204 EXPORT_SYMBOL(ip_tos2prio);
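/*
 * Worked example (editorial note, not part of the original file): the
 * table above is indexed by the four TOS bits shifted right by one,
 * which is what rt_tos2priority() in include/net/route.h does:
 *
 *	prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * So IPTOS_LOWDELAY (0x10) yields index 8 -> TC_PRIO_INTERACTIVE, and
 * IPTOS_THROUGHPUT (0x08) yields index 4 -> TC_PRIO_BULK.
 */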
205
206 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
207 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
208
209 static inline int rt_genid(struct net *net)
210 {
211 return atomic_read(&net->ipv4.rt_genid);
212 }
213
214 #ifdef CONFIG_PROC_FS
215 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
216 {
217 if (*pos)
218 return NULL;
219 return SEQ_START_TOKEN;
220 }
221
222 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
223 {
224 ++*pos;
225 return NULL;
226 }
227
228 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
229 {
230 }
231
232 static int rt_cache_seq_show(struct seq_file *seq, void *v)
233 {
234 if (v == SEQ_START_TOKEN)
235 seq_printf(seq, "%-127s\n",
236 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
237 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
238 "HHUptod\tSpecDst");
239 return 0;
240 }
241
242 static const struct seq_operations rt_cache_seq_ops = {
243 .start = rt_cache_seq_start,
244 .next = rt_cache_seq_next,
245 .stop = rt_cache_seq_stop,
246 .show = rt_cache_seq_show,
247 };
248
249 static int rt_cache_seq_open(struct inode *inode, struct file *file)
250 {
251 return seq_open(file, &rt_cache_seq_ops);
252 }
253
254 static const struct file_operations rt_cache_seq_fops = {
255 .owner = THIS_MODULE,
256 .open = rt_cache_seq_open,
257 .read = seq_read,
258 .llseek = seq_lseek,
259 .release = seq_release,
260 };
261
262
263 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
264 {
265 int cpu;
266
267 if (*pos == 0)
268 return SEQ_START_TOKEN;
269
270 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
271 if (!cpu_possible(cpu))
272 continue;
273 *pos = cpu+1;
274 return &per_cpu(rt_cache_stat, cpu);
275 }
276 return NULL;
277 }
278
279 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
280 {
281 int cpu;
282
283 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
284 if (!cpu_possible(cpu))
285 continue;
286 *pos = cpu+1;
287 return &per_cpu(rt_cache_stat, cpu);
288 }
289 return NULL;
290
291 }
292
293 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
294 {
295
296 }
297
298 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
299 {
300 struct rt_cache_stat *st = v;
301
302 if (v == SEQ_START_TOKEN) {
303 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
304 return 0;
305 }
306
307 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
308 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
309 dst_entries_get_slow(&ipv4_dst_ops),
310 st->in_hit,
311 st->in_slow_tot,
312 st->in_slow_mc,
313 st->in_no_route,
314 st->in_brd,
315 st->in_martian_dst,
316 st->in_martian_src,
317
318 st->out_hit,
319 st->out_slow_tot,
320 st->out_slow_mc,
321
322 st->gc_total,
323 st->gc_ignored,
324 st->gc_goal_miss,
325 st->gc_dst_overflow,
326 st->in_hlist_search,
327 st->out_hlist_search
328 );
329 return 0;
330 }
331
332 static const struct seq_operations rt_cpu_seq_ops = {
333 .start = rt_cpu_seq_start,
334 .next = rt_cpu_seq_next,
335 .stop = rt_cpu_seq_stop,
336 .show = rt_cpu_seq_show,
337 };
338
339
340 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
341 {
342 return seq_open(file, &rt_cpu_seq_ops);
343 }
344
345 static const struct file_operations rt_cpu_seq_fops = {
346 .owner = THIS_MODULE,
347 .open = rt_cpu_seq_open,
348 .read = seq_read,
349 .llseek = seq_lseek,
350 .release = seq_release,
351 };
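/*
 * Editorial note: the two seq_file drivers above back /proc/net/rt_cache
 * and /proc/net/stat/rt_cache (registered by ip_rt_do_proc_init() below).
 * The per-cpu counters can be inspected from the shell, e.g.:
 *
 *	cat /proc/net/stat/rt_cache
 *
 * which prints a header line plus one hex-formatted row per possible CPU.
 */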
352
353 #ifdef CONFIG_IP_ROUTE_CLASSID
354 static int rt_acct_proc_show(struct seq_file *m, void *v)
355 {
356 struct ip_rt_acct *dst, *src;
357 unsigned int i, j;
358
359 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
360 if (!dst)
361 return -ENOMEM;
362
363 for_each_possible_cpu(i) {
364 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
365 for (j = 0; j < 256; j++) {
366 dst[j].o_bytes += src[j].o_bytes;
367 dst[j].o_packets += src[j].o_packets;
368 dst[j].i_bytes += src[j].i_bytes;
369 dst[j].i_packets += src[j].i_packets;
370 }
371 }
372
373 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
374 kfree(dst);
375 return 0;
376 }
377
378 static int rt_acct_proc_open(struct inode *inode, struct file *file)
379 {
380 return single_open(file, rt_acct_proc_show, NULL);
381 }
382
383 static const struct file_operations rt_acct_proc_fops = {
384 .owner = THIS_MODULE,
385 .open = rt_acct_proc_open,
386 .read = seq_read,
387 .llseek = seq_lseek,
388 .release = single_release,
389 };
390 #endif
391
392 static int __net_init ip_rt_do_proc_init(struct net *net)
393 {
394 struct proc_dir_entry *pde;
395
396 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
397 &rt_cache_seq_fops);
398 if (!pde)
399 goto err1;
400
401 pde = proc_create("rt_cache", S_IRUGO,
402 net->proc_net_stat, &rt_cpu_seq_fops);
403 if (!pde)
404 goto err2;
405
406 #ifdef CONFIG_IP_ROUTE_CLASSID
407 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
408 if (!pde)
409 goto err3;
410 #endif
411 return 0;
412
413 #ifdef CONFIG_IP_ROUTE_CLASSID
414 err3:
415 remove_proc_entry("rt_cache", net->proc_net_stat);
416 #endif
417 err2:
418 remove_proc_entry("rt_cache", net->proc_net);
419 err1:
420 return -ENOMEM;
421 }
422
423 static void __net_exit ip_rt_do_proc_exit(struct net *net)
424 {
425 remove_proc_entry("rt_cache", net->proc_net_stat);
426 remove_proc_entry("rt_cache", net->proc_net);
427 #ifdef CONFIG_IP_ROUTE_CLASSID
428 remove_proc_entry("rt_acct", net->proc_net);
429 #endif
430 }
431
432 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
433 .init = ip_rt_do_proc_init,
434 .exit = ip_rt_do_proc_exit,
435 };
436
437 static int __init ip_rt_proc_init(void)
438 {
439 return register_pernet_subsys(&ip_rt_proc_ops);
440 }
441
442 #else
443 static inline int ip_rt_proc_init(void)
444 {
445 return 0;
446 }
447 #endif /* CONFIG_PROC_FS */
448
449 static inline int rt_is_expired(struct rtable *rth)
450 {
451 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
452 }
453
454 /*
455 * Perturbation of rt_genid by a small quantity [1..256]
456 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
457 * many times (2^24) without handing out a recent rt_genid.
458 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
459 */
460 static void rt_cache_invalidate(struct net *net)
461 {
462 unsigned char shuffle;
463
464 get_random_bytes(&shuffle, sizeof(shuffle));
465 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
466 }
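/*
 * Worked numbers (editorial note): each invalidation bumps rt_genid by
 * 1..256, so even in the worst case the 32-bit counter wraps only after
 * 2^32 / 2^8 = 2^24 flushes, which is what the comment above refers to.
 * Any rtable whose cached rt_genid no longer matches is then considered
 * stale by rt_is_expired() and rebuilt on the next lookup.
 */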
467
468 /*
469 * delay < 0  : invalidate cache (fast: entries will be deleted later)
470 * delay >= 0 : invalidate & flush cache (can be long). With the generation
471 *              id scheme, both cases now just bump rt_genid. */
472 void rt_cache_flush(struct net *net, int delay)
473 {
474 rt_cache_invalidate(net);
475 }
476
477 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
478 struct sk_buff *skb,
479 const void *daddr)
480 {
481 struct net_device *dev = dst->dev;
482 const __be32 *pkey = daddr;
483 const struct rtable *rt;
484 struct neighbour *n;
485
486 rt = (const struct rtable *) dst;
487 if (rt->rt_gateway)
488 pkey = (const __be32 *) &rt->rt_gateway;
489 else if (skb)
490 pkey = &ip_hdr(skb)->daddr;
491
492 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
493 if (n)
494 return n;
495 return neigh_create(&arp_tbl, pkey, dev);
496 }
497
498 /*
499 * Peer allocation may fail only in serious out-of-memory conditions. However
500 * we can still generate some output.
501 * Random ID selection looks a bit dangerous because we have no chance of
502 * selecting an ID that is unique within a reasonable period of time.
503 * But a broken packet identifier may be better than no packet at all.
504 */
505 static void ip_select_fb_ident(struct iphdr *iph)
506 {
507 static DEFINE_SPINLOCK(ip_fb_id_lock);
508 static u32 ip_fallback_id;
509 u32 salt;
510
511 spin_lock_bh(&ip_fb_id_lock);
512 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
513 iph->id = htons(salt & 0xFFFF);
514 ip_fallback_id = salt;
515 spin_unlock_bh(&ip_fb_id_lock);
516 }
517
518 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
519 {
520 struct net *net = dev_net(dst->dev);
521 struct inet_peer *peer;
522
523 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
524 if (peer) {
525 iph->id = htons(inet_getid(peer, more));
526 inet_putpeer(peer);
527 return;
528 }
529
530 ip_select_fb_ident(iph);
531 }
532 EXPORT_SYMBOL(__ip_select_ident);
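/*
 * Context sketch (editorial addition; recalled from include/net/ip.h and
 * possibly differing slightly in this tree): most senders go through the
 * ip_select_ident() wrapper, which only needs the shared ID machinery
 * when the packet may be fragmented, roughly:
 *
 *	if (iph->frag_off & htons(IP_DF))
 *		iph->id = sk ? htons(inet_sk(sk)->inet_id++) : 0;
 *	else
 *		__ip_select_ident(iph, dst, 0);
 */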
533
534 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
535 const struct iphdr *iph,
536 int oif, u8 tos,
537 u8 prot, u32 mark, int flow_flags)
538 {
539 if (sk) {
540 const struct inet_sock *inet = inet_sk(sk);
541
542 oif = sk->sk_bound_dev_if;
543 mark = sk->sk_mark;
544 tos = RT_CONN_FLAGS(sk);
545 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
546 }
547 flowi4_init_output(fl4, oif, mark, tos,
548 RT_SCOPE_UNIVERSE, prot,
549 flow_flags,
550 iph->daddr, iph->saddr, 0, 0);
551 }
552
553 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
554 const struct sock *sk)
555 {
556 const struct iphdr *iph = ip_hdr(skb);
557 int oif = skb->dev->ifindex;
558 u8 tos = RT_TOS(iph->tos);
559 u8 prot = iph->protocol;
560 u32 mark = skb->mark;
561
562 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
563 }
564
565 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
566 {
567 const struct inet_sock *inet = inet_sk(sk);
568 const struct ip_options_rcu *inet_opt;
569 __be32 daddr = inet->inet_daddr;
570
571 rcu_read_lock();
572 inet_opt = rcu_dereference(inet->inet_opt);
573 if (inet_opt && inet_opt->opt.srr)
574 daddr = inet_opt->opt.faddr;
575 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
576 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
577 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
578 inet_sk_flowi_flags(sk),
579 daddr, inet->inet_saddr, 0, 0);
580 rcu_read_unlock();
581 }
582
583 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
584 const struct sk_buff *skb)
585 {
586 if (skb)
587 build_skb_flow_key(fl4, skb, sk);
588 else
589 build_sk_flow_key(fl4, sk);
590 }
591
592 static DEFINE_SEQLOCK(fnhe_seqlock);
593
594 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
595 {
596 struct fib_nh_exception *fnhe, *oldest;
597
598 oldest = rcu_dereference(hash->chain);
599 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
600 fnhe = rcu_dereference(fnhe->fnhe_next)) {
601 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
602 oldest = fnhe;
603 }
604 return oldest;
605 }
606
607 static inline u32 fnhe_hashfun(__be32 daddr)
608 {
609 u32 hval;
610
611 hval = (__force u32) daddr;
612 hval ^= (hval >> 11) ^ (hval >> 22);
613
614 return hval & (FNHE_HASH_SIZE - 1);
615 }
616
617 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
618 u32 pmtu, unsigned long expires)
619 {
620 struct fnhe_hash_bucket *hash;
621 struct fib_nh_exception *fnhe;
622 int depth;
623 u32 hval = fnhe_hashfun(daddr);
624
625 write_seqlock_bh(&fnhe_seqlock);
626
627 hash = nh->nh_exceptions;
628 if (!hash) {
629 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
630 if (!hash)
631 goto out_unlock;
632 nh->nh_exceptions = hash;
633 }
634
635 hash += hval;
636
637 depth = 0;
638 for (fnhe = rcu_dereference(hash->chain); fnhe;
639 fnhe = rcu_dereference(fnhe->fnhe_next)) {
640 if (fnhe->fnhe_daddr == daddr)
641 break;
642 depth++;
643 }
644
645 if (fnhe) {
646 if (gw)
647 fnhe->fnhe_gw = gw;
648 if (pmtu) {
649 fnhe->fnhe_pmtu = pmtu;
650 fnhe->fnhe_expires = expires;
651 }
652 } else {
653 if (depth > FNHE_RECLAIM_DEPTH)
654 fnhe = fnhe_oldest(hash);
655 else {
656 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
657 if (!fnhe)
658 goto out_unlock;
659
660 fnhe->fnhe_next = hash->chain;
661 rcu_assign_pointer(hash->chain, fnhe);
662 }
663 fnhe->fnhe_daddr = daddr;
664 fnhe->fnhe_gw = gw;
665 fnhe->fnhe_pmtu = pmtu;
666 fnhe->fnhe_expires = expires;
667 }
668
669 fnhe->fnhe_stamp = jiffies;
670
671 out_unlock:
672 write_sequnlock_bh(&fnhe_seqlock);
673 return;
674 }
675
676 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
677 {
678 __be32 new_gw = icmp_hdr(skb)->un.gateway;
679 __be32 old_gw = ip_hdr(skb)->saddr;
680 struct net_device *dev = skb->dev;
681 struct in_device *in_dev;
682 struct fib_result res;
683 struct neighbour *n;
684 struct net *net;
685
686 switch (icmp_hdr(skb)->code & 7) {
687 case ICMP_REDIR_NET:
688 case ICMP_REDIR_NETTOS:
689 case ICMP_REDIR_HOST:
690 case ICMP_REDIR_HOSTTOS:
691 break;
692
693 default:
694 return;
695 }
696
697 if (rt->rt_gateway != old_gw)
698 return;
699
700 in_dev = __in_dev_get_rcu(dev);
701 if (!in_dev)
702 return;
703
704 net = dev_net(dev);
705 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
706 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
707 ipv4_is_zeronet(new_gw))
708 goto reject_redirect;
709
710 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
711 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
712 goto reject_redirect;
713 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
714 goto reject_redirect;
715 } else {
716 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
717 goto reject_redirect;
718 }
719
720 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
721 if (n) {
722 if (!(n->nud_state & NUD_VALID)) {
723 neigh_event_send(n, NULL);
724 } else {
725 if (fib_lookup(net, fl4, &res) == 0) {
726 struct fib_nh *nh = &FIB_RES_NH(res);
727
728 update_or_create_fnhe(nh, fl4->daddr, new_gw,
729 0, 0);
730 }
731 rt->rt_gateway = new_gw;
732 rt->rt_flags |= RTCF_REDIRECTED;
733 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
734 }
735 neigh_release(n);
736 }
737 return;
738
739 reject_redirect:
740 #ifdef CONFIG_IP_ROUTE_VERBOSE
741 if (IN_DEV_LOG_MARTIANS(in_dev)) {
742 const struct iphdr *iph = (const struct iphdr *) skb->data;
743 __be32 daddr = iph->daddr;
744 __be32 saddr = iph->saddr;
745
746 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
747 " Advised path = %pI4 -> %pI4\n",
748 &old_gw, dev->name, &new_gw,
749 &saddr, &daddr);
750 }
751 #endif
752 ;
753 }
754
755 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
756 {
757 struct rtable *rt;
758 struct flowi4 fl4;
759
760 rt = (struct rtable *) dst;
761
762 ip_rt_build_flow_key(&fl4, sk, skb);
763 __ip_do_redirect(rt, skb, &fl4);
764 }
765
766 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
767 {
768 struct rtable *rt = (struct rtable *)dst;
769 struct dst_entry *ret = dst;
770
771 if (rt) {
772 if (dst->obsolete > 0) {
773 ip_rt_put(rt);
774 ret = NULL;
775 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
776 rt->dst.expires) {
777 ip_rt_put(rt);
778 ret = NULL;
779 }
780 }
781 return ret;
782 }
783
784 /*
785  * Algorithm:
786  *	1. The first ip_rt_redirect_number redirects are sent
787  *	   with exponential backoff, then we stop sending them altogether,
788  *	   assuming that the host ignores our redirects.
789  *	2. If we did not see packets requiring redirects
790  *	   during ip_rt_redirect_silence, we assume that the host
791  *	   forgot the redirected route and start sending redirects again.
792  *
793  * This algorithm is much cheaper and more intelligent than dumb load limiting
794  * in icmp.c.
795  *
796  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
797  * and "frag. need" (breaks PMTU discovery) in icmp.c.
798  */
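/*
 * Worked example (editorial note), using the defaults declared near the
 * top of this file: ip_rt_redirect_load = HZ/50 (20 ms) and
 * ip_rt_redirect_number = 9.  After the first redirect, redirect k is
 * only sent once 20ms << k has elapsed since the previous one (40 ms,
 * 80 ms, ... up to ~5.1 s); after nine redirects we go silent.  A quiet
 * period of ip_rt_redirect_silence = (HZ/50) << 10, about 20.5 s, resets
 * rate_tokens and the cycle may start over.
 */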
799
800 void ip_rt_send_redirect(struct sk_buff *skb)
801 {
802 struct rtable *rt = skb_rtable(skb);
803 struct in_device *in_dev;
804 struct inet_peer *peer;
805 struct net *net;
806 int log_martians;
807
808 rcu_read_lock();
809 in_dev = __in_dev_get_rcu(rt->dst.dev);
810 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
811 rcu_read_unlock();
812 return;
813 }
814 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
815 rcu_read_unlock();
816
817 net = dev_net(rt->dst.dev);
818 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
819 if (!peer) {
820 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
821 return;
822 }
823
824 /* No redirected packets during ip_rt_redirect_silence;
825 * reset the algorithm.
826 */
827 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
828 peer->rate_tokens = 0;
829
830 	/* Too many ignored redirects; do not send anything
831 	 * and set peer->rate_last to the time of the last seen redirected packet.
832 	 */
833 if (peer->rate_tokens >= ip_rt_redirect_number) {
834 peer->rate_last = jiffies;
835 goto out_put_peer;
836 }
837
838 /* Check for load limit; set rate_last to the latest sent
839 * redirect.
840 */
841 if (peer->rate_tokens == 0 ||
842 time_after(jiffies,
843 (peer->rate_last +
844 (ip_rt_redirect_load << peer->rate_tokens)))) {
845 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
846 peer->rate_last = jiffies;
847 ++peer->rate_tokens;
848 #ifdef CONFIG_IP_ROUTE_VERBOSE
849 if (log_martians &&
850 peer->rate_tokens == ip_rt_redirect_number)
851 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
852 &ip_hdr(skb)->saddr, rt->rt_iif,
853 &ip_hdr(skb)->daddr, &rt->rt_gateway);
854 #endif
855 }
856 out_put_peer:
857 inet_putpeer(peer);
858 }
859
860 static int ip_error(struct sk_buff *skb)
861 {
862 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
863 struct rtable *rt = skb_rtable(skb);
864 struct inet_peer *peer;
865 unsigned long now;
866 struct net *net;
867 bool send;
868 int code;
869
870 net = dev_net(rt->dst.dev);
871 if (!IN_DEV_FORWARD(in_dev)) {
872 switch (rt->dst.error) {
873 case EHOSTUNREACH:
874 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
875 break;
876
877 case ENETUNREACH:
878 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
879 break;
880 }
881 goto out;
882 }
883
884 switch (rt->dst.error) {
885 case EINVAL:
886 default:
887 goto out;
888 case EHOSTUNREACH:
889 code = ICMP_HOST_UNREACH;
890 break;
891 case ENETUNREACH:
892 code = ICMP_NET_UNREACH;
893 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
894 break;
895 case EACCES:
896 code = ICMP_PKT_FILTERED;
897 break;
898 }
899
900 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
901
902 send = true;
903 if (peer) {
904 now = jiffies;
905 peer->rate_tokens += now - peer->rate_last;
906 if (peer->rate_tokens > ip_rt_error_burst)
907 peer->rate_tokens = ip_rt_error_burst;
908 peer->rate_last = now;
909 if (peer->rate_tokens >= ip_rt_error_cost)
910 peer->rate_tokens -= ip_rt_error_cost;
911 else
912 send = false;
913 inet_putpeer(peer);
914 }
915 if (send)
916 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
917
918 out: kfree_skb(skb);
919 return 0;
920 }
921
922 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
923 {
924 struct fib_result res;
925
926 if (mtu < ip_rt_min_pmtu)
927 mtu = ip_rt_min_pmtu;
928
929 if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
930 struct fib_nh *nh = &FIB_RES_NH(res);
931
932 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
933 jiffies + ip_rt_mtu_expires);
934 }
935 rt->rt_pmtu = mtu;
936 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
937 }
938
939 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
940 struct sk_buff *skb, u32 mtu)
941 {
942 struct rtable *rt = (struct rtable *) dst;
943 struct flowi4 fl4;
944
945 ip_rt_build_flow_key(&fl4, sk, skb);
946 __ip_rt_update_pmtu(rt, &fl4, mtu);
947 }
948
949 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
950 int oif, u32 mark, u8 protocol, int flow_flags)
951 {
952 const struct iphdr *iph = (const struct iphdr *) skb->data;
953 struct flowi4 fl4;
954 struct rtable *rt;
955
956 __build_flow_key(&fl4, NULL, iph, oif,
957 RT_TOS(iph->tos), protocol, mark, flow_flags);
958 rt = __ip_route_output_key(net, &fl4);
959 if (!IS_ERR(rt)) {
960 __ip_rt_update_pmtu(rt, &fl4, mtu);
961 ip_rt_put(rt);
962 }
963 }
964 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
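/*
 * Usage sketch (hypothetical caller; the argument values are assumptions
 * for illustration): a protocol error handler that has just parsed an
 * ICMP_FRAG_NEEDED message could propagate the new path MTU with:
 *
 *	ipv4_update_pmtu(skb, dev_net(skb->dev), new_mtu,
 *			 0, skb->mark, iph->protocol, 0);
 *
 * which rebuilds the flow key from the embedded header, repeats the
 * route lookup and applies __ip_rt_update_pmtu() to the result.
 */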
965
966 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
967 {
968 const struct iphdr *iph = (const struct iphdr *) skb->data;
969 struct flowi4 fl4;
970 struct rtable *rt;
971
972 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
973 rt = __ip_route_output_key(sock_net(sk), &fl4);
974 if (!IS_ERR(rt)) {
975 __ip_rt_update_pmtu(rt, &fl4, mtu);
976 ip_rt_put(rt);
977 }
978 }
979 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
980
981 void ipv4_redirect(struct sk_buff *skb, struct net *net,
982 int oif, u32 mark, u8 protocol, int flow_flags)
983 {
984 const struct iphdr *iph = (const struct iphdr *) skb->data;
985 struct flowi4 fl4;
986 struct rtable *rt;
987
988 __build_flow_key(&fl4, NULL, iph, oif,
989 RT_TOS(iph->tos), protocol, mark, flow_flags);
990 rt = __ip_route_output_key(net, &fl4);
991 if (!IS_ERR(rt)) {
992 __ip_do_redirect(rt, skb, &fl4);
993 ip_rt_put(rt);
994 }
995 }
996 EXPORT_SYMBOL_GPL(ipv4_redirect);
997
998 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
999 {
1000 const struct iphdr *iph = (const struct iphdr *) skb->data;
1001 struct flowi4 fl4;
1002 struct rtable *rt;
1003
1004 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1005 rt = __ip_route_output_key(sock_net(sk), &fl4);
1006 if (!IS_ERR(rt)) {
1007 __ip_do_redirect(rt, skb, &fl4);
1008 ip_rt_put(rt);
1009 }
1010 }
1011 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1012
1013 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1014 {
1015 struct rtable *rt = (struct rtable *) dst;
1016
1017 if (rt_is_expired(rt))
1018 return NULL;
1019 return dst;
1020 }
1021
1022 static void ipv4_dst_destroy(struct dst_entry *dst)
1023 {
1024 struct rtable *rt = (struct rtable *) dst;
1025
1026 if (rt->fi) {
1027 fib_info_put(rt->fi);
1028 rt->fi = NULL;
1029 }
1030 }
1031
1032
1033 static void ipv4_link_failure(struct sk_buff *skb)
1034 {
1035 struct rtable *rt;
1036
1037 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1038
1039 rt = skb_rtable(skb);
1040 if (rt)
1041 dst_set_expires(&rt->dst, 0);
1042 }
1043
1044 static int ip_rt_bug(struct sk_buff *skb)
1045 {
1046 pr_debug("%s: %pI4 -> %pI4, %s\n",
1047 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1048 skb->dev ? skb->dev->name : "?");
1049 kfree_skb(skb);
1050 WARN_ON(1);
1051 return 0;
1052 }
1053
1054 /*
1055 We do not cache the source address of the outgoing interface,
1056 because it is used only by the IP RR, TS and SRR options,
1057 so it is out of the fast path.
1058
1059 BTW remember: "addr" is allowed to be unaligned
1060 in IP options!
1061 */
1062
1063 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1064 {
1065 __be32 src;
1066
1067 if (rt_is_output_route(rt))
1068 src = ip_hdr(skb)->saddr;
1069 else {
1070 struct fib_result res;
1071 struct flowi4 fl4;
1072 struct iphdr *iph;
1073
1074 iph = ip_hdr(skb);
1075
1076 memset(&fl4, 0, sizeof(fl4));
1077 fl4.daddr = iph->daddr;
1078 fl4.saddr = iph->saddr;
1079 fl4.flowi4_tos = RT_TOS(iph->tos);
1080 fl4.flowi4_oif = rt->dst.dev->ifindex;
1081 fl4.flowi4_iif = skb->dev->ifindex;
1082 fl4.flowi4_mark = skb->mark;
1083
1084 rcu_read_lock();
1085 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1086 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1087 else
1088 src = inet_select_addr(rt->dst.dev,
1089 rt_nexthop(rt, iph->daddr),
1090 RT_SCOPE_UNIVERSE);
1091 rcu_read_unlock();
1092 }
1093 memcpy(addr, &src, 4);
1094 }
1095
1096 #ifdef CONFIG_IP_ROUTE_CLASSID
1097 static void set_class_tag(struct rtable *rt, u32 tag)
1098 {
1099 if (!(rt->dst.tclassid & 0xFFFF))
1100 rt->dst.tclassid |= tag & 0xFFFF;
1101 if (!(rt->dst.tclassid & 0xFFFF0000))
1102 rt->dst.tclassid |= tag & 0xFFFF0000;
1103 }
1104 #endif
1105
1106 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1107 {
1108 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1109
1110 if (advmss == 0) {
1111 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1112 ip_rt_min_advmss);
1113 if (advmss > 65535 - 40)
1114 advmss = 65535 - 40;
1115 }
1116 return advmss;
1117 }
1118
1119 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1120 {
1121 const struct rtable *rt = (const struct rtable *) dst;
1122 unsigned int mtu = rt->rt_pmtu;
1123
1124 if (mtu && time_after_eq(jiffies, rt->dst.expires))
1125 mtu = 0;
1126
1127 if (!mtu)
1128 mtu = dst_metric_raw(dst, RTAX_MTU);
1129
1130 if (mtu && rt_is_output_route(rt))
1131 return mtu;
1132
1133 mtu = dst->dev->mtu;
1134
1135 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1136 if (rt->rt_gateway && mtu > 576)
1137 mtu = 576;
1138 }
1139
1140 if (mtu > IP_MAX_MTU)
1141 mtu = IP_MAX_MTU;
1142
1143 return mtu;
1144 }
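/*
 * Worked example (editorial note): with no learned PMTU and no MTU
 * metric, the device MTU is reported, except that a locked RTAX_MTU
 * metric on a gatewayed route clamps the result to the classical
 * 576 bytes, and anything above IP_MAX_MTU (0xFFF0) is capped.
 */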
1145
1146 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1147 struct fib_info *fi)
1148 {
1149 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1150 rt->fi = fi;
1151 atomic_inc(&fi->fib_clntref);
1152 }
1153 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1154 }
1155
1156 static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
1157 {
1158 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1159 struct fib_nh_exception *fnhe;
1160 u32 hval;
1161
1162 hval = fnhe_hashfun(daddr);
1163
1164 restart:
1165 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1166 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1167 __be32 fnhe_daddr, gw;
1168 unsigned long expires;
1169 unsigned int seq;
1170 u32 pmtu;
1171
1172 seq = read_seqbegin(&fnhe_seqlock);
1173 fnhe_daddr = fnhe->fnhe_daddr;
1174 gw = fnhe->fnhe_gw;
1175 pmtu = fnhe->fnhe_pmtu;
1176 expires = fnhe->fnhe_expires;
1177 if (read_seqretry(&fnhe_seqlock, seq))
1178 goto restart;
1179 if (daddr != fnhe_daddr)
1180 continue;
1181 if (pmtu) {
1182 unsigned long diff = expires - jiffies;
1183
1184 if (time_before(jiffies, expires)) {
1185 rt->rt_pmtu = pmtu;
1186 dst_set_expires(&rt->dst, diff);
1187 }
1188 }
1189 if (gw)
1190 rt->rt_gateway = gw;
1191 fnhe->fnhe_stamp = jiffies;
1192 break;
1193 }
1194 }
1195
1196 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1197 const struct fib_result *res,
1198 struct fib_info *fi, u16 type, u32 itag)
1199 {
1200 if (fi) {
1201 struct fib_nh *nh = &FIB_RES_NH(*res);
1202
1203 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1204 rt->rt_gateway = nh->nh_gw;
1205 if (unlikely(nh->nh_exceptions))
1206 rt_bind_exception(rt, nh, fl4->daddr);
1207 rt_init_metrics(rt, fl4, fi);
1208 #ifdef CONFIG_IP_ROUTE_CLASSID
1209 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1210 #endif
1211 }
1212
1213 #ifdef CONFIG_IP_ROUTE_CLASSID
1214 #ifdef CONFIG_IP_MULTIPLE_TABLES
1215 set_class_tag(rt, res->tclassid);
1216 #endif
1217 set_class_tag(rt, itag);
1218 #endif
1219 }
1220
1221 static struct rtable *rt_dst_alloc(struct net_device *dev,
1222 bool nopolicy, bool noxfrm)
1223 {
1224 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1225 DST_HOST | DST_NOCACHE |
1226 (nopolicy ? DST_NOPOLICY : 0) |
1227 (noxfrm ? DST_NOXFRM : 0));
1228 }
1229
1230 /* called in rcu_read_lock() section */
1231 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1232 u8 tos, struct net_device *dev, int our)
1233 {
1234 struct rtable *rth;
1235 struct in_device *in_dev = __in_dev_get_rcu(dev);
1236 u32 itag = 0;
1237 int err;
1238
1239 /* Primary sanity checks. */
1240
1241 if (in_dev == NULL)
1242 return -EINVAL;
1243
1244 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1245 skb->protocol != htons(ETH_P_IP))
1246 goto e_inval;
1247
1248 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1249 if (ipv4_is_loopback(saddr))
1250 goto e_inval;
1251
1252 if (ipv4_is_zeronet(saddr)) {
1253 if (!ipv4_is_local_multicast(daddr))
1254 goto e_inval;
1255 } else {
1256 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1257 in_dev, &itag);
1258 if (err < 0)
1259 goto e_err;
1260 }
1261 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1262 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1263 if (!rth)
1264 goto e_nobufs;
1265
1266 #ifdef CONFIG_IP_ROUTE_CLASSID
1267 rth->dst.tclassid = itag;
1268 #endif
1269 rth->dst.output = ip_rt_bug;
1270
1271 rth->rt_genid = rt_genid(dev_net(dev));
1272 rth->rt_flags = RTCF_MULTICAST;
1273 rth->rt_type = RTN_MULTICAST;
1274 rth->rt_route_iif = dev->ifindex;
1275 rth->rt_iif = dev->ifindex;
1276 rth->rt_oif = 0;
1277 rth->rt_pmtu = 0;
1278 rth->rt_gateway = 0;
1279 rth->fi = NULL;
1280 if (our) {
1281 rth->dst.input= ip_local_deliver;
1282 rth->rt_flags |= RTCF_LOCAL;
1283 }
1284
1285 #ifdef CONFIG_IP_MROUTE
1286 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1287 rth->dst.input = ip_mr_input;
1288 #endif
1289 RT_CACHE_STAT_INC(in_slow_mc);
1290
1291 skb_dst_set(skb, &rth->dst);
1292 return 0;
1293
1294 e_nobufs:
1295 return -ENOBUFS;
1296 e_inval:
1297 return -EINVAL;
1298 e_err:
1299 return err;
1300 }
1301
1302
1303 static void ip_handle_martian_source(struct net_device *dev,
1304 struct in_device *in_dev,
1305 struct sk_buff *skb,
1306 __be32 daddr,
1307 __be32 saddr)
1308 {
1309 RT_CACHE_STAT_INC(in_martian_src);
1310 #ifdef CONFIG_IP_ROUTE_VERBOSE
1311 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1312 /*
1313 * RFC1812 recommendation, if source is martian,
1314 * the only hint is MAC header.
1315 */
1316 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1317 &daddr, &saddr, dev->name);
1318 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1319 print_hex_dump(KERN_WARNING, "ll header: ",
1320 DUMP_PREFIX_OFFSET, 16, 1,
1321 skb_mac_header(skb),
1322 dev->hard_header_len, true);
1323 }
1324 }
1325 #endif
1326 }
1327
1328 /* called in rcu_read_lock() section */
1329 static int __mkroute_input(struct sk_buff *skb,
1330 const struct fib_result *res,
1331 struct in_device *in_dev,
1332 __be32 daddr, __be32 saddr, u32 tos,
1333 struct rtable **result)
1334 {
1335 struct rtable *rth;
1336 int err;
1337 struct in_device *out_dev;
1338 unsigned int flags = 0;
1339 u32 itag;
1340
1341 /* get a working reference to the output device */
1342 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1343 if (out_dev == NULL) {
1344 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1345 return -EINVAL;
1346 }
1347
1348
1349 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1350 in_dev->dev, in_dev, &itag);
1351 if (err < 0) {
1352 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1353 saddr);
1354
1355 goto cleanup;
1356 }
1357
1358 if (err)
1359 flags |= RTCF_DIRECTSRC;
1360
1361 if (out_dev == in_dev && err &&
1362 (IN_DEV_SHARED_MEDIA(out_dev) ||
1363 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1364 flags |= RTCF_DOREDIRECT;
1365
1366 if (skb->protocol != htons(ETH_P_IP)) {
1367 /* Not IP (i.e. ARP). Do not create a route if it is
1368 * invalid for proxy arp. DNAT routes are always valid.
1369 *
1370 * The proxy arp feature has been extended to allow ARP
1371 * replies back out the same interface, to support
1372 * Private VLAN switch technologies. See arp.c.
1373 */
1374 if (out_dev == in_dev &&
1375 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1376 err = -EINVAL;
1377 goto cleanup;
1378 }
1379 }
1380
1381 rth = rt_dst_alloc(out_dev->dev,
1382 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1383 IN_DEV_CONF_GET(out_dev, NOXFRM));
1384 if (!rth) {
1385 err = -ENOBUFS;
1386 goto cleanup;
1387 }
1388
1389 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1390 rth->rt_flags = flags;
1391 rth->rt_type = res->type;
1392 rth->rt_route_iif = in_dev->dev->ifindex;
1393 rth->rt_iif = in_dev->dev->ifindex;
1394 rth->rt_oif = 0;
1395 rth->rt_pmtu = 0;
1396 rth->rt_gateway = 0;
1397 rth->fi = NULL;
1398
1399 rth->dst.input = ip_forward;
1400 rth->dst.output = ip_output;
1401
1402 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1403
1404 *result = rth;
1405 err = 0;
1406 cleanup:
1407 return err;
1408 }
1409
1410 static int ip_mkroute_input(struct sk_buff *skb,
1411 struct fib_result *res,
1412 const struct flowi4 *fl4,
1413 struct in_device *in_dev,
1414 __be32 daddr, __be32 saddr, u32 tos)
1415 {
1416 struct rtable *rth = NULL;
1417 int err;
1418
1419 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1420 if (res->fi && res->fi->fib_nhs > 1)
1421 fib_select_multipath(res);
1422 #endif
1423
1424 /* create a routing cache entry */
1425 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1426 if (err)
1427 return err;
1428
1429 skb_dst_set(skb, &rth->dst);
1430 return 0;
1431 }
1432
1433 /*
1434 * NOTE. We drop all packets that have a local source
1435 * address, because every properly looped-back packet
1436 * must already have the correct destination attached by the output routine.
1437 *
1438 * Such an approach solves two big problems:
1439 * 1. Non-simplex devices are handled properly.
1440 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1441 * called with rcu_read_lock()
1442 */
1443
1444 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1445 u8 tos, struct net_device *dev)
1446 {
1447 struct fib_result res;
1448 struct in_device *in_dev = __in_dev_get_rcu(dev);
1449 struct flowi4 fl4;
1450 unsigned int flags = 0;
1451 u32 itag = 0;
1452 struct rtable *rth;
1453 int err = -EINVAL;
1454 struct net *net = dev_net(dev);
1455
1456 /* IP on this device is disabled. */
1457
1458 if (!in_dev)
1459 goto out;
1460
1461 /* Check for the most weird martians, which cannot be detected
1462 by fib_lookup.
1463 */
1464
1465 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1466 goto martian_source;
1467
1468 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1469 goto brd_input;
1470
1471 /* Accept zero addresses only to limited broadcast;
1472 * I do not even know whether to fix this or not. Waiting for complaints :-)
1473 */
1474 if (ipv4_is_zeronet(saddr))
1475 goto martian_source;
1476
1477 if (ipv4_is_zeronet(daddr))
1478 goto martian_destination;
1479
1480 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1481 if (ipv4_is_loopback(daddr))
1482 goto martian_destination;
1483
1484 if (ipv4_is_loopback(saddr))
1485 goto martian_source;
1486 }
1487
1488 /*
1489 * Now we are ready to route the packet.
1490 */
1491 fl4.flowi4_oif = 0;
1492 fl4.flowi4_iif = dev->ifindex;
1493 fl4.flowi4_mark = skb->mark;
1494 fl4.flowi4_tos = tos;
1495 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1496 fl4.daddr = daddr;
1497 fl4.saddr = saddr;
1498 err = fib_lookup(net, &fl4, &res);
1499 if (err != 0)
1500 goto no_route;
1501
1502 RT_CACHE_STAT_INC(in_slow_tot);
1503
1504 if (res.type == RTN_BROADCAST)
1505 goto brd_input;
1506
1507 if (res.type == RTN_LOCAL) {
1508 err = fib_validate_source(skb, saddr, daddr, tos,
1509 net->loopback_dev->ifindex,
1510 dev, in_dev, &itag);
1511 if (err < 0)
1512 goto martian_source_keep_err;
1513 if (err)
1514 flags |= RTCF_DIRECTSRC;
1515 goto local_input;
1516 }
1517
1518 if (!IN_DEV_FORWARD(in_dev))
1519 goto no_route;
1520 if (res.type != RTN_UNICAST)
1521 goto martian_destination;
1522
1523 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1524 out: return err;
1525
1526 brd_input:
1527 if (skb->protocol != htons(ETH_P_IP))
1528 goto e_inval;
1529
1530 if (!ipv4_is_zeronet(saddr)) {
1531 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1532 in_dev, &itag);
1533 if (err < 0)
1534 goto martian_source_keep_err;
1535 if (err)
1536 flags |= RTCF_DIRECTSRC;
1537 }
1538 flags |= RTCF_BROADCAST;
1539 res.type = RTN_BROADCAST;
1540 RT_CACHE_STAT_INC(in_brd);
1541
1542 local_input:
1543 rth = rt_dst_alloc(net->loopback_dev,
1544 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1545 if (!rth)
1546 goto e_nobufs;
1547
1548 rth->dst.input= ip_local_deliver;
1549 rth->dst.output= ip_rt_bug;
1550 #ifdef CONFIG_IP_ROUTE_CLASSID
1551 rth->dst.tclassid = itag;
1552 #endif
1553
1554 rth->rt_genid = rt_genid(net);
1555 rth->rt_flags = flags|RTCF_LOCAL;
1556 rth->rt_type = res.type;
1557 rth->rt_route_iif = dev->ifindex;
1558 rth->rt_iif = dev->ifindex;
1559 rth->rt_oif = 0;
1560 rth->rt_pmtu = 0;
1561 rth->rt_gateway = 0;
1562 rth->fi = NULL;
1563 if (res.type == RTN_UNREACHABLE) {
1564 rth->dst.input= ip_error;
1565 rth->dst.error= -err;
1566 rth->rt_flags &= ~RTCF_LOCAL;
1567 }
1568 skb_dst_set(skb, &rth->dst);
1569 err = 0;
1570 goto out;
1571
1572 no_route:
1573 RT_CACHE_STAT_INC(in_no_route);
1574 res.type = RTN_UNREACHABLE;
1575 if (err == -ESRCH)
1576 err = -ENETUNREACH;
1577 goto local_input;
1578
1579 /*
1580 * Do not cache martian addresses: they should be logged (RFC1812)
1581 */
1582 martian_destination:
1583 RT_CACHE_STAT_INC(in_martian_dst);
1584 #ifdef CONFIG_IP_ROUTE_VERBOSE
1585 if (IN_DEV_LOG_MARTIANS(in_dev))
1586 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1587 &daddr, &saddr, dev->name);
1588 #endif
1589
1590 e_inval:
1591 err = -EINVAL;
1592 goto out;
1593
1594 e_nobufs:
1595 err = -ENOBUFS;
1596 goto out;
1597
1598 martian_source:
1599 err = -EINVAL;
1600 martian_source_keep_err:
1601 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1602 goto out;
1603 }
1604
1605 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1606 u8 tos, struct net_device *dev)
1607 {
1608 int res;
1609
1610 rcu_read_lock();
1611
1612 /* Multicast recognition logic was moved from the route cache to here.
1613 The problem was that too many Ethernet cards have broken/missing
1614 hardware multicast filters :-( As a result, a host on a multicast
1615 network acquires a lot of useless route cache entries, sort of
1616 SDR messages from all over the world. Now we try to get rid of them.
1617 Really, provided the software IP multicast filter is organized
1618 reasonably (at least, hashed), it does not result in a slowdown
1619 compared with route cache reject entries.
1620 Note that multicast routers are not affected, because a
1621 route cache entry is created eventually.
1622 */
1623 if (ipv4_is_multicast(daddr)) {
1624 struct in_device *in_dev = __in_dev_get_rcu(dev);
1625
1626 if (in_dev) {
1627 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1628 ip_hdr(skb)->protocol);
1629 if (our
1630 #ifdef CONFIG_IP_MROUTE
1631 ||
1632 (!ipv4_is_local_multicast(daddr) &&
1633 IN_DEV_MFORWARD(in_dev))
1634 #endif
1635 ) {
1636 int res = ip_route_input_mc(skb, daddr, saddr,
1637 tos, dev, our);
1638 rcu_read_unlock();
1639 return res;
1640 }
1641 }
1642 rcu_read_unlock();
1643 return -EINVAL;
1644 }
1645 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1646 rcu_read_unlock();
1647 return res;
1648 }
1649 EXPORT_SYMBOL(ip_route_input);
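/*
 * Usage sketch (hypothetical caller, loosely modeled on the receive
 * path): the input routine is called once per packet, after which the
 * attached dst decides between local delivery and forwarding:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input(skb, iph->daddr, iph->saddr,
 *				 iph->tos, skb->dev);
 *	if (err)
 *		goto drop;
 *	return dst_input(skb);	(ip_local_deliver or ip_forward)
 */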
1650
1651 /* called with rcu_read_lock() */
1652 static struct rtable *__mkroute_output(const struct fib_result *res,
1653 const struct flowi4 *fl4, int orig_oif,
1654 struct net_device *dev_out,
1655 unsigned int flags)
1656 {
1657 struct fib_info *fi = res->fi;
1658 struct in_device *in_dev;
1659 u16 type = res->type;
1660 struct rtable *rth;
1661
1662 in_dev = __in_dev_get_rcu(dev_out);
1663 if (!in_dev)
1664 return ERR_PTR(-EINVAL);
1665
1666 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1667 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1668 return ERR_PTR(-EINVAL);
1669
1670 if (ipv4_is_lbcast(fl4->daddr))
1671 type = RTN_BROADCAST;
1672 else if (ipv4_is_multicast(fl4->daddr))
1673 type = RTN_MULTICAST;
1674 else if (ipv4_is_zeronet(fl4->daddr))
1675 return ERR_PTR(-EINVAL);
1676
1677 if (dev_out->flags & IFF_LOOPBACK)
1678 flags |= RTCF_LOCAL;
1679
1680 if (type == RTN_BROADCAST) {
1681 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1682 fi = NULL;
1683 } else if (type == RTN_MULTICAST) {
1684 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1685 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1686 fl4->flowi4_proto))
1687 flags &= ~RTCF_LOCAL;
1688 /* If a multicast route does not exist, use
1689 * the default one, but do not gateway in this case.
1690 * Yes, it is a hack.
1691 */
1692 if (fi && res->prefixlen < 4)
1693 fi = NULL;
1694 }
1695
1696 rth = rt_dst_alloc(dev_out,
1697 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1698 IN_DEV_CONF_GET(in_dev, NOXFRM));
1699 if (!rth)
1700 return ERR_PTR(-ENOBUFS);
1701
1702 rth->dst.output = ip_output;
1703
1704 rth->rt_genid = rt_genid(dev_net(dev_out));
1705 rth->rt_flags = flags;
1706 rth->rt_type = type;
1707 rth->rt_route_iif = 0;
1708 rth->rt_iif = orig_oif ? : dev_out->ifindex;
1709 rth->rt_oif = orig_oif;
1710 rth->rt_pmtu = 0;
1711 rth->rt_gateway = 0;
1712 rth->fi = NULL;
1713
1714 RT_CACHE_STAT_INC(out_slow_tot);
1715
1716 if (flags & RTCF_LOCAL)
1717 rth->dst.input = ip_local_deliver;
1718 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1719 if (flags & RTCF_LOCAL &&
1720 !(dev_out->flags & IFF_LOOPBACK)) {
1721 rth->dst.output = ip_mc_output;
1722 RT_CACHE_STAT_INC(out_slow_mc);
1723 }
1724 #ifdef CONFIG_IP_MROUTE
1725 if (type == RTN_MULTICAST) {
1726 if (IN_DEV_MFORWARD(in_dev) &&
1727 !ipv4_is_local_multicast(fl4->daddr)) {
1728 rth->dst.input = ip_mr_input;
1729 rth->dst.output = ip_mc_output;
1730 }
1731 }
1732 #endif
1733 }
1734
1735 rt_set_nexthop(rth, fl4, res, fi, type, 0);
1736
1737 if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
1738 rth->dst.flags |= DST_NOCACHE;
1739
1740 return rth;
1741 }
1742
1743 /*
1744 * Major route resolver routine.
1745 */
1746
1747 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1748 {
1749 struct net_device *dev_out = NULL;
1750 __u8 tos = RT_FL_TOS(fl4);
1751 unsigned int flags = 0;
1752 struct fib_result res;
1753 struct rtable *rth;
1754 int orig_oif;
1755
1756 res.tclassid = 0;
1757 res.fi = NULL;
1758 res.table = NULL;
1759
1760 orig_oif = fl4->flowi4_oif;
1761
1762 fl4->flowi4_iif = net->loopback_dev->ifindex;
1763 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1764 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1765 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1766
1767 rcu_read_lock();
1768 if (fl4->saddr) {
1769 rth = ERR_PTR(-EINVAL);
1770 if (ipv4_is_multicast(fl4->saddr) ||
1771 ipv4_is_lbcast(fl4->saddr) ||
1772 ipv4_is_zeronet(fl4->saddr))
1773 goto out;
1774
1775 /* I removed the check for oif == dev_out->oif here.
1776 It was wrong for two reasons:
1777 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
1778 is assigned to multiple interfaces.
1779 2. Moreover, we are allowed to send packets with the saddr
1780 of another iface. --ANK
1781 */
1782
1783 if (fl4->flowi4_oif == 0 &&
1784 (ipv4_is_multicast(fl4->daddr) ||
1785 ipv4_is_lbcast(fl4->daddr))) {
1786 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1787 dev_out = __ip_dev_find(net, fl4->saddr, false);
1788 if (dev_out == NULL)
1789 goto out;
1790
1791 /* Special hack: user can direct multicasts
1792 and limited broadcast via the necessary interface
1793 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1794 This hack is not just for fun, it allows
1795 vic, vat and friends to work.
1796 They bind a socket to loopback, set ttl to zero
1797 and expect that it will work.
1798 From the viewpoint of the routing cache they are broken,
1799 because we are not allowed to build a multicast path
1800 with a loopback source address (look: the routing cache
1801 cannot know that ttl is zero, so the packet
1802 will not leave this host and the route is valid).
1803 Luckily, this hack is a good workaround.
1804 */
1805
1806 fl4->flowi4_oif = dev_out->ifindex;
1807 goto make_route;
1808 }
1809
1810 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1811 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1812 if (!__ip_dev_find(net, fl4->saddr, false))
1813 goto out;
1814 }
1815 }
1816
1817
1818 if (fl4->flowi4_oif) {
1819 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1820 rth = ERR_PTR(-ENODEV);
1821 if (dev_out == NULL)
1822 goto out;
1823
1824 /* RACE: Check return value of inet_select_addr instead. */
1825 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1826 rth = ERR_PTR(-ENETUNREACH);
1827 goto out;
1828 }
1829 if (ipv4_is_local_multicast(fl4->daddr) ||
1830 ipv4_is_lbcast(fl4->daddr)) {
1831 if (!fl4->saddr)
1832 fl4->saddr = inet_select_addr(dev_out, 0,
1833 RT_SCOPE_LINK);
1834 goto make_route;
1835 }
1836 if (fl4->saddr) {
1837 if (ipv4_is_multicast(fl4->daddr))
1838 fl4->saddr = inet_select_addr(dev_out, 0,
1839 fl4->flowi4_scope);
1840 else if (!fl4->daddr)
1841 fl4->saddr = inet_select_addr(dev_out, 0,
1842 RT_SCOPE_HOST);
1843 }
1844 }
1845
1846 if (!fl4->daddr) {
1847 fl4->daddr = fl4->saddr;
1848 if (!fl4->daddr)
1849 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1850 dev_out = net->loopback_dev;
1851 fl4->flowi4_oif = net->loopback_dev->ifindex;
1852 res.type = RTN_LOCAL;
1853 flags |= RTCF_LOCAL;
1854 goto make_route;
1855 }
1856
1857 if (fib_lookup(net, fl4, &res)) {
1858 res.fi = NULL;
1859 res.table = NULL;
1860 if (fl4->flowi4_oif) {
1861 /* Apparently, routing tables are wrong. Assume
1862 that the destination is on link.
1863
1864 WHY? DW.
1865 Because we are allowed to send to an iface
1866 even if it has NO routes and NO assigned
1867 addresses. When oif is specified, routing
1868 tables are looked up with only one purpose:
1869 to catch whether the destination is gatewayed,
1870 rather than direct. Moreover, if MSG_DONTROUTE is set,
1871 we send the packet, ignoring both routing tables
1872 and ifaddr state. --ANK
1873
1874
1875 We could do this even if oif is unknown,
1876 likely IPv6, but we do not.
1877 */
1878
1879 if (fl4->saddr == 0)
1880 fl4->saddr = inet_select_addr(dev_out, 0,
1881 RT_SCOPE_LINK);
1882 res.type = RTN_UNICAST;
1883 goto make_route;
1884 }
1885 rth = ERR_PTR(-ENETUNREACH);
1886 goto out;
1887 }
1888
1889 if (res.type == RTN_LOCAL) {
1890 if (!fl4->saddr) {
1891 if (res.fi->fib_prefsrc)
1892 fl4->saddr = res.fi->fib_prefsrc;
1893 else
1894 fl4->saddr = fl4->daddr;
1895 }
1896 dev_out = net->loopback_dev;
1897 fl4->flowi4_oif = dev_out->ifindex;
1898 res.fi = NULL;
1899 flags |= RTCF_LOCAL;
1900 goto make_route;
1901 }
1902
1903 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1904 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1905 fib_select_multipath(&res);
1906 else
1907 #endif
1908 if (!res.prefixlen &&
1909 res.table->tb_num_default > 1 &&
1910 res.type == RTN_UNICAST && !fl4->flowi4_oif)
1911 fib_select_default(&res);
1912
1913 if (!fl4->saddr)
1914 fl4->saddr = FIB_RES_PREFSRC(net, res);
1915
1916 dev_out = FIB_RES_DEV(res);
1917 fl4->flowi4_oif = dev_out->ifindex;
1918
1919
1920 make_route:
1921 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
1922
1923 out:
1924 rcu_read_unlock();
1925 return rth;
1926 }
1927 EXPORT_SYMBOL_GPL(__ip_route_output_key);
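/*
 * Usage sketch (illustrative; field values are assumptions): output
 * lookups fill in a flow key and must release the result when done:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_tos	= RT_TOS(tos),
 *	};
 *	struct rtable *rt = __ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	... use rt->dst ...
 *	ip_rt_put(rt);
 *
 * On success the resolver may also have filled in fl4.saddr and
 * fl4.flowi4_oif, as the code above shows.
 */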
1928
1929 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
1930 {
1931 return NULL;
1932 }
1933
1934 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
1935 {
1936 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1937
1938 return mtu ? : dst->dev->mtu;
1939 }
1940
1941 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
1942 struct sk_buff *skb, u32 mtu)
1943 {
1944 }
1945
1946 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
1947 struct sk_buff *skb)
1948 {
1949 }
1950
1951 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
1952 unsigned long old)
1953 {
1954 return NULL;
1955 }
1956
1957 static struct dst_ops ipv4_dst_blackhole_ops = {
1958 .family = AF_INET,
1959 .protocol = cpu_to_be16(ETH_P_IP),
1960 .destroy = ipv4_dst_destroy,
1961 .check = ipv4_blackhole_dst_check,
1962 .mtu = ipv4_blackhole_mtu,
1963 .default_advmss = ipv4_default_advmss,
1964 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
1965 .redirect = ipv4_rt_blackhole_redirect,
1966 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
1967 .neigh_lookup = ipv4_neigh_lookup,
1968 };
1969
1970 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1971 {
1972 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
1973 struct rtable *ort = (struct rtable *) dst_orig;
1974
1975 if (rt) {
1976 struct dst_entry *new = &rt->dst;
1977
1978 new->__use = 1;
1979 new->input = dst_discard;
1980 new->output = dst_discard;
1981
1982 new->dev = ort->dst.dev;
1983 if (new->dev)
1984 dev_hold(new->dev);
1985
1986 rt->rt_route_iif = ort->rt_route_iif;
1987 rt->rt_iif = ort->rt_iif;
1988 rt->rt_oif = ort->rt_oif;
1989 rt->rt_pmtu = ort->rt_pmtu;
1990
1991 rt->rt_genid = rt_genid(net);
1992 rt->rt_flags = ort->rt_flags;
1993 rt->rt_type = ort->rt_type;
1994 rt->rt_gateway = ort->rt_gateway;
1995 rt->fi = ort->fi;
1996 if (rt->fi)
1997 atomic_inc(&rt->fi->fib_clntref);
1998
1999 dst_free(new);
2000 }
2001
2002 dst_release(dst_orig);
2003
2004 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2005 }
2006
2007 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2008 struct sock *sk)
2009 {
2010 struct rtable *rt = __ip_route_output_key(net, flp4);
2011
2012 if (IS_ERR(rt))
2013 return rt;
2014
2015 if (flp4->flowi4_proto)
2016 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2017 flowi4_to_flowi(flp4),
2018 sk, 0);
2019
2020 return rt;
2021 }
2022 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2023
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = fl4->flowi4_tos;
	r->rtm_table = RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	error = rt->dst.error;
	expires = rt->dst.expires;
	if (expires) {
		if (time_before(jiffies, expires))
			expires -= jiffies;
		else
			expires = 0;
	}

	if (rt_is_input_route(rt)) {
		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
			goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

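/*
 * RTM_GETROUTE handler: allocates a dummy skb, resolves the route
 * through the real input path (when RTA_IIF is supplied) or the output
 * path, then answers the requester via rt_fill_info().
 */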
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		skb->mark = mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, dst, src, &fl4, skb,
			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

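/*
 * Route dump callback: there are no cached routes kept around to walk,
 * so a dump has nothing to report beyond the current message length.
 */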
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
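/*
 * Handler for the write-only "flush" sysctl: copy the user-supplied
 * flush delay out through a private table entry, then invalidate the
 * namespace's routes.  Reads are rejected with -EINVAL.
 */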
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}

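/* Tunables exposed under /proc/sys/net/ipv4/route/. */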
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

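/*
 * Register the per-namespace "flush" sysctl.  init_net uses the static
 * table directly; every other namespace gets its own copy so that
 * ->extra1 can point back at that namespace.
 */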
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

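/*
 * Seed the per-namespace generation counters with random values.
 * Bumping rt_genid later invalidates every cached route in the
 * namespace at once, without having to walk them.
 */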
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

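/* Per-namespace inetpeer storage: one base tree per struct net. */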
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

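/*
 * Boot-time initialization: create the dst cache and entry counters,
 * effectively disable dst garbage collection (gc_thresh = ~0,
 * max_size = INT_MAX), bring up devinet and the FIB, and register the
 * proc files, sysctls and the RTM_GETROUTE handler.
 */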
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif