2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #define pr_fmt(fmt) "IPv4: " fmt
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
117 #define RT_FL_TOS(oldflp4) \
118 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
120 #define RT_GC_TIMEOUT (300*HZ)
122 static int ip_rt_max_size
;
123 static int ip_rt_redirect_number __read_mostly
= 9;
124 static int ip_rt_redirect_load __read_mostly
= HZ
/ 50;
125 static int ip_rt_redirect_silence __read_mostly
= ((HZ
/ 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly
= HZ
;
127 static int ip_rt_error_burst __read_mostly
= 5 * HZ
;
128 static int ip_rt_mtu_expires __read_mostly
= 10 * 60 * HZ
;
129 static int ip_rt_min_pmtu __read_mostly
= 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly
= 256;
132 static int ip_rt_gc_timeout __read_mostly
= RT_GC_TIMEOUT
;
134 * Interface to generic destination cache.
137 static struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
, u32 cookie
);
138 static unsigned int ipv4_default_advmss(const struct dst_entry
*dst
);
139 static unsigned int ipv4_mtu(const struct dst_entry
*dst
);
140 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
);
141 static void ipv4_link_failure(struct sk_buff
*skb
);
142 static void ip_rt_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
143 struct sk_buff
*skb
, u32 mtu
);
144 static void ip_do_redirect(struct dst_entry
*dst
, struct sock
*sk
,
145 struct sk_buff
*skb
);
146 static void ipv4_dst_destroy(struct dst_entry
*dst
);
148 static u32
*ipv4_cow_metrics(struct dst_entry
*dst
, unsigned long old
)
154 static struct neighbour
*ipv4_neigh_lookup(const struct dst_entry
*dst
,
158 static struct dst_ops ipv4_dst_ops
= {
160 .check
= ipv4_dst_check
,
161 .default_advmss
= ipv4_default_advmss
,
163 .cow_metrics
= ipv4_cow_metrics
,
164 .destroy
= ipv4_dst_destroy
,
165 .negative_advice
= ipv4_negative_advice
,
166 .link_failure
= ipv4_link_failure
,
167 .update_pmtu
= ip_rt_update_pmtu
,
168 .redirect
= ip_do_redirect
,
169 .local_out
= __ip_local_out
,
170 .neigh_lookup
= ipv4_neigh_lookup
,
173 #define ECN_OR_COST(class) TC_PRIO_##class
175 const __u8 ip_tos2prio
[16] = {
177 ECN_OR_COST(BESTEFFORT
),
179 ECN_OR_COST(BESTEFFORT
),
185 ECN_OR_COST(INTERACTIVE
),
187 ECN_OR_COST(INTERACTIVE
),
188 TC_PRIO_INTERACTIVE_BULK
,
189 ECN_OR_COST(INTERACTIVE_BULK
),
190 TC_PRIO_INTERACTIVE_BULK
,
191 ECN_OR_COST(INTERACTIVE_BULK
)
193 EXPORT_SYMBOL(ip_tos2prio
);
195 static DEFINE_PER_CPU(struct rt_cache_stat
, rt_cache_stat
);
196 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
198 #ifdef CONFIG_PROC_FS
199 static void *rt_cache_seq_start(struct seq_file
*seq
, loff_t
*pos
)
203 return SEQ_START_TOKEN
;
206 static void *rt_cache_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
212 static void rt_cache_seq_stop(struct seq_file
*seq
, void *v
)
216 static int rt_cache_seq_show(struct seq_file
*seq
, void *v
)
218 if (v
== SEQ_START_TOKEN
)
219 seq_printf(seq
, "%-127s\n",
220 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
221 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 static const struct seq_operations rt_cache_seq_ops
= {
227 .start
= rt_cache_seq_start
,
228 .next
= rt_cache_seq_next
,
229 .stop
= rt_cache_seq_stop
,
230 .show
= rt_cache_seq_show
,
233 static int rt_cache_seq_open(struct inode
*inode
, struct file
*file
)
235 return seq_open(file
, &rt_cache_seq_ops
);
238 static const struct file_operations rt_cache_seq_fops
= {
239 .owner
= THIS_MODULE
,
240 .open
= rt_cache_seq_open
,
243 .release
= seq_release
,
247 static void *rt_cpu_seq_start(struct seq_file
*seq
, loff_t
*pos
)
252 return SEQ_START_TOKEN
;
254 for (cpu
= *pos
-1; cpu
< nr_cpu_ids
; ++cpu
) {
255 if (!cpu_possible(cpu
))
258 return &per_cpu(rt_cache_stat
, cpu
);
263 static void *rt_cpu_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
267 for (cpu
= *pos
; cpu
< nr_cpu_ids
; ++cpu
) {
268 if (!cpu_possible(cpu
))
271 return &per_cpu(rt_cache_stat
, cpu
);
277 static void rt_cpu_seq_stop(struct seq_file
*seq
, void *v
)
282 static int rt_cpu_seq_show(struct seq_file
*seq
, void *v
)
284 struct rt_cache_stat
*st
= v
;
286 if (v
== SEQ_START_TOKEN
) {
287 seq_printf(seq
, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 seq_printf(seq
,"%08x %08x %08x %08x %08x %08x %08x %08x "
292 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
293 dst_entries_get_slow(&ipv4_dst_ops
),
306 0, /* st->gc_total */
307 0, /* st->gc_ignored */
308 0, /* st->gc_goal_miss */
309 0, /* st->gc_dst_overflow */
310 0, /* st->in_hlist_search */
311 0 /* st->out_hlist_search */
316 static const struct seq_operations rt_cpu_seq_ops
= {
317 .start
= rt_cpu_seq_start
,
318 .next
= rt_cpu_seq_next
,
319 .stop
= rt_cpu_seq_stop
,
320 .show
= rt_cpu_seq_show
,
324 static int rt_cpu_seq_open(struct inode
*inode
, struct file
*file
)
326 return seq_open(file
, &rt_cpu_seq_ops
);
329 static const struct file_operations rt_cpu_seq_fops
= {
330 .owner
= THIS_MODULE
,
331 .open
= rt_cpu_seq_open
,
334 .release
= seq_release
,
337 #ifdef CONFIG_IP_ROUTE_CLASSID
338 static int rt_acct_proc_show(struct seq_file
*m
, void *v
)
340 struct ip_rt_acct
*dst
, *src
;
343 dst
= kcalloc(256, sizeof(struct ip_rt_acct
), GFP_KERNEL
);
347 for_each_possible_cpu(i
) {
348 src
= (struct ip_rt_acct
*)per_cpu_ptr(ip_rt_acct
, i
);
349 for (j
= 0; j
< 256; j
++) {
350 dst
[j
].o_bytes
+= src
[j
].o_bytes
;
351 dst
[j
].o_packets
+= src
[j
].o_packets
;
352 dst
[j
].i_bytes
+= src
[j
].i_bytes
;
353 dst
[j
].i_packets
+= src
[j
].i_packets
;
357 seq_write(m
, dst
, 256 * sizeof(struct ip_rt_acct
));
362 static int rt_acct_proc_open(struct inode
*inode
, struct file
*file
)
364 return single_open(file
, rt_acct_proc_show
, NULL
);
367 static const struct file_operations rt_acct_proc_fops
= {
368 .owner
= THIS_MODULE
,
369 .open
= rt_acct_proc_open
,
372 .release
= single_release
,
376 static int __net_init
ip_rt_do_proc_init(struct net
*net
)
378 struct proc_dir_entry
*pde
;
380 pde
= proc_create("rt_cache", S_IRUGO
, net
->proc_net
,
385 pde
= proc_create("rt_cache", S_IRUGO
,
386 net
->proc_net_stat
, &rt_cpu_seq_fops
);
390 #ifdef CONFIG_IP_ROUTE_CLASSID
391 pde
= proc_create("rt_acct", 0, net
->proc_net
, &rt_acct_proc_fops
);
397 #ifdef CONFIG_IP_ROUTE_CLASSID
399 remove_proc_entry("rt_cache", net
->proc_net_stat
);
402 remove_proc_entry("rt_cache", net
->proc_net
);
407 static void __net_exit
ip_rt_do_proc_exit(struct net
*net
)
409 remove_proc_entry("rt_cache", net
->proc_net_stat
);
410 remove_proc_entry("rt_cache", net
->proc_net
);
411 #ifdef CONFIG_IP_ROUTE_CLASSID
412 remove_proc_entry("rt_acct", net
->proc_net
);
416 static struct pernet_operations ip_rt_proc_ops __net_initdata
= {
417 .init
= ip_rt_do_proc_init
,
418 .exit
= ip_rt_do_proc_exit
,
421 static int __init
ip_rt_proc_init(void)
423 return register_pernet_subsys(&ip_rt_proc_ops
);
427 static inline int ip_rt_proc_init(void)
431 #endif /* CONFIG_PROC_FS */
433 static inline bool rt_is_expired(const struct rtable
*rth
)
435 return rth
->rt_genid
!= rt_genid_ipv4(dev_net(rth
->dst
.dev
));
/*
 * Invalidate every cached IPv4 route in @net.  Bumping the generation
 * id makes rt_is_expired() report all existing entries as stale.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
443 static struct neighbour
*ipv4_neigh_lookup(const struct dst_entry
*dst
,
447 struct net_device
*dev
= dst
->dev
;
448 const __be32
*pkey
= daddr
;
449 const struct rtable
*rt
;
452 rt
= (const struct rtable
*) dst
;
454 pkey
= (const __be32
*) &rt
->rt_gateway
;
456 pkey
= &ip_hdr(skb
)->daddr
;
458 n
= __ipv4_neigh_lookup(dev
, *(__force u32
*)pkey
);
461 return neigh_create(&arp_tbl
, pkey
, dev
);
464 #define IP_IDENTS_SZ 2048u
466 static atomic_t
*ip_idents __read_mostly
;
467 static u32
*ip_tstamps __read_mostly
;
469 /* In order to protect privacy, we add a perturbation to identifiers
470 * if one generator is seldom used. This makes hard for an attacker
471 * to infer how many packets were sent between two points in time.
473 u32
ip_idents_reserve(u32 hash
, int segs
)
475 u32
*p_tstamp
= ip_tstamps
+ hash
% IP_IDENTS_SZ
;
476 atomic_t
*p_id
= ip_idents
+ hash
% IP_IDENTS_SZ
;
477 u32 old
= ACCESS_ONCE(*p_tstamp
);
478 u32 now
= (u32
)jiffies
;
481 if (old
!= now
&& cmpxchg(p_tstamp
, old
, now
) == old
)
482 delta
= prandom_u32_max(now
- old
);
484 return atomic_add_return(segs
+ delta
, p_id
) - segs
;
486 EXPORT_SYMBOL(ip_idents_reserve
);
488 void __ip_select_ident(struct net
*net
, struct iphdr
*iph
, int segs
)
490 static u32 ip_idents_hashrnd __read_mostly
;
493 net_get_random_once(&ip_idents_hashrnd
, sizeof(ip_idents_hashrnd
));
495 hash
= jhash_3words((__force u32
)iph
->daddr
,
496 (__force u32
)iph
->saddr
,
497 iph
->protocol
^ net_hash_mix(net
),
499 id
= ip_idents_reserve(hash
, segs
);
502 EXPORT_SYMBOL(__ip_select_ident
);
504 static void __build_flow_key(const struct net
*net
, struct flowi4
*fl4
,
505 const struct sock
*sk
,
506 const struct iphdr
*iph
,
508 u8 prot
, u32 mark
, int flow_flags
)
511 const struct inet_sock
*inet
= inet_sk(sk
);
513 oif
= sk
->sk_bound_dev_if
;
515 tos
= RT_CONN_FLAGS(sk
);
516 prot
= inet
->hdrincl
? IPPROTO_RAW
: sk
->sk_protocol
;
518 flowi4_init_output(fl4
, oif
, mark
, tos
,
519 RT_SCOPE_UNIVERSE
, prot
,
521 iph
->daddr
, iph
->saddr
, 0, 0,
522 sock_net_uid(net
, sk
));
525 static void build_skb_flow_key(struct flowi4
*fl4
, const struct sk_buff
*skb
,
526 const struct sock
*sk
)
528 const struct net
*net
= dev_net(skb
->dev
);
529 const struct iphdr
*iph
= ip_hdr(skb
);
530 int oif
= skb
->dev
->ifindex
;
531 u8 tos
= RT_TOS(iph
->tos
);
532 u8 prot
= iph
->protocol
;
533 u32 mark
= skb
->mark
;
535 __build_flow_key(net
, fl4
, sk
, iph
, oif
, tos
, prot
, mark
, 0);
538 static void build_sk_flow_key(struct flowi4
*fl4
, const struct sock
*sk
)
540 const struct inet_sock
*inet
= inet_sk(sk
);
541 const struct ip_options_rcu
*inet_opt
;
542 __be32 daddr
= inet
->inet_daddr
;
545 inet_opt
= rcu_dereference(inet
->inet_opt
);
546 if (inet_opt
&& inet_opt
->opt
.srr
)
547 daddr
= inet_opt
->opt
.faddr
;
548 flowi4_init_output(fl4
, sk
->sk_bound_dev_if
, sk
->sk_mark
,
549 RT_CONN_FLAGS(sk
), RT_SCOPE_UNIVERSE
,
550 inet
->hdrincl
? IPPROTO_RAW
: sk
->sk_protocol
,
551 inet_sk_flowi_flags(sk
),
552 daddr
, inet
->inet_saddr
, 0, 0, sk
->sk_uid
);
/*
 * Build a flow key from whatever context is available: prefer the skb
 * when one is given, otherwise fall back to the socket's state.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}
565 static inline void rt_free(struct rtable
*rt
)
567 call_rcu(&rt
->dst
.rcu_head
, dst_rcu_free
);
570 static DEFINE_SPINLOCK(fnhe_lock
);
572 static void fnhe_flush_routes(struct fib_nh_exception
*fnhe
)
576 rt
= rcu_dereference(fnhe
->fnhe_rth_input
);
578 RCU_INIT_POINTER(fnhe
->fnhe_rth_input
, NULL
);
581 rt
= rcu_dereference(fnhe
->fnhe_rth_output
);
583 RCU_INIT_POINTER(fnhe
->fnhe_rth_output
, NULL
);
588 static struct fib_nh_exception
*fnhe_oldest(struct fnhe_hash_bucket
*hash
)
590 struct fib_nh_exception
*fnhe
, *oldest
;
592 oldest
= rcu_dereference(hash
->chain
);
593 for (fnhe
= rcu_dereference(oldest
->fnhe_next
); fnhe
;
594 fnhe
= rcu_dereference(fnhe
->fnhe_next
)) {
595 if (time_before(fnhe
->fnhe_stamp
, oldest
->fnhe_stamp
))
598 fnhe_flush_routes(oldest
);
602 static inline u32
fnhe_hashfun(__be32 daddr
)
604 static u32 fnhe_hashrnd __read_mostly
;
607 net_get_random_once(&fnhe_hashrnd
, sizeof(fnhe_hashrnd
));
608 hval
= jhash_1word((__force u32
) daddr
, fnhe_hashrnd
);
609 return hash_32(hval
, FNHE_HASH_SHIFT
);
612 static void fill_route_from_fnhe(struct rtable
*rt
, struct fib_nh_exception
*fnhe
)
614 rt
->rt_pmtu
= fnhe
->fnhe_pmtu
;
615 rt
->dst
.expires
= fnhe
->fnhe_expires
;
618 rt
->rt_flags
|= RTCF_REDIRECTED
;
619 rt
->rt_gateway
= fnhe
->fnhe_gw
;
620 rt
->rt_uses_gateway
= 1;
624 static void update_or_create_fnhe(struct fib_nh
*nh
, __be32 daddr
, __be32 gw
,
625 u32 pmtu
, unsigned long expires
)
627 struct fnhe_hash_bucket
*hash
;
628 struct fib_nh_exception
*fnhe
;
632 u32 hval
= fnhe_hashfun(daddr
);
634 spin_lock_bh(&fnhe_lock
);
636 hash
= rcu_dereference(nh
->nh_exceptions
);
638 hash
= kzalloc(FNHE_HASH_SIZE
* sizeof(*hash
), GFP_ATOMIC
);
641 rcu_assign_pointer(nh
->nh_exceptions
, hash
);
647 for (fnhe
= rcu_dereference(hash
->chain
); fnhe
;
648 fnhe
= rcu_dereference(fnhe
->fnhe_next
)) {
649 if (fnhe
->fnhe_daddr
== daddr
)
658 fnhe
->fnhe_pmtu
= pmtu
;
659 fnhe
->fnhe_expires
= max(1UL, expires
);
661 /* Update all cached dsts too */
662 rt
= rcu_dereference(fnhe
->fnhe_rth_input
);
664 fill_route_from_fnhe(rt
, fnhe
);
665 rt
= rcu_dereference(fnhe
->fnhe_rth_output
);
667 fill_route_from_fnhe(rt
, fnhe
);
669 if (depth
> FNHE_RECLAIM_DEPTH
)
670 fnhe
= fnhe_oldest(hash
);
672 fnhe
= kzalloc(sizeof(*fnhe
), GFP_ATOMIC
);
676 fnhe
->fnhe_next
= hash
->chain
;
677 rcu_assign_pointer(hash
->chain
, fnhe
);
679 fnhe
->fnhe_genid
= fnhe_genid(dev_net(nh
->nh_dev
));
680 fnhe
->fnhe_daddr
= daddr
;
682 fnhe
->fnhe_pmtu
= pmtu
;
683 fnhe
->fnhe_expires
= expires
;
685 /* Exception created; mark the cached routes for the nexthop
686 * stale, so anyone caching it rechecks if this exception
689 rt
= rcu_dereference(nh
->nh_rth_input
);
691 rt
->dst
.obsolete
= DST_OBSOLETE_KILL
;
693 for_each_possible_cpu(i
) {
694 struct rtable __rcu
**prt
;
695 prt
= per_cpu_ptr(nh
->nh_pcpu_rth_output
, i
);
696 rt
= rcu_dereference(*prt
);
698 rt
->dst
.obsolete
= DST_OBSOLETE_KILL
;
702 fnhe
->fnhe_stamp
= jiffies
;
705 spin_unlock_bh(&fnhe_lock
);
708 static void __ip_do_redirect(struct rtable
*rt
, struct sk_buff
*skb
, struct flowi4
*fl4
,
711 __be32 new_gw
= icmp_hdr(skb
)->un
.gateway
;
712 __be32 old_gw
= ip_hdr(skb
)->saddr
;
713 struct net_device
*dev
= skb
->dev
;
714 struct in_device
*in_dev
;
715 struct fib_result res
;
719 switch (icmp_hdr(skb
)->code
& 7) {
721 case ICMP_REDIR_NETTOS
:
722 case ICMP_REDIR_HOST
:
723 case ICMP_REDIR_HOSTTOS
:
730 if (rt
->rt_gateway
!= old_gw
)
733 in_dev
= __in_dev_get_rcu(dev
);
738 if (new_gw
== old_gw
|| !IN_DEV_RX_REDIRECTS(in_dev
) ||
739 ipv4_is_multicast(new_gw
) || ipv4_is_lbcast(new_gw
) ||
740 ipv4_is_zeronet(new_gw
))
741 goto reject_redirect
;
743 if (!IN_DEV_SHARED_MEDIA(in_dev
)) {
744 if (!inet_addr_onlink(in_dev
, new_gw
, old_gw
))
745 goto reject_redirect
;
746 if (IN_DEV_SEC_REDIRECTS(in_dev
) && ip_fib_check_default(new_gw
, dev
))
747 goto reject_redirect
;
749 if (inet_addr_type(net
, new_gw
) != RTN_UNICAST
)
750 goto reject_redirect
;
753 n
= __ipv4_neigh_lookup(rt
->dst
.dev
, new_gw
);
755 n
= neigh_create(&arp_tbl
, &new_gw
, rt
->dst
.dev
);
757 if (!(n
->nud_state
& NUD_VALID
)) {
758 neigh_event_send(n
, NULL
);
760 if (fib_lookup(net
, fl4
, &res
, 0) == 0) {
761 struct fib_nh
*nh
= &FIB_RES_NH(res
);
763 update_or_create_fnhe(nh
, fl4
->daddr
, new_gw
,
764 0, jiffies
+ ip_rt_gc_timeout
);
767 rt
->dst
.obsolete
= DST_OBSOLETE_KILL
;
768 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE
, n
);
775 #ifdef CONFIG_IP_ROUTE_VERBOSE
776 if (IN_DEV_LOG_MARTIANS(in_dev
)) {
777 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
778 __be32 daddr
= iph
->daddr
;
779 __be32 saddr
= iph
->saddr
;
781 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
782 " Advised path = %pI4 -> %pI4\n",
783 &old_gw
, dev
->name
, &new_gw
,
790 static void ip_do_redirect(struct dst_entry
*dst
, struct sock
*sk
, struct sk_buff
*skb
)
794 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
795 struct net
*net
= dev_net(skb
->dev
);
796 int oif
= skb
->dev
->ifindex
;
797 u8 tos
= RT_TOS(iph
->tos
);
798 u8 prot
= iph
->protocol
;
799 u32 mark
= skb
->mark
;
801 rt
= (struct rtable
*) dst
;
803 __build_flow_key(net
, &fl4
, sk
, iph
, oif
, tos
, prot
, mark
, 0);
804 __ip_do_redirect(rt
, skb
, &fl4
, true);
807 static struct dst_entry
*ipv4_negative_advice(struct dst_entry
*dst
)
809 struct rtable
*rt
= (struct rtable
*)dst
;
810 struct dst_entry
*ret
= dst
;
813 if (dst
->obsolete
> 0) {
816 } else if ((rt
->rt_flags
& RTCF_REDIRECTED
) ||
827 * 1. The first ip_rt_redirect_number redirects are sent
828 * with exponential backoff, then we stop sending them at all,
829 * assuming that the host ignores our redirects.
830 * 2. If we did not see packets requiring redirects
831 * during ip_rt_redirect_silence, we assume that the host
832 * forgot redirected route and start to send redirects again.
834 * This algorithm is much cheaper and more intelligent than dumb load limiting
837 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
838 * and "frag. need" (breaks PMTU discovery) in icmp.c.
841 void ip_rt_send_redirect(struct sk_buff
*skb
)
843 struct rtable
*rt
= skb_rtable(skb
);
844 struct in_device
*in_dev
;
845 struct inet_peer
*peer
;
851 in_dev
= __in_dev_get_rcu(rt
->dst
.dev
);
852 if (!in_dev
|| !IN_DEV_TX_REDIRECTS(in_dev
)) {
856 log_martians
= IN_DEV_LOG_MARTIANS(in_dev
);
857 vif
= l3mdev_master_ifindex_rcu(rt
->dst
.dev
);
860 net
= dev_net(rt
->dst
.dev
);
861 peer
= inet_getpeer_v4(net
->ipv4
.peers
, ip_hdr(skb
)->saddr
, vif
, 1);
863 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
,
864 rt_nexthop(rt
, ip_hdr(skb
)->daddr
));
868 /* No redirected packets during ip_rt_redirect_silence;
869 * reset the algorithm.
871 if (time_after(jiffies
, peer
->rate_last
+ ip_rt_redirect_silence
))
872 peer
->rate_tokens
= 0;
874 /* Too many ignored redirects; do not send anything
875 * set dst.rate_last to the last seen redirected packet.
877 if (peer
->rate_tokens
>= ip_rt_redirect_number
) {
878 peer
->rate_last
= jiffies
;
882 /* Check for load limit; set rate_last to the latest sent
885 if (peer
->rate_tokens
== 0 ||
888 (ip_rt_redirect_load
<< peer
->rate_tokens
)))) {
889 __be32 gw
= rt_nexthop(rt
, ip_hdr(skb
)->daddr
);
891 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
, gw
);
892 peer
->rate_last
= jiffies
;
894 #ifdef CONFIG_IP_ROUTE_VERBOSE
896 peer
->rate_tokens
== ip_rt_redirect_number
)
897 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
898 &ip_hdr(skb
)->saddr
, inet_iif(skb
),
899 &ip_hdr(skb
)->daddr
, &gw
);
906 static int ip_error(struct sk_buff
*skb
)
908 struct in_device
*in_dev
= __in_dev_get_rcu(skb
->dev
);
909 struct rtable
*rt
= skb_rtable(skb
);
910 struct inet_peer
*peer
;
916 /* IP on this device is disabled. */
920 net
= dev_net(rt
->dst
.dev
);
921 if (!IN_DEV_FORWARD(in_dev
)) {
922 switch (rt
->dst
.error
) {
924 IP_INC_STATS_BH(net
, IPSTATS_MIB_INADDRERRORS
);
928 IP_INC_STATS_BH(net
, IPSTATS_MIB_INNOROUTES
);
934 switch (rt
->dst
.error
) {
939 code
= ICMP_HOST_UNREACH
;
942 code
= ICMP_NET_UNREACH
;
943 IP_INC_STATS_BH(net
, IPSTATS_MIB_INNOROUTES
);
946 code
= ICMP_PKT_FILTERED
;
950 peer
= inet_getpeer_v4(net
->ipv4
.peers
, ip_hdr(skb
)->saddr
,
951 l3mdev_master_ifindex(skb
->dev
), 1);
956 peer
->rate_tokens
+= now
- peer
->rate_last
;
957 if (peer
->rate_tokens
> ip_rt_error_burst
)
958 peer
->rate_tokens
= ip_rt_error_burst
;
959 peer
->rate_last
= now
;
960 if (peer
->rate_tokens
>= ip_rt_error_cost
)
961 peer
->rate_tokens
-= ip_rt_error_cost
;
967 icmp_send(skb
, ICMP_DEST_UNREACH
, code
, 0);
973 static void __ip_rt_update_pmtu(struct rtable
*rt
, struct flowi4
*fl4
, u32 mtu
)
975 struct dst_entry
*dst
= &rt
->dst
;
976 struct fib_result res
;
978 if (dst_metric_locked(dst
, RTAX_MTU
))
981 if (ipv4_mtu(dst
) < mtu
)
984 if (mtu
< ip_rt_min_pmtu
)
985 mtu
= ip_rt_min_pmtu
;
987 if (rt
->rt_pmtu
== mtu
&&
988 time_before(jiffies
, dst
->expires
- ip_rt_mtu_expires
/ 2))
992 if (fib_lookup(dev_net(dst
->dev
), fl4
, &res
, 0) == 0) {
993 struct fib_nh
*nh
= &FIB_RES_NH(res
);
995 update_or_create_fnhe(nh
, fl4
->daddr
, 0, mtu
,
996 jiffies
+ ip_rt_mtu_expires
);
1001 static void ip_rt_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
1002 struct sk_buff
*skb
, u32 mtu
)
1004 struct rtable
*rt
= (struct rtable
*) dst
;
1007 ip_rt_build_flow_key(&fl4
, sk
, skb
);
1008 __ip_rt_update_pmtu(rt
, &fl4
, mtu
);
1011 void ipv4_update_pmtu(struct sk_buff
*skb
, struct net
*net
, u32 mtu
,
1012 int oif
, u32 mark
, u8 protocol
, int flow_flags
)
1014 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1019 mark
= IP4_REPLY_MARK(net
, skb
->mark
);
1021 __build_flow_key(net
, &fl4
, NULL
, iph
, oif
,
1022 RT_TOS(iph
->tos
), protocol
, mark
, flow_flags
);
1023 rt
= __ip_route_output_key(net
, &fl4
);
1025 __ip_rt_update_pmtu(rt
, &fl4
, mtu
);
1029 EXPORT_SYMBOL_GPL(ipv4_update_pmtu
);
1031 static void __ipv4_sk_update_pmtu(struct sk_buff
*skb
, struct sock
*sk
, u32 mtu
)
1033 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1037 __build_flow_key(sock_net(sk
), &fl4
, sk
, iph
, 0, 0, 0, 0, 0);
1039 if (!fl4
.flowi4_mark
)
1040 fl4
.flowi4_mark
= IP4_REPLY_MARK(sock_net(sk
), skb
->mark
);
1042 rt
= __ip_route_output_key(sock_net(sk
), &fl4
);
1044 __ip_rt_update_pmtu(rt
, &fl4
, mtu
);
1049 void ipv4_sk_update_pmtu(struct sk_buff
*skb
, struct sock
*sk
, u32 mtu
)
1051 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1054 struct dst_entry
*odst
= NULL
;
1056 struct net
*net
= sock_net(sk
);
1060 if (!ip_sk_accept_pmtu(sk
))
1063 odst
= sk_dst_get(sk
);
1065 if (sock_owned_by_user(sk
) || !odst
) {
1066 __ipv4_sk_update_pmtu(skb
, sk
, mtu
);
1070 __build_flow_key(net
, &fl4
, sk
, iph
, 0, 0, 0, 0, 0);
1072 rt
= (struct rtable
*)odst
;
1073 if (odst
->obsolete
&& !odst
->ops
->check(odst
, 0)) {
1074 rt
= ip_route_output_flow(sock_net(sk
), &fl4
, sk
);
1081 __ip_rt_update_pmtu((struct rtable
*) rt
->dst
.path
, &fl4
, mtu
);
1083 if (!dst_check(&rt
->dst
, 0)) {
1085 dst_release(&rt
->dst
);
1087 rt
= ip_route_output_flow(sock_net(sk
), &fl4
, sk
);
1095 sk_dst_set(sk
, &rt
->dst
);
1101 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu
);
1103 void ipv4_redirect(struct sk_buff
*skb
, struct net
*net
,
1104 int oif
, u32 mark
, u8 protocol
, int flow_flags
)
1106 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1110 __build_flow_key(net
, &fl4
, NULL
, iph
, oif
,
1111 RT_TOS(iph
->tos
), protocol
, mark
, flow_flags
);
1112 rt
= __ip_route_output_key(net
, &fl4
);
1114 __ip_do_redirect(rt
, skb
, &fl4
, false);
1118 EXPORT_SYMBOL_GPL(ipv4_redirect
);
1120 void ipv4_sk_redirect(struct sk_buff
*skb
, struct sock
*sk
)
1122 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
1125 struct net
*net
= sock_net(sk
);
1127 __build_flow_key(net
, &fl4
, sk
, iph
, 0, 0, 0, 0, 0);
1128 rt
= __ip_route_output_key(net
, &fl4
);
1130 __ip_do_redirect(rt
, skb
, &fl4
, false);
1134 EXPORT_SYMBOL_GPL(ipv4_sk_redirect
);
1136 static struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
, u32 cookie
)
1138 struct rtable
*rt
= (struct rtable
*) dst
;
1140 /* All IPV4 dsts are created with ->obsolete set to the value
1141 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1142 * into this function always.
1144 * When a PMTU/redirect information update invalidates a route,
1145 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1146 * DST_OBSOLETE_DEAD by dst_free().
1148 if (dst
->obsolete
!= DST_OBSOLETE_FORCE_CHK
|| rt_is_expired(rt
))
1153 static void ipv4_link_failure(struct sk_buff
*skb
)
1157 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_HOST_UNREACH
, 0);
1159 rt
= skb_rtable(skb
);
1161 dst_set_expires(&rt
->dst
, 0);
1164 static int ip_rt_bug(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
1166 pr_debug("%s: %pI4 -> %pI4, %s\n",
1167 __func__
, &ip_hdr(skb
)->saddr
, &ip_hdr(skb
)->daddr
,
1168 skb
->dev
? skb
->dev
->name
: "?");
1175 We do not cache source address of outgoing interface,
1176 because it is used only by IP RR, TS and SRR options,
1177 so that it out of fast path.
1179 BTW remember: "addr" is allowed to be not aligned
1183 void ip_rt_get_source(u8
*addr
, struct sk_buff
*skb
, struct rtable
*rt
)
1187 if (rt_is_output_route(rt
))
1188 src
= ip_hdr(skb
)->saddr
;
1190 struct fib_result res
;
1196 memset(&fl4
, 0, sizeof(fl4
));
1197 fl4
.daddr
= iph
->daddr
;
1198 fl4
.saddr
= iph
->saddr
;
1199 fl4
.flowi4_tos
= RT_TOS(iph
->tos
);
1200 fl4
.flowi4_oif
= rt
->dst
.dev
->ifindex
;
1201 fl4
.flowi4_iif
= skb
->dev
->ifindex
;
1202 fl4
.flowi4_mark
= skb
->mark
;
1205 if (fib_lookup(dev_net(rt
->dst
.dev
), &fl4
, &res
, 0) == 0)
1206 src
= FIB_RES_PREFSRC(dev_net(rt
->dst
.dev
), res
);
1208 src
= inet_select_addr(rt
->dst
.dev
,
1209 rt_nexthop(rt
, iph
->daddr
),
1213 memcpy(addr
, &src
, 4);
1216 #ifdef CONFIG_IP_ROUTE_CLASSID
1217 static void set_class_tag(struct rtable
*rt
, u32 tag
)
1219 if (!(rt
->dst
.tclassid
& 0xFFFF))
1220 rt
->dst
.tclassid
|= tag
& 0xFFFF;
1221 if (!(rt
->dst
.tclassid
& 0xFFFF0000))
1222 rt
->dst
.tclassid
|= tag
& 0xFFFF0000;
1226 static unsigned int ipv4_default_advmss(const struct dst_entry
*dst
)
1228 unsigned int advmss
= dst_metric_raw(dst
, RTAX_ADVMSS
);
1231 advmss
= max_t(unsigned int, dst
->dev
->mtu
- 40,
1233 if (advmss
> 65535 - 40)
1234 advmss
= 65535 - 40;
1239 static unsigned int ipv4_mtu(const struct dst_entry
*dst
)
1241 const struct rtable
*rt
= (const struct rtable
*) dst
;
1242 unsigned int mtu
= rt
->rt_pmtu
;
1244 if (!mtu
|| time_after_eq(jiffies
, rt
->dst
.expires
))
1245 mtu
= dst_metric_raw(dst
, RTAX_MTU
);
1250 mtu
= dst
->dev
->mtu
;
1252 if (unlikely(dst_metric_locked(dst
, RTAX_MTU
))) {
1253 if (rt
->rt_uses_gateway
&& mtu
> 576)
1257 return min_t(unsigned int, mtu
, IP_MAX_MTU
);
1260 static struct fib_nh_exception
*find_exception(struct fib_nh
*nh
, __be32 daddr
)
1262 struct fnhe_hash_bucket
*hash
= rcu_dereference(nh
->nh_exceptions
);
1263 struct fib_nh_exception
*fnhe
;
1269 hval
= fnhe_hashfun(daddr
);
1271 for (fnhe
= rcu_dereference(hash
[hval
].chain
); fnhe
;
1272 fnhe
= rcu_dereference(fnhe
->fnhe_next
)) {
1273 if (fnhe
->fnhe_daddr
== daddr
)
1279 static bool rt_bind_exception(struct rtable
*rt
, struct fib_nh_exception
*fnhe
,
1284 spin_lock_bh(&fnhe_lock
);
1286 if (daddr
== fnhe
->fnhe_daddr
) {
1287 struct rtable __rcu
**porig
;
1288 struct rtable
*orig
;
1289 int genid
= fnhe_genid(dev_net(rt
->dst
.dev
));
1291 if (rt_is_input_route(rt
))
1292 porig
= &fnhe
->fnhe_rth_input
;
1294 porig
= &fnhe
->fnhe_rth_output
;
1295 orig
= rcu_dereference(*porig
);
1297 if (fnhe
->fnhe_genid
!= genid
) {
1298 fnhe
->fnhe_genid
= genid
;
1300 fnhe
->fnhe_pmtu
= 0;
1301 fnhe
->fnhe_expires
= 0;
1302 fnhe_flush_routes(fnhe
);
1305 fill_route_from_fnhe(rt
, fnhe
);
1306 if (!rt
->rt_gateway
)
1307 rt
->rt_gateway
= daddr
;
1309 if (!(rt
->dst
.flags
& DST_NOCACHE
)) {
1310 rcu_assign_pointer(*porig
, rt
);
1316 fnhe
->fnhe_stamp
= jiffies
;
1318 spin_unlock_bh(&fnhe_lock
);
1323 static bool rt_cache_route(struct fib_nh
*nh
, struct rtable
*rt
)
1325 struct rtable
*orig
, *prev
, **p
;
1328 if (rt_is_input_route(rt
)) {
1329 p
= (struct rtable
**)&nh
->nh_rth_input
;
1331 p
= (struct rtable
**)raw_cpu_ptr(nh
->nh_pcpu_rth_output
);
1335 prev
= cmpxchg(p
, orig
, rt
);
1345 struct uncached_list
{
1347 struct list_head head
;
1350 static DEFINE_PER_CPU_ALIGNED(struct uncached_list
, rt_uncached_list
);
1352 static void rt_add_uncached_list(struct rtable
*rt
)
1354 struct uncached_list
*ul
= raw_cpu_ptr(&rt_uncached_list
);
1356 rt
->rt_uncached_list
= ul
;
1358 spin_lock_bh(&ul
->lock
);
1359 list_add_tail(&rt
->rt_uncached
, &ul
->head
);
1360 spin_unlock_bh(&ul
->lock
);
1363 static void ipv4_dst_destroy(struct dst_entry
*dst
)
1365 struct dst_metrics
*p
= (struct dst_metrics
*)DST_METRICS_PTR(dst
);
1366 struct rtable
*rt
= (struct rtable
*) dst
;
1368 if (p
!= &dst_default_metrics
&& atomic_dec_and_test(&p
->refcnt
))
1371 if (!list_empty(&rt
->rt_uncached
)) {
1372 struct uncached_list
*ul
= rt
->rt_uncached_list
;
1374 spin_lock_bh(&ul
->lock
);
1375 list_del(&rt
->rt_uncached
);
1376 spin_unlock_bh(&ul
->lock
);
/*
 * rt_flush_dev - retarget uncached routes away from a disappearing
 * device.  Walks every CPU's uncached list and, for each route whose
 * dst.dev is @dev, points dst.dev at the namespace loopback device and
 * takes a reference on it.
 * NOTE(review): lossy dump -- the 'continue' after the != dev test and
 * the matching dev_put() on the old device are presumably on dropped
 * lines; confirm against the full source.
 */
1380 void rt_flush_dev(struct net_device
*dev
)
1382 struct net
*net
= dev_net(dev
);
1386 for_each_possible_cpu(cpu
) {
1387 struct uncached_list
*ul
= &per_cpu(rt_uncached_list
, cpu
);
1389 spin_lock_bh(&ul
->lock
);
1390 list_for_each_entry(rt
, &ul
->head
, rt_uncached
) {
/* Skip routes bound to other devices. */
1391 if (rt
->dst
.dev
!= dev
)
/* Re-home the route onto loopback and pin the new device. */
1393 rt
->dst
.dev
= net
->loopback_dev
;
1394 dev_hold(rt
->dst
.dev
);
1397 spin_unlock_bh(&ul
->lock
);
/*
 * rt_cache_valid - is this cached rtable still usable?
 * Visible condition: dst.obsolete must equal DST_OBSOLETE_FORCE_CHK;
 * the remaining conjuncts (NULL check / generation check via
 * rt_is_expired(), presumably) are on dropped lines -- confirm.
 */
1401 static bool rt_cache_valid(const struct rtable
*rt
)
1404 rt
->dst
.obsolete
== DST_OBSOLETE_FORCE_CHK
&&
1408 static void rt_set_nexthop(struct rtable
*rt
, __be32 daddr
,
1409 const struct fib_result
*res
,
1410 struct fib_nh_exception
*fnhe
,
1411 struct fib_info
*fi
, u16 type
, u32 itag
)
1413 bool cached
= false;
1416 struct fib_nh
*nh
= &FIB_RES_NH(*res
);
1418 if (nh
->nh_gw
&& nh
->nh_scope
== RT_SCOPE_LINK
) {
1419 rt
->rt_gateway
= nh
->nh_gw
;
1420 rt
->rt_uses_gateway
= 1;
1422 dst_init_metrics(&rt
->dst
, fi
->fib_metrics
->metrics
, true);
1423 if (fi
->fib_metrics
!= &dst_default_metrics
) {
1424 rt
->dst
._metrics
|= DST_METRICS_REFCOUNTED
;
1425 atomic_inc(&fi
->fib_metrics
->refcnt
);
1427 #ifdef CONFIG_IP_ROUTE_CLASSID
1428 rt
->dst
.tclassid
= nh
->nh_tclassid
;
1430 rt
->dst
.lwtstate
= lwtstate_get(nh
->nh_lwtstate
);
1432 cached
= rt_bind_exception(rt
, fnhe
, daddr
);
1433 else if (!(rt
->dst
.flags
& DST_NOCACHE
))
1434 cached
= rt_cache_route(nh
, rt
);
1435 if (unlikely(!cached
)) {
1436 /* Routes we intend to cache in nexthop exception or
1437 * FIB nexthop have the DST_NOCACHE bit clear.
1438 * However, if we are unsuccessful at storing this
1439 * route into the cache we really need to set it.
1441 rt
->dst
.flags
|= DST_NOCACHE
;
1442 if (!rt
->rt_gateway
)
1443 rt
->rt_gateway
= daddr
;
1444 rt_add_uncached_list(rt
);
1447 rt_add_uncached_list(rt
);
1449 #ifdef CONFIG_IP_ROUTE_CLASSID
1450 #ifdef CONFIG_IP_MULTIPLE_TABLES
1451 set_class_tag(rt
, res
->tclassid
);
1453 set_class_tag(rt
, itag
);
/*
 * rt_dst_alloc - allocate and minimally initialise an IPv4 rtable.
 * Builds the dst flag word from the will_cache/nopolicy/noxfrm
 * arguments (non-cached routes get DST_HOST|DST_NOCACHE), stamps the
 * route with the current generation id, and sets the default output
 * handler to ip_output (ip_local_deliver as input for RTCF_LOCAL).
 * NOTE(review): lossy dump -- the NULL check after dst_alloc() and
 * several field initialisations are on dropped lines.
 */
1457 static struct rtable
*rt_dst_alloc(struct net_device
*dev
,
1458 unsigned int flags
, u16 type
,
1459 bool nopolicy
, bool noxfrm
, bool will_cache
)
1463 rt
= dst_alloc(&ipv4_dst_ops
, dev
, 1, DST_OBSOLETE_FORCE_CHK
,
1464 (will_cache
? 0 : (DST_HOST
| DST_NOCACHE
)) |
1465 (nopolicy
? DST_NOPOLICY
: 0) |
1466 (noxfrm
? DST_NOXFRM
: 0));
/* Stamp with the current per-netns route generation. */
1469 rt
->rt_genid
= rt_genid_ipv4(dev_net(dev
));
1470 rt
->rt_flags
= flags
;
1472 rt
->rt_is_input
= 0;
1476 rt
->rt_uses_gateway
= 0;
1477 rt
->rt_table_id
= 0;
1478 INIT_LIST_HEAD(&rt
->rt_uncached
);
/* Default handlers; callers override for special route types. */
1480 rt
->dst
.output
= ip_output
;
1481 if (flags
& RTCF_LOCAL
)
1482 rt
->dst
.input
= ip_local_deliver
;
/*
 * ip_route_input_mc - build an input route for a multicast packet.
 * Sanity-checks the source address (no multicast/broadcast source, no
 * loopback source unless route_localnet, zeronet source only for
 * link-local multicast), validates it against the FIB, then allocates
 * an RTN_MULTICAST route on the namespace loopback device.  Output is
 * ip_rt_bug (multicast input routes must never be used for output);
 * input becomes ip_mr_input when multicast forwarding is enabled.
 * NOTE(review): lossy dump -- error labels and the 'our'-based
 * RTCF_LOCAL decision context are partially missing.
 */
1488 /* called in rcu_read_lock() section */
1489 static int ip_route_input_mc(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1490 u8 tos
, struct net_device
*dev
, int our
)
1493 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
1494 unsigned int flags
= RTCF_MULTICAST
;
1498 /* Primary sanity checks. */
1503 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
) ||
1504 skb
->protocol
!= htons(ETH_P_IP
))
1507 if (ipv4_is_loopback(saddr
) && !IN_DEV_ROUTE_LOCALNET(in_dev
))
1510 if (ipv4_is_zeronet(saddr
)) {
1511 if (!ipv4_is_local_multicast(daddr
))
/* Source must be reachable per the FIB (reverse-path check). */
1514 err
= fib_validate_source(skb
, saddr
, 0, tos
, 0, dev
,
1520 flags
|= RTCF_LOCAL
;
1522 rth
= rt_dst_alloc(dev_net(dev
)->loopback_dev
, flags
, RTN_MULTICAST
,
1523 IN_DEV_CONF_GET(in_dev
, NOPOLICY
), false, false);
1527 #ifdef CONFIG_IP_ROUTE_CLASSID
1528 rth
->dst
.tclassid
= itag
;
/* Using this route for output is a bug -- trap it. */
1530 rth
->dst
.output
= ip_rt_bug
;
1531 rth
->rt_is_input
= 1;
1533 #ifdef CONFIG_IP_MROUTE
1534 if (!ipv4_is_local_multicast(daddr
) && IN_DEV_MFORWARD(in_dev
))
1535 rth
->dst
.input
= ip_mr_input
;
1537 RT_CACHE_STAT_INC(in_slow_mc
);
1539 skb_dst_set(skb
, &rth
->dst
);
/*
 * ip_handle_martian_source - account and (optionally) log a packet
 * whose source address failed validation.  Bumps the in_martian_src
 * counter; when CONFIG_IP_ROUTE_VERBOSE and log_martians are enabled,
 * rate-limited-logs the addresses and hex-dumps the link-layer header
 * (per the RFC 1812 note below, the MAC header is the only hint to
 * where the packet came from).
 */
1551 static void ip_handle_martian_source(struct net_device
*dev
,
1552 struct in_device
*in_dev
,
1553 struct sk_buff
*skb
,
1557 RT_CACHE_STAT_INC(in_martian_src
);
1558 #ifdef CONFIG_IP_ROUTE_VERBOSE
1559 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit()) {
1561 * RFC1812 recommendation, if source is martian,
1562 * the only hint is MAC header.
1564 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1565 &daddr
, &saddr
, dev
->name
);
1566 if (dev
->hard_header_len
&& skb_mac_header_was_set(skb
)) {
1567 print_hex_dump(KERN_WARNING
, "ll header: ",
1568 DUMP_PREFIX_OFFSET
, 16, 1,
1569 skb_mac_header(skb
),
1570 dev
->hard_header_len
, true);
/*
 * ip_del_fnhe - delete the nexthop exception for @daddr, if present.
 * Under fnhe_lock: hash daddr into the nexthop's exception table, walk
 * that chain, unlink the matching entry with rcu_assign_pointer(),
 * flush any routes bound to it, and free it after a grace period via
 * kfree_rcu().  NOTE(review): the loop construct and hash[hval]
 * indexing surrounding these statements are on dropped lines.
 */
1576 static void ip_del_fnhe(struct fib_nh
*nh
, __be32 daddr
)
1578 struct fnhe_hash_bucket
*hash
;
1579 struct fib_nh_exception
*fnhe
, __rcu
**fnhe_p
;
1580 u32 hval
= fnhe_hashfun(daddr
);
1582 spin_lock_bh(&fnhe_lock
);
1584 hash
= rcu_dereference_protected(nh
->nh_exceptions
,
1585 lockdep_is_held(&fnhe_lock
));
1588 fnhe_p
= &hash
->chain
;
1589 fnhe
= rcu_dereference_protected(*fnhe_p
, lockdep_is_held(&fnhe_lock
));
1591 if (fnhe
->fnhe_daddr
== daddr
) {
/* Unlink, flush bound routes, and RCU-free the exception. */
1592 rcu_assign_pointer(*fnhe_p
, rcu_dereference_protected(
1593 fnhe
->fnhe_next
, lockdep_is_held(&fnhe_lock
)));
1594 fnhe_flush_routes(fnhe
);
1595 kfree_rcu(fnhe
, rcu
);
/* Not a match: advance to the next chain entry. */
1598 fnhe_p
= &fnhe
->fnhe_next
;
1599 fnhe
= rcu_dereference_protected(fnhe
->fnhe_next
,
1600 lockdep_is_held(&fnhe_lock
));
1603 spin_unlock_bh(&fnhe_lock
);
1606 /* called in rcu_read_lock() section */
1607 static int __mkroute_input(struct sk_buff
*skb
,
1608 const struct fib_result
*res
,
1609 struct in_device
*in_dev
,
1610 __be32 daddr
, __be32 saddr
, u32 tos
)
1612 struct fib_nh_exception
*fnhe
;
1615 struct in_device
*out_dev
;
1619 /* get a working reference to the output device */
1620 out_dev
= __in_dev_get_rcu(FIB_RES_DEV(*res
));
1622 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1626 err
= fib_validate_source(skb
, saddr
, daddr
, tos
, FIB_RES_OIF(*res
),
1627 in_dev
->dev
, in_dev
, &itag
);
1629 ip_handle_martian_source(in_dev
->dev
, in_dev
, skb
, daddr
,
1635 do_cache
= res
->fi
&& !itag
;
1636 if (out_dev
== in_dev
&& err
&& IN_DEV_TX_REDIRECTS(out_dev
) &&
1637 skb
->protocol
== htons(ETH_P_IP
) &&
1638 (IN_DEV_SHARED_MEDIA(out_dev
) ||
1639 inet_addr_onlink(out_dev
, saddr
, FIB_RES_GW(*res
))))
1640 IPCB(skb
)->flags
|= IPSKB_DOREDIRECT
;
1642 if (skb
->protocol
!= htons(ETH_P_IP
)) {
1643 /* Not IP (i.e. ARP). Do not create route, if it is
1644 * invalid for proxy arp. DNAT routes are always valid.
1646 * Proxy arp feature have been extended to allow, ARP
1647 * replies back to the same interface, to support
1648 * Private VLAN switch technologies. See arp.c.
1650 if (out_dev
== in_dev
&&
1651 IN_DEV_PROXY_ARP_PVLAN(in_dev
) == 0) {
1657 fnhe
= find_exception(&FIB_RES_NH(*res
), daddr
);
1660 rth
= rcu_dereference(fnhe
->fnhe_rth_input
);
1661 if (rth
&& rth
->dst
.expires
&&
1662 time_after(jiffies
, rth
->dst
.expires
)) {
1663 ip_del_fnhe(&FIB_RES_NH(*res
), daddr
);
1670 rth
= rcu_dereference(FIB_RES_NH(*res
).nh_rth_input
);
1673 if (rt_cache_valid(rth
)) {
1674 skb_dst_set_noref(skb
, &rth
->dst
);
1679 rth
= rt_dst_alloc(out_dev
->dev
, 0, res
->type
,
1680 IN_DEV_CONF_GET(in_dev
, NOPOLICY
),
1681 IN_DEV_CONF_GET(out_dev
, NOXFRM
), do_cache
);
1687 rth
->rt_is_input
= 1;
1689 rth
->rt_table_id
= res
->table
->tb_id
;
1690 RT_CACHE_STAT_INC(in_slow_tot
);
1692 rth
->dst
.input
= ip_forward
;
1694 rt_set_nexthop(rth
, daddr
, res
, fnhe
, res
->fi
, res
->type
, itag
);
1695 if (lwtunnel_output_redirect(rth
->dst
.lwtstate
)) {
1696 rth
->dst
.lwtstate
->orig_output
= rth
->dst
.output
;
1697 rth
->dst
.output
= lwtunnel_output
;
1699 if (lwtunnel_input_redirect(rth
->dst
.lwtstate
)) {
1700 rth
->dst
.lwtstate
->orig_input
= rth
->dst
.input
;
1701 rth
->dst
.input
= lwtunnel_input
;
1703 skb_dst_set(skb
, &rth
->dst
);
1710 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1712 /* To make ICMP packets follow the right flow, the multipath hash is
1713 * calculated from the inner IP addresses in reverse order.
/*
 * ip_multipath_icmp_hash - multipath hash for ICMP error packets.
 * For relevant ICMP error types, hashes the embedded (inner) IP
 * header's addresses with daddr/saddr swapped relative to the outer
 * header, so the ICMP error follows the same path as the flow it
 * refers to.  Fragmented or non-error ICMP falls back to hashing the
 * outer saddr/daddr.  NOTE(review): the early-return fallbacks for
 * !icmph / !inner_iph are on dropped lines.
 */
1715 static int ip_multipath_icmp_hash(struct sk_buff
*skb
)
1717 const struct iphdr
*outer_iph
= ip_hdr(skb
);
1718 struct icmphdr _icmph
;
1719 const struct icmphdr
*icmph
;
1720 struct iphdr _inner_iph
;
1721 const struct iphdr
*inner_iph
;
/* Only the first fragment carries the ICMP header. */
1723 if (unlikely((outer_iph
->frag_off
& htons(IP_OFFSET
)) != 0))
1726 icmph
= skb_header_pointer(skb
, outer_iph
->ihl
* 4, sizeof(_icmph
),
1731 if (icmph
->type
!= ICMP_DEST_UNREACH
&&
1732 icmph
->type
!= ICMP_REDIRECT
&&
1733 icmph
->type
!= ICMP_TIME_EXCEEDED
&&
1734 icmph
->type
!= ICMP_PARAMETERPROB
) {
1738 inner_iph
= skb_header_pointer(skb
,
1739 outer_iph
->ihl
* 4 + sizeof(_icmph
),
1740 sizeof(_inner_iph
), &_inner_iph
);
/* Inner addresses in reverse order: hash matches the original flow. */
1744 return fib_multipath_hash(inner_iph
->daddr
, inner_iph
->saddr
);
1747 return fib_multipath_hash(outer_iph
->saddr
, outer_iph
->daddr
);
1750 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
/*
 * ip_mkroute_input - finish input-route creation for a unicast packet.
 * With CONFIG_IP_ROUTE_MULTIPATH and more than one nexthop, picks a
 * nexthop by flow hash first (ICMP errors hash the embedded flow via
 * ip_multipath_icmp_hash()), then delegates to __mkroute_input() to
 * build the cache entry.
 */
1752 static int ip_mkroute_input(struct sk_buff
*skb
,
1753 struct fib_result
*res
,
1754 const struct flowi4
*fl4
,
1755 struct in_device
*in_dev
,
1756 __be32 daddr
, __be32 saddr
, u32 tos
)
1758 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1759 if (res
->fi
&& res
->fi
->fib_nhs
> 1) {
1762 if (unlikely(ip_hdr(skb
)->protocol
== IPPROTO_ICMP
))
1763 h
= ip_multipath_icmp_hash(skb
);
1765 h
= fib_multipath_hash(saddr
, daddr
);
1766 fib_select_multipath(res
, h
);
1770 /* create a routing cache entry */
1771 return __mkroute_input(skb
, res
, in_dev
, daddr
, saddr
, tos
);
1775 * NOTE. We drop all the packets that has local source
1776 * addresses, because every properly looped back packet
1777 * must have correct destination already attached by output routine.
1779 * Such approach solves two big problems:
1780 * 1. Not simplex devices are handled properly.
1781 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1782 * called with rcu_read_lock()
1785 static int ip_route_input_slow(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1786 u8 tos
, struct net_device
*dev
)
1788 struct fib_result res
;
1789 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
1790 struct ip_tunnel_info
*tun_info
;
1792 unsigned int flags
= 0;
1796 struct net
*net
= dev_net(dev
);
1799 /* IP on this device is disabled. */
1804 /* Check for the most weird martians, which can be not detected
1808 tun_info
= skb_tunnel_info(skb
);
1809 if (tun_info
&& !(tun_info
->mode
& IP_TUNNEL_INFO_TX
))
1810 fl4
.flowi4_tun_key
.tun_id
= tun_info
->key
.tun_id
;
1812 fl4
.flowi4_tun_key
.tun_id
= 0;
1815 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
))
1816 goto martian_source
;
1820 if (ipv4_is_lbcast(daddr
) || (saddr
== 0 && daddr
== 0))
1823 /* Accept zero addresses only to limited broadcast;
1824 * I even do not know to fix it or not. Waiting for complains :-)
1826 if (ipv4_is_zeronet(saddr
))
1827 goto martian_source
;
1829 if (ipv4_is_zeronet(daddr
))
1830 goto martian_destination
;
1832 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1833 * and call it once if daddr or/and saddr are loopback addresses
1835 if (ipv4_is_loopback(daddr
)) {
1836 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev
, net
))
1837 goto martian_destination
;
1838 } else if (ipv4_is_loopback(saddr
)) {
1839 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev
, net
))
1840 goto martian_source
;
1844 * Now we are ready to route packet.
1847 fl4
.flowi4_iif
= l3mdev_fib_oif_rcu(dev
);
1848 fl4
.flowi4_mark
= skb
->mark
;
1849 fl4
.flowi4_tos
= tos
;
1850 fl4
.flowi4_scope
= RT_SCOPE_UNIVERSE
;
1851 fl4
.flowi4_flags
= 0;
1854 err
= fib_lookup(net
, &fl4
, &res
, 0);
1856 if (!IN_DEV_FORWARD(in_dev
))
1857 err
= -EHOSTUNREACH
;
1861 if (res
.type
== RTN_BROADCAST
)
1864 if (res
.type
== RTN_LOCAL
) {
1865 err
= fib_validate_source(skb
, saddr
, daddr
, tos
,
1866 0, dev
, in_dev
, &itag
);
1868 goto martian_source
;
1872 if (!IN_DEV_FORWARD(in_dev
)) {
1873 err
= -EHOSTUNREACH
;
1876 if (res
.type
!= RTN_UNICAST
)
1877 goto martian_destination
;
1879 err
= ip_mkroute_input(skb
, &res
, &fl4
, in_dev
, daddr
, saddr
, tos
);
1883 if (skb
->protocol
!= htons(ETH_P_IP
))
1886 if (!ipv4_is_zeronet(saddr
)) {
1887 err
= fib_validate_source(skb
, saddr
, 0, tos
, 0, dev
,
1890 goto martian_source
;
1892 flags
|= RTCF_BROADCAST
;
1893 res
.type
= RTN_BROADCAST
;
1894 RT_CACHE_STAT_INC(in_brd
);
1900 rth
= rcu_dereference(FIB_RES_NH(res
).nh_rth_input
);
1901 if (rt_cache_valid(rth
)) {
1902 skb_dst_set_noref(skb
, &rth
->dst
);
1910 rth
= rt_dst_alloc(net
->loopback_dev
, flags
| RTCF_LOCAL
, res
.type
,
1911 IN_DEV_CONF_GET(in_dev
, NOPOLICY
), false, do_cache
);
1915 rth
->dst
.output
= ip_rt_bug
;
1916 #ifdef CONFIG_IP_ROUTE_CLASSID
1917 rth
->dst
.tclassid
= itag
;
1919 rth
->rt_is_input
= 1;
1921 rth
->rt_table_id
= res
.table
->tb_id
;
1923 RT_CACHE_STAT_INC(in_slow_tot
);
1924 if (res
.type
== RTN_UNREACHABLE
) {
1925 rth
->dst
.input
= ip_error
;
1926 rth
->dst
.error
= -err
;
1927 rth
->rt_flags
&= ~RTCF_LOCAL
;
1930 if (unlikely(!rt_cache_route(&FIB_RES_NH(res
), rth
))) {
1931 rth
->dst
.flags
|= DST_NOCACHE
;
1932 rt_add_uncached_list(rth
);
1935 skb_dst_set(skb
, &rth
->dst
);
1940 RT_CACHE_STAT_INC(in_no_route
);
1941 res
.type
= RTN_UNREACHABLE
;
1947 * Do not cache martian addresses: they should be logged (RFC1812)
1949 martian_destination
:
1950 RT_CACHE_STAT_INC(in_martian_dst
);
1951 #ifdef CONFIG_IP_ROUTE_VERBOSE
1952 if (IN_DEV_LOG_MARTIANS(in_dev
))
1953 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1954 &daddr
, &saddr
, dev
->name
);
1966 ip_handle_martian_source(dev
, in_dev
, skb
, daddr
, saddr
);
/*
 * ip_route_input_noref - public entry point for input routing.
 * Masks @tos down to the routing bits, then dispatches: multicast
 * destinations go through the software multicast filter and
 * ip_route_input_mc(); everything else takes the slow path,
 * ip_route_input_slow().  NOTE(review): lossy dump -- the
 * rcu_read_lock()/unlock() bracket and the 'our' dispatch conditions
 * are partially missing here.
 */
1970 int ip_route_input_noref(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1971 u8 tos
, struct net_device
*dev
)
1975 tos
&= IPTOS_RT_MASK
;
1978 /* Multicast recognition logic is moved from route cache to here.
1979 The problem was that too many Ethernet cards have broken/missing
1980 hardware multicast filters :-( As result the host on multicasting
1981 network acquires a lot of useless route cache entries, sort of
1982 SDR messages from all the world. Now we try to get rid of them.
1983 Really, provided software IP multicast filter is organized
1984 reasonably (at least, hashed), it does not result in a slowdown
1985 comparing with route cache reject entries.
1986 Note, that multicast routers are not affected, because
1987 route cache entry is created eventually.
1989 if (ipv4_is_multicast(daddr
)) {
1990 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
/* Software multicast membership check. */
1993 int our
= ip_check_mc_rcu(in_dev
, daddr
, saddr
,
1994 ip_hdr(skb
)->protocol
);
1996 #ifdef CONFIG_IP_MROUTE
1998 (!ipv4_is_local_multicast(daddr
) &&
1999 IN_DEV_MFORWARD(in_dev
))
2002 int res
= ip_route_input_mc(skb
, daddr
, saddr
,
2011 res
= ip_route_input_slow(skb
, daddr
, saddr
, tos
, dev
);
2015 EXPORT_SYMBOL(ip_route_input_noref
);
2017 /* called with rcu_read_lock() */
2018 static struct rtable
*__mkroute_output(const struct fib_result
*res
,
2019 const struct flowi4
*fl4
, int orig_oif
,
2020 struct net_device
*dev_out
,
2023 struct fib_info
*fi
= res
->fi
;
2024 struct fib_nh_exception
*fnhe
;
2025 struct in_device
*in_dev
;
2026 u16 type
= res
->type
;
2030 in_dev
= __in_dev_get_rcu(dev_out
);
2032 return ERR_PTR(-EINVAL
);
2034 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev
)))
2035 if (ipv4_is_loopback(fl4
->saddr
) && !(dev_out
->flags
& IFF_LOOPBACK
))
2036 return ERR_PTR(-EINVAL
);
2038 if (ipv4_is_lbcast(fl4
->daddr
))
2039 type
= RTN_BROADCAST
;
2040 else if (ipv4_is_multicast(fl4
->daddr
))
2041 type
= RTN_MULTICAST
;
2042 else if (ipv4_is_zeronet(fl4
->daddr
))
2043 return ERR_PTR(-EINVAL
);
2045 if (dev_out
->flags
& IFF_LOOPBACK
)
2046 flags
|= RTCF_LOCAL
;
2049 if (type
== RTN_BROADCAST
) {
2050 flags
|= RTCF_BROADCAST
| RTCF_LOCAL
;
2052 } else if (type
== RTN_MULTICAST
) {
2053 flags
|= RTCF_MULTICAST
| RTCF_LOCAL
;
2054 if (!ip_check_mc_rcu(in_dev
, fl4
->daddr
, fl4
->saddr
,
2056 flags
&= ~RTCF_LOCAL
;
2059 /* If multicast route do not exist use
2060 * default one, but do not gateway in this case.
2063 if (fi
&& res
->prefixlen
< 4)
2065 } else if ((type
== RTN_LOCAL
) && (orig_oif
!= 0) &&
2066 (orig_oif
!= dev_out
->ifindex
)) {
2067 /* For local routes that require a particular output interface
2068 * we do not want to cache the result. Caching the result
2069 * causes incorrect behaviour when there are multiple source
2070 * addresses on the interface, the end result being that if the
2071 * intended recipient is waiting on that interface for the
2072 * packet he won't receive it because it will be delivered on
2073 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2074 * be set to the loopback interface as well.
2080 do_cache
&= fi
!= NULL
;
2082 struct rtable __rcu
**prth
;
2083 struct fib_nh
*nh
= &FIB_RES_NH(*res
);
2085 fnhe
= find_exception(nh
, fl4
->daddr
);
2087 prth
= &fnhe
->fnhe_rth_output
;
2088 rth
= rcu_dereference(*prth
);
2089 if (rth
&& rth
->dst
.expires
&&
2090 time_after(jiffies
, rth
->dst
.expires
)) {
2091 ip_del_fnhe(nh
, fl4
->daddr
);
2098 if (unlikely(fl4
->flowi4_flags
&
2099 FLOWI_FLAG_KNOWN_NH
&&
2101 nh
->nh_scope
== RT_SCOPE_LINK
))) {
2105 prth
= raw_cpu_ptr(nh
->nh_pcpu_rth_output
);
2106 rth
= rcu_dereference(*prth
);
2109 if (rt_cache_valid(rth
)) {
2110 dst_hold(&rth
->dst
);
2116 rth
= rt_dst_alloc(dev_out
, flags
, type
,
2117 IN_DEV_CONF_GET(in_dev
, NOPOLICY
),
2118 IN_DEV_CONF_GET(in_dev
, NOXFRM
),
2121 return ERR_PTR(-ENOBUFS
);
2123 rth
->rt_iif
= orig_oif
? : 0;
2125 rth
->rt_table_id
= res
->table
->tb_id
;
2127 RT_CACHE_STAT_INC(out_slow_tot
);
2129 if (flags
& (RTCF_BROADCAST
| RTCF_MULTICAST
)) {
2130 if (flags
& RTCF_LOCAL
&&
2131 !(dev_out
->flags
& IFF_LOOPBACK
)) {
2132 rth
->dst
.output
= ip_mc_output
;
2133 RT_CACHE_STAT_INC(out_slow_mc
);
2135 #ifdef CONFIG_IP_MROUTE
2136 if (type
== RTN_MULTICAST
) {
2137 if (IN_DEV_MFORWARD(in_dev
) &&
2138 !ipv4_is_local_multicast(fl4
->daddr
)) {
2139 rth
->dst
.input
= ip_mr_input
;
2140 rth
->dst
.output
= ip_mc_output
;
2146 rt_set_nexthop(rth
, fl4
->daddr
, res
, fnhe
, fi
, type
, 0);
2147 if (lwtunnel_output_redirect(rth
->dst
.lwtstate
))
2148 rth
->dst
.output
= lwtunnel_output
;
2154 * Major route resolver routine.
2157 struct rtable
*__ip_route_output_key_hash(struct net
*net
, struct flowi4
*fl4
,
2160 struct net_device
*dev_out
= NULL
;
2161 __u8 tos
= RT_FL_TOS(fl4
);
2162 unsigned int flags
= 0;
2163 struct fib_result res
;
2166 int err
= -ENETUNREACH
;
2172 orig_oif
= fl4
->flowi4_oif
;
2174 fl4
->flowi4_iif
= LOOPBACK_IFINDEX
;
2175 fl4
->flowi4_tos
= tos
& IPTOS_RT_MASK
;
2176 fl4
->flowi4_scope
= ((tos
& RTO_ONLINK
) ?
2177 RT_SCOPE_LINK
: RT_SCOPE_UNIVERSE
);
2181 rth
= ERR_PTR(-EINVAL
);
2182 if (ipv4_is_multicast(fl4
->saddr
) ||
2183 ipv4_is_lbcast(fl4
->saddr
) ||
2184 ipv4_is_zeronet(fl4
->saddr
))
2187 /* I removed check for oif == dev_out->oif here.
2188 It was wrong for two reasons:
2189 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2190 is assigned to multiple interfaces.
2191 2. Moreover, we are allowed to send packets with saddr
2192 of another iface. --ANK
2195 if (fl4
->flowi4_oif
== 0 &&
2196 (ipv4_is_multicast(fl4
->daddr
) ||
2197 ipv4_is_lbcast(fl4
->daddr
))) {
2198 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2199 dev_out
= __ip_dev_find(net
, fl4
->saddr
, false);
2203 /* Special hack: user can direct multicasts
2204 and limited broadcast via necessary interface
2205 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2206 This hack is not just for fun, it allows
2207 vic,vat and friends to work.
2208 They bind socket to loopback, set ttl to zero
2209 and expect that it will work.
2210 From the viewpoint of routing cache they are broken,
2211 because we are not allowed to build multicast path
2212 with loopback source addr (look, routing cache
2213 cannot know, that ttl is zero, so that packet
2214 will not leave this host and route is valid).
2215 Luckily, this hack is good workaround.
2218 fl4
->flowi4_oif
= dev_out
->ifindex
;
2222 if (!(fl4
->flowi4_flags
& FLOWI_FLAG_ANYSRC
)) {
2223 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2224 if (!__ip_dev_find(net
, fl4
->saddr
, false))
2230 if (fl4
->flowi4_oif
) {
2231 dev_out
= dev_get_by_index_rcu(net
, fl4
->flowi4_oif
);
2232 rth
= ERR_PTR(-ENODEV
);
2236 /* RACE: Check return value of inet_select_addr instead. */
2237 if (!(dev_out
->flags
& IFF_UP
) || !__in_dev_get_rcu(dev_out
)) {
2238 rth
= ERR_PTR(-ENETUNREACH
);
2241 if (ipv4_is_local_multicast(fl4
->daddr
) ||
2242 ipv4_is_lbcast(fl4
->daddr
) ||
2243 fl4
->flowi4_proto
== IPPROTO_IGMP
) {
2245 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2250 if (ipv4_is_multicast(fl4
->daddr
))
2251 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2253 else if (!fl4
->daddr
)
2254 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2258 rth
= l3mdev_get_rtable(dev_out
, fl4
);
2264 fl4
->daddr
= fl4
->saddr
;
2266 fl4
->daddr
= fl4
->saddr
= htonl(INADDR_LOOPBACK
);
2267 dev_out
= net
->loopback_dev
;
2268 fl4
->flowi4_oif
= LOOPBACK_IFINDEX
;
2269 res
.type
= RTN_LOCAL
;
2270 flags
|= RTCF_LOCAL
;
2274 err
= fib_lookup(net
, fl4
, &res
, 0);
2278 if (fl4
->flowi4_oif
&&
2279 !netif_index_is_l3_master(net
, fl4
->flowi4_oif
)) {
2280 /* Apparently, routing tables are wrong. Assume,
2281 that the destination is on link.
2284 Because we are allowed to send to iface
2285 even if it has NO routes and NO assigned
2286 addresses. When oif is specified, routing
2287 tables are looked up with only one purpose:
2288 to catch if destination is gatewayed, rather than
2289 direct. Moreover, if MSG_DONTROUTE is set,
2290 we send packet, ignoring both routing tables
2291 and ifaddr state. --ANK
2294 We could make it even if oif is unknown,
2295 likely IPv6, but we do not.
2298 if (fl4
->saddr
== 0)
2299 fl4
->saddr
= inet_select_addr(dev_out
, 0,
2301 res
.type
= RTN_UNICAST
;
2308 if (res
.type
== RTN_LOCAL
) {
2310 if (res
.fi
->fib_prefsrc
)
2311 fl4
->saddr
= res
.fi
->fib_prefsrc
;
2313 fl4
->saddr
= fl4
->daddr
;
2315 dev_out
= net
->loopback_dev
;
2316 fl4
->flowi4_oif
= dev_out
->ifindex
;
2317 flags
|= RTCF_LOCAL
;
2321 fib_select_path(net
, &res
, fl4
, mp_hash
);
2323 dev_out
= FIB_RES_DEV(res
);
2324 fl4
->flowi4_oif
= dev_out
->ifindex
;
2328 rth
= __mkroute_output(&res
, fl4
, orig_oif
, dev_out
, flags
);
2334 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash
);
/*
 * dst_ops->check for blackhole routes.  Body is on dropped lines;
 * presumably returns NULL so the dst is always revalidated -- confirm.
 */
2336 static struct dst_entry
*ipv4_blackhole_dst_check(struct dst_entry
*dst
, u32 cookie
)
/*
 * dst_ops->mtu for blackhole routes: the raw RTAX_MTU metric if set,
 * otherwise fall back to the device MTU.
 */
2341 static unsigned int ipv4_blackhole_mtu(const struct dst_entry
*dst
)
2343 unsigned int mtu
= dst_metric_raw(dst
, RTAX_MTU
);
2345 return mtu
? : dst
->dev
->mtu
;
/* dst_ops->update_pmtu for blackhole routes: intentionally a no-op. */
2348 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
2349 struct sk_buff
*skb
, u32 mtu
)
/* dst_ops->redirect for blackhole routes: intentionally a no-op. */
2353 static void ipv4_rt_blackhole_redirect(struct dst_entry
*dst
, struct sock
*sk
,
2354 struct sk_buff
*skb
)
/*
 * dst_ops->cow_metrics for blackhole routes.  Body is on dropped
 * lines; presumably returns NULL (no writable metrics) -- confirm.
 */
2358 static u32
*ipv4_rt_blackhole_cow_metrics(struct dst_entry
*dst
,
/*
 * dst_ops vtable for blackhole routes (used by ipv4_blackhole_route):
 * PMTU/redirect updates are no-ops; mtu/advmss/neigh lookups reuse the
 * regular IPv4 helpers.
 */
2364 static struct dst_ops ipv4_dst_blackhole_ops
= {
2366 .check
= ipv4_blackhole_dst_check
,
2367 .mtu
= ipv4_blackhole_mtu
,
2368 .default_advmss
= ipv4_default_advmss
,
2369 .update_pmtu
= ipv4_rt_blackhole_update_pmtu
,
2370 .redirect
= ipv4_rt_blackhole_redirect
,
2371 .cow_metrics
= ipv4_rt_blackhole_cow_metrics
,
2372 .neigh_lookup
= ipv4_neigh_lookup
,
/*
 * ipv4_blackhole_route - clone @dst_orig into a "blackhole" dst that
 * silently discards traffic (input/output = dst_discard*) while
 * preserving the original route's identifying fields (device, iif,
 * pmtu, flags, type, gateway).  Always releases @dst_orig; returns
 * the clone or ERR_PTR(-ENOMEM) if allocation failed.
 */
2375 struct dst_entry
*ipv4_blackhole_route(struct net
*net
, struct dst_entry
*dst_orig
)
2377 struct rtable
*ort
= (struct rtable
*) dst_orig
;
2380 rt
= dst_alloc(&ipv4_dst_blackhole_ops
, NULL
, 1, DST_OBSOLETE_NONE
, 0);
2382 struct dst_entry
*new = &rt
->dst
;
/* All traffic through this dst is dropped. */
2385 new->input
= dst_discard
;
2386 new->output
= dst_discard_out
;
2388 new->dev
= ort
->dst
.dev
;
/* Copy the routing identity of the original. */
2392 rt
->rt_is_input
= ort
->rt_is_input
;
2393 rt
->rt_iif
= ort
->rt_iif
;
2394 rt
->rt_pmtu
= ort
->rt_pmtu
;
2396 rt
->rt_genid
= rt_genid_ipv4(net
);
2397 rt
->rt_flags
= ort
->rt_flags
;
2398 rt
->rt_type
= ort
->rt_type
;
2399 rt
->rt_gateway
= ort
->rt_gateway
;
2400 rt
->rt_uses_gateway
= ort
->rt_uses_gateway
;
2402 INIT_LIST_HEAD(&rt
->rt_uncached
);
/* The original reference is consumed unconditionally. */
2406 dst_release(dst_orig
);
2408 return rt
? &rt
->dst
: ERR_PTR(-ENOMEM
);
/*
 * ip_route_output_flow - output route lookup plus IPsec processing.
 * Resolves the flow with __ip_route_output_key(), then, when a
 * transport protocol is set, passes the result through
 * xfrm_lookup_route() so any matching policy can transform it.
 */
2411 struct rtable
*ip_route_output_flow(struct net
*net
, struct flowi4
*flp4
,
2412 const struct sock
*sk
)
2414 struct rtable
*rt
= __ip_route_output_key(net
, flp4
);
2419 if (flp4
->flowi4_proto
)
2420 rt
= (struct rtable
*)xfrm_lookup_route(net
, &rt
->dst
,
2421 flowi4_to_flowi(flp4
),
2426 EXPORT_SYMBOL_GPL(ip_route_output_flow
);
2428 static int rt_fill_info(struct net
*net
, __be32 dst
, __be32 src
, u32 table_id
,
2429 struct flowi4
*fl4
, struct sk_buff
*skb
, u32 portid
,
2430 u32 seq
, int event
, int nowait
, unsigned int flags
)
2432 struct rtable
*rt
= skb_rtable(skb
);
2434 struct nlmsghdr
*nlh
;
2435 unsigned long expires
= 0;
2437 u32 metrics
[RTAX_MAX
];
2439 nlh
= nlmsg_put(skb
, portid
, seq
, event
, sizeof(*r
), flags
);
2443 r
= nlmsg_data(nlh
);
2444 r
->rtm_family
= AF_INET
;
2445 r
->rtm_dst_len
= 32;
2447 r
->rtm_tos
= fl4
->flowi4_tos
;
2448 r
->rtm_table
= table_id
< 256 ? table_id
: RT_TABLE_COMPAT
;
2449 if (nla_put_u32(skb
, RTA_TABLE
, table_id
))
2450 goto nla_put_failure
;
2451 r
->rtm_type
= rt
->rt_type
;
2452 r
->rtm_scope
= RT_SCOPE_UNIVERSE
;
2453 r
->rtm_protocol
= RTPROT_UNSPEC
;
2454 r
->rtm_flags
= (rt
->rt_flags
& ~0xFFFF) | RTM_F_CLONED
;
2455 if (rt
->rt_flags
& RTCF_NOTIFY
)
2456 r
->rtm_flags
|= RTM_F_NOTIFY
;
2457 if (IPCB(skb
)->flags
& IPSKB_DOREDIRECT
)
2458 r
->rtm_flags
|= RTCF_DOREDIRECT
;
2460 if (nla_put_in_addr(skb
, RTA_DST
, dst
))
2461 goto nla_put_failure
;
2463 r
->rtm_src_len
= 32;
2464 if (nla_put_in_addr(skb
, RTA_SRC
, src
))
2465 goto nla_put_failure
;
2468 nla_put_u32(skb
, RTA_OIF
, rt
->dst
.dev
->ifindex
))
2469 goto nla_put_failure
;
2470 #ifdef CONFIG_IP_ROUTE_CLASSID
2471 if (rt
->dst
.tclassid
&&
2472 nla_put_u32(skb
, RTA_FLOW
, rt
->dst
.tclassid
))
2473 goto nla_put_failure
;
2475 if (!rt_is_input_route(rt
) &&
2476 fl4
->saddr
!= src
) {
2477 if (nla_put_in_addr(skb
, RTA_PREFSRC
, fl4
->saddr
))
2478 goto nla_put_failure
;
2480 if (rt
->rt_uses_gateway
&&
2481 nla_put_in_addr(skb
, RTA_GATEWAY
, rt
->rt_gateway
))
2482 goto nla_put_failure
;
2484 expires
= rt
->dst
.expires
;
2486 unsigned long now
= jiffies
;
2488 if (time_before(now
, expires
))
2494 memcpy(metrics
, dst_metrics_ptr(&rt
->dst
), sizeof(metrics
));
2495 if (rt
->rt_pmtu
&& expires
)
2496 metrics
[RTAX_MTU
- 1] = rt
->rt_pmtu
;
2497 if (rtnetlink_put_metrics(skb
, metrics
) < 0)
2498 goto nla_put_failure
;
2500 if (fl4
->flowi4_mark
&&
2501 nla_put_u32(skb
, RTA_MARK
, fl4
->flowi4_mark
))
2502 goto nla_put_failure
;
2504 if (!uid_eq(fl4
->flowi4_uid
, INVALID_UID
) &&
2505 nla_put_u32(skb
, RTA_UID
,
2506 from_kuid_munged(current_user_ns(), fl4
->flowi4_uid
)))
2507 goto nla_put_failure
;
2509 error
= rt
->dst
.error
;
2511 if (rt_is_input_route(rt
)) {
2512 #ifdef CONFIG_IP_MROUTE
2513 if (ipv4_is_multicast(dst
) && !ipv4_is_local_multicast(dst
) &&
2514 IPV4_DEVCONF_ALL(net
, MC_FORWARDING
)) {
2515 int err
= ipmr_get_route(net
, skb
,
2516 fl4
->saddr
, fl4
->daddr
,
2523 goto nla_put_failure
;
2525 if (err
== -EMSGSIZE
)
2526 goto nla_put_failure
;
2532 if (nla_put_u32(skb
, RTA_IIF
, skb
->dev
->ifindex
))
2533 goto nla_put_failure
;
2536 if (rtnl_put_cacheinfo(skb
, &rt
->dst
, 0, expires
, error
) < 0)
2537 goto nla_put_failure
;
2539 nlmsg_end(skb
, nlh
);
2543 nlmsg_cancel(skb
, nlh
);
2547 static int inet_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
*nlh
)
2549 struct net
*net
= sock_net(in_skb
->sk
);
2551 struct nlattr
*tb
[RTA_MAX
+1];
2552 struct rtable
*rt
= NULL
;
2559 struct sk_buff
*skb
;
2560 u32 table_id
= RT_TABLE_MAIN
;
2563 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv4_policy
);
2567 rtm
= nlmsg_data(nlh
);
2569 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
2575 /* Reserve room for dummy headers, this skb can pass
2576 through good chunk of routing engine.
2578 skb_reset_mac_header(skb
);
2579 skb_reset_network_header(skb
);
2581 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2582 ip_hdr(skb
)->protocol
= IPPROTO_UDP
;
2583 skb_reserve(skb
, MAX_HEADER
+ sizeof(struct iphdr
));
2585 src
= tb
[RTA_SRC
] ? nla_get_in_addr(tb
[RTA_SRC
]) : 0;
2586 dst
= tb
[RTA_DST
] ? nla_get_in_addr(tb
[RTA_DST
]) : 0;
2587 iif
= tb
[RTA_IIF
] ? nla_get_u32(tb
[RTA_IIF
]) : 0;
2588 mark
= tb
[RTA_MARK
] ? nla_get_u32(tb
[RTA_MARK
]) : 0;
2590 uid
= make_kuid(current_user_ns(), nla_get_u32(tb
[RTA_UID
]));
2592 uid
= (iif
? INVALID_UID
: current_uid());
2594 memset(&fl4
, 0, sizeof(fl4
));
2597 fl4
.flowi4_tos
= rtm
->rtm_tos
;
2598 fl4
.flowi4_oif
= tb
[RTA_OIF
] ? nla_get_u32(tb
[RTA_OIF
]) : 0;
2599 fl4
.flowi4_mark
= mark
;
2600 fl4
.flowi4_uid
= uid
;
2602 if (netif_index_is_l3_master(net
, fl4
.flowi4_oif
))
2603 fl4
.flowi4_flags
= FLOWI_FLAG_L3MDEV_SRC
| FLOWI_FLAG_SKIP_NH_OIF
;
2606 struct net_device
*dev
;
2608 dev
= __dev_get_by_index(net
, iif
);
2614 skb
->protocol
= htons(ETH_P_IP
);
2618 err
= ip_route_input(skb
, dst
, src
, rtm
->rtm_tos
, dev
);
2621 rt
= skb_rtable(skb
);
2622 if (err
== 0 && rt
->dst
.error
)
2623 err
= -rt
->dst
.error
;
2625 rt
= ip_route_output_key(net
, &fl4
);
2635 skb_dst_set(skb
, &rt
->dst
);
2636 if (rtm
->rtm_flags
& RTM_F_NOTIFY
)
2637 rt
->rt_flags
|= RTCF_NOTIFY
;
2639 if (rtm
->rtm_flags
& RTM_F_LOOKUP_TABLE
)
2640 table_id
= rt
->rt_table_id
;
2642 err
= rt_fill_info(net
, dst
, src
, table_id
, &fl4
, skb
,
2643 NETLINK_CB(in_skb
).portid
, nlh
->nlmsg_seq
,
2644 RTM_NEWROUTE
, 0, 0);
2648 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).portid
);
/*
 * ip_rt_multicast_event - multicast membership changed on @in_dev;
 * invalidate cached routes in that device's namespace.
 */
2657 void ip_rt_multicast_event(struct in_device
*in_dev
)
2659 rt_cache_flush(dev_net(in_dev
->dev
));
/*
 * Garbage-collection tunables exposed via the ipv4_route_table sysctls
 * below (gc_interval / gc_min_interval[_ms] / gc_elasticity).
 */
2662 #ifdef CONFIG_SYSCTL
2663 static int ip_rt_gc_interval __read_mostly
= 60 * HZ
;
2664 static int ip_rt_gc_min_interval __read_mostly
= HZ
/ 2;
2665 static int ip_rt_gc_elasticity __read_mostly
= 8;
/*
 * ipv4_sysctl_rtcache_flush - handler for /proc/sys/net/ipv4/route/flush.
 * The owning netns is smuggled in via ctl_table.extra1 (see
 * sysctl_route_net_init); a write flushes that namespace's route cache
 * and bumps its fnhe generation so nexthop exceptions are revalidated.
 */
2667 static int ipv4_sysctl_rtcache_flush(struct ctl_table
*__ctl
, int write
,
2668 void __user
*buffer
,
2669 size_t *lenp
, loff_t
*ppos
)
2671 struct net
*net
= (struct net
*)__ctl
->extra1
;
2674 rt_cache_flush(net
);
2675 fnhe_genid_bump(net
);
2682 static struct ctl_table ipv4_route_table
[] = {
2684 .procname
= "gc_thresh",
2685 .data
= &ipv4_dst_ops
.gc_thresh
,
2686 .maxlen
= sizeof(int),
2688 .proc_handler
= proc_dointvec
,
2691 .procname
= "max_size",
2692 .data
= &ip_rt_max_size
,
2693 .maxlen
= sizeof(int),
2695 .proc_handler
= proc_dointvec
,
2698 /* Deprecated. Use gc_min_interval_ms */
2700 .procname
= "gc_min_interval",
2701 .data
= &ip_rt_gc_min_interval
,
2702 .maxlen
= sizeof(int),
2704 .proc_handler
= proc_dointvec_jiffies
,
2707 .procname
= "gc_min_interval_ms",
2708 .data
= &ip_rt_gc_min_interval
,
2709 .maxlen
= sizeof(int),
2711 .proc_handler
= proc_dointvec_ms_jiffies
,
2714 .procname
= "gc_timeout",
2715 .data
= &ip_rt_gc_timeout
,
2716 .maxlen
= sizeof(int),
2718 .proc_handler
= proc_dointvec_jiffies
,
2721 .procname
= "gc_interval",
2722 .data
= &ip_rt_gc_interval
,
2723 .maxlen
= sizeof(int),
2725 .proc_handler
= proc_dointvec_jiffies
,
2728 .procname
= "redirect_load",
2729 .data
= &ip_rt_redirect_load
,
2730 .maxlen
= sizeof(int),
2732 .proc_handler
= proc_dointvec
,
2735 .procname
= "redirect_number",
2736 .data
= &ip_rt_redirect_number
,
2737 .maxlen
= sizeof(int),
2739 .proc_handler
= proc_dointvec
,
2742 .procname
= "redirect_silence",
2743 .data
= &ip_rt_redirect_silence
,
2744 .maxlen
= sizeof(int),
2746 .proc_handler
= proc_dointvec
,
2749 .procname
= "error_cost",
2750 .data
= &ip_rt_error_cost
,
2751 .maxlen
= sizeof(int),
2753 .proc_handler
= proc_dointvec
,
2756 .procname
= "error_burst",
2757 .data
= &ip_rt_error_burst
,
2758 .maxlen
= sizeof(int),
2760 .proc_handler
= proc_dointvec
,
2763 .procname
= "gc_elasticity",
2764 .data
= &ip_rt_gc_elasticity
,
2765 .maxlen
= sizeof(int),
2767 .proc_handler
= proc_dointvec
,
2770 .procname
= "mtu_expires",
2771 .data
= &ip_rt_mtu_expires
,
2772 .maxlen
= sizeof(int),
2774 .proc_handler
= proc_dointvec_jiffies
,
2777 .procname
= "min_pmtu",
2778 .data
= &ip_rt_min_pmtu
,
2779 .maxlen
= sizeof(int),
2781 .proc_handler
= proc_dointvec
,
2784 .procname
= "min_adv_mss",
2785 .data
= &ip_rt_min_advmss
,
2786 .maxlen
= sizeof(int),
2788 .proc_handler
= proc_dointvec
,
/*
 * Per-netns "flush" sysctl; duplicated per namespace in
 * sysctl_route_net_init() so extra1 can carry the netns pointer.
 */
2793 static struct ctl_table ipv4_route_flush_table
[] = {
2795 .procname
= "flush",
2796 .maxlen
= sizeof(int),
2798 .proc_handler
= ipv4_sysctl_rtcache_flush
,
/*
 * sysctl_route_net_init - register net/ipv4/route sysctls for a netns.
 * Non-initial namespaces get a kmemdup'd copy of the flush table (so
 * extra1 can be set per-namespace); unprivileged user namespaces get
 * the entry hidden by clearing its procname.  The duplicated table is
 * freed on the registration-failure path (labels on dropped lines).
 */
2803 static __net_init
int sysctl_route_net_init(struct net
*net
)
2805 struct ctl_table
*tbl
;
2807 tbl
= ipv4_route_flush_table
;
2808 if (!net_eq(net
, &init_net
)) {
2809 tbl
= kmemdup(tbl
, sizeof(ipv4_route_flush_table
), GFP_KERNEL
);
2813 /* Don't export sysctls to unprivileged users */
2814 if (net
->user_ns
!= &init_user_ns
)
2815 tbl
[0].procname
= NULL
;
/* Let the flush handler find its namespace. */
2817 tbl
[0].extra1
= net
;
2819 net
->ipv4
.route_hdr
= register_net_sysctl(net
, "net/ipv4/route", tbl
);
2820 if (!net
->ipv4
.route_hdr
)
2825 if (tbl
!= ipv4_route_flush_table
)
2831 static __net_exit
void sysctl_route_net_exit(struct net
*net
)
2833 struct ctl_table
*tbl
;
2835 tbl
= net
->ipv4
.route_hdr
->ctl_table_arg
;
2836 unregister_net_sysctl_table(net
->ipv4
.route_hdr
);
2837 BUG_ON(tbl
== ipv4_route_flush_table
);
2841 static __net_initdata
struct pernet_operations sysctl_route_ops
= {
2842 .init
= sysctl_route_net_init
,
2843 .exit
= sysctl_route_net_exit
,
2847 static __net_init
int rt_genid_init(struct net
*net
)
2849 atomic_set(&net
->ipv4
.rt_genid
, 0);
2850 atomic_set(&net
->fnhe_genid
, 0);
2851 get_random_bytes(&net
->ipv4
.dev_addr_genid
,
2852 sizeof(net
->ipv4
.dev_addr_genid
));
2856 static __net_initdata
struct pernet_operations rt_genid_ops
= {
2857 .init
= rt_genid_init
,
2860 static int __net_init
ipv4_inetpeer_init(struct net
*net
)
2862 struct inet_peer_base
*bp
= kmalloc(sizeof(*bp
), GFP_KERNEL
);
2866 inet_peer_base_init(bp
);
2867 net
->ipv4
.peers
= bp
;
2871 static void __net_exit
ipv4_inetpeer_exit(struct net
*net
)
2873 struct inet_peer_base
*bp
= net
->ipv4
.peers
;
2875 net
->ipv4
.peers
= NULL
;
2876 inetpeer_invalidate_tree(bp
);
2880 static __net_initdata
struct pernet_operations ipv4_inetpeer_ops
= {
2881 .init
= ipv4_inetpeer_init
,
2882 .exit
= ipv4_inetpeer_exit
,
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route classid accounting table; allocated in ip_rt_init()
 * (256 entries per cpu, see the __alloc_percpu() call there). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2889 int __init
ip_rt_init(void)
2894 ip_idents
= kmalloc(IP_IDENTS_SZ
* sizeof(*ip_idents
), GFP_KERNEL
);
2896 panic("IP: failed to allocate ip_idents\n");
2898 prandom_bytes(ip_idents
, IP_IDENTS_SZ
* sizeof(*ip_idents
));
2900 ip_tstamps
= kcalloc(IP_IDENTS_SZ
, sizeof(*ip_tstamps
), GFP_KERNEL
);
2902 panic("IP: failed to allocate ip_tstamps\n");
2904 for_each_possible_cpu(cpu
) {
2905 struct uncached_list
*ul
= &per_cpu(rt_uncached_list
, cpu
);
2907 INIT_LIST_HEAD(&ul
->head
);
2908 spin_lock_init(&ul
->lock
);
2910 #ifdef CONFIG_IP_ROUTE_CLASSID
2911 ip_rt_acct
= __alloc_percpu(256 * sizeof(struct ip_rt_acct
), __alignof__(struct ip_rt_acct
));
2913 panic("IP: failed to allocate ip_rt_acct\n");
2916 ipv4_dst_ops
.kmem_cachep
=
2917 kmem_cache_create("ip_dst_cache", sizeof(struct rtable
), 0,
2918 SLAB_HWCACHE_ALIGN
|SLAB_PANIC
, NULL
);
2920 ipv4_dst_blackhole_ops
.kmem_cachep
= ipv4_dst_ops
.kmem_cachep
;
2922 if (dst_entries_init(&ipv4_dst_ops
) < 0)
2923 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2925 if (dst_entries_init(&ipv4_dst_blackhole_ops
) < 0)
2926 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2928 ipv4_dst_ops
.gc_thresh
= ~0;
2929 ip_rt_max_size
= INT_MAX
;
2934 if (ip_rt_proc_init())
2935 pr_err("Unable to create route proc files\n");
2940 rtnl_register(PF_INET
, RTM_GETROUTE
, inet_rtm_getroute
, NULL
, NULL
);
2942 #ifdef CONFIG_SYSCTL
2943 register_pernet_subsys(&sysctl_route_ops
);
2945 register_pernet_subsys(&rt_genid_ops
);
2946 register_pernet_subsys(&ipv4_inetpeer_ops
);
2950 #ifdef CONFIG_SYSCTL
2952 * We really need to sanitize the damn ipv4 init order, then all
2953 * this nonsense will go away.
2955 void __init
ip_static_sysctl_init(void)
2957 register_net_sysctl(&init_net
, "net/ipv4/route", ipv4_route_table
);