/*
 *	Linux INET6 implementation
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Fixed routing subtrees.
 */
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
52 #include <linux/rtnetlink.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
61 #include <linux/sysctl.h>
64 /* Set to 3 to get tracing. */
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
72 #define RT6_TRACE(x...) do { ; } while (0)
75 #define CLONE_OFFLINK_ROUTE 0
77 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
);
78 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
);
79 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*);
80 static void ip6_dst_destroy(struct dst_entry
*);
81 static void ip6_dst_ifdown(struct dst_entry
*,
82 struct net_device
*dev
, int how
);
83 static int ip6_dst_gc(struct dst_ops
*ops
);
85 static int ip6_pkt_discard(struct sk_buff
*skb
);
86 static int ip6_pkt_discard_out(struct sk_buff
*skb
);
87 static void ip6_link_failure(struct sk_buff
*skb
);
88 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
);
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
92 struct in6_addr
*prefix
, int prefixlen
,
93 struct in6_addr
*gwaddr
, int ifindex
,
95 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
96 struct in6_addr
*prefix
, int prefixlen
,
97 struct in6_addr
*gwaddr
, int ifindex
);
100 static struct dst_ops ip6_dst_ops_template
= {
102 .protocol
= cpu_to_be16(ETH_P_IPV6
),
105 .check
= ip6_dst_check
,
106 .destroy
= ip6_dst_destroy
,
107 .ifdown
= ip6_dst_ifdown
,
108 .negative_advice
= ip6_negative_advice
,
109 .link_failure
= ip6_link_failure
,
110 .update_pmtu
= ip6_rt_update_pmtu
,
111 .local_out
= __ip6_local_out
,
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
118 static struct dst_ops ip6_dst_blackhole_ops
= {
120 .protocol
= cpu_to_be16(ETH_P_IPV6
),
121 .destroy
= ip6_dst_destroy
,
122 .check
= ip6_dst_check
,
123 .update_pmtu
= ip6_rt_blackhole_update_pmtu
,
126 static struct rt6_info ip6_null_entry_template
= {
128 .__refcnt
= ATOMIC_INIT(1),
131 .error
= -ENETUNREACH
,
132 .input
= ip6_pkt_discard
,
133 .output
= ip6_pkt_discard_out
,
135 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
136 .rt6i_protocol
= RTPROT_KERNEL
,
137 .rt6i_metric
= ~(u32
) 0,
138 .rt6i_ref
= ATOMIC_INIT(1),
141 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
143 static int ip6_pkt_prohibit(struct sk_buff
*skb
);
144 static int ip6_pkt_prohibit_out(struct sk_buff
*skb
);
146 static struct rt6_info ip6_prohibit_entry_template
= {
148 .__refcnt
= ATOMIC_INIT(1),
152 .input
= ip6_pkt_prohibit
,
153 .output
= ip6_pkt_prohibit_out
,
155 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
156 .rt6i_protocol
= RTPROT_KERNEL
,
157 .rt6i_metric
= ~(u32
) 0,
158 .rt6i_ref
= ATOMIC_INIT(1),
161 static struct rt6_info ip6_blk_hole_entry_template
= {
163 .__refcnt
= ATOMIC_INIT(1),
167 .input
= dst_discard
,
168 .output
= dst_discard
,
170 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
171 .rt6i_protocol
= RTPROT_KERNEL
,
172 .rt6i_metric
= ~(u32
) 0,
173 .rt6i_ref
= ATOMIC_INIT(1),
/*
 * Allocate a new rt6_info backed by the given dst_ops (normally the
 * per-netns ip6_dst_ops).  Thin typed wrapper around dst_alloc(); the
 * rt6_info embeds a dst_entry as its first member, so the cast is safe.
 * Returns NULL on allocation failure (dst_alloc()'s contract).
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	return (struct rt6_info *)dst_alloc(ops);
}
184 static void ip6_dst_destroy(struct dst_entry
*dst
)
186 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
187 struct inet6_dev
*idev
= rt
->rt6i_idev
;
188 struct inet_peer
*peer
= rt
->rt6i_peer
;
191 rt
->rt6i_idev
= NULL
;
195 BUG_ON(!(rt
->rt6i_flags
& RTF_CACHE
));
196 rt
->rt6i_peer
= NULL
;
201 void rt6_bind_peer(struct rt6_info
*rt
, int create
)
203 struct inet_peer
*peer
;
205 if (WARN_ON(!(rt
->rt6i_flags
& RTF_CACHE
)))
208 peer
= inet_getpeer_v6(&rt
->rt6i_dst
.addr
, create
);
209 if (peer
&& cmpxchg(&rt
->rt6i_peer
, NULL
, peer
) != NULL
)
213 static void ip6_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
216 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
217 struct inet6_dev
*idev
= rt
->rt6i_idev
;
218 struct net_device
*loopback_dev
=
219 dev_net(dev
)->loopback_dev
;
221 if (dev
!= loopback_dev
&& idev
!= NULL
&& idev
->dev
== dev
) {
222 struct inet6_dev
*loopback_idev
=
223 in6_dev_get(loopback_dev
);
224 if (loopback_idev
!= NULL
) {
225 rt
->rt6i_idev
= loopback_idev
;
231 static __inline__
int rt6_check_expired(const struct rt6_info
*rt
)
233 return (rt
->rt6i_flags
& RTF_EXPIRES
) &&
234 time_after(jiffies
, rt
->rt6i_expires
);
237 static inline int rt6_need_strict(struct in6_addr
*daddr
)
239 return ipv6_addr_type(daddr
) &
240 (IPV6_ADDR_MULTICAST
| IPV6_ADDR_LINKLOCAL
| IPV6_ADDR_LOOPBACK
);
244 * Route lookup. Any table->tb6_lock is implied.
247 static inline struct rt6_info
*rt6_device_match(struct net
*net
,
249 struct in6_addr
*saddr
,
253 struct rt6_info
*local
= NULL
;
254 struct rt6_info
*sprt
;
256 if (!oif
&& ipv6_addr_any(saddr
))
259 for (sprt
= rt
; sprt
; sprt
= sprt
->dst
.rt6_next
) {
260 struct net_device
*dev
= sprt
->rt6i_dev
;
263 if (dev
->ifindex
== oif
)
265 if (dev
->flags
& IFF_LOOPBACK
) {
266 if (sprt
->rt6i_idev
== NULL
||
267 sprt
->rt6i_idev
->dev
->ifindex
!= oif
) {
268 if (flags
& RT6_LOOKUP_F_IFACE
&& oif
)
270 if (local
&& (!oif
||
271 local
->rt6i_idev
->dev
->ifindex
== oif
))
277 if (ipv6_chk_addr(net
, saddr
, dev
,
278 flags
& RT6_LOOKUP_F_IFACE
))
287 if (flags
& RT6_LOOKUP_F_IFACE
)
288 return net
->ipv6
.ip6_null_entry
;
294 #ifdef CONFIG_IPV6_ROUTER_PREF
295 static void rt6_probe(struct rt6_info
*rt
)
297 struct neighbour
*neigh
= rt
? rt
->rt6i_nexthop
: NULL
;
299 * Okay, this does not seem to be appropriate
300 * for now, however, we need to check if it
301 * is really so; aka Router Reachability Probing.
303 * Router Reachability Probe MUST be rate-limited
304 * to no more than one per minute.
306 if (!neigh
|| (neigh
->nud_state
& NUD_VALID
))
308 read_lock_bh(&neigh
->lock
);
309 if (!(neigh
->nud_state
& NUD_VALID
) &&
310 time_after(jiffies
, neigh
->updated
+ rt
->rt6i_idev
->cnf
.rtr_probe_interval
)) {
311 struct in6_addr mcaddr
;
312 struct in6_addr
*target
;
314 neigh
->updated
= jiffies
;
315 read_unlock_bh(&neigh
->lock
);
317 target
= (struct in6_addr
*)&neigh
->primary_key
;
318 addrconf_addr_solict_mult(target
, &mcaddr
);
319 ndisc_send_ns(rt
->rt6i_dev
, NULL
, target
, &mcaddr
, NULL
);
321 read_unlock_bh(&neigh
->lock
);
324 static inline void rt6_probe(struct rt6_info
*rt
)
330 * Default Router Selection (RFC 2461 6.3.6)
332 static inline int rt6_check_dev(struct rt6_info
*rt
, int oif
)
334 struct net_device
*dev
= rt
->rt6i_dev
;
335 if (!oif
|| dev
->ifindex
== oif
)
337 if ((dev
->flags
& IFF_LOOPBACK
) &&
338 rt
->rt6i_idev
&& rt
->rt6i_idev
->dev
->ifindex
== oif
)
343 static inline int rt6_check_neigh(struct rt6_info
*rt
)
345 struct neighbour
*neigh
= rt
->rt6i_nexthop
;
347 if (rt
->rt6i_flags
& RTF_NONEXTHOP
||
348 !(rt
->rt6i_flags
& RTF_GATEWAY
))
351 read_lock_bh(&neigh
->lock
);
352 if (neigh
->nud_state
& NUD_VALID
)
354 #ifdef CONFIG_IPV6_ROUTER_PREF
355 else if (neigh
->nud_state
& NUD_FAILED
)
360 read_unlock_bh(&neigh
->lock
);
366 static int rt6_score_route(struct rt6_info
*rt
, int oif
,
371 m
= rt6_check_dev(rt
, oif
);
372 if (!m
&& (strict
& RT6_LOOKUP_F_IFACE
))
374 #ifdef CONFIG_IPV6_ROUTER_PREF
375 m
|= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt
->rt6i_flags
)) << 2;
377 n
= rt6_check_neigh(rt
);
378 if (!n
&& (strict
& RT6_LOOKUP_F_REACHABLE
))
383 static struct rt6_info
*find_match(struct rt6_info
*rt
, int oif
, int strict
,
384 int *mpri
, struct rt6_info
*match
)
388 if (rt6_check_expired(rt
))
391 m
= rt6_score_route(rt
, oif
, strict
);
396 if (strict
& RT6_LOOKUP_F_REACHABLE
)
400 } else if (strict
& RT6_LOOKUP_F_REACHABLE
) {
408 static struct rt6_info
*find_rr_leaf(struct fib6_node
*fn
,
409 struct rt6_info
*rr_head
,
410 u32 metric
, int oif
, int strict
)
412 struct rt6_info
*rt
, *match
;
416 for (rt
= rr_head
; rt
&& rt
->rt6i_metric
== metric
;
417 rt
= rt
->dst
.rt6_next
)
418 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
419 for (rt
= fn
->leaf
; rt
&& rt
!= rr_head
&& rt
->rt6i_metric
== metric
;
420 rt
= rt
->dst
.rt6_next
)
421 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
426 static struct rt6_info
*rt6_select(struct fib6_node
*fn
, int oif
, int strict
)
428 struct rt6_info
*match
, *rt0
;
431 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
432 __func__
, fn
->leaf
, oif
);
436 fn
->rr_ptr
= rt0
= fn
->leaf
;
438 match
= find_rr_leaf(fn
, rt0
, rt0
->rt6i_metric
, oif
, strict
);
441 (strict
& RT6_LOOKUP_F_REACHABLE
)) {
442 struct rt6_info
*next
= rt0
->dst
.rt6_next
;
444 /* no entries matched; do round-robin */
445 if (!next
|| next
->rt6i_metric
!= rt0
->rt6i_metric
)
452 RT6_TRACE("%s() => %p\n",
455 net
= dev_net(rt0
->rt6i_dev
);
456 return match
? match
: net
->ipv6
.ip6_null_entry
;
459 #ifdef CONFIG_IPV6_ROUTE_INFO
460 int rt6_route_rcv(struct net_device
*dev
, u8
*opt
, int len
,
461 struct in6_addr
*gwaddr
)
463 struct net
*net
= dev_net(dev
);
464 struct route_info
*rinfo
= (struct route_info
*) opt
;
465 struct in6_addr prefix_buf
, *prefix
;
467 unsigned long lifetime
;
470 if (len
< sizeof(struct route_info
)) {
474 /* Sanity check for prefix_len and length */
475 if (rinfo
->length
> 3) {
477 } else if (rinfo
->prefix_len
> 128) {
479 } else if (rinfo
->prefix_len
> 64) {
480 if (rinfo
->length
< 2) {
483 } else if (rinfo
->prefix_len
> 0) {
484 if (rinfo
->length
< 1) {
489 pref
= rinfo
->route_pref
;
490 if (pref
== ICMPV6_ROUTER_PREF_INVALID
)
493 lifetime
= addrconf_timeout_fixup(ntohl(rinfo
->lifetime
), HZ
);
495 if (rinfo
->length
== 3)
496 prefix
= (struct in6_addr
*)rinfo
->prefix
;
498 /* this function is safe */
499 ipv6_addr_prefix(&prefix_buf
,
500 (struct in6_addr
*)rinfo
->prefix
,
502 prefix
= &prefix_buf
;
505 rt
= rt6_get_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
,
508 if (rt
&& !lifetime
) {
514 rt
= rt6_add_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
, dev
->ifindex
,
517 rt
->rt6i_flags
= RTF_ROUTEINFO
|
518 (rt
->rt6i_flags
& ~RTF_PREF_MASK
) | RTF_PREF(pref
);
521 if (!addrconf_finite_timeout(lifetime
)) {
522 rt
->rt6i_flags
&= ~RTF_EXPIRES
;
524 rt
->rt6i_expires
= jiffies
+ HZ
* lifetime
;
525 rt
->rt6i_flags
|= RTF_EXPIRES
;
527 dst_release(&rt
->dst
);
533 #define BACKTRACK(__net, saddr) \
535 if (rt == __net->ipv6.ip6_null_entry) { \
536 struct fib6_node *pn; \
538 if (fn->fn_flags & RTN_TL_ROOT) \
541 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
542 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
545 if (fn->fn_flags & RTN_RTINFO) \
551 static struct rt6_info
*ip6_pol_route_lookup(struct net
*net
,
552 struct fib6_table
*table
,
553 struct flowi
*fl
, int flags
)
555 struct fib6_node
*fn
;
558 read_lock_bh(&table
->tb6_lock
);
559 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
562 rt
= rt6_device_match(net
, rt
, &fl
->fl6_src
, fl
->oif
, flags
);
563 BACKTRACK(net
, &fl
->fl6_src
);
565 dst_use(&rt
->dst
, jiffies
);
566 read_unlock_bh(&table
->tb6_lock
);
571 struct rt6_info
*rt6_lookup(struct net
*net
, const struct in6_addr
*daddr
,
572 const struct in6_addr
*saddr
, int oif
, int strict
)
578 struct dst_entry
*dst
;
579 int flags
= strict
? RT6_LOOKUP_F_IFACE
: 0;
582 memcpy(&fl
.fl6_src
, saddr
, sizeof(*saddr
));
583 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
586 dst
= fib6_rule_lookup(net
, &fl
, flags
, ip6_pol_route_lookup
);
588 return (struct rt6_info
*) dst
;
595 EXPORT_SYMBOL(rt6_lookup
);
597 /* ip6_ins_rt is called with FREE table->tb6_lock.
598 It takes new route entry, the addition fails by any reason the
599 route is freed. In any case, if caller does not hold it, it may
603 static int __ip6_ins_rt(struct rt6_info
*rt
, struct nl_info
*info
)
606 struct fib6_table
*table
;
608 table
= rt
->rt6i_table
;
609 write_lock_bh(&table
->tb6_lock
);
610 err
= fib6_add(&table
->tb6_root
, rt
, info
);
611 write_unlock_bh(&table
->tb6_lock
);
616 int ip6_ins_rt(struct rt6_info
*rt
)
618 struct nl_info info
= {
619 .nl_net
= dev_net(rt
->rt6i_dev
),
621 return __ip6_ins_rt(rt
, &info
);
624 static struct rt6_info
*rt6_alloc_cow(struct rt6_info
*ort
, struct in6_addr
*daddr
,
625 struct in6_addr
*saddr
)
633 rt
= ip6_rt_copy(ort
);
636 struct neighbour
*neigh
;
637 int attempts
= !in_softirq();
639 if (!(rt
->rt6i_flags
&RTF_GATEWAY
)) {
640 if (rt
->rt6i_dst
.plen
!= 128 &&
641 ipv6_addr_equal(&rt
->rt6i_dst
.addr
, daddr
))
642 rt
->rt6i_flags
|= RTF_ANYCAST
;
643 ipv6_addr_copy(&rt
->rt6i_gateway
, daddr
);
646 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
647 rt
->rt6i_dst
.plen
= 128;
648 rt
->rt6i_flags
|= RTF_CACHE
;
649 rt
->dst
.flags
|= DST_HOST
;
651 #ifdef CONFIG_IPV6_SUBTREES
652 if (rt
->rt6i_src
.plen
&& saddr
) {
653 ipv6_addr_copy(&rt
->rt6i_src
.addr
, saddr
);
654 rt
->rt6i_src
.plen
= 128;
659 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
661 struct net
*net
= dev_net(rt
->rt6i_dev
);
662 int saved_rt_min_interval
=
663 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
664 int saved_rt_elasticity
=
665 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
667 if (attempts
-- > 0) {
668 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 1;
669 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= 0;
671 ip6_dst_gc(&net
->ipv6
.ip6_dst_ops
);
673 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
=
675 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
=
676 saved_rt_min_interval
;
682 "ipv6: Neighbour table overflow.\n");
686 rt
->rt6i_nexthop
= neigh
;
693 static struct rt6_info
*rt6_alloc_clone(struct rt6_info
*ort
, struct in6_addr
*daddr
)
695 struct rt6_info
*rt
= ip6_rt_copy(ort
);
697 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
698 rt
->rt6i_dst
.plen
= 128;
699 rt
->rt6i_flags
|= RTF_CACHE
;
700 rt
->dst
.flags
|= DST_HOST
;
701 rt
->rt6i_nexthop
= neigh_clone(ort
->rt6i_nexthop
);
706 static struct rt6_info
*ip6_pol_route(struct net
*net
, struct fib6_table
*table
, int oif
,
707 struct flowi
*fl
, int flags
)
709 struct fib6_node
*fn
;
710 struct rt6_info
*rt
, *nrt
;
714 int reachable
= net
->ipv6
.devconf_all
->forwarding
? 0 : RT6_LOOKUP_F_REACHABLE
;
716 strict
|= flags
& RT6_LOOKUP_F_IFACE
;
719 read_lock_bh(&table
->tb6_lock
);
722 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
725 rt
= rt6_select(fn
, oif
, strict
| reachable
);
727 BACKTRACK(net
, &fl
->fl6_src
);
728 if (rt
== net
->ipv6
.ip6_null_entry
||
729 rt
->rt6i_flags
& RTF_CACHE
)
733 read_unlock_bh(&table
->tb6_lock
);
735 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
736 nrt
= rt6_alloc_cow(rt
, &fl
->fl6_dst
, &fl
->fl6_src
);
738 #if CLONE_OFFLINK_ROUTE
739 nrt
= rt6_alloc_clone(rt
, &fl
->fl6_dst
);
745 dst_release(&rt
->dst
);
746 rt
= nrt
? : net
->ipv6
.ip6_null_entry
;
750 err
= ip6_ins_rt(nrt
);
759 * Race condition! In the gap, when table->tb6_lock was
760 * released someone could insert this route. Relookup.
762 dst_release(&rt
->dst
);
771 read_unlock_bh(&table
->tb6_lock
);
773 rt
->dst
.lastuse
= jiffies
;
779 static struct rt6_info
*ip6_pol_route_input(struct net
*net
, struct fib6_table
*table
,
780 struct flowi
*fl
, int flags
)
782 return ip6_pol_route(net
, table
, fl
->iif
, fl
, flags
);
785 void ip6_route_input(struct sk_buff
*skb
)
787 struct ipv6hdr
*iph
= ipv6_hdr(skb
);
788 struct net
*net
= dev_net(skb
->dev
);
789 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
791 .iif
= skb
->dev
->ifindex
,
792 .fl6_dst
= iph
->daddr
,
793 .fl6_src
= iph
->saddr
,
794 .fl6_flowlabel
= (* (__be32
*) iph
)&IPV6_FLOWINFO_MASK
,
796 .proto
= iph
->nexthdr
,
799 if (rt6_need_strict(&iph
->daddr
) && skb
->dev
->type
!= ARPHRD_PIMREG
)
800 flags
|= RT6_LOOKUP_F_IFACE
;
802 skb_dst_set(skb
, fib6_rule_lookup(net
, &fl
, flags
, ip6_pol_route_input
));
805 static struct rt6_info
*ip6_pol_route_output(struct net
*net
, struct fib6_table
*table
,
806 struct flowi
*fl
, int flags
)
808 return ip6_pol_route(net
, table
, fl
->oif
, fl
, flags
);
811 struct dst_entry
* ip6_route_output(struct net
*net
, struct sock
*sk
,
816 if ((sk
&& sk
->sk_bound_dev_if
) || rt6_need_strict(&fl
->fl6_dst
))
817 flags
|= RT6_LOOKUP_F_IFACE
;
819 if (!ipv6_addr_any(&fl
->fl6_src
))
820 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
822 flags
|= rt6_srcprefs2flags(inet6_sk(sk
)->srcprefs
);
824 return fib6_rule_lookup(net
, fl
, flags
, ip6_pol_route_output
);
827 EXPORT_SYMBOL(ip6_route_output
);
829 int ip6_dst_blackhole(struct sock
*sk
, struct dst_entry
**dstp
, struct flowi
*fl
)
831 struct rt6_info
*ort
= (struct rt6_info
*) *dstp
;
832 struct rt6_info
*rt
= (struct rt6_info
*)
833 dst_alloc(&ip6_dst_blackhole_ops
);
834 struct dst_entry
*new = NULL
;
839 atomic_set(&new->__refcnt
, 1);
841 new->input
= dst_discard
;
842 new->output
= dst_discard
;
844 dst_copy_metrics(new, &ort
->dst
);
845 new->dev
= ort
->dst
.dev
;
848 rt
->rt6i_idev
= ort
->rt6i_idev
;
850 in6_dev_hold(rt
->rt6i_idev
);
851 rt
->rt6i_expires
= 0;
853 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
854 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
857 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
858 #ifdef CONFIG_IPV6_SUBTREES
859 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
867 return new ? 0 : -ENOMEM
;
869 EXPORT_SYMBOL_GPL(ip6_dst_blackhole
);
872 * Destination cache support functions
875 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
)
879 rt
= (struct rt6_info
*) dst
;
881 if (rt
->rt6i_node
&& (rt
->rt6i_node
->fn_sernum
== cookie
))
887 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*dst
)
889 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
892 if (rt
->rt6i_flags
& RTF_CACHE
) {
893 if (rt6_check_expired(rt
)) {
905 static void ip6_link_failure(struct sk_buff
*skb
)
909 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, ICMPV6_ADDR_UNREACH
, 0);
911 rt
= (struct rt6_info
*) skb_dst(skb
);
913 if (rt
->rt6i_flags
&RTF_CACHE
) {
914 dst_set_expires(&rt
->dst
, 0);
915 rt
->rt6i_flags
|= RTF_EXPIRES
;
916 } else if (rt
->rt6i_node
&& (rt
->rt6i_flags
& RTF_DEFAULT
))
917 rt
->rt6i_node
->fn_sernum
= -1;
921 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
923 struct rt6_info
*rt6
= (struct rt6_info
*)dst
;
925 if (mtu
< dst_mtu(dst
) && rt6
->rt6i_dst
.plen
== 128) {
926 rt6
->rt6i_flags
|= RTF_MODIFIED
;
927 if (mtu
< IPV6_MIN_MTU
) {
928 u32 features
= dst_metric(dst
, RTAX_FEATURES
);
930 features
|= RTAX_FEATURE_ALLFRAG
;
931 dst_metric_set(dst
, RTAX_FEATURES
, features
);
933 dst_metric_set(dst
, RTAX_MTU
, mtu
);
934 call_netevent_notifiers(NETEVENT_PMTU_UPDATE
, dst
);
938 static int ipv6_get_mtu(struct net_device
*dev
);
940 static inline unsigned int ipv6_advmss(struct net
*net
, unsigned int mtu
)
942 mtu
-= sizeof(struct ipv6hdr
) + sizeof(struct tcphdr
);
944 if (mtu
< net
->ipv6
.sysctl
.ip6_rt_min_advmss
)
945 mtu
= net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
948 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
949 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
950 * IPV6_MAXPLEN is also valid and means: "any MSS,
951 * rely only on pmtu discovery"
953 if (mtu
> IPV6_MAXPLEN
- sizeof(struct tcphdr
))
958 static struct dst_entry
*icmp6_dst_gc_list
;
959 static DEFINE_SPINLOCK(icmp6_dst_lock
);
961 struct dst_entry
*icmp6_dst_alloc(struct net_device
*dev
,
962 struct neighbour
*neigh
,
963 const struct in6_addr
*addr
)
966 struct inet6_dev
*idev
= in6_dev_get(dev
);
967 struct net
*net
= dev_net(dev
);
969 if (unlikely(idev
== NULL
))
972 rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
973 if (unlikely(rt
== NULL
)) {
982 neigh
= ndisc_get_neigh(dev
, addr
);
988 rt
->rt6i_idev
= idev
;
989 rt
->rt6i_nexthop
= neigh
;
990 atomic_set(&rt
->dst
.__refcnt
, 1);
991 dst_metric_set(&rt
->dst
, RTAX_HOPLIMIT
, 255);
992 dst_metric_set(&rt
->dst
, RTAX_MTU
, ipv6_get_mtu(rt
->rt6i_dev
));
993 dst_metric_set(&rt
->dst
, RTAX_ADVMSS
, ipv6_advmss(net
, dst_mtu(&rt
->dst
)));
994 rt
->dst
.output
= ip6_output
;
996 #if 0 /* there's no chance to use these for ndisc */
997 rt
->dst
.flags
= ipv6_addr_type(addr
) & IPV6_ADDR_UNICAST
1000 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, addr
);
1001 rt
->rt6i_dst
.plen
= 128;
1004 spin_lock_bh(&icmp6_dst_lock
);
1005 rt
->dst
.next
= icmp6_dst_gc_list
;
1006 icmp6_dst_gc_list
= &rt
->dst
;
1007 spin_unlock_bh(&icmp6_dst_lock
);
1009 fib6_force_start_gc(net
);
1015 int icmp6_dst_gc(void)
1017 struct dst_entry
*dst
, *next
, **pprev
;
1022 spin_lock_bh(&icmp6_dst_lock
);
1023 pprev
= &icmp6_dst_gc_list
;
1025 while ((dst
= *pprev
) != NULL
) {
1026 if (!atomic_read(&dst
->__refcnt
)) {
1035 spin_unlock_bh(&icmp6_dst_lock
);
1040 static void icmp6_clean_all(int (*func
)(struct rt6_info
*rt
, void *arg
),
1043 struct dst_entry
*dst
, **pprev
;
1045 spin_lock_bh(&icmp6_dst_lock
);
1046 pprev
= &icmp6_dst_gc_list
;
1047 while ((dst
= *pprev
) != NULL
) {
1048 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
1049 if (func(rt
, arg
)) {
1056 spin_unlock_bh(&icmp6_dst_lock
);
1059 static int ip6_dst_gc(struct dst_ops
*ops
)
1061 unsigned long now
= jiffies
;
1062 struct net
*net
= container_of(ops
, struct net
, ipv6
.ip6_dst_ops
);
1063 int rt_min_interval
= net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
1064 int rt_max_size
= net
->ipv6
.sysctl
.ip6_rt_max_size
;
1065 int rt_elasticity
= net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
1066 int rt_gc_timeout
= net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
1067 unsigned long rt_last_gc
= net
->ipv6
.ip6_rt_last_gc
;
1070 entries
= dst_entries_get_fast(ops
);
1071 if (time_after(rt_last_gc
+ rt_min_interval
, now
) &&
1072 entries
<= rt_max_size
)
1075 net
->ipv6
.ip6_rt_gc_expire
++;
1076 fib6_run_gc(net
->ipv6
.ip6_rt_gc_expire
, net
);
1077 net
->ipv6
.ip6_rt_last_gc
= now
;
1078 entries
= dst_entries_get_slow(ops
);
1079 if (entries
< ops
->gc_thresh
)
1080 net
->ipv6
.ip6_rt_gc_expire
= rt_gc_timeout
>>1;
1082 net
->ipv6
.ip6_rt_gc_expire
-= net
->ipv6
.ip6_rt_gc_expire
>>rt_elasticity
;
1083 return entries
> rt_max_size
;
1086 /* Clean host part of a prefix. Not necessary in radix tree,
1087 but results in cleaner routing tables.
1089 Remove it only when all the things will work!
1092 static int ipv6_get_mtu(struct net_device
*dev
)
1094 int mtu
= IPV6_MIN_MTU
;
1095 struct inet6_dev
*idev
;
1098 idev
= __in6_dev_get(dev
);
1100 mtu
= idev
->cnf
.mtu6
;
1105 int ip6_dst_hoplimit(struct dst_entry
*dst
)
1107 int hoplimit
= dst_metric_raw(dst
, RTAX_HOPLIMIT
);
1108 if (hoplimit
== 0) {
1109 struct net_device
*dev
= dst
->dev
;
1110 struct inet6_dev
*idev
;
1113 idev
= __in6_dev_get(dev
);
1115 hoplimit
= idev
->cnf
.hop_limit
;
1117 hoplimit
= dev_net(dev
)->ipv6
.devconf_all
->hop_limit
;
1122 EXPORT_SYMBOL(ip6_dst_hoplimit
);
1128 int ip6_route_add(struct fib6_config
*cfg
)
1131 struct net
*net
= cfg
->fc_nlinfo
.nl_net
;
1132 struct rt6_info
*rt
= NULL
;
1133 struct net_device
*dev
= NULL
;
1134 struct inet6_dev
*idev
= NULL
;
1135 struct fib6_table
*table
;
1138 if (cfg
->fc_dst_len
> 128 || cfg
->fc_src_len
> 128)
1140 #ifndef CONFIG_IPV6_SUBTREES
1141 if (cfg
->fc_src_len
)
1144 if (cfg
->fc_ifindex
) {
1146 dev
= dev_get_by_index(net
, cfg
->fc_ifindex
);
1149 idev
= in6_dev_get(dev
);
1154 if (cfg
->fc_metric
== 0)
1155 cfg
->fc_metric
= IP6_RT_PRIO_USER
;
1157 table
= fib6_new_table(net
, cfg
->fc_table
);
1158 if (table
== NULL
) {
1163 rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1170 rt
->dst
.obsolete
= -1;
1171 rt
->rt6i_expires
= (cfg
->fc_flags
& RTF_EXPIRES
) ?
1172 jiffies
+ clock_t_to_jiffies(cfg
->fc_expires
) :
1175 if (cfg
->fc_protocol
== RTPROT_UNSPEC
)
1176 cfg
->fc_protocol
= RTPROT_BOOT
;
1177 rt
->rt6i_protocol
= cfg
->fc_protocol
;
1179 addr_type
= ipv6_addr_type(&cfg
->fc_dst
);
1181 if (addr_type
& IPV6_ADDR_MULTICAST
)
1182 rt
->dst
.input
= ip6_mc_input
;
1183 else if (cfg
->fc_flags
& RTF_LOCAL
)
1184 rt
->dst
.input
= ip6_input
;
1186 rt
->dst
.input
= ip6_forward
;
1188 rt
->dst
.output
= ip6_output
;
1190 ipv6_addr_prefix(&rt
->rt6i_dst
.addr
, &cfg
->fc_dst
, cfg
->fc_dst_len
);
1191 rt
->rt6i_dst
.plen
= cfg
->fc_dst_len
;
1192 if (rt
->rt6i_dst
.plen
== 128)
1193 rt
->dst
.flags
= DST_HOST
;
1195 #ifdef CONFIG_IPV6_SUBTREES
1196 ipv6_addr_prefix(&rt
->rt6i_src
.addr
, &cfg
->fc_src
, cfg
->fc_src_len
);
1197 rt
->rt6i_src
.plen
= cfg
->fc_src_len
;
1200 rt
->rt6i_metric
= cfg
->fc_metric
;
1202 /* We cannot add true routes via loopback here,
1203 they would result in kernel looping; promote them to reject routes
1205 if ((cfg
->fc_flags
& RTF_REJECT
) ||
1206 (dev
&& (dev
->flags
&IFF_LOOPBACK
) && !(addr_type
&IPV6_ADDR_LOOPBACK
)
1207 && !(cfg
->fc_flags
&RTF_LOCAL
))) {
1208 /* hold loopback dev/idev if we haven't done so. */
1209 if (dev
!= net
->loopback_dev
) {
1214 dev
= net
->loopback_dev
;
1216 idev
= in6_dev_get(dev
);
1222 rt
->dst
.output
= ip6_pkt_discard_out
;
1223 rt
->dst
.input
= ip6_pkt_discard
;
1224 rt
->dst
.error
= -ENETUNREACH
;
1225 rt
->rt6i_flags
= RTF_REJECT
|RTF_NONEXTHOP
;
1229 if (cfg
->fc_flags
& RTF_GATEWAY
) {
1230 struct in6_addr
*gw_addr
;
1233 gw_addr
= &cfg
->fc_gateway
;
1234 ipv6_addr_copy(&rt
->rt6i_gateway
, gw_addr
);
1235 gwa_type
= ipv6_addr_type(gw_addr
);
1237 if (gwa_type
!= (IPV6_ADDR_LINKLOCAL
|IPV6_ADDR_UNICAST
)) {
1238 struct rt6_info
*grt
;
1240 /* IPv6 strictly inhibits using not link-local
1241 addresses as nexthop address.
1242 Otherwise, router will not able to send redirects.
1243 It is very good, but in some (rare!) circumstances
1244 (SIT, PtP, NBMA NOARP links) it is handy to allow
1245 some exceptions. --ANK
1248 if (!(gwa_type
&IPV6_ADDR_UNICAST
))
1251 grt
= rt6_lookup(net
, gw_addr
, NULL
, cfg
->fc_ifindex
, 1);
1253 err
= -EHOSTUNREACH
;
1257 if (dev
!= grt
->rt6i_dev
) {
1258 dst_release(&grt
->dst
);
1262 dev
= grt
->rt6i_dev
;
1263 idev
= grt
->rt6i_idev
;
1265 in6_dev_hold(grt
->rt6i_idev
);
1267 if (!(grt
->rt6i_flags
&RTF_GATEWAY
))
1269 dst_release(&grt
->dst
);
1275 if (dev
== NULL
|| (dev
->flags
&IFF_LOOPBACK
))
1283 if (cfg
->fc_flags
& (RTF_GATEWAY
| RTF_NONEXTHOP
)) {
1284 rt
->rt6i_nexthop
= __neigh_lookup_errno(&nd_tbl
, &rt
->rt6i_gateway
, dev
);
1285 if (IS_ERR(rt
->rt6i_nexthop
)) {
1286 err
= PTR_ERR(rt
->rt6i_nexthop
);
1287 rt
->rt6i_nexthop
= NULL
;
1292 rt
->rt6i_flags
= cfg
->fc_flags
;
1299 nla_for_each_attr(nla
, cfg
->fc_mx
, cfg
->fc_mx_len
, remaining
) {
1300 int type
= nla_type(nla
);
1303 if (type
> RTAX_MAX
) {
1308 dst_metric_set(&rt
->dst
, type
, nla_get_u32(nla
));
1313 if (!dst_mtu(&rt
->dst
))
1314 dst_metric_set(&rt
->dst
, RTAX_MTU
, ipv6_get_mtu(dev
));
1315 if (!dst_metric(&rt
->dst
, RTAX_ADVMSS
))
1316 dst_metric_set(&rt
->dst
, RTAX_ADVMSS
, ipv6_advmss(net
, dst_mtu(&rt
->dst
)));
1318 rt
->rt6i_idev
= idev
;
1319 rt
->rt6i_table
= table
;
1321 cfg
->fc_nlinfo
.nl_net
= dev_net(dev
);
1323 return __ip6_ins_rt(rt
, &cfg
->fc_nlinfo
);
1335 static int __ip6_del_rt(struct rt6_info
*rt
, struct nl_info
*info
)
1338 struct fib6_table
*table
;
1339 struct net
*net
= dev_net(rt
->rt6i_dev
);
1341 if (rt
== net
->ipv6
.ip6_null_entry
)
1344 table
= rt
->rt6i_table
;
1345 write_lock_bh(&table
->tb6_lock
);
1347 err
= fib6_del(rt
, info
);
1348 dst_release(&rt
->dst
);
1350 write_unlock_bh(&table
->tb6_lock
);
1355 int ip6_del_rt(struct rt6_info
*rt
)
1357 struct nl_info info
= {
1358 .nl_net
= dev_net(rt
->rt6i_dev
),
1360 return __ip6_del_rt(rt
, &info
);
1363 static int ip6_route_del(struct fib6_config
*cfg
)
1365 struct fib6_table
*table
;
1366 struct fib6_node
*fn
;
1367 struct rt6_info
*rt
;
1370 table
= fib6_get_table(cfg
->fc_nlinfo
.nl_net
, cfg
->fc_table
);
1374 read_lock_bh(&table
->tb6_lock
);
1376 fn
= fib6_locate(&table
->tb6_root
,
1377 &cfg
->fc_dst
, cfg
->fc_dst_len
,
1378 &cfg
->fc_src
, cfg
->fc_src_len
);
1381 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1382 if (cfg
->fc_ifindex
&&
1383 (rt
->rt6i_dev
== NULL
||
1384 rt
->rt6i_dev
->ifindex
!= cfg
->fc_ifindex
))
1386 if (cfg
->fc_flags
& RTF_GATEWAY
&&
1387 !ipv6_addr_equal(&cfg
->fc_gateway
, &rt
->rt6i_gateway
))
1389 if (cfg
->fc_metric
&& cfg
->fc_metric
!= rt
->rt6i_metric
)
1392 read_unlock_bh(&table
->tb6_lock
);
1394 return __ip6_del_rt(rt
, &cfg
->fc_nlinfo
);
1397 read_unlock_bh(&table
->tb6_lock
);
1405 struct ip6rd_flowi
{
1407 struct in6_addr gateway
;
1410 static struct rt6_info
*__ip6_route_redirect(struct net
*net
,
1411 struct fib6_table
*table
,
1415 struct ip6rd_flowi
*rdfl
= (struct ip6rd_flowi
*)fl
;
1416 struct rt6_info
*rt
;
1417 struct fib6_node
*fn
;
1420 * Get the "current" route for this destination and
1421 * check if the redirect has come from approriate router.
1423 * RFC 2461 specifies that redirects should only be
1424 * accepted if they come from the nexthop to the target.
1425 * Due to the way the routes are chosen, this notion
1426 * is a bit fuzzy and one might need to check all possible
1430 read_lock_bh(&table
->tb6_lock
);
1431 fn
= fib6_lookup(&table
->tb6_root
, &fl
->fl6_dst
, &fl
->fl6_src
);
1433 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1435 * Current route is on-link; redirect is always invalid.
1437 * Seems, previous statement is not true. It could
1438 * be node, which looks for us as on-link (f.e. proxy ndisc)
1439 * But then router serving it might decide, that we should
1440 * know truth 8)8) --ANK (980726).
1442 if (rt6_check_expired(rt
))
1444 if (!(rt
->rt6i_flags
& RTF_GATEWAY
))
1446 if (fl
->oif
!= rt
->rt6i_dev
->ifindex
)
1448 if (!ipv6_addr_equal(&rdfl
->gateway
, &rt
->rt6i_gateway
))
1454 rt
= net
->ipv6
.ip6_null_entry
;
1455 BACKTRACK(net
, &fl
->fl6_src
);
1459 read_unlock_bh(&table
->tb6_lock
);
1464 static struct rt6_info
*ip6_route_redirect(struct in6_addr
*dest
,
1465 struct in6_addr
*src
,
1466 struct in6_addr
*gateway
,
1467 struct net_device
*dev
)
1469 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
1470 struct net
*net
= dev_net(dev
);
1471 struct ip6rd_flowi rdfl
= {
1473 .oif
= dev
->ifindex
,
1479 ipv6_addr_copy(&rdfl
.gateway
, gateway
);
1481 if (rt6_need_strict(dest
))
1482 flags
|= RT6_LOOKUP_F_IFACE
;
1484 return (struct rt6_info
*)fib6_rule_lookup(net
, (struct flowi
*)&rdfl
,
1485 flags
, __ip6_route_redirect
);
1488 void rt6_redirect(struct in6_addr
*dest
, struct in6_addr
*src
,
1489 struct in6_addr
*saddr
,
1490 struct neighbour
*neigh
, u8
*lladdr
, int on_link
)
1492 struct rt6_info
*rt
, *nrt
= NULL
;
1493 struct netevent_redirect netevent
;
1494 struct net
*net
= dev_net(neigh
->dev
);
1496 rt
= ip6_route_redirect(dest
, src
, saddr
, neigh
->dev
);
1498 if (rt
== net
->ipv6
.ip6_null_entry
) {
1499 if (net_ratelimit())
1500 printk(KERN_DEBUG
"rt6_redirect: source isn't a valid nexthop "
1501 "for redirect target\n");
1506 * We have finally decided to accept it.
1509 neigh_update(neigh
, lladdr
, NUD_STALE
,
1510 NEIGH_UPDATE_F_WEAK_OVERRIDE
|
1511 NEIGH_UPDATE_F_OVERRIDE
|
1512 (on_link
? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER
|
1513 NEIGH_UPDATE_F_ISROUTER
))
1517 * Redirect received -> path was valid.
1518 * Look, redirects are sent only in response to data packets,
1519 * so that this nexthop apparently is reachable. --ANK
1521 dst_confirm(&rt
->dst
);
1523 /* Duplicate redirect: silently ignore. */
1524 if (neigh
== rt
->dst
.neighbour
)
1527 nrt
= ip6_rt_copy(rt
);
1531 nrt
->rt6i_flags
= RTF_GATEWAY
|RTF_UP
|RTF_DYNAMIC
|RTF_CACHE
;
1533 nrt
->rt6i_flags
&= ~RTF_GATEWAY
;
1535 ipv6_addr_copy(&nrt
->rt6i_dst
.addr
, dest
);
1536 nrt
->rt6i_dst
.plen
= 128;
1537 nrt
->dst
.flags
|= DST_HOST
;
1539 ipv6_addr_copy(&nrt
->rt6i_gateway
, (struct in6_addr
*)neigh
->primary_key
);
1540 nrt
->rt6i_nexthop
= neigh_clone(neigh
);
1541 /* Reset pmtu, it may be better */
1542 dst_metric_set(&nrt
->dst
, RTAX_MTU
, ipv6_get_mtu(neigh
->dev
));
1543 dst_metric_set(&nrt
->dst
, RTAX_ADVMSS
, ipv6_advmss(dev_net(neigh
->dev
),
1544 dst_mtu(&nrt
->dst
)));
1546 if (ip6_ins_rt(nrt
))
1549 netevent
.old
= &rt
->dst
;
1550 netevent
.new = &nrt
->dst
;
1551 call_netevent_notifiers(NETEVENT_REDIRECT
, &netevent
);
1553 if (rt
->rt6i_flags
&RTF_CACHE
) {
1559 dst_release(&rt
->dst
);
1563 * Handle ICMP "packet too big" messages
1564 * i.e. Path MTU discovery
1567 static void rt6_do_pmtu_disc(struct in6_addr
*daddr
, struct in6_addr
*saddr
,
1568 struct net
*net
, u32 pmtu
, int ifindex
)
1570 struct rt6_info
*rt
, *nrt
;
1573 rt
= rt6_lookup(net
, daddr
, saddr
, ifindex
, 0);
1577 if (pmtu
>= dst_mtu(&rt
->dst
))
1580 if (pmtu
< IPV6_MIN_MTU
) {
1582 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1583 * MTU (1280) and a fragment header should always be included
1584 * after a node receiving Too Big message reporting PMTU is
1585 * less than the IPv6 Minimum Link MTU.
1587 pmtu
= IPV6_MIN_MTU
;
1591 /* New mtu received -> path was valid.
1592 They are sent only in response to data packets,
1593 so that this nexthop apparently is reachable. --ANK
1595 dst_confirm(&rt
->dst
);
1597 /* Host route. If it is static, it would be better
1598 not to override it, but add new one, so that
1599 when cache entry will expire old pmtu
1600 would return automatically.
1602 if (rt
->rt6i_flags
& RTF_CACHE
) {
1603 dst_metric_set(&rt
->dst
, RTAX_MTU
, pmtu
);
1605 u32 features
= dst_metric(&rt
->dst
, RTAX_FEATURES
);
1606 features
|= RTAX_FEATURE_ALLFRAG
;
1607 dst_metric_set(&rt
->dst
, RTAX_FEATURES
, features
);
1609 dst_set_expires(&rt
->dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1610 rt
->rt6i_flags
|= RTF_MODIFIED
|RTF_EXPIRES
;
1615 Two cases are possible:
1616 1. It is connected route. Action: COW
1617 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1619 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
1620 nrt
= rt6_alloc_cow(rt
, daddr
, saddr
);
1622 nrt
= rt6_alloc_clone(rt
, daddr
);
1625 dst_metric_set(&nrt
->dst
, RTAX_MTU
, pmtu
);
1627 u32 features
= dst_metric(&nrt
->dst
, RTAX_FEATURES
);
1628 features
|= RTAX_FEATURE_ALLFRAG
;
1629 dst_metric_set(&nrt
->dst
, RTAX_FEATURES
, features
);
1632 /* According to RFC 1981, detecting PMTU increase shouldn't be
1633 * happened within 5 mins, the recommended timer is 10 mins.
1634 * Here this route expiration time is set to ip6_rt_mtu_expires
1635 * which is 10 mins. After 10 mins the decreased pmtu is expired
1636 * and detecting PMTU increase will be automatically happened.
1638 dst_set_expires(&nrt
->dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1639 nrt
->rt6i_flags
|= RTF_DYNAMIC
|RTF_EXPIRES
;
1644 dst_release(&rt
->dst
);
1647 void rt6_pmtu_discovery(struct in6_addr
*daddr
, struct in6_addr
*saddr
,
1648 struct net_device
*dev
, u32 pmtu
)
1650 struct net
*net
= dev_net(dev
);
1653 * RFC 1981 states that a node "MUST reduce the size of the packets it
1654 * is sending along the path" that caused the Packet Too Big message.
1655 * Since it's not possible in the general case to determine which
1656 * interface was used to send the original packet, we update the MTU
1657 * on the interface that will be used to send future packets. We also
1658 * update the MTU on the interface that received the Packet Too Big in
1659 * case the original packet was forced out that interface with
1660 * SO_BINDTODEVICE or similar. This is the next best thing to the
1661 * correct behaviour, which would be to update the MTU on all
1664 rt6_do_pmtu_disc(daddr
, saddr
, net
, pmtu
, 0);
1665 rt6_do_pmtu_disc(daddr
, saddr
, net
, pmtu
, dev
->ifindex
);
1669 * Misc support functions
1672 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
)
1674 struct net
*net
= dev_net(ort
->rt6i_dev
);
1675 struct rt6_info
*rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1678 rt
->dst
.input
= ort
->dst
.input
;
1679 rt
->dst
.output
= ort
->dst
.output
;
1681 dst_copy_metrics(&rt
->dst
, &ort
->dst
);
1682 rt
->dst
.error
= ort
->dst
.error
;
1683 rt
->dst
.dev
= ort
->dst
.dev
;
1685 dev_hold(rt
->dst
.dev
);
1686 rt
->rt6i_idev
= ort
->rt6i_idev
;
1688 in6_dev_hold(rt
->rt6i_idev
);
1689 rt
->dst
.lastuse
= jiffies
;
1690 rt
->rt6i_expires
= 0;
1692 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
1693 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
1694 rt
->rt6i_metric
= 0;
1696 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
1697 #ifdef CONFIG_IPV6_SUBTREES
1698 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
1700 rt
->rt6i_table
= ort
->rt6i_table
;
1705 #ifdef CONFIG_IPV6_ROUTE_INFO
1706 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
1707 struct in6_addr
*prefix
, int prefixlen
,
1708 struct in6_addr
*gwaddr
, int ifindex
)
1710 struct fib6_node
*fn
;
1711 struct rt6_info
*rt
= NULL
;
1712 struct fib6_table
*table
;
1714 table
= fib6_get_table(net
, RT6_TABLE_INFO
);
1718 write_lock_bh(&table
->tb6_lock
);
1719 fn
= fib6_locate(&table
->tb6_root
, prefix
,prefixlen
, NULL
, 0);
1723 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1724 if (rt
->rt6i_dev
->ifindex
!= ifindex
)
1726 if ((rt
->rt6i_flags
& (RTF_ROUTEINFO
|RTF_GATEWAY
)) != (RTF_ROUTEINFO
|RTF_GATEWAY
))
1728 if (!ipv6_addr_equal(&rt
->rt6i_gateway
, gwaddr
))
1734 write_unlock_bh(&table
->tb6_lock
);
1738 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
1739 struct in6_addr
*prefix
, int prefixlen
,
1740 struct in6_addr
*gwaddr
, int ifindex
,
1743 struct fib6_config cfg
= {
1744 .fc_table
= RT6_TABLE_INFO
,
1745 .fc_metric
= IP6_RT_PRIO_USER
,
1746 .fc_ifindex
= ifindex
,
1747 .fc_dst_len
= prefixlen
,
1748 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_ROUTEINFO
|
1749 RTF_UP
| RTF_PREF(pref
),
1751 .fc_nlinfo
.nlh
= NULL
,
1752 .fc_nlinfo
.nl_net
= net
,
1755 ipv6_addr_copy(&cfg
.fc_dst
, prefix
);
1756 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1758 /* We should treat it as a default route if prefix length is 0. */
1760 cfg
.fc_flags
|= RTF_DEFAULT
;
1762 ip6_route_add(&cfg
);
1764 return rt6_get_route_info(net
, prefix
, prefixlen
, gwaddr
, ifindex
);
1768 struct rt6_info
*rt6_get_dflt_router(struct in6_addr
*addr
, struct net_device
*dev
)
1770 struct rt6_info
*rt
;
1771 struct fib6_table
*table
;
1773 table
= fib6_get_table(dev_net(dev
), RT6_TABLE_DFLT
);
1777 write_lock_bh(&table
->tb6_lock
);
1778 for (rt
= table
->tb6_root
.leaf
; rt
; rt
=rt
->dst
.rt6_next
) {
1779 if (dev
== rt
->rt6i_dev
&&
1780 ((rt
->rt6i_flags
& (RTF_ADDRCONF
| RTF_DEFAULT
)) == (RTF_ADDRCONF
| RTF_DEFAULT
)) &&
1781 ipv6_addr_equal(&rt
->rt6i_gateway
, addr
))
1786 write_unlock_bh(&table
->tb6_lock
);
1790 struct rt6_info
*rt6_add_dflt_router(struct in6_addr
*gwaddr
,
1791 struct net_device
*dev
,
1794 struct fib6_config cfg
= {
1795 .fc_table
= RT6_TABLE_DFLT
,
1796 .fc_metric
= IP6_RT_PRIO_USER
,
1797 .fc_ifindex
= dev
->ifindex
,
1798 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_DEFAULT
|
1799 RTF_UP
| RTF_EXPIRES
| RTF_PREF(pref
),
1801 .fc_nlinfo
.nlh
= NULL
,
1802 .fc_nlinfo
.nl_net
= dev_net(dev
),
1805 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1807 ip6_route_add(&cfg
);
1809 return rt6_get_dflt_router(gwaddr
, dev
);
1812 void rt6_purge_dflt_routers(struct net
*net
)
1814 struct rt6_info
*rt
;
1815 struct fib6_table
*table
;
1817 /* NOTE: Keep consistent with rt6_get_dflt_router */
1818 table
= fib6_get_table(net
, RT6_TABLE_DFLT
);
1823 read_lock_bh(&table
->tb6_lock
);
1824 for (rt
= table
->tb6_root
.leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1825 if (rt
->rt6i_flags
& (RTF_DEFAULT
| RTF_ADDRCONF
)) {
1827 read_unlock_bh(&table
->tb6_lock
);
1832 read_unlock_bh(&table
->tb6_lock
);
1835 static void rtmsg_to_fib6_config(struct net
*net
,
1836 struct in6_rtmsg
*rtmsg
,
1837 struct fib6_config
*cfg
)
1839 memset(cfg
, 0, sizeof(*cfg
));
1841 cfg
->fc_table
= RT6_TABLE_MAIN
;
1842 cfg
->fc_ifindex
= rtmsg
->rtmsg_ifindex
;
1843 cfg
->fc_metric
= rtmsg
->rtmsg_metric
;
1844 cfg
->fc_expires
= rtmsg
->rtmsg_info
;
1845 cfg
->fc_dst_len
= rtmsg
->rtmsg_dst_len
;
1846 cfg
->fc_src_len
= rtmsg
->rtmsg_src_len
;
1847 cfg
->fc_flags
= rtmsg
->rtmsg_flags
;
1849 cfg
->fc_nlinfo
.nl_net
= net
;
1851 ipv6_addr_copy(&cfg
->fc_dst
, &rtmsg
->rtmsg_dst
);
1852 ipv6_addr_copy(&cfg
->fc_src
, &rtmsg
->rtmsg_src
);
1853 ipv6_addr_copy(&cfg
->fc_gateway
, &rtmsg
->rtmsg_gateway
);
1856 int ipv6_route_ioctl(struct net
*net
, unsigned int cmd
, void __user
*arg
)
1858 struct fib6_config cfg
;
1859 struct in6_rtmsg rtmsg
;
1863 case SIOCADDRT
: /* Add a route */
1864 case SIOCDELRT
: /* Delete a route */
1865 if (!capable(CAP_NET_ADMIN
))
1867 err
= copy_from_user(&rtmsg
, arg
,
1868 sizeof(struct in6_rtmsg
));
1872 rtmsg_to_fib6_config(net
, &rtmsg
, &cfg
);
1877 err
= ip6_route_add(&cfg
);
1880 err
= ip6_route_del(&cfg
);
1894 * Drop the packet on the floor
1897 static int ip6_pkt_drop(struct sk_buff
*skb
, u8 code
, int ipstats_mib_noroutes
)
1900 struct dst_entry
*dst
= skb_dst(skb
);
1901 switch (ipstats_mib_noroutes
) {
1902 case IPSTATS_MIB_INNOROUTES
:
1903 type
= ipv6_addr_type(&ipv6_hdr(skb
)->daddr
);
1904 if (type
== IPV6_ADDR_ANY
) {
1905 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1906 IPSTATS_MIB_INADDRERRORS
);
1910 case IPSTATS_MIB_OUTNOROUTES
:
1911 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1912 ipstats_mib_noroutes
);
1915 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, code
, 0);
1920 static int ip6_pkt_discard(struct sk_buff
*skb
)
1922 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_INNOROUTES
);
1925 static int ip6_pkt_discard_out(struct sk_buff
*skb
)
1927 skb
->dev
= skb_dst(skb
)->dev
;
1928 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_OUTNOROUTES
);
1931 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1933 static int ip6_pkt_prohibit(struct sk_buff
*skb
)
1935 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_INNOROUTES
);
1938 static int ip6_pkt_prohibit_out(struct sk_buff
*skb
)
1940 skb
->dev
= skb_dst(skb
)->dev
;
1941 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_OUTNOROUTES
);
1947 * Allocate a dst for local (unicast / anycast) address.
1950 struct rt6_info
*addrconf_dst_alloc(struct inet6_dev
*idev
,
1951 const struct in6_addr
*addr
,
1954 struct net
*net
= dev_net(idev
->dev
);
1955 struct rt6_info
*rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1956 struct neighbour
*neigh
;
1959 if (net_ratelimit())
1960 pr_warning("IPv6: Maximum number of routes reached,"
1961 " consider increasing route/max_size.\n");
1962 return ERR_PTR(-ENOMEM
);
1965 dev_hold(net
->loopback_dev
);
1968 rt
->dst
.flags
= DST_HOST
;
1969 rt
->dst
.input
= ip6_input
;
1970 rt
->dst
.output
= ip6_output
;
1971 rt
->rt6i_dev
= net
->loopback_dev
;
1972 rt
->rt6i_idev
= idev
;
1973 dst_metric_set(&rt
->dst
, RTAX_MTU
, ipv6_get_mtu(rt
->rt6i_dev
));
1974 dst_metric_set(&rt
->dst
, RTAX_ADVMSS
, ipv6_advmss(net
, dst_mtu(&rt
->dst
)));
1975 dst_metric_set(&rt
->dst
, RTAX_HOPLIMIT
, -1);
1976 rt
->dst
.obsolete
= -1;
1978 rt
->rt6i_flags
= RTF_UP
| RTF_NONEXTHOP
;
1980 rt
->rt6i_flags
|= RTF_ANYCAST
;
1982 rt
->rt6i_flags
|= RTF_LOCAL
;
1983 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
1984 if (IS_ERR(neigh
)) {
1987 /* We are casting this because that is the return
1988 * value type. But an errno encoded pointer is the
1989 * same regardless of the underlying pointer type,
1990 * and that's what we are returning. So this is OK.
1992 return (struct rt6_info
*) neigh
;
1994 rt
->rt6i_nexthop
= neigh
;
1996 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, addr
);
1997 rt
->rt6i_dst
.plen
= 128;
1998 rt
->rt6i_table
= fib6_get_table(net
, RT6_TABLE_LOCAL
);
2000 atomic_set(&rt
->dst
.__refcnt
, 1);
2005 struct arg_dev_net
{
2006 struct net_device
*dev
;
2010 static int fib6_ifdown(struct rt6_info
*rt
, void *arg
)
2012 struct net_device
*dev
= ((struct arg_dev_net
*)arg
)->dev
;
2013 struct net
*net
= ((struct arg_dev_net
*)arg
)->net
;
2015 if (((void *)rt
->rt6i_dev
== dev
|| dev
== NULL
) &&
2016 rt
!= net
->ipv6
.ip6_null_entry
) {
2017 RT6_TRACE("deleted by ifdown %p\n", rt
);
2023 void rt6_ifdown(struct net
*net
, struct net_device
*dev
)
2025 struct arg_dev_net adn
= {
2030 fib6_clean_all(net
, fib6_ifdown
, 0, &adn
);
2031 icmp6_clean_all(fib6_ifdown
, &adn
);
2034 struct rt6_mtu_change_arg
2036 struct net_device
*dev
;
2040 static int rt6_mtu_change_route(struct rt6_info
*rt
, void *p_arg
)
2042 struct rt6_mtu_change_arg
*arg
= (struct rt6_mtu_change_arg
*) p_arg
;
2043 struct inet6_dev
*idev
;
2044 struct net
*net
= dev_net(arg
->dev
);
2046 /* In IPv6 pmtu discovery is not optional,
2047 so that RTAX_MTU lock cannot disable it.
2048 We still use this lock to block changes
2049 caused by addrconf/ndisc.
2052 idev
= __in6_dev_get(arg
->dev
);
2056 /* For administrative MTU increase, there is no way to discover
2057 IPv6 PMTU increase, so PMTU increase should be updated here.
2058 Since RFC 1981 doesn't include administrative MTU increase
2059 update PMTU increase is a MUST. (i.e. jumbo frame)
2062 If new MTU is less than route PMTU, this new MTU will be the
2063 lowest MTU in the path, update the route PMTU to reflect PMTU
2064 decreases; if new MTU is greater than route PMTU, and the
2065 old MTU is the lowest MTU in the path, update the route PMTU
2066 to reflect the increase. In this case if the other nodes' MTU
2067 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2070 if (rt
->rt6i_dev
== arg
->dev
&&
2071 !dst_metric_locked(&rt
->dst
, RTAX_MTU
) &&
2072 (dst_mtu(&rt
->dst
) >= arg
->mtu
||
2073 (dst_mtu(&rt
->dst
) < arg
->mtu
&&
2074 dst_mtu(&rt
->dst
) == idev
->cnf
.mtu6
))) {
2075 dst_metric_set(&rt
->dst
, RTAX_MTU
, arg
->mtu
);
2076 dst_metric_set(&rt
->dst
, RTAX_ADVMSS
, ipv6_advmss(net
, arg
->mtu
));
2081 void rt6_mtu_change(struct net_device
*dev
, unsigned mtu
)
2083 struct rt6_mtu_change_arg arg
= {
2088 fib6_clean_all(dev_net(dev
), rt6_mtu_change_route
, 0, &arg
);
2091 static const struct nla_policy rtm_ipv6_policy
[RTA_MAX
+1] = {
2092 [RTA_GATEWAY
] = { .len
= sizeof(struct in6_addr
) },
2093 [RTA_OIF
] = { .type
= NLA_U32
},
2094 [RTA_IIF
] = { .type
= NLA_U32
},
2095 [RTA_PRIORITY
] = { .type
= NLA_U32
},
2096 [RTA_METRICS
] = { .type
= NLA_NESTED
},
2099 static int rtm_to_fib6_config(struct sk_buff
*skb
, struct nlmsghdr
*nlh
,
2100 struct fib6_config
*cfg
)
2103 struct nlattr
*tb
[RTA_MAX
+1];
2106 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2111 rtm
= nlmsg_data(nlh
);
2112 memset(cfg
, 0, sizeof(*cfg
));
2114 cfg
->fc_table
= rtm
->rtm_table
;
2115 cfg
->fc_dst_len
= rtm
->rtm_dst_len
;
2116 cfg
->fc_src_len
= rtm
->rtm_src_len
;
2117 cfg
->fc_flags
= RTF_UP
;
2118 cfg
->fc_protocol
= rtm
->rtm_protocol
;
2120 if (rtm
->rtm_type
== RTN_UNREACHABLE
)
2121 cfg
->fc_flags
|= RTF_REJECT
;
2123 if (rtm
->rtm_type
== RTN_LOCAL
)
2124 cfg
->fc_flags
|= RTF_LOCAL
;
2126 cfg
->fc_nlinfo
.pid
= NETLINK_CB(skb
).pid
;
2127 cfg
->fc_nlinfo
.nlh
= nlh
;
2128 cfg
->fc_nlinfo
.nl_net
= sock_net(skb
->sk
);
2130 if (tb
[RTA_GATEWAY
]) {
2131 nla_memcpy(&cfg
->fc_gateway
, tb
[RTA_GATEWAY
], 16);
2132 cfg
->fc_flags
|= RTF_GATEWAY
;
2136 int plen
= (rtm
->rtm_dst_len
+ 7) >> 3;
2138 if (nla_len(tb
[RTA_DST
]) < plen
)
2141 nla_memcpy(&cfg
->fc_dst
, tb
[RTA_DST
], plen
);
2145 int plen
= (rtm
->rtm_src_len
+ 7) >> 3;
2147 if (nla_len(tb
[RTA_SRC
]) < plen
)
2150 nla_memcpy(&cfg
->fc_src
, tb
[RTA_SRC
], plen
);
2154 cfg
->fc_ifindex
= nla_get_u32(tb
[RTA_OIF
]);
2156 if (tb
[RTA_PRIORITY
])
2157 cfg
->fc_metric
= nla_get_u32(tb
[RTA_PRIORITY
]);
2159 if (tb
[RTA_METRICS
]) {
2160 cfg
->fc_mx
= nla_data(tb
[RTA_METRICS
]);
2161 cfg
->fc_mx_len
= nla_len(tb
[RTA_METRICS
]);
2165 cfg
->fc_table
= nla_get_u32(tb
[RTA_TABLE
]);
2172 static int inet6_rtm_delroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2174 struct fib6_config cfg
;
2177 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2181 return ip6_route_del(&cfg
);
2184 static int inet6_rtm_newroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2186 struct fib6_config cfg
;
2189 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2193 return ip6_route_add(&cfg
);
2196 static inline size_t rt6_nlmsg_size(void)
2198 return NLMSG_ALIGN(sizeof(struct rtmsg
))
2199 + nla_total_size(16) /* RTA_SRC */
2200 + nla_total_size(16) /* RTA_DST */
2201 + nla_total_size(16) /* RTA_GATEWAY */
2202 + nla_total_size(16) /* RTA_PREFSRC */
2203 + nla_total_size(4) /* RTA_TABLE */
2204 + nla_total_size(4) /* RTA_IIF */
2205 + nla_total_size(4) /* RTA_OIF */
2206 + nla_total_size(4) /* RTA_PRIORITY */
2207 + RTAX_MAX
* nla_total_size(4) /* RTA_METRICS */
2208 + nla_total_size(sizeof(struct rta_cacheinfo
));
2211 static int rt6_fill_node(struct net
*net
,
2212 struct sk_buff
*skb
, struct rt6_info
*rt
,
2213 struct in6_addr
*dst
, struct in6_addr
*src
,
2214 int iif
, int type
, u32 pid
, u32 seq
,
2215 int prefix
, int nowait
, unsigned int flags
)
2218 struct nlmsghdr
*nlh
;
2222 if (prefix
) { /* user wants prefix routes only */
2223 if (!(rt
->rt6i_flags
& RTF_PREFIX_RT
)) {
2224 /* success since this is not a prefix route */
2229 nlh
= nlmsg_put(skb
, pid
, seq
, type
, sizeof(*rtm
), flags
);
2233 rtm
= nlmsg_data(nlh
);
2234 rtm
->rtm_family
= AF_INET6
;
2235 rtm
->rtm_dst_len
= rt
->rt6i_dst
.plen
;
2236 rtm
->rtm_src_len
= rt
->rt6i_src
.plen
;
2239 table
= rt
->rt6i_table
->tb6_id
;
2241 table
= RT6_TABLE_UNSPEC
;
2242 rtm
->rtm_table
= table
;
2243 NLA_PUT_U32(skb
, RTA_TABLE
, table
);
2244 if (rt
->rt6i_flags
&RTF_REJECT
)
2245 rtm
->rtm_type
= RTN_UNREACHABLE
;
2246 else if (rt
->rt6i_flags
&RTF_LOCAL
)
2247 rtm
->rtm_type
= RTN_LOCAL
;
2248 else if (rt
->rt6i_dev
&& (rt
->rt6i_dev
->flags
&IFF_LOOPBACK
))
2249 rtm
->rtm_type
= RTN_LOCAL
;
2251 rtm
->rtm_type
= RTN_UNICAST
;
2253 rtm
->rtm_scope
= RT_SCOPE_UNIVERSE
;
2254 rtm
->rtm_protocol
= rt
->rt6i_protocol
;
2255 if (rt
->rt6i_flags
&RTF_DYNAMIC
)
2256 rtm
->rtm_protocol
= RTPROT_REDIRECT
;
2257 else if (rt
->rt6i_flags
& RTF_ADDRCONF
)
2258 rtm
->rtm_protocol
= RTPROT_KERNEL
;
2259 else if (rt
->rt6i_flags
&RTF_DEFAULT
)
2260 rtm
->rtm_protocol
= RTPROT_RA
;
2262 if (rt
->rt6i_flags
&RTF_CACHE
)
2263 rtm
->rtm_flags
|= RTM_F_CLONED
;
2266 NLA_PUT(skb
, RTA_DST
, 16, dst
);
2267 rtm
->rtm_dst_len
= 128;
2268 } else if (rtm
->rtm_dst_len
)
2269 NLA_PUT(skb
, RTA_DST
, 16, &rt
->rt6i_dst
.addr
);
2270 #ifdef CONFIG_IPV6_SUBTREES
2272 NLA_PUT(skb
, RTA_SRC
, 16, src
);
2273 rtm
->rtm_src_len
= 128;
2274 } else if (rtm
->rtm_src_len
)
2275 NLA_PUT(skb
, RTA_SRC
, 16, &rt
->rt6i_src
.addr
);
2278 #ifdef CONFIG_IPV6_MROUTE
2279 if (ipv6_addr_is_multicast(&rt
->rt6i_dst
.addr
)) {
2280 int err
= ip6mr_get_route(net
, skb
, rtm
, nowait
);
2285 goto nla_put_failure
;
2287 if (err
== -EMSGSIZE
)
2288 goto nla_put_failure
;
2293 NLA_PUT_U32(skb
, RTA_IIF
, iif
);
2295 struct inet6_dev
*idev
= ip6_dst_idev(&rt
->dst
);
2296 struct in6_addr saddr_buf
;
2297 if (ipv6_dev_get_saddr(net
, idev
? idev
->dev
: NULL
,
2298 dst
, 0, &saddr_buf
) == 0)
2299 NLA_PUT(skb
, RTA_PREFSRC
, 16, &saddr_buf
);
2302 if (rtnetlink_put_metrics(skb
, dst_metrics_ptr(&rt
->dst
)) < 0)
2303 goto nla_put_failure
;
2305 if (rt
->dst
.neighbour
)
2306 NLA_PUT(skb
, RTA_GATEWAY
, 16, &rt
->dst
.neighbour
->primary_key
);
2309 NLA_PUT_U32(skb
, RTA_OIF
, rt
->rt6i_dev
->ifindex
);
2311 NLA_PUT_U32(skb
, RTA_PRIORITY
, rt
->rt6i_metric
);
2313 if (!(rt
->rt6i_flags
& RTF_EXPIRES
))
2315 else if (rt
->rt6i_expires
- jiffies
< INT_MAX
)
2316 expires
= rt
->rt6i_expires
- jiffies
;
2320 if (rtnl_put_cacheinfo(skb
, &rt
->dst
, 0, 0, 0,
2321 expires
, rt
->dst
.error
) < 0)
2322 goto nla_put_failure
;
2324 return nlmsg_end(skb
, nlh
);
2327 nlmsg_cancel(skb
, nlh
);
2331 int rt6_dump_route(struct rt6_info
*rt
, void *p_arg
)
2333 struct rt6_rtnl_dump_arg
*arg
= (struct rt6_rtnl_dump_arg
*) p_arg
;
2336 if (nlmsg_len(arg
->cb
->nlh
) >= sizeof(struct rtmsg
)) {
2337 struct rtmsg
*rtm
= nlmsg_data(arg
->cb
->nlh
);
2338 prefix
= (rtm
->rtm_flags
& RTM_F_PREFIX
) != 0;
2342 return rt6_fill_node(arg
->net
,
2343 arg
->skb
, rt
, NULL
, NULL
, 0, RTM_NEWROUTE
,
2344 NETLINK_CB(arg
->cb
->skb
).pid
, arg
->cb
->nlh
->nlmsg_seq
,
2345 prefix
, 0, NLM_F_MULTI
);
2348 static int inet6_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
* nlh
, void *arg
)
2350 struct net
*net
= sock_net(in_skb
->sk
);
2351 struct nlattr
*tb
[RTA_MAX
+1];
2352 struct rt6_info
*rt
;
2353 struct sk_buff
*skb
;
2358 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2363 memset(&fl
, 0, sizeof(fl
));
2366 if (nla_len(tb
[RTA_SRC
]) < sizeof(struct in6_addr
))
2369 ipv6_addr_copy(&fl
.fl6_src
, nla_data(tb
[RTA_SRC
]));
2373 if (nla_len(tb
[RTA_DST
]) < sizeof(struct in6_addr
))
2376 ipv6_addr_copy(&fl
.fl6_dst
, nla_data(tb
[RTA_DST
]));
2380 iif
= nla_get_u32(tb
[RTA_IIF
]);
2383 fl
.oif
= nla_get_u32(tb
[RTA_OIF
]);
2386 struct net_device
*dev
;
2387 dev
= __dev_get_by_index(net
, iif
);
2394 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
2400 /* Reserve room for dummy headers, this skb can pass
2401 through good chunk of routing engine.
2403 skb_reset_mac_header(skb
);
2404 skb_reserve(skb
, MAX_HEADER
+ sizeof(struct ipv6hdr
));
2406 rt
= (struct rt6_info
*) ip6_route_output(net
, NULL
, &fl
);
2407 skb_dst_set(skb
, &rt
->dst
);
2409 err
= rt6_fill_node(net
, skb
, rt
, &fl
.fl6_dst
, &fl
.fl6_src
, iif
,
2410 RTM_NEWROUTE
, NETLINK_CB(in_skb
).pid
,
2411 nlh
->nlmsg_seq
, 0, 0, 0);
2417 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).pid
);
2422 void inet6_rt_notify(int event
, struct rt6_info
*rt
, struct nl_info
*info
)
2424 struct sk_buff
*skb
;
2425 struct net
*net
= info
->nl_net
;
2430 seq
= info
->nlh
!= NULL
? info
->nlh
->nlmsg_seq
: 0;
2432 skb
= nlmsg_new(rt6_nlmsg_size(), gfp_any());
2436 err
= rt6_fill_node(net
, skb
, rt
, NULL
, NULL
, 0,
2437 event
, info
->pid
, seq
, 0, 0, 0);
2439 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2440 WARN_ON(err
== -EMSGSIZE
);
2444 rtnl_notify(skb
, net
, info
->pid
, RTNLGRP_IPV6_ROUTE
,
2445 info
->nlh
, gfp_any());
2449 rtnl_set_sk_err(net
, RTNLGRP_IPV6_ROUTE
, err
);
2452 static int ip6_route_dev_notify(struct notifier_block
*this,
2453 unsigned long event
, void *data
)
2455 struct net_device
*dev
= (struct net_device
*)data
;
2456 struct net
*net
= dev_net(dev
);
2458 if (event
== NETDEV_REGISTER
&& (dev
->flags
& IFF_LOOPBACK
)) {
2459 net
->ipv6
.ip6_null_entry
->dst
.dev
= dev
;
2460 net
->ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(dev
);
2461 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2462 net
->ipv6
.ip6_prohibit_entry
->dst
.dev
= dev
;
2463 net
->ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(dev
);
2464 net
->ipv6
.ip6_blk_hole_entry
->dst
.dev
= dev
;
2465 net
->ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(dev
);
2476 #ifdef CONFIG_PROC_FS
2487 static int rt6_info_route(struct rt6_info
*rt
, void *p_arg
)
2489 struct seq_file
*m
= p_arg
;
2491 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_dst
.addr
, rt
->rt6i_dst
.plen
);
2493 #ifdef CONFIG_IPV6_SUBTREES
2494 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_src
.addr
, rt
->rt6i_src
.plen
);
2496 seq_puts(m
, "00000000000000000000000000000000 00 ");
2499 if (rt
->rt6i_nexthop
) {
2500 seq_printf(m
, "%pi6", rt
->rt6i_nexthop
->primary_key
);
2502 seq_puts(m
, "00000000000000000000000000000000");
2504 seq_printf(m
, " %08x %08x %08x %08x %8s\n",
2505 rt
->rt6i_metric
, atomic_read(&rt
->dst
.__refcnt
),
2506 rt
->dst
.__use
, rt
->rt6i_flags
,
2507 rt
->rt6i_dev
? rt
->rt6i_dev
->name
: "");
2511 static int ipv6_route_show(struct seq_file
*m
, void *v
)
2513 struct net
*net
= (struct net
*)m
->private;
2514 fib6_clean_all(net
, rt6_info_route
, 0, m
);
2518 static int ipv6_route_open(struct inode
*inode
, struct file
*file
)
2520 return single_open_net(inode
, file
, ipv6_route_show
);
2523 static const struct file_operations ipv6_route_proc_fops
= {
2524 .owner
= THIS_MODULE
,
2525 .open
= ipv6_route_open
,
2527 .llseek
= seq_lseek
,
2528 .release
= single_release_net
,
2531 static int rt6_stats_seq_show(struct seq_file
*seq
, void *v
)
2533 struct net
*net
= (struct net
*)seq
->private;
2534 seq_printf(seq
, "%04x %04x %04x %04x %04x %04x %04x\n",
2535 net
->ipv6
.rt6_stats
->fib_nodes
,
2536 net
->ipv6
.rt6_stats
->fib_route_nodes
,
2537 net
->ipv6
.rt6_stats
->fib_rt_alloc
,
2538 net
->ipv6
.rt6_stats
->fib_rt_entries
,
2539 net
->ipv6
.rt6_stats
->fib_rt_cache
,
2540 dst_entries_get_slow(&net
->ipv6
.ip6_dst_ops
),
2541 net
->ipv6
.rt6_stats
->fib_discarded_routes
);
2546 static int rt6_stats_seq_open(struct inode
*inode
, struct file
*file
)
2548 return single_open_net(inode
, file
, rt6_stats_seq_show
);
2551 static const struct file_operations rt6_stats_seq_fops
= {
2552 .owner
= THIS_MODULE
,
2553 .open
= rt6_stats_seq_open
,
2555 .llseek
= seq_lseek
,
2556 .release
= single_release_net
,
2558 #endif /* CONFIG_PROC_FS */
2560 #ifdef CONFIG_SYSCTL
2563 int ipv6_sysctl_rtcache_flush(ctl_table
*ctl
, int write
,
2564 void __user
*buffer
, size_t *lenp
, loff_t
*ppos
)
2566 struct net
*net
= current
->nsproxy
->net_ns
;
2567 int delay
= net
->ipv6
.sysctl
.flush_delay
;
2569 proc_dointvec(ctl
, write
, buffer
, lenp
, ppos
);
2570 fib6_run_gc(delay
<= 0 ? ~0UL : (unsigned long)delay
, net
);
2576 ctl_table ipv6_route_table_template
[] = {
2578 .procname
= "flush",
2579 .data
= &init_net
.ipv6
.sysctl
.flush_delay
,
2580 .maxlen
= sizeof(int),
2582 .proc_handler
= ipv6_sysctl_rtcache_flush
2585 .procname
= "gc_thresh",
2586 .data
= &ip6_dst_ops_template
.gc_thresh
,
2587 .maxlen
= sizeof(int),
2589 .proc_handler
= proc_dointvec
,
2592 .procname
= "max_size",
2593 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_max_size
,
2594 .maxlen
= sizeof(int),
2596 .proc_handler
= proc_dointvec
,
2599 .procname
= "gc_min_interval",
2600 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2601 .maxlen
= sizeof(int),
2603 .proc_handler
= proc_dointvec_jiffies
,
2606 .procname
= "gc_timeout",
2607 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_timeout
,
2608 .maxlen
= sizeof(int),
2610 .proc_handler
= proc_dointvec_jiffies
,
2613 .procname
= "gc_interval",
2614 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_interval
,
2615 .maxlen
= sizeof(int),
2617 .proc_handler
= proc_dointvec_jiffies
,
2620 .procname
= "gc_elasticity",
2621 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_elasticity
,
2622 .maxlen
= sizeof(int),
2624 .proc_handler
= proc_dointvec
,
2627 .procname
= "mtu_expires",
2628 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_mtu_expires
,
2629 .maxlen
= sizeof(int),
2631 .proc_handler
= proc_dointvec_jiffies
,
2634 .procname
= "min_adv_mss",
2635 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_min_advmss
,
2636 .maxlen
= sizeof(int),
2638 .proc_handler
= proc_dointvec
,
2641 .procname
= "gc_min_interval_ms",
2642 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2643 .maxlen
= sizeof(int),
2645 .proc_handler
= proc_dointvec_ms_jiffies
,
2650 struct ctl_table
* __net_init
ipv6_route_sysctl_init(struct net
*net
)
2652 struct ctl_table
*table
;
2654 table
= kmemdup(ipv6_route_table_template
,
2655 sizeof(ipv6_route_table_template
),
2659 table
[0].data
= &net
->ipv6
.sysctl
.flush_delay
;
2660 table
[1].data
= &net
->ipv6
.ip6_dst_ops
.gc_thresh
;
2661 table
[2].data
= &net
->ipv6
.sysctl
.ip6_rt_max_size
;
2662 table
[3].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
2663 table
[4].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
2664 table
[5].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_interval
;
2665 table
[6].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
2666 table
[7].data
= &net
->ipv6
.sysctl
.ip6_rt_mtu_expires
;
2667 table
[8].data
= &net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
2668 table
[9].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
2675 static int __net_init
ip6_route_net_init(struct net
*net
)
2679 memcpy(&net
->ipv6
.ip6_dst_ops
, &ip6_dst_ops_template
,
2680 sizeof(net
->ipv6
.ip6_dst_ops
));
2682 if (dst_entries_init(&net
->ipv6
.ip6_dst_ops
) < 0)
2683 goto out_ip6_dst_ops
;
2685 net
->ipv6
.ip6_null_entry
= kmemdup(&ip6_null_entry_template
,
2686 sizeof(*net
->ipv6
.ip6_null_entry
),
2688 if (!net
->ipv6
.ip6_null_entry
)
2689 goto out_ip6_dst_entries
;
2690 net
->ipv6
.ip6_null_entry
->dst
.path
=
2691 (struct dst_entry
*)net
->ipv6
.ip6_null_entry
;
2692 net
->ipv6
.ip6_null_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2693 dst_metric_set(&net
->ipv6
.ip6_null_entry
->dst
, RTAX_HOPLIMIT
, 255);
2695 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2696 net
->ipv6
.ip6_prohibit_entry
= kmemdup(&ip6_prohibit_entry_template
,
2697 sizeof(*net
->ipv6
.ip6_prohibit_entry
),
2699 if (!net
->ipv6
.ip6_prohibit_entry
)
2700 goto out_ip6_null_entry
;
2701 net
->ipv6
.ip6_prohibit_entry
->dst
.path
=
2702 (struct dst_entry
*)net
->ipv6
.ip6_prohibit_entry
;
2703 net
->ipv6
.ip6_prohibit_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2704 dst_metric_set(&net
->ipv6
.ip6_prohibit_entry
->dst
, RTAX_HOPLIMIT
, 255);
2706 net
->ipv6
.ip6_blk_hole_entry
= kmemdup(&ip6_blk_hole_entry_template
,
2707 sizeof(*net
->ipv6
.ip6_blk_hole_entry
),
2709 if (!net
->ipv6
.ip6_blk_hole_entry
)
2710 goto out_ip6_prohibit_entry
;
2711 net
->ipv6
.ip6_blk_hole_entry
->dst
.path
=
2712 (struct dst_entry
*)net
->ipv6
.ip6_blk_hole_entry
;
2713 net
->ipv6
.ip6_blk_hole_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2714 dst_metric_set(&net
->ipv6
.ip6_blk_hole_entry
->dst
, RTAX_HOPLIMIT
, 255);
2717 net
->ipv6
.sysctl
.flush_delay
= 0;
2718 net
->ipv6
.sysctl
.ip6_rt_max_size
= 4096;
2719 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= HZ
/ 2;
2720 net
->ipv6
.sysctl
.ip6_rt_gc_timeout
= 60*HZ
;
2721 net
->ipv6
.sysctl
.ip6_rt_gc_interval
= 30*HZ
;
2722 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 9;
2723 net
->ipv6
.sysctl
.ip6_rt_mtu_expires
= 10*60*HZ
;
2724 net
->ipv6
.sysctl
.ip6_rt_min_advmss
= IPV6_MIN_MTU
- 20 - 40;
2726 #ifdef CONFIG_PROC_FS
2727 proc_net_fops_create(net
, "ipv6_route", 0, &ipv6_route_proc_fops
);
2728 proc_net_fops_create(net
, "rt6_stats", S_IRUGO
, &rt6_stats_seq_fops
);
2730 net
->ipv6
.ip6_rt_gc_expire
= 30*HZ
;
2736 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2737 out_ip6_prohibit_entry
:
2738 kfree(net
->ipv6
.ip6_prohibit_entry
);
2740 kfree(net
->ipv6
.ip6_null_entry
);
2742 out_ip6_dst_entries
:
2743 dst_entries_destroy(&net
->ipv6
.ip6_dst_ops
);
2748 static void __net_exit
ip6_route_net_exit(struct net
*net
)
2750 #ifdef CONFIG_PROC_FS
2751 proc_net_remove(net
, "ipv6_route");
2752 proc_net_remove(net
, "rt6_stats");
2754 kfree(net
->ipv6
.ip6_null_entry
);
2755 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2756 kfree(net
->ipv6
.ip6_prohibit_entry
);
2757 kfree(net
->ipv6
.ip6_blk_hole_entry
);
2759 dst_entries_destroy(&net
->ipv6
.ip6_dst_ops
);
2762 static struct pernet_operations ip6_route_net_ops
= {
2763 .init
= ip6_route_net_init
,
2764 .exit
= ip6_route_net_exit
,
2767 static struct notifier_block ip6_route_dev_notifier
= {
2768 .notifier_call
= ip6_route_dev_notify
,
2772 int __init
ip6_route_init(void)
2777 ip6_dst_ops_template
.kmem_cachep
=
2778 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info
), 0,
2779 SLAB_HWCACHE_ALIGN
, NULL
);
2780 if (!ip6_dst_ops_template
.kmem_cachep
)
2783 ret
= dst_entries_init(&ip6_dst_blackhole_ops
);
2785 goto out_kmem_cache
;
2787 ret
= register_pernet_subsys(&ip6_route_net_ops
);
2789 goto out_dst_entries
;
2791 ip6_dst_blackhole_ops
.kmem_cachep
= ip6_dst_ops_template
.kmem_cachep
;
2793 /* Registering of the loopback is done before this portion of code,
2794 * the loopback reference in rt6_info will not be taken, do it
2795 * manually for init_net */
2796 init_net
.ipv6
.ip6_null_entry
->dst
.dev
= init_net
.loopback_dev
;
2797 init_net
.ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2798 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2799 init_net
.ipv6
.ip6_prohibit_entry
->dst
.dev
= init_net
.loopback_dev
;
2800 init_net
.ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2801 init_net
.ipv6
.ip6_blk_hole_entry
->dst
.dev
= init_net
.loopback_dev
;
2802 init_net
.ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2806 goto out_register_subsys
;
2812 ret
= fib6_rules_init();
2817 if (__rtnl_register(PF_INET6
, RTM_NEWROUTE
, inet6_rtm_newroute
, NULL
) ||
2818 __rtnl_register(PF_INET6
, RTM_DELROUTE
, inet6_rtm_delroute
, NULL
) ||
2819 __rtnl_register(PF_INET6
, RTM_GETROUTE
, inet6_rtm_getroute
, NULL
))
2820 goto fib6_rules_init
;
2822 ret
= register_netdevice_notifier(&ip6_route_dev_notifier
);
2824 goto fib6_rules_init
;
2830 fib6_rules_cleanup();
2835 out_register_subsys
:
2836 unregister_pernet_subsys(&ip6_route_net_ops
);
2838 dst_entries_destroy(&ip6_dst_blackhole_ops
);
2840 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);
2844 void ip6_route_cleanup(void)
2846 unregister_netdevice_notifier(&ip6_route_dev_notifier
);
2847 fib6_rules_cleanup();
2850 unregister_pernet_subsys(&ip6_route_net_ops
);
2851 dst_entries_destroy(&ip6_dst_blackhole_ops
);
2852 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);