c3d204973dbc4779520d70ea8e703f218564510b
2 * ip_vs_xmit.c: various packet transmitters for IPVS
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
14 * Description of forwarding methods:
15 * - all transmitters are called from LOCAL_IN (remote clients) and
16 * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
17 * - not all connections have destination server, for example,
18 * connections in backup server when fwmark is used
19 * - bypass connections use daddr from packet
20 * - we can use dst without ref while sending in RCU section, we use
21 * ref when returning NF_ACCEPT for NAT-ed packet via loopback
23 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
24 * - skb->pkt_type is not set yet
25 * - the only place where we can see skb->sk != NULL
28 #define KMSG_COMPONENT "IPVS"
29 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
31 #include <linux/kernel.h>
32 #include <linux/slab.h>
33 #include <linux/tcp.h> /* for tcphdr */
35 #include <net/tcp.h> /* for csum_tcpudp_magic */
37 #include <net/icmp.h> /* for icmp_send */
38 #include <net/route.h> /* for ip_route_output */
40 #include <net/ip6_route.h>
41 #include <net/addrconf.h>
42 #include <linux/icmpv6.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv4.h>
46 #include <net/ip_vs.h>
49 IP_VS_RT_MODE_LOCAL
= 1, /* Allow local dest */
50 IP_VS_RT_MODE_NON_LOCAL
= 2, /* Allow non-local dest */
51 IP_VS_RT_MODE_RDR
= 4, /* Allow redirect from remote daddr to
54 IP_VS_RT_MODE_CONNECT
= 8, /* Always bind route to saddr */
55 IP_VS_RT_MODE_KNOWN_NH
= 16,/* Route via remote addr */
56 IP_VS_RT_MODE_TUNNEL
= 32,/* Tunnel mode */
59 static inline struct ip_vs_dest_dst
*ip_vs_dest_dst_alloc(void)
61 return kmalloc(sizeof(struct ip_vs_dest_dst
), GFP_ATOMIC
);
64 static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst
*dest_dst
)
70 * Destination cache to speed up outgoing route lookup
73 __ip_vs_dst_set(struct ip_vs_dest
*dest
, struct ip_vs_dest_dst
*dest_dst
,
74 struct dst_entry
*dst
, u32 dst_cookie
)
76 struct ip_vs_dest_dst
*old
;
78 old
= rcu_dereference_protected(dest
->dest_dst
,
79 lockdep_is_held(&dest
->dst_lock
));
82 dest_dst
->dst_cache
= dst
;
83 dest_dst
->dst_cookie
= dst_cookie
;
85 rcu_assign_pointer(dest
->dest_dst
, dest_dst
);
88 call_rcu(&old
->rcu_head
, ip_vs_dest_dst_rcu_free
);
91 static inline struct ip_vs_dest_dst
*
92 __ip_vs_dst_check(struct ip_vs_dest
*dest
)
94 struct ip_vs_dest_dst
*dest_dst
= rcu_dereference(dest
->dest_dst
);
95 struct dst_entry
*dst
;
99 dst
= dest_dst
->dst_cache
;
101 dst
->ops
->check(dst
, dest_dst
->dst_cookie
) == NULL
)
107 __mtu_check_toobig_v6(const struct sk_buff
*skb
, u32 mtu
)
109 if (IP6CB(skb
)->frag_max_size
) {
110 /* frag_max_size tells us that this packet has been
111 * defragmented by netfilter IPv6 conntrack module.
113 if (IP6CB(skb
)->frag_max_size
> mtu
)
114 return true; /* largest fragment violate MTU */
116 else if (skb
->len
> mtu
&& !skb_is_gso(skb
)) {
117 return true; /* Packet size violate MTU size */
122 /* Get route to daddr, update *saddr, optionally bind route to saddr */
123 static struct rtable
*do_output_route4(struct net
*net
, __be32 daddr
,
124 int rt_mode
, __be32
*saddr
)
130 memset(&fl4
, 0, sizeof(fl4
));
132 fl4
.flowi4_flags
= (rt_mode
& IP_VS_RT_MODE_KNOWN_NH
) ?
133 FLOWI_FLAG_KNOWN_NH
: 0;
136 rt
= ip_route_output_key(net
, &fl4
);
138 /* Invalid saddr ? */
139 if (PTR_ERR(rt
) == -EINVAL
&& *saddr
&&
140 rt_mode
& IP_VS_RT_MODE_CONNECT
&& !loop
) {
142 flowi4_update_output(&fl4
, 0, 0, daddr
, 0);
145 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr
);
147 } else if (!*saddr
&& rt_mode
& IP_VS_RT_MODE_CONNECT
&& fl4
.saddr
) {
150 flowi4_update_output(&fl4
, 0, 0, daddr
, fl4
.saddr
);
158 /* Get route to destination or remote server */
160 __ip_vs_get_out_rt(struct sk_buff
*skb
, struct ip_vs_dest
*dest
,
161 __be32 daddr
, int rt_mode
, __be32
*ret_saddr
)
163 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
164 struct netns_ipvs
*ipvs
= net_ipvs(net
);
165 struct ip_vs_dest_dst
*dest_dst
;
166 struct rtable
*rt
; /* Route to the other host */
167 struct rtable
*ort
; /* Original route */
171 int local
, noref
= 1;
174 dest_dst
= __ip_vs_dst_check(dest
);
175 if (likely(dest_dst
))
176 rt
= (struct rtable
*) dest_dst
->dst_cache
;
178 dest_dst
= ip_vs_dest_dst_alloc();
179 spin_lock_bh(&dest
->dst_lock
);
181 __ip_vs_dst_set(dest
, NULL
, NULL
, 0);
182 spin_unlock_bh(&dest
->dst_lock
);
185 rt
= do_output_route4(net
, dest
->addr
.ip
, rt_mode
,
186 &dest_dst
->dst_saddr
.ip
);
188 __ip_vs_dst_set(dest
, NULL
, NULL
, 0);
189 spin_unlock_bh(&dest
->dst_lock
);
190 ip_vs_dest_dst_free(dest_dst
);
193 __ip_vs_dst_set(dest
, dest_dst
, &rt
->dst
, 0);
194 spin_unlock_bh(&dest
->dst_lock
);
195 IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
196 &dest
->addr
.ip
, &dest_dst
->dst_saddr
.ip
,
197 atomic_read(&rt
->dst
.__refcnt
));
199 daddr
= dest
->addr
.ip
;
201 *ret_saddr
= dest_dst
->dst_saddr
.ip
;
203 __be32 saddr
= htonl(INADDR_ANY
);
207 /* For such unconfigured boxes avoid many route lookups
208 * for performance reasons because we do not remember saddr
210 rt_mode
&= ~IP_VS_RT_MODE_CONNECT
;
211 rt
= do_output_route4(net
, daddr
, rt_mode
, &saddr
);
218 local
= (rt
->rt_flags
& RTCF_LOCAL
) ? 1 : 0;
219 if (!((local
? IP_VS_RT_MODE_LOCAL
: IP_VS_RT_MODE_NON_LOCAL
) &
221 IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
222 (rt
->rt_flags
& RTCF_LOCAL
) ?
223 "local":"non-local", &daddr
);
227 if (likely(!local
)) {
228 if (unlikely(ipv4_is_loopback(iph
->saddr
))) {
229 IP_VS_DBG_RL("Stopping traffic from loopback address "
230 "%pI4 to non-local address, dest: %pI4\n",
231 &iph
->saddr
, &daddr
);
235 ort
= skb_rtable(skb
);
236 if (!(rt_mode
& IP_VS_RT_MODE_RDR
) &&
237 !(ort
->rt_flags
& RTCF_LOCAL
)) {
238 IP_VS_DBG_RL("Redirect from non-local address %pI4 to "
239 "local requires NAT method, dest: %pI4\n",
240 &iph
->daddr
, &daddr
);
243 /* skb to local stack, preserve old route */
249 if (likely(!(rt_mode
& IP_VS_RT_MODE_TUNNEL
))) {
250 mtu
= dst_mtu(&rt
->dst
);
251 df
= iph
->frag_off
& htons(IP_DF
);
253 struct sock
*sk
= skb
->sk
;
255 mtu
= dst_mtu(&rt
->dst
) - sizeof(struct iphdr
);
257 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__
);
260 ort
= skb_rtable(skb
);
261 if (!skb
->dev
&& sk
&& sk
->sk_state
!= TCP_TIME_WAIT
)
262 ort
->dst
.ops
->update_pmtu(&ort
->dst
, sk
, NULL
, mtu
);
263 /* MTU check allowed? */
264 df
= sysctl_pmtu_disc(ipvs
) ? iph
->frag_off
& htons(IP_DF
) : 0;
268 if (unlikely(df
&& skb
->len
> mtu
&& !skb_is_gso(skb
))) {
269 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_FRAG_NEEDED
, htonl(mtu
));
270 IP_VS_DBG(1, "frag needed for %pI4\n", &iph
->saddr
);
277 skb_dst_set_noref_force(skb
, &rt
->dst
);
279 skb_dst_set(skb
, dst_clone(&rt
->dst
));
281 skb_dst_set(skb
, &rt
->dst
);
291 dst_link_failure(skb
);
295 #ifdef CONFIG_IP_VS_IPV6
297 static inline int __ip_vs_is_local_route6(struct rt6_info
*rt
)
299 return rt
->dst
.dev
&& rt
->dst
.dev
->flags
& IFF_LOOPBACK
;
302 static struct dst_entry
*
303 __ip_vs_route_output_v6(struct net
*net
, struct in6_addr
*daddr
,
304 struct in6_addr
*ret_saddr
, int do_xfrm
)
306 struct dst_entry
*dst
;
307 struct flowi6 fl6
= {
311 dst
= ip6_route_output(net
, NULL
, &fl6
);
316 if (ipv6_addr_any(&fl6
.saddr
) &&
317 ipv6_dev_get_saddr(net
, ip6_dst_idev(dst
)->dev
,
318 &fl6
.daddr
, 0, &fl6
.saddr
) < 0)
321 dst
= xfrm_lookup(net
, dst
, flowi6_to_flowi(&fl6
), NULL
, 0);
327 *ret_saddr
= fl6
.saddr
;
332 IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr
);
337 * Get route to destination or remote server
340 __ip_vs_get_out_rt_v6(struct sk_buff
*skb
, struct ip_vs_dest
*dest
,
341 struct in6_addr
*daddr
, struct in6_addr
*ret_saddr
,
342 struct ip_vs_iphdr
*ipvsh
, int do_xfrm
, int rt_mode
)
344 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
345 struct ip_vs_dest_dst
*dest_dst
;
346 struct rt6_info
*rt
; /* Route to the other host */
347 struct rt6_info
*ort
; /* Original route */
348 struct dst_entry
*dst
;
350 int local
, noref
= 1;
353 dest_dst
= __ip_vs_dst_check(dest
);
354 if (likely(dest_dst
))
355 rt
= (struct rt6_info
*) dest_dst
->dst_cache
;
359 dest_dst
= ip_vs_dest_dst_alloc();
360 spin_lock_bh(&dest
->dst_lock
);
362 __ip_vs_dst_set(dest
, NULL
, NULL
, 0);
363 spin_unlock_bh(&dest
->dst_lock
);
366 dst
= __ip_vs_route_output_v6(net
, &dest
->addr
.in6
,
367 &dest_dst
->dst_saddr
.in6
,
370 __ip_vs_dst_set(dest
, NULL
, NULL
, 0);
371 spin_unlock_bh(&dest
->dst_lock
);
372 ip_vs_dest_dst_free(dest_dst
);
375 rt
= (struct rt6_info
*) dst
;
376 cookie
= rt
->rt6i_node
? rt
->rt6i_node
->fn_sernum
: 0;
377 __ip_vs_dst_set(dest
, dest_dst
, &rt
->dst
, cookie
);
378 spin_unlock_bh(&dest
->dst_lock
);
379 IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
380 &dest
->addr
.in6
, &dest_dst
->dst_saddr
.in6
,
381 atomic_read(&rt
->dst
.__refcnt
));
384 *ret_saddr
= dest_dst
->dst_saddr
.in6
;
387 dst
= __ip_vs_route_output_v6(net
, daddr
, ret_saddr
, do_xfrm
);
390 rt
= (struct rt6_info
*) dst
;
393 local
= __ip_vs_is_local_route6(rt
);
394 if (!((local
? IP_VS_RT_MODE_LOCAL
: IP_VS_RT_MODE_NON_LOCAL
) &
396 IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n",
397 local
? "local":"non-local", daddr
);
400 if (likely(!local
)) {
401 if (unlikely((!skb
->dev
|| skb
->dev
->flags
& IFF_LOOPBACK
) &&
402 ipv6_addr_type(&ipv6_hdr(skb
)->saddr
) &
403 IPV6_ADDR_LOOPBACK
)) {
404 IP_VS_DBG_RL("Stopping traffic from loopback address "
405 "%pI6c to non-local address, "
407 &ipv6_hdr(skb
)->saddr
, daddr
);
411 ort
= (struct rt6_info
*) skb_dst(skb
);
412 if (!(rt_mode
& IP_VS_RT_MODE_RDR
) &&
413 !__ip_vs_is_local_route6(ort
)) {
414 IP_VS_DBG_RL("Redirect from non-local address %pI6c "
415 "to local requires NAT method, "
417 &ipv6_hdr(skb
)->daddr
, daddr
);
420 /* skb to local stack, preserve old route */
422 dst_release(&rt
->dst
);
427 if (likely(!(rt_mode
& IP_VS_RT_MODE_TUNNEL
)))
428 mtu
= dst_mtu(&rt
->dst
);
430 struct sock
*sk
= skb
->sk
;
432 mtu
= dst_mtu(&rt
->dst
) - sizeof(struct ipv6hdr
);
433 if (mtu
< IPV6_MIN_MTU
) {
434 IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__
,
438 ort
= (struct rt6_info
*) skb_dst(skb
);
439 if (!skb
->dev
&& sk
&& sk
->sk_state
!= TCP_TIME_WAIT
)
440 ort
->dst
.ops
->update_pmtu(&ort
->dst
, sk
, NULL
, mtu
);
443 if (unlikely(__mtu_check_toobig_v6(skb
, mtu
))) {
445 skb
->dev
= net
->loopback_dev
;
446 /* only send ICMP too big on first fragment */
447 if (!ipvsh
->fragoffs
)
448 icmpv6_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
);
449 IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb
)->saddr
);
456 skb_dst_set_noref_force(skb
, &rt
->dst
);
458 skb_dst_set(skb
, dst_clone(&rt
->dst
));
460 skb_dst_set(skb
, &rt
->dst
);
466 dst_release(&rt
->dst
);
470 dst_link_failure(skb
);
476 /* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
477 static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff
*skb
,
478 struct ip_vs_conn
*cp
)
482 skb
->ipvs_property
= 1;
483 if (unlikely(cp
->flags
& IP_VS_CONN_F_NFCT
))
484 ret
= ip_vs_confirm_conntrack(skb
);
485 if (ret
== NF_ACCEPT
) {
487 skb_forward_csum(skb
);
492 /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
493 static inline int ip_vs_nat_send_or_cont(int pf
, struct sk_buff
*skb
,
494 struct ip_vs_conn
*cp
, int local
)
498 skb
->ipvs_property
= 1;
499 if (likely(!(cp
->flags
& IP_VS_CONN_F_NFCT
)))
502 ip_vs_update_conntrack(skb
, cp
, 1);
504 skb_forward_csum(skb
);
505 NF_HOOK(pf
, NF_INET_LOCAL_OUT
, skb
, NULL
, skb_dst(skb
)->dev
,
512 /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
513 static inline int ip_vs_send_or_cont(int pf
, struct sk_buff
*skb
,
514 struct ip_vs_conn
*cp
, int local
)
518 skb
->ipvs_property
= 1;
519 if (likely(!(cp
->flags
& IP_VS_CONN_F_NFCT
)))
522 skb_forward_csum(skb
);
523 NF_HOOK(pf
, NF_INET_LOCAL_OUT
, skb
, NULL
, skb_dst(skb
)->dev
,
532 * NULL transmitter (do nothing except return NF_ACCEPT)
535 ip_vs_null_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
536 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
538 /* we do not touch skb and do not need pskb ptr */
539 return ip_vs_send_or_cont(NFPROTO_IPV4
, skb
, cp
, 1);
545 * Let packets bypass the destination when the destination is not
546 * available, it may be only used in transparent cache cluster.
549 ip_vs_bypass_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
550 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
552 struct iphdr
*iph
= ip_hdr(skb
);
557 if (__ip_vs_get_out_rt(skb
, NULL
, iph
->daddr
, IP_VS_RT_MODE_NON_LOCAL
,
563 /* Another hack: avoid icmp_send in ip_fragment */
566 ip_vs_send_or_cont(NFPROTO_IPV4
, skb
, cp
, 0);
579 #ifdef CONFIG_IP_VS_IPV6
581 ip_vs_bypass_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
582 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
587 if (__ip_vs_get_out_rt_v6(skb
, NULL
, &ipvsh
->daddr
.in6
, NULL
,
588 ipvsh
, 0, IP_VS_RT_MODE_NON_LOCAL
) < 0)
591 /* Another hack: avoid icmp_send in ip_fragment */
594 ip_vs_send_or_cont(NFPROTO_IPV6
, skb
, cp
, 0);
609 * NAT transmitter (only for outside-to-inside nat forwarding)
610 * Not used for related ICMP
613 ip_vs_nat_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
614 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
616 struct rtable
*rt
; /* Route to the other host */
617 int local
, rc
, was_input
;
622 /* check if it is a connection of no-client-port */
623 if (unlikely(cp
->flags
& IP_VS_CONN_F_NO_CPORT
)) {
626 p
= skb_header_pointer(skb
, ipvsh
->len
, sizeof(_pt
), &_pt
);
629 ip_vs_conn_fill_cport(cp
, *p
);
630 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p
));
633 was_input
= rt_is_input_route(skb_rtable(skb
));
634 local
= __ip_vs_get_out_rt(skb
, cp
->dest
, cp
->daddr
.ip
,
635 IP_VS_RT_MODE_LOCAL
|
636 IP_VS_RT_MODE_NON_LOCAL
|
637 IP_VS_RT_MODE_RDR
, NULL
);
640 rt
= skb_rtable(skb
);
642 * Avoid duplicate tuple in reply direction for NAT traffic
643 * to local address when connection is sync-ed
645 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
646 if (cp
->flags
& IP_VS_CONN_F_SYNC
&& local
) {
647 enum ip_conntrack_info ctinfo
;
648 struct nf_conn
*ct
= nf_ct_get(skb
, &ctinfo
);
650 if (ct
&& !nf_ct_is_untracked(ct
)) {
651 IP_VS_DBG_RL_PKT(10, AF_INET
, pp
, skb
, 0,
653 "stopping DNAT to local address");
659 /* From world but DNAT to loopback address? */
660 if (local
&& ipv4_is_loopback(cp
->daddr
.ip
) && was_input
) {
661 IP_VS_DBG_RL_PKT(1, AF_INET
, pp
, skb
, 0, "ip_vs_nat_xmit(): "
662 "stopping DNAT to loopback address");
666 /* copy-on-write the packet before mangling it */
667 if (!skb_make_writable(skb
, sizeof(struct iphdr
)))
670 if (skb_cow(skb
, rt
->dst
.dev
->hard_header_len
))
673 /* mangle the packet */
674 if (pp
->dnat_handler
&& !pp
->dnat_handler(skb
, pp
, cp
, ipvsh
))
676 ip_hdr(skb
)->daddr
= cp
->daddr
.ip
;
677 ip_send_check(ip_hdr(skb
));
679 IP_VS_DBG_PKT(10, AF_INET
, pp
, skb
, 0, "After DNAT");
681 /* FIXME: when application helper enlarges the packet and the length
682 is larger than the MTU of outgoing device, there will be still
685 /* Another hack: avoid icmp_send in ip_fragment */
688 rc
= ip_vs_nat_send_or_cont(NFPROTO_IPV4
, skb
, cp
, local
);
701 #ifdef CONFIG_IP_VS_IPV6
703 ip_vs_nat_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
704 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
706 struct rt6_info
*rt
; /* Route to the other host */
712 /* check if it is a connection of no-client-port */
713 if (unlikely(cp
->flags
& IP_VS_CONN_F_NO_CPORT
&& !ipvsh
->fragoffs
)) {
715 p
= skb_header_pointer(skb
, ipvsh
->len
, sizeof(_pt
), &_pt
);
718 ip_vs_conn_fill_cport(cp
, *p
);
719 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p
));
722 local
= __ip_vs_get_out_rt_v6(skb
, cp
->dest
, &cp
->daddr
.in6
, NULL
,
724 IP_VS_RT_MODE_LOCAL
|
725 IP_VS_RT_MODE_NON_LOCAL
|
729 rt
= (struct rt6_info
*) skb_dst(skb
);
731 * Avoid duplicate tuple in reply direction for NAT traffic
732 * to local address when connection is sync-ed
734 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
735 if (cp
->flags
& IP_VS_CONN_F_SYNC
&& local
) {
736 enum ip_conntrack_info ctinfo
;
737 struct nf_conn
*ct
= nf_ct_get(skb
, &ctinfo
);
739 if (ct
&& !nf_ct_is_untracked(ct
)) {
740 IP_VS_DBG_RL_PKT(10, AF_INET6
, pp
, skb
, 0,
741 "ip_vs_nat_xmit_v6(): "
742 "stopping DNAT to local address");
748 /* From world but DNAT to loopback address? */
749 if (local
&& skb
->dev
&& !(skb
->dev
->flags
& IFF_LOOPBACK
) &&
750 ipv6_addr_type(&rt
->rt6i_dst
.addr
) & IPV6_ADDR_LOOPBACK
) {
751 IP_VS_DBG_RL_PKT(1, AF_INET6
, pp
, skb
, 0,
752 "ip_vs_nat_xmit_v6(): "
753 "stopping DNAT to loopback address");
757 /* copy-on-write the packet before mangling it */
758 if (!skb_make_writable(skb
, sizeof(struct ipv6hdr
)))
761 if (skb_cow(skb
, rt
->dst
.dev
->hard_header_len
))
764 /* mangle the packet */
765 if (pp
->dnat_handler
&& !pp
->dnat_handler(skb
, pp
, cp
, ipvsh
))
767 ipv6_hdr(skb
)->daddr
= cp
->daddr
.in6
;
769 IP_VS_DBG_PKT(10, AF_INET6
, pp
, skb
, 0, "After DNAT");
771 /* FIXME: when application helper enlarges the packet and the length
772 is larger than the MTU of outgoing device, there will be still
775 /* Another hack: avoid icmp_send in ip_fragment */
778 rc
= ip_vs_nat_send_or_cont(NFPROTO_IPV6
, skb
, cp
, local
);
794 * IP Tunneling transmitter
796 * This function encapsulates the packet in a new IP packet, its
797 * destination will be set to cp->daddr. Most code of this function
798 * is taken from ipip.c.
800 * It is used in VS/TUN cluster. The load balancer selects a real
801 * server from a cluster based on a scheduling algorithm,
802 * encapsulates the request packet and forwards it to the selected
803 * server. For example, all real servers are configured with
804 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
805 * the encapsulated packet, it will decapsulate the packet, process
806 * the request and return the response packets directly to the client
807 * without passing the load balancer. This can greatly increase the
808 * scalability of virtual server.
810 * Used for ANY protocol
813 ip_vs_tunnel_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
814 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
816 struct netns_ipvs
*ipvs
= net_ipvs(skb_net(skb
));
817 struct rtable
*rt
; /* Route to the other host */
818 __be32 saddr
; /* Source for tunnel */
819 struct net_device
*tdev
; /* Device to other host */
820 struct iphdr
*old_iph
= ip_hdr(skb
);
821 u8 tos
= old_iph
->tos
;
823 struct iphdr
*iph
; /* Our new IP header */
824 unsigned int max_headroom
; /* The extra header space needed */
830 local
= __ip_vs_get_out_rt(skb
, cp
->dest
, cp
->daddr
.ip
,
831 IP_VS_RT_MODE_LOCAL
|
832 IP_VS_RT_MODE_NON_LOCAL
|
833 IP_VS_RT_MODE_CONNECT
|
834 IP_VS_RT_MODE_TUNNEL
, &saddr
);
839 return ip_vs_send_or_cont(NFPROTO_IPV4
, skb
, cp
, 1);
842 rt
= skb_rtable(skb
);
845 /* Copy DF, reset fragment offset and MF */
846 df
= sysctl_pmtu_disc(ipvs
) ? old_iph
->frag_off
& htons(IP_DF
) : 0;
849 * Okay, now see if we can stuff it in the buffer as-is.
851 max_headroom
= LL_RESERVED_SPACE(tdev
) + sizeof(struct iphdr
);
853 if (skb_headroom(skb
) < max_headroom
|| skb_cloned(skb
)) {
854 struct sk_buff
*new_skb
=
855 skb_realloc_headroom(skb
, max_headroom
);
861 old_iph
= ip_hdr(skb
);
864 skb
->transport_header
= skb
->network_header
;
866 /* fix old IP header checksum */
867 ip_send_check(old_iph
);
869 skb_push(skb
, sizeof(struct iphdr
));
870 skb_reset_network_header(skb
);
871 memset(&(IPCB(skb
)->opt
), 0, sizeof(IPCB(skb
)->opt
));
874 * Push down and install the IPIP header.
878 iph
->ihl
= sizeof(struct iphdr
)>>2;
880 iph
->protocol
= IPPROTO_IPIP
;
882 iph
->daddr
= cp
->daddr
.ip
;
884 iph
->ttl
= old_iph
->ttl
;
885 ip_select_ident(skb
, NULL
);
887 /* Another hack: avoid icmp_send in ip_fragment */
890 ret
= ip_vs_tunnel_xmit_prepare(skb
, cp
);
891 if (ret
== NF_ACCEPT
)
893 else if (ret
== NF_DROP
)
908 #ifdef CONFIG_IP_VS_IPV6
910 ip_vs_tunnel_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
911 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
913 struct rt6_info
*rt
; /* Route to the other host */
914 struct in6_addr saddr
; /* Source for tunnel */
915 struct net_device
*tdev
; /* Device to other host */
916 struct ipv6hdr
*old_iph
= ipv6_hdr(skb
);
917 struct ipv6hdr
*iph
; /* Our new IP header */
918 unsigned int max_headroom
; /* The extra header space needed */
924 local
= __ip_vs_get_out_rt_v6(skb
, cp
->dest
, &cp
->daddr
.in6
,
926 IP_VS_RT_MODE_LOCAL
|
927 IP_VS_RT_MODE_NON_LOCAL
|
928 IP_VS_RT_MODE_TUNNEL
);
933 return ip_vs_send_or_cont(NFPROTO_IPV6
, skb
, cp
, 1);
936 rt
= (struct rt6_info
*) skb_dst(skb
);
940 * Okay, now see if we can stuff it in the buffer as-is.
942 max_headroom
= LL_RESERVED_SPACE(tdev
) + sizeof(struct ipv6hdr
);
944 if (skb_headroom(skb
) < max_headroom
|| skb_cloned(skb
)) {
945 struct sk_buff
*new_skb
=
946 skb_realloc_headroom(skb
, max_headroom
);
952 old_iph
= ipv6_hdr(skb
);
955 skb
->transport_header
= skb
->network_header
;
957 skb_push(skb
, sizeof(struct ipv6hdr
));
958 skb_reset_network_header(skb
);
959 memset(&(IPCB(skb
)->opt
), 0, sizeof(IPCB(skb
)->opt
));
962 * Push down and install the IPIP header.
966 iph
->nexthdr
= IPPROTO_IPV6
;
967 iph
->payload_len
= old_iph
->payload_len
;
968 be16_add_cpu(&iph
->payload_len
, sizeof(*old_iph
));
969 memset(&iph
->flow_lbl
, 0, sizeof(iph
->flow_lbl
));
970 ipv6_change_dsfield(iph
, 0, ipv6_get_dsfield(old_iph
));
971 iph
->daddr
= cp
->daddr
.in6
;
973 iph
->hop_limit
= old_iph
->hop_limit
;
975 /* Another hack: avoid icmp_send in ip_fragment */
978 ret
= ip_vs_tunnel_xmit_prepare(skb
, cp
);
979 if (ret
== NF_ACCEPT
)
981 else if (ret
== NF_DROP
)
999 * Direct Routing transmitter
1000 * Used for ANY protocol
1003 ip_vs_dr_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
1004 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
1011 local
= __ip_vs_get_out_rt(skb
, cp
->dest
, cp
->daddr
.ip
,
1012 IP_VS_RT_MODE_LOCAL
|
1013 IP_VS_RT_MODE_NON_LOCAL
|
1014 IP_VS_RT_MODE_KNOWN_NH
, NULL
);
1019 return ip_vs_send_or_cont(NFPROTO_IPV4
, skb
, cp
, 1);
1022 ip_send_check(ip_hdr(skb
));
1024 /* Another hack: avoid icmp_send in ip_fragment */
1027 ip_vs_send_or_cont(NFPROTO_IPV4
, skb
, cp
, 0);
1040 #ifdef CONFIG_IP_VS_IPV6
1042 ip_vs_dr_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
1043 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
1050 local
= __ip_vs_get_out_rt_v6(skb
, cp
->dest
, &cp
->daddr
.in6
, NULL
,
1052 IP_VS_RT_MODE_LOCAL
|
1053 IP_VS_RT_MODE_NON_LOCAL
);
1058 return ip_vs_send_or_cont(NFPROTO_IPV6
, skb
, cp
, 1);
1061 /* Another hack: avoid icmp_send in ip_fragment */
1064 ip_vs_send_or_cont(NFPROTO_IPV6
, skb
, cp
, 0);
1080 * ICMP packet transmitter
1081 * called by the ip_vs_in_icmp
1084 ip_vs_icmp_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
1085 struct ip_vs_protocol
*pp
, int offset
, unsigned int hooknum
,
1086 struct ip_vs_iphdr
*iph
)
1088 struct rtable
*rt
; /* Route to the other host */
1091 int rt_mode
, was_input
;
1095 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
1096 forwarded directly here, because there is no need to
1097 translate address/port back */
1098 if (IP_VS_FWD_METHOD(cp
) != IP_VS_CONN_F_MASQ
) {
1099 if (cp
->packet_xmit
)
1100 rc
= cp
->packet_xmit(skb
, cp
, pp
, iph
);
1103 /* do not touch skb anymore */
1104 atomic_inc(&cp
->in_pkts
);
1109 * mangle and send the packet here (only for VS/NAT)
1111 was_input
= rt_is_input_route(skb_rtable(skb
));
1113 /* LOCALNODE from FORWARD hook is not supported */
1114 rt_mode
= (hooknum
!= NF_INET_FORWARD
) ?
1115 IP_VS_RT_MODE_LOCAL
| IP_VS_RT_MODE_NON_LOCAL
|
1116 IP_VS_RT_MODE_RDR
: IP_VS_RT_MODE_NON_LOCAL
;
1118 local
= __ip_vs_get_out_rt(skb
, cp
->dest
, cp
->daddr
.ip
, rt_mode
, NULL
);
1121 rt
= skb_rtable(skb
);
1124 * Avoid duplicate tuple in reply direction for NAT traffic
1125 * to local address when connection is sync-ed
1127 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
1128 if (cp
->flags
& IP_VS_CONN_F_SYNC
&& local
) {
1129 enum ip_conntrack_info ctinfo
;
1130 struct nf_conn
*ct
= nf_ct_get(skb
, &ctinfo
);
1132 if (ct
&& !nf_ct_is_untracked(ct
)) {
1133 IP_VS_DBG(10, "%s(): "
1134 "stopping DNAT to local address %pI4\n",
1135 __func__
, &cp
->daddr
.ip
);
1141 /* From world but DNAT to loopback address? */
1142 if (local
&& ipv4_is_loopback(cp
->daddr
.ip
) && was_input
) {
1143 IP_VS_DBG(1, "%s(): "
1144 "stopping DNAT to loopback %pI4\n",
1145 __func__
, &cp
->daddr
.ip
);
1149 /* copy-on-write the packet before mangling it */
1150 if (!skb_make_writable(skb
, offset
))
1153 if (skb_cow(skb
, rt
->dst
.dev
->hard_header_len
))
1156 ip_vs_nat_icmp(skb
, pp
, cp
, 0);
1158 /* Another hack: avoid icmp_send in ip_fragment */
1161 rc
= ip_vs_nat_send_or_cont(NFPROTO_IPV4
, skb
, cp
, local
);
1174 #ifdef CONFIG_IP_VS_IPV6
1176 ip_vs_icmp_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
1177 struct ip_vs_protocol
*pp
, int offset
, unsigned int hooknum
,
1178 struct ip_vs_iphdr
*ipvsh
)
1180 struct rt6_info
*rt
; /* Route to the other host */
1187 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
1188 forwarded directly here, because there is no need to
1189 translate address/port back */
1190 if (IP_VS_FWD_METHOD(cp
) != IP_VS_CONN_F_MASQ
) {
1191 if (cp
->packet_xmit
)
1192 rc
= cp
->packet_xmit(skb
, cp
, pp
, ipvsh
);
1195 /* do not touch skb anymore */
1196 atomic_inc(&cp
->in_pkts
);
1201 * mangle and send the packet here (only for VS/NAT)
1204 /* LOCALNODE from FORWARD hook is not supported */
1205 rt_mode
= (hooknum
!= NF_INET_FORWARD
) ?
1206 IP_VS_RT_MODE_LOCAL
| IP_VS_RT_MODE_NON_LOCAL
|
1207 IP_VS_RT_MODE_RDR
: IP_VS_RT_MODE_NON_LOCAL
;
1209 local
= __ip_vs_get_out_rt_v6(skb
, cp
->dest
, &cp
->daddr
.in6
, NULL
,
1213 rt
= (struct rt6_info
*) skb_dst(skb
);
1215 * Avoid duplicate tuple in reply direction for NAT traffic
1216 * to local address when connection is sync-ed
1218 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
1219 if (cp
->flags
& IP_VS_CONN_F_SYNC
&& local
) {
1220 enum ip_conntrack_info ctinfo
;
1221 struct nf_conn
*ct
= nf_ct_get(skb
, &ctinfo
);
1223 if (ct
&& !nf_ct_is_untracked(ct
)) {
1224 IP_VS_DBG(10, "%s(): "
1225 "stopping DNAT to local address %pI6\n",
1226 __func__
, &cp
->daddr
.in6
);
1232 /* From world but DNAT to loopback address? */
1233 if (local
&& skb
->dev
&& !(skb
->dev
->flags
& IFF_LOOPBACK
) &&
1234 ipv6_addr_type(&rt
->rt6i_dst
.addr
) & IPV6_ADDR_LOOPBACK
) {
1235 IP_VS_DBG(1, "%s(): "
1236 "stopping DNAT to loopback %pI6\n",
1237 __func__
, &cp
->daddr
.in6
);
1241 /* copy-on-write the packet before mangling it */
1242 if (!skb_make_writable(skb
, offset
))
1245 if (skb_cow(skb
, rt
->dst
.dev
->hard_header_len
))
1248 ip_vs_nat_icmp_v6(skb
, pp
, cp
, 0);
1250 /* Another hack: avoid icmp_send in ip_fragment */
1253 rc
= ip_vs_nat_send_or_cont(NFPROTO_IPV6
, skb
, cp
, local
);