IPVS: netns preparation for proto_ah_esp
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / netfilter / ipvs / ip_vs_core.c
CommitLineData
1da177e4
LT
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
1da177e4
LT
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19 * and others.
20 *
21 * Changes:
22 * Paul `Rusty' Russell properly handle non-linear skbs
6869c4d8 23 * Harald Welte don't use nfcache
1da177e4
LT
24 *
25 */
26
9aada7ac
HE
27#define KMSG_COMPONENT "IPVS"
28#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29
1da177e4
LT
30#include <linux/module.h>
31#include <linux/kernel.h>
32#include <linux/ip.h>
33#include <linux/tcp.h>
2906f66a 34#include <linux/sctp.h>
1da177e4 35#include <linux/icmp.h>
5a0e3ad6 36#include <linux/slab.h>
1da177e4
LT
37
38#include <net/ip.h>
39#include <net/tcp.h>
40#include <net/udp.h>
41#include <net/icmp.h> /* for icmp_send */
42#include <net/route.h>
2c70b519 43#include <net/ip6_checksum.h>
61b1ab45 44#include <net/netns/generic.h> /* net_generic() */
1da177e4
LT
45
46#include <linux/netfilter.h>
47#include <linux/netfilter_ipv4.h>
48
2a3b791e
JV
49#ifdef CONFIG_IP_VS_IPV6
50#include <net/ipv6.h>
51#include <linux/netfilter_ipv6.h>
489fdeda 52#include <net/ip6_route.h>
2a3b791e
JV
53#endif
54
1da177e4
LT
55#include <net/ip_vs.h>
56
57
58EXPORT_SYMBOL(register_ip_vs_scheduler);
59EXPORT_SYMBOL(unregister_ip_vs_scheduler);
1da177e4
LT
60EXPORT_SYMBOL(ip_vs_proto_name);
61EXPORT_SYMBOL(ip_vs_conn_new);
62EXPORT_SYMBOL(ip_vs_conn_in_get);
63EXPORT_SYMBOL(ip_vs_conn_out_get);
64#ifdef CONFIG_IP_VS_PROTO_TCP
65EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
66#endif
67EXPORT_SYMBOL(ip_vs_conn_put);
68#ifdef CONFIG_IP_VS_DEBUG
69EXPORT_SYMBOL(ip_vs_get_debug_level);
70#endif
1da177e4 71
61b1ab45
HS
72int ip_vs_net_id __read_mostly;
73#ifdef IP_VS_GENERIC_NETNS
74EXPORT_SYMBOL(ip_vs_net_id);
75#endif
76/* netns cnt used for uniqueness */
77static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
1da177e4
LT
78
/*
 * ID used in ICMP lookups.
 *
 * Both macros take a pointer to an ICMP (resp. ICMPv6) header and yield
 * its echo identifier field.  The argument is parenthesized in both so
 * that any pointer expression (e.g. "hdr + 1") expands correctly.
 */
#define icmp_id(icmph)		(((icmph)->un).echo.id)
#define icmpv6_id(icmph)	((icmph)->icmp6_dataun.u_echo.identifier)
1da177e4
LT
82
/*
 * Return a printable name for an IP protocol number.
 *
 * Known protocols map to string literals.  Any other value is formatted
 * as "IP_<number>" into a static buffer; that result is overwritten by
 * the next call with an unknown protocol, so the returned pointer must
 * not be kept across calls and the function is not reentrant for such
 * values.
 */
const char *ip_vs_proto_name(unsigned proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_SCTP:
		return "SCTP";
	case IPPROTO_ICMP:
		return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
	case IPPROTO_ICMPV6:
		return "ICMPv6";
#endif
	default:
		/* proto is unsigned: format with %u and bound the write */
		snprintf(buf, sizeof(buf), "IP_%u", proto);
		return buf;
	}
}
107
108void ip_vs_init_hash_table(struct list_head *table, int rows)
109{
110 while (--rows >= 0)
111 INIT_LIST_HEAD(&table[rows]);
112}
113
114static inline void
115ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
116{
117 struct ip_vs_dest *dest = cp->dest;
118 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
119 spin_lock(&dest->stats.lock);
e9c0ce23
SW
120 dest->stats.ustats.inpkts++;
121 dest->stats.ustats.inbytes += skb->len;
1da177e4
LT
122 spin_unlock(&dest->stats.lock);
123
124 spin_lock(&dest->svc->stats.lock);
e9c0ce23
SW
125 dest->svc->stats.ustats.inpkts++;
126 dest->svc->stats.ustats.inbytes += skb->len;
1da177e4
LT
127 spin_unlock(&dest->svc->stats.lock);
128
129 spin_lock(&ip_vs_stats.lock);
e9c0ce23
SW
130 ip_vs_stats.ustats.inpkts++;
131 ip_vs_stats.ustats.inbytes += skb->len;
1da177e4
LT
132 spin_unlock(&ip_vs_stats.lock);
133 }
134}
135
136
137static inline void
138ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
139{
140 struct ip_vs_dest *dest = cp->dest;
141 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
142 spin_lock(&dest->stats.lock);
e9c0ce23
SW
143 dest->stats.ustats.outpkts++;
144 dest->stats.ustats.outbytes += skb->len;
1da177e4
LT
145 spin_unlock(&dest->stats.lock);
146
147 spin_lock(&dest->svc->stats.lock);
e9c0ce23
SW
148 dest->svc->stats.ustats.outpkts++;
149 dest->svc->stats.ustats.outbytes += skb->len;
1da177e4
LT
150 spin_unlock(&dest->svc->stats.lock);
151
152 spin_lock(&ip_vs_stats.lock);
e9c0ce23
SW
153 ip_vs_stats.ustats.outpkts++;
154 ip_vs_stats.ustats.outbytes += skb->len;
1da177e4
LT
155 spin_unlock(&ip_vs_stats.lock);
156 }
157}
158
159
160static inline void
161ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
162{
163 spin_lock(&cp->dest->stats.lock);
e9c0ce23 164 cp->dest->stats.ustats.conns++;
1da177e4
LT
165 spin_unlock(&cp->dest->stats.lock);
166
167 spin_lock(&svc->stats.lock);
e9c0ce23 168 svc->stats.ustats.conns++;
1da177e4
LT
169 spin_unlock(&svc->stats.lock);
170
171 spin_lock(&ip_vs_stats.lock);
e9c0ce23 172 ip_vs_stats.ustats.conns++;
1da177e4
LT
173 spin_unlock(&ip_vs_stats.lock);
174}
175
176
177static inline int
178ip_vs_set_state(struct ip_vs_conn *cp, int direction,
179 const struct sk_buff *skb,
180 struct ip_vs_protocol *pp)
181{
182 if (unlikely(!pp->state_transition))
183 return 0;
184 return pp->state_transition(cp, direction, skb, pp);
185}
186
a5959d53 187static inline int
85999283
SH
188ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
189 struct sk_buff *skb, int protocol,
190 const union nf_inet_addr *caddr, __be16 cport,
191 const union nf_inet_addr *vaddr, __be16 vport,
192 struct ip_vs_conn_param *p)
193{
194 ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
195 p->pe = svc->pe;
196 if (p->pe && p->pe->fill_param)
a5959d53
HS
197 return p->pe->fill_param(p, skb);
198
199 return 0;
85999283 200}
1da177e4 201
1da177e4
LT
202/*
203 * IPVS persistent scheduling function
204 * It creates a connection entry according to its template if exists,
205 * or selects a server and creates a connection entry plus a template.
206 * Locking: we are svc user (svc->refcnt), so we hold all dests too
207 * Protocols supported: TCP, UDP
208 */
209static struct ip_vs_conn *
210ip_vs_sched_persist(struct ip_vs_service *svc,
85999283 211 struct sk_buff *skb,
a5959d53 212 __be16 src_port, __be16 dst_port, int *ignored)
1da177e4
LT
213{
214 struct ip_vs_conn *cp = NULL;
28364a59 215 struct ip_vs_iphdr iph;
1da177e4
LT
216 struct ip_vs_dest *dest;
217 struct ip_vs_conn *ct;
5b57a98c 218 __be16 dport = 0; /* destination port to forward */
3575792e 219 unsigned int flags;
f11017ec 220 struct ip_vs_conn_param param;
28364a59
JV
221 union nf_inet_addr snet; /* source network of the client,
222 after masking */
cd17f9ed
JV
223
224 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
1da177e4
LT
225
226 /* Mask saddr with the netmask to adjust template granularity */
cd17f9ed
JV
227#ifdef CONFIG_IP_VS_IPV6
228 if (svc->af == AF_INET6)
229 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
230 else
231#endif
232 snet.ip = iph.saddr.ip & svc->netmask;
1da177e4 233
cd17f9ed
JV
234 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
235 "mnet %s\n",
ce144f24
HS
236 IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
237 IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
cd17f9ed 238 IP_VS_DBG_ADDR(svc->af, &snet));
1da177e4
LT
239
240 /*
241 * As far as we know, FTP is a very complicated network protocol, and
242 * it uses control connection and data connections. For active FTP,
243 * FTP server initialize data connection to the client, its source port
244 * is often 20. For passive FTP, FTP server tells the clients the port
245 * that it passively listens to, and the client issues the data
246 * connection. In the tunneling or direct routing mode, the load
247 * balancer is on the client-to-server half of connection, the port
248 * number is unknown to the load balancer. So, a conn template like
249 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
250 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
251 * is created for other persistent services.
252 */
5b57a98c 253 {
f11017ec
SH
254 int protocol = iph.protocol;
255 const union nf_inet_addr *vaddr = &iph.daddr;
256 const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
257 __be16 vport = 0;
258
ce144f24 259 if (dst_port == svc->port) {
5b57a98c
SH
260 /* non-FTP template:
261 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
262 * FTP template:
263 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
1da177e4
LT
264 */
265 if (svc->port != FTPPORT)
ce144f24 266 vport = dst_port;
1da177e4 267 } else {
5b57a98c
SH
268 /* Note: persistent fwmark-based services and
269 * persistent port zero service are handled here.
270 * fwmark template:
271 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
272 * port zero template:
273 * <protocol,caddr,0,vaddr,0,daddr,0>
1da177e4 274 */
28364a59 275 if (svc->fwmark) {
5b57a98c
SH
276 protocol = IPPROTO_IP;
277 vaddr = &fwmark;
278 }
1da177e4 279 }
a5959d53
HS
280 /* return *ignored = -1 so NF_DROP can be used */
281 if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
282 vaddr, vport, &param) < 0) {
283 *ignored = -1;
284 return NULL;
285 }
1da177e4
LT
286 }
287
5b57a98c 288 /* Check if a template already exists */
f11017ec 289 ct = ip_vs_ct_in_get(&param);
5b57a98c 290 if (!ct || !ip_vs_check_template(ct)) {
a5959d53
HS
291 /*
292 * No template found or the dest of the connection
5b57a98c 293 * template is not available.
a5959d53 294 * return *ignored=0 i.e. ICMP and NF_DROP
5b57a98c
SH
295 */
296 dest = svc->scheduler->schedule(svc, skb);
297 if (!dest) {
298 IP_VS_DBG(1, "p-schedule: no dest found.\n");
85999283 299 kfree(param.pe_data);
a5959d53 300 *ignored = 0;
5b57a98c
SH
301 return NULL;
302 }
303
ce144f24 304 if (dst_port == svc->port && svc->port != FTPPORT)
5b57a98c
SH
305 dport = dest->port;
306
85999283
SH
307 /* Create a template
308 * This adds param.pe_data to the template,
309 * and thus param.pe_data will be destroyed
310 * when the template expires */
f11017ec 311 ct = ip_vs_conn_new(&param, &dest->addr, dport,
0e051e68 312 IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
85999283
SH
313 if (ct == NULL) {
314 kfree(param.pe_data);
a5959d53 315 *ignored = -1;
5b57a98c 316 return NULL;
85999283 317 }
5b57a98c
SH
318
319 ct->timeout = svc->timeout;
85999283 320 } else {
5b57a98c
SH
321 /* set destination with the found template */
322 dest = ct->dest;
85999283
SH
323 kfree(param.pe_data);
324 }
5b57a98c 325
ce144f24 326 dport = dst_port;
5b57a98c
SH
327 if (dport == svc->port && dest->port)
328 dport = dest->port;
329
26ec037f
NC
330 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
331 && iph.protocol == IPPROTO_UDP)?
332 IP_VS_CONN_F_ONE_PACKET : 0;
333
1da177e4
LT
334 /*
335 * Create a new connection according to the template
336 */
ce144f24
HS
337 ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, src_port,
338 &iph.daddr, dst_port, &param);
339
0e051e68 340 cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
1da177e4
LT
341 if (cp == NULL) {
342 ip_vs_conn_put(ct);
a5959d53 343 *ignored = -1;
1da177e4
LT
344 return NULL;
345 }
346
347 /*
348 * Add its control
349 */
350 ip_vs_control_add(cp, ct);
351 ip_vs_conn_put(ct);
352
353 ip_vs_conn_stats(cp, svc);
354 return cp;
355}
356
357
358/*
359 * IPVS main scheduling function
360 * It selects a server according to the virtual service, and
361 * creates a connection entry.
362 * Protocols supported: TCP, UDP
a5959d53
HS
363 *
364 * Usage of *ignored
365 *
366 * 1 : protocol tried to schedule (eg. on SYN), found svc but the
367 * svc/scheduler decides that this packet should be accepted with
368 * NF_ACCEPT because it must not be scheduled.
369 *
370 * 0 : scheduler can not find destination, so try bypass or
371 * return ICMP and then NF_DROP (ip_vs_leave).
372 *
373 * -1 : scheduler tried to schedule but fatal error occurred, eg.
374 * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
375 * failure such as missing Call-ID, ENOMEM on skb_linearize
376 * or pe_data. In this case we should return NF_DROP without
377 * any attempts to send ICMP with ip_vs_leave.
1da177e4
LT
378 */
379struct ip_vs_conn *
190ecd27
JA
380ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
381 struct ip_vs_protocol *pp, int *ignored)
1da177e4
LT
382{
383 struct ip_vs_conn *cp = NULL;
28364a59 384 struct ip_vs_iphdr iph;
1da177e4 385 struct ip_vs_dest *dest;
3575792e
JA
386 __be16 _ports[2], *pptr;
387 unsigned int flags;
1da177e4 388
190ecd27 389 *ignored = 1;
28364a59
JV
390 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
391 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
1da177e4
LT
392 if (pptr == NULL)
393 return NULL;
394
190ecd27
JA
395 /*
396 * FTPDATA needs this check when using local real server.
397 * Never schedule Active FTPDATA connections from real server.
398 * For LVS-NAT they must be already created. For other methods
399 * with persistence the connection is created on SYN+ACK.
400 */
401 if (pptr[0] == FTPDATA) {
0d79641a
JA
402 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
403 "Not scheduling FTPDATA");
190ecd27
JA
404 return NULL;
405 }
406
407 /*
a5959d53 408 * Do not schedule replies from local real server.
190ecd27
JA
409 */
410 if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
190ecd27 411 (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
0d79641a 412 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
190ecd27
JA
413 "Not scheduling reply for existing connection");
414 __ip_vs_conn_put(cp);
415 return NULL;
416 }
417
1da177e4
LT
418 /*
419 * Persistent service
420 */
a5959d53
HS
421 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
422 return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored);
423
424 *ignored = 0;
1da177e4
LT
425
426 /*
427 * Non-persistent service
428 */
429 if (!svc->fwmark && pptr[1] != svc->port) {
430 if (!svc->port)
1e3e238e
HE
431 pr_err("Schedule: port zero only supported "
432 "in persistent services, "
433 "check your ipvs configuration\n");
1da177e4
LT
434 return NULL;
435 }
436
437 dest = svc->scheduler->schedule(svc, skb);
438 if (dest == NULL) {
439 IP_VS_DBG(1, "Schedule: no dest found.\n");
440 return NULL;
441 }
442
26ec037f
NC
443 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
444 && iph.protocol == IPPROTO_UDP)?
445 IP_VS_CONN_F_ONE_PACKET : 0;
446
1da177e4
LT
447 /*
448 * Create a connection entry.
449 */
f11017ec
SH
450 {
451 struct ip_vs_conn_param p;
452 ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
453 pptr[0], &iph.daddr, pptr[1], &p);
454 cp = ip_vs_conn_new(&p, &dest->addr,
455 dest->port ? dest->port : pptr[1],
0e051e68 456 flags, dest, skb->mark);
a5959d53
HS
457 if (!cp) {
458 *ignored = -1;
f11017ec 459 return NULL;
a5959d53 460 }
f11017ec 461 }
1da177e4 462
cd17f9ed
JV
463 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
464 "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
465 ip_vs_fwd_tag(cp),
466 IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
467 IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
468 IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
469 cp->flags, atomic_read(&cp->refcnt));
1da177e4
LT
470
471 ip_vs_conn_stats(cp, svc);
472 return cp;
473}
474
475
476/*
477 * Pass or drop the packet.
478 * Called by ip_vs_in, when the virtual service is available but
479 * no destination is available for a new connection.
480 */
481int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
482 struct ip_vs_protocol *pp)
483{
014d730d 484 __be16 _ports[2], *pptr;
28364a59 485 struct ip_vs_iphdr iph;
2a3b791e
JV
486 int unicast;
487 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
1da177e4 488
28364a59 489 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
1da177e4
LT
490 if (pptr == NULL) {
491 ip_vs_service_put(svc);
492 return NF_DROP;
493 }
494
2a3b791e
JV
495#ifdef CONFIG_IP_VS_IPV6
496 if (svc->af == AF_INET6)
497 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
498 else
499#endif
500 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
501
1da177e4 502 /* if it is fwmark-based service, the cache_bypass sysctl is up
2a3b791e 503 and the destination is a non-local unicast, then create
1da177e4 504 a cache_bypass connection entry */
2a3b791e 505 if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
1da177e4
LT
506 int ret, cs;
507 struct ip_vs_conn *cp;
3575792e
JA
508 unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
509 iph.protocol == IPPROTO_UDP)?
510 IP_VS_CONN_F_ONE_PACKET : 0;
dff630dd 511 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
1da177e4
LT
512
513 ip_vs_service_put(svc);
514
515 /* create a new connection entry */
1e3e238e 516 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
f11017ec
SH
517 {
518 struct ip_vs_conn_param p;
519 ip_vs_conn_fill_param(svc->af, iph.protocol,
520 &iph.saddr, pptr[0],
521 &iph.daddr, pptr[1], &p);
522 cp = ip_vs_conn_new(&p, &daddr, 0,
523 IP_VS_CONN_F_BYPASS | flags,
0e051e68 524 NULL, skb->mark);
f11017ec
SH
525 if (!cp)
526 return NF_DROP;
527 }
1da177e4
LT
528
529 /* statistics */
530 ip_vs_in_stats(cp, skb);
531
532 /* set state */
533 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
534
535 /* transmit the first SYN packet */
536 ret = cp->packet_xmit(skb, cp, pp);
537 /* do not touch skb anymore */
538
539 atomic_inc(&cp->in_pkts);
540 ip_vs_conn_put(cp);
541 return ret;
542 }
543
544 /*
545 * When the virtual ftp service is presented, packets destined
546 * for other services on the VIP may get here (except services
547 * listed in the ipvs table), pass the packets, because it is
548 * not ipvs job to decide to drop the packets.
549 */
550 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
551 ip_vs_service_put(svc);
552 return NF_ACCEPT;
553 }
554
555 ip_vs_service_put(svc);
556
557 /*
558 * Notify the client that the destination is unreachable, and
559 * release the socket buffer.
560 * Since it is in IP layer, the TCP socket is not actually
561 * created, the TCP RST packet cannot be sent, instead that
562 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
563 */
2a3b791e 564#ifdef CONFIG_IP_VS_IPV6
cb59155f
JA
565 if (svc->af == AF_INET6) {
566 if (!skb->dev) {
567 struct net *net = dev_net(skb_dst(skb)->dev);
568
569 skb->dev = net->loopback_dev;
570 }
3ffe533c 571 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
cb59155f 572 } else
2a3b791e
JV
573#endif
574 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
575
1da177e4
LT
576 return NF_DROP;
577}
578
b1550f22 579__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
1da177e4 580{
d3bc23e7 581 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
1da177e4
LT
582}
583
1ca5bb54
JA
584static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
585{
586 if (NF_INET_LOCAL_IN == hooknum)
587 return IP_DEFRAG_VS_IN;
588 if (NF_INET_FORWARD == hooknum)
589 return IP_DEFRAG_VS_FWD;
590 return IP_DEFRAG_VS_OUT;
591}
592
776c729e 593static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
1da177e4 594{
776c729e
HX
595 int err = ip_defrag(skb, user);
596
597 if (!err)
eddc9ec5 598 ip_send_check(ip_hdr(skb));
776c729e
HX
599
600 return err;
1da177e4
LT
601}
602
2a3b791e
JV
603#ifdef CONFIG_IP_VS_IPV6
604static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
605{
606 /* TODO IPv6: Find out what to do here for IPv6 */
607 return 0;
608}
609#endif
610
1da177e4
LT
611/*
612 * Packet has been made sufficiently writable in caller
613 * - inout: 1=in->out, 0=out->in
614 */
615void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
616 struct ip_vs_conn *cp, int inout)
617{
eddc9ec5 618 struct iphdr *iph = ip_hdr(skb);
1da177e4 619 unsigned int icmp_offset = iph->ihl*4;
d56f90a7
ACM
620 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
621 icmp_offset);
1da177e4
LT
622 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
623
624 if (inout) {
e7ade46a 625 iph->saddr = cp->vaddr.ip;
1da177e4 626 ip_send_check(iph);
e7ade46a 627 ciph->daddr = cp->vaddr.ip;
1da177e4
LT
628 ip_send_check(ciph);
629 } else {
e7ade46a 630 iph->daddr = cp->daddr.ip;
1da177e4 631 ip_send_check(iph);
e7ade46a 632 ciph->saddr = cp->daddr.ip;
1da177e4
LT
633 ip_send_check(ciph);
634 }
635
2906f66a
VMR
636 /* the TCP/UDP/SCTP port */
637 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
638 IPPROTO_SCTP == ciph->protocol) {
014d730d 639 __be16 *ports = (void *)ciph + ciph->ihl*4;
1da177e4
LT
640
641 if (inout)
642 ports[1] = cp->vport;
643 else
644 ports[0] = cp->dport;
645 }
646
647 /* And finally the ICMP checksum */
648 icmph->checksum = 0;
649 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
650 skb->ip_summed = CHECKSUM_UNNECESSARY;
651
652 if (inout)
0d79641a 653 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
1da177e4
LT
654 "Forwarding altered outgoing ICMP");
655 else
0d79641a 656 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
1da177e4
LT
657 "Forwarding altered incoming ICMP");
658}
659
b3cdd2a7
JV
660#ifdef CONFIG_IP_VS_IPV6
661void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
662 struct ip_vs_conn *cp, int inout)
663{
664 struct ipv6hdr *iph = ipv6_hdr(skb);
665 unsigned int icmp_offset = sizeof(struct ipv6hdr);
666 struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) +
667 icmp_offset);
668 struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1);
669
670 if (inout) {
671 iph->saddr = cp->vaddr.in6;
672 ciph->daddr = cp->vaddr.in6;
673 } else {
674 iph->daddr = cp->daddr.in6;
675 ciph->saddr = cp->daddr.in6;
676 }
677
2906f66a
VMR
678 /* the TCP/UDP/SCTP port */
679 if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr ||
680 IPPROTO_SCTP == ciph->nexthdr) {
b3cdd2a7
JV
681 __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
682
683 if (inout)
684 ports[1] = cp->vport;
685 else
686 ports[0] = cp->dport;
687 }
688
689 /* And finally the ICMP checksum */
8870f842
SH
690 icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
691 skb->len - icmp_offset,
692 IPPROTO_ICMPV6, 0);
693 skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
694 skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
695 skb->ip_summed = CHECKSUM_PARTIAL;
b3cdd2a7
JV
696
697 if (inout)
0d79641a
JA
698 IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
699 (void *)ciph - (void *)iph,
700 "Forwarding altered outgoing ICMPv6");
b3cdd2a7 701 else
0d79641a
JA
702 IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
703 (void *)ciph - (void *)iph,
704 "Forwarding altered incoming ICMPv6");
b3cdd2a7
JV
705}
706#endif
707
4856c84c
MT
708/* Handle relevant response ICMP messages - forward to the right
709 * destination host. Used for NAT and local client.
710 */
f2428ed5
SH
711static int handle_response_icmp(int af, struct sk_buff *skb,
712 union nf_inet_addr *snet,
713 __u8 protocol, struct ip_vs_conn *cp,
4856c84c
MT
714 struct ip_vs_protocol *pp,
715 unsigned int offset, unsigned int ihl)
716{
717 unsigned int verdict = NF_DROP;
718
719 if (IP_VS_FWD_METHOD(cp) != 0) {
1e3e238e
HE
720 pr_err("shouldn't reach here, because the box is on the "
721 "half connection in the tun/dr module.\n");
4856c84c
MT
722 }
723
724 /* Ensure the checksum is correct */
725 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
726 /* Failed checksum! */
f2428ed5
SH
727 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
728 IP_VS_DBG_ADDR(af, snet));
4856c84c
MT
729 goto out;
730 }
731
2906f66a
VMR
732 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
733 IPPROTO_SCTP == protocol)
4856c84c
MT
734 offset += 2 * sizeof(__u16);
735 if (!skb_make_writable(skb, offset))
736 goto out;
737
f2428ed5
SH
738#ifdef CONFIG_IP_VS_IPV6
739 if (af == AF_INET6)
740 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
741 else
742#endif
743 ip_vs_nat_icmp(skb, pp, cp, 1);
4856c84c 744
f5a41847
JA
745#ifdef CONFIG_IP_VS_IPV6
746 if (af == AF_INET6) {
747 if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
748 goto out;
749 } else
750#endif
751 if ((sysctl_ip_vs_snat_reroute ||
752 skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
753 ip_route_me_harder(skb, RTN_LOCAL) != 0)
754 goto out;
755
4856c84c
MT
756 /* do the statistics and put it back */
757 ip_vs_out_stats(cp, skb);
758
cf356d69 759 skb->ipvs_property = 1;
f4bc17cd 760 if (!(cp->flags & IP_VS_CONN_F_NFCT))
cf356d69 761 ip_vs_notrack(skb);
f4bc17cd
JA
762 else
763 ip_vs_update_conntrack(skb, cp, 0);
4856c84c
MT
764 verdict = NF_ACCEPT;
765
766out:
767 __ip_vs_conn_put(cp);
768
769 return verdict;
770}
771
1da177e4
LT
772/*
773 * Handle ICMP messages in the inside-to-outside direction (outgoing).
4856c84c 774 * Find any that might be relevant, check against existing connections.
1da177e4 775 * Currently handles error types - unreachable, quench, ttl exceeded.
1da177e4 776 */
1ca5bb54
JA
777static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
778 unsigned int hooknum)
1da177e4 779{
1da177e4
LT
780 struct iphdr *iph;
781 struct icmphdr _icmph, *ic;
782 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
51ef348b 783 struct ip_vs_iphdr ciph;
1da177e4
LT
784 struct ip_vs_conn *cp;
785 struct ip_vs_protocol *pp;
4856c84c 786 unsigned int offset, ihl;
f2428ed5 787 union nf_inet_addr snet;
1da177e4
LT
788
789 *related = 1;
790
791 /* reassemble IP fragments */
eddc9ec5 792 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1ca5bb54 793 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
1da177e4 794 return NF_STOLEN;
1da177e4
LT
795 }
796
eddc9ec5 797 iph = ip_hdr(skb);
1da177e4
LT
798 offset = ihl = iph->ihl * 4;
799 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
800 if (ic == NULL)
801 return NF_DROP;
802
14d5e834 803 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
1da177e4 804 ic->type, ntohs(icmp_id(ic)),
14d5e834 805 &iph->saddr, &iph->daddr);
1da177e4
LT
806
807 /*
808 * Work through seeing if this is for us.
809 * These checks are supposed to be in an order that means easy
810 * things are checked first to speed up processing.... however
811 * this means that some packets will manage to get a long way
812 * down this stack and then be rejected, but that's life.
813 */
814 if ((ic->type != ICMP_DEST_UNREACH) &&
815 (ic->type != ICMP_SOURCE_QUENCH) &&
816 (ic->type != ICMP_TIME_EXCEEDED)) {
817 *related = 0;
818 return NF_ACCEPT;
819 }
820
821 /* Now find the contained IP header */
822 offset += sizeof(_icmph);
823 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
824 if (cih == NULL)
825 return NF_ACCEPT; /* The packet looks wrong, ignore */
826
827 pp = ip_vs_proto_get(cih->protocol);
828 if (!pp)
829 return NF_ACCEPT;
830
831 /* Is the embedded protocol header present? */
4412ec49 832 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1da177e4
LT
833 pp->dont_defrag))
834 return NF_ACCEPT;
835
0d79641a
JA
836 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
837 "Checking outgoing ICMP for");
1da177e4
LT
838
839 offset += cih->ihl * 4;
840
51ef348b 841 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1da177e4 842 /* The embedded headers contain source and dest in reverse order */
51ef348b 843 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1da177e4
LT
844 if (!cp)
845 return NF_ACCEPT;
846
f2428ed5
SH
847 snet.ip = iph->saddr;
848 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
849 pp, offset, ihl);
1da177e4
LT
850}
851
2a3b791e 852#ifdef CONFIG_IP_VS_IPV6
1ca5bb54
JA
853static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
854 unsigned int hooknum)
2a3b791e
JV
855{
856 struct ipv6hdr *iph;
857 struct icmp6hdr _icmph, *ic;
858 struct ipv6hdr _ciph, *cih; /* The ip header contained
859 within the ICMP */
860 struct ip_vs_iphdr ciph;
861 struct ip_vs_conn *cp;
862 struct ip_vs_protocol *pp;
f2428ed5
SH
863 unsigned int offset;
864 union nf_inet_addr snet;
2a3b791e
JV
865
866 *related = 1;
867
868 /* reassemble IP fragments */
869 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1ca5bb54 870 if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
2a3b791e
JV
871 return NF_STOLEN;
872 }
873
874 iph = ipv6_hdr(skb);
875 offset = sizeof(struct ipv6hdr);
876 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
877 if (ic == NULL)
878 return NF_DROP;
879
5b095d98 880 IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
2a3b791e 881 ic->icmp6_type, ntohs(icmpv6_id(ic)),
38ff4fa4 882 &iph->saddr, &iph->daddr);
2a3b791e
JV
883
884 /*
885 * Work through seeing if this is for us.
886 * These checks are supposed to be in an order that means easy
887 * things are checked first to speed up processing.... however
888 * this means that some packets will manage to get a long way
889 * down this stack and then be rejected, but that's life.
890 */
891 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
892 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
893 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
894 *related = 0;
895 return NF_ACCEPT;
896 }
897
898 /* Now find the contained IP header */
899 offset += sizeof(_icmph);
900 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
901 if (cih == NULL)
902 return NF_ACCEPT; /* The packet looks wrong, ignore */
903
904 pp = ip_vs_proto_get(cih->nexthdr);
905 if (!pp)
906 return NF_ACCEPT;
907
908 /* Is the embedded protocol header present? */
909 /* TODO: we don't support fragmentation at the moment anyways */
910 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
911 return NF_ACCEPT;
912
0d79641a
JA
913 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
914 "Checking outgoing ICMPv6 for");
2a3b791e
JV
915
916 offset += sizeof(struct ipv6hdr);
917
918 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
919 /* The embedded headers contain source and dest in reverse order */
920 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
921 if (!cp)
922 return NF_ACCEPT;
923
178f5e49 924 ipv6_addr_copy(&snet.in6, &iph->saddr);
f2428ed5
SH
925 return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
926 pp, offset, sizeof(struct ipv6hdr));
2a3b791e
JV
927}
928#endif
929
2906f66a
VMR
930/*
931 * Check if sctp chunc is ABORT chunk
932 */
933static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
934{
935 sctp_chunkhdr_t *sch, schunk;
936 sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
937 sizeof(schunk), &schunk);
938 if (sch == NULL)
939 return 0;
940 if (sch->type == SCTP_CID_ABORT)
941 return 1;
942 return 0;
943}
944
2a3b791e 945static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
1da177e4
LT
946{
947 struct tcphdr _tcph, *th;
948
2a3b791e 949 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
1da177e4
LT
950 if (th == NULL)
951 return 0;
952 return th->rst;
953}
954
4856c84c
MT
955/* Handle response packets: rewrite addresses and send away...
956 * Used for NAT and local client.
957 */
958static unsigned int
959handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
960 struct ip_vs_conn *cp, int ihl)
961{
0d79641a 962 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
4856c84c
MT
963
964 if (!skb_make_writable(skb, ihl))
965 goto drop;
966
967 /* mangle the packet */
968 if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
969 goto drop;
970
971#ifdef CONFIG_IP_VS_IPV6
972 if (af == AF_INET6)
973 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
974 else
975#endif
976 {
977 ip_hdr(skb)->saddr = cp->vaddr.ip;
978 ip_send_check(ip_hdr(skb));
979 }
980
8a803040
JA
981 /*
982 * nf_iterate does not expect change in the skb->dst->dev.
983 * It looks like it is not fatal to enable this code for hooks
984 * where our handlers are at the end of the chain list and
985 * when all next handlers use skb->dst->dev and not outdev.
986 * It will definitely route properly the inout NAT traffic
987 * when multiple paths are used.
988 */
989
4856c84c
MT
990 /* For policy routing, packets originating from this
991 * machine itself may be routed differently to packets
992 * passing through. We want this packet to be routed as
993 * if it came from this machine itself. So re-compute
994 * the routing information.
995 */
996#ifdef CONFIG_IP_VS_IPV6
f5a41847
JA
997 if (af == AF_INET6) {
998 if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
999 goto drop;
1000 } else
4856c84c 1001#endif
f5a41847
JA
1002 if ((sysctl_ip_vs_snat_reroute ||
1003 skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
1004 ip_route_me_harder(skb, RTN_LOCAL) != 0)
1005 goto drop;
4856c84c 1006
0d79641a 1007 IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
4856c84c
MT
1008
1009 ip_vs_out_stats(cp, skb);
1010 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
cf356d69 1011 skb->ipvs_property = 1;
f4bc17cd 1012 if (!(cp->flags & IP_VS_CONN_F_NFCT))
cf356d69 1013 ip_vs_notrack(skb);
f4bc17cd
JA
1014 else
1015 ip_vs_update_conntrack(skb, cp, 0);
4856c84c
MT
1016 ip_vs_conn_put(cp);
1017
4856c84c
MT
1018 LeaveFunction(11);
1019 return NF_ACCEPT;
1020
1021drop:
1022 ip_vs_conn_put(cp);
1023 kfree_skb(skb);
f4bc17cd 1024 LeaveFunction(11);
4856c84c
MT
1025 return NF_STOLEN;
1026}
1027
1da177e4 1028/*
4856c84c 1029 * Check if outgoing packet belongs to the established ip_vs_conn.
1da177e4
LT
1030 */
1031static unsigned int
fc604767 1032ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
1da177e4 1033{
fc723250 1034 struct net *net = NULL;
51ef348b 1035 struct ip_vs_iphdr iph;
1da177e4
LT
1036 struct ip_vs_protocol *pp;
1037 struct ip_vs_conn *cp;
1da177e4
LT
1038
1039 EnterFunction(11);
1040
fc604767 1041 /* Already marked as IPVS request or reply? */
6869c4d8 1042 if (skb->ipvs_property)
1da177e4
LT
1043 return NF_ACCEPT;
1044
fc604767
JA
1045 /* Bad... Do not break raw sockets */
1046 if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1047 af == AF_INET)) {
1048 struct sock *sk = skb->sk;
1049 struct inet_sock *inet = inet_sk(skb->sk);
1050
1051 if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1052 return NF_ACCEPT;
1053 }
1054
1055 if (unlikely(!skb_dst(skb)))
1056 return NF_ACCEPT;
1057
fc723250 1058 net = skb_net(skb);
2a3b791e
JV
1059 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1060#ifdef CONFIG_IP_VS_IPV6
1061 if (af == AF_INET6) {
1062 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1ca5bb54
JA
1063 int related;
1064 int verdict = ip_vs_out_icmp_v6(skb, &related,
1065 hooknum);
1da177e4 1066
f5a41847 1067 if (related)
2a3b791e
JV
1068 return verdict;
1069 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1070 }
1071 } else
1072#endif
1073 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1ca5bb54
JA
1074 int related;
1075 int verdict = ip_vs_out_icmp(skb, &related, hooknum);
2a3b791e 1076
f5a41847 1077 if (related)
2a3b791e
JV
1078 return verdict;
1079 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1080 }
1da177e4 1081
51ef348b 1082 pp = ip_vs_proto_get(iph.protocol);
1da177e4
LT
1083 if (unlikely(!pp))
1084 return NF_ACCEPT;
1085
1086 /* reassemble IP fragments */
2a3b791e
JV
1087#ifdef CONFIG_IP_VS_IPV6
1088 if (af == AF_INET6) {
1ca5bb54
JA
1089 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1090 if (ip_vs_gather_frags_v6(skb,
1091 ip_vs_defrag_user(hooknum)))
1092 return NF_STOLEN;
2a3b791e 1093 }
1ca5bb54
JA
1094
1095 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
2a3b791e
JV
1096 } else
1097#endif
1098 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
1099 !pp->dont_defrag)) {
1ca5bb54
JA
1100 if (ip_vs_gather_frags(skb,
1101 ip_vs_defrag_user(hooknum)))
2a3b791e
JV
1102 return NF_STOLEN;
1103
1104 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1105 }
1da177e4
LT
1106
1107 /*
1108 * Check if the packet belongs to an existing entry
1109 */
2a3b791e 1110 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1da177e4 1111
cb59155f
JA
1112 if (likely(cp))
1113 return handle_response(af, skb, pp, cp, iph.len);
1114 if (sysctl_ip_vs_nat_icmp_send &&
1115 (pp->protocol == IPPROTO_TCP ||
1116 pp->protocol == IPPROTO_UDP ||
1117 pp->protocol == IPPROTO_SCTP)) {
1118 __be16 _ports[2], *pptr;
1119
1120 pptr = skb_header_pointer(skb, iph.len,
1121 sizeof(_ports), _ports);
1122 if (pptr == NULL)
1123 return NF_ACCEPT; /* Not for me */
fc723250 1124 if (ip_vs_lookup_real_service(net, af, iph.protocol,
cb59155f
JA
1125 &iph.saddr,
1126 pptr[0])) {
1127 /*
1128 * Notify the real server: there is no
1129 * existing entry if it is not RST
1130 * packet or not TCP packet.
1131 */
1132 if ((iph.protocol != IPPROTO_TCP &&
1133 iph.protocol != IPPROTO_SCTP)
1134 || ((iph.protocol == IPPROTO_TCP
1135 && !is_tcp_reset(skb, iph.len))
1136 || (iph.protocol == IPPROTO_SCTP
1137 && !is_sctp_abort(skb,
1138 iph.len)))) {
2a3b791e 1139#ifdef CONFIG_IP_VS_IPV6
cb59155f
JA
1140 if (af == AF_INET6) {
1141 struct net *net =
1142 dev_net(skb_dst(skb)->dev);
1143
1144 if (!skb->dev)
1145 skb->dev = net->loopback_dev;
1146 icmpv6_send(skb,
1147 ICMPV6_DEST_UNREACH,
1148 ICMPV6_PORT_UNREACH,
1149 0);
1150 } else
2a3b791e 1151#endif
cb59155f
JA
1152 icmp_send(skb,
1153 ICMP_DEST_UNREACH,
1154 ICMP_PORT_UNREACH, 0);
1155 return NF_DROP;
1da177e4
LT
1156 }
1157 }
1da177e4 1158 }
0d79641a 1159 IP_VS_DBG_PKT(12, af, pp, skb, 0,
cb59155f
JA
1160 "ip_vs_out: packet continues traversal as normal");
1161 return NF_ACCEPT;
1da177e4
LT
1162}
1163
fc604767 1164/*
cb59155f
JA
1165 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1166 * used only for VS/NAT.
fc604767
JA
1167 * Check if packet is reply for established ip_vs_conn.
1168 */
1169static unsigned int
1170ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
1171 const struct net_device *in, const struct net_device *out,
1172 int (*okfn)(struct sk_buff *))
1173{
1174 return ip_vs_out(hooknum, skb, AF_INET);
1175}
1176
1177/*
1178 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1179 * Check if packet is reply for established ip_vs_conn.
1180 */
1181static unsigned int
1182ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
1183 const struct net_device *in, const struct net_device *out,
1184 int (*okfn)(struct sk_buff *))
1185{
1186 unsigned int verdict;
1187
1188 /* Disable BH in LOCAL_OUT until all places are fixed */
1189 local_bh_disable();
1190 verdict = ip_vs_out(hooknum, skb, AF_INET);
1191 local_bh_enable();
1192 return verdict;
1193}
1194
1195#ifdef CONFIG_IP_VS_IPV6
1196
1197/*
cb59155f
JA
1198 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1199 * used only for VS/NAT.
fc604767
JA
1200 * Check if packet is reply for established ip_vs_conn.
1201 */
1202static unsigned int
1203ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
1204 const struct net_device *in, const struct net_device *out,
1205 int (*okfn)(struct sk_buff *))
1206{
1207 return ip_vs_out(hooknum, skb, AF_INET6);
1208}
1209
1210/*
1211 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1212 * Check if packet is reply for established ip_vs_conn.
1213 */
1214static unsigned int
1215ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
1216 const struct net_device *in, const struct net_device *out,
1217 int (*okfn)(struct sk_buff *))
1218{
1219 unsigned int verdict;
1220
1221 /* Disable BH in LOCAL_OUT until all places are fixed */
1222 local_bh_disable();
1223 verdict = ip_vs_out(hooknum, skb, AF_INET6);
1224 local_bh_enable();
1225 return verdict;
1226}
1227
1228#endif
1da177e4
LT
1229
1230/*
1231 * Handle ICMP messages in the outside-to-inside direction (incoming).
1232 * Find any that might be relevant, check against existing connections,
1233 * forward to the right destination host if relevant.
1234 * Currently handles error types - unreachable, quench, ttl exceeded.
1235 */
e905a9ed 1236static int
3db05fea 1237ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1da177e4 1238{
1da177e4
LT
1239 struct iphdr *iph;
1240 struct icmphdr _icmph, *ic;
1241 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
51ef348b 1242 struct ip_vs_iphdr ciph;
1da177e4
LT
1243 struct ip_vs_conn *cp;
1244 struct ip_vs_protocol *pp;
1245 unsigned int offset, ihl, verdict;
f2428ed5 1246 union nf_inet_addr snet;
1da177e4
LT
1247
1248 *related = 1;
1249
1250 /* reassemble IP fragments */
eddc9ec5 1251 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1ca5bb54 1252 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
1da177e4 1253 return NF_STOLEN;
1da177e4
LT
1254 }
1255
eddc9ec5 1256 iph = ip_hdr(skb);
1da177e4
LT
1257 offset = ihl = iph->ihl * 4;
1258 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1259 if (ic == NULL)
1260 return NF_DROP;
1261
14d5e834 1262 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1da177e4 1263 ic->type, ntohs(icmp_id(ic)),
14d5e834 1264 &iph->saddr, &iph->daddr);
1da177e4
LT
1265
1266 /*
1267 * Work through seeing if this is for us.
1268 * These checks are supposed to be in an order that means easy
1269 * things are checked first to speed up processing.... however
1270 * this means that some packets will manage to get a long way
1271 * down this stack and then be rejected, but that's life.
1272 */
1273 if ((ic->type != ICMP_DEST_UNREACH) &&
1274 (ic->type != ICMP_SOURCE_QUENCH) &&
1275 (ic->type != ICMP_TIME_EXCEEDED)) {
1276 *related = 0;
1277 return NF_ACCEPT;
1278 }
1279
1280 /* Now find the contained IP header */
1281 offset += sizeof(_icmph);
1282 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1283 if (cih == NULL)
1284 return NF_ACCEPT; /* The packet looks wrong, ignore */
1285
1286 pp = ip_vs_proto_get(cih->protocol);
1287 if (!pp)
1288 return NF_ACCEPT;
1289
1290 /* Is the embedded protocol header present? */
4412ec49 1291 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1da177e4
LT
1292 pp->dont_defrag))
1293 return NF_ACCEPT;
1294
0d79641a
JA
1295 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
1296 "Checking incoming ICMP for");
1da177e4
LT
1297
1298 offset += cih->ihl * 4;
1299
51ef348b 1300 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1da177e4 1301 /* The embedded headers contain source and dest in reverse order */
51ef348b 1302 cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
4856c84c
MT
1303 if (!cp) {
1304 /* The packet could also belong to a local client */
1305 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
f2428ed5
SH
1306 if (cp) {
1307 snet.ip = iph->saddr;
1308 return handle_response_icmp(AF_INET, skb, &snet,
1309 cih->protocol, cp, pp,
4856c84c 1310 offset, ihl);
f2428ed5 1311 }
1da177e4 1312 return NF_ACCEPT;
4856c84c 1313 }
1da177e4
LT
1314
1315 verdict = NF_DROP;
1316
1317 /* Ensure the checksum is correct */
60476372 1318 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1da177e4 1319 /* Failed checksum! */
14d5e834
HH
1320 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1321 &iph->saddr);
1da177e4
LT
1322 goto out;
1323 }
1324
1325 /* do the statistics and put it back */
1326 ip_vs_in_stats(cp, skb);
1327 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1328 offset += 2 * sizeof(__u16);
1329 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
489fdeda
JA
1330 /* LOCALNODE from FORWARD hook is not supported */
1331 if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
1332 skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
1333 IP_VS_DBG(1, "%s(): "
1334 "local delivery to %pI4 but in FORWARD\n",
1335 __func__, &skb_rtable(skb)->rt_dst);
1336 verdict = NF_DROP;
1337 }
1da177e4
LT
1338
1339 out:
1340 __ip_vs_conn_put(cp);
1341
1342 return verdict;
1343}
1344
2a3b791e
JV
1345#ifdef CONFIG_IP_VS_IPV6
1346static int
1347ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1348{
1349 struct ipv6hdr *iph;
1350 struct icmp6hdr _icmph, *ic;
1351 struct ipv6hdr _ciph, *cih; /* The ip header contained
1352 within the ICMP */
1353 struct ip_vs_iphdr ciph;
1354 struct ip_vs_conn *cp;
1355 struct ip_vs_protocol *pp;
1356 unsigned int offset, verdict;
f2428ed5 1357 union nf_inet_addr snet;
489fdeda 1358 struct rt6_info *rt;
2a3b791e
JV
1359
1360 *related = 1;
1361
1362 /* reassemble IP fragments */
1363 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1ca5bb54 1364 if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
2a3b791e
JV
1365 return NF_STOLEN;
1366 }
1367
1368 iph = ipv6_hdr(skb);
1369 offset = sizeof(struct ipv6hdr);
1370 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1371 if (ic == NULL)
1372 return NF_DROP;
1373
5b095d98 1374 IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
2a3b791e 1375 ic->icmp6_type, ntohs(icmpv6_id(ic)),
38ff4fa4 1376 &iph->saddr, &iph->daddr);
2a3b791e
JV
1377
1378 /*
1379 * Work through seeing if this is for us.
1380 * These checks are supposed to be in an order that means easy
1381 * things are checked first to speed up processing.... however
1382 * this means that some packets will manage to get a long way
1383 * down this stack and then be rejected, but that's life.
1384 */
1385 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1386 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1387 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1388 *related = 0;
1389 return NF_ACCEPT;
1390 }
1391
1392 /* Now find the contained IP header */
1393 offset += sizeof(_icmph);
1394 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1395 if (cih == NULL)
1396 return NF_ACCEPT; /* The packet looks wrong, ignore */
1397
1398 pp = ip_vs_proto_get(cih->nexthdr);
1399 if (!pp)
1400 return NF_ACCEPT;
1401
1402 /* Is the embedded protocol header present? */
1403 /* TODO: we don't support fragmentation at the moment anyways */
1404 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1405 return NF_ACCEPT;
1406
0d79641a
JA
1407 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
1408 "Checking incoming ICMPv6 for");
2a3b791e
JV
1409
1410 offset += sizeof(struct ipv6hdr);
1411
1412 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1413 /* The embedded headers contain source and dest in reverse order */
1414 cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
f2428ed5
SH
1415 if (!cp) {
1416 /* The packet could also belong to a local client */
1417 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1418 if (cp) {
178f5e49 1419 ipv6_addr_copy(&snet.in6, &iph->saddr);
f2428ed5
SH
1420 return handle_response_icmp(AF_INET6, skb, &snet,
1421 cih->nexthdr,
1422 cp, pp, offset,
1423 sizeof(struct ipv6hdr));
1424 }
2a3b791e 1425 return NF_ACCEPT;
f2428ed5 1426 }
2a3b791e
JV
1427
1428 verdict = NF_DROP;
1429
1430 /* do the statistics and put it back */
1431 ip_vs_in_stats(cp, skb);
2906f66a
VMR
1432 if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
1433 IPPROTO_SCTP == cih->nexthdr)
2a3b791e
JV
1434 offset += 2 * sizeof(__u16);
1435 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
489fdeda
JA
1436 /* LOCALNODE from FORWARD hook is not supported */
1437 if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
1438 (rt = (struct rt6_info *) skb_dst(skb)) &&
1439 rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) {
1440 IP_VS_DBG(1, "%s(): "
1441 "local delivery to %pI6 but in FORWARD\n",
1442 __func__, &rt->rt6i_dst);
1443 verdict = NF_DROP;
1444 }
2a3b791e
JV
1445
1446 __ip_vs_conn_put(cp);
1447
1448 return verdict;
1449}
1450#endif
1451
1452
1da177e4
LT
1453/*
1454 * Check if it's for virtual services, look it up,
1455 * and send it on its way...
1456 */
1457static unsigned int
cb59155f 1458ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1da177e4 1459{
51ef348b 1460 struct ip_vs_iphdr iph;
1da177e4
LT
1461 struct ip_vs_protocol *pp;
1462 struct ip_vs_conn *cp;
cb59155f 1463 int ret, restart, pkts;
2a3b791e 1464
fc604767
JA
1465 /* Already marked as IPVS request or reply? */
1466 if (skb->ipvs_property)
1467 return NF_ACCEPT;
1468
1da177e4 1469 /*
cb59155f
JA
1470 * Big tappo:
1471 * - remote client: only PACKET_HOST
1472 * - route: used for struct net when skb->dev is unset
1da177e4 1473 */
cb59155f
JA
1474 if (unlikely((skb->pkt_type != PACKET_HOST &&
1475 hooknum != NF_INET_LOCAL_OUT) ||
1476 !skb_dst(skb))) {
1477 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1478 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
1479 " ignored in hook %u\n",
1480 skb->pkt_type, iph.protocol,
1481 IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
1da177e4
LT
1482 return NF_ACCEPT;
1483 }
cb59155f
JA
1484 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1485
1486 /* Bad... Do not break raw sockets */
1487 if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1488 af == AF_INET)) {
1489 struct sock *sk = skb->sk;
1490 struct inet_sock *inet = inet_sk(skb->sk);
1491
1492 if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1493 return NF_ACCEPT;
1494 }
1da177e4 1495
94b26551
JV
1496#ifdef CONFIG_IP_VS_IPV6
1497 if (af == AF_INET6) {
1498 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1ca5bb54
JA
1499 int related;
1500 int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1da177e4 1501
94b26551
JV
1502 if (related)
1503 return verdict;
1504 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1505 }
1506 } else
1507#endif
1508 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1ca5bb54
JA
1509 int related;
1510 int verdict = ip_vs_in_icmp(skb, &related, hooknum);
94b26551
JV
1511
1512 if (related)
1513 return verdict;
1514 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1515 }
1da177e4
LT
1516
1517 /* Protocol supported? */
51ef348b 1518 pp = ip_vs_proto_get(iph.protocol);
1da177e4
LT
1519 if (unlikely(!pp))
1520 return NF_ACCEPT;
1521
1da177e4
LT
1522 /*
1523 * Check if the packet belongs to an existing connection entry
1524 */
2a3b791e 1525 cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1da177e4
LT
1526
1527 if (unlikely(!cp)) {
1528 int v;
1529
2a3b791e 1530 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1da177e4
LT
1531 return v;
1532 }
1533
1534 if (unlikely(!cp)) {
1535 /* sorry, all this trouble for a no-hit :) */
0d79641a 1536 IP_VS_DBG_PKT(12, af, pp, skb, 0,
cb59155f 1537 "ip_vs_in: packet continues traversal as normal");
1da177e4
LT
1538 return NF_ACCEPT;
1539 }
1540
0d79641a 1541 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
1da177e4
LT
1542
1543 /* Check the server status */
1544 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1545 /* the destination server is not available */
1546
1547 if (sysctl_ip_vs_expire_nodest_conn) {
1548 /* try to expire the connection immediately */
1549 ip_vs_conn_expire_now(cp);
1da177e4 1550 }
dc8103f2
JA
1551 /* don't restart its timer, and silently
1552 drop the packet. */
1553 __ip_vs_conn_put(cp);
1da177e4
LT
1554 return NF_DROP;
1555 }
1556
1557 ip_vs_in_stats(cp, skb);
1558 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1559 if (cp->packet_xmit)
1560 ret = cp->packet_xmit(skb, cp, pp);
1561 /* do not touch skb anymore */
1562 else {
1563 IP_VS_DBG_RL("warning: packet_xmit is null");
1564 ret = NF_ACCEPT;
1565 }
1566
efac5276
RB
1567 /* Increase its packet counter and check if it is needed
1568 * to be synchronized
1569 *
1570 * Sync connection if it is about to close to
1571 * encorage the standby servers to update the connections timeout
986a0757
HS
1572 *
1573 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
efac5276 1574 */
986a0757
HS
1575 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
1576 pkts = sysctl_ip_vs_sync_threshold[0];
1577 else
1578 pkts = atomic_add_return(1, &cp->in_pkts);
1579
1580 if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
2906f66a
VMR
1581 cp->protocol == IPPROTO_SCTP) {
1582 if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
8ed2163f 1583 (pkts % sysctl_ip_vs_sync_threshold[1]
2906f66a
VMR
1584 == sysctl_ip_vs_sync_threshold[0])) ||
1585 (cp->old_state != cp->state &&
1586 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
1587 (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
1588 (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
1589 ip_vs_sync_conn(cp);
1590 goto out;
1591 }
1592 }
1593
8ed2163f 1594 /* Keep this block last: TCP and others with pp->num_states <= 1 */
986a0757 1595 else if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
efac5276
RB
1596 (((cp->protocol != IPPROTO_TCP ||
1597 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1e66dafc 1598 (pkts % sysctl_ip_vs_sync_threshold[1]
efac5276
RB
1599 == sysctl_ip_vs_sync_threshold[0])) ||
1600 ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1601 ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
9abfe315 1602 (cp->state == IP_VS_TCP_S_CLOSE) ||
9d3a0de7
RB
1603 (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1604 (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1da177e4 1605 ip_vs_sync_conn(cp);
2906f66a 1606out:
efac5276 1607 cp->old_state = cp->state;
1da177e4
LT
1608
1609 ip_vs_conn_put(cp);
1610 return ret;
1611}
1612
cb59155f
JA
1613/*
1614 * AF_INET handler in NF_INET_LOCAL_IN chain
1615 * Schedule and forward packets from remote clients
1616 */
1617static unsigned int
1618ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
1619 const struct net_device *in,
1620 const struct net_device *out,
1621 int (*okfn)(struct sk_buff *))
1622{
1623 return ip_vs_in(hooknum, skb, AF_INET);
1624}
1625
1626/*
1627 * AF_INET handler in NF_INET_LOCAL_OUT chain
1628 * Schedule and forward packets from local clients
1629 */
1630static unsigned int
1631ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
1632 const struct net_device *in, const struct net_device *out,
1633 int (*okfn)(struct sk_buff *))
1634{
1635 unsigned int verdict;
1636
1637 /* Disable BH in LOCAL_OUT until all places are fixed */
1638 local_bh_disable();
1639 verdict = ip_vs_in(hooknum, skb, AF_INET);
1640 local_bh_enable();
1641 return verdict;
1642}
1643
1644#ifdef CONFIG_IP_VS_IPV6
1645
1646/*
1647 * AF_INET6 handler in NF_INET_LOCAL_IN chain
1648 * Schedule and forward packets from remote clients
1649 */
1650static unsigned int
1651ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
1652 const struct net_device *in,
1653 const struct net_device *out,
1654 int (*okfn)(struct sk_buff *))
1655{
1656 return ip_vs_in(hooknum, skb, AF_INET6);
1657}
1658
1659/*
1660 * AF_INET6 handler in NF_INET_LOCAL_OUT chain
1661 * Schedule and forward packets from local clients
1662 */
1663static unsigned int
1664ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
1665 const struct net_device *in, const struct net_device *out,
1666 int (*okfn)(struct sk_buff *))
1667{
1668 unsigned int verdict;
1669
1670 /* Disable BH in LOCAL_OUT until all places are fixed */
1671 local_bh_disable();
1672 verdict = ip_vs_in(hooknum, skb, AF_INET6);
1673 local_bh_enable();
1674 return verdict;
1675}
1676
1677#endif
1678
1da177e4
LT
1679
1680/*
6e23ae2a 1681 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1da177e4
LT
1682 * related packets destined for 0.0.0.0/0.
1683 * When fwmark-based virtual service is used, such as transparent
1684 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1685 * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
6e23ae2a 1686 * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1da177e4
LT
1687 * and send them to ip_vs_in_icmp.
1688 */
1689static unsigned int
3db05fea 1690ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1da177e4
LT
1691 const struct net_device *in, const struct net_device *out,
1692 int (*okfn)(struct sk_buff *))
1693{
1694 int r;
1695
3db05fea 1696 if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1da177e4
LT
1697 return NF_ACCEPT;
1698
3db05fea 1699 return ip_vs_in_icmp(skb, &r, hooknum);
1da177e4
LT
1700}
1701
2a3b791e
JV
1702#ifdef CONFIG_IP_VS_IPV6
1703static unsigned int
1704ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1705 const struct net_device *in, const struct net_device *out,
1706 int (*okfn)(struct sk_buff *))
1707{
1708 int r;
1709
1710 if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1711 return NF_ACCEPT;
1712
1713 return ip_vs_in_icmp_v6(skb, &r, hooknum);
1714}
1715#endif
1716
1da177e4 1717
1999414a 1718static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
cb59155f
JA
1719 /* After packet filtering, change source only for VS/NAT */
1720 {
1721 .hook = ip_vs_reply4,
1722 .owner = THIS_MODULE,
1723 .pf = PF_INET,
1724 .hooknum = NF_INET_LOCAL_IN,
1725 .priority = 99,
1726 },
41c5b317
PM
1727 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1728 * or VS/NAT(change destination), so that filtering rules can be
1729 * applied to IPVS. */
1730 {
cb59155f 1731 .hook = ip_vs_remote_request4,
41c5b317
PM
1732 .owner = THIS_MODULE,
1733 .pf = PF_INET,
cb59155f
JA
1734 .hooknum = NF_INET_LOCAL_IN,
1735 .priority = 101,
41c5b317 1736 },
fc604767 1737 /* Before ip_vs_in, change source only for VS/NAT */
41c5b317 1738 {
fc604767 1739 .hook = ip_vs_local_reply4,
41c5b317
PM
1740 .owner = THIS_MODULE,
1741 .pf = PF_INET,
fc604767
JA
1742 .hooknum = NF_INET_LOCAL_OUT,
1743 .priority = -99,
41c5b317 1744 },
cb59155f
JA
1745 /* After mangle, schedule and forward local requests */
1746 {
1747 .hook = ip_vs_local_request4,
1748 .owner = THIS_MODULE,
1749 .pf = PF_INET,
1750 .hooknum = NF_INET_LOCAL_OUT,
1751 .priority = -98,
1752 },
41c5b317
PM
1753 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1754 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1755 {
1756 .hook = ip_vs_forward_icmp,
1757 .owner = THIS_MODULE,
1758 .pf = PF_INET,
cb59155f
JA
1759 .hooknum = NF_INET_FORWARD,
1760 .priority = 99,
41c5b317 1761 },
fc604767
JA
1762 /* After packet filtering, change source only for VS/NAT */
1763 {
1764 .hook = ip_vs_reply4,
1765 .owner = THIS_MODULE,
1766 .pf = PF_INET,
1767 .hooknum = NF_INET_FORWARD,
1768 .priority = 100,
1769 },
473b23d3 1770#ifdef CONFIG_IP_VS_IPV6
cb59155f
JA
1771 /* After packet filtering, change source only for VS/NAT */
1772 {
1773 .hook = ip_vs_reply6,
1774 .owner = THIS_MODULE,
1775 .pf = PF_INET6,
1776 .hooknum = NF_INET_LOCAL_IN,
1777 .priority = 99,
1778 },
473b23d3
JV
1779 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1780 * or VS/NAT(change destination), so that filtering rules can be
1781 * applied to IPVS. */
1782 {
cb59155f 1783 .hook = ip_vs_remote_request6,
473b23d3
JV
1784 .owner = THIS_MODULE,
1785 .pf = PF_INET6,
cb59155f
JA
1786 .hooknum = NF_INET_LOCAL_IN,
1787 .priority = 101,
473b23d3 1788 },
fc604767 1789 /* Before ip_vs_in, change source only for VS/NAT */
473b23d3 1790 {
fc604767 1791 .hook = ip_vs_local_reply6,
473b23d3 1792 .owner = THIS_MODULE,
fc604767
JA
1793 .pf = PF_INET,
1794 .hooknum = NF_INET_LOCAL_OUT,
1795 .priority = -99,
473b23d3 1796 },
cb59155f
JA
1797 /* After mangle, schedule and forward local requests */
1798 {
1799 .hook = ip_vs_local_request6,
1800 .owner = THIS_MODULE,
1801 .pf = PF_INET6,
1802 .hooknum = NF_INET_LOCAL_OUT,
1803 .priority = -98,
1804 },
473b23d3
JV
1805 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1806 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1807 {
1808 .hook = ip_vs_forward_icmp_v6,
1809 .owner = THIS_MODULE,
1810 .pf = PF_INET6,
cb59155f
JA
1811 .hooknum = NF_INET_FORWARD,
1812 .priority = 99,
473b23d3 1813 },
fc604767
JA
1814 /* After packet filtering, change source only for VS/NAT */
1815 {
1816 .hook = ip_vs_reply6,
1817 .owner = THIS_MODULE,
1818 .pf = PF_INET6,
1819 .hooknum = NF_INET_FORWARD,
1820 .priority = 100,
1821 },
473b23d3 1822#endif
1da177e4
LT
1823};
1824
61b1ab45
HS
1825/*
1826 * Initialize IP Virtual Server netns mem.
1827 */
1828static int __net_init __ip_vs_init(struct net *net)
1829{
1830 struct netns_ipvs *ipvs;
1831
1832 if (!net_eq(net, &init_net)) {
1833 pr_err("The final patch for enabling netns is missing\n");
1834 return -EPERM;
1835 }
1836 ipvs = net_generic(net, ip_vs_net_id);
1837 if (ipvs == NULL) {
1838 pr_err("%s(): no memory.\n", __func__);
1839 return -ENOMEM;
1840 }
1841 /* Counters used for creating unique names */
1842 ipvs->gen = atomic_read(&ipvs_netns_cnt);
1843 atomic_inc(&ipvs_netns_cnt);
1844 net->ipvs = ipvs;
1845 printk(KERN_INFO "IPVS: Creating netns size=%lu id=%d\n",
1846 sizeof(struct netns_ipvs), ipvs->gen);
1847 return 0;
1848}
1849
1850static void __net_exit __ip_vs_cleanup(struct net *net)
1851{
1852 struct netns_ipvs *ipvs = net_ipvs(net);
1853
1854 IP_VS_DBG(10, "ipvs netns %d released\n", ipvs->gen);
1855}
1856
1857static struct pernet_operations ipvs_core_ops = {
1858 .init = __ip_vs_init,
1859 .exit = __ip_vs_cleanup,
1860 .id = &ip_vs_net_id,
1861 .size = sizeof(struct netns_ipvs),
1862};
1da177e4
LT
1863
1864/*
1865 * Initialize IP Virtual Server
1866 */
1867static int __init ip_vs_init(void)
1868{
1869 int ret;
1870
61b1ab45
HS
1871 ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
1872 if (ret < 0)
1873 return ret;
a919cf4b 1874
61b1ab45 1875 ip_vs_estimator_init();
1da177e4
LT
1876 ret = ip_vs_control_init();
1877 if (ret < 0) {
1e3e238e 1878 pr_err("can't setup control.\n");
a919cf4b 1879 goto cleanup_estimator;
1da177e4
LT
1880 }
1881
1882 ip_vs_protocol_init();
1883
1884 ret = ip_vs_app_init();
1885 if (ret < 0) {
1e3e238e 1886 pr_err("can't setup application helper.\n");
1da177e4
LT
1887 goto cleanup_protocol;
1888 }
1889
1890 ret = ip_vs_conn_init();
1891 if (ret < 0) {
1e3e238e 1892 pr_err("can't setup connection table.\n");
1da177e4
LT
1893 goto cleanup_app;
1894 }
1895
61b1ab45
HS
1896 ret = ip_vs_sync_init();
1897 if (ret < 0) {
1898 pr_err("can't setup sync data.\n");
1899 goto cleanup_conn;
1900 }
1901
41c5b317 1902 ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1da177e4 1903 if (ret < 0) {
1e3e238e 1904 pr_err("can't register hooks.\n");
61b1ab45 1905 goto cleanup_sync;
1da177e4
LT
1906 }
1907
1e3e238e 1908 pr_info("ipvs loaded.\n");
1da177e4
LT
1909 return ret;
1910
61b1ab45
HS
1911cleanup_sync:
1912 ip_vs_sync_cleanup();
1da177e4
LT
1913 cleanup_conn:
1914 ip_vs_conn_cleanup();
1915 cleanup_app:
1916 ip_vs_app_cleanup();
1917 cleanup_protocol:
1918 ip_vs_protocol_cleanup();
1919 ip_vs_control_cleanup();
a919cf4b
SW
1920 cleanup_estimator:
1921 ip_vs_estimator_cleanup();
61b1ab45 1922 unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
1da177e4
LT
1923 return ret;
1924}
1925
1926static void __exit ip_vs_cleanup(void)
1927{
41c5b317 1928 nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
61b1ab45 1929 ip_vs_sync_cleanup();
1da177e4
LT
1930 ip_vs_conn_cleanup();
1931 ip_vs_app_cleanup();
1932 ip_vs_protocol_cleanup();
1933 ip_vs_control_cleanup();
a919cf4b 1934 ip_vs_estimator_cleanup();
61b1ab45 1935 unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
1e3e238e 1936 pr_info("ipvs unloaded.\n");
1da177e4
LT
1937}
1938
1939module_init(ip_vs_init);
1940module_exit(ip_vs_cleanup);
1941MODULE_LICENSE("GPL");