[NETFILTER]: Avoid skb_copy/pskb_copy/skb_realloc_headroom
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / ipv4 / ipvs / ip_vs_core.c
CommitLineData
1da177e4
LT
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
21 * and others.
22 *
23 * Changes:
24 * Paul `Rusty' Russell properly handle non-linear skbs
6869c4d8 25 * Harald Welte don't use nfcache
1da177e4
LT
26 *
27 */
28
29#include <linux/module.h>
30#include <linux/kernel.h>
31#include <linux/ip.h>
32#include <linux/tcp.h>
33#include <linux/icmp.h>
34
35#include <net/ip.h>
36#include <net/tcp.h>
37#include <net/udp.h>
38#include <net/icmp.h> /* for icmp_send */
39#include <net/route.h>
40
41#include <linux/netfilter.h>
42#include <linux/netfilter_ipv4.h>
43
44#include <net/ip_vs.h>
45
46
47EXPORT_SYMBOL(register_ip_vs_scheduler);
48EXPORT_SYMBOL(unregister_ip_vs_scheduler);
49EXPORT_SYMBOL(ip_vs_skb_replace);
50EXPORT_SYMBOL(ip_vs_proto_name);
51EXPORT_SYMBOL(ip_vs_conn_new);
52EXPORT_SYMBOL(ip_vs_conn_in_get);
53EXPORT_SYMBOL(ip_vs_conn_out_get);
54#ifdef CONFIG_IP_VS_PROTO_TCP
55EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
56#endif
57EXPORT_SYMBOL(ip_vs_conn_put);
58#ifdef CONFIG_IP_VS_DEBUG
59EXPORT_SYMBOL(ip_vs_get_debug_level);
60#endif
1da177e4
LT
61
62
/* Echo identifier of an ICMP header; serves as the "id" in ICMP lookups */
#define icmp_id(icmph)		((icmph)->un.echo.id)
65
/*
 *	Map an IP protocol number to a printable name.
 *
 *	IP, UDP, TCP and ICMP yield string literals.  Any other value is
 *	formatted as "IP_<number>" into a function-static buffer, so that
 *	result is overwritten by the next call that takes the default
 *	branch and must not be held across such calls.
 */
const char *ip_vs_proto_name(unsigned proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_ICMP:
		return "ICMP";
	default:
		/* proto is unsigned: print with %u and bound the write */
		snprintf(buf, sizeof(buf), "IP_%u", proto);
		return buf;
	}
}
84
85void ip_vs_init_hash_table(struct list_head *table, int rows)
86{
87 while (--rows >= 0)
88 INIT_LIST_HEAD(&table[rows]);
89}
90
91static inline void
92ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
93{
94 struct ip_vs_dest *dest = cp->dest;
95 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
96 spin_lock(&dest->stats.lock);
97 dest->stats.inpkts++;
98 dest->stats.inbytes += skb->len;
99 spin_unlock(&dest->stats.lock);
100
101 spin_lock(&dest->svc->stats.lock);
102 dest->svc->stats.inpkts++;
103 dest->svc->stats.inbytes += skb->len;
104 spin_unlock(&dest->svc->stats.lock);
105
106 spin_lock(&ip_vs_stats.lock);
107 ip_vs_stats.inpkts++;
108 ip_vs_stats.inbytes += skb->len;
109 spin_unlock(&ip_vs_stats.lock);
110 }
111}
112
113
114static inline void
115ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
116{
117 struct ip_vs_dest *dest = cp->dest;
118 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
119 spin_lock(&dest->stats.lock);
120 dest->stats.outpkts++;
121 dest->stats.outbytes += skb->len;
122 spin_unlock(&dest->stats.lock);
123
124 spin_lock(&dest->svc->stats.lock);
125 dest->svc->stats.outpkts++;
126 dest->svc->stats.outbytes += skb->len;
127 spin_unlock(&dest->svc->stats.lock);
128
129 spin_lock(&ip_vs_stats.lock);
130 ip_vs_stats.outpkts++;
131 ip_vs_stats.outbytes += skb->len;
132 spin_unlock(&ip_vs_stats.lock);
133 }
134}
135
136
137static inline void
138ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
139{
140 spin_lock(&cp->dest->stats.lock);
141 cp->dest->stats.conns++;
142 spin_unlock(&cp->dest->stats.lock);
143
144 spin_lock(&svc->stats.lock);
145 svc->stats.conns++;
146 spin_unlock(&svc->stats.lock);
147
148 spin_lock(&ip_vs_stats.lock);
149 ip_vs_stats.conns++;
150 spin_unlock(&ip_vs_stats.lock);
151}
152
153
154static inline int
155ip_vs_set_state(struct ip_vs_conn *cp, int direction,
156 const struct sk_buff *skb,
157 struct ip_vs_protocol *pp)
158{
159 if (unlikely(!pp->state_transition))
160 return 0;
161 return pp->state_transition(cp, direction, skb, pp);
162}
163
164
1da177e4
LT
165/*
166 * IPVS persistent scheduling function
167 * It creates a connection entry according to its template if exists,
168 * or selects a server and creates a connection entry plus a template.
169 * Locking: we are svc user (svc->refcnt), so we hold all dests too
170 * Protocols supported: TCP, UDP
171 */
172static struct ip_vs_conn *
173ip_vs_sched_persist(struct ip_vs_service *svc,
174 const struct sk_buff *skb,
014d730d 175 __be16 ports[2])
1da177e4
LT
176{
177 struct ip_vs_conn *cp = NULL;
eddc9ec5 178 struct iphdr *iph = ip_hdr(skb);
1da177e4
LT
179 struct ip_vs_dest *dest;
180 struct ip_vs_conn *ct;
014d730d
AV
181 __be16 dport; /* destination port to forward */
182 __be32 snet; /* source network of the client, after masking */
1da177e4
LT
183
184 /* Mask saddr with the netmask to adjust template granularity */
185 snet = iph->saddr & svc->netmask;
186
187 IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
188 "mnet %u.%u.%u.%u\n",
189 NIPQUAD(iph->saddr), ntohs(ports[0]),
190 NIPQUAD(iph->daddr), ntohs(ports[1]),
191 NIPQUAD(snet));
192
193 /*
194 * As far as we know, FTP is a very complicated network protocol, and
195 * it uses control connection and data connections. For active FTP,
196 * FTP server initialize data connection to the client, its source port
197 * is often 20. For passive FTP, FTP server tells the clients the port
198 * that it passively listens to, and the client issues the data
199 * connection. In the tunneling or direct routing mode, the load
200 * balancer is on the client-to-server half of connection, the port
201 * number is unknown to the load balancer. So, a conn template like
202 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
203 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
204 * is created for other persistent services.
205 */
206 if (ports[1] == svc->port) {
207 /* Check if a template already exists */
208 if (svc->port != FTPPORT)
87375ab4 209 ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
1da177e4
LT
210 iph->daddr, ports[1]);
211 else
87375ab4 212 ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
1da177e4
LT
213 iph->daddr, 0);
214
215 if (!ct || !ip_vs_check_template(ct)) {
216 /*
217 * No template found or the dest of the connection
218 * template is not available.
219 */
220 dest = svc->scheduler->schedule(svc, skb);
221 if (dest == NULL) {
222 IP_VS_DBG(1, "p-schedule: no dest found.\n");
223 return NULL;
224 }
225
226 /*
227 * Create a template like <protocol,caddr,0,
228 * vaddr,vport,daddr,dport> for non-ftp service,
229 * and <protocol,caddr,0,vaddr,0,daddr,0>
230 * for ftp service.
231 */
232 if (svc->port != FTPPORT)
233 ct = ip_vs_conn_new(iph->protocol,
234 snet, 0,
235 iph->daddr,
236 ports[1],
237 dest->addr, dest->port,
87375ab4 238 IP_VS_CONN_F_TEMPLATE,
1da177e4
LT
239 dest);
240 else
241 ct = ip_vs_conn_new(iph->protocol,
242 snet, 0,
243 iph->daddr, 0,
244 dest->addr, 0,
87375ab4 245 IP_VS_CONN_F_TEMPLATE,
1da177e4
LT
246 dest);
247 if (ct == NULL)
248 return NULL;
249
250 ct->timeout = svc->timeout;
251 } else {
252 /* set destination with the found template */
253 dest = ct->dest;
254 }
255 dport = dest->port;
256 } else {
257 /*
258 * Note: persistent fwmark-based services and persistent
259 * port zero service are handled here.
260 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
261 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
262 */
263 if (svc->fwmark)
87375ab4 264 ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0,
1da177e4
LT
265 htonl(svc->fwmark), 0);
266 else
87375ab4 267 ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
1da177e4
LT
268 iph->daddr, 0);
269
270 if (!ct || !ip_vs_check_template(ct)) {
271 /*
272 * If it is not persistent port zero, return NULL,
273 * otherwise create a connection template.
274 */
275 if (svc->port)
276 return NULL;
277
278 dest = svc->scheduler->schedule(svc, skb);
279 if (dest == NULL) {
280 IP_VS_DBG(1, "p-schedule: no dest found.\n");
281 return NULL;
282 }
283
284 /*
285 * Create a template according to the service
286 */
287 if (svc->fwmark)
288 ct = ip_vs_conn_new(IPPROTO_IP,
289 snet, 0,
290 htonl(svc->fwmark), 0,
291 dest->addr, 0,
87375ab4 292 IP_VS_CONN_F_TEMPLATE,
1da177e4
LT
293 dest);
294 else
295 ct = ip_vs_conn_new(iph->protocol,
296 snet, 0,
297 iph->daddr, 0,
298 dest->addr, 0,
87375ab4 299 IP_VS_CONN_F_TEMPLATE,
1da177e4
LT
300 dest);
301 if (ct == NULL)
302 return NULL;
303
304 ct->timeout = svc->timeout;
305 } else {
306 /* set destination with the found template */
307 dest = ct->dest;
308 }
309 dport = ports[1];
310 }
311
312 /*
313 * Create a new connection according to the template
314 */
315 cp = ip_vs_conn_new(iph->protocol,
316 iph->saddr, ports[0],
317 iph->daddr, ports[1],
318 dest->addr, dport,
319 0,
320 dest);
321 if (cp == NULL) {
322 ip_vs_conn_put(ct);
323 return NULL;
324 }
325
326 /*
327 * Add its control
328 */
329 ip_vs_control_add(cp, ct);
330 ip_vs_conn_put(ct);
331
332 ip_vs_conn_stats(cp, svc);
333 return cp;
334}
335
336
337/*
338 * IPVS main scheduling function
339 * It selects a server according to the virtual service, and
340 * creates a connection entry.
341 * Protocols supported: TCP, UDP
342 */
343struct ip_vs_conn *
344ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
345{
346 struct ip_vs_conn *cp = NULL;
eddc9ec5 347 struct iphdr *iph = ip_hdr(skb);
1da177e4 348 struct ip_vs_dest *dest;
014d730d 349 __be16 _ports[2], *pptr;
1da177e4
LT
350
351 pptr = skb_header_pointer(skb, iph->ihl*4,
352 sizeof(_ports), _ports);
353 if (pptr == NULL)
354 return NULL;
355
356 /*
357 * Persistent service
358 */
359 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
360 return ip_vs_sched_persist(svc, skb, pptr);
361
362 /*
363 * Non-persistent service
364 */
365 if (!svc->fwmark && pptr[1] != svc->port) {
366 if (!svc->port)
367 IP_VS_ERR("Schedule: port zero only supported "
368 "in persistent services, "
369 "check your ipvs configuration\n");
370 return NULL;
371 }
372
373 dest = svc->scheduler->schedule(svc, skb);
374 if (dest == NULL) {
375 IP_VS_DBG(1, "Schedule: no dest found.\n");
376 return NULL;
377 }
378
379 /*
380 * Create a connection entry.
381 */
382 cp = ip_vs_conn_new(iph->protocol,
383 iph->saddr, pptr[0],
384 iph->daddr, pptr[1],
385 dest->addr, dest->port?dest->port:pptr[1],
386 0,
387 dest);
388 if (cp == NULL)
389 return NULL;
390
391 IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
4b5bdf5c 392 "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
1da177e4
LT
393 ip_vs_fwd_tag(cp),
394 NIPQUAD(cp->caddr), ntohs(cp->cport),
395 NIPQUAD(cp->vaddr), ntohs(cp->vport),
396 NIPQUAD(cp->daddr), ntohs(cp->dport),
397 cp->flags, atomic_read(&cp->refcnt));
398
399 ip_vs_conn_stats(cp, svc);
400 return cp;
401}
402
403
404/*
405 * Pass or drop the packet.
406 * Called by ip_vs_in, when the virtual service is available but
407 * no destination is available for a new connection.
408 */
409int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
410 struct ip_vs_protocol *pp)
411{
014d730d 412 __be16 _ports[2], *pptr;
eddc9ec5 413 struct iphdr *iph = ip_hdr(skb);
1da177e4
LT
414
415 pptr = skb_header_pointer(skb, iph->ihl*4,
416 sizeof(_ports), _ports);
417 if (pptr == NULL) {
418 ip_vs_service_put(svc);
419 return NF_DROP;
420 }
421
422 /* if it is fwmark-based service, the cache_bypass sysctl is up
423 and the destination is RTN_UNICAST (and not local), then create
424 a cache_bypass connection entry */
425 if (sysctl_ip_vs_cache_bypass && svc->fwmark
426 && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
427 int ret, cs;
428 struct ip_vs_conn *cp;
429
430 ip_vs_service_put(svc);
431
432 /* create a new connection entry */
433 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
434 cp = ip_vs_conn_new(iph->protocol,
435 iph->saddr, pptr[0],
436 iph->daddr, pptr[1],
437 0, 0,
438 IP_VS_CONN_F_BYPASS,
439 NULL);
440 if (cp == NULL)
441 return NF_DROP;
442
443 /* statistics */
444 ip_vs_in_stats(cp, skb);
445
446 /* set state */
447 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
448
449 /* transmit the first SYN packet */
450 ret = cp->packet_xmit(skb, cp, pp);
451 /* do not touch skb anymore */
452
453 atomic_inc(&cp->in_pkts);
454 ip_vs_conn_put(cp);
455 return ret;
456 }
457
458 /*
459 * When the virtual ftp service is presented, packets destined
460 * for other services on the VIP may get here (except services
461 * listed in the ipvs table), pass the packets, because it is
462 * not ipvs job to decide to drop the packets.
463 */
464 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
465 ip_vs_service_put(svc);
466 return NF_ACCEPT;
467 }
468
469 ip_vs_service_put(svc);
470
471 /*
472 * Notify the client that the destination is unreachable, and
473 * release the socket buffer.
474 * Since it is in IP layer, the TCP socket is not actually
475 * created, the TCP RST packet cannot be sent, instead that
476 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
477 */
478 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
479 return NF_DROP;
480}
481
482
483/*
484 * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
485 * chain, and is used for VS/NAT.
486 * It detects packets for VS/NAT connections and sends the packets
487 * immediately. This can avoid that iptable_nat mangles the packets
488 * for VS/NAT.
489 */
490static unsigned int ip_vs_post_routing(unsigned int hooknum,
491 struct sk_buff **pskb,
492 const struct net_device *in,
493 const struct net_device *out,
494 int (*okfn)(struct sk_buff *))
495{
6869c4d8 496 if (!((*pskb)->ipvs_property))
1da177e4 497 return NF_ACCEPT;
1da177e4 498 /* The packet was sent from IPVS, exit this chain */
abbcc739 499 return NF_STOP;
1da177e4
LT
500}
501
b1550f22 502__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
1da177e4 503{
d3bc23e7 504 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
1da177e4
LT
505}
506
776c729e 507static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
1da177e4 508{
776c729e
HX
509 int err = ip_defrag(skb, user);
510
511 if (!err)
eddc9ec5 512 ip_send_check(ip_hdr(skb));
776c729e
HX
513
514 return err;
1da177e4
LT
515}
516
517/*
518 * Packet has been made sufficiently writable in caller
519 * - inout: 1=in->out, 0=out->in
520 */
521void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
522 struct ip_vs_conn *cp, int inout)
523{
eddc9ec5 524 struct iphdr *iph = ip_hdr(skb);
1da177e4 525 unsigned int icmp_offset = iph->ihl*4;
d56f90a7
ACM
526 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
527 icmp_offset);
1da177e4
LT
528 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
529
530 if (inout) {
531 iph->saddr = cp->vaddr;
532 ip_send_check(iph);
533 ciph->daddr = cp->vaddr;
534 ip_send_check(ciph);
535 } else {
536 iph->daddr = cp->daddr;
537 ip_send_check(iph);
538 ciph->saddr = cp->daddr;
539 ip_send_check(ciph);
540 }
541
542 /* the TCP/UDP port */
543 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
014d730d 544 __be16 *ports = (void *)ciph + ciph->ihl*4;
1da177e4
LT
545
546 if (inout)
547 ports[1] = cp->vport;
548 else
549 ports[0] = cp->dport;
550 }
551
552 /* And finally the ICMP checksum */
553 icmph->checksum = 0;
554 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
555 skb->ip_summed = CHECKSUM_UNNECESSARY;
556
557 if (inout)
558 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
559 "Forwarding altered outgoing ICMP");
560 else
561 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
562 "Forwarding altered incoming ICMP");
563}
564
565/*
566 * Handle ICMP messages in the inside-to-outside direction (outgoing).
567 * Find any that might be relevant, check against existing connections,
568 * forward to the right destination host if relevant.
569 * Currently handles error types - unreachable, quench, ttl exceeded.
570 * (Only used in VS/NAT)
571 */
572static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
573{
574 struct sk_buff *skb = *pskb;
575 struct iphdr *iph;
576 struct icmphdr _icmph, *ic;
577 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
578 struct ip_vs_conn *cp;
579 struct ip_vs_protocol *pp;
580 unsigned int offset, ihl, verdict;
581
582 *related = 1;
583
584 /* reassemble IP fragments */
eddc9ec5 585 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
776c729e 586 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
1da177e4 587 return NF_STOLEN;
1da177e4
LT
588 }
589
eddc9ec5 590 iph = ip_hdr(skb);
1da177e4
LT
591 offset = ihl = iph->ihl * 4;
592 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
593 if (ic == NULL)
594 return NF_DROP;
595
596 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
597 ic->type, ntohs(icmp_id(ic)),
598 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
599
600 /*
601 * Work through seeing if this is for us.
602 * These checks are supposed to be in an order that means easy
603 * things are checked first to speed up processing.... however
604 * this means that some packets will manage to get a long way
605 * down this stack and then be rejected, but that's life.
606 */
607 if ((ic->type != ICMP_DEST_UNREACH) &&
608 (ic->type != ICMP_SOURCE_QUENCH) &&
609 (ic->type != ICMP_TIME_EXCEEDED)) {
610 *related = 0;
611 return NF_ACCEPT;
612 }
613
614 /* Now find the contained IP header */
615 offset += sizeof(_icmph);
616 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
617 if (cih == NULL)
618 return NF_ACCEPT; /* The packet looks wrong, ignore */
619
620 pp = ip_vs_proto_get(cih->protocol);
621 if (!pp)
622 return NF_ACCEPT;
623
624 /* Is the embedded protocol header present? */
4412ec49 625 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1da177e4
LT
626 pp->dont_defrag))
627 return NF_ACCEPT;
628
629 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
630
631 offset += cih->ihl * 4;
632
633 /* The embedded headers contain source and dest in reverse order */
634 cp = pp->conn_out_get(skb, pp, cih, offset, 1);
635 if (!cp)
636 return NF_ACCEPT;
637
638 verdict = NF_DROP;
639
640 if (IP_VS_FWD_METHOD(cp) != 0) {
641 IP_VS_ERR("shouldn't reach here, because the box is on the"
642 "half connection in the tun/dr module.\n");
643 }
644
645 /* Ensure the checksum is correct */
60476372 646 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1da177e4
LT
647 /* Failed checksum! */
648 IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
649 NIPQUAD(iph->saddr));
650 goto out;
651 }
652
653 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
654 offset += 2 * sizeof(__u16);
af1e1cf0 655 if (!skb_make_writable(skb, offset))
1da177e4 656 goto out;
1da177e4
LT
657
658 ip_vs_nat_icmp(skb, pp, cp, 1);
659
660 /* do the statistics and put it back */
661 ip_vs_out_stats(cp, skb);
662
6869c4d8 663 skb->ipvs_property = 1;
1da177e4
LT
664 verdict = NF_ACCEPT;
665
666 out:
667 __ip_vs_conn_put(cp);
668
669 return verdict;
670}
671
672static inline int is_tcp_reset(const struct sk_buff *skb)
673{
674 struct tcphdr _tcph, *th;
675
c9bdd4b5 676 th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
1da177e4
LT
677 if (th == NULL)
678 return 0;
679 return th->rst;
680}
681
682/*
683 * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
684 * Check if outgoing packet belongs to the established ip_vs_conn,
685 * rewrite addresses of the packet and send it on its way...
686 */
687static unsigned int
688ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
689 const struct net_device *in, const struct net_device *out,
690 int (*okfn)(struct sk_buff *))
691{
692 struct sk_buff *skb = *pskb;
693 struct iphdr *iph;
694 struct ip_vs_protocol *pp;
695 struct ip_vs_conn *cp;
696 int ihl;
697
698 EnterFunction(11);
699
6869c4d8 700 if (skb->ipvs_property)
1da177e4
LT
701 return NF_ACCEPT;
702
eddc9ec5 703 iph = ip_hdr(skb);
1da177e4
LT
704 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
705 int related, verdict = ip_vs_out_icmp(pskb, &related);
706
707 if (related)
708 return verdict;
709 skb = *pskb;
eddc9ec5 710 iph = ip_hdr(skb);
1da177e4
LT
711 }
712
713 pp = ip_vs_proto_get(iph->protocol);
714 if (unlikely(!pp))
715 return NF_ACCEPT;
716
717 /* reassemble IP fragments */
4412ec49 718 if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) &&
1da177e4 719 !pp->dont_defrag)) {
776c729e 720 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
1da177e4 721 return NF_STOLEN;
eddc9ec5 722 iph = ip_hdr(skb);
1da177e4
LT
723 }
724
725 ihl = iph->ihl << 2;
726
727 /*
728 * Check if the packet belongs to an existing entry
729 */
730 cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
731
732 if (unlikely(!cp)) {
733 if (sysctl_ip_vs_nat_icmp_send &&
734 (pp->protocol == IPPROTO_TCP ||
735 pp->protocol == IPPROTO_UDP)) {
014d730d 736 __be16 _ports[2], *pptr;
1da177e4
LT
737
738 pptr = skb_header_pointer(skb, ihl,
739 sizeof(_ports), _ports);
740 if (pptr == NULL)
741 return NF_ACCEPT; /* Not for me */
742 if (ip_vs_lookup_real_service(iph->protocol,
743 iph->saddr, pptr[0])) {
744 /*
745 * Notify the real server: there is no
746 * existing entry if it is not RST
747 * packet or not TCP packet.
748 */
749 if (iph->protocol != IPPROTO_TCP
750 || !is_tcp_reset(skb)) {
751 icmp_send(skb,ICMP_DEST_UNREACH,
752 ICMP_PORT_UNREACH, 0);
753 return NF_DROP;
754 }
755 }
756 }
757 IP_VS_DBG_PKT(12, pp, skb, 0,
758 "packet continues traversal as normal");
759 return NF_ACCEPT;
760 }
761
762 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
763
af1e1cf0 764 if (!skb_make_writable(skb, ihl))
1da177e4
LT
765 goto drop;
766
767 /* mangle the packet */
768 if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
769 goto drop;
770 skb = *pskb;
eddc9ec5
ACM
771 ip_hdr(skb)->saddr = cp->vaddr;
772 ip_send_check(ip_hdr(skb));
1da177e4 773
e905a9ed
YH
774 /* For policy routing, packets originating from this
775 * machine itself may be routed differently to packets
776 * passing through. We want this packet to be routed as
777 * if it came from this machine itself. So re-compute
778 * the routing information.
779 */
780 if (ip_route_me_harder(pskb, RTN_LOCAL) != 0)
781 goto drop;
901eaf6c
SH
782 skb = *pskb;
783
1da177e4
LT
784 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
785
786 ip_vs_out_stats(cp, skb);
787 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
788 ip_vs_conn_put(cp);
789
6869c4d8 790 skb->ipvs_property = 1;
1da177e4
LT
791
792 LeaveFunction(11);
793 return NF_ACCEPT;
794
795 drop:
796 ip_vs_conn_put(cp);
797 kfree_skb(*pskb);
798 return NF_STOLEN;
799}
800
801
802/*
803 * Handle ICMP messages in the outside-to-inside direction (incoming).
804 * Find any that might be relevant, check against existing connections,
805 * forward to the right destination host if relevant.
806 * Currently handles error types - unreachable, quench, ttl exceeded.
807 */
e905a9ed 808static int
1da177e4
LT
809ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
810{
811 struct sk_buff *skb = *pskb;
812 struct iphdr *iph;
813 struct icmphdr _icmph, *ic;
814 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
815 struct ip_vs_conn *cp;
816 struct ip_vs_protocol *pp;
817 unsigned int offset, ihl, verdict;
818
819 *related = 1;
820
821 /* reassemble IP fragments */
eddc9ec5 822 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
776c729e
HX
823 if (ip_vs_gather_frags(skb, hooknum == NF_IP_LOCAL_IN ?
824 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1da177e4 825 return NF_STOLEN;
1da177e4
LT
826 }
827
eddc9ec5 828 iph = ip_hdr(skb);
1da177e4
LT
829 offset = ihl = iph->ihl * 4;
830 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
831 if (ic == NULL)
832 return NF_DROP;
833
834 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
835 ic->type, ntohs(icmp_id(ic)),
836 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
837
838 /*
839 * Work through seeing if this is for us.
840 * These checks are supposed to be in an order that means easy
841 * things are checked first to speed up processing.... however
842 * this means that some packets will manage to get a long way
843 * down this stack and then be rejected, but that's life.
844 */
845 if ((ic->type != ICMP_DEST_UNREACH) &&
846 (ic->type != ICMP_SOURCE_QUENCH) &&
847 (ic->type != ICMP_TIME_EXCEEDED)) {
848 *related = 0;
849 return NF_ACCEPT;
850 }
851
852 /* Now find the contained IP header */
853 offset += sizeof(_icmph);
854 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
855 if (cih == NULL)
856 return NF_ACCEPT; /* The packet looks wrong, ignore */
857
858 pp = ip_vs_proto_get(cih->protocol);
859 if (!pp)
860 return NF_ACCEPT;
861
862 /* Is the embedded protocol header present? */
4412ec49 863 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1da177e4
LT
864 pp->dont_defrag))
865 return NF_ACCEPT;
866
867 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
868
869 offset += cih->ihl * 4;
870
871 /* The embedded headers contain source and dest in reverse order */
872 cp = pp->conn_in_get(skb, pp, cih, offset, 1);
873 if (!cp)
874 return NF_ACCEPT;
875
876 verdict = NF_DROP;
877
878 /* Ensure the checksum is correct */
60476372 879 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1da177e4
LT
880 /* Failed checksum! */
881 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
882 NIPQUAD(iph->saddr));
883 goto out;
884 }
885
886 /* do the statistics and put it back */
887 ip_vs_in_stats(cp, skb);
888 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
889 offset += 2 * sizeof(__u16);
890 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
891 /* do not touch skb anymore */
892
893 out:
894 __ip_vs_conn_put(cp);
895
896 return verdict;
897}
898
899/*
900 * Check if it's for virtual services, look it up,
901 * and send it on its way...
902 */
903static unsigned int
904ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
905 const struct net_device *in, const struct net_device *out,
906 int (*okfn)(struct sk_buff *))
907{
908 struct sk_buff *skb = *pskb;
909 struct iphdr *iph;
910 struct ip_vs_protocol *pp;
911 struct ip_vs_conn *cp;
912 int ret, restart;
913 int ihl;
914
915 /*
916 * Big tappo: only PACKET_HOST (neither loopback nor mcasts)
917 * ... don't know why 1st test DOES NOT include 2nd (?)
918 */
919 if (unlikely(skb->pkt_type != PACKET_HOST
0cc217e1 920 || skb->dev->flags & IFF_LOOPBACK || skb->sk)) {
1da177e4
LT
921 IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
922 skb->pkt_type,
eddc9ec5
ACM
923 ip_hdr(skb)->protocol,
924 NIPQUAD(ip_hdr(skb)->daddr));
1da177e4
LT
925 return NF_ACCEPT;
926 }
927
eddc9ec5 928 iph = ip_hdr(skb);
1da177e4
LT
929 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
930 int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum);
931
932 if (related)
933 return verdict;
934 skb = *pskb;
eddc9ec5 935 iph = ip_hdr(skb);
1da177e4
LT
936 }
937
938 /* Protocol supported? */
939 pp = ip_vs_proto_get(iph->protocol);
940 if (unlikely(!pp))
941 return NF_ACCEPT;
942
943 ihl = iph->ihl << 2;
944
945 /*
946 * Check if the packet belongs to an existing connection entry
947 */
948 cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
949
950 if (unlikely(!cp)) {
951 int v;
952
953 if (!pp->conn_schedule(skb, pp, &v, &cp))
954 return v;
955 }
956
957 if (unlikely(!cp)) {
958 /* sorry, all this trouble for a no-hit :) */
959 IP_VS_DBG_PKT(12, pp, skb, 0,
960 "packet continues traversal as normal");
961 return NF_ACCEPT;
962 }
963
964 IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
965
966 /* Check the server status */
967 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
968 /* the destination server is not available */
969
970 if (sysctl_ip_vs_expire_nodest_conn) {
971 /* try to expire the connection immediately */
972 ip_vs_conn_expire_now(cp);
1da177e4 973 }
dc8103f2
JA
974 /* don't restart its timer, and silently
975 drop the packet. */
976 __ip_vs_conn_put(cp);
1da177e4
LT
977 return NF_DROP;
978 }
979
980 ip_vs_in_stats(cp, skb);
981 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
982 if (cp->packet_xmit)
983 ret = cp->packet_xmit(skb, cp, pp);
984 /* do not touch skb anymore */
985 else {
986 IP_VS_DBG_RL("warning: packet_xmit is null");
987 ret = NF_ACCEPT;
988 }
989
990 /* increase its packet counter and check if it is needed
991 to be synchronized */
992 atomic_inc(&cp->in_pkts);
993 if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
994 (cp->protocol != IPPROTO_TCP ||
995 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
996 (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
997 == sysctl_ip_vs_sync_threshold[0]))
998 ip_vs_sync_conn(cp);
999
1000 ip_vs_conn_put(cp);
1001 return ret;
1002}
1003
1004
1005/*
1006 * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
1007 * related packets destined for 0.0.0.0/0.
1008 * When fwmark-based virtual service is used, such as transparent
1009 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1010 * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1011 * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
1012 * and send them to ip_vs_in_icmp.
1013 */
1014static unsigned int
1015ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
1016 const struct net_device *in, const struct net_device *out,
1017 int (*okfn)(struct sk_buff *))
1018{
1019 int r;
1020
eddc9ec5 1021 if (ip_hdr(*pskb)->protocol != IPPROTO_ICMP)
1da177e4
LT
1022 return NF_ACCEPT;
1023
1024 return ip_vs_in_icmp(pskb, &r, hooknum);
1025}
1026
1027
1028/* After packet filtering, forward packet through VS/DR, VS/TUN,
1029 or VS/NAT(change destination), so that filtering rules can be
1030 applied to IPVS. */
1031static struct nf_hook_ops ip_vs_in_ops = {
1032 .hook = ip_vs_in,
1033 .owner = THIS_MODULE,
1034 .pf = PF_INET,
1035 .hooknum = NF_IP_LOCAL_IN,
1036 .priority = 100,
1037};
1038
1039/* After packet filtering, change source only for VS/NAT */
1040static struct nf_hook_ops ip_vs_out_ops = {
1041 .hook = ip_vs_out,
1042 .owner = THIS_MODULE,
1043 .pf = PF_INET,
1044 .hooknum = NF_IP_FORWARD,
1045 .priority = 100,
1046};
1047
1048/* After packet filtering (but before ip_vs_out_icmp), catch icmp
1049 destined for 0.0.0.0/0, which is for incoming IPVS connections */
1050static struct nf_hook_ops ip_vs_forward_icmp_ops = {
1051 .hook = ip_vs_forward_icmp,
1052 .owner = THIS_MODULE,
1053 .pf = PF_INET,
1054 .hooknum = NF_IP_FORWARD,
1055 .priority = 99,
1056};
1057
1058/* Before the netfilter connection tracking, exit from POST_ROUTING */
1059static struct nf_hook_ops ip_vs_post_routing_ops = {
1060 .hook = ip_vs_post_routing,
1061 .owner = THIS_MODULE,
1062 .pf = PF_INET,
1063 .hooknum = NF_IP_POST_ROUTING,
1064 .priority = NF_IP_PRI_NAT_SRC-1,
1065};
1066
1067
1068/*
1069 * Initialize IP Virtual Server
1070 */
1071static int __init ip_vs_init(void)
1072{
1073 int ret;
1074
1075 ret = ip_vs_control_init();
1076 if (ret < 0) {
1077 IP_VS_ERR("can't setup control.\n");
1078 goto cleanup_nothing;
1079 }
1080
1081 ip_vs_protocol_init();
1082
1083 ret = ip_vs_app_init();
1084 if (ret < 0) {
1085 IP_VS_ERR("can't setup application helper.\n");
1086 goto cleanup_protocol;
1087 }
1088
1089 ret = ip_vs_conn_init();
1090 if (ret < 0) {
1091 IP_VS_ERR("can't setup connection table.\n");
1092 goto cleanup_app;
1093 }
1094
1095 ret = nf_register_hook(&ip_vs_in_ops);
1096 if (ret < 0) {
1097 IP_VS_ERR("can't register in hook.\n");
1098 goto cleanup_conn;
1099 }
1100
1101 ret = nf_register_hook(&ip_vs_out_ops);
1102 if (ret < 0) {
1103 IP_VS_ERR("can't register out hook.\n");
1104 goto cleanup_inops;
1105 }
1106 ret = nf_register_hook(&ip_vs_post_routing_ops);
1107 if (ret < 0) {
1108 IP_VS_ERR("can't register post_routing hook.\n");
1109 goto cleanup_outops;
1110 }
1111 ret = nf_register_hook(&ip_vs_forward_icmp_ops);
1112 if (ret < 0) {
1113 IP_VS_ERR("can't register forward_icmp hook.\n");
1114 goto cleanup_postroutingops;
1115 }
1116
1117 IP_VS_INFO("ipvs loaded.\n");
1118 return ret;
1119
1120 cleanup_postroutingops:
1121 nf_unregister_hook(&ip_vs_post_routing_ops);
1122 cleanup_outops:
1123 nf_unregister_hook(&ip_vs_out_ops);
1124 cleanup_inops:
1125 nf_unregister_hook(&ip_vs_in_ops);
1126 cleanup_conn:
1127 ip_vs_conn_cleanup();
1128 cleanup_app:
1129 ip_vs_app_cleanup();
1130 cleanup_protocol:
1131 ip_vs_protocol_cleanup();
1132 ip_vs_control_cleanup();
1133 cleanup_nothing:
1134 return ret;
1135}
1136
1137static void __exit ip_vs_cleanup(void)
1138{
1139 nf_unregister_hook(&ip_vs_forward_icmp_ops);
1140 nf_unregister_hook(&ip_vs_post_routing_ops);
1141 nf_unregister_hook(&ip_vs_out_ops);
1142 nf_unregister_hook(&ip_vs_in_ops);
1143 ip_vs_conn_cleanup();
1144 ip_vs_app_cleanup();
1145 ip_vs_protocol_cleanup();
1146 ip_vs_control_cleanup();
1147 IP_VS_INFO("ipvs unloaded.\n");
1148}
1149
1150module_init(ip_vs_init);
1151module_exit(ip_vs_cleanup);
1152MODULE_LICENSE("GPL");