IPVS: one-packet scheduling
Source: net/netfilter/ipvs/ip_vs_core.c (GitHub: mt8127/android_kernel_alcatel_ttab.git)
1 /*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19 * and others.
20 *
21 * Changes:
22 * Paul `Rusty' Russell properly handle non-linear skbs
23 * Harald Welte don't use nfcache
24 *
25 */
26
27 #define KMSG_COMPONENT "IPVS"
28 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29
30 #include <linux/module.h>
31 #include <linux/kernel.h>
32 #include <linux/ip.h>
33 #include <linux/tcp.h>
34 #include <linux/sctp.h>
35 #include <linux/icmp.h>
36 #include <linux/slab.h>
37
38 #include <net/ip.h>
39 #include <net/tcp.h>
40 #include <net/udp.h>
41 #include <net/icmp.h> /* for icmp_send */
42 #include <net/route.h>
43
44 #include <linux/netfilter.h>
45 #include <linux/netfilter_ipv4.h>
46
47 #ifdef CONFIG_IP_VS_IPV6
48 #include <net/ipv6.h>
49 #include <linux/netfilter_ipv6.h>
50 #endif
51
52 #include <net/ip_vs.h>
53
54
55 EXPORT_SYMBOL(register_ip_vs_scheduler);
56 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
57 EXPORT_SYMBOL(ip_vs_skb_replace);
58 EXPORT_SYMBOL(ip_vs_proto_name);
59 EXPORT_SYMBOL(ip_vs_conn_new);
60 EXPORT_SYMBOL(ip_vs_conn_in_get);
61 EXPORT_SYMBOL(ip_vs_conn_out_get);
62 #ifdef CONFIG_IP_VS_PROTO_TCP
63 EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
64 #endif
65 EXPORT_SYMBOL(ip_vs_conn_put);
66 #ifdef CONFIG_IP_VS_DEBUG
67 EXPORT_SYMBOL(ip_vs_get_debug_level);
68 #endif
69
70
71 /* ID used in ICMP lookups */
72 #define icmp_id(icmph) (((icmph)->un).echo.id)
73 #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier)
74
75 const char *ip_vs_proto_name(unsigned proto)
76 {
77 static char buf[20];
78
79 switch (proto) {
80 case IPPROTO_IP:
81 return "IP";
82 case IPPROTO_UDP:
83 return "UDP";
84 case IPPROTO_TCP:
85 return "TCP";
86 case IPPROTO_SCTP:
87 return "SCTP";
88 case IPPROTO_ICMP:
89 return "ICMP";
90 #ifdef CONFIG_IP_VS_IPV6
91 case IPPROTO_ICMPV6:
92 return "ICMPv6";
93 #endif
94 default:
95 sprintf(buf, "IP_%d", proto);
96 return buf;
97 }
98 }
99
100 void ip_vs_init_hash_table(struct list_head *table, int rows)
101 {
102 while (--rows >= 0)
103 INIT_LIST_HEAD(&table[rows]);
104 }
105
106 static inline void
107 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
108 {
109 struct ip_vs_dest *dest = cp->dest;
110 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
111 spin_lock(&dest->stats.lock);
112 dest->stats.ustats.inpkts++;
113 dest->stats.ustats.inbytes += skb->len;
114 spin_unlock(&dest->stats.lock);
115
116 spin_lock(&dest->svc->stats.lock);
117 dest->svc->stats.ustats.inpkts++;
118 dest->svc->stats.ustats.inbytes += skb->len;
119 spin_unlock(&dest->svc->stats.lock);
120
121 spin_lock(&ip_vs_stats.lock);
122 ip_vs_stats.ustats.inpkts++;
123 ip_vs_stats.ustats.inbytes += skb->len;
124 spin_unlock(&ip_vs_stats.lock);
125 }
126 }
127
128
129 static inline void
130 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
131 {
132 struct ip_vs_dest *dest = cp->dest;
133 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
134 spin_lock(&dest->stats.lock);
135 dest->stats.ustats.outpkts++;
136 dest->stats.ustats.outbytes += skb->len;
137 spin_unlock(&dest->stats.lock);
138
139 spin_lock(&dest->svc->stats.lock);
140 dest->svc->stats.ustats.outpkts++;
141 dest->svc->stats.ustats.outbytes += skb->len;
142 spin_unlock(&dest->svc->stats.lock);
143
144 spin_lock(&ip_vs_stats.lock);
145 ip_vs_stats.ustats.outpkts++;
146 ip_vs_stats.ustats.outbytes += skb->len;
147 spin_unlock(&ip_vs_stats.lock);
148 }
149 }
150
151
152 static inline void
153 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
154 {
155 spin_lock(&cp->dest->stats.lock);
156 cp->dest->stats.ustats.conns++;
157 spin_unlock(&cp->dest->stats.lock);
158
159 spin_lock(&svc->stats.lock);
160 svc->stats.ustats.conns++;
161 spin_unlock(&svc->stats.lock);
162
163 spin_lock(&ip_vs_stats.lock);
164 ip_vs_stats.ustats.conns++;
165 spin_unlock(&ip_vs_stats.lock);
166 }
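/*
 * Editor's note (not part of the original file): the helpers above account
 * traffic at three levels - the chosen real server (dest), the virtual
 * service (svc) and the global ip_vs_stats - each under its own spinlock,
 * so a single packet or new connection updates all three counter sets
 * independently.
 */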
167
168
169 static inline int
170 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
171 const struct sk_buff *skb,
172 struct ip_vs_protocol *pp)
173 {
174 if (unlikely(!pp->state_transition))
175 return 0;
176 return pp->state_transition(cp, direction, skb, pp);
177 }
178
179
180 /*
181 * IPVS persistent scheduling function
182 * It creates a connection entry according to its template if one exists,
183 * or selects a server and creates a connection entry plus a template.
184 * Locking: we are svc user (svc->refcnt), so we hold all dests too
185 * Protocols supported: TCP, UDP
186 */
187 static struct ip_vs_conn *
188 ip_vs_sched_persist(struct ip_vs_service *svc,
189 const struct sk_buff *skb,
190 __be16 ports[2])
191 {
192 struct ip_vs_conn *cp = NULL;
193 struct ip_vs_iphdr iph;
194 struct ip_vs_dest *dest;
195 struct ip_vs_conn *ct;
196 __be16 dport; /* destination port to forward */
197 __be16 flags;
198 union nf_inet_addr snet; /* source network of the client,
199 after masking */
200
201 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
202
203 /* Mask saddr with the netmask to adjust template granularity */
204 #ifdef CONFIG_IP_VS_IPV6
205 if (svc->af == AF_INET6)
206 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
207 else
208 #endif
209 snet.ip = iph.saddr.ip & svc->netmask;
210
211 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
212 "mnet %s\n",
213 IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
214 IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
215 IP_VS_DBG_ADDR(svc->af, &snet));
216
217 /*
218 * As far as we know, FTP is a very complicated network protocol, and
219 * it uses a control connection and data connections. For active FTP,
220 * the FTP server initializes the data connection to the client, and its
221 * source port is often 20. For passive FTP, the FTP server tells the
222 * client the port that it passively listens to, and the client issues
223 * the data connection. In the tunneling or direct routing mode, the load
224 * balancer is on the client-to-server half of the connection, and the port
225 * number is unknown to the load balancer. So, a conn template like
226 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
227 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
228 * is created for other persistent services.
229 */
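/*
 * Editor's note (not part of the original file): read as connection tuples
 * <protocol, caddr, cport, vaddr, vport, daddr, dport>, the two template
 * forms described above look roughly like this for a hypothetical client
 * 10.0.0.7 (masked to 10.0.0.0) and VIP 192.168.0.1:
 *
 *   persistent FTP service (vport 21):
 *     <TCP, 10.0.0.0, 0, 192.168.0.1, 0,  rs_addr, 0>
 *   other persistent services (e.g. vport 80):
 *     <TCP, 10.0.0.0, 0, 192.168.0.1, 80, rs_addr, rs_port>
 *
 * The zeroed virtual and destination ports let the FTP template match both
 * the control and the data connection, whatever ports they use. Addresses
 * are illustrative only.
 */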
230 if (ports[1] == svc->port) {
231 /* Check if a template already exists */
232 if (svc->port != FTPPORT)
233 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
234 &iph.daddr, ports[1]);
235 else
236 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
237 &iph.daddr, 0);
238
239 if (!ct || !ip_vs_check_template(ct)) {
240 /*
241 * No template found or the dest of the connection
242 * template is not available.
243 */
244 dest = svc->scheduler->schedule(svc, skb);
245 if (dest == NULL) {
246 IP_VS_DBG(1, "p-schedule: no dest found.\n");
247 return NULL;
248 }
249
250 /*
251 * Create a template like <protocol,caddr,0,
252 * vaddr,vport,daddr,dport> for non-ftp service,
253 * and <protocol,caddr,0,vaddr,0,daddr,0>
254 * for ftp service.
255 */
256 if (svc->port != FTPPORT)
257 ct = ip_vs_conn_new(svc->af, iph.protocol,
258 &snet, 0,
259 &iph.daddr,
260 ports[1],
261 &dest->addr, dest->port,
262 IP_VS_CONN_F_TEMPLATE,
263 dest);
264 else
265 ct = ip_vs_conn_new(svc->af, iph.protocol,
266 &snet, 0,
267 &iph.daddr, 0,
268 &dest->addr, 0,
269 IP_VS_CONN_F_TEMPLATE,
270 dest);
271 if (ct == NULL)
272 return NULL;
273
274 ct->timeout = svc->timeout;
275 } else {
276 /* set destination with the found template */
277 dest = ct->dest;
278 }
279 dport = dest->port;
280 } else {
281 /*
282 * Note: persistent fwmark-based services and persistent
283 * port zero services are handled here.
284 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
285 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
286 */
287 if (svc->fwmark) {
288 union nf_inet_addr fwmark = {
289 .ip = htonl(svc->fwmark)
290 };
291
292 ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
293 &fwmark, 0);
294 } else
295 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
296 &iph.daddr, 0);
297
298 if (!ct || !ip_vs_check_template(ct)) {
299 /*
300 * If it is not persistent port zero, return NULL,
301 * otherwise create a connection template.
302 */
303 if (svc->port)
304 return NULL;
305
306 dest = svc->scheduler->schedule(svc, skb);
307 if (dest == NULL) {
308 IP_VS_DBG(1, "p-schedule: no dest found.\n");
309 return NULL;
310 }
311
312 /*
313 * Create a template according to the service
314 */
315 if (svc->fwmark) {
316 union nf_inet_addr fwmark = {
317 .ip = htonl(svc->fwmark)
318 };
319
320 ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
321 &snet, 0,
322 &fwmark, 0,
323 &dest->addr, 0,
324 IP_VS_CONN_F_TEMPLATE,
325 dest);
326 } else
327 ct = ip_vs_conn_new(svc->af, iph.protocol,
328 &snet, 0,
329 &iph.daddr, 0,
330 &dest->addr, 0,
331 IP_VS_CONN_F_TEMPLATE,
332 dest);
333 if (ct == NULL)
334 return NULL;
335
336 ct->timeout = svc->timeout;
337 } else {
338 /* set destination with the found template */
339 dest = ct->dest;
340 }
341 dport = ports[1];
342 }
343
344 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
345 && iph.protocol == IPPROTO_UDP)?
346 IP_VS_CONN_F_ONE_PACKET : 0;
347
348 /*
349 * Create a new connection according to the template
350 */
351 cp = ip_vs_conn_new(svc->af, iph.protocol,
352 &iph.saddr, ports[0],
353 &iph.daddr, ports[1],
354 &dest->addr, dport,
355 flags,
356 dest);
357 if (cp == NULL) {
358 ip_vs_conn_put(ct);
359 return NULL;
360 }
361
362 /*
363 * Add its control
364 */
365 ip_vs_control_add(cp, ct);
366 ip_vs_conn_put(ct);
367
368 ip_vs_conn_stats(cp, svc);
369 return cp;
370 }
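/*
 * Editor's note (not part of the original file): ip_vs_control_add() above
 * links the new connection (cp) to the template (ct) as its controlling
 * connection, and ip_vs_check_template() on later packets keeps reusing
 * that template while its real server remains available. That is what makes
 * the same masked client address land on the same real server for the whole
 * persistence timeout (ct->timeout = svc->timeout).
 */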
371
372
373 /*
374 * IPVS main scheduling function
375 * It selects a server according to the virtual service, and
376 * creates a connection entry.
377 * Protocols supported: TCP, UDP
378 */
379 struct ip_vs_conn *
380 ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
381 {
382 struct ip_vs_conn *cp = NULL;
383 struct ip_vs_iphdr iph;
384 struct ip_vs_dest *dest;
385 __be16 _ports[2], *pptr, flags;
386
387 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
388 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
389 if (pptr == NULL)
390 return NULL;
391
392 /*
393 * Persistent service
394 */
395 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
396 return ip_vs_sched_persist(svc, skb, pptr);
397
398 /*
399 * Non-persistent service
400 */
401 if (!svc->fwmark && pptr[1] != svc->port) {
402 if (!svc->port)
403 pr_err("Schedule: port zero only supported "
404 "in persistent services, "
405 "check your ipvs configuration\n");
406 return NULL;
407 }
408
409 dest = svc->scheduler->schedule(svc, skb);
410 if (dest == NULL) {
411 IP_VS_DBG(1, "Schedule: no dest found.\n");
412 return NULL;
413 }
414
415 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
416 && iph.protocol == IPPROTO_UDP)?
417 IP_VS_CONN_F_ONE_PACKET : 0;
418
419 /*
420 * Create a connection entry.
421 */
422 cp = ip_vs_conn_new(svc->af, iph.protocol,
423 &iph.saddr, pptr[0],
424 &iph.daddr, pptr[1],
425 &dest->addr, dest->port ? dest->port : pptr[1],
426 flags,
427 dest);
428 if (cp == NULL)
429 return NULL;
430
431 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
432 "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
433 ip_vs_fwd_tag(cp),
434 IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
435 IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
436 IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
437 cp->flags, atomic_read(&cp->refcnt));
438
439 ip_vs_conn_stats(cp, svc);
440 return cp;
441 }
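/*
 * Editor's note (not part of the original file): the IP_VS_SVC_F_ONEPACKET
 * test above is where one-packet scheduling takes effect. The flag is
 * normally set from user space when a UDP virtual service is created with
 * one-packet scheduling enabled, e.g. (assuming an ipvsadm build that
 * supports the -o/--ops option; addresses are illustrative):
 *
 *   ipvsadm -A -u 192.168.0.1:53 -s rr --ops
 *   ipvsadm -a -u 192.168.0.1:53 -r 10.0.0.11:53 -m
 *
 * With IP_VS_CONN_F_ONE_PACKET set on the connection, each UDP datagram is
 * scheduled independently instead of sticking to the first chosen real
 * server.
 */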
442
443
444 /*
445 * Pass or drop the packet.
446 * Called by ip_vs_in, when the virtual service is available but
447 * no destination is available for a new connection.
448 */
449 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
450 struct ip_vs_protocol *pp)
451 {
452 __be16 _ports[2], *pptr;
453 struct ip_vs_iphdr iph;
454 int unicast;
455 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
456
457 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
458 if (pptr == NULL) {
459 ip_vs_service_put(svc);
460 return NF_DROP;
461 }
462
463 #ifdef CONFIG_IP_VS_IPV6
464 if (svc->af == AF_INET6)
465 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
466 else
467 #endif
468 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
469
470 /* if it is a fwmark-based service, the cache_bypass sysctl is enabled
471 and the destination is a non-local unicast address, then create
472 a cache_bypass connection entry */
473 if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
474 int ret, cs;
475 struct ip_vs_conn *cp;
476 __u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
477 iph.protocol == IPPROTO_UDP)?
478 IP_VS_CONN_F_ONE_PACKET : 0;
479 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
480
481 ip_vs_service_put(svc);
482
483 /* create a new connection entry */
484 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
485 cp = ip_vs_conn_new(svc->af, iph.protocol,
486 &iph.saddr, pptr[0],
487 &iph.daddr, pptr[1],
488 &daddr, 0,
489 IP_VS_CONN_F_BYPASS | flags,
490 NULL);
491 if (cp == NULL)
492 return NF_DROP;
493
494 /* statistics */
495 ip_vs_in_stats(cp, skb);
496
497 /* set state */
498 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
499
500 /* transmit the first SYN packet */
501 ret = cp->packet_xmit(skb, cp, pp);
502 /* do not touch skb anymore */
503
504 atomic_inc(&cp->in_pkts);
505 ip_vs_conn_put(cp);
506 return ret;
507 }
508
509 /*
510 * When a virtual ftp service is present, packets destined
511 * for other services on the VIP may get here (except services
512 * listed in the ipvs table); pass those packets on, because it is
513 * not ipvs' job to decide to drop them.
514 */
515 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
516 ip_vs_service_put(svc);
517 return NF_ACCEPT;
518 }
519
520 ip_vs_service_put(svc);
521
522 /*
523 * Notify the client that the destination is unreachable, and
524 * release the socket buffer.
525 * Since this is at the IP layer, no TCP socket has actually been
526 * created and a TCP RST cannot be sent; instead,
527 * ICMP_PORT_UNREACH is sent here whether the packet is TCP or UDP. --WZ
528 */
529 #ifdef CONFIG_IP_VS_IPV6
530 if (svc->af == AF_INET6)
531 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
532 else
533 #endif
534 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
535
536 return NF_DROP;
537 }
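/*
 * Editor's note (not part of the original file): the bypass branch above is
 * gated by sysctl_ip_vs_cache_bypass, normally exposed as
 * net.ipv4.vs.cache_bypass; enabling it, e.g.
 *
 *   sysctl -w net.ipv4.vs.cache_bypass=1
 *
 * lets packets for a fwmark service be forwarded to their original
 * (non-local unicast) destination when no real server is available, instead
 * of being answered with ICMP port unreachable.
 */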
538
539
540 /*
541 * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
542 * chain, and is used for VS/NAT.
543 * It detects packets for VS/NAT connections and sends them on
544 * immediately. This prevents iptable_nat from mangling packets
545 * that belong to VS/NAT connections.
546 */
547 static unsigned int ip_vs_post_routing(unsigned int hooknum,
548 struct sk_buff *skb,
549 const struct net_device *in,
550 const struct net_device *out,
551 int (*okfn)(struct sk_buff *))
552 {
553 if (!skb->ipvs_property)
554 return NF_ACCEPT;
555 /* The packet was sent from IPVS, exit this chain */
556 return NF_STOP;
557 }
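/*
 * Editor's note (not part of the original file): skb->ipvs_property is set
 * once IPVS has taken ownership of a packet (see handle_response() and
 * handle_response_icmp()); returning NF_STOP here ends POST_ROUTING
 * traversal early, so source NAT (registered at NF_IP_PRI_NAT_SRC) never
 * gets a chance to mangle IPVS-owned packets.
 */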
558
559 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
560 {
561 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
562 }
563
564 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
565 {
566 int err = ip_defrag(skb, user);
567
568 if (!err)
569 ip_send_check(ip_hdr(skb));
570
571 return err;
572 }
573
574 #ifdef CONFIG_IP_VS_IPV6
575 static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
576 {
577 /* TODO IPv6: Find out what to do here for IPv6 */
578 return 0;
579 }
580 #endif
581
582 /*
583 * Packet has been made sufficiently writable in caller
584 * - inout: 1=in->out, 0=out->in
585 */
586 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
587 struct ip_vs_conn *cp, int inout)
588 {
589 struct iphdr *iph = ip_hdr(skb);
590 unsigned int icmp_offset = iph->ihl*4;
591 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
592 icmp_offset);
593 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
594
595 if (inout) {
596 iph->saddr = cp->vaddr.ip;
597 ip_send_check(iph);
598 ciph->daddr = cp->vaddr.ip;
599 ip_send_check(ciph);
600 } else {
601 iph->daddr = cp->daddr.ip;
602 ip_send_check(iph);
603 ciph->saddr = cp->daddr.ip;
604 ip_send_check(ciph);
605 }
606
607 /* the TCP/UDP/SCTP port */
608 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
609 IPPROTO_SCTP == ciph->protocol) {
610 __be16 *ports = (void *)ciph + ciph->ihl*4;
611
612 if (inout)
613 ports[1] = cp->vport;
614 else
615 ports[0] = cp->dport;
616 }
617
618 /* And finally the ICMP checksum */
619 icmph->checksum = 0;
620 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
621 skb->ip_summed = CHECKSUM_UNNECESSARY;
622
623 if (inout)
624 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
625 "Forwarding altered outgoing ICMP");
626 else
627 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
628 "Forwarding altered incoming ICMP");
629 }
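/*
 * Editor's note (not part of the original file): the packet edited above has
 * the layout
 *
 *   [ outer IP | ICMP error | embedded IP (ciph) | first 8 bytes of the
 *     original TCP/UDP/SCTP header, i.e. ports[0] and ports[1] ]
 *
 * For in->out the outer source and the embedded destination become the
 * virtual address; for out->in the outer destination and the embedded source
 * become the real server address. The ICMP checksum is then recomputed over
 * the whole ICMP message starting at icmp_offset.
 */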
630
631 #ifdef CONFIG_IP_VS_IPV6
632 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
633 struct ip_vs_conn *cp, int inout)
634 {
635 struct ipv6hdr *iph = ipv6_hdr(skb);
636 unsigned int icmp_offset = sizeof(struct ipv6hdr);
637 struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) +
638 icmp_offset);
639 struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1);
640
641 if (inout) {
642 iph->saddr = cp->vaddr.in6;
643 ciph->daddr = cp->vaddr.in6;
644 } else {
645 iph->daddr = cp->daddr.in6;
646 ciph->saddr = cp->daddr.in6;
647 }
648
649 /* the TCP/UDP/SCTP port */
650 if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr ||
651 IPPROTO_SCTP == ciph->nexthdr) {
652 __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
653
654 if (inout)
655 ports[1] = cp->vport;
656 else
657 ports[0] = cp->dport;
658 }
659
660 /* And finally the ICMP checksum */
661 icmph->icmp6_cksum = 0;
662 /* TODO IPv6: is this correct for ICMPv6? */
663 ip_vs_checksum_complete(skb, icmp_offset);
664 skb->ip_summed = CHECKSUM_UNNECESSARY;
665
666 if (inout)
667 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
668 "Forwarding altered outgoing ICMPv6");
669 else
670 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
671 "Forwarding altered incoming ICMPv6");
672 }
673 #endif
674
675 /* Handle relevant response ICMP messages - forward to the right
676 * destination host. Used for NAT and local client.
677 */
678 static int handle_response_icmp(int af, struct sk_buff *skb,
679 union nf_inet_addr *snet,
680 __u8 protocol, struct ip_vs_conn *cp,
681 struct ip_vs_protocol *pp,
682 unsigned int offset, unsigned int ihl)
683 {
684 unsigned int verdict = NF_DROP;
685
686 if (IP_VS_FWD_METHOD(cp) != 0) {
687 pr_err("shouldn't reach here, because the box is on the "
688 "half connection in the tun/dr module.\n");
689 }
690
691 /* Ensure the checksum is correct */
692 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
693 /* Failed checksum! */
694 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
695 IP_VS_DBG_ADDR(af, snet));
696 goto out;
697 }
698
699 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
700 IPPROTO_SCTP == protocol)
701 offset += 2 * sizeof(__u16);
702 if (!skb_make_writable(skb, offset))
703 goto out;
704
705 #ifdef CONFIG_IP_VS_IPV6
706 if (af == AF_INET6)
707 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
708 else
709 #endif
710 ip_vs_nat_icmp(skb, pp, cp, 1);
711
712 /* do the statistics and put it back */
713 ip_vs_out_stats(cp, skb);
714
715 skb->ipvs_property = 1;
716 verdict = NF_ACCEPT;
717
718 out:
719 __ip_vs_conn_put(cp);
720
721 return verdict;
722 }
723
724 /*
725 * Handle ICMP messages in the inside-to-outside direction (outgoing).
726 * Find any that might be relevant, check against existing connections.
727 * Currently handles error types - unreachable, quench, ttl exceeded.
728 */
729 static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
730 {
731 struct iphdr *iph;
732 struct icmphdr _icmph, *ic;
733 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
734 struct ip_vs_iphdr ciph;
735 struct ip_vs_conn *cp;
736 struct ip_vs_protocol *pp;
737 unsigned int offset, ihl;
738 union nf_inet_addr snet;
739
740 *related = 1;
741
742 /* reassemble IP fragments */
743 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
744 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
745 return NF_STOLEN;
746 }
747
748 iph = ip_hdr(skb);
749 offset = ihl = iph->ihl * 4;
750 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
751 if (ic == NULL)
752 return NF_DROP;
753
754 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
755 ic->type, ntohs(icmp_id(ic)),
756 &iph->saddr, &iph->daddr);
757
758 /*
759 * Work through seeing if this is for us.
760 * These checks are supposed to be in an order that means easy
761 * things are checked first to speed up processing.... however
762 * this means that some packets will manage to get a long way
763 * down this stack and then be rejected, but that's life.
764 */
765 if ((ic->type != ICMP_DEST_UNREACH) &&
766 (ic->type != ICMP_SOURCE_QUENCH) &&
767 (ic->type != ICMP_TIME_EXCEEDED)) {
768 *related = 0;
769 return NF_ACCEPT;
770 }
771
772 /* Now find the contained IP header */
773 offset += sizeof(_icmph);
774 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
775 if (cih == NULL)
776 return NF_ACCEPT; /* The packet looks wrong, ignore */
777
778 pp = ip_vs_proto_get(cih->protocol);
779 if (!pp)
780 return NF_ACCEPT;
781
782 /* Is the embedded protocol header present? */
783 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
784 pp->dont_defrag))
785 return NF_ACCEPT;
786
787 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
788
789 offset += cih->ihl * 4;
790
791 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
792 /* The embedded headers contain source and dest in reverse order */
793 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
794 if (!cp)
795 return NF_ACCEPT;
796
797 snet.ip = iph->saddr;
798 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
799 pp, offset, ihl);
800 }
801
802 #ifdef CONFIG_IP_VS_IPV6
803 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
804 {
805 struct ipv6hdr *iph;
806 struct icmp6hdr _icmph, *ic;
807 struct ipv6hdr _ciph, *cih; /* The ip header contained
808 within the ICMP */
809 struct ip_vs_iphdr ciph;
810 struct ip_vs_conn *cp;
811 struct ip_vs_protocol *pp;
812 unsigned int offset;
813 union nf_inet_addr snet;
814
815 *related = 1;
816
817 /* reassemble IP fragments */
818 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
819 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
820 return NF_STOLEN;
821 }
822
823 iph = ipv6_hdr(skb);
824 offset = sizeof(struct ipv6hdr);
825 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
826 if (ic == NULL)
827 return NF_DROP;
828
829 IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
830 ic->icmp6_type, ntohs(icmpv6_id(ic)),
831 &iph->saddr, &iph->daddr);
832
833 /*
834 * Work through seeing if this is for us.
835 * These checks are supposed to be in an order that means easy
836 * things are checked first to speed up processing.... however
837 * this means that some packets will manage to get a long way
838 * down this stack and then be rejected, but that's life.
839 */
840 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
841 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
842 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
843 *related = 0;
844 return NF_ACCEPT;
845 }
846
847 /* Now find the contained IP header */
848 offset += sizeof(_icmph);
849 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
850 if (cih == NULL)
851 return NF_ACCEPT; /* The packet looks wrong, ignore */
852
853 pp = ip_vs_proto_get(cih->nexthdr);
854 if (!pp)
855 return NF_ACCEPT;
856
857 /* Is the embedded protocol header present? */
858 /* TODO: we don't support fragmentation at the moment anyways */
859 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
860 return NF_ACCEPT;
861
862 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
863
864 offset += sizeof(struct ipv6hdr);
865
866 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
867 /* The embedded headers contain source and dest in reverse order */
868 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
869 if (!cp)
870 return NF_ACCEPT;
871
872 ipv6_addr_copy(&snet.in6, &iph->saddr);
873 return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
874 pp, offset, sizeof(struct ipv6hdr));
875 }
876 #endif
877
878 /*
879 * Check if the SCTP chunk is an ABORT chunk
880 */
881 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
882 {
883 sctp_chunkhdr_t *sch, schunk;
884 sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
885 sizeof(schunk), &schunk);
886 if (sch == NULL)
887 return 0;
888 if (sch->type == SCTP_CID_ABORT)
889 return 1;
890 return 0;
891 }
892
893 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
894 {
895 struct tcphdr _tcph, *th;
896
897 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
898 if (th == NULL)
899 return 0;
900 return th->rst;
901 }
902
903 /* Handle response packets: rewrite addresses and send away...
904 * Used for NAT and local client.
905 */
906 static unsigned int
907 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
908 struct ip_vs_conn *cp, int ihl)
909 {
910 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
911
912 if (!skb_make_writable(skb, ihl))
913 goto drop;
914
915 /* mangle the packet */
916 if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
917 goto drop;
918
919 #ifdef CONFIG_IP_VS_IPV6
920 if (af == AF_INET6)
921 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
922 else
923 #endif
924 {
925 ip_hdr(skb)->saddr = cp->vaddr.ip;
926 ip_send_check(ip_hdr(skb));
927 }
928
929 /* For policy routing, packets originating from this
930 * machine itself may be routed differently to packets
931 * passing through. We want this packet to be routed as
932 * if it came from this machine itself. So re-compute
933 * the routing information.
934 */
935 #ifdef CONFIG_IP_VS_IPV6
936 if (af == AF_INET6) {
937 if (ip6_route_me_harder(skb) != 0)
938 goto drop;
939 } else
940 #endif
941 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
942 goto drop;
943
944 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
945
946 ip_vs_out_stats(cp, skb);
947 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
948 ip_vs_conn_put(cp);
949
950 skb->ipvs_property = 1;
951
952 LeaveFunction(11);
953 return NF_ACCEPT;
954
955 drop:
956 ip_vs_conn_put(cp);
957 kfree_skb(skb);
958 return NF_STOLEN;
959 }
960
961 /*
962 * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
963 * Check if outgoing packet belongs to the established ip_vs_conn.
964 */
965 static unsigned int
966 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
967 const struct net_device *in, const struct net_device *out,
968 int (*okfn)(struct sk_buff *))
969 {
970 struct ip_vs_iphdr iph;
971 struct ip_vs_protocol *pp;
972 struct ip_vs_conn *cp;
973 int af;
974
975 EnterFunction(11);
976
977 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
978
979 if (skb->ipvs_property)
980 return NF_ACCEPT;
981
982 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
983 #ifdef CONFIG_IP_VS_IPV6
984 if (af == AF_INET6) {
985 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
986 int related, verdict = ip_vs_out_icmp_v6(skb, &related);
987
988 if (related)
989 return verdict;
990 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
991 }
992 } else
993 #endif
994 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
995 int related, verdict = ip_vs_out_icmp(skb, &related);
996
997 if (related)
998 return verdict;
999 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1000 }
1001
1002 pp = ip_vs_proto_get(iph.protocol);
1003 if (unlikely(!pp))
1004 return NF_ACCEPT;
1005
1006 /* reassemble IP fragments */
1007 #ifdef CONFIG_IP_VS_IPV6
1008 if (af == AF_INET6) {
1009 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1010 int related, verdict = ip_vs_out_icmp_v6(skb, &related);
1011
1012 if (related)
1013 return verdict;
1014
1015 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1016 }
1017 } else
1018 #endif
1019 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
1020 !pp->dont_defrag)) {
1021 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
1022 return NF_STOLEN;
1023
1024 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1025 }
1026
1027 /*
1028 * Check if the packet belongs to an existing entry
1029 */
1030 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1031
1032 if (unlikely(!cp)) {
1033 if (sysctl_ip_vs_nat_icmp_send &&
1034 (pp->protocol == IPPROTO_TCP ||
1035 pp->protocol == IPPROTO_UDP ||
1036 pp->protocol == IPPROTO_SCTP)) {
1037 __be16 _ports[2], *pptr;
1038
1039 pptr = skb_header_pointer(skb, iph.len,
1040 sizeof(_ports), _ports);
1041 if (pptr == NULL)
1042 return NF_ACCEPT; /* Not for me */
1043 if (ip_vs_lookup_real_service(af, iph.protocol,
1044 &iph.saddr,
1045 pptr[0])) {
1046 /*
1047 * Notify the real server that there is
1048 * no existing entry, unless this is a
1049 * TCP RST or an SCTP ABORT packet.
1050 */
1051 if ((iph.protocol != IPPROTO_TCP &&
1052 iph.protocol != IPPROTO_SCTP)
1053 || ((iph.protocol == IPPROTO_TCP
1054 && !is_tcp_reset(skb, iph.len))
1055 || (iph.protocol == IPPROTO_SCTP
1056 && !is_sctp_abort(skb,
1057 iph.len)))) {
1058 #ifdef CONFIG_IP_VS_IPV6
1059 if (af == AF_INET6)
1060 icmpv6_send(skb,
1061 ICMPV6_DEST_UNREACH,
1062 ICMPV6_PORT_UNREACH,
1063 0);
1064 else
1065 #endif
1066 icmp_send(skb,
1067 ICMP_DEST_UNREACH,
1068 ICMP_PORT_UNREACH, 0);
1069 return NF_DROP;
1070 }
1071 }
1072 }
1073 IP_VS_DBG_PKT(12, pp, skb, 0,
1074 "packet continues traversal as normal");
1075 return NF_ACCEPT;
1076 }
1077
1078 return handle_response(af, skb, pp, cp, iph.len);
1079 }
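/*
 * Editor's note (not part of the original file): the ICMP port-unreachable
 * replies above are gated by sysctl_ip_vs_nat_icmp_send, typically exposed
 * as net.ipv4.vs.nat_icmp_send; when enabled, a real server that keeps
 * sending traffic for which no IPVS connection exists gets told so via
 * ICMP, unless the packet is already a TCP RST or SCTP ABORT.
 */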
1080
1081
1082 /*
1083 * Handle ICMP messages in the outside-to-inside direction (incoming).
1084 * Find any that might be relevant, check against existing connections,
1085 * forward to the right destination host if relevant.
1086 * Currently handles error types - unreachable, quench, ttl exceeded.
1087 */
1088 static int
1089 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1090 {
1091 struct iphdr *iph;
1092 struct icmphdr _icmph, *ic;
1093 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
1094 struct ip_vs_iphdr ciph;
1095 struct ip_vs_conn *cp;
1096 struct ip_vs_protocol *pp;
1097 unsigned int offset, ihl, verdict;
1098 union nf_inet_addr snet;
1099
1100 *related = 1;
1101
1102 /* reassemble IP fragments */
1103 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1104 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
1105 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1106 return NF_STOLEN;
1107 }
1108
1109 iph = ip_hdr(skb);
1110 offset = ihl = iph->ihl * 4;
1111 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1112 if (ic == NULL)
1113 return NF_DROP;
1114
1115 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1116 ic->type, ntohs(icmp_id(ic)),
1117 &iph->saddr, &iph->daddr);
1118
1119 /*
1120 * Work through seeing if this is for us.
1121 * These checks are supposed to be in an order that means easy
1122 * things are checked first to speed up processing.... however
1123 * this means that some packets will manage to get a long way
1124 * down this stack and then be rejected, but that's life.
1125 */
1126 if ((ic->type != ICMP_DEST_UNREACH) &&
1127 (ic->type != ICMP_SOURCE_QUENCH) &&
1128 (ic->type != ICMP_TIME_EXCEEDED)) {
1129 *related = 0;
1130 return NF_ACCEPT;
1131 }
1132
1133 /* Now find the contained IP header */
1134 offset += sizeof(_icmph);
1135 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1136 if (cih == NULL)
1137 return NF_ACCEPT; /* The packet looks wrong, ignore */
1138
1139 pp = ip_vs_proto_get(cih->protocol);
1140 if (!pp)
1141 return NF_ACCEPT;
1142
1143 /* Is the embedded protocol header present? */
1144 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1145 pp->dont_defrag))
1146 return NF_ACCEPT;
1147
1148 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1149
1150 offset += cih->ihl * 4;
1151
1152 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1153 /* The embedded headers contain source and dest in reverse order */
1154 cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1155 if (!cp) {
1156 /* The packet could also belong to a local client */
1157 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1158 if (cp) {
1159 snet.ip = iph->saddr;
1160 return handle_response_icmp(AF_INET, skb, &snet,
1161 cih->protocol, cp, pp,
1162 offset, ihl);
1163 }
1164 return NF_ACCEPT;
1165 }
1166
1167 verdict = NF_DROP;
1168
1169 /* Ensure the checksum is correct */
1170 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1171 /* Failed checksum! */
1172 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1173 &iph->saddr);
1174 goto out;
1175 }
1176
1177 /* do the statistics and put it back */
1178 ip_vs_in_stats(cp, skb);
1179 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1180 offset += 2 * sizeof(__u16);
1181 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1182 /* do not touch skb anymore */
1183
1184 out:
1185 __ip_vs_conn_put(cp);
1186
1187 return verdict;
1188 }
1189
1190 #ifdef CONFIG_IP_VS_IPV6
1191 static int
1192 ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1193 {
1194 struct ipv6hdr *iph;
1195 struct icmp6hdr _icmph, *ic;
1196 struct ipv6hdr _ciph, *cih; /* The ip header contained
1197 within the ICMP */
1198 struct ip_vs_iphdr ciph;
1199 struct ip_vs_conn *cp;
1200 struct ip_vs_protocol *pp;
1201 unsigned int offset, verdict;
1202 union nf_inet_addr snet;
1203
1204 *related = 1;
1205
1206 /* reassemble IP fragments */
1207 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1208 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
1209 IP_DEFRAG_VS_IN :
1210 IP_DEFRAG_VS_FWD))
1211 return NF_STOLEN;
1212 }
1213
1214 iph = ipv6_hdr(skb);
1215 offset = sizeof(struct ipv6hdr);
1216 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1217 if (ic == NULL)
1218 return NF_DROP;
1219
1220 IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
1221 ic->icmp6_type, ntohs(icmpv6_id(ic)),
1222 &iph->saddr, &iph->daddr);
1223
1224 /*
1225 * Work through seeing if this is for us.
1226 * These checks are supposed to be in an order that means easy
1227 * things are checked first to speed up processing.... however
1228 * this means that some packets will manage to get a long way
1229 * down this stack and then be rejected, but that's life.
1230 */
1231 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1232 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1233 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1234 *related = 0;
1235 return NF_ACCEPT;
1236 }
1237
1238 /* Now find the contained IP header */
1239 offset += sizeof(_icmph);
1240 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1241 if (cih == NULL)
1242 return NF_ACCEPT; /* The packet looks wrong, ignore */
1243
1244 pp = ip_vs_proto_get(cih->nexthdr);
1245 if (!pp)
1246 return NF_ACCEPT;
1247
1248 /* Is the embedded protocol header present? */
1249 /* TODO: we don't support fragmentation at the moment anyways */
1250 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1251 return NF_ACCEPT;
1252
1253 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
1254
1255 offset += sizeof(struct ipv6hdr);
1256
1257 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1258 /* The embedded headers contain source and dest in reverse order */
1259 cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
1260 if (!cp) {
1261 /* The packet could also belong to a local client */
1262 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1263 if (cp) {
1264 ipv6_addr_copy(&snet.in6, &iph->saddr);
1265 return handle_response_icmp(AF_INET6, skb, &snet,
1266 cih->nexthdr,
1267 cp, pp, offset,
1268 sizeof(struct ipv6hdr));
1269 }
1270 return NF_ACCEPT;
1271 }
1272
1273 verdict = NF_DROP;
1274
1275 /* do the statistics and put it back */
1276 ip_vs_in_stats(cp, skb);
1277 if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
1278 IPPROTO_SCTP == cih->nexthdr)
1279 offset += 2 * sizeof(__u16);
1280 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1281 /* do not touch skb anymore */
1282
1283 __ip_vs_conn_put(cp);
1284
1285 return verdict;
1286 }
1287 #endif
1288
1289
1290 /*
1291 * Check if it's for virtual services, look it up,
1292 * and send it on its way...
1293 */
1294 static unsigned int
1295 ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1296 const struct net_device *in, const struct net_device *out,
1297 int (*okfn)(struct sk_buff *))
1298 {
1299 struct ip_vs_iphdr iph;
1300 struct ip_vs_protocol *pp;
1301 struct ip_vs_conn *cp;
1302 int ret, restart, af, pkts;
1303
1304 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1305
1306 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1307
1308 /*
1309 * Big tappo: only PACKET_HOST, including loopback for local client
1310 * Don't handle local packets on IPv6 for now
1311 */
1312 if (unlikely(skb->pkt_type != PACKET_HOST)) {
1313 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1314 skb->pkt_type,
1315 iph.protocol,
1316 IP_VS_DBG_ADDR(af, &iph.daddr));
1317 return NF_ACCEPT;
1318 }
1319
1320 #ifdef CONFIG_IP_VS_IPV6
1321 if (af == AF_INET6) {
1322 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1323 int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1324
1325 if (related)
1326 return verdict;
1327 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1328 }
1329 } else
1330 #endif
1331 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1332 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
1333
1334 if (related)
1335 return verdict;
1336 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1337 }
1338
1339 /* Protocol supported? */
1340 pp = ip_vs_proto_get(iph.protocol);
1341 if (unlikely(!pp))
1342 return NF_ACCEPT;
1343
1344 /*
1345 * Check if the packet belongs to an existing connection entry
1346 */
1347 cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1348
1349 if (unlikely(!cp)) {
1350 int v;
1351
1352 /* For local client packets, it could be a response */
1353 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1354 if (cp)
1355 return handle_response(af, skb, pp, cp, iph.len);
1356
1357 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1358 return v;
1359 }
1360
1361 if (unlikely(!cp)) {
1362 /* sorry, all this trouble for a no-hit :) */
1363 IP_VS_DBG_PKT(12, pp, skb, 0,
1364 "packet continues traversal as normal");
1365 return NF_ACCEPT;
1366 }
1367
1368 IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1369
1370 /* Check the server status */
1371 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1372 /* the destination server is not available */
1373
1374 if (sysctl_ip_vs_expire_nodest_conn) {
1375 /* try to expire the connection immediately */
1376 ip_vs_conn_expire_now(cp);
1377 }
1378 /* don't restart its timer, and silently
1379 drop the packet. */
1380 __ip_vs_conn_put(cp);
1381 return NF_DROP;
1382 }
1383
1384 ip_vs_in_stats(cp, skb);
1385 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1386 if (cp->packet_xmit)
1387 ret = cp->packet_xmit(skb, cp, pp);
1388 /* do not touch skb anymore */
1389 else {
1390 IP_VS_DBG_RL("warning: packet_xmit is null");
1391 ret = NF_ACCEPT;
1392 }
1393
1394 /* Increase its packet counter and check whether it needs
1395 * to be synchronized
1396 *
1397 * Sync the connection if it is about to close, to
1398 * encourage the standby servers to update the connection's timeout
1399 */
1400 pkts = atomic_add_return(1, &cp->in_pkts);
1401 if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1402 cp->protocol == IPPROTO_SCTP) {
1403 if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
1404 (atomic_read(&cp->in_pkts) %
1405 sysctl_ip_vs_sync_threshold[1]
1406 == sysctl_ip_vs_sync_threshold[0])) ||
1407 (cp->old_state != cp->state &&
1408 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
1409 (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
1410 (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
1411 ip_vs_sync_conn(cp);
1412 goto out;
1413 }
1414 }
1415
1416 if (af == AF_INET &&
1417 (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1418 (((cp->protocol != IPPROTO_TCP ||
1419 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1420 (pkts % sysctl_ip_vs_sync_threshold[1]
1421 == sysctl_ip_vs_sync_threshold[0])) ||
1422 ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1423 ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1424 (cp->state == IP_VS_TCP_S_CLOSE) ||
1425 (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1426 (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1427 ip_vs_sync_conn(cp);
1428 out:
1429 cp->old_state = cp->state;
1430
1431 ip_vs_conn_put(cp);
1432 return ret;
1433 }
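/*
 * Editor's note (not part of the original file): the two
 * sysctl_ip_vs_sync_threshold[] values used above are normally set through
 * the net.ipv4.vs.sync_threshold sysctl, which takes a pair
 * "threshold period" (default "3 50"); a connection is synced to the backup
 * when its packet count modulo the period equals the threshold, and
 * additionally on the closing state transitions listed in the code.
 */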
1434
1435
1436 /*
1437 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1438 * related packets destined for 0.0.0.0/0.
1439 * When a fwmark-based virtual service is used, such as a transparent
1440 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1441 * but ICMP destined for 0.0.0.0/0 cannot be easily marked and
1442 * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1443 * and send them to ip_vs_in_icmp.
1444 */
1445 static unsigned int
1446 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1447 const struct net_device *in, const struct net_device *out,
1448 int (*okfn)(struct sk_buff *))
1449 {
1450 int r;
1451
1452 if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1453 return NF_ACCEPT;
1454
1455 return ip_vs_in_icmp(skb, &r, hooknum);
1456 }
1457
1458 #ifdef CONFIG_IP_VS_IPV6
1459 static unsigned int
1460 ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1461 const struct net_device *in, const struct net_device *out,
1462 int (*okfn)(struct sk_buff *))
1463 {
1464 int r;
1465
1466 if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1467 return NF_ACCEPT;
1468
1469 return ip_vs_in_icmp_v6(skb, &r, hooknum);
1470 }
1471 #endif
1472
1473
1474 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1475 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1476 * or VS/NAT(change destination), so that filtering rules can be
1477 * applied to IPVS. */
1478 {
1479 .hook = ip_vs_in,
1480 .owner = THIS_MODULE,
1481 .pf = PF_INET,
1482 .hooknum = NF_INET_LOCAL_IN,
1483 .priority = 100,
1484 },
1485 /* After packet filtering, change source only for VS/NAT */
1486 {
1487 .hook = ip_vs_out,
1488 .owner = THIS_MODULE,
1489 .pf = PF_INET,
1490 .hooknum = NF_INET_FORWARD,
1491 .priority = 100,
1492 },
1493 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1494 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1495 {
1496 .hook = ip_vs_forward_icmp,
1497 .owner = THIS_MODULE,
1498 .pf = PF_INET,
1499 .hooknum = NF_INET_FORWARD,
1500 .priority = 99,
1501 },
1502 /* Before the netfilter connection tracking, exit from POST_ROUTING */
1503 {
1504 .hook = ip_vs_post_routing,
1505 .owner = THIS_MODULE,
1506 .pf = PF_INET,
1507 .hooknum = NF_INET_POST_ROUTING,
1508 .priority = NF_IP_PRI_NAT_SRC-1,
1509 },
1510 #ifdef CONFIG_IP_VS_IPV6
1511 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1512 * or VS/NAT(change destination), so that filtering rules can be
1513 * applied to IPVS. */
1514 {
1515 .hook = ip_vs_in,
1516 .owner = THIS_MODULE,
1517 .pf = PF_INET6,
1518 .hooknum = NF_INET_LOCAL_IN,
1519 .priority = 100,
1520 },
1521 /* After packet filtering, change source only for VS/NAT */
1522 {
1523 .hook = ip_vs_out,
1524 .owner = THIS_MODULE,
1525 .pf = PF_INET6,
1526 .hooknum = NF_INET_FORWARD,
1527 .priority = 100,
1528 },
1529 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1530 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1531 {
1532 .hook = ip_vs_forward_icmp_v6,
1533 .owner = THIS_MODULE,
1534 .pf = PF_INET6,
1535 .hooknum = NF_INET_FORWARD,
1536 .priority = 99,
1537 },
1538 /* Before the netfilter connection tracking, exit from POST_ROUTING */
1539 {
1540 .hook = ip_vs_post_routing,
1541 .owner = THIS_MODULE,
1542 .pf = PF_INET6,
1543 .hooknum = NF_INET_POST_ROUTING,
1544 .priority = NF_IP6_PRI_NAT_SRC-1,
1545 },
1546 #endif
1547 };
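/*
 * Editor's note (not part of the original file): lower netfilter priority
 * values run first, so priority 100 places ip_vs_in and ip_vs_out after
 * packet filtering (NF_IP_PRI_FILTER is 0), ip_vs_forward_icmp at 99 runs
 * just before ip_vs_out in the FORWARD chain, and ip_vs_post_routing at
 * NF_IP_PRI_NAT_SRC-1 runs just before source NAT in POST_ROUTING, matching
 * the comments on each hook entry above.
 */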
1548
1549
1550 /*
1551 * Initialize IP Virtual Server
1552 */
1553 static int __init ip_vs_init(void)
1554 {
1555 int ret;
1556
1557 ip_vs_estimator_init();
1558
1559 ret = ip_vs_control_init();
1560 if (ret < 0) {
1561 pr_err("can't setup control.\n");
1562 goto cleanup_estimator;
1563 }
1564
1565 ip_vs_protocol_init();
1566
1567 ret = ip_vs_app_init();
1568 if (ret < 0) {
1569 pr_err("can't setup application helper.\n");
1570 goto cleanup_protocol;
1571 }
1572
1573 ret = ip_vs_conn_init();
1574 if (ret < 0) {
1575 pr_err("can't setup connection table.\n");
1576 goto cleanup_app;
1577 }
1578
1579 ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1580 if (ret < 0) {
1581 pr_err("can't register hooks.\n");
1582 goto cleanup_conn;
1583 }
1584
1585 pr_info("ipvs loaded.\n");
1586 return ret;
1587
1588 cleanup_conn:
1589 ip_vs_conn_cleanup();
1590 cleanup_app:
1591 ip_vs_app_cleanup();
1592 cleanup_protocol:
1593 ip_vs_protocol_cleanup();
1594 ip_vs_control_cleanup();
1595 cleanup_estimator:
1596 ip_vs_estimator_cleanup();
1597 return ret;
1598 }
1599
1600 static void __exit ip_vs_cleanup(void)
1601 {
1602 nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1603 ip_vs_conn_cleanup();
1604 ip_vs_app_cleanup();
1605 ip_vs_protocol_cleanup();
1606 ip_vs_control_cleanup();
1607 ip_vs_estimator_cleanup();
1608 pr_info("ipvs unloaded.\n");
1609 }
1610
1611 module_init(ip_vs_init);
1612 module_exit(ip_vs_cleanup);
1613 MODULE_LICENSE("GPL");