ipvs: do not schedule conns from real servers
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / netfilter / ipvs / ip_vs_core.c
index 4f8ddba480110167674fa36f6854e53f673e8d68..0090d6d25e95b0e4070acf16c7dee55d20dd04fa 100644 (file)
@@ -40,6 +40,7 @@
 #include <net/udp.h>
 #include <net/icmp.h>                   /* for icmp_send */
 #include <net/route.h>
+#include <net/ip6_checksum.h>
 
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
@@ -175,6 +176,18 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
        return pp->state_transition(cp, direction, skb, pp);
 }
 
+static inline void
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+                             struct sk_buff *skb, int protocol,
+                             const union nf_inet_addr *caddr, __be16 cport,
+                             const union nf_inet_addr *vaddr, __be16 vport,
+                             struct ip_vs_conn_param *p)
+{      /* Hand the tuple to ip_vs_conn_fill_param(), then attach svc->pe to *p. */
+       ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
+       p->pe = svc->pe;        /* may be NULL; checked before use below */
+       if (p->pe && p->pe->fill_param)
+               p->pe->fill_param(p, skb);      /* NOTE(review): presumably sets p->pe_data from skb - confirm */
+}
 
 /*
  *  IPVS persistent scheduling function
@@ -185,15 +198,16 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
  */
 static struct ip_vs_conn *
 ip_vs_sched_persist(struct ip_vs_service *svc,
-                   const struct sk_buff *skb,
+                   struct sk_buff *skb,
                    __be16 ports[2])
 {
        struct ip_vs_conn *cp = NULL;
        struct ip_vs_iphdr iph;
        struct ip_vs_dest *dest;
        struct ip_vs_conn *ct;
-       __be16  dport;                  /* destination port to forward */
-       __be16  flags;
+       __be16 dport = 0;               /* destination port to forward */
+       unsigned int flags;
+       struct ip_vs_conn_param param;
        union nf_inet_addr snet;        /* source network of the client,
                                           after masking */
 
@@ -226,120 +240,75 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
         * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
         * is created for other persistent services.
         */
-       if (ports[1] == svc->port) {
-               /* Check if a template already exists */
-               if (svc->port != FTPPORT)
-                       ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-                                            &iph.daddr, ports[1]);
-               else
-                       ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-                                            &iph.daddr, 0);
-
-               if (!ct || !ip_vs_check_template(ct)) {
-                       /*
-                        * No template found or the dest of the connection
-                        * template is not available.
-                        */
-                       dest = svc->scheduler->schedule(svc, skb);
-                       if (dest == NULL) {
-                               IP_VS_DBG(1, "p-schedule: no dest found.\n");
-                               return NULL;
-                       }
-
-                       /*
-                        * Create a template like <protocol,caddr,0,
-                        * vaddr,vport,daddr,dport> for non-ftp service,
-                        * and <protocol,caddr,0,vaddr,0,daddr,0>
-                        * for ftp service.
+       {
+               int protocol = iph.protocol;
+               const union nf_inet_addr *vaddr = &iph.daddr;
+               const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+               __be16 vport = 0;
+
+               if (ports[1] == svc->port) {
+                       /* non-FTP template:
+                        * <protocol, caddr, 0, vaddr, vport, daddr, dport>
+                        * FTP template:
+                        * <protocol, caddr, 0, vaddr, 0, daddr, 0>
                         */
                        if (svc->port != FTPPORT)
-                               ct = ip_vs_conn_new(svc->af, iph.protocol,
-                                                   &snet, 0,
-                                                   &iph.daddr,
-                                                   ports[1],
-                                                   &dest->addr, dest->port,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       else
-                               ct = ip_vs_conn_new(svc->af, iph.protocol,
-                                                   &snet, 0,
-                                                   &iph.daddr, 0,
-                                                   &dest->addr, 0,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       if (ct == NULL)
-                               return NULL;
-
-                       ct->timeout = svc->timeout;
+                               vport = ports[1];
                } else {
-                       /* set destination with the found template */
-                       dest = ct->dest;
-               }
-               dport = dest->port;
-       } else {
-               /*
-                * Note: persistent fwmark-based services and persistent
-                * port zero service are handled here.
-                * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
-                * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
-                */
-               if (svc->fwmark) {
-                       union nf_inet_addr fwmark = {
-                               .ip = htonl(svc->fwmark)
-                       };
-
-                       ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
-                                            &fwmark, 0);
-               } else
-                       ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-                                            &iph.daddr, 0);
-
-               if (!ct || !ip_vs_check_template(ct)) {
-                       /*
-                        * If it is not persistent port zero, return NULL,
-                        * otherwise create a connection template.
+                       /* Note: persistent fwmark-based services and
+                        * persistent port zero service are handled here.
+                        * fwmark template:
+                        * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+                        * port zero template:
+                        * <protocol,caddr,0,vaddr,0,daddr,0>
                         */
-                       if (svc->port)
-                               return NULL;
-
-                       dest = svc->scheduler->schedule(svc, skb);
-                       if (dest == NULL) {
-                               IP_VS_DBG(1, "p-schedule: no dest found.\n");
-                               return NULL;
+                       if (svc->fwmark) {
+                               protocol = IPPROTO_IP;
+                               vaddr = &fwmark;
                        }
+               }
+               ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+                                             vaddr, vport, &param);
+       }
 
-                       /*
-                        * Create a template according to the service
-                        */
-                       if (svc->fwmark) {
-                               union nf_inet_addr fwmark = {
-                                       .ip = htonl(svc->fwmark)
-                               };
-
-                               ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
-                                                   &snet, 0,
-                                                   &fwmark, 0,
-                                                   &dest->addr, 0,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       } else
-                               ct = ip_vs_conn_new(svc->af, iph.protocol,
-                                                   &snet, 0,
-                                                   &iph.daddr, 0,
-                                                   &dest->addr, 0,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       if (ct == NULL)
-                               return NULL;
-
-                       ct->timeout = svc->timeout;
-               } else {
-                       /* set destination with the found template */
-                       dest = ct->dest;
+       /* Check if a template already exists */
+       ct = ip_vs_ct_in_get(&param);
+       if (!ct || !ip_vs_check_template(ct)) {
+               /* No template found or the dest of the connection
+                * template is not available.
+                */
+               dest = svc->scheduler->schedule(svc, skb);
+               if (!dest) {
+                       IP_VS_DBG(1, "p-schedule: no dest found.\n");
+                       kfree(param.pe_data);
+                       return NULL;
+               }
+
+               if (ports[1] == svc->port && svc->port != FTPPORT)
+                       dport = dest->port;
+
+               /* Create a template
+                * This adds param.pe_data to the template,
+                * and thus param.pe_data will be destroyed
+                * when the template expires */
+               ct = ip_vs_conn_new(&param, &dest->addr, dport,
+                                   IP_VS_CONN_F_TEMPLATE, dest);
+               if (ct == NULL) {
+                       kfree(param.pe_data);
+                       return NULL;
                }
-               dport = ports[1];
+
+               ct->timeout = svc->timeout;
+       } else {
+               /* set destination with the found template */
+               dest = ct->dest;
+               kfree(param.pe_data);
        }
 
+       dport = ports[1];
+       if (dport == svc->port && dest->port)
+               dport = dest->port;
+
        flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
                 && iph.protocol == IPPROTO_UDP)?
                IP_VS_CONN_F_ONE_PACKET : 0;
@@ -347,12 +316,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
        /*
         *    Create a new connection according to the template
         */
-       cp = ip_vs_conn_new(svc->af, iph.protocol,
-                           &iph.saddr, ports[0],
-                           &iph.daddr, ports[1],
-                           &dest->addr, dport,
-                           flags,
-                           dest);
+       ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
+                             &iph.daddr, ports[1], &param);
+       cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
        if (cp == NULL) {
                ip_vs_conn_put(ct);
                return NULL;
@@ -376,23 +342,52 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
  *  Protocols supported: TCP, UDP
  */
 struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+              struct ip_vs_protocol *pp, int *ignored)
 {
        struct ip_vs_conn *cp = NULL;
        struct ip_vs_iphdr iph;
        struct ip_vs_dest *dest;
-       __be16 _ports[2], *pptr, flags;
+       __be16 _ports[2], *pptr;
+       unsigned int flags;
 
+       *ignored = 1;
        ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
        pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
        if (pptr == NULL)
                return NULL;
 
+       /*
+        * FTPDATA needs this check when using local real server.
+        * Never schedule Active FTPDATA connections from real server.
+        * For LVS-NAT they must be already created. For other methods
+        * with persistence the connection is created on SYN+ACK.
+        */
+       if (pptr[0] == FTPDATA) {
+               IP_VS_DBG_PKT(12, pp, skb, 0, "Not scheduling FTPDATA");
+               return NULL;
+       }
+
+       /*
+        * Do not schedule replies from local real server. It is risky
+        * for fwmark services but mostly for persistent services.
+        */
+       if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+           (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
+           (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
+               IP_VS_DBG_PKT(12, pp, skb, 0,
+                             "Not scheduling reply for existing connection");
+               __ip_vs_conn_put(cp);
+               return NULL;
+       }
+
        /*
         *    Persistent service
         */
-       if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+       if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
+               *ignored = 0;
                return ip_vs_sched_persist(svc, skb, pptr);
+       }
 
        /*
         *    Non-persistent service
@@ -405,6 +400,8 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
                return NULL;
        }
 
+       *ignored = 0;
+
        dest = svc->scheduler->schedule(svc, skb);
        if (dest == NULL) {
                IP_VS_DBG(1, "Schedule: no dest found.\n");
@@ -418,14 +415,16 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
        /*
         *    Create a connection entry.
         */
-       cp = ip_vs_conn_new(svc->af, iph.protocol,
-                           &iph.saddr, pptr[0],
-                           &iph.daddr, pptr[1],
-                           &dest->addr, dest->port ? dest->port : pptr[1],
-                           flags,
-                           dest);
-       if (cp == NULL)
-               return NULL;
+       {
+               struct ip_vs_conn_param p;
+               ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
+                                     pptr[0], &iph.daddr, pptr[1], &p);
+               cp = ip_vs_conn_new(&p, &dest->addr,
+                                   dest->port ? dest->port : pptr[1],
+                                   flags, dest);
+               if (!cp)
+                       return NULL;
+       }
 
        IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
                      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
@@ -472,23 +471,26 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
        if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
                int ret, cs;
                struct ip_vs_conn *cp;
-               __u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
-                               iph.protocol == IPPROTO_UDP)?
-                               IP_VS_CONN_F_ONE_PACKET : 0;
+               unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+                                     iph.protocol == IPPROTO_UDP)?
+                                     IP_VS_CONN_F_ONE_PACKET : 0;
                union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
 
                ip_vs_service_put(svc);
 
                /* create a new connection entry */
                IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
-               cp = ip_vs_conn_new(svc->af, iph.protocol,
-                                   &iph.saddr, pptr[0],
-                                   &iph.daddr, pptr[1],
-                                   &daddr, 0,
-                                   IP_VS_CONN_F_BYPASS | flags,
-                                   NULL);
-               if (cp == NULL)
-                       return NF_DROP;
+               {
+                       struct ip_vs_conn_param p;
+                       ip_vs_conn_fill_param(svc->af, iph.protocol,
+                                             &iph.saddr, pptr[0],
+                                             &iph.daddr, pptr[1], &p);
+                       cp = ip_vs_conn_new(&p, &daddr, 0,
+                                           IP_VS_CONN_F_BYPASS | flags,
+                                           NULL);
+                       if (!cp)
+                               return NF_DROP;
+               }
 
                /* statistics */
                ip_vs_in_stats(cp, skb);
@@ -637,10 +639,12 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
        }
 
        /* And finally the ICMP checksum */
-       icmph->icmp6_cksum = 0;
-       /* TODO IPv6: is this correct for ICMPv6? */
-       ip_vs_checksum_complete(skb, icmp_offset);
-       skb->ip_summed = CHECKSUM_UNNECESSARY;
+       icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
+                                             skb->len - icmp_offset,
+                                             IPPROTO_ICMPV6, 0);
+       skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
+       skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
+       skb->ip_summed = CHECKSUM_PARTIAL;
 
        if (inout)
                IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
@@ -692,6 +696,10 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
        ip_vs_out_stats(cp, skb);
 
        skb->ipvs_property = 1;
+       if (!(cp->flags & IP_VS_CONN_F_NFCT))
+               ip_vs_notrack(skb);
+       else
+               ip_vs_update_conntrack(skb, cp, 0);
        verdict = NF_ACCEPT;
 
 out:
@@ -905,28 +913,42 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                ip_send_check(ip_hdr(skb));
        }
 
+       /*
+        * nf_iterate does not expect skb->dst->dev to change.
+        * It looks like it is not fatal to enable this code for hooks
+        * where our handlers are at the end of the chain list and
+        * when all subsequent handlers use skb->dst->dev and not outdev.
+        * It will definitely route the inout NAT traffic properly
+        * when multiple paths are used.
+        */
+
        /* For policy routing, packets originating from this
         * machine itself may be routed differently to packets
         * passing through.  We want this packet to be routed as
         * if it came from this machine itself.  So re-compute
         * the routing information.
         */
+       if (sysctl_ip_vs_snat_reroute) {
 #ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6) {
-               if (ip6_route_me_harder(skb) != 0)
-                       goto drop;
-       } else
+               if (af == AF_INET6) {
+                       if (ip6_route_me_harder(skb) != 0)
+                               goto drop;
+               } else
 #endif
-               if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
-                       goto drop;
+                       if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+                               goto drop;
+       }
 
        IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
 
        ip_vs_out_stats(cp, skb);
        ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
-       ip_vs_conn_put(cp);
-
        skb->ipvs_property = 1;
+       if (!(cp->flags & IP_VS_CONN_F_NFCT))
+               ip_vs_notrack(skb);
+       else
+               ip_vs_update_conntrack(skb, cp, 0);
+       ip_vs_conn_put(cp);
 
        LeaveFunction(11);
        return NF_ACCEPT;
@@ -934,6 +956,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 drop:
        ip_vs_conn_put(cp);
        kfree_skb(skb);
+       LeaveFunction(11);
        return NF_STOLEN;
 }
 
@@ -964,8 +987,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
                        int related, verdict = ip_vs_out_icmp_v6(skb, &related);
 
-                       if (related)
+                       if (related) {
+                               if (sysctl_ip_vs_snat_reroute &&
+                                       NF_ACCEPT == verdict &&
+                                       ip6_route_me_harder(skb))
+                                       verdict = NF_DROP;
                                return verdict;
+                       }
                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
                }
        } else
@@ -973,8 +1001,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
                if (unlikely(iph.protocol == IPPROTO_ICMP)) {
                        int related, verdict = ip_vs_out_icmp(skb, &related);
 
-                       if (related)
+                       if (related) {
+                               if (sysctl_ip_vs_snat_reroute &&
+                                       NF_ACCEPT == verdict &&
+                                       ip_route_me_harder(skb, RTN_LOCAL))
+                                       verdict = NF_DROP;
                                return verdict;
+                       }
                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
                }
 
@@ -1380,8 +1413,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
        if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
            cp->protocol == IPPROTO_SCTP) {
                if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
-                       (atomic_read(&cp->in_pkts) %
-                        sysctl_ip_vs_sync_threshold[1]
+                       (pkts % sysctl_ip_vs_sync_threshold[1]
                         == sysctl_ip_vs_sync_threshold[0])) ||
                                (cp->old_state != cp->state &&
                                 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
@@ -1392,7 +1424,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
                }
        }
 
-       if (af == AF_INET &&
+       /* Keep this block last: TCP and others with pp->num_states <= 1 */
+       else if (af == AF_INET &&
            (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
            (((cp->protocol != IPPROTO_TCP ||
               cp->state == IP_VS_TCP_S_ESTABLISHED) &&