IPVS: Move IPVS to net/netfilter/ipvs

author Julius Volz <juliusv@google.com>

Fri, 19 Sep 2008 10:32:57 +0000 (12:32 +0200)

committer Simon Horman <horms@verge.net.au>

Mon, 6 Oct 2008 21:38:24 +0000 (08:38 +1100)
author Julius Volz <juliusv@google.com>
Fri, 19 Sep 2008 10:32:57 +0000 (12:32 +0200)
committer Simon Horman <horms@verge.net.au>
Mon, 6 Oct 2008 21:38:24 +0000 (08:38 +1100)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig

index 591ea23639ca859edcf085da1a2204aa5ef22c6c..691268f3a35972ff931c51bc8c7eeca74e73de78 100644 (file)
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -630,5 +630,3 @@ config TCP_MD5SIG
  
           If unsure, say N.
  
-source "net/ipv4/ipvs/Kconfig"
-
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile

index ad40ef3f9ebcdedeac09b58a5bda14c0fe48a71a..80ff87ce43aac6a350bbe8e973f91e8eb58e14e9 100644 (file)
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -33,7 +33,6 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
  obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
  obj-$(CONFIG_IP_PNP) += ipconfig.o
  obj-$(CONFIG_NETFILTER)        += netfilter.o netfilter/
-obj-$(CONFIG_IP_VS) += ipvs/
  obj-$(CONFIG_INET_DIAG) += inet_diag.o 
  obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
  obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig

deleted file mode 100644 (file)

index de6004d..0000000
--- a/net/ipv4/ipvs/Kconfig
+++ /dev/null
@@ -1,239 +0,0 @@
-#
-# IP Virtual Server configuration
-#
-menuconfig IP_VS
-       tristate "IP virtual server support (EXPERIMENTAL)"
-       depends on NETFILTER
-       ---help---
-         IP Virtual Server support will let you build a high-performance
-         virtual server based on cluster of two or more real servers. This
-         option must be enabled for at least one of the clustered computers
-         that will take care of intercepting incoming connections to a
-         single IP address and scheduling them to real servers.
-
-         Three request dispatching techniques are implemented, they are
-         virtual server via NAT, virtual server via tunneling and virtual
-         server via direct routing. The several scheduling algorithms can
-         be used to choose which server the connection is directed to,
-         thus load balancing can be achieved among the servers.  For more
-         information and its administration program, please visit the
-         following URL: <http://www.linuxvirtualserver.org/>.
-
-         If you want to compile it in kernel, say Y. To compile it as a
-         module, choose M here. If unsure, say N.
-
-if IP_VS
-
-config IP_VS_IPV6
-       bool "IPv6 support for IPVS (DANGEROUS)"
-       depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
-       ---help---
-         Add IPv6 support to IPVS. This is incomplete and might be dangerous.
-
-         Say N if unsure.
-
-config IP_VS_DEBUG
-       bool "IP virtual server debugging"
-       ---help---
-         Say Y here if you want to get additional messages useful in
-         debugging the IP virtual server code. You can change the debug
-         level in /proc/sys/net/ipv4/vs/debug_level
-
-config IP_VS_TAB_BITS
-       int "IPVS connection table size (the Nth power of 2)"
-       range 8 20
-       default 12
-       ---help---
-         The IPVS connection hash table uses the chaining scheme to handle
-         hash collisions. Using a big IPVS connection hash table will greatly
-         reduce conflicts when there are hundreds of thousands of connections
-         in the hash table.
-
-         Note the table size must be a power of 2. The table size will be
-         2 raised to the power of the number you enter. The number to choose
-         is from 8 to 20, the default number is 12, which means the table size
-         is 4096. Don't input the number too small, otherwise you will lose
-         performance on it. You can adapt the table size yourself, according
-         to your virtual server application. It is good to set the table size
-         not far less than the number of connections per second multiplying
-         average lasting time of connection in the table.  For example, your
-         virtual server gets 200 connections per second, the connection lasts
-         for 200 seconds in average in the connection table, the table size
-         should be not far less than 200x200, it is good to set the table
-         size 32768 (2**15).
-
-         Another note that each connection occupies 128 bytes effectively and
-         each hash entry uses 8 bytes, so you can estimate how much memory is
-         needed for your box.
-
-comment "IPVS transport protocol load balancing support"
-
-config IP_VS_PROTO_TCP
-       bool "TCP load balancing support"
-       ---help---
-         This option enables support for load balancing TCP transport
-         protocol. Say Y if unsure.
-
-config IP_VS_PROTO_UDP
-       bool "UDP load balancing support"
-       ---help---
-         This option enables support for load balancing UDP transport
-         protocol. Say Y if unsure.
-
-config IP_VS_PROTO_AH_ESP
-       bool
-       depends on UNDEFINED
-
-config IP_VS_PROTO_ESP
-       bool "ESP load balancing support"
-       select IP_VS_PROTO_AH_ESP
-       ---help---
-         This option enables support for load balancing ESP (Encapsulating
-         Security Payload) transport protocol. Say Y if unsure.
-
-config IP_VS_PROTO_AH
-       bool "AH load balancing support"
-       select IP_VS_PROTO_AH_ESP
-       ---help---
-         This option enables support for load balancing AH (Authentication
-         Header) transport protocol. Say Y if unsure.
-
-comment "IPVS scheduler"
-
-config IP_VS_RR
-       tristate "round-robin scheduling"
-       ---help---
-         The round-robin scheduling algorithm simply directs network
-         connections to different real servers in a round-robin manner.
-
-         If you want to compile it in kernel, say Y. To compile it as a
-         module, choose M here. If unsure, say N.
- 
-config IP_VS_WRR
-        tristate "weighted round-robin scheduling" 
-       ---help---
-         The weighted round-robin scheduling algorithm directs network
-         connections to different real servers based on server weights
-         in a round-robin manner. Servers with higher weights receive
-         new connections before those with lower weights, servers with
-         higher weights get more connections than those with lower
-         weights, and servers with equal weights get equal connections.
-
-         If you want to compile it in kernel, say Y. To compile it as a
-         module, choose M here. If unsure, say N.
-
-config IP_VS_LC
-        tristate "least-connection scheduling"
-       ---help---
-         The least-connection scheduling algorithm directs network
-         connections to the server with the least number of active 
-         connections.
-
-         If you want to compile it in kernel, say Y. To compile it as a
-         module, choose M here. If unsure, say N.
-
-config IP_VS_WLC
-        tristate "weighted least-connection scheduling"
-       ---help---
-         The weighted least-connection scheduling algorithm directs network
-         connections to the server with the least active connections
-         normalized by the server weight.
-
-         If you want to compile it in kernel, say Y. To compile it as a
-         module, choose M here. If unsure, say N.
-
-config IP_VS_LBLC
-       tristate "locality-based least-connection scheduling"
-       ---help---
-         The locality-based least-connection scheduling algorithm is for
-         destination IP load balancing. It is usually used in cache cluster.
-         This algorithm usually directs packet destined for an IP address to
-         its server if the server is alive and under load. If the server is
-         overloaded (its active connection numbers is larger than its weight)
-         and there is a server in its half load, then allocate the weighted
-         least-connection server to this IP address.
-
-         If you want to compile it in kernel, say Y. To compile it as a
-         module, choose M here. If unsure, say N.
-
-config  IP_VS_LBLCR
-       tristate "locality-based least-connection with replication scheduling"
-       ---help---
-         The locality-based least-connection with replication scheduling
-         algorithm is also for destination IP load balancing. It is 
-         usually used in cache cluster. It differs from the LBLC scheduling
-         as follows: the load balancer maintains mappings from a target
-         to a set of server nodes that can serve the target. Requests for
-         a target are assigned to the least-connection node in the target's
-         server set. If all the nodes in the server set are overloaded,
-         it picks a least-connection node in the cluster and adds it
-         to the server set for the target. If the server set has not been
-         modified for the specified time, the most loaded node is removed
-         from the server set, in order to avoid high degree of replication.
-
-         If you want to compile it in kernel, say Y. To compile it as a
-         module, choose M here. If unsure, say N.
-
-config IP_VS_DH
-       tristate "destination hashing scheduling"
-       ---help---
-         The destination hashing scheduling algorithm assigns network
-         connections to the servers through looking up a statically assigned
-         hash table by their destination IP addresses.
-
-         If you want to compile it in kernel, say Y. To compile it as a
-         module, choose M here. If unsure, say N.
-
-config IP_VS_SH
-       tristate "source hashing scheduling"
-       ---help---
-         The source hashing scheduling algorithm assigns network
-         connections to the servers through looking up a statically assigned
-         hash table by their source IP addresses.
-
-         If you want to compile it in kernel, say Y. To compile it as a
-         module, choose M here. If unsure, say N.
-
-config IP_VS_SED
-       tristate "shortest expected delay scheduling"
-       ---help---
-         The shortest expected delay scheduling algorithm assigns network
-         connections to the server with the shortest expected delay. The 
-         expected delay that the job will experience is (Ci + 1) / Ui if 
-         sent to the ith server, in which Ci is the number of connections
-         on the ith server and Ui is the fixed service rate (weight)
-         of the ith server.
-
-         If you want to compile it in kernel, say Y. To compile it as a
-         module, choose M here. If unsure, say N.
-
-config IP_VS_NQ
-       tristate "never queue scheduling"
-       ---help---
-         The never queue scheduling algorithm adopts a two-speed model.
-         When there is an idle server available, the job will be sent to
-         the idle server, instead of waiting for a fast one. When there
-         is no idle server available, the job will be sent to the server
-         that minimize its expected delay (The Shortest Expected Delay
-         scheduling algorithm).
-
-         If you want to compile it in kernel, say Y. To compile it as a
-         module, choose M here. If unsure, say N.
-
-comment 'IPVS application helper'
-
-config IP_VS_FTP
-       tristate "FTP protocol helper"
-        depends on IP_VS_PROTO_TCP
-       ---help---
-         FTP is a protocol that transfers IP address and/or port number in
-         the payload. In the virtual server via Network Address Translation,
-         the IP address and port number of real servers cannot be sent to
-         clients in ftp connections directly, so FTP protocol helper is
-         required for tracking the connection and mangling it back to that of
-         virtual service.
-
-         If you want to compile it in kernel, say Y. To compile it as a
-         module, choose M here. If unsure, say N.
-
-endif # IP_VS
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile

deleted file mode 100644 (file)

index 73a46fe..0000000
--- a/net/ipv4/ipvs/Makefile
+++ /dev/null
@@ -1,33 +0,0 @@
-#
-# Makefile for the IPVS modules on top of IPv4.
-#
-
-# IPVS transport protocol load balancing support
-ip_vs_proto-objs-y :=
-ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
-ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
-ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
-
-ip_vs-objs :=  ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o        \
-               ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o                      \
-               ip_vs_est.o ip_vs_proto.o                                  \
-               $(ip_vs_proto-objs-y)
-
-
-# IPVS core
-obj-$(CONFIG_IP_VS) += ip_vs.o
-
-# IPVS schedulers
-obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
-obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
-obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
-obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
-obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
-obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
-obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
-obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
-obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
-obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
-
-# IPVS application helpers
-obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c

deleted file mode 100644 (file)

index 201b8ea..0000000
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ /dev/null
@@ -1,622 +0,0 @@
-/*
- * ip_vs_app.c: Application module support for IPVS
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
- * is that ip_vs_app module handles the reverse direction (incoming requests
- * and outgoing responses).
- *
- *             IP_MASQ_APP application masquerading module
- *
- * Author:     Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <net/net_namespace.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <asm/system.h>
-#include <linux/stat.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/mutex.h>
-
-#include <net/ip_vs.h>
-
-EXPORT_SYMBOL(register_ip_vs_app);
-EXPORT_SYMBOL(unregister_ip_vs_app);
-EXPORT_SYMBOL(register_ip_vs_app_inc);
-
-/* ipvs application list head */
-static LIST_HEAD(ip_vs_app_list);
-static DEFINE_MUTEX(__ip_vs_app_mutex);
-
-
-/*
- *     Take a reference on the module that owns @app.
- *     Returns the result of try_module_get() on app->module.
- */
-static inline int ip_vs_app_get(struct ip_vs_app *app)
-{
-       struct module *owner = app->module;
-
-       return try_module_get(owner);
-}
-
-
-/*
- *     Drop a reference on the module that owns @app.
- */
-static inline void ip_vs_app_put(struct ip_vs_app *app)
-{
-       struct module *owner = app->module;
-
-       module_put(owner);
-}
-
-
-/*
- *     Allocate/initialize app incarnation and register it in proto apps.
- *
- *     @app:   application the incarnation is copied from and linked to
- *     @proto: protocol number looked up with ip_vs_proto_get()
- *     @port:  port in host byte order; stored as htons(port) in inc->port
- *
- *     Returns 0 on success, -EPROTONOSUPPORT when no protocol handler is
- *     found, -EOPNOTSUPP when the protocol lacks the app (un)registration
- *     hooks, -ENOMEM on allocation failure, or the error returned by
- *     pp->register_app().
- */
-static int
-ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
-{
-       struct ip_vs_protocol *pp;
-       struct ip_vs_app *inc;
-       int ret;
-
-       if (!(pp = ip_vs_proto_get(proto)))
-               return -EPROTONOSUPPORT;
-
-       /* register_app is called below, so both hooks are required */
-       if (!pp->register_app || !pp->unregister_app)
-               return -EOPNOTSUPP;
-
-       /* the incarnation starts out as a byte copy of the application */
-       inc = kmemdup(app, sizeof(*inc), GFP_KERNEL);
-       if (!inc)
-               return -ENOMEM;
-       INIT_LIST_HEAD(&inc->p_list);
-       INIT_LIST_HEAD(&inc->incs_list);
-       inc->app = app;
-       inc->port = htons(port);
-       atomic_set(&inc->usecnt, 0);
-
-       if (app->timeouts) {
-               inc->timeout_table =
-                       ip_vs_create_timeout_table(app->timeouts,
-                                                  app->timeouts_size);
-               if (!inc->timeout_table) {
-                       ret = -ENOMEM;
-                       goto out;
-               }
-       }
-
-       ret = pp->register_app(inc);
-       if (ret)
-               goto out;
-
-       list_add(&inc->a_list, &app->incs_list);
-       /* inc->port is in network byte order; convert it for printing */
-       IP_VS_DBG(9, "%s application %s:%u registered\n",
-                 pp->name, inc->name, ntohs(inc->port));
-
-       return 0;
-
-  out:
-       kfree(inc->timeout_table);
-       kfree(inc);
-       return ret;
-}
-
-
-/*
- *     Release app incarnation
- *
- *     Unregisters @inc from its protocol (when the protocol provides an
- *     unregister_app hook), unlinks it from the a_list it is on and frees
- *     it together with its timeout table.  Nothing is done when no
- *     protocol handler is found for inc->protocol.
- */
-static void
-ip_vs_app_inc_release(struct ip_vs_app *inc)
-{
-       struct ip_vs_protocol *pp;
-
-       if (!(pp = ip_vs_proto_get(inc->protocol)))
-               return;
-
-       if (pp->unregister_app)
-               pp->unregister_app(inc);
-
-       /* inc->port is in network byte order; convert it for printing */
-       IP_VS_DBG(9, "%s App %s:%u unregistered\n",
-                 pp->name, inc->name, ntohs(inc->port));
-
-       list_del(&inc->a_list);
-
-       kfree(inc->timeout_table);
-       kfree(inc);
-}
-
-
-/*
- *     Take a reference on an app incarnation (softirq context only).
- *
- *     The use counter is bumped first and rolled back again when the
- *     module reference on the owning application cannot be obtained.
- *     Returns the result of ip_vs_app_get().
- */
-int ip_vs_app_inc_get(struct ip_vs_app *inc)
-{
-       int got;
-
-       atomic_inc(&inc->usecnt);
-       got = ip_vs_app_get(inc->app);
-       if (unlikely(got != 1))
-               atomic_dec(&inc->usecnt);
-       return got;
-}
-
-
-/*
- *     Release a reference on an app incarnation
- *     (timer or net softirq context only)
- */
-void ip_vs_app_inc_put(struct ip_vs_app *inc)
-{
-       struct ip_vs_app *owner = inc->app;
-
-       ip_vs_app_put(owner);
-       atomic_dec(&inc->usecnt);
-}
-
-
-/*
- *     Create and register an incarnation of @app for @proto/@port,
- *     serialized by __ip_vs_app_mutex.
- *     Returns the result of ip_vs_app_inc_new().
- */
-int
-register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
-{
-       int err;
-
-       mutex_lock(&__ip_vs_app_mutex);
-       err = ip_vs_app_inc_new(app, proto, port);
-       mutex_unlock(&__ip_vs_app_mutex);
-
-       return err;
-}
-
-
-/*
- *     Add @app to the global application list.  Always returns 0.
- */
-int register_ip_vs_app(struct ip_vs_app *app)
-{
-       /* bump the module use count before the app becomes visible */
-       ip_vs_use_count_inc();
-
-       mutex_lock(&__ip_vs_app_mutex);
-       list_add(&app->a_list, &ip_vs_app_list);
-       mutex_unlock(&__ip_vs_app_mutex);
-
-       return 0;
-}
-
-
-/*
- *     Remove @app from the global application list, releasing all of
- *     its incarnations first.
- *     We are sure there are no app incarnations attached to services
- */
-void unregister_ip_vs_app(struct ip_vs_app *app)
-{
-       struct ip_vs_app *cur, *tmp;
-
-       mutex_lock(&__ip_vs_app_mutex);
-
-       /* _safe iteration: releasing an incarnation unlinks and frees it */
-       list_for_each_entry_safe(cur, tmp, &app->incs_list, a_list)
-               ip_vs_app_inc_release(cur);
-
-       list_del(&app->a_list);
-
-       mutex_unlock(&__ip_vs_app_mutex);
-
-       /* drop the module use count taken at registration */
-       ip_vs_use_count_dec();
-}
-
-
-/*
- *     Bind a connection to its application (called by cp constructor).
- *     The work is delegated to the protocol's app_conn_bind hook and
- *     its return value is passed through.
- */
-int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
-{
-       int bound = pp->app_conn_bind(cp);
-
-       return bound;
-}
-
-
-/*
- *     Detach a connection from its application incarnation (called by
- *     cp destructor).  Runs the optional unbind_conn and done_conn
- *     hooks, drops the incarnation reference and clears cp->app.
- *     Does nothing when no incarnation is bound.
- */
-void ip_vs_unbind_app(struct ip_vs_conn *cp)
-{
-       struct ip_vs_app *inc = cp->app;
-
-       if (inc) {
-               if (inc->unbind_conn)
-                       inc->unbind_conn(inc, cp);
-               if (inc->done_conn)
-                       inc->done_conn(inc, cp);
-               ip_vs_app_inc_put(inc);
-               cp->app = NULL;
-       }
-}
-
-
-/*
- *     Rewrite th->seq using the offsets recorded in @vseq.
- *
- *     Sequence numbers after vseq->init_seq (the most recent resized
- *     packet) get vseq->delta added, all others vseq->previous_delta.
- */
-static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
-{
-       __u32 seq = ntohl(th->seq);
-
-       /* both offsets are zero: nothing to adjust */
-       if (!vseq->delta && !vseq->previous_delta)
-               return;
-
-       if (!after(seq, vseq->init_seq)) {
-               th->seq = htonl(seq + vseq->previous_delta);
-               IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
-                         "(%d) to seq\n", vseq->previous_delta);
-               return;
-       }
-
-       th->seq = htonl(seq + vseq->delta);
-       IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
-                 vseq->delta);
-}
-
-
-/*
- *     Rewrite th->ack_seq using the offsets recorded in @vseq.
- *
- *     Acks past the most recent resized packet have vseq->delta
- *     subtracted, all others vseq->previous_delta.
- */
-static inline void
-vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
-{
-       __u32 ack_seq = ntohl(th->ack_seq);
-
-       /* both offsets are zero: nothing to adjust */
-       if (!vseq->delta && !vseq->previous_delta)
-               return;
-
-       /*
-        * ack_seq is the number of the octet expected next, so it is
-        * compared with init_seq+delta rather than init_seq alone.
-        */
-       if (!after(ack_seq, vseq->init_seq+vseq->delta)) {
-               th->ack_seq = htonl(ack_seq - vseq->previous_delta);
-               IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
-                         "previous_delta (%d) from ack_seq\n",
-                         vseq->previous_delta);
-               return;
-       }
-
-       th->ack_seq = htonl(ack_seq - vseq->delta);
-       IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
-                 "(%d) from ack_seq\n", vseq->delta);
-}
-
-
-/*
- *     Record a new sequence offset after a packet has been resized.
- *     Callers have already checked proto==IPPROTO_TCP and diff!=0.
- *
- *     The update happens when @flag is not yet set on the connection or
- *     when @seq is after the previously recorded init_seq.
- */
-static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
-                                unsigned flag, __u32 seq, int diff)
-{
-       /* cp->lock keeps the update of cp->flags atomic */
-       spin_lock(&cp->lock);
-       if ((cp->flags & flag) == 0 || after(seq, vseq->init_seq)) {
-               /* move the current offset into history, then extend it */
-               vseq->previous_delta = vseq->delta;
-               vseq->init_seq = seq;
-               vseq->delta += diff;
-               cp->flags |= flag;
-       }
-       spin_unlock(&cp->lock);
-}
-
-/*
- *     TCP flavour of the output hook: apply the recorded sequence
- *     fix-ups, run the application's pkt_out handler and record a new
- *     offset when the handler reports a length change.
- *     Returns 0 when the TCP header cannot be made writable or the
- *     handler returns 0, and 1 otherwise.
- */
-static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
-                                 struct ip_vs_app *app)
-{
-       const unsigned int tcp_offset = ip_hdrlen(skb);
-       struct tcphdr *th;
-       __u32 orig_seq;
-       int diff;
-
-       if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
-               return 0;
-
-       th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
-
-       /* keep the unmodified seq in case this pkt gets resized */
-       orig_seq = ntohl(th->seq);
-
-       /* fix seq/ack_seq only for the directions flagged on cp */
-       if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
-               vs_fix_seq(&cp->out_seq, th);
-       if (cp->flags & IP_VS_CONN_F_IN_SEQ)
-               vs_fix_ack_seq(&cp->in_seq, th);
-
-       /* no private output hook: done */
-       if (!app->pkt_out)
-               return 1;
-
-       if (!app->pkt_out(app, cp, skb, &diff))
-               return 0;
-
-       /* the hook changed the length: update the ip_vs seq state */
-       if (diff)
-               vs_seq_update(cp, &cp->out_seq,
-                             IP_VS_CONN_F_OUT_SEQ, orig_seq, diff);
-
-       return 1;
-}
-
-/*
- *     Output pkt hook, called by the ipvs packet handler with cp!=NULL.
- *     Dispatches to the application bound to the connection, if any.
- *     Returns false if the packet can't be handled (oom).
- */
-int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
-{
-       struct ip_vs_app *app = cp->app;
-
-       /* no application module bound to this ip_vs_conn */
-       if (!app)
-               return 1;
-
-       /* TCP needs the sequence number handling */
-       if (cp->protocol == IPPROTO_TCP)
-               return app_tcp_pkt_out(cp, skb, app);
-
-       /* other protocols: call the private output hook when present */
-       if (!app->pkt_out)
-               return 1;
-
-       return app->pkt_out(app, cp, skb, NULL);
-}
-
-
-/*
- *     TCP flavour of the input hook: apply the recorded sequence
- *     fix-ups, run the application's pkt_in handler and record a new
- *     offset when the handler reports a length change.
- *     Returns 0 when the TCP header cannot be made writable or the
- *     handler returns 0, and 1 otherwise.
- */
-static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
-                                struct ip_vs_app *app)
-{
-       const unsigned int tcp_offset = ip_hdrlen(skb);
-       struct tcphdr *th;
-       __u32 orig_seq;
-       int diff;
-
-       if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
-               return 0;
-
-       th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
-
-       /* keep the unmodified seq in case this pkt gets resized */
-       orig_seq = ntohl(th->seq);
-
-       /* fix seq/ack_seq only for the directions flagged on cp */
-       if (cp->flags & IP_VS_CONN_F_IN_SEQ)
-               vs_fix_seq(&cp->in_seq, th);
-       if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
-               vs_fix_ack_seq(&cp->out_seq, th);
-
-       /* no private input hook: done */
-       if (!app->pkt_in)
-               return 1;
-
-       if (!app->pkt_in(app, cp, skb, &diff))
-               return 0;
-
-       /* the hook changed the length: update the ip_vs seq state */
-       if (diff)
-               vs_seq_update(cp, &cp->in_seq,
-                             IP_VS_CONN_F_IN_SEQ, orig_seq, diff);
-
-       return 1;
-}
-
-/*
- *     Input pkt hook, called by the ipvs packet handler with cp!=NULL.
- *     Dispatches to the application bound to the connection, if any.
- *     Returns false if the packet can't be handled (oom).
- */
-int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
-{
-       struct ip_vs_app *app = cp->app;
-
-       /* no application module bound to this ip_vs_conn */
-       if (!app)
-               return 1;
-
-       /* TCP needs the sequence number handling */
-       if (cp->protocol == IPPROTO_TCP)
-               return app_tcp_pkt_in(cp, skb, app);
-
-       /* other protocols: call the private input hook when present */
-       if (!app->pkt_in)
-               return 1;
-
-       return app->pkt_in(app, cp, skb, NULL);
-}
-
-
-#ifdef CONFIG_PROC_FS
-/*
- *     /proc/net/ip_vs_app entry function
- */
-
-/*
- *     Return the incarnation at zero-based position @pos, counting the
- *     incarnations of all registered applications in list order, or
- *     NULL when there are not that many.
- */
-static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
-{
-       struct ip_vs_app *app, *inc;
-
-       list_for_each_entry(app, &ip_vs_app_list, a_list)
-               list_for_each_entry(inc, &app->incs_list, a_list)
-                       if (!pos--)
-                               return inc;
-
-       return NULL;
-}
-
-/*
- *     seq_file start: take __ip_vs_app_mutex, then return the header
- *     token for position 0 or the incarnation at index *pos - 1.
- */
-static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
-{
-       mutex_lock(&__ip_vs_app_mutex);
-
-       if (*pos == 0)
-               return SEQ_START_TOKEN;
-
-       return ip_vs_app_idx(*pos - 1);
-}
-
-/*
- *     seq_file next: advance *pos and return the incarnation following
- *     @v, or NULL when the listing is exhausted.
- */
-static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-       struct ip_vs_app *cur, *app;
-       struct list_head *node;
-
-       ++*pos;
-       if (v == SEQ_START_TOKEN)
-               return ip_vs_app_idx(0);
-
-       cur = v;
-       app = cur->app;
-
-       /* another incarnation of the same application? */
-       node = cur->a_list.next;
-       if (node != &app->incs_list)
-               return list_entry(node, struct ip_vs_app, a_list);
-
-       /* go on to the next application that has an incarnation */
-       for (node = app->a_list.next; node != &ip_vs_app_list;
-            node = node->next) {
-               app = list_entry(node, struct ip_vs_app, a_list);
-               if (!list_empty(&app->incs_list))
-                       return list_entry(app->incs_list.next,
-                                         struct ip_vs_app, a_list);
-       }
-       return NULL;
-}
-
-/*
- *     seq_file stop: release the mutex taken in ip_vs_app_seq_start()
- */
-static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
-{
-       mutex_unlock(&__ip_vs_app_mutex);
-}
-
-/*
- *     seq_file show: print the column header for the start token,
- *     otherwise one line describing the incarnation @v.
- *     Always returns 0.
- */
-static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
-{
-       const struct ip_vs_app *inc;
-
-       if (v == SEQ_START_TOKEN) {
-               seq_puts(seq, "prot port    usecnt name\n");
-               return 0;
-       }
-
-       inc = v;
-       seq_printf(seq, "%-3s  %-7u %-6d %-17s\n",
-                  ip_vs_proto_name(inc->protocol),
-                  ntohs(inc->port),
-                  atomic_read(&inc->usecnt),
-                  inc->name);
-       return 0;
-}
-
-/* seq_file iterator callbacks for the ip_vs_app listing */
-static const struct seq_operations ip_vs_app_seq_ops = {
-       .start = ip_vs_app_seq_start,
-       .stop  = ip_vs_app_seq_stop,
-       .next  = ip_vs_app_seq_next,
-       .show  = ip_vs_app_seq_show,
-};
-
-/* open handler: set up a seq_file that uses ip_vs_app_seq_ops */
-static int ip_vs_app_open(struct inode *inode, struct file *file)
-{
-       const struct seq_operations *ops = &ip_vs_app_seq_ops;
-
-       return seq_open(file, ops);
-}
-
-/* file operations of the ip_vs_app proc entry, backed by seq_file */
-static const struct file_operations ip_vs_app_fops = {
-       .owner   = THIS_MODULE,
-       .open    = ip_vs_app_open,
-       .release = seq_release,
-       .read    = seq_read,
-       .llseek  = seq_lseek,
-};
-#endif
-
-
-/*
- *     Replace a segment of data with a new segment
- *
- *     Overwrites the o_len bytes at o_buf (a pointer into skb->data)
- *     with the n_len bytes at n_buf, moving the data that follows and
- *     growing or shrinking the skb by n_len - o_len.  pri is the
- *     allocation mask handed to pskb_expand_head() when the tailroom is
- *     too small for the growth.
- *
- *     Returns 0 on success or -ENOMEM when the skb cannot be expanded.
- *
- *     NOTE(review): after pskb_expand_head() the data is addressed via
- *     skb->data + o_offset instead of o_buf, presumably because the
- *     expansion may move the buffer; confirm against the skb API.
- */
-int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
-                     char *o_buf, int o_len, char *n_buf, int n_len)
-{
-       int diff;
-       int o_offset;
-       int o_left;
-
-       EnterFunction(9);
-
-       diff = n_len - o_len;
-       o_offset = o_buf - (char *)skb->data;
-       /* The length of the data left after o_buf+o_len in the skb data,
-          i.e. the tail that has to be moved when the segment is resized */
-       o_left = skb->len - (o_offset + o_len);
-
-       if (diff <= 0) {
-               memmove(o_buf + n_len, o_buf + o_len, o_left);
-               memcpy(o_buf, n_buf, n_len);
-               skb_trim(skb, skb->len + diff);
-       } else if (diff <= skb_tailroom(skb)) {
-               skb_put(skb, diff);
-               memmove(o_buf + n_len, o_buf + o_len, o_left);
-               memcpy(o_buf, n_buf, n_len);
-       } else {
-               if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
-                       return -ENOMEM;
-               skb_put(skb, diff);
-               memmove(skb->data + o_offset + n_len,
-                       skb->data + o_offset + o_len, o_left);
-               skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len);
-       }
-
-       /* must update the iph total length here: skb->len has changed
-          by diff in every branch above */
-       ip_hdr(skb)->tot_len = htons(skb->len);
-
-       LeaveFunction(9);
-       return 0;
-}
-
-
-/*
- *     Create the "ip_vs_app" proc entry in init_net.  The result of the
- *     creation is not checked; 0 is always returned.
- */
-int __init ip_vs_app_init(void)
-{
-       /* we will replace it with proc_net_ipvs_create() soon */
-       proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
-
-       return 0;
-}
-
-
-/*
- *     Remove the "ip_vs_app" proc entry from init_net
- */
-void ip_vs_app_cleanup(void)
-{
-       proc_net_remove(&init_net, "ip_vs_app");
-}
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c

deleted file mode 100644 (file)

index 9a24332..0000000
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ /dev/null
@@ -1,1110 +0,0 @@
-/*
- * IPVS         An implementation of the IP virtual server support for the
- *              LINUX operating system.  IPVS is now implemented as a module
- *              over the Netfilter framework. IPVS can be used to build a
- *              high-performance and highly available server based on a
- *              cluster of servers.
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Peter Kese <peter.kese@ijs.si>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
- * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
- * and others. Many code here is taken from IP MASQ code of kernel 2.2.
- *
- * Changes:
- *
- */
-
-#include <linux/interrupt.h>
-#include <linux/in.h>
-#include <linux/net.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/proc_fs.h>             /* for proc_net_* */
-#include <linux/seq_file.h>
-#include <linux/jhash.h>
-#include <linux/random.h>
-
-#include <net/net_namespace.h>
-#include <net/ip_vs.h>
-
-
-/*
- *  Connection hash table: for input and output packets lookups of IPVS.
- *  An array of list heads indexed by the value of ip_vs_conn_hashkey();
- *  entries are chained through ip_vs_conn.c_list.
- */
-static struct list_head *ip_vs_conn_tab;
-
-/*  SLAB cache for IPVS connections (struct ip_vs_conn objects) */
-static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
-
-/*  counter for current IPVS connections */
-static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
-
-/*
- *  counter for no client port connections; ip_vs_conn_in_get() only
- *  retries a lookup with source port 0 while this is non-zero
- */
-static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
-
-/* random value for IPVS connection hash, used as the jhash initval */
-static unsigned int ip_vs_conn_rnd;
-
-/*
- *  Fine locking granularity for big connection hash table
- *
- *  The table is protected by CT_LOCKARRAY_SIZE (1 << 4 = 16) rwlocks.
- *  A hash key selects its lock with key & CT_LOCKARRAY_MASK, so
- *  several buckets share one lock.
- */
-#define CT_LOCKARRAY_BITS  4
-#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
-#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
-
-/* one rwlock per array slot, each aligned to SMP_CACHE_BYTES */
-struct ip_vs_aligned_lock
-{
-       rwlock_t        l;
-} __attribute__((__aligned__(SMP_CACHE_BYTES)));
-
-/* lock array for conn table */
-static struct ip_vs_aligned_lock
-__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
-
-/* Read-lock the conn-table lock that covers hash key 'key'. */
-static inline void ct_read_lock(unsigned key)
-{
-       struct ip_vs_aligned_lock *slot =
-               &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
-
-       read_lock(&slot->l);
-}
-
-/* Release the read lock that covers hash key 'key'. */
-static inline void ct_read_unlock(unsigned key)
-{
-       struct ip_vs_aligned_lock *slot =
-               &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
-
-       read_unlock(&slot->l);
-}
-
-/* Write-lock the conn-table lock that covers hash key 'key'. */
-static inline void ct_write_lock(unsigned key)
-{
-       struct ip_vs_aligned_lock *slot =
-               &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
-
-       write_lock(&slot->l);
-}
-
-/* Release the write lock that covers hash key 'key'. */
-static inline void ct_write_unlock(unsigned key)
-{
-       struct ip_vs_aligned_lock *slot =
-               &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
-
-       write_unlock(&slot->l);
-}
-
-/* Read-lock the lock covering hash key 'key' using read_lock_bh(). */
-static inline void ct_read_lock_bh(unsigned key)
-{
-       struct ip_vs_aligned_lock *slot =
-               &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
-
-       read_lock_bh(&slot->l);
-}
-
-/* Release the lock covering hash key 'key' using read_unlock_bh(). */
-static inline void ct_read_unlock_bh(unsigned key)
-{
-       struct ip_vs_aligned_lock *slot =
-               &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
-
-       read_unlock_bh(&slot->l);
-}
-
-/* Write-lock the lock covering hash key 'key' using write_lock_bh(). */
-static inline void ct_write_lock_bh(unsigned key)
-{
-       struct ip_vs_aligned_lock *slot =
-               &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
-
-       write_lock_bh(&slot->l);
-}
-
-/* Release the lock covering hash key 'key' using write_unlock_bh(). */
-static inline void ct_write_unlock_bh(unsigned key)
-{
-       struct ip_vs_aligned_lock *slot =
-               &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
-
-       write_unlock_bh(&slot->l);
-}
-
-
-/*
- *     Compute the ip_vs_conn_tab bucket index for (af, proto, addr, port).
- *     With CONFIG_IP_VS_IPV6, an AF_INET6 address is first reduced to one
- *     word by hashing its 16 bytes; otherwise addr->ip is used directly.
- *     The result is always masked with IP_VS_CONN_TAB_MASK.
- */
-static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
-                                      const union nf_inet_addr *addr,
-                                      __be16 port)
-{
-       u32 addr_word = (__force u32)addr->ip;
-
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6)
-               addr_word = jhash(addr, 16, ip_vs_conn_rnd);
-#endif
-       return jhash_3words(addr_word, (__force u32)port, proto,
-                           ip_vs_conn_rnd) & IP_VS_CONN_TAB_MASK;
-}
-
-
-/*
- *     Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
- *     returns bool success.
- *
- *     The bucket is chosen from the client side of the connection
- *     (cp->caddr, cp->cport).  On success the entry is linked into
- *     the bucket, IP_VS_CONN_F_HASHED is set and cp->refcnt is
- *     incremented for the table's reference.  If the entry is
- *     already hashed an error is logged and 0 is returned.
- *
- *     Uses the non-_bh write lock; NOTE(review): callers are
- *     presumably expected to run with bottom halves disabled --
- *     confirm against the call sites.
- */
-static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
-{
-       unsigned hash;
-       int ret;
-
-       /* Hash by protocol, client address and port */
-       hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
-
-       ct_write_lock(hash);
-
-       if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
-               list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
-               cp->flags |= IP_VS_CONN_F_HASHED;
-               atomic_inc(&cp->refcnt);
-               ret = 1;
-       } else {
-               IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
-                         "called from %p\n", __builtin_return_address(0));
-               ret = 0;
-       }
-
-       ct_write_unlock(hash);
-
-       return ret;
-}
-
-
-/*
- *     UNhashes ip_vs_conn from ip_vs_conn_tab.
- *     returns bool success.
- *
- *     The lock is selected by recomputing the hash from cp->caddr and
- *     cp->cport, so those fields must still hold the values the entry
- *     was hashed with.  On success the entry is unlinked,
- *     IP_VS_CONN_F_HASHED is cleared and the table's reference on
- *     cp->refcnt is dropped.  Returns 0 if the entry was not hashed.
- */
-static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
-{
-       unsigned hash;
-       int ret;
-
-       /* unhash it and decrease its reference counter */
-       hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
-
-       ct_write_lock(hash);
-
-       if (cp->flags & IP_VS_CONN_F_HASHED) {
-               list_del(&cp->c_list);
-               cp->flags &= ~IP_VS_CONN_F_HASHED;
-               atomic_dec(&cp->refcnt);
-               ret = 1;
-       } else
-               ret = 0;
-
-       ct_write_unlock(hash);
-
-       return ret;
-}
-
-
-/*
- *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
- *  Called for pkts coming from OUTside-to-INside.
- *     s_addr, s_port: pkt source address (foreign host)
- *     d_addr, d_port: pkt dest address (load balancer)
- *
- *  The source side is matched against caddr/cport and the destination
- *  side against vaddr/vport.  On a hit the entry is returned with
- *  cp->refcnt incremented; otherwise NULL is returned.
- */
-static inline struct ip_vs_conn *__ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
-{
-       unsigned hash;
-       struct ip_vs_conn *cp;
-
-       hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
-
-       ct_read_lock(hash);
-
-       /*
-        * The XOR term below accepts an entry only when exactly one of
-        * "s_port is zero" and "entry lacks IP_VS_CONN_F_NO_CPORT" holds:
-        * a zero s_port matches only no-client-port entries, a non-zero
-        * s_port matches only entries with a client port.
-        */
-       list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-               if (cp->af == af &&
-                   ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
-                   ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
-                   s_port == cp->cport && d_port == cp->vport &&
-                   ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
-                   protocol == cp->protocol) {
-                       /* HIT */
-                       atomic_inc(&cp->refcnt);
-                       ct_read_unlock(hash);
-                       return cp;
-               }
-       }
-
-       ct_read_unlock(hash);
-
-       return NULL;
-}
-
-/*
- *  Look up the connection for an OUTside-to-INside packet.
- *  Tries an exact match first; if that misses and at least one
- *  no-client-port connection exists, retries with source port 0.
- *  Returns the entry with its refcnt incremented, or NULL.
- */
-struct ip_vs_conn *ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
-{
-       struct ip_vs_conn *cp;
-
-       cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
-       if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
-               cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
-                                        d_port);
-
-       IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
-                     ip_vs_proto_name(protocol),
-                     IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-                     IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
-                     cp ? "hit" : "not hit");
-
-       return cp;
-}
-
-/*
- * Get reference to connection template
- *
- * Same matching as the inbound connection lookup (source against
- * caddr/cport, destination against vaddr/vport), but only entries
- * with IP_VS_CONN_F_TEMPLATE set are accepted.  Returns the template
- * with its refcnt incremented, or NULL when none matches.
- */
-struct ip_vs_conn *ip_vs_ct_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
-{
-       unsigned hash;
-       struct ip_vs_conn *cp;
-
-       hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
-
-       ct_read_lock(hash);
-
-       list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-               if (cp->af == af &&
-                   ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
-                   ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
-                   s_port == cp->cport && d_port == cp->vport &&
-                   cp->flags & IP_VS_CONN_F_TEMPLATE &&
-                   protocol == cp->protocol) {
-                       /* HIT */
-                       atomic_inc(&cp->refcnt);
-                       goto out;
-               }
-       }
-       /* loop ran to completion: cp is not a valid entry, report a miss */
-       cp = NULL;
-
-  out:
-       ct_read_unlock(hash);
-
-       IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
-                     ip_vs_proto_name(protocol),
-                     IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-                     IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
-                     cp ? "hit" : "not hit");
-
-       return cp;
-}
-
-/*
- *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
- *  Called for pkts coming from inside-to-OUTside.
- *     s_addr, s_port: pkt source address (inside host)
- *     d_addr, d_port: pkt dest address (foreign host)
- *
- *  Entries are hashed by client address/port, so the bucket is found
- *  from the packet's destination (d_addr, d_port); the packet's source
- *  is compared with the entry's daddr/dport.  Returns the entry with
- *  its refcnt incremented, or NULL.
- */
-struct ip_vs_conn *ip_vs_conn_out_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
-{
-       unsigned hash;
-       struct ip_vs_conn *cp, *ret=NULL;
-
-       /*
-        *      Check for "full" addressed entries
-        */
-       hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
-
-       ct_read_lock(hash);
-
-       list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-               if (cp->af == af &&
-                   ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
-                   ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
-                   d_port == cp->cport && s_port == cp->dport &&
-                   protocol == cp->protocol) {
-                       /* HIT */
-                       atomic_inc(&cp->refcnt);
-                       ret = cp;
-                       break;
-               }
-       }
-
-       ct_read_unlock(hash);
-
-       IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
-                     ip_vs_proto_name(protocol),
-                     IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-                     IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
-                     ret ? "hit" : "not hit");
-
-       return ret;
-}
-
-
-/*
- *      Put back the conn and restart its timer with its timeout
- *
- *      The timer is re-armed to fire cp->timeout jiffies from now
- *      before __ip_vs_conn_put() is called (defined outside this
- *      file; presumably it drops the caller's reference -- confirm).
- */
-void ip_vs_conn_put(struct ip_vs_conn *cp)
-{
-       /* reset it expire in its timeout */
-       mod_timer(&cp->timer, jiffies+cp->timeout);
-
-       __ip_vs_conn_put(cp);
-}
-
-
-/*
- *     Fill a no_client_port connection with a client port number
- *
- *     The entry is taken out of the hash table first, because its
- *     bucket depends on cport.  Nothing is done if the entry was not
- *     hashed.  Under cp->lock, and only if IP_VS_CONN_F_NO_CPORT is
- *     still set, the flag is cleared, the no-cport counter is
- *     decremented and cport is stored.  The entry is then hashed back.
- */
-void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
-{
-       if (ip_vs_conn_unhash(cp)) {
-               spin_lock(&cp->lock);
-               if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
-                       atomic_dec(&ip_vs_conn_no_cport_cnt);
-                       cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
-                       cp->cport = cport;
-               }
-               spin_unlock(&cp->lock);
-
-               /* rehash: the key uses cp->cport, which may have changed */
-               ip_vs_conn_hash(cp);
-       }
-}
-
-
-/*
- *     Choose the IPv4 packet_xmit routine for a connection entry from
- *     its forwarding method.  Called by ip_vs_conn_new.  If the method
- *     matches none of the known values, cp->packet_xmit is left as is.
- */
-static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
-{
-       const int method = IP_VS_FWD_METHOD(cp);
-
-       if (method == IP_VS_CONN_F_MASQ)
-               cp->packet_xmit = ip_vs_nat_xmit;
-       else if (method == IP_VS_CONN_F_TUNNEL)
-               cp->packet_xmit = ip_vs_tunnel_xmit;
-       else if (method == IP_VS_CONN_F_DROUTE)
-               cp->packet_xmit = ip_vs_dr_xmit;
-       else if (method == IP_VS_CONN_F_LOCALNODE)
-               cp->packet_xmit = ip_vs_null_xmit;
-       else if (method == IP_VS_CONN_F_BYPASS)
-               cp->packet_xmit = ip_vs_bypass_xmit;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-/*
- *     IPv6 counterpart of ip_vs_bind_xmit: choose the packet_xmit
- *     routine from the forwarding method.  LOCALNODE uses the same
- *     ip_vs_null_xmit as IPv4; an unknown method leaves
- *     cp->packet_xmit as is.
- */
-static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
-{
-       const int method = IP_VS_FWD_METHOD(cp);
-
-       if (method == IP_VS_CONN_F_MASQ)
-               cp->packet_xmit = ip_vs_nat_xmit_v6;
-       else if (method == IP_VS_CONN_F_TUNNEL)
-               cp->packet_xmit = ip_vs_tunnel_xmit_v6;
-       else if (method == IP_VS_CONN_F_DROUTE)
-               cp->packet_xmit = ip_vs_dr_xmit_v6;
-       else if (method == IP_VS_CONN_F_LOCALNODE)
-               cp->packet_xmit = ip_vs_null_xmit;
-       else if (method == IP_VS_CONN_F_BYPASS)
-               cp->packet_xmit = ip_vs_bypass_xmit_v6;
-}
-#endif
-
-
-/* Sum of the active and inactive connection counters of a destination. */
-static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
-{
-       int active = atomic_read(&dest->activeconns);
-       int inactive = atomic_read(&dest->inactconns);
-
-       return active + inactive;
-}
-
-/*
- *     Bind a connection entry with a virtual service destination
- *     Called just after a new connection entry is created.
- *
- *     Takes a reference on dest, merges the destination's conn_flags
- *     into cp->flags, stores dest in cp->dest, bumps the matching
- *     per-destination counter and sets IP_VS_DEST_F_OVERLOAD once the
- *     upper threshold is reached.  A NULL dest is a no-op.
- */
-static inline void
-ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
-{
-       /* if dest is NULL, then return directly */
-       if (!dest)
-               return;
-
-       /* Increase the refcnt counter of the dest */
-       atomic_inc(&dest->refcnt);
-
-       /* Bind with the destination and its corresponding transmitter */
-       if ((cp->flags & IP_VS_CONN_F_SYNC) &&
-           (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
-               /* if the connection is not template and is created
-                * by sync, preserve the activity flag.
-                */
-               cp->flags |= atomic_read(&dest->conn_flags) &
-                            (~IP_VS_CONN_F_INACTIVE);
-       else
-               cp->flags |= atomic_read(&dest->conn_flags);
-       cp->dest = dest;
-
-       IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
-                     "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
-                     "dest->refcnt:%d\n",
-                     ip_vs_proto_name(cp->protocol),
-                     IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
-                     IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
-                     IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
-                     ip_vs_fwd_tag(cp), cp->state,
-                     cp->flags, atomic_read(&cp->refcnt),
-                     atomic_read(&dest->refcnt));
-
-       /* Update the connection counters */
-       if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
-               /* It is a normal connection.  A sync-created entry that
-                  is not flagged inactive counts as active; everything
-                  else starts out in the inactive counter (e.g. TCP
-                  SYNRECV or another protocol's inactive state) */
-               if ((cp->flags & IP_VS_CONN_F_SYNC) &&
-                   (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
-                       atomic_inc(&dest->activeconns);
-               else
-                       atomic_inc(&dest->inactconns);
-       } else {
-               /* It is a persistent connection/template, so increase
-                  the persistent connection counter */
-               atomic_inc(&dest->persistconns);
-       }
-
-       /* u_threshold == 0 means no upper limit is enforced here */
-       if (dest->u_threshold != 0 &&
-           ip_vs_dest_totalconns(dest) >= dest->u_threshold)
-               dest->flags |= IP_VS_DEST_F_OVERLOAD;
-}
-
-
-/*
- * Look up a destination for a connection that has none yet and, if
- * one is found, bind the connection to it.  Returns the destination
- * that was looked up (possibly NULL), or NULL when cp is NULL or is
- * already bound.
- */
-struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
-{
-       struct ip_vs_dest *found;
-
-       /* nothing to do without a connection or when already bound */
-       if (!cp || cp->dest)
-               return NULL;
-
-       found = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
-                               &cp->vaddr, cp->vport,
-                               cp->protocol);
-       ip_vs_bind_dest(cp, found);
-       return found;
-}
-
-
-/*
- *     Unbind a connection entry with its VS destination
- *     Called by the ip_vs_conn_expire function.
- *
- *     Decrements the per-destination counter that matches the
- *     connection's type, re-evaluates the overload flag and drops the
- *     reference on dest.  cp->dest itself is not cleared here.
- *     A connection without a destination is a no-op.
- */
-static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
-{
-       struct ip_vs_dest *dest = cp->dest;
-
-       if (!dest)
-               return;
-
-       IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
-                     "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
-                     "dest->refcnt:%d\n",
-                     ip_vs_proto_name(cp->protocol),
-                     IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
-                     IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
-                     IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
-                     ip_vs_fwd_tag(cp), cp->state,
-                     cp->flags, atomic_read(&cp->refcnt),
-                     atomic_read(&dest->refcnt));
-
-       /* Update the connection counters */
-       if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
-               /* It is a normal connection, so decrease the inactconns
-                  or activeconns counter */
-               if (cp->flags & IP_VS_CONN_F_INACTIVE) {
-                       atomic_dec(&dest->inactconns);
-               } else {
-                       atomic_dec(&dest->activeconns);
-               }
-       } else {
-               /* It is a persistent connection/template, so decrease
-                  the persistent connection counter */
-               atomic_dec(&dest->persistconns);
-       }
-
-       /*
-        * Clear the overload flag: below l_threshold when one is set;
-        * otherwise below 3/4 of u_threshold when that is set;
-        * unconditionally when neither threshold is configured.
-        */
-       if (dest->l_threshold != 0) {
-               if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
-                       dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
-       } else if (dest->u_threshold != 0) {
-               if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
-                       dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
-       } else {
-               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-                       dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
-       }
-
-       /*
-        * Simply decrease the refcnt of the dest, because the
-        * dest will be either in service's destination list
-        * or in the trash.
-        */
-       atomic_dec(&dest->refcnt);
-}
-
-
-/*
- *     Checking if the destination of a connection template is available.
- *     If available, return 1, otherwise invalidate this connection
- *     template and return 0.
- *
- *     The destination counts as unavailable when it is NULL, lacks
- *     IP_VS_DEST_F_AVAILABLE, or has weight 0 while
- *     sysctl_ip_vs_expire_quiescent_template is set.  In the
- *     unavailable case the template's refcnt is decremented here;
- *     when 1 is returned the refcnt is left untouched.
- */
-int ip_vs_check_template(struct ip_vs_conn *ct)
-{
-       struct ip_vs_dest *dest = ct->dest;
-
-       /*
-        * Checking the dest server status.
-        */
-       if ((dest == NULL) ||
-           !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
-           (sysctl_ip_vs_expire_quiescent_template &&
-            (atomic_read(&dest->weight) == 0))) {
-               IP_VS_DBG_BUF(9, "check_template: dest not available for "
-                             "protocol %s s:%s:%d v:%s:%d "
-                             "-> d:%s:%d\n",
-                             ip_vs_proto_name(ct->protocol),
-                             IP_VS_DBG_ADDR(ct->af, &ct->caddr),
-                             ntohs(ct->cport),
-                             IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
-                             ntohs(ct->vport),
-                             IP_VS_DBG_ADDR(ct->af, &ct->daddr),
-                             ntohs(ct->dport));
-
-               /*
-                * Invalidate the connection template: vport 0xffff marks
-                * a template that was already invalidated.  Otherwise the
-                * entry is unhashed, its ports are overwritten (dport and
-                * vport 0xffff, cport 0) and it is hashed again under the
-                * new key.
-                */
-               if (ct->vport != htons(0xffff)) {
-                       if (ip_vs_conn_unhash(ct)) {
-                               ct->dport = htons(0xffff);
-                               ct->vport = htons(0xffff);
-                               ct->cport = 0;
-                               ip_vs_conn_hash(ct);
-                       }
-               }
-
-               /*
-                * Simply decrease the refcnt of the template,
-                * don't restart its timer.
-                */
-               atomic_dec(&ct->refcnt);
-               return 0;
-       }
-       return 1;
-}
-
-/*
- *     Timer callback of a connection entry (installed by
- *     ip_vs_conn_new via setup_timer); data is the ip_vs_conn pointer.
- *
- *     Frees the connection when it controls no other connection, can
- *     be unhashed, and nobody else holds a reference.  In every other
- *     case the timeout is set to 60*HZ and the timer is restarted
- *     through ip_vs_conn_put().
- */
-static void ip_vs_conn_expire(unsigned long data)
-{
-       struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
-
-       /* retry interval used if the entry cannot be freed right now */
-       cp->timeout = 60*HZ;
-
-       /*
-        *      hey, I'm using it
-        */
-       atomic_inc(&cp->refcnt);
-
-       /*
-        *      do I control anybody?
-        */
-       if (atomic_read(&cp->n_control))
-               goto expire_later;
-
-       /*
-        *      unhash it if it is hashed in the conn table
-        */
-       if (!ip_vs_conn_unhash(cp))
-               goto expire_later;
-
-       /*
-        *      refcnt==1 implies I'm the only one referrer
-        *      (the table's reference was dropped by the unhash above)
-        */
-       if (likely(atomic_read(&cp->refcnt) == 1)) {
-               /* delete the timer if it is activated by other users */
-               if (timer_pending(&cp->timer))
-                       del_timer(&cp->timer);
-
-               /* does anybody control me? */
-               if (cp->control)
-                       ip_vs_control_del(cp);
-
-               if (unlikely(cp->app != NULL))
-                       ip_vs_unbind_app(cp);
-               ip_vs_unbind_dest(cp);
-               if (cp->flags & IP_VS_CONN_F_NO_CPORT)
-                       atomic_dec(&ip_vs_conn_no_cport_cnt);
-               atomic_dec(&ip_vs_conn_count);
-
-               kmem_cache_free(ip_vs_conn_cachep, cp);
-               return;
-       }
-
-       /* hash it back to the table */
-       ip_vs_conn_hash(cp);
-
-  expire_later:
-       IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
-                 atomic_read(&cp->refcnt)-1,
-                 atomic_read(&cp->n_control));
-
-       /* restart the timer with the 60*HZ timeout set above */
-       ip_vs_conn_put(cp);
-}
-
-
-/*
- *     Make a connection's timer fire as soon as possible: if
- *     del_timer() reports that it removed the timer, the timer is
- *     re-armed with an expiry of the current jiffies.  Nothing is
- *     done when del_timer() returns 0.
- */
-void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
-{
-       if (del_timer(&cp->timer))
-               mod_timer(&cp->timer, jiffies);
-}
-
-
-/*
- *     Create a new connection entry and hash it into the ip_vs_conn_tab
- *
- *     caddr/cport, vaddr/vport and daddr/dport are copied into the
- *     entry; flags becomes the initial cp->flags and dest (may be
- *     NULL) is bound through ip_vs_bind_dest.  The entry is allocated
- *     with GFP_ATOMIC and zeroed.  It starts in state 0 with a
- *     timeout of 3*HZ; the timer is set up but not started here.
- *
- *     Returns the new entry, holding one reference for the caller in
- *     addition to the hash table's reference, or NULL when the
- *     allocation fails.
- */
-struct ip_vs_conn *
-ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
-              const union nf_inet_addr *vaddr, __be16 vport,
-              const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
-              struct ip_vs_dest *dest)
-{
-       struct ip_vs_conn *cp;
-       struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
-
-       cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
-       if (cp == NULL) {
-               IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
-               return NULL;
-       }
-
-       INIT_LIST_HEAD(&cp->c_list);
-       setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
-       cp->af             = af;
-       cp->protocol       = proto;
-       ip_vs_addr_copy(af, &cp->caddr, caddr);
-       cp->cport          = cport;
-       ip_vs_addr_copy(af, &cp->vaddr, vaddr);
-       cp->vport          = vport;
-       ip_vs_addr_copy(af, &cp->daddr, daddr);
-       cp->dport          = dport;
-       cp->flags          = flags;
-       spin_lock_init(&cp->lock);
-
-       /*
-        * Set the entry is referenced by the current thread before hashing
-        * it in the table, so that other thread run ip_vs_random_dropentry
-        * but cannot drop this entry.
-        */
-       atomic_set(&cp->refcnt, 1);
-
-       atomic_set(&cp->n_control, 0);
-       atomic_set(&cp->in_pkts, 0);
-
-       atomic_inc(&ip_vs_conn_count);
-       if (flags & IP_VS_CONN_F_NO_CPORT)
-               atomic_inc(&ip_vs_conn_no_cport_cnt);
-
-       /* Bind the connection with a destination server */
-       ip_vs_bind_dest(cp, dest);
-
-       /* Set its state and timeout */
-       cp->state = 0;
-       cp->timeout = 3*HZ;
-
-       /* Bind its packet transmitter */
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6)
-               ip_vs_bind_xmit_v6(cp);
-       else
-#endif
-               ip_vs_bind_xmit(cp);
-
-       /* bind an application helper only if the protocol has any */
-       if (unlikely(pp && atomic_read(&pp->appcnt)))
-               ip_vs_bind_app(cp, pp);
-
-       /* Hash it in the ip_vs_conn_tab finally */
-       ip_vs_conn_hash(cp);
-
-       return cp;
-}
-
-
-/*
- *     /proc/net/ip_vs_conn entries
- */
-#ifdef CONFIG_PROC_FS
-
-/*
- *     Return the pos-th connection (0-based, counted across all
- *     buckets in table order), or NULL if the table holds fewer.
- *
- *     On a hit the function returns WITHOUT releasing the read lock
- *     taken for the entry's bucket, and seq->private is set to that
- *     bucket's list head so that the seq_file next/stop callbacks can
- *     release it.  On NULL no lock is held.
- */
-static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
-{
-       int idx;
-       struct ip_vs_conn *cp;
-
-       for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
-               ct_read_lock_bh(idx);
-               list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-                       if (pos-- == 0) {
-                               /* keep the lock; remember the bucket */
-                               seq->private = &ip_vs_conn_tab[idx];
-                               return cp;
-                       }
-               }
-               ct_read_unlock_bh(idx);
-       }
-
-       return NULL;
-}
-
-/*
- *     seq_file ->start(): position 0 yields the header token, any
- *     other position N yields connection N-1 (or NULL past the end).
- *     seq->private is reset first, so it is only non-NULL afterwards
- *     if ip_vs_conn_array() found an entry.
- */
-static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
-{
-       seq->private = NULL;
-
-       if (*pos == 0)
-               return SEQ_START_TOKEN;
-
-       return ip_vs_conn_array(seq, *pos - 1);
-}
-
-/*
- *     seq_file ->next(): advance *pos and return the entry after v.
- *
- *     seq->private points at the list head of the bucket whose read
- *     lock is currently held.  While the current bucket has more
- *     entries the lock is kept; at the end of a chain it is released
- *     and the following buckets are scanned, stopping (with that
- *     bucket's lock held) at the first non-empty one.  At the end of
- *     the table seq->private is cleared and NULL is returned with no
- *     lock held.
- */
-static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-       struct ip_vs_conn *cp = v;
-       struct list_head *e, *l = seq->private;
-       int idx;
-
-       ++*pos;
-       if (v == SEQ_START_TOKEN)
-               return ip_vs_conn_array(seq, 0);
-
-       /* more on same hash chain? */
-       if ((e = cp->c_list.next) != l)
-               return list_entry(e, struct ip_vs_conn, c_list);
-
-       /* bucket index recovered from the list head's table position */
-       idx = l - ip_vs_conn_tab;
-       ct_read_unlock_bh(idx);
-
-       while (++idx < IP_VS_CONN_TAB_SIZE) {
-               ct_read_lock_bh(idx);
-               /* loop body returns on the first entry, if there is one */
-               list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-                       seq->private = &ip_vs_conn_tab[idx];
-                       return cp;
-               }
-               ct_read_unlock_bh(idx);
-       }
-       seq->private = NULL;
-       return NULL;
-}
-
-/*
- *     seq_file ->stop(): release the bucket read lock left held by
- *     the start/next callbacks.  seq->private is NULL when no lock is
- *     held; otherwise it is the locked bucket's list head.
- */
-static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
-{
-       struct list_head *l = seq->private;
-
-       if (l)
-               ct_read_unlock_bh(l - ip_vs_conn_tab);
-}
-
-/*
- *     seq_file ->show() for the connection listing: prints the column
- *     header for SEQ_START_TOKEN, otherwise one line for the
- *     connection v (protocol, client/virtual/destination address and
- *     port in hex, state name and seconds until the timer expires).
- *
- *     The remaining time is clamped to 0 when the timer's expiry is
- *     not in the future; a plain unsigned subtraction would wrap
- *     around and print a huge value in that case.
- */
-static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
-{
-
-       if (v == SEQ_START_TOKEN)
-               seq_puts(seq,
-   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
-       else {
-               const struct ip_vs_conn *cp = v;
-               /* sample both once so the test and the subtraction agree */
-               unsigned long expires = cp->timer.expires;
-               unsigned long now = jiffies;
-               unsigned long secs_left;
-
-               secs_left = time_after(expires, now) ? (expires - now)/HZ : 0;
-
-#ifdef CONFIG_IP_VS_IPV6
-               if (cp->af == AF_INET6)
-                       seq_printf(seq,
-                               "%-3s " NIP6_FMT " %04X " NIP6_FMT
-                               " %04X " NIP6_FMT " %04X %-11s %7lu\n",
-                               ip_vs_proto_name(cp->protocol),
-                               NIP6(cp->caddr.in6), ntohs(cp->cport),
-                               NIP6(cp->vaddr.in6), ntohs(cp->vport),
-                               NIP6(cp->daddr.in6), ntohs(cp->dport),
-                               ip_vs_state_name(cp->protocol, cp->state),
-                               secs_left);
-               else
-#endif
-                       seq_printf(seq,
-                               "%-3s %08X %04X %08X %04X"
-                               " %08X %04X %-11s %7lu\n",
-                               ip_vs_proto_name(cp->protocol),
-                               ntohl(cp->caddr.ip), ntohs(cp->cport),
-                               ntohl(cp->vaddr.ip), ntohs(cp->vport),
-                               ntohl(cp->daddr.ip), ntohs(cp->dport),
-                               ip_vs_state_name(cp->protocol, cp->state),
-                               secs_left);
-       }
-       return 0;
-}
-
-/* seq_file iterator for the plain connection listing */
-static const struct seq_operations ip_vs_conn_seq_ops = {
-       .start = ip_vs_conn_seq_start,
-       .next  = ip_vs_conn_seq_next,
-       .stop  = ip_vs_conn_seq_stop,
-       .show  = ip_vs_conn_seq_show,
-};
-
-/* ->open(): attach the ip_vs_conn_seq_ops iterator to the file */
-static int ip_vs_conn_open(struct inode *inode, struct file *file)
-{
-       return seq_open(file, &ip_vs_conn_seq_ops);
-}
-
-/*
- * File operations for the plain connection listing; everything but
- * ->open is the generic seq_file implementation.
- */
-static const struct file_operations ip_vs_conn_fops = {
-       .owner   = THIS_MODULE,
-       .open    = ip_vs_conn_open,
-       .read    = seq_read,
-       .llseek  = seq_lseek,
-       .release = seq_release,
-};
-
-/*
- * Text for the "Origin" column: "SYNC" when IP_VS_CONN_F_SYNC is set
- * in flags, "LOCAL" otherwise.
- */
-static const char *ip_vs_origin_name(unsigned flags)
-{
-       return (flags & IP_VS_CONN_F_SYNC) ? "SYNC" : "LOCAL";
-}
-
-/*
- *     seq_file ->show() for the listing that includes the "Origin"
- *     column: same fields as ip_vs_conn_seq_show plus
- *     ip_vs_origin_name(cp->flags) before the expiry.
- *
- *     The remaining time is clamped to 0 when the timer's expiry is
- *     not in the future; a plain unsigned subtraction would wrap
- *     around and print a huge value in that case.
- */
-static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
-{
-
-       if (v == SEQ_START_TOKEN)
-               seq_puts(seq,
-   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");
-       else {
-               const struct ip_vs_conn *cp = v;
-               /* sample both once so the test and the subtraction agree */
-               unsigned long expires = cp->timer.expires;
-               unsigned long now = jiffies;
-               unsigned long secs_left;
-
-               secs_left = time_after(expires, now) ? (expires - now)/HZ : 0;
-
-#ifdef CONFIG_IP_VS_IPV6
-               if (cp->af == AF_INET6)
-                       seq_printf(seq,
-                               "%-3s " NIP6_FMT " %04X " NIP6_FMT
-                               " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n",
-                               ip_vs_proto_name(cp->protocol),
-                               NIP6(cp->caddr.in6), ntohs(cp->cport),
-                               NIP6(cp->vaddr.in6), ntohs(cp->vport),
-                               NIP6(cp->daddr.in6), ntohs(cp->dport),
-                               ip_vs_state_name(cp->protocol, cp->state),
-                               ip_vs_origin_name(cp->flags),
-                               secs_left);
-               else
-#endif
-                       seq_printf(seq,
-                               "%-3s %08X %04X %08X %04X "
-                               "%08X %04X %-11s %-6s %7lu\n",
-                               ip_vs_proto_name(cp->protocol),
-                               ntohl(cp->caddr.ip), ntohs(cp->cport),
-                               ntohl(cp->vaddr.ip), ntohs(cp->vport),
-                               ntohl(cp->daddr.ip), ntohs(cp->dport),
-                               ip_vs_state_name(cp->protocol, cp->state),
-                               ip_vs_origin_name(cp->flags),
-                               secs_left);
-       }
-       return 0;
-}
-
-/*
- * seq_file iterator for the listing with the Origin column: shares
- * start/next/stop with ip_vs_conn_seq_ops and differs only in ->show
- */
-static const struct seq_operations ip_vs_conn_sync_seq_ops = {
-       .start = ip_vs_conn_seq_start,
-       .next  = ip_vs_conn_seq_next,
-       .stop  = ip_vs_conn_seq_stop,
-       .show  = ip_vs_conn_sync_seq_show,
-};
-
-/* ->open(): attach the ip_vs_conn_sync_seq_ops iterator to the file */
-static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
-{
-       return seq_open(file, &ip_vs_conn_sync_seq_ops);
-}
-
-/*
- * File operations for the listing with the Origin column; everything
- * but ->open is the generic seq_file implementation.
- */
-static const struct file_operations ip_vs_conn_sync_fops = {
-       .owner   = THIS_MODULE,
-       .open    = ip_vs_conn_sync_open,
-       .read    = seq_read,
-       .llseek  = seq_lseek,
-       .release = seq_release,
-};
-
-#endif
-
-
-/*
- *      Randomly drop connection entries before running out of memory
- *
- *      Decides whether one connection should be dropped; returns 1 to
- *      drop, 0 to keep.  The decision is indexed by the connection's
- *      in_pkts count: with in_pkts == i in [1, 8] roughly one call in
- *      todrop_rate[i] answers 1; in_pkts == 0 (rate 0) or outside
- *      [0, 8] always answers 0.  The counters are static, so they are
- *      shared by all connections.
- */
-static inline int todrop_entry(struct ip_vs_conn *cp)
-{
-       /*
-        * The drop rate array needs tuning for real environments.
-        * Called from timer bh only => no locking
-        */
-       static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
-       static char todrop_counter[9] = {0};
-       int i;
-
-       /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
-          This will leave enough time for normal connection to get
-          through. */
-       if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
-               return 0;
-
-       /* Don't drop the entry if its number of incoming packets is not
-          located in [0, 8] */
-       i = atomic_read(&cp->in_pkts);
-       if (i > 8 || i < 0) return 0;
-
-       if (!todrop_rate[i]) return 0;
-       /* count down; only the call that reaches zero (or below) drops */
-       if (--todrop_counter[i] > 0) return 0;
-
-       todrop_counter[i] = todrop_rate[i];
-       return 1;
-}
-
-/*
- * Called from keventd and must protect itself from softirqs
- *
- * Visits IP_VS_CONN_TAB_SIZE/32 randomly chosen buckets (the same
- * bucket may be picked more than once) and expires selected
- * connections immediately:
- *   - templates are never selected;
- *   - TCP in SYN_RECV or SYNACK state is always selected;
- *   - TCP in ESTABLISHED state and all non-TCP connections are
- *     selected when todrop_entry() says so;
- *   - TCP in any other state is skipped.
- * A selected connection's controlling connection (cp->control), if
- * any, is expired as well.
- */
-void ip_vs_random_dropentry(void)
-{
-       int idx;
-       struct ip_vs_conn *cp;
-
-       /*
-        * Randomly scan 1/32 of the whole table every second
-        */
-       for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
-               unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
-
-               /*
-                *  Lock is actually needed in this loop.
-                */
-               ct_write_lock_bh(hash);
-
-               list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-                       if (cp->flags & IP_VS_CONN_F_TEMPLATE)
-                               /* connection template */
-                               continue;
-
-                       if (cp->protocol == IPPROTO_TCP) {
-                               switch(cp->state) {
-                               case IP_VS_TCP_S_SYN_RECV:
-                               case IP_VS_TCP_S_SYNACK:
-                                       /* leaves the switch: drop it */
-                                       break;
-
-                               case IP_VS_TCP_S_ESTABLISHED:
-                                       if (todrop_entry(cp))
-                                               break;
-                                       continue;
-
-                               default:
-                                       continue;
-                               }
-                       } else {
-                               if (!todrop_entry(cp))
-                                       continue;
-                       }
-
-                       IP_VS_DBG(4, "del connection\n");
-                       ip_vs_conn_expire_now(cp);
-                       if (cp->control) {
-                               IP_VS_DBG(4, "del conn template\n");
-                               ip_vs_conn_expire_now(cp->control);
-                       }
-               }
-               ct_write_unlock_bh(hash);
-       }
-}
-
-
-/*
- *      Flush all the connection entries in the ip_vs_conn_tab
- */
-static void ip_vs_conn_flush(void)
-{
-       int idx;
-       struct ip_vs_conn *cp;
-
-  flush_again:
-       for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
-               /*
-                *  Lock is actually needed in this loop.
-                */
-               ct_write_lock_bh(idx);
-
-               list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
-
-                       IP_VS_DBG(4, "del connection\n");
-                       ip_vs_conn_expire_now(cp);
-                       if (cp->control) {
-                               IP_VS_DBG(4, "del conn template\n");
-                               ip_vs_conn_expire_now(cp->control);
-                       }
-               }
-               ct_write_unlock_bh(idx);
-       }
-
-       /* the counter may be not NULL, because maybe some conn entries
-          are run by slow timer handler or unhashed but still referred */
-       if (atomic_read(&ip_vs_conn_count) != 0) {
-               schedule();
-               goto flush_again;
-       }
-}
-
-
-int __init ip_vs_conn_init(void)
-{
-       int idx;
-
-       /*
-        * Allocate the connection hash table and initialize its list heads
-        */
-       ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
-       if (!ip_vs_conn_tab)
-               return -ENOMEM;
-
-       /* Allocate ip_vs_conn slab cache */
-       ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
-                                             sizeof(struct ip_vs_conn), 0,
-                                             SLAB_HWCACHE_ALIGN, NULL);
-       if (!ip_vs_conn_cachep) {
-               vfree(ip_vs_conn_tab);
-               return -ENOMEM;
-       }
-
-       IP_VS_INFO("Connection hash table configured "
-                  "(size=%d, memory=%ldKbytes)\n",
-                  IP_VS_CONN_TAB_SIZE,
-                  (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
-       IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
-                 sizeof(struct ip_vs_conn));
-
-       for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
-               INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
-       }
-
-       for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
-               rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
-       }
-
-       proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
-       proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
-
-       /* calculate the random value for connection hash */
-       get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
-
-       return 0;
-}
-
-
-void ip_vs_conn_cleanup(void)
-{
-       /* flush all the connection entries first */
-       ip_vs_conn_flush();
-
-       /* Release the empty cache */
-       kmem_cache_destroy(ip_vs_conn_cachep);
-       proc_net_remove(&init_net, "ip_vs_conn");
-       proc_net_remove(&init_net, "ip_vs_conn_sync");
-       vfree(ip_vs_conn_tab);
-}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c

deleted file mode 100644 (file)

index 958abf3..0000000
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ /dev/null
@@ -1,1542 +0,0 @@
-/*
- * IPVS         An implementation of the IP virtual server support for the
- *              LINUX operating system.  IPVS is now implemented as a module
- *              over the Netfilter framework. IPVS can be used to build a
- *              high-performance and highly available server based on a
- *              cluster of servers.
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Peter Kese <peter.kese@ijs.si>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
- * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
- * and others.
- *
- * Changes:
- *     Paul `Rusty' Russell            properly handle non-linear skbs
- *     Harald Welte                    don't use nfcache
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/icmp.h>
-
-#include <net/ip.h>
-#include <net/tcp.h>
-#include <net/udp.h>
-#include <net/icmp.h>                   /* for icmp_send */
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#ifdef CONFIG_IP_VS_IPV6
-#include <net/ipv6.h>
-#include <linux/netfilter_ipv6.h>
-#endif
-
-#include <net/ip_vs.h>
-
-
-EXPORT_SYMBOL(register_ip_vs_scheduler);
-EXPORT_SYMBOL(unregister_ip_vs_scheduler);
-EXPORT_SYMBOL(ip_vs_skb_replace);
-EXPORT_SYMBOL(ip_vs_proto_name);
-EXPORT_SYMBOL(ip_vs_conn_new);
-EXPORT_SYMBOL(ip_vs_conn_in_get);
-EXPORT_SYMBOL(ip_vs_conn_out_get);
-#ifdef CONFIG_IP_VS_PROTO_TCP
-EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
-#endif
-EXPORT_SYMBOL(ip_vs_conn_put);
-#ifdef CONFIG_IP_VS_DEBUG
-EXPORT_SYMBOL(ip_vs_get_debug_level);
-#endif
-
-
-/* ID used in ICMP lookups */
-#define icmp_id(icmph)          (((icmph)->un).echo.id)
-#define icmpv6_id(icmph)        (icmph->icmp6_dataun.u_echo.identifier)
-
-const char *ip_vs_proto_name(unsigned proto)
-{
-       static char buf[20];
-
-       switch (proto) {
-       case IPPROTO_IP:
-               return "IP";
-       case IPPROTO_UDP:
-               return "UDP";
-       case IPPROTO_TCP:
-               return "TCP";
-       case IPPROTO_ICMP:
-               return "ICMP";
-#ifdef CONFIG_IP_VS_IPV6
-       case IPPROTO_ICMPV6:
-               return "ICMPv6";
-#endif
-       default:
-               sprintf(buf, "IP_%d", proto);
-               return buf;
-       }
-}
-
-void ip_vs_init_hash_table(struct list_head *table, int rows)
-{
-       while (--rows >= 0)
-               INIT_LIST_HEAD(&table[rows]);
-}
-
-static inline void
-ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
-{
-       struct ip_vs_dest *dest = cp->dest;
-       if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
-               spin_lock(&dest->stats.lock);
-               dest->stats.ustats.inpkts++;
-               dest->stats.ustats.inbytes += skb->len;
-               spin_unlock(&dest->stats.lock);
-
-               spin_lock(&dest->svc->stats.lock);
-               dest->svc->stats.ustats.inpkts++;
-               dest->svc->stats.ustats.inbytes += skb->len;
-               spin_unlock(&dest->svc->stats.lock);
-
-               spin_lock(&ip_vs_stats.lock);
-               ip_vs_stats.ustats.inpkts++;
-               ip_vs_stats.ustats.inbytes += skb->len;
-               spin_unlock(&ip_vs_stats.lock);
-       }
-}
-
-
-static inline void
-ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
-{
-       struct ip_vs_dest *dest = cp->dest;
-       if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
-               spin_lock(&dest->stats.lock);
-               dest->stats.ustats.outpkts++;
-               dest->stats.ustats.outbytes += skb->len;
-               spin_unlock(&dest->stats.lock);
-
-               spin_lock(&dest->svc->stats.lock);
-               dest->svc->stats.ustats.outpkts++;
-               dest->svc->stats.ustats.outbytes += skb->len;
-               spin_unlock(&dest->svc->stats.lock);
-
-               spin_lock(&ip_vs_stats.lock);
-               ip_vs_stats.ustats.outpkts++;
-               ip_vs_stats.ustats.outbytes += skb->len;
-               spin_unlock(&ip_vs_stats.lock);
-       }
-}
-
-
-static inline void
-ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
-{
-       spin_lock(&cp->dest->stats.lock);
-       cp->dest->stats.ustats.conns++;
-       spin_unlock(&cp->dest->stats.lock);
-
-       spin_lock(&svc->stats.lock);
-       svc->stats.ustats.conns++;
-       spin_unlock(&svc->stats.lock);
-
-       spin_lock(&ip_vs_stats.lock);
-       ip_vs_stats.ustats.conns++;
-       spin_unlock(&ip_vs_stats.lock);
-}
-
-
-static inline int
-ip_vs_set_state(struct ip_vs_conn *cp, int direction,
-               const struct sk_buff *skb,
-               struct ip_vs_protocol *pp)
-{
-       if (unlikely(!pp->state_transition))
-               return 0;
-       return pp->state_transition(cp, direction, skb, pp);
-}
-
-
-/*
- *  IPVS persistent scheduling function
- *  It creates a connection entry according to its template if exists,
- *  or selects a server and creates a connection entry plus a template.
- *  Locking: we are svc user (svc->refcnt), so we hold all dests too
- *  Protocols supported: TCP, UDP
- */
-static struct ip_vs_conn *
-ip_vs_sched_persist(struct ip_vs_service *svc,
-                   const struct sk_buff *skb,
-                   __be16 ports[2])
-{
-       struct ip_vs_conn *cp = NULL;
-       struct ip_vs_iphdr iph;
-       struct ip_vs_dest *dest;
-       struct ip_vs_conn *ct;
-       __be16  dport;                  /* destination port to forward */
-       union nf_inet_addr snet;        /* source network of the client,
-                                          after masking */
-
-       ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
-
-       /* Mask saddr with the netmask to adjust template granularity */
-#ifdef CONFIG_IP_VS_IPV6
-       if (svc->af == AF_INET6)
-               ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
-       else
-#endif
-               snet.ip = iph.saddr.ip & svc->netmask;
-
-       IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
-                     "mnet %s\n",
-                     IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
-                     IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
-                     IP_VS_DBG_ADDR(svc->af, &snet));
-
-       /*
-        * As far as we know, FTP is a very complicated network protocol, and
-        * it uses control connection and data connections. For active FTP,
-        * FTP server initialize data connection to the client, its source port
-        * is often 20. For passive FTP, FTP server tells the clients the port
-        * that it passively listens to,  and the client issues the data
-        * connection. In the tunneling or direct routing mode, the load
-        * balancer is on the client-to-server half of connection, the port
-        * number is unknown to the load balancer. So, a conn template like
-        * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
-        * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
-        * is created for other persistent services.
-        */
-       if (ports[1] == svc->port) {
-               /* Check if a template already exists */
-               if (svc->port != FTPPORT)
-                       ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-                                            &iph.daddr, ports[1]);
-               else
-                       ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-                                            &iph.daddr, 0);
-
-               if (!ct || !ip_vs_check_template(ct)) {
-                       /*
-                        * No template found or the dest of the connection
-                        * template is not available.
-                        */
-                       dest = svc->scheduler->schedule(svc, skb);
-                       if (dest == NULL) {
-                               IP_VS_DBG(1, "p-schedule: no dest found.\n");
-                               return NULL;
-                       }
-
-                       /*
-                        * Create a template like <protocol,caddr,0,
-                        * vaddr,vport,daddr,dport> for non-ftp service,
-                        * and <protocol,caddr,0,vaddr,0,daddr,0>
-                        * for ftp service.
-                        */
-                       if (svc->port != FTPPORT)
-                               ct = ip_vs_conn_new(svc->af, iph.protocol,
-                                                   &snet, 0,
-                                                   &iph.daddr,
-                                                   ports[1],
-                                                   &dest->addr, dest->port,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       else
-                               ct = ip_vs_conn_new(svc->af, iph.protocol,
-                                                   &snet, 0,
-                                                   &iph.daddr, 0,
-                                                   &dest->addr, 0,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       if (ct == NULL)
-                               return NULL;
-
-                       ct->timeout = svc->timeout;
-               } else {
-                       /* set destination with the found template */
-                       dest = ct->dest;
-               }
-               dport = dest->port;
-       } else {
-               /*
-                * Note: persistent fwmark-based services and persistent
-                * port zero service are handled here.
-                * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
-                * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
-                */
-               if (svc->fwmark) {
-                       union nf_inet_addr fwmark = {
-                               .all = { 0, 0, 0, htonl(svc->fwmark) }
-                       };
-
-                       ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
-                                            &fwmark, 0);
-               } else
-                       ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-                                            &iph.daddr, 0);
-
-               if (!ct || !ip_vs_check_template(ct)) {
-                       /*
-                        * If it is not persistent port zero, return NULL,
-                        * otherwise create a connection template.
-                        */
-                       if (svc->port)
-                               return NULL;
-
-                       dest = svc->scheduler->schedule(svc, skb);
-                       if (dest == NULL) {
-                               IP_VS_DBG(1, "p-schedule: no dest found.\n");
-                               return NULL;
-                       }
-
-                       /*
-                        * Create a template according to the service
-                        */
-                       if (svc->fwmark) {
-                               union nf_inet_addr fwmark = {
-                                       .all = { 0, 0, 0, htonl(svc->fwmark) }
-                               };
-
-                               ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
-                                                   &snet, 0,
-                                                   &fwmark, 0,
-                                                   &dest->addr, 0,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       } else
-                               ct = ip_vs_conn_new(svc->af, iph.protocol,
-                                                   &snet, 0,
-                                                   &iph.daddr, 0,
-                                                   &dest->addr, 0,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       if (ct == NULL)
-                               return NULL;
-
-                       ct->timeout = svc->timeout;
-               } else {
-                       /* set destination with the found template */
-                       dest = ct->dest;
-               }
-               dport = ports[1];
-       }
-
-       /*
-        *    Create a new connection according to the template
-        */
-       cp = ip_vs_conn_new(svc->af, iph.protocol,
-                           &iph.saddr, ports[0],
-                           &iph.daddr, ports[1],
-                           &dest->addr, dport,
-                           0,
-                           dest);
-       if (cp == NULL) {
-               ip_vs_conn_put(ct);
-               return NULL;
-       }
-
-       /*
-        *    Add its control
-        */
-       ip_vs_control_add(cp, ct);
-       ip_vs_conn_put(ct);
-
-       ip_vs_conn_stats(cp, svc);
-       return cp;
-}
-
-
-/*
- *  IPVS main scheduling function
- *  It selects a server according to the virtual service, and
- *  creates a connection entry.
- *  Protocols supported: TCP, UDP
- */
-struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-       struct ip_vs_conn *cp = NULL;
-       struct ip_vs_iphdr iph;
-       struct ip_vs_dest *dest;
-       __be16 _ports[2], *pptr;
-
-       ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
-       pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
-       if (pptr == NULL)
-               return NULL;
-
-       /*
-        *    Persistent service
-        */
-       if (svc->flags & IP_VS_SVC_F_PERSISTENT)
-               return ip_vs_sched_persist(svc, skb, pptr);
-
-       /*
-        *    Non-persistent service
-        */
-       if (!svc->fwmark && pptr[1] != svc->port) {
-               if (!svc->port)
-                       IP_VS_ERR("Schedule: port zero only supported "
-                                 "in persistent services, "
-                                 "check your ipvs configuration\n");
-               return NULL;
-       }
-
-       dest = svc->scheduler->schedule(svc, skb);
-       if (dest == NULL) {
-               IP_VS_DBG(1, "Schedule: no dest found.\n");
-               return NULL;
-       }
-
-       /*
-        *    Create a connection entry.
-        */
-       cp = ip_vs_conn_new(svc->af, iph.protocol,
-                           &iph.saddr, pptr[0],
-                           &iph.daddr, pptr[1],
-                           &dest->addr, dest->port ? dest->port : pptr[1],
-                           0,
-                           dest);
-       if (cp == NULL)
-               return NULL;
-
-       IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
-                     "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
-                     ip_vs_fwd_tag(cp),
-                     IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
-                     IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
-                     IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
-                     cp->flags, atomic_read(&cp->refcnt));
-
-       ip_vs_conn_stats(cp, svc);
-       return cp;
-}
-
-
-/*
- *  Pass or drop the packet.
- *  Called by ip_vs_in, when the virtual service is available but
- *  no destination is available for a new connection.
- */
-int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
-               struct ip_vs_protocol *pp)
-{
-       __be16 _ports[2], *pptr;
-       struct ip_vs_iphdr iph;
-       int unicast;
-       ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
-
-       pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
-       if (pptr == NULL) {
-               ip_vs_service_put(svc);
-               return NF_DROP;
-       }
-
-#ifdef CONFIG_IP_VS_IPV6
-       if (svc->af == AF_INET6)
-               unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
-       else
-#endif
-               unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
-
-       /* if it is fwmark-based service, the cache_bypass sysctl is up
-          and the destination is a non-local unicast, then create
-          a cache_bypass connection entry */
-       if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
-               int ret, cs;
-               struct ip_vs_conn *cp;
-               union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
-
-               ip_vs_service_put(svc);
-
-               /* create a new connection entry */
-               IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
-               cp = ip_vs_conn_new(svc->af, iph.protocol,
-                                   &iph.saddr, pptr[0],
-                                   &iph.daddr, pptr[1],
-                                   &daddr, 0,
-                                   IP_VS_CONN_F_BYPASS,
-                                   NULL);
-               if (cp == NULL)
-                       return NF_DROP;
-
-               /* statistics */
-               ip_vs_in_stats(cp, skb);
-
-               /* set state */
-               cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
-
-               /* transmit the first SYN packet */
-               ret = cp->packet_xmit(skb, cp, pp);
-               /* do not touch skb anymore */
-
-               atomic_inc(&cp->in_pkts);
-               ip_vs_conn_put(cp);
-               return ret;
-       }
-
-       /*
-        * When the virtual ftp service is presented, packets destined
-        * for other services on the VIP may get here (except services
-        * listed in the ipvs table), pass the packets, because it is
-        * not ipvs job to decide to drop the packets.
-        */
-       if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
-               ip_vs_service_put(svc);
-               return NF_ACCEPT;
-       }
-
-       ip_vs_service_put(svc);
-
-       /*
-        * Notify the client that the destination is unreachable, and
-        * release the socket buffer.
-        * Since it is in IP layer, the TCP socket is not actually
-        * created, the TCP RST packet cannot be sent, instead that
-        * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
-        */
-#ifdef CONFIG_IP_VS_IPV6
-       if (svc->af == AF_INET6)
-               icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0,
-                           skb->dev);
-       else
-#endif
-               icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-
-       return NF_DROP;
-}
-
-
-/*
- *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
- *      chain, and is used for VS/NAT.
- *      It detects packets for VS/NAT connections and sends the packets
- *      immediately. This can avoid that iptable_nat mangles the packets
- *      for VS/NAT.
- */
-static unsigned int ip_vs_post_routing(unsigned int hooknum,
-                                      struct sk_buff *skb,
-                                      const struct net_device *in,
-                                      const struct net_device *out,
-                                      int (*okfn)(struct sk_buff *))
-{
-       if (!skb->ipvs_property)
-               return NF_ACCEPT;
-       /* The packet was sent from IPVS, exit this chain */
-       return NF_STOP;
-}
-
-__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
-{
-       return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
-}
-
-static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
-{
-       int err = ip_defrag(skb, user);
-
-       if (!err)
-               ip_send_check(ip_hdr(skb));
-
-       return err;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
-{
-       /* TODO IPv6: Find out what to do here for IPv6 */
-       return 0;
-}
-#endif
-
-/*
- * Packet has been made sufficiently writable in caller
- * - inout: 1=in->out, 0=out->in
- */
-void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
-                   struct ip_vs_conn *cp, int inout)
-{
-       struct iphdr *iph        = ip_hdr(skb);
-       unsigned int icmp_offset = iph->ihl*4;
-       struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
-                                                     icmp_offset);
-       struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
-
-       if (inout) {
-               iph->saddr = cp->vaddr.ip;
-               ip_send_check(iph);
-               ciph->daddr = cp->vaddr.ip;
-               ip_send_check(ciph);
-       } else {
-               iph->daddr = cp->daddr.ip;
-               ip_send_check(iph);
-               ciph->saddr = cp->daddr.ip;
-               ip_send_check(ciph);
-       }
-
-       /* the TCP/UDP port */
-       if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
-               __be16 *ports = (void *)ciph + ciph->ihl*4;
-
-               if (inout)
-                       ports[1] = cp->vport;
-               else
-                       ports[0] = cp->dport;
-       }
-
-       /* And finally the ICMP checksum */
-       icmph->checksum = 0;
-       icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
-       skb->ip_summed = CHECKSUM_UNNECESSARY;
-
-       if (inout)
-               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-                       "Forwarding altered outgoing ICMP");
-       else
-               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-                       "Forwarding altered incoming ICMP");
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
-                   struct ip_vs_conn *cp, int inout)
-{
-       struct ipv6hdr *iph      = ipv6_hdr(skb);
-       unsigned int icmp_offset = sizeof(struct ipv6hdr);
-       struct icmp6hdr *icmph   = (struct icmp6hdr *)(skb_network_header(skb) +
-                                                     icmp_offset);
-       struct ipv6hdr *ciph     = (struct ipv6hdr *)(icmph + 1);
-
-       if (inout) {
-               iph->saddr = cp->vaddr.in6;
-               ciph->daddr = cp->vaddr.in6;
-       } else {
-               iph->daddr = cp->daddr.in6;
-               ciph->saddr = cp->daddr.in6;
-       }
-
-       /* the TCP/UDP port */
-       if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) {
-               __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
-
-               if (inout)
-                       ports[1] = cp->vport;
-               else
-                       ports[0] = cp->dport;
-       }
-
-       /* And finally the ICMP checksum */
-       icmph->icmp6_cksum = 0;
-       /* TODO IPv6: is this correct for ICMPv6? */
-       ip_vs_checksum_complete(skb, icmp_offset);
-       skb->ip_summed = CHECKSUM_UNNECESSARY;
-
-       if (inout)
-               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-                       "Forwarding altered outgoing ICMPv6");
-       else
-               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-                       "Forwarding altered incoming ICMPv6");
-}
-#endif
-
-/* Handle relevant response ICMP messages - forward to the right
- * destination host. Used for NAT and local client.
- */
-static int handle_response_icmp(int af, struct sk_buff *skb,
-                               union nf_inet_addr *snet,
-                               __u8 protocol, struct ip_vs_conn *cp,
-                               struct ip_vs_protocol *pp,
-                               unsigned int offset, unsigned int ihl)
-{
-       unsigned int verdict = NF_DROP;
-
-       if (IP_VS_FWD_METHOD(cp) != 0) {
-               IP_VS_ERR("shouldn't reach here, because the box is on the "
-                         "half connection in the tun/dr module.\n");
-       }
-
-       /* Ensure the checksum is correct */
-       if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
-               /* Failed checksum! */
-               IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
-                             IP_VS_DBG_ADDR(af, snet));
-               goto out;
-       }
-
-       if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol)
-               offset += 2 * sizeof(__u16);
-       if (!skb_make_writable(skb, offset))
-               goto out;
-
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6)
-               ip_vs_nat_icmp_v6(skb, pp, cp, 1);
-       else
-#endif
-               ip_vs_nat_icmp(skb, pp, cp, 1);
-
-       /* do the statistics and put it back */
-       ip_vs_out_stats(cp, skb);
-
-       skb->ipvs_property = 1;
-       verdict = NF_ACCEPT;
-
-out:
-       __ip_vs_conn_put(cp);
-
-       return verdict;
-}
-
-/*
- *     Handle ICMP messages in the inside-to-outside direction (outgoing).
- *     Find any that might be relevant, check against existing connections.
- *     Currently handles error types - unreachable, quench, ttl exceeded.
- */
-static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
-{
-       struct iphdr *iph;
-       struct icmphdr  _icmph, *ic;
-       struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
-       struct ip_vs_iphdr ciph;
-       struct ip_vs_conn *cp;
-       struct ip_vs_protocol *pp;
-       unsigned int offset, ihl;
-       union nf_inet_addr snet;
-
-       *related = 1;
-
-       /* reassemble IP fragments */
-       if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-               if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
-                       return NF_STOLEN;
-       }
-
-       iph = ip_hdr(skb);
-       offset = ihl = iph->ihl * 4;
-       ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
-       if (ic == NULL)
-               return NF_DROP;
-
-       IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
-                 ic->type, ntohs(icmp_id(ic)),
-                 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
-
-       /*
-        * Work through seeing if this is for us.
-        * These checks are supposed to be in an order that means easy
-        * things are checked first to speed up processing.... however
-        * this means that some packets will manage to get a long way
-        * down this stack and then be rejected, but that's life.
-        */
-       if ((ic->type != ICMP_DEST_UNREACH) &&
-           (ic->type != ICMP_SOURCE_QUENCH) &&
-           (ic->type != ICMP_TIME_EXCEEDED)) {
-               *related = 0;
-               return NF_ACCEPT;
-       }
-
-       /* Now find the contained IP header */
-       offset += sizeof(_icmph);
-       cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
-       if (cih == NULL)
-               return NF_ACCEPT; /* The packet looks wrong, ignore */
-
-       pp = ip_vs_proto_get(cih->protocol);
-       if (!pp)
-               return NF_ACCEPT;
-
-       /* Is the embedded protocol header present? */
-       if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
-                    pp->dont_defrag))
-               return NF_ACCEPT;
-
-       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
-
-       offset += cih->ihl * 4;
-
-       ip_vs_fill_iphdr(AF_INET, cih, &ciph);
-       /* The embedded headers contain source and dest in reverse order */
-       cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
-       if (!cp)
-               return NF_ACCEPT;
-
-       snet.ip = iph->saddr;
-       return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
-                                   pp, offset, ihl);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
-{
-       struct ipv6hdr *iph;
-       struct icmp6hdr _icmph, *ic;
-       struct ipv6hdr  _ciph, *cih;    /* The ip header contained
-                                          within the ICMP */
-       struct ip_vs_iphdr ciph;
-       struct ip_vs_conn *cp;
-       struct ip_vs_protocol *pp;
-       unsigned int offset;
-       union nf_inet_addr snet;
-
-       *related = 1;
-
-       /* reassemble IP fragments */
-       if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-               if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
-                       return NF_STOLEN;
-       }
-
-       iph = ipv6_hdr(skb);
-       offset = sizeof(struct ipv6hdr);
-       ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
-       if (ic == NULL)
-               return NF_DROP;
-
-       IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
-                 ic->icmp6_type, ntohs(icmpv6_id(ic)),
-                 NIP6(iph->saddr), NIP6(iph->daddr));
-
-       /*
-        * Work through seeing if this is for us.
-        * These checks are supposed to be in an order that means easy
-        * things are checked first to speed up processing.... however
-        * this means that some packets will manage to get a long way
-        * down this stack and then be rejected, but that's life.
-        */
-       if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
-           (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
-           (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
-               *related = 0;
-               return NF_ACCEPT;
-       }
-
-       /* Now find the contained IP header */
-       offset += sizeof(_icmph);
-       cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
-       if (cih == NULL)
-               return NF_ACCEPT; /* The packet looks wrong, ignore */
-
-       pp = ip_vs_proto_get(cih->nexthdr);
-       if (!pp)
-               return NF_ACCEPT;
-
-       /* Is the embedded protocol header present? */
-       /* TODO: we don't support fragmentation at the moment anyways */
-       if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
-               return NF_ACCEPT;
-
-       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
-
-       offset += sizeof(struct ipv6hdr);
-
-       ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
-       /* The embedded headers contain source and dest in reverse order */
-       cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
-       if (!cp)
-               return NF_ACCEPT;
-
-       ipv6_addr_copy(&snet.in6, &iph->saddr);
-       return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
-                                   pp, offset, sizeof(struct ipv6hdr));
-}
-#endif
-
-static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
-{
-       struct tcphdr _tcph, *th;
-
-       th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
-       if (th == NULL)
-               return 0;
-       return th->rst;
-}
-
-/* Handle response packets: rewrite addresses and send away...
- * Used for NAT and local client.
- */
-static unsigned int
-handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
-               struct ip_vs_conn *cp, int ihl)
-{
-       IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
-
-       if (!skb_make_writable(skb, ihl))
-               goto drop;
-
-       /* mangle the packet */
-       if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
-               goto drop;
-
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6)
-               ipv6_hdr(skb)->saddr = cp->vaddr.in6;
-       else
-#endif
-       {
-               ip_hdr(skb)->saddr = cp->vaddr.ip;
-               ip_send_check(ip_hdr(skb));
-       }
-
-       /* For policy routing, packets originating from this
-        * machine itself may be routed differently to packets
-        * passing through.  We want this packet to be routed as
-        * if it came from this machine itself.  So re-compute
-        * the routing information.
-        */
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6) {
-               if (ip6_route_me_harder(skb) != 0)
-                       goto drop;
-       } else
-#endif
-               if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
-                       goto drop;
-
-       IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
-
-       ip_vs_out_stats(cp, skb);
-       ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
-       ip_vs_conn_put(cp);
-
-       skb->ipvs_property = 1;
-
-       LeaveFunction(11);
-       return NF_ACCEPT;
-
-drop:
-       ip_vs_conn_put(cp);
-       kfree_skb(skb);
-       return NF_STOLEN;
-}
-
-/*
- *     It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
- *     Check if outgoing packet belongs to the established ip_vs_conn.
- */
-static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
-         const struct net_device *in, const struct net_device *out,
-         int (*okfn)(struct sk_buff *))
-{
-       struct ip_vs_iphdr iph;
-       struct ip_vs_protocol *pp;
-       struct ip_vs_conn *cp;
-       int af;
-
-       EnterFunction(11);
-
-       af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
-       if (skb->ipvs_property)
-               return NF_ACCEPT;
-
-       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6) {
-               if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-                       int related, verdict = ip_vs_out_icmp_v6(skb, &related);
-
-                       if (related)
-                               return verdict;
-                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-               }
-       } else
-#endif
-               if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-                       int related, verdict = ip_vs_out_icmp(skb, &related);
-
-                       if (related)
-                               return verdict;
-                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-               }
-
-       pp = ip_vs_proto_get(iph.protocol);
-       if (unlikely(!pp))
-               return NF_ACCEPT;
-
-       /* reassemble IP fragments */
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6) {
-               if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-                       int related, verdict = ip_vs_out_icmp_v6(skb, &related);
-
-                       if (related)
-                               return verdict;
-
-                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-               }
-       } else
-#endif
-               if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
-                            !pp->dont_defrag)) {
-                       if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
-                               return NF_STOLEN;
-
-                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-               }
-
-       /*
-        * Check if the packet belongs to an existing entry
-        */
-       cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
-
-       if (unlikely(!cp)) {
-               if (sysctl_ip_vs_nat_icmp_send &&
-                   (pp->protocol == IPPROTO_TCP ||
-                    pp->protocol == IPPROTO_UDP)) {
-                       __be16 _ports[2], *pptr;
-
-                       pptr = skb_header_pointer(skb, iph.len,
-                                                 sizeof(_ports), _ports);
-                       if (pptr == NULL)
-                               return NF_ACCEPT;       /* Not for me */
-                       if (ip_vs_lookup_real_service(af, iph.protocol,
-                                                     &iph.saddr,
-                                                     pptr[0])) {
-                               /*
-                                * Notify the real server: there is no
-                                * existing entry if it is not RST
-                                * packet or not TCP packet.
-                                */
-                               if (iph.protocol != IPPROTO_TCP
-                                   || !is_tcp_reset(skb, iph.len)) {
-#ifdef CONFIG_IP_VS_IPV6
-                                       if (af == AF_INET6)
-                                               icmpv6_send(skb,
-                                                           ICMPV6_DEST_UNREACH,
-                                                           ICMPV6_PORT_UNREACH,
-                                                           0, skb->dev);
-                                       else
-#endif
-                                               icmp_send(skb,
-                                                         ICMP_DEST_UNREACH,
-                                                         ICMP_PORT_UNREACH, 0);
-                                       return NF_DROP;
-                               }
-                       }
-               }
-               IP_VS_DBG_PKT(12, pp, skb, 0,
-                             "packet continues traversal as normal");
-               return NF_ACCEPT;
-       }
-
-       return handle_response(af, skb, pp, cp, iph.len);
-}
-
-
-/*
- *     Handle ICMP messages in the outside-to-inside direction (incoming).
- *     Find any that might be relevant, check against existing connections,
- *     forward to the right destination host if relevant.
- *     Currently handles error types - unreachable, quench, ttl exceeded.
- */
-static int
-ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
-{
-       struct iphdr *iph;
-       struct icmphdr  _icmph, *ic;
-       struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
-       struct ip_vs_iphdr ciph;
-       struct ip_vs_conn *cp;
-       struct ip_vs_protocol *pp;
-       unsigned int offset, ihl, verdict;
-       union nf_inet_addr snet;
-
-       *related = 1;
-
-       /* reassemble IP fragments */
-       if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-               if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
-                                           IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
-                       return NF_STOLEN;
-       }
-
-       iph = ip_hdr(skb);
-       offset = ihl = iph->ihl * 4;
-       ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
-       if (ic == NULL)
-               return NF_DROP;
-
-       IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
-                 ic->type, ntohs(icmp_id(ic)),
-                 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
-
-       /*
-        * Work through seeing if this is for us.
-        * These checks are supposed to be in an order that means easy
-        * things are checked first to speed up processing.... however
-        * this means that some packets will manage to get a long way
-        * down this stack and then be rejected, but that's life.
-        */
-       if ((ic->type != ICMP_DEST_UNREACH) &&
-           (ic->type != ICMP_SOURCE_QUENCH) &&
-           (ic->type != ICMP_TIME_EXCEEDED)) {
-               *related = 0;
-               return NF_ACCEPT;
-       }
-
-       /* Now find the contained IP header */
-       offset += sizeof(_icmph);
-       cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
-       if (cih == NULL)
-               return NF_ACCEPT; /* The packet looks wrong, ignore */
-
-       pp = ip_vs_proto_get(cih->protocol);
-       if (!pp)
-               return NF_ACCEPT;
-
-       /* Is the embedded protocol header present? */
-       if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
-                    pp->dont_defrag))
-               return NF_ACCEPT;
-
-       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
-
-       offset += cih->ihl * 4;
-
-       ip_vs_fill_iphdr(AF_INET, cih, &ciph);
-       /* The embedded headers contain source and dest in reverse order */
-       cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
-       if (!cp) {
-               /* The packet could also belong to a local client */
-               cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
-               if (cp) {
-                       snet.ip = iph->saddr;
-                       return handle_response_icmp(AF_INET, skb, &snet,
-                                                   cih->protocol, cp, pp,
-                                                   offset, ihl);
-               }
-               return NF_ACCEPT;
-       }
-
-       verdict = NF_DROP;
-
-       /* Ensure the checksum is correct */
-       if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
-               /* Failed checksum! */
-               IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
-                         NIPQUAD(iph->saddr));
-               goto out;
-       }
-
-       /* do the statistics and put it back */
-       ip_vs_in_stats(cp, skb);
-       if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
-               offset += 2 * sizeof(__u16);
-       verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
-       /* do not touch skb anymore */
-
-  out:
-       __ip_vs_conn_put(cp);
-
-       return verdict;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static int
-ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
-{
-       struct ipv6hdr *iph;
-       struct icmp6hdr _icmph, *ic;
-       struct ipv6hdr  _ciph, *cih;    /* The ip header contained
-                                          within the ICMP */
-       struct ip_vs_iphdr ciph;
-       struct ip_vs_conn *cp;
-       struct ip_vs_protocol *pp;
-       unsigned int offset, verdict;
-       union nf_inet_addr snet;
-
-       *related = 1;
-
-       /* reassemble IP fragments */
-       if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-               if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
-                                              IP_DEFRAG_VS_IN :
-                                              IP_DEFRAG_VS_FWD))
-                       return NF_STOLEN;
-       }
-
-       iph = ipv6_hdr(skb);
-       offset = sizeof(struct ipv6hdr);
-       ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
-       if (ic == NULL)
-               return NF_DROP;
-
-       IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
-                 ic->icmp6_type, ntohs(icmpv6_id(ic)),
-                 NIP6(iph->saddr), NIP6(iph->daddr));
-
-       /*
-        * Work through seeing if this is for us.
-        * These checks are supposed to be in an order that means easy
-        * things are checked first to speed up processing.... however
-        * this means that some packets will manage to get a long way
-        * down this stack and then be rejected, but that's life.
-        */
-       if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
-           (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
-           (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
-               *related = 0;
-               return NF_ACCEPT;
-       }
-
-       /* Now find the contained IP header */
-       offset += sizeof(_icmph);
-       cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
-       if (cih == NULL)
-               return NF_ACCEPT; /* The packet looks wrong, ignore */
-
-       pp = ip_vs_proto_get(cih->nexthdr);
-       if (!pp)
-               return NF_ACCEPT;
-
-       /* Is the embedded protocol header present? */
-       /* TODO: we don't support fragmentation at the moment anyways */
-       if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
-               return NF_ACCEPT;
-
-       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
-
-       offset += sizeof(struct ipv6hdr);
-
-       ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
-       /* The embedded headers contain source and dest in reverse order */
-       cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
-       if (!cp) {
-               /* The packet could also belong to a local client */
-               cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
-               if (cp) {
-                       ipv6_addr_copy(&snet.in6, &iph->saddr);
-                       return handle_response_icmp(AF_INET6, skb, &snet,
-                                                   cih->nexthdr,
-                                                   cp, pp, offset,
-                                                   sizeof(struct ipv6hdr));
-               }
-               return NF_ACCEPT;
-       }
-
-       verdict = NF_DROP;
-
-       /* do the statistics and put it back */
-       ip_vs_in_stats(cp, skb);
-       if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
-               offset += 2 * sizeof(__u16);
-       verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
-       /* do not touch skb anymore */
-
-       __ip_vs_conn_put(cp);
-
-       return verdict;
-}
-#endif
-
-
-/*
- *     Check if it's for virtual services, look it up,
- *     and send it on its way...
- */
-static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
-        const struct net_device *in, const struct net_device *out,
-        int (*okfn)(struct sk_buff *))
-{
-       struct ip_vs_iphdr iph;
-       struct ip_vs_protocol *pp;
-       struct ip_vs_conn *cp;
-       int ret, restart, af;
-
-       af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
-       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
-       /*
-        *      Big tappo: only PACKET_HOST, including loopback for local client
-        *      Don't handle local packets on IPv6 for now
-        */
-       if (unlikely(skb->pkt_type != PACKET_HOST)) {
-               IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
-                             skb->pkt_type,
-                             iph.protocol,
-                             IP_VS_DBG_ADDR(af, &iph.daddr));
-               return NF_ACCEPT;
-       }
-
-       if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-               int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
-
-               if (related)
-                       return verdict;
-               ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-       }
-
-       /* Protocol supported? */
-       pp = ip_vs_proto_get(iph.protocol);
-       if (unlikely(!pp))
-               return NF_ACCEPT;
-
-       /*
-        * Check if the packet belongs to an existing connection entry
-        */
-       cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
-
-       if (unlikely(!cp)) {
-               int v;
-
-               /* For local client packets, it could be a response */
-               cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
-               if (cp)
-                       return handle_response(af, skb, pp, cp, iph.len);
-
-               if (!pp->conn_schedule(af, skb, pp, &v, &cp))
-                       return v;
-       }
-
-       if (unlikely(!cp)) {
-               /* sorry, all this trouble for a no-hit :) */
-               IP_VS_DBG_PKT(12, pp, skb, 0,
-                             "packet continues traversal as normal");
-               return NF_ACCEPT;
-       }
-
-       IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
-
-       /* Check the server status */
-       if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
-               /* the destination server is not available */
-
-               if (sysctl_ip_vs_expire_nodest_conn) {
-                       /* try to expire the connection immediately */
-                       ip_vs_conn_expire_now(cp);
-               }
-               /* don't restart its timer, and silently
-                  drop the packet. */
-               __ip_vs_conn_put(cp);
-               return NF_DROP;
-       }
-
-       ip_vs_in_stats(cp, skb);
-       restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
-       if (cp->packet_xmit)
-               ret = cp->packet_xmit(skb, cp, pp);
-               /* do not touch skb anymore */
-       else {
-               IP_VS_DBG_RL("warning: packet_xmit is null");
-               ret = NF_ACCEPT;
-       }
-
-       /* Increase its packet counter and check if it is needed
-        * to be synchronized
-        *
-        * Sync connection if it is about to close to
-        * encorage the standby servers to update the connections timeout
-        */
-       atomic_inc(&cp->in_pkts);
-       if (af == AF_INET &&
-           (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
-           (((cp->protocol != IPPROTO_TCP ||
-              cp->state == IP_VS_TCP_S_ESTABLISHED) &&
-             (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
-              == sysctl_ip_vs_sync_threshold[0])) ||
-            ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
-             ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
-              (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
-              (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
-               ip_vs_sync_conn(cp);
-       cp->old_state = cp->state;
-
-       ip_vs_conn_put(cp);
-       return ret;
-}
-
-
-/*
- *     It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
- *      related packets destined for 0.0.0.0/0.
- *      When fwmark-based virtual service is used, such as transparent
- *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
- *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
- *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
- *      and send them to ip_vs_in_icmp.
- */
-static unsigned int
-ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
-                  const struct net_device *in, const struct net_device *out,
-                  int (*okfn)(struct sk_buff *))
-{
-       int r;
-
-       if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
-               return NF_ACCEPT;
-
-       return ip_vs_in_icmp(skb, &r, hooknum);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static unsigned int
-ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
-                     const struct net_device *in, const struct net_device *out,
-                     int (*okfn)(struct sk_buff *))
-{
-       int r;
-
-       if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
-               return NF_ACCEPT;
-
-       return ip_vs_in_icmp_v6(skb, &r, hooknum);
-}
-#endif
-
-
-static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
-       /* After packet filtering, forward packet through VS/DR, VS/TUN,
-        * or VS/NAT(change destination), so that filtering rules can be
-        * applied to IPVS. */
-       {
-               .hook           = ip_vs_in,
-               .owner          = THIS_MODULE,
-               .pf             = PF_INET,
-               .hooknum        = NF_INET_LOCAL_IN,
-               .priority       = 100,
-       },
-       /* After packet filtering, change source only for VS/NAT */
-       {
-               .hook           = ip_vs_out,
-               .owner          = THIS_MODULE,
-               .pf             = PF_INET,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 100,
-       },
-       /* After packet filtering (but before ip_vs_out_icmp), catch icmp
-        * destined for 0.0.0.0/0, which is for incoming IPVS connections */
-       {
-               .hook           = ip_vs_forward_icmp,
-               .owner          = THIS_MODULE,
-               .pf             = PF_INET,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 99,
-       },
-       /* Before the netfilter connection tracking, exit from POST_ROUTING */
-       {
-               .hook           = ip_vs_post_routing,
-               .owner          = THIS_MODULE,
-               .pf             = PF_INET,
-               .hooknum        = NF_INET_POST_ROUTING,
-               .priority       = NF_IP_PRI_NAT_SRC-1,
-       },
-#ifdef CONFIG_IP_VS_IPV6
-       /* After packet filtering, forward packet through VS/DR, VS/TUN,
-        * or VS/NAT(change destination), so that filtering rules can be
-        * applied to IPVS. */
-       {
-               .hook           = ip_vs_in,
-               .owner          = THIS_MODULE,
-               .pf             = PF_INET6,
-               .hooknum        = NF_INET_LOCAL_IN,
-               .priority       = 100,
-       },
-       /* After packet filtering, change source only for VS/NAT */
-       {
-               .hook           = ip_vs_out,
-               .owner          = THIS_MODULE,
-               .pf             = PF_INET6,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 100,
-       },
-       /* After packet filtering (but before ip_vs_out_icmp), catch icmp
-        * destined for 0.0.0.0/0, which is for incoming IPVS connections */
-       {
-               .hook           = ip_vs_forward_icmp_v6,
-               .owner          = THIS_MODULE,
-               .pf             = PF_INET6,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 99,
-       },
-       /* Before the netfilter connection tracking, exit from POST_ROUTING */
-       {
-               .hook           = ip_vs_post_routing,
-               .owner          = THIS_MODULE,
-               .pf             = PF_INET6,
-               .hooknum        = NF_INET_POST_ROUTING,
-               .priority       = NF_IP6_PRI_NAT_SRC-1,
-       },
-#endif
-};
-
-
-/*
- *     Initialize IP Virtual Server
- */
-static int __init ip_vs_init(void)
-{
-       int ret;
-
-       ip_vs_estimator_init();
-
-       ret = ip_vs_control_init();
-       if (ret < 0) {
-               IP_VS_ERR("can't setup control.\n");
-               goto cleanup_estimator;
-       }
-
-       ip_vs_protocol_init();
-
-       ret = ip_vs_app_init();
-       if (ret < 0) {
-               IP_VS_ERR("can't setup application helper.\n");
-               goto cleanup_protocol;
-       }
-
-       ret = ip_vs_conn_init();
-       if (ret < 0) {
-               IP_VS_ERR("can't setup connection table.\n");
-               goto cleanup_app;
-       }
-
-       ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
-       if (ret < 0) {
-               IP_VS_ERR("can't register hooks.\n");
-               goto cleanup_conn;
-       }
-
-       IP_VS_INFO("ipvs loaded.\n");
-       return ret;
-
-  cleanup_conn:
-       ip_vs_conn_cleanup();
-  cleanup_app:
-       ip_vs_app_cleanup();
-  cleanup_protocol:
-       ip_vs_protocol_cleanup();
-       ip_vs_control_cleanup();
-  cleanup_estimator:
-       ip_vs_estimator_cleanup();
-       return ret;
-}
-
-static void __exit ip_vs_cleanup(void)
-{
-       nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
-       ip_vs_conn_cleanup();
-       ip_vs_app_cleanup();
-       ip_vs_protocol_cleanup();
-       ip_vs_control_cleanup();
-       ip_vs_estimator_cleanup();
-       IP_VS_INFO("ipvs unloaded.\n");
-}
-
-module_init(ip_vs_init);
-module_exit(ip_vs_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c

deleted file mode 100644 (file)

index 0302cf3..0000000
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ /dev/null
@@ -1,3443 +0,0 @@
-/*
- * IPVS         An implementation of the IP virtual server support for the
- *              LINUX operating system.  IPVS is now implemented as a module
- *              over the NetFilter framework. IPVS can be used to build a
- *              high-performance and highly available server based on a
- *              cluster of servers.
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Peter Kese <peter.kese@ijs.si>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/workqueue.h>
-#include <linux/swap.h>
-#include <linux/seq_file.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/mutex.h>
-
-#include <net/net_namespace.h>
-#include <net/ip.h>
-#ifdef CONFIG_IP_VS_IPV6
-#include <net/ipv6.h>
-#include <net/ip6_route.h>
-#endif
-#include <net/route.h>
-#include <net/sock.h>
-#include <net/genetlink.h>
-
-#include <asm/uaccess.h>
-
-#include <net/ip_vs.h>
-
-/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
-static DEFINE_MUTEX(__ip_vs_mutex);
-
-/* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_svc_lock);
-
-/* lock for table with the real services */
-static DEFINE_RWLOCK(__ip_vs_rs_lock);
-
-/* lock for state and timeout tables */
-static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
-
-/* lock for drop entry handling */
-static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
-
-/* lock for drop packet handling */
-static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
-
-/* 1/rate drop and drop-entry variables */
-int ip_vs_drop_rate = 0;
-int ip_vs_drop_counter = 0;
-static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
-
-/* number of virtual services */
-static int ip_vs_num_services = 0;
-
-/* sysctl variables */
-static int sysctl_ip_vs_drop_entry = 0;
-static int sysctl_ip_vs_drop_packet = 0;
-static int sysctl_ip_vs_secure_tcp = 0;
-static int sysctl_ip_vs_amemthresh = 1024;
-static int sysctl_ip_vs_am_droprate = 10;
-int sysctl_ip_vs_cache_bypass = 0;
-int sysctl_ip_vs_expire_nodest_conn = 0;
-int sysctl_ip_vs_expire_quiescent_template = 0;
-int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
-int sysctl_ip_vs_nat_icmp_send = 0;
-
-
-#ifdef CONFIG_IP_VS_DEBUG
-static int sysctl_ip_vs_debug_level = 0;
-
-int ip_vs_get_debug_level(void)
-{
-       return sysctl_ip_vs_debug_level;
-}
-#endif
-
-#ifdef CONFIG_IP_VS_IPV6
-/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
-static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
-{
-       struct rt6_info *rt;
-       struct flowi fl = {
-               .oif = 0,
-               .nl_u = {
-                       .ip6_u = {
-                               .daddr = *addr,
-                               .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
-       };
-
-       rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-       if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
-                       return 1;
-
-       return 0;
-}
-#endif
-/*
- *     update_defense_level is called from keventd and from sysctl,
- *     so it needs to protect itself from softirqs
- */
-static void update_defense_level(void)
-{
-       struct sysinfo i;
-       static int old_secure_tcp = 0;
-       int availmem;
-       int nomem;
-       int to_change = -1;
-
-       /* we only count free and buffered memory (in pages) */
-       si_meminfo(&i);
-       availmem = i.freeram + i.bufferram;
-       /* however in linux 2.5 the i.bufferram is total page cache size,
-          we need adjust it */
-       /* si_swapinfo(&i); */
-       /* availmem = availmem - (i.totalswap - i.freeswap); */
-
-       nomem = (availmem < sysctl_ip_vs_amemthresh);
-
-       local_bh_disable();
-
-       /* drop_entry */
-       spin_lock(&__ip_vs_dropentry_lock);
-       switch (sysctl_ip_vs_drop_entry) {
-       case 0:
-               atomic_set(&ip_vs_dropentry, 0);
-               break;
-       case 1:
-               if (nomem) {
-                       atomic_set(&ip_vs_dropentry, 1);
-                       sysctl_ip_vs_drop_entry = 2;
-               } else {
-                       atomic_set(&ip_vs_dropentry, 0);
-               }
-               break;
-       case 2:
-               if (nomem) {
-                       atomic_set(&ip_vs_dropentry, 1);
-               } else {
-                       atomic_set(&ip_vs_dropentry, 0);
-                       sysctl_ip_vs_drop_entry = 1;
-               };
-               break;
-       case 3:
-               atomic_set(&ip_vs_dropentry, 1);
-               break;
-       }
-       spin_unlock(&__ip_vs_dropentry_lock);
-
-       /* drop_packet */
-       spin_lock(&__ip_vs_droppacket_lock);
-       switch (sysctl_ip_vs_drop_packet) {
-       case 0:
-               ip_vs_drop_rate = 0;
-               break;
-       case 1:
-               if (nomem) {
-                       ip_vs_drop_rate = ip_vs_drop_counter
-                               = sysctl_ip_vs_amemthresh /
-                               (sysctl_ip_vs_amemthresh-availmem);
-                       sysctl_ip_vs_drop_packet = 2;
-               } else {
-                       ip_vs_drop_rate = 0;
-               }
-               break;
-       case 2:
-               if (nomem) {
-                       ip_vs_drop_rate = ip_vs_drop_counter
-                               = sysctl_ip_vs_amemthresh /
-                               (sysctl_ip_vs_amemthresh-availmem);
-               } else {
-                       ip_vs_drop_rate = 0;
-                       sysctl_ip_vs_drop_packet = 1;
-               }
-               break;
-       case 3:
-               ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
-               break;
-       }
-       spin_unlock(&__ip_vs_droppacket_lock);
-
-       /* secure_tcp */
-       write_lock(&__ip_vs_securetcp_lock);
-       switch (sysctl_ip_vs_secure_tcp) {
-       case 0:
-               if (old_secure_tcp >= 2)
-                       to_change = 0;
-               break;
-       case 1:
-               if (nomem) {
-                       if (old_secure_tcp < 2)
-                               to_change = 1;
-                       sysctl_ip_vs_secure_tcp = 2;
-               } else {
-                       if (old_secure_tcp >= 2)
-                               to_change = 0;
-               }
-               break;
-       case 2:
-               if (nomem) {
-                       if (old_secure_tcp < 2)
-                               to_change = 1;
-               } else {
-                       if (old_secure_tcp >= 2)
-                               to_change = 0;
-                       sysctl_ip_vs_secure_tcp = 1;
-               }
-               break;
-       case 3:
-               if (old_secure_tcp < 2)
-                       to_change = 1;
-               break;
-       }
-       old_secure_tcp = sysctl_ip_vs_secure_tcp;
-       if (to_change >= 0)
-               ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
-       write_unlock(&__ip_vs_securetcp_lock);
-
-       local_bh_enable();
-}
-
-
-/*
- *     Timer for checking the defense
- */
-#define DEFENSE_TIMER_PERIOD   1*HZ
-static void defense_work_handler(struct work_struct *work);
-static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
-
-static void defense_work_handler(struct work_struct *work)
-{
-       update_defense_level();
-       if (atomic_read(&ip_vs_dropentry))
-               ip_vs_random_dropentry();
-
-       schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
-}
-
-int
-ip_vs_use_count_inc(void)
-{
-       return try_module_get(THIS_MODULE);
-}
-
-void
-ip_vs_use_count_dec(void)
-{
-       module_put(THIS_MODULE);
-}
-
-
-/*
- *     Hash table: for virtual service lookups
- */
-#define IP_VS_SVC_TAB_BITS 8
-#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
-#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
-
-/* the service table hashed by <protocol, addr, port> */
-static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
-/* the service table hashed by fwmark */
-static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
-
-/*
- *     Hash table: for real service lookups
- */
-#define IP_VS_RTAB_BITS 4
-#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
-#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
-
-static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
-
-/*
- *     Trash for destinations
- */
-static LIST_HEAD(ip_vs_dest_trash);
-
-/*
- *     FTP & NULL virtual service counters
- */
-static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
-static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
-
-
-/*
- *     Returns hash value for virtual service
- */
-static __inline__ unsigned
-ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
-                 __be16 port)
-{
-       register unsigned porth = ntohs(port);
-       __be32 addr_fold = addr->ip;
-
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6)
-               addr_fold = addr->ip6[0]^addr->ip6[1]^
-                           addr->ip6[2]^addr->ip6[3];
-#endif
-
-       return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
-               & IP_VS_SVC_TAB_MASK;
-}
-
-/*
- *     Returns hash value of fwmark for virtual service lookup
- */
-static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
-{
-       return fwmark & IP_VS_SVC_TAB_MASK;
-}
-
-/*
- *     Hashes a service in the ip_vs_svc_table by <proto,addr,port>
- *     or in the ip_vs_svc_fwm_table by fwmark.
- *     Should be called with locked tables.
- */
-static int ip_vs_svc_hash(struct ip_vs_service *svc)
-{
-       unsigned hash;
-
-       if (svc->flags & IP_VS_SVC_F_HASHED) {
-               IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
-                         "called from %p\n", __builtin_return_address(0));
-               return 0;
-       }
-
-       if (svc->fwmark == 0) {
-               /*
-                *  Hash it by <protocol,addr,port> in ip_vs_svc_table
-                */
-               hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr,
-                                        svc->port);
-               list_add(&svc->s_list, &ip_vs_svc_table[hash]);
-       } else {
-               /*
-                *  Hash it by fwmark in ip_vs_svc_fwm_table
-                */
-               hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
-               list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
-       }
-
-       svc->flags |= IP_VS_SVC_F_HASHED;
-       /* increase its refcnt because it is referenced by the svc table */
-       atomic_inc(&svc->refcnt);
-       return 1;
-}
-
-
-/*
- *     Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
- *     Should be called with locked tables.
- */
-static int ip_vs_svc_unhash(struct ip_vs_service *svc)
-{
-       if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
-               IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
-                         "called from %p\n", __builtin_return_address(0));
-               return 0;
-       }
-
-       if (svc->fwmark == 0) {
-               /* Remove it from the ip_vs_svc_table table */
-               list_del(&svc->s_list);
-       } else {
-               /* Remove it from the ip_vs_svc_fwm_table table */
-               list_del(&svc->f_list);
-       }
-
-       svc->flags &= ~IP_VS_SVC_F_HASHED;
-       atomic_dec(&svc->refcnt);
-       return 1;
-}
-
-
-/*
- *     Get service by {proto,addr,port} in the service table.
- */
-static inline struct ip_vs_service *
-__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
-                   __be16 vport)
-{
-       unsigned hash;
-       struct ip_vs_service *svc;
-
-       /* Check for "full" addressed entries */
-       hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport);
-
-       list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
-               if ((svc->af == af)
-                   && ip_vs_addr_equal(af, &svc->addr, vaddr)
-                   && (svc->port == vport)
-                   && (svc->protocol == protocol)) {
-                       /* HIT */
-                       atomic_inc(&svc->usecnt);
-                       return svc;
-               }
-       }
-
-       return NULL;
-}
-
-
-/*
- *     Get service by {fwmark} in the service table.
- */
-static inline struct ip_vs_service *
-__ip_vs_svc_fwm_get(int af, __u32 fwmark)
-{
-       unsigned hash;
-       struct ip_vs_service *svc;
-
-       /* Check for fwmark addressed entries */
-       hash = ip_vs_svc_fwm_hashkey(fwmark);
-
-       list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
-               if (svc->fwmark == fwmark && svc->af == af) {
-                       /* HIT */
-                       atomic_inc(&svc->usecnt);
-                       return svc;
-               }
-       }
-
-       return NULL;
-}
-
-struct ip_vs_service *
-ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
-                 const union nf_inet_addr *vaddr, __be16 vport)
-{
-       struct ip_vs_service *svc;
-
-       read_lock(&__ip_vs_svc_lock);
-
-       /*
-        *      Check the table hashed by fwmark first
-        */
-       if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
-               goto out;
-
-       /*
-        *      Check the table hashed by <protocol,addr,port>
-        *      for "full" addressed entries
-        */
-       svc = __ip_vs_service_get(af, protocol, vaddr, vport);
-
-       if (svc == NULL
-           && protocol == IPPROTO_TCP
-           && atomic_read(&ip_vs_ftpsvc_counter)
-           && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
-               /*
-                * Check if ftp service entry exists, the packet
-                * might belong to FTP data connections.
-                */
-               svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
-       }
-
-       if (svc == NULL
-           && atomic_read(&ip_vs_nullsvc_counter)) {
-               /*
-                * Check if the catch-all port (port zero) exists
-                */
-               svc = __ip_vs_service_get(af, protocol, vaddr, 0);
-       }
-
-  out:
-       read_unlock(&__ip_vs_svc_lock);
-
-       IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
-                     fwmark, ip_vs_proto_name(protocol),
-                     IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
-                     svc ? "hit" : "not hit");
-
-       return svc;
-}
-
-
-static inline void
-__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
-{
-       atomic_inc(&svc->refcnt);
-       dest->svc = svc;
-}
-
-static inline void
-__ip_vs_unbind_svc(struct ip_vs_dest *dest)
-{
-       struct ip_vs_service *svc = dest->svc;
-
-       dest->svc = NULL;
-       if (atomic_dec_and_test(&svc->refcnt))
-               kfree(svc);
-}
-
-
-/*
- *     Returns hash value for real service
- */
-static inline unsigned ip_vs_rs_hashkey(int af,
-                                           const union nf_inet_addr *addr,
-                                           __be16 port)
-{
-       register unsigned porth = ntohs(port);
-       __be32 addr_fold = addr->ip;
-
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6)
-               addr_fold = addr->ip6[0]^addr->ip6[1]^
-                           addr->ip6[2]^addr->ip6[3];
-#endif
-
-       return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
-               & IP_VS_RTAB_MASK;
-}
-
-/*
- *     Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
- *     should be called with locked tables.
- */
-static int ip_vs_rs_hash(struct ip_vs_dest *dest)
-{
-       unsigned hash;
-
-       if (!list_empty(&dest->d_list)) {
-               return 0;
-       }
-
-       /*
-        *      Hash by proto,addr,port,
-        *      which are the parameters of the real service.
-        */
-       hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
-
-       list_add(&dest->d_list, &ip_vs_rtable[hash]);
-
-       return 1;
-}
-
-/*
- *     UNhashes ip_vs_dest from ip_vs_rtable.
- *     should be called with locked tables.
- */
-static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
-{
-       /*
-        * Remove it from the ip_vs_rtable table.
-        */
-       if (!list_empty(&dest->d_list)) {
-               list_del(&dest->d_list);
-               INIT_LIST_HEAD(&dest->d_list);
-       }
-
-       return 1;
-}
-
-/*
- *     Lookup real service by <proto,addr,port> in the real service table.
- */
-struct ip_vs_dest *
-ip_vs_lookup_real_service(int af, __u16 protocol,
-                         const union nf_inet_addr *daddr,
-                         __be16 dport)
-{
-       unsigned hash;
-       struct ip_vs_dest *dest;
-
-       /*
-        *      Check for "full" addressed entries
-        *      Return the first found entry
-        */
-       hash = ip_vs_rs_hashkey(af, daddr, dport);
-
-       read_lock(&__ip_vs_rs_lock);
-       list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
-               if ((dest->af == af)
-                   && ip_vs_addr_equal(af, &dest->addr, daddr)
-                   && (dest->port == dport)
-                   && ((dest->protocol == protocol) ||
-                       dest->vfwmark)) {
-                       /* HIT */
-                       read_unlock(&__ip_vs_rs_lock);
-                       return dest;
-               }
-       }
-       read_unlock(&__ip_vs_rs_lock);
-
-       return NULL;
-}
-
-/*
- *     Lookup destination by {addr,port} in the given service
- */
-static struct ip_vs_dest *
-ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
-                 __be16 dport)
-{
-       struct ip_vs_dest *dest;
-
-       /*
-        * Find the destination for the given service
-        */
-       list_for_each_entry(dest, &svc->destinations, n_list) {
-               if ((dest->af == svc->af)
-                   && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
-                   && (dest->port == dport)) {
-                       /* HIT */
-                       return dest;
-               }
-       }
-
-       return NULL;
-}
-
-/*
- * Find destination by {daddr,dport,vaddr,protocol}
- * Cretaed to be used in ip_vs_process_message() in
- * the backup synchronization daemon. It finds the
- * destination to be bound to the received connection
- * on the backup.
- *
- * ip_vs_lookup_real_service() looked promissing, but
- * seems not working as expected.
- */
-struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
-                                  __be16 dport,
-                                  const union nf_inet_addr *vaddr,
-                                  __be16 vport, __u16 protocol)
-{
-       struct ip_vs_dest *dest;
-       struct ip_vs_service *svc;
-
-       svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
-       if (!svc)
-               return NULL;
-       dest = ip_vs_lookup_dest(svc, daddr, dport);
-       if (dest)
-               atomic_inc(&dest->refcnt);
-       ip_vs_service_put(svc);
-       return dest;
-}
-
-/*
- *  Lookup dest by {svc,addr,port} in the destination trash.
- *  The destination trash is used to hold the destinations that are removed
- *  from the service table but are still referenced by some conn entries.
- *  The reason to add the destination trash is when the dest is temporary
- *  down (either by administrator or by monitor program), the dest can be
- *  picked back from the trash, the remaining connections to the dest can
- *  continue, and the counting information of the dest is also useful for
- *  scheduling.
- */
-static struct ip_vs_dest *
-ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
-                    __be16 dport)
-{
-       struct ip_vs_dest *dest, *nxt;
-
-       /*
-        * Find the destination in trash
-        */
-       list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
-               IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
-                             "dest->refcnt=%d\n",
-                             dest->vfwmark,
-                             IP_VS_DBG_ADDR(svc->af, &dest->addr),
-                             ntohs(dest->port),
-                             atomic_read(&dest->refcnt));
-               if (dest->af == svc->af &&
-                   ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
-                   dest->port == dport &&
-                   dest->vfwmark == svc->fwmark &&
-                   dest->protocol == svc->protocol &&
-                   (svc->fwmark ||
-                    (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
-                     dest->vport == svc->port))) {
-                       /* HIT */
-                       return dest;
-               }
-
-               /*
-                * Try to purge the destination from trash if not referenced
-                */
-               if (atomic_read(&dest->refcnt) == 1) {
-                       IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
-                                     "from trash\n",
-                                     dest->vfwmark,
-                                     IP_VS_DBG_ADDR(svc->af, &dest->addr),
-                                     ntohs(dest->port));
-                       list_del(&dest->n_list);
-                       ip_vs_dst_reset(dest);
-                       __ip_vs_unbind_svc(dest);
-                       kfree(dest);
-               }
-       }
-
-       return NULL;
-}
-
-
-/*
- *  Clean up all the destinations in the trash
- *  Called by the ip_vs_control_cleanup()
- *
- *  When the ip_vs_control_clearup is activated by ipvs module exit,
- *  the service tables must have been flushed and all the connections
- *  are expired, and the refcnt of each destination in the trash must
- *  be 1, so we simply release them here.
- */
-static void ip_vs_trash_cleanup(void)
-{
-       struct ip_vs_dest *dest, *nxt;
-
-       list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
-               list_del(&dest->n_list);
-               ip_vs_dst_reset(dest);
-               __ip_vs_unbind_svc(dest);
-               kfree(dest);
-       }
-}
-
-
-static void
-ip_vs_zero_stats(struct ip_vs_stats *stats)
-{
-       spin_lock_bh(&stats->lock);
-
-       memset(&stats->ustats, 0, sizeof(stats->ustats));
-       ip_vs_zero_estimator(stats);
-
-       spin_unlock_bh(&stats->lock);
-}
-
-/*
- *     Update a destination in the given service
- */
-static void
-__ip_vs_update_dest(struct ip_vs_service *svc,
-                   struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
-{
-       int conn_flags;
-
-       /* set the weight and the flags */
-       atomic_set(&dest->weight, udest->weight);
-       conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
-
-       /* check if local node and update the flags */
-#ifdef CONFIG_IP_VS_IPV6
-       if (svc->af == AF_INET6) {
-               if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
-                       conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-                               | IP_VS_CONN_F_LOCALNODE;
-               }
-       } else
-#endif
-               if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
-                       conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-                               | IP_VS_CONN_F_LOCALNODE;
-               }
-
-       /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
-       if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
-               conn_flags |= IP_VS_CONN_F_NOOUTPUT;
-       } else {
-               /*
-                *    Put the real service in ip_vs_rtable if not present.
-                *    For now only for NAT!
-                */
-               write_lock_bh(&__ip_vs_rs_lock);
-               ip_vs_rs_hash(dest);
-               write_unlock_bh(&__ip_vs_rs_lock);
-       }
-       atomic_set(&dest->conn_flags, conn_flags);
-
-       /* bind the service */
-       if (!dest->svc) {
-               __ip_vs_bind_svc(dest, svc);
-       } else {
-               if (dest->svc != svc) {
-                       __ip_vs_unbind_svc(dest);
-                       ip_vs_zero_stats(&dest->stats);
-                       __ip_vs_bind_svc(dest, svc);
-               }
-       }
-
-       /* set the dest status flags */
-       dest->flags |= IP_VS_DEST_F_AVAILABLE;
-
-       if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
-               dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
-       dest->u_threshold = udest->u_threshold;
-       dest->l_threshold = udest->l_threshold;
-}
-
-
-/*
- *     Create a destination for the given service
- */
-static int
-ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
-              struct ip_vs_dest **dest_p)
-{
-       struct ip_vs_dest *dest;
-       unsigned atype;
-
-       EnterFunction(2);
-
-#ifdef CONFIG_IP_VS_IPV6
-       if (svc->af == AF_INET6) {
-               atype = ipv6_addr_type(&udest->addr.in6);
-               if ((!(atype & IPV6_ADDR_UNICAST) ||
-                       atype & IPV6_ADDR_LINKLOCAL) &&
-                       !__ip_vs_addr_is_local_v6(&udest->addr.in6))
-                       return -EINVAL;
-       } else
-#endif
-       {
-               atype = inet_addr_type(&init_net, udest->addr.ip);
-               if (atype != RTN_LOCAL && atype != RTN_UNICAST)
-                       return -EINVAL;
-       }
-
-       dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
-       if (dest == NULL) {
-               IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
-               return -ENOMEM;
-       }
-
-       dest->af = svc->af;
-       dest->protocol = svc->protocol;
-       dest->vaddr = svc->addr;
-       dest->vport = svc->port;
-       dest->vfwmark = svc->fwmark;
-       ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
-       dest->port = udest->port;
-
-       atomic_set(&dest->activeconns, 0);
-       atomic_set(&dest->inactconns, 0);
-       atomic_set(&dest->persistconns, 0);
-       atomic_set(&dest->refcnt, 0);
-
-       INIT_LIST_HEAD(&dest->d_list);
-       spin_lock_init(&dest->dst_lock);
-       spin_lock_init(&dest->stats.lock);
-       __ip_vs_update_dest(svc, dest, udest);
-       ip_vs_new_estimator(&dest->stats);
-
-       *dest_p = dest;
-
-       LeaveFunction(2);
-       return 0;
-}
-
-
-/*
- *     Add a destination into an existing service
- */
-static int
-ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
-{
-       struct ip_vs_dest *dest;
-       union nf_inet_addr daddr;
-       __be16 dport = udest->port;
-       int ret;
-
-       EnterFunction(2);
-
-       if (udest->weight < 0) {
-               IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
-               return -ERANGE;
-       }
-
-       if (udest->l_threshold > udest->u_threshold) {
-               IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
-                         "upper threshold\n");
-               return -ERANGE;
-       }
-
-       ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
-
-       /*
-        * Check if the dest already exists in the list
-        */
-       dest = ip_vs_lookup_dest(svc, &daddr, dport);
-
-       if (dest != NULL) {
-               IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
-               return -EEXIST;
-       }
-
-       /*
-        * Check if the dest already exists in the trash and
-        * is from the same service
-        */
-       dest = ip_vs_trash_get_dest(svc, &daddr, dport);
-
-       if (dest != NULL) {
-               IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
-                             "dest->refcnt=%d, service %u/%s:%u\n",
-                             IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
-                             atomic_read(&dest->refcnt),
-                             dest->vfwmark,
-                             IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
-                             ntohs(dest->vport));
-
-               __ip_vs_update_dest(svc, dest, udest);
-
-               /*
-                * Get the destination from the trash
-                */
-               list_del(&dest->n_list);
-
-               ip_vs_new_estimator(&dest->stats);
-
-               write_lock_bh(&__ip_vs_svc_lock);
-
-               /*
-                * Wait until all other svc users go away.
-                */
-               IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-               list_add(&dest->n_list, &svc->destinations);
-               svc->num_dests++;
-
-               /* call the update_service function of its scheduler */
-               if (svc->scheduler->update_service)
-                       svc->scheduler->update_service(svc);
-
-               write_unlock_bh(&__ip_vs_svc_lock);
-               return 0;
-       }
-
-       /*
-        * Allocate and initialize the dest structure
-        */
-       ret = ip_vs_new_dest(svc, udest, &dest);
-       if (ret) {
-               return ret;
-       }
-
-       /*
-        * Add the dest entry into the list
-        */
-       atomic_inc(&dest->refcnt);
-
-       write_lock_bh(&__ip_vs_svc_lock);
-
-       /*
-        * Wait until all other svc users go away.
-        */
-       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-       list_add(&dest->n_list, &svc->destinations);
-       svc->num_dests++;
-
-       /* call the update_service function of its scheduler */
-       if (svc->scheduler->update_service)
-               svc->scheduler->update_service(svc);
-
-       write_unlock_bh(&__ip_vs_svc_lock);
-
-       LeaveFunction(2);
-
-       return 0;
-}
-
-
-/*
- *     Edit a destination in the given service
- */
-static int
-ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
-{
-       struct ip_vs_dest *dest;
-       union nf_inet_addr daddr;
-       __be16 dport = udest->port;
-
-       EnterFunction(2);
-
-       if (udest->weight < 0) {
-               IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
-               return -ERANGE;
-       }
-
-       if (udest->l_threshold > udest->u_threshold) {
-               IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
-                         "upper threshold\n");
-               return -ERANGE;
-       }
-
-       ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
-
-       /*
-        *  Lookup the destination list
-        */
-       dest = ip_vs_lookup_dest(svc, &daddr, dport);
-
-       if (dest == NULL) {
-               IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
-               return -ENOENT;
-       }
-
-       __ip_vs_update_dest(svc, dest, udest);
-
-       write_lock_bh(&__ip_vs_svc_lock);
-
-       /* Wait until all other svc users go away */
-       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-       /* call the update_service, because server weight may be changed */
-       if (svc->scheduler->update_service)
-               svc->scheduler->update_service(svc);
-
-       write_unlock_bh(&__ip_vs_svc_lock);
-
-       LeaveFunction(2);
-
-       return 0;
-}
-
-
-/*
- *     Delete a destination (must be already unlinked from the service)
- *
- *     Stops the destination's rate estimator, removes it from the real
- *     server hash under __ip_vs_rs_lock and then drops one reference.
- *     If that was the last reference the destination is freed; otherwise
- *     it is parked on ip_vs_dest_trash.
- */
-static void __ip_vs_del_dest(struct ip_vs_dest *dest)
-{
-       ip_vs_kill_estimator(&dest->stats);
-
-       /*
-        *  Remove it from the d-linked list with the real services.
-        */
-       write_lock_bh(&__ip_vs_rs_lock);
-       ip_vs_rs_unhash(dest);
-       write_unlock_bh(&__ip_vs_rs_lock);
-
-       /*
-        *  Decrease the refcnt of the dest, and free the dest
-        *  if nobody refers to it (refcnt=0). Otherwise, throw
-        *  the destination into the trash.
-        */
-       if (atomic_dec_and_test(&dest->refcnt)) {
-               ip_vs_dst_reset(dest);
-               /* simply decrease svc->refcnt here, let the caller check
-                  and release the service if nobody refers to it.
-                  Only user context can release destination and service,
-                  and only one user context can update virtual service at a
-                  time, so the operation here is OK */
-               atomic_dec(&dest->svc->refcnt);
-               kfree(dest);
-       } else {
-               /* refcnt printed here is the value after the decrement above */
-               IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
-                             "dest->refcnt=%d\n",
-                             IP_VS_DBG_ADDR(dest->af, &dest->addr),
-                             ntohs(dest->port),
-                             atomic_read(&dest->refcnt));
-               list_add(&dest->n_list, &ip_vs_dest_trash);
-               /* re-take the reference dropped above while it sits in the trash */
-               atomic_inc(&dest->refcnt);
-       }
-}
-
-
-/*
- *     Detach a destination from its service.
- *
- *     Clears the AVAILABLE flag on @dest, takes it off the service's
- *     destination list and decrements svc->num_dests.  When @svcupd is
- *     non-zero the scheduler's optional update_service hook is called
- *     afterwards.
- */
-static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
-                               struct ip_vs_dest *dest,
-                               int svcupd)
-{
-       /* Mark the destination as no longer available */
-       dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
-
-       /* Drop it from the service's doubly-linked destination list */
-       list_del(&dest->n_list);
-       svc->num_dests--;
-
-       /* Nothing more to do unless the caller asked for a scheduler update */
-       if (!svcupd)
-               return;
-
-       if (svc->scheduler->update_service)
-               svc->scheduler->update_service(svc);
-}
-
-
-/*
- *     Delete a destination server in the given service
- *
- *     Looks the destination up by the address/port in @udest, unlinks it
- *     from @svc under __ip_vs_svc_lock (asking the scheduler to update)
- *     and then hands it to __ip_vs_del_dest() once the lock is released.
- *
- *     Returns 0 on success or -ENOENT when no such destination exists.
- */
-static int
-ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
-{
-       struct ip_vs_dest *dest;
-       __be16 dport = udest->port;
-
-       EnterFunction(2);
-
-       dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
-
-       if (dest == NULL) {
-               IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
-               return -ENOENT;
-       }
-
-       write_lock_bh(&__ip_vs_svc_lock);
-
-       /*
-        *      Wait until all other svc users go away.
-        */
-       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-       /*
-        *      Unlink dest from the service
-        */
-       __ip_vs_unlink_dest(svc, dest, 1);
-
-       write_unlock_bh(&__ip_vs_svc_lock);
-
-       /*
-        *      Delete the destination
-        */
-       __ip_vs_del_dest(dest);
-
-       LeaveFunction(2);
-
-       return 0;
-}
-
-
-/*
- *     Add a service into the service hash table
- *
- *     Takes a module use count, resolves the scheduler named in
- *     u->sched_name, allocates and initialises a new service from @u,
- *     binds the scheduler, starts the service's estimator and hashes the
- *     service under __ip_vs_svc_lock.
- *
- *     On success the new service is stored in *svc_p and 0 is returned.
- *     On failure the module use count is dropped again and one of
- *     -ENOENT (scheduler not found), -EAFNOSUPPORT / -EINVAL (IPv6 checks),
- *     -ENOMEM, or the error from ip_vs_bind_scheduler() is returned;
- *     *svc_p is left untouched in that case.
- */
-static int
-ip_vs_add_service(struct ip_vs_service_user_kern *u,
-                 struct ip_vs_service **svc_p)
-{
-       int ret = 0;
-       struct ip_vs_scheduler *sched = NULL;
-       struct ip_vs_service *svc = NULL;
-
-       /* increase the module use count */
-       ip_vs_use_count_inc();
-
-       /* Lookup the scheduler by 'u->sched_name' */
-       sched = ip_vs_scheduler_get(u->sched_name);
-       if (sched == NULL) {
-               IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
-                          u->sched_name);
-               ret = -ENOENT;
-               goto out_mod_dec;
-       }
-
-#ifdef CONFIG_IP_VS_IPV6
-       /* IPv6 needs a capable scheduler and a netmask in the range 1..128 */
-       if (u->af == AF_INET6) {
-               if (!sched->supports_ipv6) {
-                       ret = -EAFNOSUPPORT;
-                       goto out_err;
-               }
-               if ((u->netmask < 1) || (u->netmask > 128)) {
-                       ret = -EINVAL;
-                       goto out_err;
-               }
-       }
-#endif
-
-       /*
-        * NOTE(review): GFP_ATOMIC is used here; confirm whether the calling
-        * context really forbids sleeping allocations.
-        */
-       svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
-       if (svc == NULL) {
-               IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
-               ret = -ENOMEM;
-               goto out_err;
-       }
-
-       /* I'm the first user of the service */
-       atomic_set(&svc->usecnt, 1);
-       atomic_set(&svc->refcnt, 0);
-
-       svc->af = u->af;
-       svc->protocol = u->protocol;
-       ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
-       svc->port = u->port;
-       svc->fwmark = u->fwmark;
-       svc->flags = u->flags;
-       /* u->timeout is scaled by HZ before being stored */
-       svc->timeout = u->timeout * HZ;
-       svc->netmask = u->netmask;
-
-       INIT_LIST_HEAD(&svc->destinations);
-       rwlock_init(&svc->sched_lock);
-       spin_lock_init(&svc->stats.lock);
-
-       /* Bind the scheduler */
-       ret = ip_vs_bind_scheduler(svc, sched);
-       if (ret)
-               goto out_err;
-       /* cleared once the bind has succeeded; nothing below jumps to out_err */
-       sched = NULL;
-
-       /* Update the virtual service counters */
-       if (svc->port == FTPPORT)
-               atomic_inc(&ip_vs_ftpsvc_counter);
-       else if (svc->port == 0)
-               atomic_inc(&ip_vs_nullsvc_counter);
-
-       ip_vs_new_estimator(&svc->stats);
-
-       /* Count only IPv4 services for old get/setsockopt interface */
-       if (svc->af == AF_INET)
-               ip_vs_num_services++;
-
-       /* Hash the service into the service table */
-       write_lock_bh(&__ip_vs_svc_lock);
-       ip_vs_svc_hash(svc);
-       write_unlock_bh(&__ip_vs_svc_lock);
-
-       *svc_p = svc;
-       return 0;
-
-  out_err:
-       /* Undo whatever part of the service setup already happened */
-       if (svc != NULL) {
-               if (svc->scheduler)
-                       ip_vs_unbind_scheduler(svc);
-               if (svc->inc) {
-                       local_bh_disable();
-                       ip_vs_app_inc_put(svc->inc);
-                       local_bh_enable();
-               }
-               kfree(svc);
-       }
-       ip_vs_scheduler_put(sched);
-
-  out_mod_dec:
-       /* decrease the module use count */
-       ip_vs_use_count_dec();
-
-       return ret;
-}
-
-
-/*
- *     Edit a service and bind it with a new scheduler
- *
- *     Resolves the scheduler named in u->sched_name, then - with
- *     __ip_vs_svc_lock held for writing and all other users drained -
- *     copies flags, timeout and netmask from @u into @svc and, if the
- *     scheduler differs from the one currently bound, swaps schedulers.
- *
- *     'old_sched' doubles as "the scheduler reference to drop on exit":
- *     it names the previous scheduler after a successful swap and the
- *     freshly looked-up one on every failure path.
- *
- *     Returns 0 on success, -ENOENT if the scheduler is not found,
- *     -EAFNOSUPPORT / -EINVAL from the IPv6 checks, or the error returned
- *     by ip_vs_unbind_scheduler() / ip_vs_bind_scheduler().
- */
-static int
-ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
-{
-       struct ip_vs_scheduler *sched, *old_sched;
-       int ret = 0;
-
-       /*
-        * Lookup the scheduler, by 'u->sched_name'
-        */
-       sched = ip_vs_scheduler_get(u->sched_name);
-       if (sched == NULL) {
-               IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
-                          u->sched_name);
-               return -ENOENT;
-       }
-       /* default: release the reference just taken if we bail out early */
-       old_sched = sched;
-
-#ifdef CONFIG_IP_VS_IPV6
-       /*
-        * NOTE(review): the family tested is u->af, not svc->af - confirm
-        * callers always fill u->af with the service's family.
-        */
-       if (u->af == AF_INET6) {
-               if (!sched->supports_ipv6) {
-                       ret = -EAFNOSUPPORT;
-                       goto out;
-               }
-               if ((u->netmask < 1) || (u->netmask > 128)) {
-                       ret = -EINVAL;
-                       goto out;
-               }
-       }
-#endif
-
-       write_lock_bh(&__ip_vs_svc_lock);
-
-       /*
-        * Wait until all other svc users go away.
-        */
-       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-       /*
-        * Set the flags and timeout value
-        */
-       svc->flags = u->flags | IP_VS_SVC_F_HASHED;
-       svc->timeout = u->timeout * HZ;
-       svc->netmask = u->netmask;
-
-       old_sched = svc->scheduler;
-       if (sched != old_sched) {
-               /*
-                * Unbind the old scheduler
-                */
-               if ((ret = ip_vs_unbind_scheduler(svc))) {
-                       /* keep the old binding, drop the new reference */
-                       old_sched = sched;
-                       goto out_unlock;
-               }
-
-               /*
-                * Bind the new scheduler
-                */
-               if ((ret = ip_vs_bind_scheduler(svc, sched))) {
-                       /*
-                        * If ip_vs_bind_scheduler fails, restore the old
-                        * scheduler.
-                        * The main reason of failure is out of memory.
-                        *
-                        * The question is if the old scheduler can be
-                        * restored all the time. TODO: if it cannot be
-                        * restored some time, we must delete the service,
-                        * otherwise the system may crash.
-                        */
-                       ip_vs_bind_scheduler(svc, old_sched);
-                       old_sched = sched;
-                       goto out_unlock;
-               }
-       }
-
-  out_unlock:
-       write_unlock_bh(&__ip_vs_svc_lock);
-#ifdef CONFIG_IP_VS_IPV6
-  out:
-#endif
-
-       /* drop whichever scheduler reference is no longer needed */
-       if (old_sched)
-               ip_vs_scheduler_put(old_sched);
-
-       return ret;
-}
-
-
-/*
- *     Delete a service from the service list
- *     - The service must be unlinked, unlocked and not referenced!
- *     - We are called under _bh lock
- *
- *     Tears the service down in this order: adjust the IPv4 service count,
- *     stop the estimator, unbind and release the scheduler, release the
- *     bound application incarnation, unlink and delete every destination,
- *     adjust the FTP / null-port counters, free the service if its refcnt
- *     is zero, and finally drop the module use count.
- */
-static void __ip_vs_del_service(struct ip_vs_service *svc)
-{
-       struct ip_vs_dest *dest, *nxt;
-       struct ip_vs_scheduler *old_sched;
-
-       /* Count only IPv4 services for old get/setsockopt interface */
-       if (svc->af == AF_INET)
-               ip_vs_num_services--;
-
-       ip_vs_kill_estimator(&svc->stats);
-
-       /* Unbind scheduler */
-       old_sched = svc->scheduler;
-       ip_vs_unbind_scheduler(svc);
-       if (old_sched)
-               ip_vs_scheduler_put(old_sched);
-
-       /* Unbind app inc */
-       if (svc->inc) {
-               ip_vs_app_inc_put(svc->inc);
-               svc->inc = NULL;
-       }
-
-       /*
-        *    Unlink the whole destination list
-        */
-       list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
-               /* svcupd == 0: the scheduler was already unbound above */
-               __ip_vs_unlink_dest(svc, dest, 0);
-               __ip_vs_del_dest(dest);
-       }
-
-       /*
-        *    Update the virtual service counters
-        */
-       if (svc->port == FTPPORT)
-               atomic_dec(&ip_vs_ftpsvc_counter);
-       else if (svc->port == 0)
-               atomic_dec(&ip_vs_nullsvc_counter);
-
-       /*
-        *    Free the service if nobody refers to it
-        */
-       if (atomic_read(&svc->refcnt) == 0)
-               kfree(svc);
-
-       /* decrease the module use count */
-       ip_vs_use_count_dec();
-}
-
-/*
- *     Remove one virtual service.
- *
- *     With __ip_vs_svc_lock held for writing the service is unhashed, all
- *     other users are waited for, and the service is torn down through
- *     __ip_vs_del_service().
- *
- *     Returns 0 on success, or -EEXIST when @svc is NULL.
- */
-static int ip_vs_del_service(struct ip_vs_service *svc)
-{
-       if (!svc)
-               return -EEXIST;
-
-       write_lock_bh(&__ip_vs_svc_lock);
-
-       /* Take the service out of the service table */
-       ip_vs_svc_unhash(svc);
-
-       /* Only our own use of the service may remain before it is destroyed */
-       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-       __ip_vs_del_service(svc);
-
-       write_unlock_bh(&__ip_vs_svc_lock);
-
-       return 0;
-}
-
-
-/*
- *     Unhash and destroy a single service on behalf of ip_vs_flush().
- *     The whole sequence runs with __ip_vs_svc_lock held for writing.
- */
-static void __ip_vs_flush_one(struct ip_vs_service *svc)
-{
-       write_lock_bh(&__ip_vs_svc_lock);
-       ip_vs_svc_unhash(svc);
-       /* Wait until all the svc users go away */
-       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-       __ip_vs_del_service(svc);
-       write_unlock_bh(&__ip_vs_svc_lock);
-}
-
-/*
- *     Flush all the virtual services
- *
- *     Walks every bucket of the <protocol,addr,port> table first and of the
- *     fwmark table second, destroying each service found.  Always returns 0.
- */
-static int ip_vs_flush(void)
-{
-       struct ip_vs_service *cur, *tmp;
-       int bucket;
-
-       /* Services hashed by <protocol,addr,port> */
-       for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
-               list_for_each_entry_safe(cur, tmp,
-                                        &ip_vs_svc_table[bucket], s_list)
-                       __ip_vs_flush_one(cur);
-       }
-
-       /* Services hashed by fwmark */
-       for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
-               list_for_each_entry_safe(cur, tmp,
-                                        &ip_vs_svc_fwm_table[bucket], f_list)
-                       __ip_vs_flush_one(cur);
-       }
-
-       return 0;
-}
-
-
-/*
- *     Reset the statistics of one service and of all its destinations.
- *     Runs with __ip_vs_svc_lock held for writing; always returns 0.
- */
-static int ip_vs_zero_service(struct ip_vs_service *svc)
-{
-       struct ip_vs_dest *rs;
-
-       write_lock_bh(&__ip_vs_svc_lock);
-
-       /* Per-destination counters first, then the service's own */
-       list_for_each_entry(rs, &svc->destinations, n_list)
-               ip_vs_zero_stats(&rs->stats);
-       ip_vs_zero_stats(&svc->stats);
-
-       write_unlock_bh(&__ip_vs_svc_lock);
-
-       return 0;
-}
-
-/*
- *     Reset the statistics of every service in both hash tables and then
- *     the global ip_vs_stats counters.  Always returns 0.
- */
-static int ip_vs_zero_all(void)
-{
-       struct ip_vs_service *cur;
-       int bucket;
-
-       /* Services hashed by <protocol,addr,port> */
-       for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++)
-               list_for_each_entry(cur, &ip_vs_svc_table[bucket], s_list)
-                       ip_vs_zero_service(cur);
-
-       /* Services hashed by fwmark */
-       for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++)
-               list_for_each_entry(cur, &ip_vs_svc_fwm_table[bucket], f_list)
-                       ip_vs_zero_service(cur);
-
-       ip_vs_zero_stats(&ip_vs_stats);
-
-       return 0;
-}
-
-
-/*
- *     sysctl handler for the defense-mode knobs (drop_entry, drop_packet
- *     and secure_tcp in vs_vars).
- *
- *     Delegates parsing to proc_dointvec().  After a write that changed
- *     the value, only 0..3 is accepted: an in-range value triggers
- *     update_defense_level(); an out-of-range value is rolled back to the
- *     previous setting and reported to the writer as -EINVAL instead of
- *     being silently discarded.
- *
- *     Returns the result of proc_dointvec(), or -EINVAL for a rejected
- *     value.
- */
-static int
-proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
-                    void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-       int *valp = table->data;
-       int val = *valp;
-       int rc;
-
-       rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
-       if (write && (*valp != val)) {
-               if ((*valp < 0) || (*valp > 3)) {
-                       /* Restore the correct value */
-                       *valp = val;
-                       /* Tell the writer, unless parsing already failed */
-                       if (!rc)
-                               rc = -EINVAL;
-               } else {
-                       update_defense_level();
-               }
-       }
-       return rc;
-}
-
-
-/*
- *     sysctl handler for the two-element sync_threshold setting.
- *
- *     Delegates parsing to proc_dointvec().  After a write the pair must
- *     satisfy 0 <= valp[0] < valp[1]; otherwise the previous pair is put
- *     back and the writer gets -EINVAL instead of a silent no-op.
- *
- *     Returns the result of proc_dointvec(), or -EINVAL for a rejected
- *     pair.
- */
-static int
-proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
-                      void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-       int *valp = table->data;
-       int val[2];
-       int rc;
-
-       /* backup the value first */
-       memcpy(val, valp, sizeof(val));
-
-       rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
-       if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
-               /* Restore the correct value */
-               memcpy(valp, val, sizeof(val));
-               /* Tell the writer, unless parsing already failed */
-               if (!rc)
-                       rc = -EINVAL;
-       }
-       return rc;
-}
-
-
-/*
- *     IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
- */
-
-static struct ctl_table vs_vars[] = {
-       {
-               .procname       = "amemthresh",
-               .data           = &sysctl_ip_vs_amemthresh,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
-       },
-#ifdef CONFIG_IP_VS_DEBUG
-       {
-               .procname       = "debug_level",
-               .data           = &sysctl_ip_vs_debug_level,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
-       },
-#endif
-       {
-               .procname       = "am_droprate",
-               .data           = &sysctl_ip_vs_am_droprate,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
-       },
-       {
-               .procname       = "drop_entry",
-               .data           = &sysctl_ip_vs_drop_entry,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_do_defense_mode,
-       },
-       {
-               .procname       = "drop_packet",
-               .data           = &sysctl_ip_vs_drop_packet,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_do_defense_mode,
-       },
-       {
-               .procname       = "secure_tcp",
-               .data           = &sysctl_ip_vs_secure_tcp,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_do_defense_mode,
-       },
-#if 0
-       {
-               .procname       = "timeout_established",
-               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "timeout_synsent",
-               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "timeout_synrecv",
-               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "timeout_finwait",
-               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "timeout_timewait",
-               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "timeout_close",
-               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "timeout_closewait",
-               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "timeout_lastack",
-               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "timeout_listen",
-               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "timeout_synack",
-               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "timeout_udp",
-               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-       {
-               .procname       = "timeout_icmp",
-               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-#endif
-       {
-               .procname       = "cache_bypass",
-               .data           = &sysctl_ip_vs_cache_bypass,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
-       },
-       {
-               .procname       = "expire_nodest_conn",
-               .data           = &sysctl_ip_vs_expire_nodest_conn,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
-       },
-       {
-               .procname       = "expire_quiescent_template",
-               .data           = &sysctl_ip_vs_expire_quiescent_template,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
-       },
-       {
-               .procname       = "sync_threshold",
-               .data           = &sysctl_ip_vs_sync_threshold,
-               .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
-               .mode           = 0644,
-               .proc_handler   = &proc_do_sync_threshold,
-       },
-       {
-               .procname       = "nat_icmp_send",
-               .data           = &sysctl_ip_vs_nat_icmp_send,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
-       },
-       { .ctl_name = 0 }
-};
-
-/*
- *     sysctl path "net" / "ipv4" / "vs", terminated by an empty entry.
- *     Exported (GPL-only) so that code outside this file can refer to it.
- */
-const struct ctl_path net_vs_ctl_path[] = {
-       { .procname = "net", .ctl_name = CTL_NET, },
-       { .procname = "ipv4", .ctl_name = NET_IPV4, },
-       { .procname = "vs", },
-       { }
-};
-EXPORT_SYMBOL_GPL(net_vs_ctl_path);
-
-/*
- * Presumably the handle obtained when vs_vars is registered; the
- * registration itself is outside this part of the file - TODO confirm.
- */
-static struct ctl_table_header * sysctl_header;
-
-#ifdef CONFIG_PROC_FS
-
-/*
- *     Per-open iterator state for the service listing seq_file.
- */
-struct ip_vs_iter {
-       struct list_head *table;        /* ip_vs_svc_table or ip_vs_svc_fwm_table */
-       int bucket;                     /* index of the bucket being walked */
-};
-
-/*
- *     The seq_file code below writes the contents of the VS rule table to
- *     a PROCfs file.  (It is kept just for backward compatibility)
- */
-
-/*
- *     Map the forwarding-method bits of a connection flags word to the
- *     label printed in the listing: "Local", "Tunnel", "Route", or "Masq"
- *     for anything else.
- */
-static inline const char *ip_vs_fwd_name(unsigned flags)
-{
-       const unsigned fwd = flags & IP_VS_CONN_F_FWD_MASK;
-
-       if (fwd == IP_VS_CONN_F_LOCALNODE)
-               return "Local";
-       if (fwd == IP_VS_CONN_F_TUNNEL)
-               return "Tunnel";
-       if (fwd == IP_VS_CONN_F_DROUTE)
-               return "Route";
-
-       return "Masq";
-}
-
-
-/*
- *     Return the service at zero-based position @pos, counting through the
- *     <protocol,addr,port> table first and the fwmark table second, or
- *     NULL when there are fewer services than that.  On a hit the iterator
- *     in seq->private records which table and bucket the service is in.
- */
-static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
-{
-       struct ip_vs_iter *it = seq->private;
-       struct ip_vs_service *entry;
-       int bucket;
-
-       /* look in hash by protocol */
-       for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
-               list_for_each_entry(entry, &ip_vs_svc_table[bucket], s_list) {
-                       if (pos == 0) {
-                               it->table = ip_vs_svc_table;
-                               it->bucket = bucket;
-                               return entry;
-                       }
-                       pos--;
-               }
-       }
-
-       /* keep looking in fwmark */
-       for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
-               list_for_each_entry(entry, &ip_vs_svc_fwm_table[bucket],
-                                   f_list) {
-                       if (pos == 0) {
-                               it->table = ip_vs_svc_fwm_table;
-                               it->bucket = bucket;
-                               return entry;
-                       }
-                       pos--;
-               }
-       }
-
-       return NULL;
-}
-
-/*
- *     seq_file start: take __ip_vs_svc_lock for reading and position the
- *     iterator.  Position 0 is the header token; position N (N > 0) is
- *     the (N-1)th service.
- */
-static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
-__acquires(__ip_vs_svc_lock)
-{
-       read_lock_bh(&__ip_vs_svc_lock);
-
-       if (*pos == 0)
-               return SEQ_START_TOKEN;
-
-       return ip_vs_info_array(seq, *pos - 1);
-}
-
-
-/*
- *     seq_file next: advance *pos and return the service following @v,
- *     or NULL when both hash tables are exhausted.
- *
- *     After the header token the walk starts at the first service.
- *     Otherwise it continues within the current bucket, then through the
- *     remaining buckets of the current table, and switches from the
- *     <protocol,addr,port> table to the fwmark table once the former is
- *     exhausted.  The current table and bucket live in seq->private.
- */
-static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-       struct list_head *e;
-       struct ip_vs_iter *iter;
-       struct ip_vs_service *svc;
-
-       ++*pos;
-       if (v == SEQ_START_TOKEN)
-               return ip_vs_info_array(seq,0);
-
-       svc = v;
-       iter = seq->private;
-
-       if (iter->table == ip_vs_svc_table) {
-               /* next service in table hashed by protocol */
-               if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
-                       return list_entry(e, struct ip_vs_service, s_list);
-
-
-               /* bucket exhausted: return the first entry of the next non-empty one */
-               while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
-                       list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
-                                           s_list) {
-                               return svc;
-                       }
-               }
-
-               /* -1 so that the pre-increment below starts at bucket 0 */
-               iter->table = ip_vs_svc_fwm_table;
-               iter->bucket = -1;
-               goto scan_fwmark;
-       }
-
-       /* next service in hashed by fwmark */
-       if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
-               return list_entry(e, struct ip_vs_service, f_list);
-
- scan_fwmark:
-       while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
-               list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
-                                   f_list)
-                       return svc;
-       }
-
-       return NULL;
-}
-
-/* seq_file stop: release the read lock taken in ip_vs_info_seq_start() */
-static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
-__releases(__ip_vs_svc_lock)
-{
-       read_unlock_bh(&__ip_vs_svc_lock);
-}
-
-
-/*
- *     seq_file show: print either the three header lines (for the start
- *     token) or one service line followed by one line per destination.
- *
- *     Services from the <protocol,addr,port> table are printed with their
- *     protocol name, address and port; services from the fwmark table are
- *     printed as "FWM" plus the mark.  Addresses, ports, marks and the
- *     netmask are printed in hexadecimal.  Always returns 0.
- */
-static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
-{
-       if (v == SEQ_START_TOKEN) {
-               seq_printf(seq,
-                       "IP Virtual Server version %d.%d.%d (size=%d)\n",
-                       NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
-               seq_puts(seq,
-                        "Prot LocalAddress:Port Scheduler Flags\n");
-               seq_puts(seq,
-                        "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
-       } else {
-               const struct ip_vs_service *svc = v;
-               const struct ip_vs_iter *iter = seq->private;
-               const struct ip_vs_dest *dest;
-
-               /* the iterator's table tells us how the service is keyed */
-               if (iter->table == ip_vs_svc_table) {
-#ifdef CONFIG_IP_VS_IPV6
-                       if (svc->af == AF_INET6)
-                               seq_printf(seq, "%s  [" NIP6_FMT "]:%04X %s ",
-                                          ip_vs_proto_name(svc->protocol),
-                                          NIP6(svc->addr.in6),
-                                          ntohs(svc->port),
-                                          svc->scheduler->name);
-                       else
-#endif
-                               seq_printf(seq, "%s  %08X:%04X %s ",
-                                          ip_vs_proto_name(svc->protocol),
-                                          ntohl(svc->addr.ip),
-                                          ntohs(svc->port),
-                                          svc->scheduler->name);
-               } else {
-                       seq_printf(seq, "FWM  %08X %s ",
-                                  svc->fwmark, svc->scheduler->name);
-               }
-
-               /*
-                * NOTE(review): svc->timeout is printed as stored (it is set
-                * from u->timeout * HZ elsewhere in this file) and the
-                * netmask goes through ntohl() for every address family -
-                * confirm this is the intended output for IPv6 services.
-                */
-               if (svc->flags & IP_VS_SVC_F_PERSISTENT)
-                       seq_printf(seq, "persistent %d %08X\n",
-                               svc->timeout,
-                               ntohl(svc->netmask));
-               else
-                       seq_putc(seq, '\n');
-
-               /* one line per destination of this service */
-               list_for_each_entry(dest, &svc->destinations, n_list) {
-#ifdef CONFIG_IP_VS_IPV6
-                       if (dest->af == AF_INET6)
-                               seq_printf(seq,
-                                          "  -> [" NIP6_FMT "]:%04X"
-                                          "      %-7s %-6d %-10d %-10d\n",
-                                          NIP6(dest->addr.in6),
-                                          ntohs(dest->port),
-                                          ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
-                                          atomic_read(&dest->weight),
-                                          atomic_read(&dest->activeconns),
-                                          atomic_read(&dest->inactconns));
-                       else
-#endif
-                               seq_printf(seq,
-                                          "  -> %08X:%04X      "
-                                          "%-7s %-6d %-10d %-10d\n",
-                                          ntohl(dest->addr.ip),
-                                          ntohs(dest->port),
-                                          ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
-                                          atomic_read(&dest->weight),
-                                          atomic_read(&dest->activeconns),
-                                          atomic_read(&dest->inactconns));
-
-               }
-       }
-       return 0;
-}
-
-/* seq_file callbacks for the service listing */
-static const struct seq_operations ip_vs_info_seq_ops = {
-       .start = ip_vs_info_seq_start,
-       .next  = ip_vs_info_seq_next,
-       .stop  = ip_vs_info_seq_stop,
-       .show  = ip_vs_info_seq_show,
-};
-
-/*
- * Open handler for the service listing: sets up the seq_file with
- * ip_vs_info_seq_ops and a private struct ip_vs_iter for iterator state.
- */
-static int ip_vs_info_open(struct inode *inode, struct file *file)
-{
-       return seq_open_private(file, &ip_vs_info_seq_ops,
-                       sizeof(struct ip_vs_iter));
-}
-
-/*
- * File operations for the service listing; seq_release_private pairs with
- * the seq_open_private() call in ip_vs_info_open().
- */
-static const struct file_operations ip_vs_info_fops = {
-       .owner   = THIS_MODULE,
-       .open    = ip_vs_info_open,
-       .read    = seq_read,
-       .llseek  = seq_lseek,
-       .release = seq_release_private,
-};
-
-#endif
-
-/* Global IPVS statistics; only the embedded spinlock is initialised here */
-struct ip_vs_stats ip_vs_stats = {
-       .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
-};
-
-#ifdef CONFIG_PROC_FS
-/*
- *     Print the global IPVS statistics: a totals table (connections,
- *     packets and bytes in each direction) followed by a rates table.
- *     All values are printed in hexadecimal and are read while holding
- *     ip_vs_stats.lock.  Always returns 0.
- */
-static int ip_vs_stats_show(struct seq_file *seq, void *v)
-{
-       /* Column headers for the totals; no lock needed for constant text */
-/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
-       seq_puts(seq,
-                "   Total Incoming Outgoing         Incoming         Outgoing\n");
-       seq_puts(seq,
-                "   Conns  Packets  Packets            Bytes            Bytes\n");
-
-       spin_lock_bh(&ip_vs_stats.lock);
-
-       seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n",
-                  ip_vs_stats.ustats.conns,
-                  ip_vs_stats.ustats.inpkts,
-                  ip_vs_stats.ustats.outpkts,
-                  (unsigned long long) ip_vs_stats.ustats.inbytes,
-                  (unsigned long long) ip_vs_stats.ustats.outbytes);
-
-       /* Column headers and values for the per-second rates */
-/*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
-       seq_puts(seq,
-                " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
-       seq_printf(seq, "%8X %8X %8X %16X %16X\n",
-                  ip_vs_stats.ustats.cps,
-                  ip_vs_stats.ustats.inpps,
-                  ip_vs_stats.ustats.outpps,
-                  ip_vs_stats.ustats.inbps,
-                  ip_vs_stats.ustats.outbps);
-
-       spin_unlock_bh(&ip_vs_stats.lock);
-
-       return 0;
-}
-
-/* Open handler for the statistics file: single-shot seq_file, no private data */
-static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, ip_vs_stats_show, NULL);
-}
-
-/*
- * File operations for the statistics file; single_release pairs with the
- * single_open() call in ip_vs_stats_seq_open().
- */
-static const struct file_operations ip_vs_stats_fops = {
-       .owner = THIS_MODULE,
-       .open = ip_vs_stats_seq_open,
-       .read = seq_read,
-       .llseek = seq_lseek,
-       .release = single_release,
-};
-
-#endif
-
-/*
- *     Set timeout values for tcp tcpfin udp in the timeout_table.
- *
- *     Each field of @u is multiplied by HZ before being stored; a field
- *     of zero leaves the corresponding table entry unchanged.  TCP values
- *     are only applied when CONFIG_IP_VS_PROTO_TCP is set, the UDP value
- *     only when CONFIG_IP_VS_PROTO_UDP is set.
- *
- *     Returns 0 on success, or -EINVAL (without changing anything) when
- *     any field is negative or so large that scaling it by HZ would
- *     overflow an int.
- */
-static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
-{
-       IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
-                 u->tcp_timeout,
-                 u->tcp_fin_timeout,
-                 u->udp_timeout);
-
-       /*
-        * The values come from userspace: reject anything that would turn
-        * negative or wrap around once multiplied by HZ below.
-        */
-       if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) ||
-           u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ) ||
-           u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ))
-               return -EINVAL;
-
-#ifdef CONFIG_IP_VS_PROTO_TCP
-       if (u->tcp_timeout) {
-               ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
-                       = u->tcp_timeout * HZ;
-       }
-
-       if (u->tcp_fin_timeout) {
-               ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
-                       = u->tcp_fin_timeout * HZ;
-       }
-#endif
-
-#ifdef CONFIG_IP_VS_PROTO_UDP
-       if (u->udp_timeout) {
-               ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
-                       = u->udp_timeout * HZ;
-       }
-#endif
-       return 0;
-}
-
-
-/*
- * Helpers for the set_arglen table: SET_CMDID() turns a command number
- * into a zero-based index (its argument is parenthesised so that any
- * expression can be passed safely), and the *_ARG_LEN macros give the
- * size of the argument structure(s) a command carries.
- * MAX_ARG_LEN is the largest of them.
- */
-#define SET_CMDID(cmd)         ((cmd) - IP_VS_BASE_CTL)
-#define SERVICE_ARG_LEN                (sizeof(struct ip_vs_service_user))
-#define SVCDEST_ARG_LEN                (sizeof(struct ip_vs_service_user) +    \
-                                sizeof(struct ip_vs_dest_user))
-#define TIMEOUT_ARG_LEN                (sizeof(struct ip_vs_timeout_user))
-#define DAEMON_ARG_LEN         (sizeof(struct ip_vs_daemon_user))
-#define MAX_ARG_LEN            SVCDEST_ARG_LEN
-
-static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
-       [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
-       [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
-       [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
-       [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
-       [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
-       [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
-       [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
-       [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
-       [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
-       [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
-       [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
-};
-
-static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
-                                 struct ip_vs_service_user *usvc_compat)
-{
-       usvc->af                = AF_INET;
-       usvc->protocol          = usvc_compat->protocol;
-       usvc->addr.ip           = usvc_compat->addr;
-       usvc->port              = usvc_compat->port;
-       usvc->fwmark            = usvc_compat->fwmark;
-
-       /* Deep copy of sched_name is not needed here */
-       usvc->sched_name        = usvc_compat->sched_name;
-
-       usvc->flags             = usvc_compat->flags;
-       usvc->timeout           = usvc_compat->timeout;
-       usvc->netmask           = usvc_compat->netmask;
-}
-
-static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
-                                  struct ip_vs_dest_user *udest_compat)
-{
-       udest->addr.ip          = udest_compat->addr;
-       udest->port             = udest_compat->port;
-       udest->conn_flags       = udest_compat->conn_flags;
-       udest->weight           = udest_compat->weight;
-       udest->u_threshold      = udest_compat->u_threshold;
-       udest->l_threshold      = udest_compat->l_threshold;
-}
-
-static int
-do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
-{
-       int ret;
-       unsigned char arg[MAX_ARG_LEN];
-       struct ip_vs_service_user *usvc_compat;
-       struct ip_vs_service_user_kern usvc;
-       struct ip_vs_service *svc;
-       struct ip_vs_dest_user *udest_compat;
-       struct ip_vs_dest_user_kern udest;
-
-       if (!capable(CAP_NET_ADMIN))
-               return -EPERM;
-
-       if (len != set_arglen[SET_CMDID(cmd)]) {
-               IP_VS_ERR("set_ctl: len %u != %u\n",
-                         len, set_arglen[SET_CMDID(cmd)]);
-               return -EINVAL;
-       }
-
-       if (copy_from_user(arg, user, len) != 0)
-               return -EFAULT;
-
-       /* increase the module use count */
-       ip_vs_use_count_inc();
-
-       if (mutex_lock_interruptible(&__ip_vs_mutex)) {
-               ret = -ERESTARTSYS;
-               goto out_dec;
-       }
-
-       if (cmd == IP_VS_SO_SET_FLUSH) {
-               /* Flush the virtual service */
-               ret = ip_vs_flush();
-               goto out_unlock;
-       } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
-               /* Set timeout values for (tcp tcpfin udp) */
-               ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
-               goto out_unlock;
-       } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
-               struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
-               ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
-               goto out_unlock;
-       } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
-               struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
-               ret = stop_sync_thread(dm->state);
-               goto out_unlock;
-       }
-
-       usvc_compat = (struct ip_vs_service_user *)arg;
-       udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
-
-       /* We only use the new structs internally, so copy userspace compat
-        * structs to extended internal versions */
-       ip_vs_copy_usvc_compat(&usvc, usvc_compat);
-       ip_vs_copy_udest_compat(&udest, udest_compat);
-
-       if (cmd == IP_VS_SO_SET_ZERO) {
-               /* if no service address is set, zero counters in all */
-               if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
-                       ret = ip_vs_zero_all();
-                       goto out_unlock;
-               }
-       }
-
-       /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
-       if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) {
-               IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
-                         usvc.protocol, NIPQUAD(usvc.addr.ip),
-                         ntohs(usvc.port), usvc.sched_name);
-               ret = -EFAULT;
-               goto out_unlock;
-       }
-
-       /* Lookup the exact service by <protocol, addr, port> or fwmark */
-       if (usvc.fwmark == 0)
-               svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-                                         &usvc.addr, usvc.port);
-       else
-               svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-
-       if (cmd != IP_VS_SO_SET_ADD
-           && (svc == NULL || svc->protocol != usvc.protocol)) {
-               ret = -ESRCH;
-               goto out_unlock;
-       }
-
-       switch (cmd) {
-       case IP_VS_SO_SET_ADD:
-               if (svc != NULL)
-                       ret = -EEXIST;
-               else
-                       ret = ip_vs_add_service(&usvc, &svc);
-               break;
-       case IP_VS_SO_SET_EDIT:
-               ret = ip_vs_edit_service(svc, &usvc);
-               break;
-       case IP_VS_SO_SET_DEL:
-               ret = ip_vs_del_service(svc);
-               if (!ret)
-                       goto out_unlock;
-               break;
-       case IP_VS_SO_SET_ZERO:
-               ret = ip_vs_zero_service(svc);
-               break;
-       case IP_VS_SO_SET_ADDDEST:
-               ret = ip_vs_add_dest(svc, &udest);
-               break;
-       case IP_VS_SO_SET_EDITDEST:
-               ret = ip_vs_edit_dest(svc, &udest);
-               break;
-       case IP_VS_SO_SET_DELDEST:
-               ret = ip_vs_del_dest(svc, &udest);
-               break;
-       default:
-               ret = -EINVAL;
-       }
-
-       if (svc)
-               ip_vs_service_put(svc);
-
-  out_unlock:
-       mutex_unlock(&__ip_vs_mutex);
-  out_dec:
-       /* decrease the module use count */
-       ip_vs_use_count_dec();
-
-       return ret;
-}
-
-
-static void
-ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
-{
-       spin_lock_bh(&src->lock);
-       memcpy(dst, &src->ustats, sizeof(*dst));
-       spin_unlock_bh(&src->lock);
-}
-
-static void
-ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
-{
-       dst->protocol = src->protocol;
-       dst->addr = src->addr.ip;
-       dst->port = src->port;
-       dst->fwmark = src->fwmark;
-       strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
-       dst->flags = src->flags;
-       dst->timeout = src->timeout / HZ;
-       dst->netmask = src->netmask;
-       dst->num_dests = src->num_dests;
-       ip_vs_copy_stats(&dst->stats, &src->stats);
-}
-
-static inline int
-__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
-                           struct ip_vs_get_services __user *uptr)
-{
-       int idx, count=0;
-       struct ip_vs_service *svc;
-       struct ip_vs_service_entry entry;
-       int ret = 0;
-
-       for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-               list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
-                       /* Only expose IPv4 entries to old interface */
-                       if (svc->af != AF_INET)
-                               continue;
-
-                       if (count >= get->num_services)
-                               goto out;
-                       memset(&entry, 0, sizeof(entry));
-                       ip_vs_copy_service(&entry, svc);
-                       if (copy_to_user(&uptr->entrytable[count],
-                                        &entry, sizeof(entry))) {
-                               ret = -EFAULT;
-                               goto out;
-                       }
-                       count++;
-               }
-       }
-
-       for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-               list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
-                       /* Only expose IPv4 entries to old interface */
-                       if (svc->af != AF_INET)
-                               continue;
-
-                       if (count >= get->num_services)
-                               goto out;
-                       memset(&entry, 0, sizeof(entry));
-                       ip_vs_copy_service(&entry, svc);
-                       if (copy_to_user(&uptr->entrytable[count],
-                                        &entry, sizeof(entry))) {
-                               ret = -EFAULT;
-                               goto out;
-                       }
-                       count++;
-               }
-       }
-  out:
-       return ret;
-}
-
-static inline int
-__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
-                        struct ip_vs_get_dests __user *uptr)
-{
-       struct ip_vs_service *svc;
-       union nf_inet_addr addr = { .ip = get->addr };
-       int ret = 0;
-
-       if (get->fwmark)
-               svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
-       else
-               svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
-                                         get->port);
-
-       if (svc) {
-               int count = 0;
-               struct ip_vs_dest *dest;
-               struct ip_vs_dest_entry entry;
-
-               list_for_each_entry(dest, &svc->destinations, n_list) {
-                       if (count >= get->num_dests)
-                               break;
-
-                       entry.addr = dest->addr.ip;
-                       entry.port = dest->port;
-                       entry.conn_flags = atomic_read(&dest->conn_flags);
-                       entry.weight = atomic_read(&dest->weight);
-                       entry.u_threshold = dest->u_threshold;
-                       entry.l_threshold = dest->l_threshold;
-                       entry.activeconns = atomic_read(&dest->activeconns);
-                       entry.inactconns = atomic_read(&dest->inactconns);
-                       entry.persistconns = atomic_read(&dest->persistconns);
-                       ip_vs_copy_stats(&entry.stats, &dest->stats);
-                       if (copy_to_user(&uptr->entrytable[count],
-                                        &entry, sizeof(entry))) {
-                               ret = -EFAULT;
-                               break;
-                       }
-                       count++;
-               }
-               ip_vs_service_put(svc);
-       } else
-               ret = -ESRCH;
-       return ret;
-}
-
-static inline void
-__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
-{
-#ifdef CONFIG_IP_VS_PROTO_TCP
-       u->tcp_timeout =
-               ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
-       u->tcp_fin_timeout =
-               ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
-#endif
-#ifdef CONFIG_IP_VS_PROTO_UDP
-       u->udp_timeout =
-               ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
-#endif
-}
-
-
-#define GET_CMDID(cmd)         (cmd - IP_VS_BASE_CTL)
-#define GET_INFO_ARG_LEN       (sizeof(struct ip_vs_getinfo))
-#define GET_SERVICES_ARG_LEN   (sizeof(struct ip_vs_get_services))
-#define GET_SERVICE_ARG_LEN    (sizeof(struct ip_vs_service_entry))
-#define GET_DESTS_ARG_LEN      (sizeof(struct ip_vs_get_dests))
-#define GET_TIMEOUT_ARG_LEN    (sizeof(struct ip_vs_timeout_user))
-#define GET_DAEMON_ARG_LEN     (sizeof(struct ip_vs_daemon_user) * 2)
-
-static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
-       [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
-       [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
-       [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
-       [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
-       [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
-       [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
-       [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
-};
-
-static int
-do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
-{
-       unsigned char arg[128];
-       int ret = 0;
-
-       if (!capable(CAP_NET_ADMIN))
-               return -EPERM;
-
-       if (*len < get_arglen[GET_CMDID(cmd)]) {
-               IP_VS_ERR("get_ctl: len %u < %u\n",
-                         *len, get_arglen[GET_CMDID(cmd)]);
-               return -EINVAL;
-       }
-
-       if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
-               return -EFAULT;
-
-       if (mutex_lock_interruptible(&__ip_vs_mutex))
-               return -ERESTARTSYS;
-
-       switch (cmd) {
-       case IP_VS_SO_GET_VERSION:
-       {
-               char buf[64];
-
-               sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
-                       NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
-               if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
-                       ret = -EFAULT;
-                       goto out;
-               }
-               *len = strlen(buf)+1;
-       }
-       break;
-
-       case IP_VS_SO_GET_INFO:
-       {
-               struct ip_vs_getinfo info;
-               info.version = IP_VS_VERSION_CODE;
-               info.size = IP_VS_CONN_TAB_SIZE;
-               info.num_services = ip_vs_num_services;
-               if (copy_to_user(user, &info, sizeof(info)) != 0)
-                       ret = -EFAULT;
-       }
-       break;
-
-       case IP_VS_SO_GET_SERVICES:
-       {
-               struct ip_vs_get_services *get;
-               int size;
-
-               get = (struct ip_vs_get_services *)arg;
-               size = sizeof(*get) +
-                       sizeof(struct ip_vs_service_entry) * get->num_services;
-               if (*len != size) {
-                       IP_VS_ERR("length: %u != %u\n", *len, size);
-                       ret = -EINVAL;
-                       goto out;
-               }
-               ret = __ip_vs_get_service_entries(get, user);
-       }
-       break;
-
-       case IP_VS_SO_GET_SERVICE:
-       {
-               struct ip_vs_service_entry *entry;
-               struct ip_vs_service *svc;
-               union nf_inet_addr addr;
-
-               entry = (struct ip_vs_service_entry *)arg;
-               addr.ip = entry->addr;
-               if (entry->fwmark)
-                       svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
-               else
-                       svc = __ip_vs_service_get(AF_INET, entry->protocol,
-                                                 &addr, entry->port);
-               if (svc) {
-                       ip_vs_copy_service(entry, svc);
-                       if (copy_to_user(user, entry, sizeof(*entry)) != 0)
-                               ret = -EFAULT;
-                       ip_vs_service_put(svc);
-               } else
-                       ret = -ESRCH;
-       }
-       break;
-
-       case IP_VS_SO_GET_DESTS:
-       {
-               struct ip_vs_get_dests *get;
-               int size;
-
-               get = (struct ip_vs_get_dests *)arg;
-               size = sizeof(*get) +
-                       sizeof(struct ip_vs_dest_entry) * get->num_dests;
-               if (*len != size) {
-                       IP_VS_ERR("length: %u != %u\n", *len, size);
-                       ret = -EINVAL;
-                       goto out;
-               }
-               ret = __ip_vs_get_dest_entries(get, user);
-       }
-       break;
-
-       case IP_VS_SO_GET_TIMEOUT:
-       {
-               struct ip_vs_timeout_user t;
-
-               __ip_vs_get_timeouts(&t);
-               if (copy_to_user(user, &t, sizeof(t)) != 0)
-                       ret = -EFAULT;
-       }
-       break;
-
-       case IP_VS_SO_GET_DAEMON:
-       {
-               struct ip_vs_daemon_user d[2];
-
-               memset(&d, 0, sizeof(d));
-               if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
-                       d[0].state = IP_VS_STATE_MASTER;
-                       strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
-                       d[0].syncid = ip_vs_master_syncid;
-               }
-               if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
-                       d[1].state = IP_VS_STATE_BACKUP;
-                       strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
-                       d[1].syncid = ip_vs_backup_syncid;
-               }
-               if (copy_to_user(user, &d, sizeof(d)) != 0)
-                       ret = -EFAULT;
-       }
-       break;
-
-       default:
-               ret = -EINVAL;
-       }
-
-  out:
-       mutex_unlock(&__ip_vs_mutex);
-       return ret;
-}
-
-
-static struct nf_sockopt_ops ip_vs_sockopts = {
-       .pf             = PF_INET,
-       .set_optmin     = IP_VS_BASE_CTL,
-       .set_optmax     = IP_VS_SO_SET_MAX+1,
-       .set            = do_ip_vs_set_ctl,
-       .get_optmin     = IP_VS_BASE_CTL,
-       .get_optmax     = IP_VS_SO_GET_MAX+1,
-       .get            = do_ip_vs_get_ctl,
-       .owner          = THIS_MODULE,
-};
-
-/*
- * Generic Netlink interface
- */
-
-/* IPVS genetlink family */
-static struct genl_family ip_vs_genl_family = {
-       .id             = GENL_ID_GENERATE,
-       .hdrsize        = 0,
-       .name           = IPVS_GENL_NAME,
-       .version        = IPVS_GENL_VERSION,
-       .maxattr        = IPVS_CMD_MAX,
-};
-
-/* Policy used for first-level command attributes */
-static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
-       [IPVS_CMD_ATTR_SERVICE]         = { .type = NLA_NESTED },
-       [IPVS_CMD_ATTR_DEST]            = { .type = NLA_NESTED },
-       [IPVS_CMD_ATTR_DAEMON]          = { .type = NLA_NESTED },
-       [IPVS_CMD_ATTR_TIMEOUT_TCP]     = { .type = NLA_U32 },
-       [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
-       [IPVS_CMD_ATTR_TIMEOUT_UDP]     = { .type = NLA_U32 },
-};
-
-/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
-static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
-       [IPVS_DAEMON_ATTR_STATE]        = { .type = NLA_U32 },
-       [IPVS_DAEMON_ATTR_MCAST_IFN]    = { .type = NLA_NUL_STRING,
-                                           .len = IP_VS_IFNAME_MAXLEN },
-       [IPVS_DAEMON_ATTR_SYNC_ID]      = { .type = NLA_U32 },
-};
-
-/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
-static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
-       [IPVS_SVC_ATTR_AF]              = { .type = NLA_U16 },
-       [IPVS_SVC_ATTR_PROTOCOL]        = { .type = NLA_U16 },
-       [IPVS_SVC_ATTR_ADDR]            = { .type = NLA_BINARY,
-                                           .len = sizeof(union nf_inet_addr) },
-       [IPVS_SVC_ATTR_PORT]            = { .type = NLA_U16 },
-       [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
-       [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
-                                           .len = IP_VS_SCHEDNAME_MAXLEN },
-       [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
-                                           .len = sizeof(struct ip_vs_flags) },
-       [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
-       [IPVS_SVC_ATTR_NETMASK]         = { .type = NLA_U32 },
-       [IPVS_SVC_ATTR_STATS]           = { .type = NLA_NESTED },
-};
-
-/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
-static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
-       [IPVS_DEST_ATTR_ADDR]           = { .type = NLA_BINARY,
-                                           .len = sizeof(union nf_inet_addr) },
-       [IPVS_DEST_ATTR_PORT]           = { .type = NLA_U16 },
-       [IPVS_DEST_ATTR_FWD_METHOD]     = { .type = NLA_U32 },
-       [IPVS_DEST_ATTR_WEIGHT]         = { .type = NLA_U32 },
-       [IPVS_DEST_ATTR_U_THRESH]       = { .type = NLA_U32 },
-       [IPVS_DEST_ATTR_L_THRESH]       = { .type = NLA_U32 },
-       [IPVS_DEST_ATTR_ACTIVE_CONNS]   = { .type = NLA_U32 },
-       [IPVS_DEST_ATTR_INACT_CONNS]    = { .type = NLA_U32 },
-       [IPVS_DEST_ATTR_PERSIST_CONNS]  = { .type = NLA_U32 },
-       [IPVS_DEST_ATTR_STATS]          = { .type = NLA_NESTED },
-};
-
-static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
-                                struct ip_vs_stats *stats)
-{
-       struct nlattr *nl_stats = nla_nest_start(skb, container_type);
-       if (!nl_stats)
-               return -EMSGSIZE;
-
-       spin_lock_bh(&stats->lock);
-
-       NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
-       NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
-       NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
-       NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
-       NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
-       NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
-       NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
-       NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
-       NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
-       NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
-
-       spin_unlock_bh(&stats->lock);
-
-       nla_nest_end(skb, nl_stats);
-
-       return 0;
-
-nla_put_failure:
-       spin_unlock_bh(&stats->lock);
-       nla_nest_cancel(skb, nl_stats);
-       return -EMSGSIZE;
-}
-
-static int ip_vs_genl_fill_service(struct sk_buff *skb,
-                                  struct ip_vs_service *svc)
-{
-       struct nlattr *nl_service;
-       struct ip_vs_flags flags = { .flags = svc->flags,
-                                    .mask = ~0 };
-
-       nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
-       if (!nl_service)
-               return -EMSGSIZE;
-
-       NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
-
-       if (svc->fwmark) {
-               NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
-       } else {
-               NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
-               NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
-               NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
-       }
-
-       NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
-       NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
-       NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
-       NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
-
-       if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
-               goto nla_put_failure;
-
-       nla_nest_end(skb, nl_service);
-
-       return 0;
-
-nla_put_failure:
-       nla_nest_cancel(skb, nl_service);
-       return -EMSGSIZE;
-}
-
-static int ip_vs_genl_dump_service(struct sk_buff *skb,
-                                  struct ip_vs_service *svc,
-                                  struct netlink_callback *cb)
-{
-       void *hdr;
-
-       hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
-                         &ip_vs_genl_family, NLM_F_MULTI,
-                         IPVS_CMD_NEW_SERVICE);
-       if (!hdr)
-               return -EMSGSIZE;
-
-       if (ip_vs_genl_fill_service(skb, svc) < 0)
-               goto nla_put_failure;
-
-       return genlmsg_end(skb, hdr);
-
-nla_put_failure:
-       genlmsg_cancel(skb, hdr);
-       return -EMSGSIZE;
-}
-
-static int ip_vs_genl_dump_services(struct sk_buff *skb,
-                                   struct netlink_callback *cb)
-{
-       int idx = 0, i;
-       int start = cb->args[0];
-       struct ip_vs_service *svc;
-
-       mutex_lock(&__ip_vs_mutex);
-       for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
-               list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
-                       if (++idx <= start)
-                               continue;
-                       if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
-                               idx--;
-                               goto nla_put_failure;
-                       }
-               }
-       }
-
-       for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
-               list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
-                       if (++idx <= start)
-                               continue;
-                       if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
-                               idx--;
-                               goto nla_put_failure;
-                       }
-               }
-       }
-
-nla_put_failure:
-       mutex_unlock(&__ip_vs_mutex);
-       cb->args[0] = idx;
-
-       return skb->len;
-}
-
-static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
-                                   struct nlattr *nla, int full_entry)
-{
-       struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
-       struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
-
-       /* Parse mandatory identifying service fields first */
-       if (nla == NULL ||
-           nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
-               return -EINVAL;
-
-       nla_af          = attrs[IPVS_SVC_ATTR_AF];
-       nla_protocol    = attrs[IPVS_SVC_ATTR_PROTOCOL];
-       nla_addr        = attrs[IPVS_SVC_ATTR_ADDR];
-       nla_port        = attrs[IPVS_SVC_ATTR_PORT];
-       nla_fwmark      = attrs[IPVS_SVC_ATTR_FWMARK];
-
-       if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
-               return -EINVAL;
-
-       usvc->af = nla_get_u16(nla_af);
-#ifdef CONFIG_IP_VS_IPV6
-       if (usvc->af != AF_INET && usvc->af != AF_INET6)
-#else
-       if (usvc->af != AF_INET)
-#endif
-               return -EAFNOSUPPORT;
-
-       if (nla_fwmark) {
-               usvc->protocol = IPPROTO_TCP;
-               usvc->fwmark = nla_get_u32(nla_fwmark);
-       } else {
-               usvc->protocol = nla_get_u16(nla_protocol);
-               nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
-               usvc->port = nla_get_u16(nla_port);
-               usvc->fwmark = 0;
-       }
-
-       /* If a full entry was requested, check for the additional fields */
-       if (full_entry) {
-               struct nlattr *nla_sched, *nla_flags, *nla_timeout,
-                             *nla_netmask;
-               struct ip_vs_flags flags;
-               struct ip_vs_service *svc;
-
-               nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
-               nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
-               nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
-               nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
-
-               if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
-                       return -EINVAL;
-
-               nla_memcpy(&flags, nla_flags, sizeof(flags));
-
-               /* prefill flags from service if it already exists */
-               if (usvc->fwmark)
-                       svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
-               else
-                       svc = __ip_vs_service_get(usvc->af, usvc->protocol,
-                                                 &usvc->addr, usvc->port);
-               if (svc) {
-                       usvc->flags = svc->flags;
-                       ip_vs_service_put(svc);
-               } else
-                       usvc->flags = 0;
-
-               /* set new flags from userland */
-               usvc->flags = (usvc->flags & ~flags.mask) |
-                             (flags.flags & flags.mask);
-               usvc->sched_name = nla_data(nla_sched);
-               usvc->timeout = nla_get_u32(nla_timeout);
-               usvc->netmask = nla_get_u32(nla_netmask);
-       }
-
-       return 0;
-}
-
-static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
-{
-       struct ip_vs_service_user_kern usvc;
-       int ret;
-
-       ret = ip_vs_genl_parse_service(&usvc, nla, 0);
-       if (ret)
-               return ERR_PTR(ret);
-
-       if (usvc.fwmark)
-               return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-       else
-               return __ip_vs_service_get(usvc.af, usvc.protocol,
-                                          &usvc.addr, usvc.port);
-}
-
-static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
-{
-       struct nlattr *nl_dest;
-
-       nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
-       if (!nl_dest)
-               return -EMSGSIZE;
-
-       NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
-       NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
-
-       NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
-                   atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
-       NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
-       NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
-       NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
-       NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
-                   atomic_read(&dest->activeconns));
-       NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
-                   atomic_read(&dest->inactconns));
-       NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
-                   atomic_read(&dest->persistconns));
-
-       if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
-               goto nla_put_failure;
-
-       nla_nest_end(skb, nl_dest);
-
-       return 0;
-
-nla_put_failure:
-       nla_nest_cancel(skb, nl_dest);
-       return -EMSGSIZE;
-}
-
-static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
-                               struct netlink_callback *cb)
-{
-       void *hdr;
-
-       hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
-                         &ip_vs_genl_family, NLM_F_MULTI,
-                         IPVS_CMD_NEW_DEST);
-       if (!hdr)
-               return -EMSGSIZE;
-
-       if (ip_vs_genl_fill_dest(skb, dest) < 0)
-               goto nla_put_failure;
-
-       return genlmsg_end(skb, hdr);
-
-nla_put_failure:
-       genlmsg_cancel(skb, hdr);
-       return -EMSGSIZE;
-}
-
-static int ip_vs_genl_dump_dests(struct sk_buff *skb,
-                                struct netlink_callback *cb)
-{
-       int idx = 0;
-       int start = cb->args[0];
-       struct ip_vs_service *svc;
-       struct ip_vs_dest *dest;
-       struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
-
-       mutex_lock(&__ip_vs_mutex);
-
-       /* Try to find the service for which to dump destinations */
-       if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
-                       IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
-               goto out_err;
-
-       svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
-       if (IS_ERR(svc) || svc == NULL)
-               goto out_err;
-
-       /* Dump the destinations */
-       list_for_each_entry(dest, &svc->destinations, n_list) {
-               if (++idx <= start)
-                       continue;
-               if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
-                       idx--;
-                       goto nla_put_failure;
-               }
-       }
-
-nla_put_failure:
-       cb->args[0] = idx;
-       ip_vs_service_put(svc);
-
-out_err:
-       mutex_unlock(&__ip_vs_mutex);
-
-       return skb->len;
-}
-
-static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
-                                struct nlattr *nla, int full_entry)
-{
-       struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
-       struct nlattr *nla_addr, *nla_port;
-
-       /* Parse mandatory identifying destination fields first */
-       if (nla == NULL ||
-           nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
-               return -EINVAL;
-
-       nla_addr        = attrs[IPVS_DEST_ATTR_ADDR];
-       nla_port        = attrs[IPVS_DEST_ATTR_PORT];
-
-       if (!(nla_addr && nla_port))
-               return -EINVAL;
-
-       nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
-       udest->port = nla_get_u16(nla_port);
-
-       /* If a full entry was requested, check for the additional fields */
-       if (full_entry) {
-               struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
-                             *nla_l_thresh;
-
-               nla_fwd         = attrs[IPVS_DEST_ATTR_FWD_METHOD];
-               nla_weight      = attrs[IPVS_DEST_ATTR_WEIGHT];
-               nla_u_thresh    = attrs[IPVS_DEST_ATTR_U_THRESH];
-               nla_l_thresh    = attrs[IPVS_DEST_ATTR_L_THRESH];
-
-               if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
-                       return -EINVAL;
-
-               udest->conn_flags = nla_get_u32(nla_fwd)
-                                   & IP_VS_CONN_F_FWD_MASK;
-               udest->weight = nla_get_u32(nla_weight);
-               udest->u_threshold = nla_get_u32(nla_u_thresh);
-               udest->l_threshold = nla_get_u32(nla_l_thresh);
-       }
-
-       return 0;
-}
-
-static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
-                                 const char *mcast_ifn, __be32 syncid)
-{
-       struct nlattr *nl_daemon;
-
-       nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
-       if (!nl_daemon)
-               return -EMSGSIZE;
-
-       NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
-       NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
-       NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
-
-       nla_nest_end(skb, nl_daemon);
-
-       return 0;
-
-nla_put_failure:
-       nla_nest_cancel(skb, nl_daemon);
-       return -EMSGSIZE;
-}
-
-static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
-                                 const char *mcast_ifn, __be32 syncid,
-                                 struct netlink_callback *cb)
-{
-       void *hdr;
-       hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
-                         &ip_vs_genl_family, NLM_F_MULTI,
-                         IPVS_CMD_NEW_DAEMON);
-       if (!hdr)
-               return -EMSGSIZE;
-
-       if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
-               goto nla_put_failure;
-
-       return genlmsg_end(skb, hdr);
-
-nla_put_failure:
-       genlmsg_cancel(skb, hdr);
-       return -EMSGSIZE;
-}
-
-static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
-                                  struct netlink_callback *cb)
-{
-       mutex_lock(&__ip_vs_mutex);
-       if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
-               if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
-                                          ip_vs_master_mcast_ifn,
-                                          ip_vs_master_syncid, cb) < 0)
-                       goto nla_put_failure;
-
-               cb->args[0] = 1;
-       }
-
-       if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
-               if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
-                                          ip_vs_backup_mcast_ifn,
-                                          ip_vs_backup_syncid, cb) < 0)
-                       goto nla_put_failure;
-
-               cb->args[1] = 1;
-       }
-
-nla_put_failure:
-       mutex_unlock(&__ip_vs_mutex);
-
-       return skb->len;
-}
-
-static int ip_vs_genl_new_daemon(struct nlattr **attrs)
-{
-       if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
-             attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
-             attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
-               return -EINVAL;
-
-       return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
-                                nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
-                                nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
-}
-
-static int ip_vs_genl_del_daemon(struct nlattr **attrs)
-{
-       if (!attrs[IPVS_DAEMON_ATTR_STATE])
-               return -EINVAL;
-
-       return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
-}
-
-static int ip_vs_genl_set_config(struct nlattr **attrs)
-{
-       struct ip_vs_timeout_user t;
-
-       __ip_vs_get_timeouts(&t);
-
-       if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
-               t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
-
-       if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
-               t.tcp_fin_timeout =
-                       nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
-
-       if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
-               t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
-
-       return ip_vs_set_timeout(&t);
-}
-
-static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
-{
-       struct ip_vs_service *svc = NULL;
-       struct ip_vs_service_user_kern usvc;
-       struct ip_vs_dest_user_kern udest;
-       int ret = 0, cmd;
-       int need_full_svc = 0, need_full_dest = 0;
-
-       cmd = info->genlhdr->cmd;
-
-       mutex_lock(&__ip_vs_mutex);
-
-       if (cmd == IPVS_CMD_FLUSH) {
-               ret = ip_vs_flush();
-               goto out;
-       } else if (cmd == IPVS_CMD_SET_CONFIG) {
-               ret = ip_vs_genl_set_config(info->attrs);
-               goto out;
-       } else if (cmd == IPVS_CMD_NEW_DAEMON ||
-                  cmd == IPVS_CMD_DEL_DAEMON) {
-
-               struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
-
-               if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
-                   nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
-                                    info->attrs[IPVS_CMD_ATTR_DAEMON],
-                                    ip_vs_daemon_policy)) {
-                       ret = -EINVAL;
-                       goto out;
-               }
-
-               if (cmd == IPVS_CMD_NEW_DAEMON)
-                       ret = ip_vs_genl_new_daemon(daemon_attrs);
-               else
-                       ret = ip_vs_genl_del_daemon(daemon_attrs);
-               goto out;
-       } else if (cmd == IPVS_CMD_ZERO &&
-                  !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
-               ret = ip_vs_zero_all();
-               goto out;
-       }
-
-       /* All following commands require a service argument, so check if we
-        * received a valid one. We need a full service specification when
-        * adding / editing a service. Only identifying members otherwise. */
-       if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
-               need_full_svc = 1;
-
-       ret = ip_vs_genl_parse_service(&usvc,
-                                      info->attrs[IPVS_CMD_ATTR_SERVICE],
-                                      need_full_svc);
-       if (ret)
-               goto out;
-
-       /* Lookup the exact service by <protocol, addr, port> or fwmark */
-       if (usvc.fwmark == 0)
-               svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-                                         &usvc.addr, usvc.port);
-       else
-               svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-
-       /* Unless we're adding a new service, the service must already exist */
-       if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
-               ret = -ESRCH;
-               goto out;
-       }
-
-       /* Destination commands require a valid destination argument. For
-        * adding / editing a destination, we need a full destination
-        * specification. */
-       if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
-           cmd == IPVS_CMD_DEL_DEST) {
-               if (cmd != IPVS_CMD_DEL_DEST)
-                       need_full_dest = 1;
-
-               ret = ip_vs_genl_parse_dest(&udest,
-                                           info->attrs[IPVS_CMD_ATTR_DEST],
-                                           need_full_dest);
-               if (ret)
-                       goto out;
-       }
-
-       switch (cmd) {
-       case IPVS_CMD_NEW_SERVICE:
-               if (svc == NULL)
-                       ret = ip_vs_add_service(&usvc, &svc);
-               else
-                       ret = -EEXIST;
-               break;
-       case IPVS_CMD_SET_SERVICE:
-               ret = ip_vs_edit_service(svc, &usvc);
-               break;
-       case IPVS_CMD_DEL_SERVICE:
-               ret = ip_vs_del_service(svc);
-               break;
-       case IPVS_CMD_NEW_DEST:
-               ret = ip_vs_add_dest(svc, &udest);
-               break;
-       case IPVS_CMD_SET_DEST:
-               ret = ip_vs_edit_dest(svc, &udest);
-               break;
-       case IPVS_CMD_DEL_DEST:
-               ret = ip_vs_del_dest(svc, &udest);
-               break;
-       case IPVS_CMD_ZERO:
-               ret = ip_vs_zero_service(svc);
-               break;
-       default:
-               ret = -EINVAL;
-       }
-
-out:
-       if (svc)
-               ip_vs_service_put(svc);
-       mutex_unlock(&__ip_vs_mutex);
-
-       return ret;
-}
-
-static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
-{
-       struct sk_buff *msg;
-       void *reply;
-       int ret, cmd, reply_cmd;
-
-       cmd = info->genlhdr->cmd;
-
-       if (cmd == IPVS_CMD_GET_SERVICE)
-               reply_cmd = IPVS_CMD_NEW_SERVICE;
-       else if (cmd == IPVS_CMD_GET_INFO)
-               reply_cmd = IPVS_CMD_SET_INFO;
-       else if (cmd == IPVS_CMD_GET_CONFIG)
-               reply_cmd = IPVS_CMD_SET_CONFIG;
-       else {
-               IP_VS_ERR("unknown Generic Netlink command\n");
-               return -EINVAL;
-       }
-
-       msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-       if (!msg)
-               return -ENOMEM;
-
-       mutex_lock(&__ip_vs_mutex);
-
-       reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
-       if (reply == NULL)
-               goto nla_put_failure;
-
-       switch (cmd) {
-       case IPVS_CMD_GET_SERVICE:
-       {
-               struct ip_vs_service *svc;
-
-               svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
-               if (IS_ERR(svc)) {
-                       ret = PTR_ERR(svc);
-                       goto out_err;
-               } else if (svc) {
-                       ret = ip_vs_genl_fill_service(msg, svc);
-                       ip_vs_service_put(svc);
-                       if (ret)
-                               goto nla_put_failure;
-               } else {
-                       ret = -ESRCH;
-                       goto out_err;
-               }
-
-               break;
-       }
-
-       case IPVS_CMD_GET_CONFIG:
-       {
-               struct ip_vs_timeout_user t;
-
-               __ip_vs_get_timeouts(&t);
-#ifdef CONFIG_IP_VS_PROTO_TCP
-               NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
-               NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
-                           t.tcp_fin_timeout);
-#endif
-#ifdef CONFIG_IP_VS_PROTO_UDP
-               NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
-#endif
-
-               break;
-       }
-
-       case IPVS_CMD_GET_INFO:
-               NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
-               NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
-                           IP_VS_CONN_TAB_SIZE);
-               break;
-       }
-
-       genlmsg_end(msg, reply);
-       ret = genlmsg_unicast(msg, info->snd_pid);
-       goto out;
-
-nla_put_failure:
-       IP_VS_ERR("not enough space in Netlink message\n");
-       ret = -EMSGSIZE;
-
-out_err:
-       nlmsg_free(msg);
-out:
-       mutex_unlock(&__ip_vs_mutex);
-
-       return ret;
-}
-
-
-static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
-       {
-               .cmd    = IPVS_CMD_NEW_SERVICE,
-               .flags  = GENL_ADMIN_PERM,
-               .policy = ip_vs_cmd_policy,
-               .doit   = ip_vs_genl_set_cmd,
-       },
-       {
-               .cmd    = IPVS_CMD_SET_SERVICE,
-               .flags  = GENL_ADMIN_PERM,
-               .policy = ip_vs_cmd_policy,
-               .doit   = ip_vs_genl_set_cmd,
-       },
-       {
-               .cmd    = IPVS_CMD_DEL_SERVICE,
-               .flags  = GENL_ADMIN_PERM,
-               .policy = ip_vs_cmd_policy,
-               .doit   = ip_vs_genl_set_cmd,
-       },
-       {
-               .cmd    = IPVS_CMD_GET_SERVICE,
-               .flags  = GENL_ADMIN_PERM,
-               .doit   = ip_vs_genl_get_cmd,
-               .dumpit = ip_vs_genl_dump_services,
-               .policy = ip_vs_cmd_policy,
-       },
-       {
-               .cmd    = IPVS_CMD_NEW_DEST,
-               .flags  = GENL_ADMIN_PERM,
-               .policy = ip_vs_cmd_policy,
-               .doit   = ip_vs_genl_set_cmd,
-       },
-       {
-               .cmd    = IPVS_CMD_SET_DEST,
-               .flags  = GENL_ADMIN_PERM,
-               .policy = ip_vs_cmd_policy,
-               .doit   = ip_vs_genl_set_cmd,
-       },
-       {
-               .cmd    = IPVS_CMD_DEL_DEST,
-               .flags  = GENL_ADMIN_PERM,
-               .policy = ip_vs_cmd_policy,
-               .doit   = ip_vs_genl_set_cmd,
-       },
-       {
-               .cmd    = IPVS_CMD_GET_DEST,
-               .flags  = GENL_ADMIN_PERM,
-               .policy = ip_vs_cmd_policy,
-               .dumpit = ip_vs_genl_dump_dests,
-       },
-       {
-               .cmd    = IPVS_CMD_NEW_DAEMON,
-               .flags  = GENL_ADMIN_PERM,
-               .policy = ip_vs_cmd_policy,
-               .doit   = ip_vs_genl_set_cmd,
-       },
-       {
-               .cmd    = IPVS_CMD_DEL_DAEMON,
-               .flags  = GENL_ADMIN_PERM,
-               .policy = ip_vs_cmd_policy,
-               .doit   = ip_vs_genl_set_cmd,
-       },
-       {
-               .cmd    = IPVS_CMD_GET_DAEMON,
-               .flags  = GENL_ADMIN_PERM,
-               .dumpit = ip_vs_genl_dump_daemons,
-       },
-       {
-               .cmd    = IPVS_CMD_SET_CONFIG,
-               .flags  = GENL_ADMIN_PERM,
-               .policy = ip_vs_cmd_policy,
-               .doit   = ip_vs_genl_set_cmd,
-       },
-       {
-               .cmd    = IPVS_CMD_GET_CONFIG,
-               .flags  = GENL_ADMIN_PERM,
-               .doit   = ip_vs_genl_get_cmd,
-       },
-       {
-               .cmd    = IPVS_CMD_GET_INFO,
-               .flags  = GENL_ADMIN_PERM,
-               .doit   = ip_vs_genl_get_cmd,
-       },
-       {
-               .cmd    = IPVS_CMD_ZERO,
-               .flags  = GENL_ADMIN_PERM,
-               .policy = ip_vs_cmd_policy,
-               .doit   = ip_vs_genl_set_cmd,
-       },
-       {
-               .cmd    = IPVS_CMD_FLUSH,
-               .flags  = GENL_ADMIN_PERM,
-               .doit   = ip_vs_genl_set_cmd,
-       },
-};
-
-static int __init ip_vs_genl_register(void)
-{
-       int ret, i;
-
-       ret = genl_register_family(&ip_vs_genl_family);
-       if (ret)
-               return ret;
-
-       for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) {
-               ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]);
-               if (ret)
-                       goto err_out;
-       }
-       return 0;
-
-err_out:
-       genl_unregister_family(&ip_vs_genl_family);
-       return ret;
-}
-
-static void ip_vs_genl_unregister(void)
-{
-       genl_unregister_family(&ip_vs_genl_family);
-}
-
-/* End of Generic Netlink interface definitions */
-
-
-int __init ip_vs_control_init(void)
-{
-       int ret;
-       int idx;
-
-       EnterFunction(2);
-
-       ret = nf_register_sockopt(&ip_vs_sockopts);
-       if (ret) {
-               IP_VS_ERR("cannot register sockopt.\n");
-               return ret;
-       }
-
-       ret = ip_vs_genl_register();
-       if (ret) {
-               IP_VS_ERR("cannot register Generic Netlink interface.\n");
-               nf_unregister_sockopt(&ip_vs_sockopts);
-               return ret;
-       }
-
-       proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
-       proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
-
-       sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
-
-       /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
-       for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
-               INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
-               INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
-       }
-       for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
-               INIT_LIST_HEAD(&ip_vs_rtable[idx]);
-       }
-
-       ip_vs_new_estimator(&ip_vs_stats);
-
-       /* Hook the defense timer */
-       schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
-
-       LeaveFunction(2);
-       return 0;
-}
-
-
-void ip_vs_control_cleanup(void)
-{
-       EnterFunction(2);
-       ip_vs_trash_cleanup();
-       cancel_rearming_delayed_work(&defense_work);
-       cancel_work_sync(&defense_work.work);
-       ip_vs_kill_estimator(&ip_vs_stats);
-       unregister_sysctl_table(sysctl_header);
-       proc_net_remove(&init_net, "ip_vs_stats");
-       proc_net_remove(&init_net, "ip_vs");
-       ip_vs_genl_unregister();
-       nf_unregister_sockopt(&ip_vs_sockopts);
-       LeaveFunction(2);
-}
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c

deleted file mode 100644 (file)

index 2eb2860..0000000
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * ip_vs_est.c: simple rate estimator for IPVS
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-#include <linux/kernel.h>
-#include <linux/jiffies.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/interrupt.h>
-#include <linux/sysctl.h>
-#include <linux/list.h>
-
-#include <net/ip_vs.h>
-
-/*
-  This code is to estimate rate in a shorter interval (such as 8
-  seconds) for virtual services and real servers. For measure rate in a
-  long interval, it is easy to implement a user level daemon which
-  periodically reads those statistical counters and measure rate.
-
-  Currently, the measurement is activated by slow timer handler. Hope
-  this measurement will not introduce too much load.
-
-  We measure rate during the last 8 seconds every 2 seconds:
-
-    avgrate = avgrate*(1-W) + rate*W
-
-    where W = 2^(-2)
-
-  NOTES.
-
-  * The stored value for average bps is scaled by 2^5, so that maximal
-    rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
-
-  * A lot code is taken from net/sched/estimator.c
- */
-
-
-static void estimation_timer(unsigned long arg);
-
-static LIST_HEAD(est_list);
-static DEFINE_SPINLOCK(est_lock);
-static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
-
-static void estimation_timer(unsigned long arg)
-{
-       struct ip_vs_estimator *e;
-       struct ip_vs_stats *s;
-       u32 n_conns;
-       u32 n_inpkts, n_outpkts;
-       u64 n_inbytes, n_outbytes;
-       u32 rate;
-
-       spin_lock(&est_lock);
-       list_for_each_entry(e, &est_list, list) {
-               s = container_of(e, struct ip_vs_stats, est);
-
-               spin_lock(&s->lock);
-               n_conns = s->ustats.conns;
-               n_inpkts = s->ustats.inpkts;
-               n_outpkts = s->ustats.outpkts;
-               n_inbytes = s->ustats.inbytes;
-               n_outbytes = s->ustats.outbytes;
-
-               /* scaled by 2^10, but divided 2 seconds */
-               rate = (n_conns - e->last_conns)<<9;
-               e->last_conns = n_conns;
-               e->cps += ((long)rate - (long)e->cps)>>2;
-               s->ustats.cps = (e->cps+0x1FF)>>10;
-
-               rate = (n_inpkts - e->last_inpkts)<<9;
-               e->last_inpkts = n_inpkts;
-               e->inpps += ((long)rate - (long)e->inpps)>>2;
-               s->ustats.inpps = (e->inpps+0x1FF)>>10;
-
-               rate = (n_outpkts - e->last_outpkts)<<9;
-               e->last_outpkts = n_outpkts;
-               e->outpps += ((long)rate - (long)e->outpps)>>2;
-               s->ustats.outpps = (e->outpps+0x1FF)>>10;
-
-               rate = (n_inbytes - e->last_inbytes)<<4;
-               e->last_inbytes = n_inbytes;
-               e->inbps += ((long)rate - (long)e->inbps)>>2;
-               s->ustats.inbps = (e->inbps+0xF)>>5;
-
-               rate = (n_outbytes - e->last_outbytes)<<4;
-               e->last_outbytes = n_outbytes;
-               e->outbps += ((long)rate - (long)e->outbps)>>2;
-               s->ustats.outbps = (e->outbps+0xF)>>5;
-               spin_unlock(&s->lock);
-       }
-       spin_unlock(&est_lock);
-       mod_timer(&est_timer, jiffies + 2*HZ);
-}
-
-void ip_vs_new_estimator(struct ip_vs_stats *stats)
-{
-       struct ip_vs_estimator *est = &stats->est;
-
-       INIT_LIST_HEAD(&est->list);
-
-       est->last_conns = stats->ustats.conns;
-       est->cps = stats->ustats.cps<<10;
-
-       est->last_inpkts = stats->ustats.inpkts;
-       est->inpps = stats->ustats.inpps<<10;
-
-       est->last_outpkts = stats->ustats.outpkts;
-       est->outpps = stats->ustats.outpps<<10;
-
-       est->last_inbytes = stats->ustats.inbytes;
-       est->inbps = stats->ustats.inbps<<5;
-
-       est->last_outbytes = stats->ustats.outbytes;
-       est->outbps = stats->ustats.outbps<<5;
-
-       spin_lock_bh(&est_lock);
-       list_add(&est->list, &est_list);
-       spin_unlock_bh(&est_lock);
-}
-
-void ip_vs_kill_estimator(struct ip_vs_stats *stats)
-{
-       struct ip_vs_estimator *est = &stats->est;
-
-       spin_lock_bh(&est_lock);
-       list_del(&est->list);
-       spin_unlock_bh(&est_lock);
-}
-
-void ip_vs_zero_estimator(struct ip_vs_stats *stats)
-{
-       struct ip_vs_estimator *est = &stats->est;
-
-       /* set counters zero, caller must hold the stats->lock lock */
-       est->last_inbytes = 0;
-       est->last_outbytes = 0;
-       est->last_conns = 0;
-       est->last_inpkts = 0;
-       est->last_outpkts = 0;
-       est->cps = 0;
-       est->inpps = 0;
-       est->outpps = 0;
-       est->inbps = 0;
-       est->outbps = 0;
-}
-
-int __init ip_vs_estimator_init(void)
-{
-       mod_timer(&est_timer, jiffies + 2 * HZ);
-       return 0;
-}
-
-void ip_vs_estimator_cleanup(void)
-{
-       del_timer_sync(&est_timer);
-}
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c

deleted file mode 100644 (file)

index 2e7dbd8..0000000
--- a/net/ipv4/ipvs/ip_vs_ftp.c
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * ip_vs_ftp.c: IPVS ftp application module
- *
- * Authors:    Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- * Changes:
- *
- *
- *     This program is free software; you can redistribute it and/or
- *     modify it under the terms of the GNU General Public License
- *     as published by the Free Software Foundation; either version
- *     2 of the License, or (at your option) any later version.
- *
- * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
- * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
- *
- *             IP_MASQ_FTP ftp masquerading module
- *
- * Version:    @(#)ip_masq_ftp.c 0.04   02/05/96
- *
- * Author:     Wouter Gadeyne
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <asm/unaligned.h>
-
-#include <net/ip_vs.h>
-
-
-#define SERVER_STRING "227 Entering Passive Mode ("
-#define CLIENT_STRING "PORT "
-
-
-/*
- * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
- * First port is set to the default port.
- */
-static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
-module_param_array(ports, ushort, NULL, 0);
-MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
-
-
-/*     Dummy variable */
-static int ip_vs_ftp_pasv;
-
-
-static int
-ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
-{
-       return 0;
-}
-
-
-static int
-ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
-{
-       return 0;
-}
-
-
-/*
- * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
- * with the "pattern" and terminated with the "term" character.
- * <addr,port> is in network order.
- */
-static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
-                                 const char *pattern, size_t plen, char term,
-                                 __be32 *addr, __be16 *port,
-                                 char **start, char **end)
-{
-       unsigned char p[6];
-       int i = 0;
-
-       if (data_limit - data < plen) {
-               /* check if there is partial match */
-               if (strnicmp(data, pattern, data_limit - data) == 0)
-                       return -1;
-               else
-                       return 0;
-       }
-
-       if (strnicmp(data, pattern, plen) != 0) {
-               return 0;
-       }
-       *start = data + plen;
-
-       for (data = *start; *data != term; data++) {
-               if (data == data_limit)
-                       return -1;
-       }
-       *end = data;
-
-       memset(p, 0, sizeof(p));
-       for (data = *start; data != *end; data++) {
-               if (*data >= '0' && *data <= '9') {
-                       p[i] = p[i]*10 + *data - '0';
-               } else if (*data == ',' && i < 5) {
-                       i++;
-               } else {
-                       /* unexpected character */
-                       return -1;
-               }
-       }
-
-       if (i != 5)
-               return -1;
-
-       *addr = get_unaligned((__be32 *)p);
-       *port = get_unaligned((__be16 *)(p + 4));
-       return 1;
-}
-
-
-/*
- * Look at outgoing ftp packets to catch the response to a PASV command
- * from the server (inside-to-outside).
- * When we see one, we build a connection entry with the client address,
- * client port 0 (unknown at the moment), the server address and the
- * server port.  Mark the current connection entry as a control channel
- * of the new entry. All this work is just to make the data connection
- * can be scheduled to the right server later.
- *
- * The outgoing packet should be something like
- *   "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
- * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
- */
-static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
-                        struct sk_buff *skb, int *diff)
-{
-       struct iphdr *iph;
-       struct tcphdr *th;
-       char *data, *data_limit;
-       char *start, *end;
-       union nf_inet_addr from;
-       __be16 port;
-       struct ip_vs_conn *n_cp;
-       char buf[24];           /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
-       unsigned buf_len;
-       int ret;
-
-#ifdef CONFIG_IP_VS_IPV6
-       /* This application helper doesn't work with IPv6 yet,
-        * so turn this into a no-op for IPv6 packets
-        */
-       if (cp->af == AF_INET6)
-               return 1;
-#endif
-
-       *diff = 0;
-
-       /* Only useful for established sessions */
-       if (cp->state != IP_VS_TCP_S_ESTABLISHED)
-               return 1;
-
-       /* Linear packets are much easier to deal with. */
-       if (!skb_make_writable(skb, skb->len))
-               return 0;
-
-       if (cp->app_data == &ip_vs_ftp_pasv) {
-               iph = ip_hdr(skb);
-               th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-               data = (char *)th + (th->doff << 2);
-               data_limit = skb_tail_pointer(skb);
-
-               if (ip_vs_ftp_get_addrport(data, data_limit,
-                                          SERVER_STRING,
-                                          sizeof(SERVER_STRING)-1, ')',
-                                          &from.ip, &port,
-                                          &start, &end) != 1)
-                       return 1;
-
-               IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> "
-                         "%u.%u.%u.%u:%d detected\n",
-                         NIPQUAD(from.ip), ntohs(port),
-                         NIPQUAD(cp->caddr.ip), 0);
-
-               /*
-                * Now update or create an connection entry for it
-                */
-               n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
-                                         &cp->caddr, 0);
-               if (!n_cp) {
-                       n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-                                             &cp->caddr, 0,
-                                             &cp->vaddr, port,
-                                             &from, port,
-                                             IP_VS_CONN_F_NO_CPORT,
-                                             cp->dest);
-                       if (!n_cp)
-                               return 0;
-
-                       /* add its controller */
-                       ip_vs_control_add(n_cp, cp);
-               }
-
-               /*
-                * Replace the old passive address with the new one
-                */
-               from.ip = n_cp->vaddr.ip;
-               port = n_cp->vport;
-               sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip),
-                       (ntohs(port)>>8)&255, ntohs(port)&255);
-               buf_len = strlen(buf);
-
-               /*
-                * Calculate required delta-offset to keep TCP happy
-                */
-               *diff = buf_len - (end-start);
-
-               if (*diff == 0) {
-                       /* simply replace it with new passive address */
-                       memcpy(start, buf, buf_len);
-                       ret = 1;
-               } else {
-                       ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start,
-                                         end-start, buf, buf_len);
-               }
-
-               cp->app_data = NULL;
-               ip_vs_tcp_conn_listen(n_cp);
-               ip_vs_conn_put(n_cp);
-               return ret;
-       }
-       return 1;
-}
-
-
-/*
- * Look at incoming ftp packets to catch the PASV/PORT command
- * (outside-to-inside).
- *
- * The incoming packet having the PORT command should be something like
- *      "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
- * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
- * In this case, we create a connection entry using the client address and
- * port, so that the active ftp data connection from the server can reach
- * the client.
- */
-static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
-                       struct sk_buff *skb, int *diff)
-{
-       struct iphdr *iph;
-       struct tcphdr *th;
-       char *data, *data_start, *data_limit;
-       char *start, *end;
-       union nf_inet_addr to;
-       __be16 port;
-       struct ip_vs_conn *n_cp;
-
-#ifdef CONFIG_IP_VS_IPV6
-       /* This application helper doesn't work with IPv6 yet,
-        * so turn this into a no-op for IPv6 packets
-        */
-       if (cp->af == AF_INET6)
-               return 1;
-#endif
-
-       /* no diff required for incoming packets */
-       *diff = 0;
-
-       /* Only useful for established sessions */
-       if (cp->state != IP_VS_TCP_S_ESTABLISHED)
-               return 1;
-
-       /* Linear packets are much easier to deal with. */
-       if (!skb_make_writable(skb, skb->len))
-               return 0;
-
-       /*
-        * Detecting whether it is passive
-        */
-       iph = ip_hdr(skb);
-       th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-
-       /* Since there may be OPTIONS in the TCP packet and the HLEN is
-          the length of the header in 32-bit multiples, it is accurate
-          to calculate data address by th+HLEN*4 */
-       data = data_start = (char *)th + (th->doff << 2);
-       data_limit = skb_tail_pointer(skb);
-
-       while (data <= data_limit - 6) {
-               if (strnicmp(data, "PASV\r\n", 6) == 0) {
-                       /* Passive mode on */
-                       IP_VS_DBG(7, "got PASV at %td of %td\n",
-                                 data - data_start,
-                                 data_limit - data_start);
-                       cp->app_data = &ip_vs_ftp_pasv;
-                       return 1;
-               }
-               data++;
-       }
-
-       /*
-        * To support virtual FTP server, the scenerio is as follows:
-        *       FTP client ----> Load Balancer ----> FTP server
-        * First detect the port number in the application data,
-        * then create a new connection entry for the coming data
-        * connection.
-        */
-       if (ip_vs_ftp_get_addrport(data_start, data_limit,
-                                  CLIENT_STRING, sizeof(CLIENT_STRING)-1,
-                                  '\r', &to.ip, &port,
-                                  &start, &end) != 1)
-               return 1;
-
-       IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n",
-                 NIPQUAD(to.ip), ntohs(port));
-
-       /* Passive mode off */
-       cp->app_data = NULL;
-
-       /*
-        * Now update or create a connection entry for it
-        */
-       IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
-                 ip_vs_proto_name(iph->protocol),
-                 NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0);
-
-       n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
-                                &to, port,
-                                &cp->vaddr, htons(ntohs(cp->vport)-1));
-       if (!n_cp) {
-               n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-                                     &to, port,
-                                     &cp->vaddr, htons(ntohs(cp->vport)-1),
-                                     &cp->daddr, htons(ntohs(cp->dport)-1),
-                                     0,
-                                     cp->dest);
-               if (!n_cp)
-                       return 0;
-
-               /* add its controller */
-               ip_vs_control_add(n_cp, cp);
-       }
-
-       /*
-        *      Move tunnel to listen state
-        */
-       ip_vs_tcp_conn_listen(n_cp);
-       ip_vs_conn_put(n_cp);
-
-       return 1;
-}
-
-
-static struct ip_vs_app ip_vs_ftp = {
-       .name =         "ftp",
-       .type =         IP_VS_APP_TYPE_FTP,
-       .protocol =     IPPROTO_TCP,
-       .module =       THIS_MODULE,
-       .incs_list =    LIST_HEAD_INIT(ip_vs_ftp.incs_list),
-       .init_conn =    ip_vs_ftp_init_conn,
-       .done_conn =    ip_vs_ftp_done_conn,
-       .bind_conn =    NULL,
-       .unbind_conn =  NULL,
-       .pkt_out =      ip_vs_ftp_out,
-       .pkt_in =       ip_vs_ftp_in,
-};
-
-
-/*
- *     ip_vs_ftp initialization
- */
-static int __init ip_vs_ftp_init(void)
-{
-       int i, ret;
-       struct ip_vs_app *app = &ip_vs_ftp;
-
-       ret = register_ip_vs_app(app);
-       if (ret)
-               return ret;
-
-       for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
-               if (!ports[i])
-                       continue;
-               ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
-               if (ret)
-                       break;
-               IP_VS_INFO("%s: loaded support on port[%d] = %d\n",
-                          app->name, i, ports[i]);
-       }
-
-       if (ret)
-               unregister_ip_vs_app(app);
-
-       return ret;
-}
-
-
-/*
- *     ip_vs_ftp finish.
- */
-static void __exit ip_vs_ftp_exit(void)
-{
-       unregister_ip_vs_app(&ip_vs_ftp);
-}
-
-
-module_init(ip_vs_ftp_init);
-module_exit(ip_vs_ftp_exit);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c

deleted file mode 100644 (file)

index 6ecef35..0000000
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ /dev/null
@@ -1,555 +0,0 @@
-/*
- * IPVS:        Locality-Based Least-Connection scheduling module
- *
- * Authors:     Wensong Zhang <wensong@gnuchina.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *     Martin Hamilton         :    fixed the terrible locking bugs
- *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
- *     Wensong Zhang           :    fixed the uninitilized tbl->lock bug
- *     Wensong Zhang           :    added doing full expiration check to
- *                                   collect stale entries of 24+ hours when
- *                                   no partial expire check in a half hour
- *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
- *                                   to avoid the possible race between timer
- *                                   handler and del_timer thread in SMP
- *
- */
-
-/*
- * The lblc algorithm is as follows (pseudo code):
- *
- *       if cachenode[dest_ip] is null then
- *               n, cachenode[dest_ip] <- {weighted least-conn node};
- *       else
- *               n <- cachenode[dest_ip];
- *               if (n is dead) OR
- *                  (n.conns>n.weight AND
- *                   there is a node m with m.conns<m.weight/2) then
- *                 n, cachenode[dest_ip] <- {weighted least-conn node};
- *
- *       return n;
- *
- * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
- * me to write this module.
- */
-
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/jiffies.h>
-
-/* for sysctl */
-#include <linux/fs.h>
-#include <linux/sysctl.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- *    It is for garbage collection of stale IPVS lblc entries,
- *    when the table is full.
- */
-#define CHECK_EXPIRE_INTERVAL   (60*HZ)
-#define ENTRY_TIMEOUT           (6*60*HZ)
-
-/*
- *    It is for full expiration check.
- *    When there is no partial expiration check (garbage collection)
- *    in a half hour, do a full expiration check to collect stale
- *    entries that haven't been touched for a day.
- */
-#define COUNT_FOR_FULL_EXPIRATION   30
-static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
-
-
-/*
- *     for IPVS lblc entry hash table
- */
-#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
-#define CONFIG_IP_VS_LBLC_TAB_BITS      10
-#endif
-#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
-#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
-#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
-
-
-/*
- *      IPVS lblc entry represents an association between destination
- *      IP address and its destination server
- */
-struct ip_vs_lblc_entry {
-       struct list_head        list;
-       __be32                  addr;           /* destination IP address */
-       struct ip_vs_dest       *dest;          /* real server (cache) */
-       unsigned long           lastuse;        /* last used time */
-};
-
-
-/*
- *      IPVS lblc hash table
- */
-struct ip_vs_lblc_table {
-       struct list_head        bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
-       atomic_t                entries;        /* number of entries */
-       int                     max_size;       /* maximum size of entries */
-       struct timer_list       periodic_timer; /* collect stale entries */
-       int                     rover;          /* rover for expire check */
-       int                     counter;        /* counter for no expire */
-};
-
-
-/*
- *      IPVS LBLC sysctl table
- */
-
-static ctl_table vs_vars_table[] = {
-       {
-               .procname       = "lblc_expiration",
-               .data           = &sysctl_ip_vs_lblc_expiration,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-       { .ctl_name = 0 }
-};
-
-static struct ctl_table_header * sysctl_header;
-
-static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
-{
-       list_del(&en->list);
-       /*
-        * We don't kfree dest because it is refered either by its service
-        * or the trash dest list.
-        */
-       atomic_dec(&en->dest->refcnt);
-       kfree(en);
-}
-
-
-/*
- *     Returns hash value for IPVS LBLC entry
- */
-static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
-{
-       return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
-}
-
-
-/*
- *     Hash an entry in the ip_vs_lblc_table.
- *     returns bool success.
- */
-static void
-ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
-{
-       unsigned hash = ip_vs_lblc_hashkey(en->addr);
-
-       list_add(&en->list, &tbl->bucket[hash]);
-       atomic_inc(&tbl->entries);
-}
-
-
-/*
- *  Get ip_vs_lblc_entry associated with supplied parameters. Called under read
- *  lock
- */
-static inline struct ip_vs_lblc_entry *
-ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
-{
-       unsigned hash = ip_vs_lblc_hashkey(addr);
-       struct ip_vs_lblc_entry *en;
-
-       list_for_each_entry(en, &tbl->bucket[hash], list)
-               if (en->addr == addr)
-                       return en;
-
-       return NULL;
-}
-
-
-/*
- * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
- * address to a server. Called under write lock.
- */
-static inline struct ip_vs_lblc_entry *
-ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr,
-              struct ip_vs_dest *dest)
-{
-       struct ip_vs_lblc_entry *en;
-
-       en = ip_vs_lblc_get(tbl, daddr);
-       if (!en) {
-               en = kmalloc(sizeof(*en), GFP_ATOMIC);
-               if (!en) {
-                       IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
-                       return NULL;
-               }
-
-               en->addr = daddr;
-               en->lastuse = jiffies;
-
-               atomic_inc(&dest->refcnt);
-               en->dest = dest;
-
-               ip_vs_lblc_hash(tbl, en);
-       } else if (en->dest != dest) {
-               atomic_dec(&en->dest->refcnt);
-               atomic_inc(&dest->refcnt);
-               en->dest = dest;
-       }
-
-       return en;
-}
-
-
-/*
- *      Flush all the entries of the specified table.
- */
-static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
-{
-       struct ip_vs_lblc_entry *en, *nxt;
-       int i;
-
-       for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
-               list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
-                       ip_vs_lblc_free(en);
-                       atomic_dec(&tbl->entries);
-               }
-       }
-}
-
-
-static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
-{
-       struct ip_vs_lblc_table *tbl = svc->sched_data;
-       struct ip_vs_lblc_entry *en, *nxt;
-       unsigned long now = jiffies;
-       int i, j;
-
-       for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
-               j = (j + 1) & IP_VS_LBLC_TAB_MASK;
-
-               write_lock(&svc->sched_lock);
-               list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
-                       if (time_before(now,
-                                       en->lastuse + sysctl_ip_vs_lblc_expiration))
-                               continue;
-
-                       ip_vs_lblc_free(en);
-                       atomic_dec(&tbl->entries);
-               }
-               write_unlock(&svc->sched_lock);
-       }
-       tbl->rover = j;
-}
-
-
-/*
- *      Periodical timer handler for IPVS lblc table
- *      It is used to collect stale entries when the number of entries
- *      exceeds the maximum size of the table.
- *
- *      Fixme: we probably need more complicated algorithm to collect
- *             entries that have not been used for a long time even
- *             if the number of entries doesn't exceed the maximum size
- *             of the table.
- *      The full expiration check is for this purpose now.
- */
-static void ip_vs_lblc_check_expire(unsigned long data)
-{
-       struct ip_vs_service *svc = (struct ip_vs_service *) data;
-       struct ip_vs_lblc_table *tbl = svc->sched_data;
-       unsigned long now = jiffies;
-       int goal;
-       int i, j;
-       struct ip_vs_lblc_entry *en, *nxt;
-
-       if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
-               /* do full expiration check */
-               ip_vs_lblc_full_check(svc);
-               tbl->counter = 1;
-               goto out;
-       }
-
-       if (atomic_read(&tbl->entries) <= tbl->max_size) {
-               tbl->counter++;
-               goto out;
-       }
-
-       goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
-       if (goal > tbl->max_size/2)
-               goal = tbl->max_size/2;
-
-       for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
-               j = (j + 1) & IP_VS_LBLC_TAB_MASK;
-
-               write_lock(&svc->sched_lock);
-               list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
-                       if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
-                               continue;
-
-                       ip_vs_lblc_free(en);
-                       atomic_dec(&tbl->entries);
-                       goal--;
-               }
-               write_unlock(&svc->sched_lock);
-               if (goal <= 0)
-                       break;
-       }
-       tbl->rover = j;
-
-  out:
-       mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
-}
-
-
-static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
-{
-       int i;
-       struct ip_vs_lblc_table *tbl;
-
-       /*
-        *    Allocate the ip_vs_lblc_table for this service
-        */
-       tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
-       if (tbl == NULL) {
-               IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
-               return -ENOMEM;
-       }
-       svc->sched_data = tbl;
-       IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
-                 "current service\n", sizeof(*tbl));
-
-       /*
-        *    Initialize the hash buckets
-        */
-       for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
-               INIT_LIST_HEAD(&tbl->bucket[i]);
-       }
-       tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
-       tbl->rover = 0;
-       tbl->counter = 1;
-
-       /*
-        *    Hook periodic timer for garbage collection
-        */
-       setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
-                       (unsigned long)svc);
-       mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
-
-       return 0;
-}
-
-
-static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
-{
-       struct ip_vs_lblc_table *tbl = svc->sched_data;
-
-       /* remove periodic timer */
-       del_timer_sync(&tbl->periodic_timer);
-
-       /* got to clean up table entries here */
-       ip_vs_lblc_flush(tbl);
-
-       /* release the table itself */
-       kfree(tbl);
-       IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
-                 sizeof(*tbl));
-
-       return 0;
-}
-
-
-static inline struct ip_vs_dest *
-__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
-{
-       struct ip_vs_dest *dest, *least;
-       int loh, doh;
-
-       /*
-        * We think the overhead of processing active connections is fifty
-        * times higher than that of inactive connections in average. (This
-        * fifty times might not be accurate, we will change it later.) We
-        * use the following formula to estimate the overhead:
-        *                dest->activeconns*50 + dest->inactconns
-        * and the load:
-        *                (dest overhead) / dest->weight
-        *
-        * Remember -- no floats in kernel mode!!!
-        * The comparison of h1*w2 > h2*w1 is equivalent to that of
-        *                h1/w1 > h2/w2
-        * if every weight is larger than zero.
-        *
-        * The server with weight=0 is quiesced and will not receive any
-        * new connection.
-        */
-       list_for_each_entry(dest, &svc->destinations, n_list) {
-               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-                       continue;
-               if (atomic_read(&dest->weight) > 0) {
-                       least = dest;
-                       loh = atomic_read(&least->activeconns) * 50
-                               + atomic_read(&least->inactconns);
-                       goto nextstage;
-               }
-       }
-       return NULL;
-
-       /*
-        *    Find the destination with the least load.
-        */
-  nextstage:
-       list_for_each_entry_continue(dest, &svc->destinations, n_list) {
-               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-                       continue;
-
-               doh = atomic_read(&dest->activeconns) * 50
-                       + atomic_read(&dest->inactconns);
-               if (loh * atomic_read(&dest->weight) >
-                   doh * atomic_read(&least->weight)) {
-                       least = dest;
-                       loh = doh;
-               }
-       }
-
-       IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
-                 "activeconns %d refcnt %d weight %d overhead %d\n",
-                 NIPQUAD(least->addr.ip), ntohs(least->port),
-                 atomic_read(&least->activeconns),
-                 atomic_read(&least->refcnt),
-                 atomic_read(&least->weight), loh);
-
-       return least;
-}
-
-
-/*
- *   If this destination server is overloaded and there is a less loaded
- *   server, then return true.
- */
-static inline int
-is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
-{
-       if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
-               struct ip_vs_dest *d;
-
-               list_for_each_entry(d, &svc->destinations, n_list) {
-                       if (atomic_read(&d->activeconns)*2
-                           < atomic_read(&d->weight)) {
-                               return 1;
-                       }
-               }
-       }
-       return 0;
-}
-
-
-/*
- *    Locality-Based (weighted) Least-Connection scheduling
- */
-static struct ip_vs_dest *
-ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-       struct ip_vs_lblc_table *tbl = svc->sched_data;
-       struct iphdr *iph = ip_hdr(skb);
-       struct ip_vs_dest *dest = NULL;
-       struct ip_vs_lblc_entry *en;
-
-       IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
-
-       /* First look in our cache */
-       read_lock(&svc->sched_lock);
-       en = ip_vs_lblc_get(tbl, iph->daddr);
-       if (en) {
-               /* We only hold a read lock, but this is atomic */
-               en->lastuse = jiffies;
-
-               /*
-                * If the destination is not available, i.e. it's in the trash,
-                * we must ignore it, as it may be removed from under our feet,
-                * if someone drops our reference count. Our caller only makes
-                * sure that destinations, that are not in the trash, are not
-                * moved to the trash, while we are scheduling. But anyone can
-                * free up entries from the trash at any time.
-                */
-
-               if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
-                       dest = en->dest;
-       }
-       read_unlock(&svc->sched_lock);
-
-       /* If the destination has a weight and is not overloaded, use it */
-       if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
-               goto out;
-
-       /* No cache entry or it is invalid, time to schedule */
-       dest = __ip_vs_lblc_schedule(svc, iph);
-       if (!dest) {
-               IP_VS_DBG(1, "no destination available\n");
-               return NULL;
-       }
-
-       /* If we fail to create a cache entry, we'll just use the valid dest */
-       write_lock(&svc->sched_lock);
-       ip_vs_lblc_new(tbl, iph->daddr, dest);
-       write_unlock(&svc->sched_lock);
-
-out:
-       IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
-                 "--> server %u.%u.%u.%u:%d\n",
-                 NIPQUAD(iph->daddr),
-                 NIPQUAD(dest->addr.ip),
-                 ntohs(dest->port));
-
-       return dest;
-}
-
-
-/*
- *      IPVS LBLC Scheduler structure
- */
-static struct ip_vs_scheduler ip_vs_lblc_scheduler =
-{
-       .name =                 "lblc",
-       .refcnt =               ATOMIC_INIT(0),
-       .module =               THIS_MODULE,
-       .n_list =               LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-       .supports_ipv6 =        0,
-#endif
-       .init_service =         ip_vs_lblc_init_svc,
-       .done_service =         ip_vs_lblc_done_svc,
-       .schedule =             ip_vs_lblc_schedule,
-};
-
-
-static int __init ip_vs_lblc_init(void)
-{
-       int ret;
-
-       sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
-       ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
-       if (ret)
-               unregister_sysctl_table(sysctl_header);
-       return ret;
-}
-
-
-static void __exit ip_vs_lblc_cleanup(void)
-{
-       unregister_sysctl_table(sysctl_header);
-       unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
-}
-
-
-module_init(ip_vs_lblc_init);
-module_exit(ip_vs_lblc_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c

deleted file mode 100644 (file)

index 1f75ea8..0000000
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ /dev/null
@@ -1,755 +0,0 @@
-/*
- * IPVS:        Locality-Based Least-Connection with Replication scheduler
- *
- * Authors:     Wensong Zhang <wensong@gnuchina.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *     Julian Anastasov        :    Added the missing (dest->weight>0)
- *                                  condition in the ip_vs_dest_set_max.
- *
- */
-
-/*
- * The lblc/r algorithm is as follows (pseudo code):
- *
- *       if serverSet[dest_ip] is null then
- *               n, serverSet[dest_ip] <- {weighted least-conn node};
- *       else
- *               n <- {least-conn (alive) node in serverSet[dest_ip]};
- *               if (n is null) OR
- *                  (n.conns>n.weight AND
- *                   there is a node m with m.conns<m.weight/2) then
- *                   n <- {weighted least-conn node};
- *                   add n to serverSet[dest_ip];
- *               if |serverSet[dest_ip]| > 1 AND
- *                   now - serverSet[dest_ip].lastMod > T then
- *                   m <- {most conn node in serverSet[dest_ip]};
- *                   remove m from serverSet[dest_ip];
- *       if serverSet[dest_ip] changed then
- *               serverSet[dest_ip].lastMod <- now;
- *
- *       return n;
- *
- */
-
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/jiffies.h>
-
-/* for sysctl */
-#include <linux/fs.h>
-#include <linux/sysctl.h>
-#include <net/net_namespace.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- *    It is for garbage collection of stale IPVS lblcr entries,
- *    when the table is full.
- */
-#define CHECK_EXPIRE_INTERVAL   (60*HZ)
-#define ENTRY_TIMEOUT           (6*60*HZ)
-
-/*
- *    It is for full expiration check.
- *    When there is no partial expiration check (garbage collection)
- *    in a half hour, do a full expiration check to collect stale
- *    entries that haven't been touched for a day.
- */
-#define COUNT_FOR_FULL_EXPIRATION   30
-static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
-
-
-/*
- *     for IPVS lblcr entry hash table
- */
-#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
-#define CONFIG_IP_VS_LBLCR_TAB_BITS      10
-#endif
-#define IP_VS_LBLCR_TAB_BITS     CONFIG_IP_VS_LBLCR_TAB_BITS
-#define IP_VS_LBLCR_TAB_SIZE     (1 << IP_VS_LBLCR_TAB_BITS)
-#define IP_VS_LBLCR_TAB_MASK     (IP_VS_LBLCR_TAB_SIZE - 1)
-
-
-/*
- *      IPVS destination set structure and operations
- */
-struct ip_vs_dest_list {
-       struct ip_vs_dest_list  *next;          /* list link */
-       struct ip_vs_dest       *dest;          /* destination server */
-};
-
-struct ip_vs_dest_set {
-       atomic_t                size;           /* set size */
-       unsigned long           lastmod;        /* last modified time */
-       struct ip_vs_dest_list  *list;          /* destination list */
-       rwlock_t                lock;           /* lock for this list */
-};
-
-
-static struct ip_vs_dest_list *
-ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
-{
-       struct ip_vs_dest_list *e;
-
-       for (e=set->list; e!=NULL; e=e->next) {
-               if (e->dest == dest)
-                       /* already existed */
-                       return NULL;
-       }
-
-       e = kmalloc(sizeof(*e), GFP_ATOMIC);
-       if (e == NULL) {
-               IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
-               return NULL;
-       }
-
-       atomic_inc(&dest->refcnt);
-       e->dest = dest;
-
-       /* link it to the list */
-       e->next = set->list;
-       set->list = e;
-       atomic_inc(&set->size);
-
-       set->lastmod = jiffies;
-       return e;
-}
-
-static void
-ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
-{
-       struct ip_vs_dest_list *e, **ep;
-
-       for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
-               if (e->dest == dest) {
-                       /* HIT */
-                       *ep = e->next;
-                       atomic_dec(&set->size);
-                       set->lastmod = jiffies;
-                       atomic_dec(&e->dest->refcnt);
-                       kfree(e);
-                       break;
-               }
-               ep = &e->next;
-       }
-}
-
-static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
-{
-       struct ip_vs_dest_list *e, **ep;
-
-       write_lock(&set->lock);
-       for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
-               *ep = e->next;
-               /*
-                * We don't kfree dest because it is refered either
-                * by its service or by the trash dest list.
-                */
-               atomic_dec(&e->dest->refcnt);
-               kfree(e);
-       }
-       write_unlock(&set->lock);
-}
-
-/* get weighted least-connection node in the destination set */
-static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
-{
-       register struct ip_vs_dest_list *e;
-       struct ip_vs_dest *dest, *least;
-       int loh, doh;
-
-       if (set == NULL)
-               return NULL;
-
-       /* select the first destination server, whose weight > 0 */
-       for (e=set->list; e!=NULL; e=e->next) {
-               least = e->dest;
-               if (least->flags & IP_VS_DEST_F_OVERLOAD)
-                       continue;
-
-               if ((atomic_read(&least->weight) > 0)
-                   && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
-                       loh = atomic_read(&least->activeconns) * 50
-                               + atomic_read(&least->inactconns);
-                       goto nextstage;
-               }
-       }
-       return NULL;
-
-       /* find the destination with the weighted least load */
-  nextstage:
-       for (e=e->next; e!=NULL; e=e->next) {
-               dest = e->dest;
-               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-                       continue;
-
-               doh = atomic_read(&dest->activeconns) * 50
-                       + atomic_read(&dest->inactconns);
-               if ((loh * atomic_read(&dest->weight) >
-                    doh * atomic_read(&least->weight))
-                   && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
-                       least = dest;
-                       loh = doh;
-               }
-       }
-
-       IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
-                 "activeconns %d refcnt %d weight %d overhead %d\n",
-                 NIPQUAD(least->addr.ip), ntohs(least->port),
-                 atomic_read(&least->activeconns),
-                 atomic_read(&least->refcnt),
-                 atomic_read(&least->weight), loh);
-       return least;
-}
-
-
-/* get weighted most-connection node in the destination set */
-static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
-{
-       register struct ip_vs_dest_list *e;
-       struct ip_vs_dest *dest, *most;
-       int moh, doh;
-
-       if (set == NULL)
-               return NULL;
-
-       /* select the first destination server, whose weight > 0 */
-       for (e=set->list; e!=NULL; e=e->next) {
-               most = e->dest;
-               if (atomic_read(&most->weight) > 0) {
-                       moh = atomic_read(&most->activeconns) * 50
-                               + atomic_read(&most->inactconns);
-                       goto nextstage;
-               }
-       }
-       return NULL;
-
-       /* find the destination with the weighted most load */
-  nextstage:
-       for (e=e->next; e!=NULL; e=e->next) {
-               dest = e->dest;
-               doh = atomic_read(&dest->activeconns) * 50
-                       + atomic_read(&dest->inactconns);
-               /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
-               if ((moh * atomic_read(&dest->weight) <
-                    doh * atomic_read(&most->weight))
-                   && (atomic_read(&dest->weight) > 0)) {
-                       most = dest;
-                       moh = doh;
-               }
-       }
-
-       IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
-                 "activeconns %d refcnt %d weight %d overhead %d\n",
-                 NIPQUAD(most->addr.ip), ntohs(most->port),
-                 atomic_read(&most->activeconns),
-                 atomic_read(&most->refcnt),
-                 atomic_read(&most->weight), moh);
-       return most;
-}
-
-
-/*
- *      IPVS lblcr entry represents an association between destination
- *      IP address and its destination server set
- */
-struct ip_vs_lblcr_entry {
-       struct list_head        list;
-       __be32                   addr;           /* destination IP address */
-       struct ip_vs_dest_set   set;            /* destination server set */
-       unsigned long           lastuse;        /* last used time */
-};
-
-
-/*
- *      IPVS lblcr hash table
- */
-struct ip_vs_lblcr_table {
-       struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
-       atomic_t                entries;        /* number of entries */
-       int                     max_size;       /* maximum size of entries */
-       struct timer_list       periodic_timer; /* collect stale entries */
-       int                     rover;          /* rover for expire check */
-       int                     counter;        /* counter for no expire */
-};
-
-
-/*
- *      IPVS LBLCR sysctl table
- */
-
-static ctl_table vs_vars_table[] = {
-       {
-               .procname       = "lblcr_expiration",
-               .data           = &sysctl_ip_vs_lblcr_expiration,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-       },
-       { .ctl_name = 0 }
-};
-
-static struct ctl_table_header * sysctl_header;
-
-static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
-{
-       list_del(&en->list);
-       ip_vs_dest_set_eraseall(&en->set);
-       kfree(en);
-}
-
-
-/*
- *     Returns hash value for IPVS LBLCR entry
- */
-static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
-{
-       return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
-}
-
-
-/*
- *     Hash an entry in the ip_vs_lblcr_table.
- *     returns bool success.
- */
-static void
-ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
-{
-       unsigned hash = ip_vs_lblcr_hashkey(en->addr);
-
-       list_add(&en->list, &tbl->bucket[hash]);
-       atomic_inc(&tbl->entries);
-}
-
-
-/*
- *  Get ip_vs_lblcr_entry associated with supplied parameters. Called under
- *  read lock.
- */
-static inline struct ip_vs_lblcr_entry *
-ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
-{
-       unsigned hash = ip_vs_lblcr_hashkey(addr);
-       struct ip_vs_lblcr_entry *en;
-
-       list_for_each_entry(en, &tbl->bucket[hash], list)
-               if (en->addr == addr)
-                       return en;
-
-       return NULL;
-}
-
-
-/*
- * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
- * IP address to a server. Called under write lock.
- */
-static inline struct ip_vs_lblcr_entry *
-ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl,  __be32 daddr,
-               struct ip_vs_dest *dest)
-{
-       struct ip_vs_lblcr_entry *en;
-
-       en = ip_vs_lblcr_get(tbl, daddr);
-       if (!en) {
-               en = kmalloc(sizeof(*en), GFP_ATOMIC);
-               if (!en) {
-                       IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
-                       return NULL;
-               }
-
-               en->addr = daddr;
-               en->lastuse = jiffies;
-
-               /* initilize its dest set */
-               atomic_set(&(en->set.size), 0);
-               en->set.list = NULL;
-               rwlock_init(&en->set.lock);
-
-               ip_vs_lblcr_hash(tbl, en);
-       }
-
-       write_lock(&en->set.lock);
-       ip_vs_dest_set_insert(&en->set, dest);
-       write_unlock(&en->set.lock);
-
-       return en;
-}
-
-
-/*
- *      Flush all the entries of the specified table.
- */
-static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
-{
-       int i;
-       struct ip_vs_lblcr_entry *en, *nxt;
-
-       /* No locking required, only called during cleanup. */
-       for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
-               list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
-                       ip_vs_lblcr_free(en);
-               }
-       }
-}
-
-
-static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
-{
-       struct ip_vs_lblcr_table *tbl = svc->sched_data;
-       unsigned long now = jiffies;
-       int i, j;
-       struct ip_vs_lblcr_entry *en, *nxt;
-
-       for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
-               j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
-
-               write_lock(&svc->sched_lock);
-               list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
-                       if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
-                                      now))
-                               continue;
-
-                       ip_vs_lblcr_free(en);
-                       atomic_dec(&tbl->entries);
-               }
-               write_unlock(&svc->sched_lock);
-       }
-       tbl->rover = j;
-}
-
-
-/*
- *      Periodical timer handler for IPVS lblcr table
- *      It is used to collect stale entries when the number of entries
- *      exceeds the maximum size of the table.
- *
- *      Fixme: we probably need more complicated algorithm to collect
- *             entries that have not been used for a long time even
- *             if the number of entries doesn't exceed the maximum size
- *             of the table.
- *      The full expiration check is for this purpose now.
- */
-static void ip_vs_lblcr_check_expire(unsigned long data)
-{
-       struct ip_vs_service *svc = (struct ip_vs_service *) data;
-       struct ip_vs_lblcr_table *tbl = svc->sched_data;
-       unsigned long now = jiffies;
-       int goal;
-       int i, j;
-       struct ip_vs_lblcr_entry *en, *nxt;
-
-       if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
-               /* do full expiration check */
-               ip_vs_lblcr_full_check(svc);
-               tbl->counter = 1;
-               goto out;
-       }
-
-       if (atomic_read(&tbl->entries) <= tbl->max_size) {
-               tbl->counter++;
-               goto out;
-       }
-
-       goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
-       if (goal > tbl->max_size/2)
-               goal = tbl->max_size/2;
-
-       for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
-               j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
-
-               write_lock(&svc->sched_lock);
-               list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
-                       if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
-                               continue;
-
-                       ip_vs_lblcr_free(en);
-                       atomic_dec(&tbl->entries);
-                       goal--;
-               }
-               write_unlock(&svc->sched_lock);
-               if (goal <= 0)
-                       break;
-       }
-       tbl->rover = j;
-
-  out:
-       mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
-}
-
-static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
-{
-       int i;
-       struct ip_vs_lblcr_table *tbl;
-
-       /*
-        *    Allocate the ip_vs_lblcr_table for this service
-        */
-       tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
-       if (tbl == NULL) {
-               IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
-               return -ENOMEM;
-       }
-       svc->sched_data = tbl;
-       IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
-                 "current service\n", sizeof(*tbl));
-
-       /*
-        *    Initialize the hash buckets
-        */
-       for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
-               INIT_LIST_HEAD(&tbl->bucket[i]);
-       }
-       tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
-       tbl->rover = 0;
-       tbl->counter = 1;
-
-       /*
-        *    Hook periodic timer for garbage collection
-        */
-       setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
-                       (unsigned long)svc);
-       mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
-
-       return 0;
-}
-
-
-static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
-{
-       struct ip_vs_lblcr_table *tbl = svc->sched_data;
-
-       /* remove periodic timer */
-       del_timer_sync(&tbl->periodic_timer);
-
-       /* got to clean up table entries here */
-       ip_vs_lblcr_flush(tbl);
-
-       /* release the table itself */
-       kfree(tbl);
-       IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
-                 sizeof(*tbl));
-
-       return 0;
-}
-
-
-static inline struct ip_vs_dest *
-__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
-{
-       struct ip_vs_dest *dest, *least;
-       int loh, doh;
-
-       /*
-        * We think the overhead of processing active connections is fifty
-        * times higher than that of inactive connections in average. (This
-        * fifty times might not be accurate, we will change it later.) We
-        * use the following formula to estimate the overhead:
-        *                dest->activeconns*50 + dest->inactconns
-        * and the load:
-        *                (dest overhead) / dest->weight
-        *
-        * Remember -- no floats in kernel mode!!!
-        * The comparison of h1*w2 > h2*w1 is equivalent to that of
-        *                h1/w1 > h2/w2
-        * if every weight is larger than zero.
-        *
-        * The server with weight=0 is quiesced and will not receive any
-        * new connection.
-        */
-       list_for_each_entry(dest, &svc->destinations, n_list) {
-               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-                       continue;
-
-               if (atomic_read(&dest->weight) > 0) {
-                       least = dest;
-                       loh = atomic_read(&least->activeconns) * 50
-                               + atomic_read(&least->inactconns);
-                       goto nextstage;
-               }
-       }
-       return NULL;
-
-       /*
-        *    Find the destination with the least load.
-        */
-  nextstage:
-       list_for_each_entry_continue(dest, &svc->destinations, n_list) {
-               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-                       continue;
-
-               doh = atomic_read(&dest->activeconns) * 50
-                       + atomic_read(&dest->inactconns);
-               if (loh * atomic_read(&dest->weight) >
-                   doh * atomic_read(&least->weight)) {
-                       least = dest;
-                       loh = doh;
-               }
-       }
-
-       IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
-                 "activeconns %d refcnt %d weight %d overhead %d\n",
-                 NIPQUAD(least->addr.ip), ntohs(least->port),
-                 atomic_read(&least->activeconns),
-                 atomic_read(&least->refcnt),
-                 atomic_read(&least->weight), loh);
-
-       return least;
-}
-
-
-/*
- *   If this destination server is overloaded and there is a less loaded
- *   server, then return true.
- */
-static inline int
-is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
-{
-       if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
-               struct ip_vs_dest *d;
-
-               list_for_each_entry(d, &svc->destinations, n_list) {
-                       if (atomic_read(&d->activeconns)*2
-                           < atomic_read(&d->weight)) {
-                               return 1;
-                       }
-               }
-       }
-       return 0;
-}
-
-
-/*
- *    Locality-Based (weighted) Least-Connection scheduling
- */
-static struct ip_vs_dest *
-ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-       struct ip_vs_lblcr_table *tbl = svc->sched_data;
-       struct iphdr *iph = ip_hdr(skb);
-       struct ip_vs_dest *dest = NULL;
-       struct ip_vs_lblcr_entry *en;
-
-       IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
-
-       /* First look in our cache */
-       read_lock(&svc->sched_lock);
-       en = ip_vs_lblcr_get(tbl, iph->daddr);
-       if (en) {
-               /* We only hold a read lock, but this is atomic */
-               en->lastuse = jiffies;
-
-               /* Get the least loaded destination */
-               read_lock(&en->set.lock);
-               dest = ip_vs_dest_set_min(&en->set);
-               read_unlock(&en->set.lock);
-
-               /* More than one destination + enough time passed by, cleanup */
-               if (atomic_read(&en->set.size) > 1 &&
-                               time_after(jiffies, en->set.lastmod +
-                               sysctl_ip_vs_lblcr_expiration)) {
-                       struct ip_vs_dest *m;
-
-                       write_lock(&en->set.lock);
-                       m = ip_vs_dest_set_max(&en->set);
-                       if (m)
-                               ip_vs_dest_set_erase(&en->set, m);
-                       write_unlock(&en->set.lock);
-               }
-
-               /* If the destination is not overloaded, use it */
-               if (dest && !is_overloaded(dest, svc)) {
-                       read_unlock(&svc->sched_lock);
-                       goto out;
-               }
-
-               /* The cache entry is invalid, time to schedule */
-               dest = __ip_vs_lblcr_schedule(svc, iph);
-               if (!dest) {
-                       IP_VS_DBG(1, "no destination available\n");
-                       read_unlock(&svc->sched_lock);
-                       return NULL;
-               }
-
-               /* Update our cache entry */
-               write_lock(&en->set.lock);
-               ip_vs_dest_set_insert(&en->set, dest);
-               write_unlock(&en->set.lock);
-       }
-       read_unlock(&svc->sched_lock);
-
-       if (dest)
-               goto out;
-
-       /* No cache entry, time to schedule */
-       dest = __ip_vs_lblcr_schedule(svc, iph);
-       if (!dest) {
-               IP_VS_DBG(1, "no destination available\n");
-               return NULL;
-       }
-
-       /* If we fail to create a cache entry, we'll just use the valid dest */
-       write_lock(&svc->sched_lock);
-       ip_vs_lblcr_new(tbl, iph->daddr, dest);
-       write_unlock(&svc->sched_lock);
-
-out:
-       IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
-                 "--> server %u.%u.%u.%u:%d\n",
-                 NIPQUAD(iph->daddr),
-                 NIPQUAD(dest->addr.ip),
-                 ntohs(dest->port));
-
-       return dest;
-}
-
-
-/*
- *      IPVS LBLCR Scheduler structure
- */
-static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
-{
-       .name =                 "lblcr",
-       .refcnt =               ATOMIC_INIT(0),
-       .module =               THIS_MODULE,
-       .n_list =               LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-       .supports_ipv6 =        0,
-#endif
-       .init_service =         ip_vs_lblcr_init_svc,
-       .done_service =         ip_vs_lblcr_done_svc,
-       .schedule =             ip_vs_lblcr_schedule,
-};
-
-
-static int __init ip_vs_lblcr_init(void)
-{
-       int ret;
-
-       sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
-       ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
-       if (ret)
-               unregister_sysctl_table(sysctl_header);
-       return ret;
-}
-
-
-static void __exit ip_vs_lblcr_cleanup(void)
-{
-       unregister_sysctl_table(sysctl_header);
-       unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
-}
-
-
-module_init(ip_vs_lblcr_init);
-module_exit(ip_vs_lblcr_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c

deleted file mode 100644 (file)

index b69f808..0000000
--- a/net/ipv4/ipvs/ip_vs_lc.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * IPVS:        Least-Connection Scheduling module
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *     Wensong Zhang            :     added the ip_vs_lc_update_svc
- *     Wensong Zhang            :     added any dest with weight=0 is quiesced
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <net/ip_vs.h>
-
-
-static inline unsigned int
-ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
-{
-       /*
-        * We think the overhead of processing active connections is 256
-        * times higher than that of inactive connections in average. (This
-        * 256 times might not be accurate, we will change it later) We
-        * use the following formula to estimate the overhead now:
-        *                dest->activeconns*256 + dest->inactconns
-        */
-       return (atomic_read(&dest->activeconns) << 8) +
-               atomic_read(&dest->inactconns);
-}
-
-
-/*
- *     Least Connection scheduling
- */
-static struct ip_vs_dest *
-ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-       struct ip_vs_dest *dest, *least = NULL;
-       unsigned int loh = 0, doh;
-
-       IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
-
-       /*
-        * Simply select the server with the least number of
-        *        (activeconns<<5) + inactconns
-        * Except whose weight is equal to zero.
-        * If the weight is equal to zero, it means that the server is
-        * quiesced, the existing connections to the server still get
-        * served, but no new connection is assigned to the server.
-        */
-
-       list_for_each_entry(dest, &svc->destinations, n_list) {
-               if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
-                   atomic_read(&dest->weight) == 0)
-                       continue;
-               doh = ip_vs_lc_dest_overhead(dest);
-               if (!least || doh < loh) {
-                       least = dest;
-                       loh = doh;
-               }
-       }
-
-       if (least)
-       IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n",
-                     IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
-                     atomic_read(&least->activeconns),
-                     atomic_read(&least->inactconns));
-
-       return least;
-}
-
-
-static struct ip_vs_scheduler ip_vs_lc_scheduler = {
-       .name =                 "lc",
-       .refcnt =               ATOMIC_INIT(0),
-       .module =               THIS_MODULE,
-       .n_list =               LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-       .supports_ipv6 =        1,
-#endif
-       .schedule =             ip_vs_lc_schedule,
-};
-
-
-static int __init ip_vs_lc_init(void)
-{
-       return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
-}
-
-static void __exit ip_vs_lc_cleanup(void)
-{
-       unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
-}
-
-module_init(ip_vs_lc_init);
-module_exit(ip_vs_lc_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c

deleted file mode 100644 (file)

index 9a2d803..0000000
--- a/net/ipv4/ipvs/ip_vs_nq.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * IPVS:        Never Queue scheduling module
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-/*
- * The NQ algorithm adopts a two-speed model. When there is an idle server
- * available, the job will be sent to the idle server, instead of waiting
- * for a fast one. When there is no idle server available, the job will be
- * sent to the server that minimize its expected delay (The Shortest
- * Expected Delay scheduling algorithm).
- *
- * See the following paper for more information:
- * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
- * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
- * pages 986-994, 1988.
- *
- * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
- *
- * The difference between NQ and SED is that NQ can improve overall
- * system utilization.
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <net/ip_vs.h>
-
-
-static inline unsigned int
-ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
-{
-       /*
-        * We only use the active connection number in the cost
-        * calculation here.
-        */
-       return atomic_read(&dest->activeconns) + 1;
-}
-
-
-/*
- *     Weighted Least Connection scheduling
- */
-static struct ip_vs_dest *
-ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-       struct ip_vs_dest *dest, *least = NULL;
-       unsigned int loh = 0, doh;
-
-       IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
-
-       /*
-        * We calculate the load of each dest server as follows:
-        *      (server expected overhead) / dest->weight
-        *
-        * Remember -- no floats in kernel mode!!!
-        * The comparison of h1*w2 > h2*w1 is equivalent to that of
-        *                h1/w1 > h2/w2
-        * if every weight is larger than zero.
-        *
-        * The server with weight=0 is quiesced and will not receive any
-        * new connections.
-        */
-
-       list_for_each_entry(dest, &svc->destinations, n_list) {
-
-               if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
-                   !atomic_read(&dest->weight))
-                       continue;
-
-               doh = ip_vs_nq_dest_overhead(dest);
-
-               /* return the server directly if it is idle */
-               if (atomic_read(&dest->activeconns) == 0) {
-                       least = dest;
-                       loh = doh;
-                       goto out;
-               }
-
-               if (!least ||
-                   (loh * atomic_read(&dest->weight) >
-                    doh * atomic_read(&least->weight))) {
-                       least = dest;
-                       loh = doh;
-               }
-       }
-
-       if (!least)
-               return NULL;
-
-  out:
-       IP_VS_DBG_BUF(6, "NQ: server %s:%u "
-                     "activeconns %d refcnt %d weight %d overhead %d\n",
-                     IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
-                     atomic_read(&least->activeconns),
-                     atomic_read(&least->refcnt),
-                     atomic_read(&least->weight), loh);
-
-       return least;
-}
-
-
-static struct ip_vs_scheduler ip_vs_nq_scheduler =
-{
-       .name =                 "nq",
-       .refcnt =               ATOMIC_INIT(0),
-       .module =               THIS_MODULE,
-       .n_list =               LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-       .supports_ipv6 =        1,
-#endif
-       .schedule =             ip_vs_nq_schedule,
-};
-
-
-static int __init ip_vs_nq_init(void)
-{
-       return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
-}
-
-static void __exit ip_vs_nq_cleanup(void)
-{
-       unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
-}
-
-module_init(ip_vs_nq_init);
-module_exit(ip_vs_nq_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c

deleted file mode 100644 (file)

index 0791f9e..0000000
--- a/net/ipv4/ipvs/ip_vs_proto.c
+++ /dev/null
@@ -1,288 +0,0 @@
-/*
- * ip_vs_proto.c: transport protocol load balancing support for IPVS
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <net/udp.h>
-#include <asm/system.h>
-#include <linux/stat.h>
-#include <linux/proc_fs.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- * IPVS protocols can only be registered/unregistered when the ipvs
- * module is loaded/unloaded, so no lock is needed in accessing the
- * ipvs protocol table.
- */
-
-#define IP_VS_PROTO_TAB_SIZE           32      /* must be power of 2 */
-#define IP_VS_PROTO_HASH(proto)                ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
-
-static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
-
-
-/*
- *     register an ipvs protocol
- */
-static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
-{
-       unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
-
-       pp->next = ip_vs_proto_table[hash];
-       ip_vs_proto_table[hash] = pp;
-
-       if (pp->init != NULL)
-               pp->init(pp);
-
-       return 0;
-}
-
-
-/*
- *     unregister an ipvs protocol
- */
-static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
-{
-       struct ip_vs_protocol **pp_p;
-       unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
-
-       pp_p = &ip_vs_proto_table[hash];
-       for (; *pp_p; pp_p = &(*pp_p)->next) {
-               if (*pp_p == pp) {
-                       *pp_p = pp->next;
-                       if (pp->exit != NULL)
-                               pp->exit(pp);
-                       return 0;
-               }
-       }
-
-       return -ESRCH;
-}
-
-
-/*
- *     get ip_vs_protocol object by its proto.
- */
-struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
-{
-       struct ip_vs_protocol *pp;
-       unsigned hash = IP_VS_PROTO_HASH(proto);
-
-       for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
-               if (pp->protocol == proto)
-                       return pp;
-       }
-
-       return NULL;
-}
-
-
-/*
- *     Propagate event for state change to all protocols
- */
-void ip_vs_protocol_timeout_change(int flags)
-{
-       struct ip_vs_protocol *pp;
-       int i;
-
-       for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
-               for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
-                       if (pp->timeout_change)
-                               pp->timeout_change(pp, flags);
-               }
-       }
-}
-
-
-int *
-ip_vs_create_timeout_table(int *table, int size)
-{
-       return kmemdup(table, size, GFP_ATOMIC);
-}
-
-
-/*
- *     Set timeout value for state specified by name
- */
-int
-ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
-{
-       int i;
-
-       if (!table || !name || !to)
-               return -EINVAL;
-
-       for (i = 0; i < num; i++) {
-               if (strcmp(names[i], name))
-                       continue;
-               table[i] = to * HZ;
-               return 0;
-       }
-       return -ENOENT;
-}
-
-
-const char * ip_vs_state_name(__u16 proto, int state)
-{
-       struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
-
-       if (pp == NULL || pp->state_name == NULL)
-               return (IPPROTO_IP == proto) ? "NONE" : "ERR!";
-       return pp->state_name(state);
-}
-
-
-static void
-ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
-                            const struct sk_buff *skb,
-                            int offset,
-                            const char *msg)
-{
-       char buf[128];
-       struct iphdr _iph, *ih;
-
-       ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-       if (ih == NULL)
-               sprintf(buf, "%s TRUNCATED", pp->name);
-       else if (ih->frag_off & htons(IP_OFFSET))
-               sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
-                       pp->name, NIPQUAD(ih->saddr),
-                       NIPQUAD(ih->daddr));
-       else {
-               __be16 _ports[2], *pptr
-;
-               pptr = skb_header_pointer(skb, offset + ih->ihl*4,
-                                         sizeof(_ports), _ports);
-               if (pptr == NULL)
-                       sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
-                               pp->name,
-                               NIPQUAD(ih->saddr),
-                               NIPQUAD(ih->daddr));
-               else
-                       sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
-                               pp->name,
-                               NIPQUAD(ih->saddr),
-                               ntohs(pptr[0]),
-                               NIPQUAD(ih->daddr),
-                               ntohs(pptr[1]));
-       }
-
-       printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static void
-ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
-                            const struct sk_buff *skb,
-                            int offset,
-                            const char *msg)
-{
-       char buf[192];
-       struct ipv6hdr _iph, *ih;
-
-       ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-       if (ih == NULL)
-               sprintf(buf, "%s TRUNCATED", pp->name);
-       else if (ih->nexthdr == IPPROTO_FRAGMENT)
-               sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag",
-                       pp->name, NIP6(ih->saddr),
-                       NIP6(ih->daddr));
-       else {
-               __be16 _ports[2], *pptr;
-
-               pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
-                                         sizeof(_ports), _ports);
-               if (pptr == NULL)
-                       sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT,
-                               pp->name,
-                               NIP6(ih->saddr),
-                               NIP6(ih->daddr));
-               else
-                       sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u",
-                               pp->name,
-                               NIP6(ih->saddr),
-                               ntohs(pptr[0]),
-                               NIP6(ih->daddr),
-                               ntohs(pptr[1]));
-       }
-
-       printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-#endif
-
-
-void
-ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
-                         const struct sk_buff *skb,
-                         int offset,
-                         const char *msg)
-{
-#ifdef CONFIG_IP_VS_IPV6
-       if (skb->protocol == htons(ETH_P_IPV6))
-               ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
-       else
-#endif
-               ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
-}
-
-
-int __init ip_vs_protocol_init(void)
-{
-       char protocols[64];
-#define REGISTER_PROTOCOL(p)                   \
-       do {                                    \
-               register_ip_vs_protocol(p);     \
-               strcat(protocols, ", ");        \
-               strcat(protocols, (p)->name);   \
-       } while (0)
-
-       protocols[0] = '\0';
-       protocols[2] = '\0';
-#ifdef CONFIG_IP_VS_PROTO_TCP
-       REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
-#endif
-#ifdef CONFIG_IP_VS_PROTO_UDP
-       REGISTER_PROTOCOL(&ip_vs_protocol_udp);
-#endif
-#ifdef CONFIG_IP_VS_PROTO_AH
-       REGISTER_PROTOCOL(&ip_vs_protocol_ah);
-#endif
-#ifdef CONFIG_IP_VS_PROTO_ESP
-       REGISTER_PROTOCOL(&ip_vs_protocol_esp);
-#endif
-       IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
-
-       return 0;
-}
-
-
-void ip_vs_protocol_cleanup(void)
-{
-       struct ip_vs_protocol *pp;
-       int i;
-
-       /* unregister all the ipvs protocols */
-       for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
-               while ((pp = ip_vs_proto_table[i]) != NULL)
-                       unregister_ip_vs_protocol(pp);
-       }
-}
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c

deleted file mode 100644 (file)

index 80ab0c8..0000000
--- a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * ip_vs_proto_ah_esp.c:       AH/ESP IPSec load balancing support for IPVS
- *
- * Authors:    Julian Anastasov <ja@ssi.bg>, February 2002
- *             Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *             This program is free software; you can redistribute it and/or
- *             modify it under the terms of the GNU General Public License
- *             version 2 as published by the Free Software Foundation;
- *
- */
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-/* TODO:
-
-struct isakmp_hdr {
-       __u8            icookie[8];
-       __u8            rcookie[8];
-       __u8            np;
-       __u8            version;
-       __u8            xchgtype;
-       __u8            flags;
-       __u32           msgid;
-       __u32           length;
-};
-
-*/
-
-#define PORT_ISAKMP    500
-
-
-static struct ip_vs_conn *
-ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-                  const struct ip_vs_iphdr *iph, unsigned int proto_off,
-                  int inverse)
-{
-       struct ip_vs_conn *cp;
-
-       if (likely(!inverse)) {
-               cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-                                      &iph->saddr,
-                                      htons(PORT_ISAKMP),
-                                      &iph->daddr,
-                                      htons(PORT_ISAKMP));
-       } else {
-               cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-                                      &iph->daddr,
-                                      htons(PORT_ISAKMP),
-                                      &iph->saddr,
-                                      htons(PORT_ISAKMP));
-       }
-
-       if (!cp) {
-               /*
-                * We are not sure if the packet is from our
-                * service, so our conn_schedule hook should return NF_ACCEPT
-                */
-               IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
-                             "%s%s %s->%s\n",
-                             inverse ? "ICMP+" : "",
-                             pp->name,
-                             IP_VS_DBG_ADDR(af, &iph->saddr),
-                             IP_VS_DBG_ADDR(af, &iph->daddr));
-       }
-
-       return cp;
-}
-
-
-static struct ip_vs_conn *
-ah_esp_conn_out_get(int af, const struct sk_buff *skb,
-                   struct ip_vs_protocol *pp,
-                   const struct ip_vs_iphdr *iph,
-                   unsigned int proto_off,
-                   int inverse)
-{
-       struct ip_vs_conn *cp;
-
-       if (likely(!inverse)) {
-               cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-                                       &iph->saddr,
-                                       htons(PORT_ISAKMP),
-                                       &iph->daddr,
-                                       htons(PORT_ISAKMP));
-       } else {
-               cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-                                       &iph->daddr,
-                                       htons(PORT_ISAKMP),
-                                       &iph->saddr,
-                                       htons(PORT_ISAKMP));
-       }
-
-       if (!cp) {
-               IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
-                             "%s%s %s->%s\n",
-                             inverse ? "ICMP+" : "",
-                             pp->name,
-                             IP_VS_DBG_ADDR(af, &iph->saddr),
-                             IP_VS_DBG_ADDR(af, &iph->daddr));
-       }
-
-       return cp;
-}
-
-
-static int
-ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
-                    int *verdict, struct ip_vs_conn **cpp)
-{
-       /*
-        * AH/ESP is only related traffic. Pass the packet to IP stack.
-        */
-       *verdict = NF_ACCEPT;
-       return 0;
-}
-
-
-static void
-ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-                      int offset, const char *msg)
-{
-       char buf[256];
-       struct iphdr _iph, *ih;
-
-       ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-       if (ih == NULL)
-               sprintf(buf, "%s TRUNCATED", pp->name);
-       else
-               sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
-                       pp->name, NIPQUAD(ih->saddr),
-                       NIPQUAD(ih->daddr));
-
-       printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static void
-ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-                      int offset, const char *msg)
-{
-       char buf[256];
-       struct ipv6hdr _iph, *ih;
-
-       ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-       if (ih == NULL)
-               sprintf(buf, "%s TRUNCATED", pp->name);
-       else
-               sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT,
-                       pp->name, NIP6(ih->saddr),
-                       NIP6(ih->daddr));
-
-       printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-#endif
-
-static void
-ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-                   int offset, const char *msg)
-{
-#ifdef CONFIG_IP_VS_IPV6
-       if (skb->protocol == htons(ETH_P_IPV6))
-               ah_esp_debug_packet_v6(pp, skb, offset, msg);
-       else
-#endif
-               ah_esp_debug_packet_v4(pp, skb, offset, msg);
-}
-
-
-static void ah_esp_init(struct ip_vs_protocol *pp)
-{
-       /* nothing to do now */
-}
-
-
-static void ah_esp_exit(struct ip_vs_protocol *pp)
-{
-       /* nothing to do now */
-}
-
-
-#ifdef CONFIG_IP_VS_PROTO_AH
-struct ip_vs_protocol ip_vs_protocol_ah = {
-       .name =                 "AH",
-       .protocol =             IPPROTO_AH,
-       .num_states =           1,
-       .dont_defrag =          1,
-       .init =                 ah_esp_init,
-       .exit =                 ah_esp_exit,
-       .conn_schedule =        ah_esp_conn_schedule,
-       .conn_in_get =          ah_esp_conn_in_get,
-       .conn_out_get =         ah_esp_conn_out_get,
-       .snat_handler =         NULL,
-       .dnat_handler =         NULL,
-       .csum_check =           NULL,
-       .state_transition =     NULL,
-       .register_app =         NULL,
-       .unregister_app =       NULL,
-       .app_conn_bind =        NULL,
-       .debug_packet =         ah_esp_debug_packet,
-       .timeout_change =       NULL,           /* ISAKMP */
-       .set_state_timeout =    NULL,
-};
-#endif
-
-#ifdef CONFIG_IP_VS_PROTO_ESP
-struct ip_vs_protocol ip_vs_protocol_esp = {
-       .name =                 "ESP",
-       .protocol =             IPPROTO_ESP,
-       .num_states =           1,
-       .dont_defrag =          1,
-       .init =                 ah_esp_init,
-       .exit =                 ah_esp_exit,
-       .conn_schedule =        ah_esp_conn_schedule,
-       .conn_in_get =          ah_esp_conn_in_get,
-       .conn_out_get =         ah_esp_conn_out_get,
-       .snat_handler =         NULL,
-       .dnat_handler =         NULL,
-       .csum_check =           NULL,
-       .state_transition =     NULL,
-       .register_app =         NULL,
-       .unregister_app =       NULL,
-       .app_conn_bind =        NULL,
-       .debug_packet =         ah_esp_debug_packet,
-       .timeout_change =       NULL,           /* ISAKMP */
-};
-#endif
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c

deleted file mode 100644 (file)

index dd4566e..0000000
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ /dev/null
@@ -1,732 +0,0 @@
-/*
- * ip_vs_proto_tcp.c:  TCP load balancing support for IPVS
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>                  /* for tcphdr */
-#include <net/ip.h>
-#include <net/tcp.h>                    /* for csum_tcpudp_magic */
-#include <net/ip6_checksum.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-static struct ip_vs_conn *
-tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-               const struct ip_vs_iphdr *iph, unsigned int proto_off,
-               int inverse)
-{
-       __be16 _ports[2], *pptr;
-
-       pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-       if (pptr == NULL)
-               return NULL;
-
-       if (likely(!inverse)) {
-               return ip_vs_conn_in_get(af, iph->protocol,
-                                        &iph->saddr, pptr[0],
-                                        &iph->daddr, pptr[1]);
-       } else {
-               return ip_vs_conn_in_get(af, iph->protocol,
-                                        &iph->daddr, pptr[1],
-                                        &iph->saddr, pptr[0]);
-       }
-}
-
-static struct ip_vs_conn *
-tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-                const struct ip_vs_iphdr *iph, unsigned int proto_off,
-                int inverse)
-{
-       __be16 _ports[2], *pptr;
-
-       pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-       if (pptr == NULL)
-               return NULL;
-
-       if (likely(!inverse)) {
-               return ip_vs_conn_out_get(af, iph->protocol,
-                                         &iph->saddr, pptr[0],
-                                         &iph->daddr, pptr[1]);
-       } else {
-               return ip_vs_conn_out_get(af, iph->protocol,
-                                         &iph->daddr, pptr[1],
-                                         &iph->saddr, pptr[0]);
-       }
-}
-
-
-static int
-tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
-                 int *verdict, struct ip_vs_conn **cpp)
-{
-       struct ip_vs_service *svc;
-       struct tcphdr _tcph, *th;
-       struct ip_vs_iphdr iph;
-
-       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
-       th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
-       if (th == NULL) {
-               *verdict = NF_DROP;
-               return 0;
-       }
-
-       if (th->syn &&
-           (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
-                                    th->dest))) {
-               if (ip_vs_todrop()) {
-                       /*
-                        * It seems that we are very loaded.
-                        * We have to drop this packet :(
-                        */
-                       ip_vs_service_put(svc);
-                       *verdict = NF_DROP;
-                       return 0;
-               }
-
-               /*
-                * Let the virtual server select a real server for the
-                * incoming connection, and create a connection entry.
-                */
-               *cpp = ip_vs_schedule(svc, skb);
-               if (!*cpp) {
-                       *verdict = ip_vs_leave(svc, skb, pp);
-                       return 0;
-               }
-               ip_vs_service_put(svc);
-       }
-       return 1;
-}
-
-
-static inline void
-tcp_fast_csum_update(int af, struct tcphdr *tcph,
-                    const union nf_inet_addr *oldip,
-                    const union nf_inet_addr *newip,
-                    __be16 oldport, __be16 newport)
-{
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6)
-               tcph->check =
-                       csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
-                                        ip_vs_check_diff2(oldport, newport,
-                                               ~csum_unfold(tcph->check))));
-       else
-#endif
-       tcph->check =
-               csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
-                                ip_vs_check_diff2(oldport, newport,
-                                               ~csum_unfold(tcph->check))));
-}
-
-
-static inline void
-tcp_partial_csum_update(int af, struct tcphdr *tcph,
-                    const union nf_inet_addr *oldip,
-                    const union nf_inet_addr *newip,
-                    __be16 oldlen, __be16 newlen)
-{
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6)
-               tcph->check =
-                       csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
-                                        ip_vs_check_diff2(oldlen, newlen,
-                                               ~csum_unfold(tcph->check))));
-       else
-#endif
-       tcph->check =
-               csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
-                               ip_vs_check_diff2(oldlen, newlen,
-                                               ~csum_unfold(tcph->check))));
-}
-
-
-static int
-tcp_snat_handler(struct sk_buff *skb,
-                struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
-{
-       struct tcphdr *tcph;
-       unsigned int tcphoff;
-       int oldlen;
-
-#ifdef CONFIG_IP_VS_IPV6
-       if (cp->af == AF_INET6)
-               tcphoff = sizeof(struct ipv6hdr);
-       else
-#endif
-               tcphoff = ip_hdrlen(skb);
-       oldlen = skb->len - tcphoff;
-
-       /* csum_check requires unshared skb */
-       if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
-               return 0;
-
-       if (unlikely(cp->app != NULL)) {
-               /* Some checks before mangling */
-               if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
-                       return 0;
-
-               /* Call application helper if needed */
-               if (!ip_vs_app_pkt_out(cp, skb))
-                       return 0;
-       }
-
-       tcph = (void *)skb_network_header(skb) + tcphoff;
-       tcph->source = cp->vport;
-
-       /* Adjust TCP checksums */
-       if (skb->ip_summed == CHECKSUM_PARTIAL) {
-               tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
-                                       htonl(oldlen),
-                                       htonl(skb->len - tcphoff));
-       } else if (!cp->app) {
-               /* Only port and addr are changed, do fast csum update */
-               tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
-                                    cp->dport, cp->vport);
-               if (skb->ip_summed == CHECKSUM_COMPLETE)
-                       skb->ip_summed = CHECKSUM_NONE;
-       } else {
-               /* full checksum calculation */
-               tcph->check = 0;
-               skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
-#ifdef CONFIG_IP_VS_IPV6
-               if (cp->af == AF_INET6)
-                       tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
-                                                     &cp->caddr.in6,
-                                                     skb->len - tcphoff,
-                                                     cp->protocol, skb->csum);
-               else
-#endif
-                       tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
-                                                       cp->caddr.ip,
-                                                       skb->len - tcphoff,
-                                                       cp->protocol,
-                                                       skb->csum);
-
-               IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
-                         pp->name, tcph->check,
-                         (char*)&(tcph->check) - (char*)tcph);
-       }
-       return 1;
-}
-
-
-static int
-tcp_dnat_handler(struct sk_buff *skb,
-                struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
-{
-       struct tcphdr *tcph;
-       unsigned int tcphoff;
-       int oldlen;
-
-#ifdef CONFIG_IP_VS_IPV6
-       if (cp->af == AF_INET6)
-               tcphoff = sizeof(struct ipv6hdr);
-       else
-#endif
-               tcphoff = ip_hdrlen(skb);
-       oldlen = skb->len - tcphoff;
-
-       /* csum_check requires unshared skb */
-       if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
-               return 0;
-
-       if (unlikely(cp->app != NULL)) {
-               /* Some checks before mangling */
-               if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
-                       return 0;
-
-               /*
-                *      Attempt ip_vs_app call.
-                *      It will fix ip_vs_conn and iph ack_seq stuff
-                */
-               if (!ip_vs_app_pkt_in(cp, skb))
-                       return 0;
-       }
-
-       tcph = (void *)skb_network_header(skb) + tcphoff;
-       tcph->dest = cp->dport;
-
-       /*
-        *      Adjust TCP checksums
-        */
-       if (skb->ip_summed == CHECKSUM_PARTIAL) {
-               tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
-                                       htonl(oldlen),
-                                       htonl(skb->len - tcphoff));
-       } else if (!cp->app) {
-               /* Only port and addr are changed, do fast csum update */
-               tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
-                                    cp->vport, cp->dport);
-               if (skb->ip_summed == CHECKSUM_COMPLETE)
-                       skb->ip_summed = CHECKSUM_NONE;
-       } else {
-               /* full checksum calculation */
-               tcph->check = 0;
-               skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
-#ifdef CONFIG_IP_VS_IPV6
-               if (cp->af == AF_INET6)
-                       tcph->check = csum_ipv6_magic(&cp->caddr.in6,
-                                                     &cp->daddr.in6,
-                                                     skb->len - tcphoff,
-                                                     cp->protocol, skb->csum);
-               else
-#endif
-                       tcph->check = csum_tcpudp_magic(cp->caddr.ip,
-                                                       cp->daddr.ip,
-                                                       skb->len - tcphoff,
-                                                       cp->protocol,
-                                                       skb->csum);
-               skb->ip_summed = CHECKSUM_UNNECESSARY;
-       }
-       return 1;
-}
-
-
-static int
-tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
-{
-       unsigned int tcphoff;
-
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6)
-               tcphoff = sizeof(struct ipv6hdr);
-       else
-#endif
-               tcphoff = ip_hdrlen(skb);
-
-       switch (skb->ip_summed) {
-       case CHECKSUM_NONE:
-               skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
-       case CHECKSUM_COMPLETE:
-#ifdef CONFIG_IP_VS_IPV6
-               if (af == AF_INET6) {
-                       if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
-                                           &ipv6_hdr(skb)->daddr,
-                                           skb->len - tcphoff,
-                                           ipv6_hdr(skb)->nexthdr,
-                                           skb->csum)) {
-                               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
-                                                "Failed checksum for");
-                               return 0;
-                       }
-               } else
-#endif
-                       if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
-                                             ip_hdr(skb)->daddr,
-                                             skb->len - tcphoff,
-                                             ip_hdr(skb)->protocol,
-                                             skb->csum)) {
-                               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
-                                                "Failed checksum for");
-                               return 0;
-                       }
-               break;
-       default:
-               /* No need to checksum. */
-               break;
-       }
-
-       return 1;
-}
-
-
-#define TCP_DIR_INPUT          0
-#define TCP_DIR_OUTPUT         4
-#define TCP_DIR_INPUT_ONLY     8
-
-static const int tcp_state_off[IP_VS_DIR_LAST] = {
-       [IP_VS_DIR_INPUT]               =       TCP_DIR_INPUT,
-       [IP_VS_DIR_OUTPUT]              =       TCP_DIR_OUTPUT,
-       [IP_VS_DIR_INPUT_ONLY]          =       TCP_DIR_INPUT_ONLY,
-};
-
-/*
- *     Timeout table[state]
- */
-static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
-       [IP_VS_TCP_S_NONE]              =       2*HZ,
-       [IP_VS_TCP_S_ESTABLISHED]       =       15*60*HZ,
-       [IP_VS_TCP_S_SYN_SENT]          =       2*60*HZ,
-       [IP_VS_TCP_S_SYN_RECV]          =       1*60*HZ,
-       [IP_VS_TCP_S_FIN_WAIT]          =       2*60*HZ,
-       [IP_VS_TCP_S_TIME_WAIT]         =       2*60*HZ,
-       [IP_VS_TCP_S_CLOSE]             =       10*HZ,
-       [IP_VS_TCP_S_CLOSE_WAIT]        =       60*HZ,
-       [IP_VS_TCP_S_LAST_ACK]          =       30*HZ,
-       [IP_VS_TCP_S_LISTEN]            =       2*60*HZ,
-       [IP_VS_TCP_S_SYNACK]            =       120*HZ,
-       [IP_VS_TCP_S_LAST]              =       2*HZ,
-};
-
-static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
-       [IP_VS_TCP_S_NONE]              =       "NONE",
-       [IP_VS_TCP_S_ESTABLISHED]       =       "ESTABLISHED",
-       [IP_VS_TCP_S_SYN_SENT]          =       "SYN_SENT",
-       [IP_VS_TCP_S_SYN_RECV]          =       "SYN_RECV",
-       [IP_VS_TCP_S_FIN_WAIT]          =       "FIN_WAIT",
-       [IP_VS_TCP_S_TIME_WAIT]         =       "TIME_WAIT",
-       [IP_VS_TCP_S_CLOSE]             =       "CLOSE",
-       [IP_VS_TCP_S_CLOSE_WAIT]        =       "CLOSE_WAIT",
-       [IP_VS_TCP_S_LAST_ACK]          =       "LAST_ACK",
-       [IP_VS_TCP_S_LISTEN]            =       "LISTEN",
-       [IP_VS_TCP_S_SYNACK]            =       "SYNACK",
-       [IP_VS_TCP_S_LAST]              =       "BUG!",
-};
-
-#define sNO IP_VS_TCP_S_NONE
-#define sES IP_VS_TCP_S_ESTABLISHED
-#define sSS IP_VS_TCP_S_SYN_SENT
-#define sSR IP_VS_TCP_S_SYN_RECV
-#define sFW IP_VS_TCP_S_FIN_WAIT
-#define sTW IP_VS_TCP_S_TIME_WAIT
-#define sCL IP_VS_TCP_S_CLOSE
-#define sCW IP_VS_TCP_S_CLOSE_WAIT
-#define sLA IP_VS_TCP_S_LAST_ACK
-#define sLI IP_VS_TCP_S_LISTEN
-#define sSA IP_VS_TCP_S_SYNACK
-
-struct tcp_states_t {
-       int next_state[IP_VS_TCP_S_LAST];
-};
-
-static const char * tcp_state_name(int state)
-{
-       if (state >= IP_VS_TCP_S_LAST)
-               return "ERR!";
-       return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
-}
-
-static struct tcp_states_t tcp_states [] = {
-/*     INPUT */
-/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
-/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
-/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
-/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
-
-/*     OUTPUT */
-/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
-/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
-/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
-/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
-/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
-
-/*     INPUT-ONLY */
-/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
-/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
-/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
-/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
-};
-
-static struct tcp_states_t tcp_states_dos [] = {
-/*     INPUT */
-/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
-/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
-/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
-/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
-/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
-
-/*     OUTPUT */
-/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
-/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
-/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
-/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
-/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
-
-/*     INPUT-ONLY */
-/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
-/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
-/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
-/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
-};
-
-static struct tcp_states_t *tcp_state_table = tcp_states;
-
-
-static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
-{
-       int on = (flags & 1);           /* secure_tcp */
-
-       /*
-       ** FIXME: change secure_tcp to independent sysctl var
-       ** or make it per-service or per-app because it is valid
-       ** for most if not for all of the applications. Something
-       ** like "capabilities" (flags) for each object.
-       */
-       tcp_state_table = (on? tcp_states_dos : tcp_states);
-}
-
-static int
-tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
-       return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
-                                      tcp_state_name_table, sname, to);
-}
-
-static inline int tcp_state_idx(struct tcphdr *th)
-{
-       if (th->rst)
-               return 3;
-       if (th->syn)
-               return 0;
-       if (th->fin)
-               return 1;
-       if (th->ack)
-               return 2;
-       return -1;
-}
-
-static inline void
-set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
-             int direction, struct tcphdr *th)
-{
-       int state_idx;
-       int new_state = IP_VS_TCP_S_CLOSE;
-       int state_off = tcp_state_off[direction];
-
-       /*
-        *    Update state offset to INPUT_ONLY if necessary
-        *    or delete NO_OUTPUT flag if output packet detected
-        */
-       if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
-               if (state_off == TCP_DIR_OUTPUT)
-                       cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
-               else
-                       state_off = TCP_DIR_INPUT_ONLY;
-       }
-
-       if ((state_idx = tcp_state_idx(th)) < 0) {
-               IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
-               goto tcp_state_out;
-       }
-
-       new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
-
-  tcp_state_out:
-       if (new_state != cp->state) {
-               struct ip_vs_dest *dest = cp->dest;
-
-               IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
-                             "%s:%d state: %s->%s conn->refcnt:%d\n",
-                             pp->name,
-                             ((state_off == TCP_DIR_OUTPUT) ?
-                              "output " : "input "),
-                             th->syn ? 'S' : '.',
-                             th->fin ? 'F' : '.',
-                             th->ack ? 'A' : '.',
-                             th->rst ? 'R' : '.',
-                             IP_VS_DBG_ADDR(cp->af, &cp->daddr),
-                             ntohs(cp->dport),
-                             IP_VS_DBG_ADDR(cp->af, &cp->caddr),
-                             ntohs(cp->cport),
-                             tcp_state_name(cp->state),
-                             tcp_state_name(new_state),
-                             atomic_read(&cp->refcnt));
-
-               if (dest) {
-                       if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
-                           (new_state != IP_VS_TCP_S_ESTABLISHED)) {
-                               atomic_dec(&dest->activeconns);
-                               atomic_inc(&dest->inactconns);
-                               cp->flags |= IP_VS_CONN_F_INACTIVE;
-                       } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
-                                  (new_state == IP_VS_TCP_S_ESTABLISHED)) {
-                               atomic_inc(&dest->activeconns);
-                               atomic_dec(&dest->inactconns);
-                               cp->flags &= ~IP_VS_CONN_F_INACTIVE;
-                       }
-               }
-       }
-
-       cp->timeout = pp->timeout_table[cp->state = new_state];
-}
-
-
-/*
- *     Handle state transitions
- */
-static int
-tcp_state_transition(struct ip_vs_conn *cp, int direction,
-                    const struct sk_buff *skb,
-                    struct ip_vs_protocol *pp)
-{
-       struct tcphdr _tcph, *th;
-
-#ifdef CONFIG_IP_VS_IPV6
-       int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
-#else
-       int ihl = ip_hdrlen(skb);
-#endif
-
-       th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
-       if (th == NULL)
-               return 0;
-
-       spin_lock(&cp->lock);
-       set_tcp_state(pp, cp, direction, th);
-       spin_unlock(&cp->lock);
-
-       return 1;
-}
-
-
-/*
- *     Hash table for TCP application incarnations
- */
-#define        TCP_APP_TAB_BITS        4
-#define        TCP_APP_TAB_SIZE        (1 << TCP_APP_TAB_BITS)
-#define        TCP_APP_TAB_MASK        (TCP_APP_TAB_SIZE - 1)
-
-static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(tcp_app_lock);
-
-static inline __u16 tcp_app_hashkey(__be16 port)
-{
-       return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
-               & TCP_APP_TAB_MASK;
-}
-
-
-static int tcp_register_app(struct ip_vs_app *inc)
-{
-       struct ip_vs_app *i;
-       __u16 hash;
-       __be16 port = inc->port;
-       int ret = 0;
-
-       hash = tcp_app_hashkey(port);
-
-       spin_lock_bh(&tcp_app_lock);
-       list_for_each_entry(i, &tcp_apps[hash], p_list) {
-               if (i->port == port) {
-                       ret = -EEXIST;
-                       goto out;
-               }
-       }
-       list_add(&inc->p_list, &tcp_apps[hash]);
-       atomic_inc(&ip_vs_protocol_tcp.appcnt);
-
-  out:
-       spin_unlock_bh(&tcp_app_lock);
-       return ret;
-}
-
-
-static void
-tcp_unregister_app(struct ip_vs_app *inc)
-{
-       spin_lock_bh(&tcp_app_lock);
-       atomic_dec(&ip_vs_protocol_tcp.appcnt);
-       list_del(&inc->p_list);
-       spin_unlock_bh(&tcp_app_lock);
-}
-
-
-static int
-tcp_app_conn_bind(struct ip_vs_conn *cp)
-{
-       int hash;
-       struct ip_vs_app *inc;
-       int result = 0;
-
-       /* Default binding: bind app only for NAT */
-       if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
-               return 0;
-
-       /* Lookup application incarnations and bind the right one */
-       hash = tcp_app_hashkey(cp->vport);
-
-       spin_lock(&tcp_app_lock);
-       list_for_each_entry(inc, &tcp_apps[hash], p_list) {
-               if (inc->port == cp->vport) {
-                       if (unlikely(!ip_vs_app_inc_get(inc)))
-                               break;
-                       spin_unlock(&tcp_app_lock);
-
-                       IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
-                                     "%s:%u to app %s on port %u\n",
-                                     __func__,
-                                     IP_VS_DBG_ADDR(cp->af, &cp->caddr),
-                                     ntohs(cp->cport),
-                                     IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
-                                     ntohs(cp->vport),
-                                     inc->name, ntohs(inc->port));
-
-                       cp->app = inc;
-                       if (inc->init_conn)
-                               result = inc->init_conn(inc, cp);
-                       goto out;
-               }
-       }
-       spin_unlock(&tcp_app_lock);
-
-  out:
-       return result;
-}
-
-
-/*
- *     Set LISTEN timeout. (ip_vs_conn_put will setup timer)
- */
-void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
-{
-       spin_lock(&cp->lock);
-       cp->state = IP_VS_TCP_S_LISTEN;
-       cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
-       spin_unlock(&cp->lock);
-}
-
-
-static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
-{
-       IP_VS_INIT_HASH_TABLE(tcp_apps);
-       pp->timeout_table = tcp_timeouts;
-}
-
-
-static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
-{
-}
-
-
-struct ip_vs_protocol ip_vs_protocol_tcp = {
-       .name =                 "TCP",
-       .protocol =             IPPROTO_TCP,
-       .num_states =           IP_VS_TCP_S_LAST,
-       .dont_defrag =          0,
-       .appcnt =               ATOMIC_INIT(0),
-       .init =                 ip_vs_tcp_init,
-       .exit =                 ip_vs_tcp_exit,
-       .register_app =         tcp_register_app,
-       .unregister_app =       tcp_unregister_app,
-       .conn_schedule =        tcp_conn_schedule,
-       .conn_in_get =          tcp_conn_in_get,
-       .conn_out_get =         tcp_conn_out_get,
-       .snat_handler =         tcp_snat_handler,
-       .dnat_handler =         tcp_dnat_handler,
-       .csum_check =           tcp_csum_check,
-       .state_name =           tcp_state_name,
-       .state_transition =     tcp_state_transition,
-       .app_conn_bind =        tcp_app_conn_bind,
-       .debug_packet =         ip_vs_tcpudp_debug_packet,
-       .timeout_change =       tcp_timeout_change,
-       .set_state_timeout =    tcp_set_state_timeout,
-};
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c

deleted file mode 100644 (file)

index 6eb6039..0000000
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ /dev/null
@@ -1,533 +0,0 @@
-/*
- * ip_vs_proto_udp.c:  UDP load balancing support for IPVS
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/kernel.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/udp.h>
-
-#include <net/ip_vs.h>
-#include <net/ip.h>
-#include <net/ip6_checksum.h>
-
-static struct ip_vs_conn *
-udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-               const struct ip_vs_iphdr *iph, unsigned int proto_off,
-               int inverse)
-{
-       struct ip_vs_conn *cp;
-       __be16 _ports[2], *pptr;
-
-       pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-       if (pptr == NULL)
-               return NULL;
-
-       if (likely(!inverse)) {
-               cp = ip_vs_conn_in_get(af, iph->protocol,
-                                      &iph->saddr, pptr[0],
-                                      &iph->daddr, pptr[1]);
-       } else {
-               cp = ip_vs_conn_in_get(af, iph->protocol,
-                                      &iph->daddr, pptr[1],
-                                      &iph->saddr, pptr[0]);
-       }
-
-       return cp;
-}
-
-
-static struct ip_vs_conn *
-udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-                const struct ip_vs_iphdr *iph, unsigned int proto_off,
-                int inverse)
-{
-       struct ip_vs_conn *cp;
-       __be16 _ports[2], *pptr;
-
-       pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-       if (pptr == NULL)
-               return NULL;
-
-       if (likely(!inverse)) {
-               cp = ip_vs_conn_out_get(af, iph->protocol,
-                                       &iph->saddr, pptr[0],
-                                       &iph->daddr, pptr[1]);
-       } else {
-               cp = ip_vs_conn_out_get(af, iph->protocol,
-                                       &iph->daddr, pptr[1],
-                                       &iph->saddr, pptr[0]);
-       }
-
-       return cp;
-}
-
-
-static int
-udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
-                 int *verdict, struct ip_vs_conn **cpp)
-{
-       struct ip_vs_service *svc;
-       struct udphdr _udph, *uh;
-       struct ip_vs_iphdr iph;
-
-       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
-       uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
-       if (uh == NULL) {
-               *verdict = NF_DROP;
-               return 0;
-       }
-
-       svc = ip_vs_service_get(af, skb->mark, iph.protocol,
-                               &iph.daddr, uh->dest);
-       if (svc) {
-               if (ip_vs_todrop()) {
-                       /*
-                        * It seems that we are very loaded.
-                        * We have to drop this packet :(
-                        */
-                       ip_vs_service_put(svc);
-                       *verdict = NF_DROP;
-                       return 0;
-               }
-
-               /*
-                * Let the virtual server select a real server for the
-                * incoming connection, and create a connection entry.
-                */
-               *cpp = ip_vs_schedule(svc, skb);
-               if (!*cpp) {
-                       *verdict = ip_vs_leave(svc, skb, pp);
-                       return 0;
-               }
-               ip_vs_service_put(svc);
-       }
-       return 1;
-}
-
-
-static inline void
-udp_fast_csum_update(int af, struct udphdr *uhdr,
-                    const union nf_inet_addr *oldip,
-                    const union nf_inet_addr *newip,
-                    __be16 oldport, __be16 newport)
-{
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6)
-               uhdr->check =
-                       csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
-                                        ip_vs_check_diff2(oldport, newport,
-                                               ~csum_unfold(uhdr->check))));
-       else
-#endif
-               uhdr->check =
-                       csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
-                                        ip_vs_check_diff2(oldport, newport,
-                                               ~csum_unfold(uhdr->check))));
-       if (!uhdr->check)
-               uhdr->check = CSUM_MANGLED_0;
-}
-
-static inline void
-udp_partial_csum_update(int af, struct udphdr *uhdr,
-                    const union nf_inet_addr *oldip,
-                    const union nf_inet_addr *newip,
-                    __be16 oldlen, __be16 newlen)
-{
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6)
-               uhdr->check =
-                       csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
-                                        ip_vs_check_diff2(oldlen, newlen,
-                                               ~csum_unfold(uhdr->check))));
-       else
-#endif
-       uhdr->check =
-               csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
-                               ip_vs_check_diff2(oldlen, newlen,
-                                               ~csum_unfold(uhdr->check))));
-}
-
-
-static int
-udp_snat_handler(struct sk_buff *skb,
-                struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
-{
-       struct udphdr *udph;
-       unsigned int udphoff;
-       int oldlen;
-
-#ifdef CONFIG_IP_VS_IPV6
-       if (cp->af == AF_INET6)
-               udphoff = sizeof(struct ipv6hdr);
-       else
-#endif
-               udphoff = ip_hdrlen(skb);
-       oldlen = skb->len - udphoff;
-
-       /* csum_check requires unshared skb */
-       if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
-               return 0;
-
-       if (unlikely(cp->app != NULL)) {
-               /* Some checks before mangling */
-               if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
-                       return 0;
-
-               /*
-                *      Call application helper if needed
-                */
-               if (!ip_vs_app_pkt_out(cp, skb))
-                       return 0;
-       }
-
-       udph = (void *)skb_network_header(skb) + udphoff;
-       udph->source = cp->vport;
-
-       /*
-        *      Adjust UDP checksums
-        */
-       if (skb->ip_summed == CHECKSUM_PARTIAL) {
-               udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
-                                       htonl(oldlen),
-                                       htonl(skb->len - udphoff));
-       } else if (!cp->app && (udph->check != 0)) {
-               /* Only port and addr are changed, do fast csum update */
-               udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
-                                    cp->dport, cp->vport);
-               if (skb->ip_summed == CHECKSUM_COMPLETE)
-                       skb->ip_summed = CHECKSUM_NONE;
-       } else {
-               /* full checksum calculation */
-               udph->check = 0;
-               skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
-#ifdef CONFIG_IP_VS_IPV6
-               if (cp->af == AF_INET6)
-                       udph->check = csum_ipv6_magic(&cp->vaddr.in6,
-                                                     &cp->caddr.in6,
-                                                     skb->len - udphoff,
-                                                     cp->protocol, skb->csum);
-               else
-#endif
-                       udph->check = csum_tcpudp_magic(cp->vaddr.ip,
-                                                       cp->caddr.ip,
-                                                       skb->len - udphoff,
-                                                       cp->protocol,
-                                                       skb->csum);
-               if (udph->check == 0)
-                       udph->check = CSUM_MANGLED_0;
-               IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
-                         pp->name, udph->check,
-                         (char*)&(udph->check) - (char*)udph);
-       }
-       return 1;
-}
-
-
-static int
-udp_dnat_handler(struct sk_buff *skb,
-                struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
-{
-       struct udphdr *udph;
-       unsigned int udphoff;
-       int oldlen;
-
-#ifdef CONFIG_IP_VS_IPV6
-       if (cp->af == AF_INET6)
-               udphoff = sizeof(struct ipv6hdr);
-       else
-#endif
-               udphoff = ip_hdrlen(skb);
-       oldlen = skb->len - udphoff;
-
-       /* csum_check requires unshared skb */
-       if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
-               return 0;
-
-       if (unlikely(cp->app != NULL)) {
-               /* Some checks before mangling */
-               if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
-                       return 0;
-
-               /*
-                *      Attempt ip_vs_app call.
-                *      It will fix ip_vs_conn
-                */
-               if (!ip_vs_app_pkt_in(cp, skb))
-                       return 0;
-       }
-
-       udph = (void *)skb_network_header(skb) + udphoff;
-       udph->dest = cp->dport;
-
-       /*
-        *      Adjust UDP checksums
-        */
-       if (skb->ip_summed == CHECKSUM_PARTIAL) {
-               udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
-                                       htonl(oldlen),
-                                       htonl(skb->len - udphoff));
-       } else if (!cp->app && (udph->check != 0)) {
-               /* Only port and addr are changed, do fast csum update */
-               udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
-                                    cp->vport, cp->dport);
-               if (skb->ip_summed == CHECKSUM_COMPLETE)
-                       skb->ip_summed = CHECKSUM_NONE;
-       } else {
-               /* full checksum calculation */
-               udph->check = 0;
-               skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
-#ifdef CONFIG_IP_VS_IPV6
-               if (cp->af == AF_INET6)
-                       udph->check = csum_ipv6_magic(&cp->caddr.in6,
-                                                     &cp->daddr.in6,
-                                                     skb->len - udphoff,
-                                                     cp->protocol, skb->csum);
-               else
-#endif
-                       udph->check = csum_tcpudp_magic(cp->caddr.ip,
-                                                       cp->daddr.ip,
-                                                       skb->len - udphoff,
-                                                       cp->protocol,
-                                                       skb->csum);
-               if (udph->check == 0)
-                       udph->check = CSUM_MANGLED_0;
-               skb->ip_summed = CHECKSUM_UNNECESSARY;
-       }
-       return 1;
-}
-
-
-static int
-udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
-{
-       struct udphdr _udph, *uh;
-       unsigned int udphoff;
-
-#ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6)
-               udphoff = sizeof(struct ipv6hdr);
-       else
-#endif
-               udphoff = ip_hdrlen(skb);
-
-       uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
-       if (uh == NULL)
-               return 0;
-
-       if (uh->check != 0) {
-               switch (skb->ip_summed) {
-               case CHECKSUM_NONE:
-                       skb->csum = skb_checksum(skb, udphoff,
-                                                skb->len - udphoff, 0);
-               case CHECKSUM_COMPLETE:
-#ifdef CONFIG_IP_VS_IPV6
-                       if (af == AF_INET6) {
-                               if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
-                                                   &ipv6_hdr(skb)->daddr,
-                                                   skb->len - udphoff,
-                                                   ipv6_hdr(skb)->nexthdr,
-                                                   skb->csum)) {
-                                       IP_VS_DBG_RL_PKT(0, pp, skb, 0,
-                                                        "Failed checksum for");
-                                       return 0;
-                               }
-                       } else
-#endif
-                               if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
-                                                     ip_hdr(skb)->daddr,
-                                                     skb->len - udphoff,
-                                                     ip_hdr(skb)->protocol,
-                                                     skb->csum)) {
-                                       IP_VS_DBG_RL_PKT(0, pp, skb, 0,
-                                                        "Failed checksum for");
-                                       return 0;
-                               }
-                       break;
-               default:
-                       /* No need to checksum. */
-                       break;
-               }
-       }
-       return 1;
-}
-
-
-/*
- *     Note: the caller guarantees that only one of register_app,
- *     unregister_app or app_conn_bind is called each time.
- */
-
-#define        UDP_APP_TAB_BITS        4
-#define        UDP_APP_TAB_SIZE        (1 << UDP_APP_TAB_BITS)
-#define        UDP_APP_TAB_MASK        (UDP_APP_TAB_SIZE - 1)
-
-static struct list_head udp_apps[UDP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(udp_app_lock);
-
-static inline __u16 udp_app_hashkey(__be16 port)
-{
-       return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
-               & UDP_APP_TAB_MASK;
-}
-
-
-static int udp_register_app(struct ip_vs_app *inc)
-{
-       struct ip_vs_app *i;
-       __u16 hash;
-       __be16 port = inc->port;
-       int ret = 0;
-
-       hash = udp_app_hashkey(port);
-
-
-       spin_lock_bh(&udp_app_lock);
-       list_for_each_entry(i, &udp_apps[hash], p_list) {
-               if (i->port == port) {
-                       ret = -EEXIST;
-                       goto out;
-               }
-       }
-       list_add(&inc->p_list, &udp_apps[hash]);
-       atomic_inc(&ip_vs_protocol_udp.appcnt);
-
-  out:
-       spin_unlock_bh(&udp_app_lock);
-       return ret;
-}
-
-
-static void
-udp_unregister_app(struct ip_vs_app *inc)
-{
-       spin_lock_bh(&udp_app_lock);
-       atomic_dec(&ip_vs_protocol_udp.appcnt);
-       list_del(&inc->p_list);
-       spin_unlock_bh(&udp_app_lock);
-}
-
-
-static int udp_app_conn_bind(struct ip_vs_conn *cp)
-{
-       int hash;
-       struct ip_vs_app *inc;
-       int result = 0;
-
-       /* Default binding: bind app only for NAT */
-       if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
-               return 0;
-
-       /* Lookup application incarnations and bind the right one */
-       hash = udp_app_hashkey(cp->vport);
-
-       spin_lock(&udp_app_lock);
-       list_for_each_entry(inc, &udp_apps[hash], p_list) {
-               if (inc->port == cp->vport) {
-                       if (unlikely(!ip_vs_app_inc_get(inc)))
-                               break;
-                       spin_unlock(&udp_app_lock);
-
-                       IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
-                                     "%s:%u to app %s on port %u\n",
-                                     __func__,
-                                     IP_VS_DBG_ADDR(cp->af, &cp->caddr),
-                                     ntohs(cp->cport),
-                                     IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
-                                     ntohs(cp->vport),
-                                     inc->name, ntohs(inc->port));
-
-                       cp->app = inc;
-                       if (inc->init_conn)
-                               result = inc->init_conn(inc, cp);
-                       goto out;
-               }
-       }
-       spin_unlock(&udp_app_lock);
-
-  out:
-       return result;
-}
-
-
-static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
-       [IP_VS_UDP_S_NORMAL]            =       5*60*HZ,
-       [IP_VS_UDP_S_LAST]              =       2*HZ,
-};
-
-static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
-       [IP_VS_UDP_S_NORMAL]            =       "UDP",
-       [IP_VS_UDP_S_LAST]              =       "BUG!",
-};
-
-
-static int
-udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
-       return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
-                                      udp_state_name_table, sname, to);
-}
-
-static const char * udp_state_name(int state)
-{
-       if (state >= IP_VS_UDP_S_LAST)
-               return "ERR!";
-       return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
-}
-
-static int
-udp_state_transition(struct ip_vs_conn *cp, int direction,
-                    const struct sk_buff *skb,
-                    struct ip_vs_protocol *pp)
-{
-       cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
-       return 1;
-}
-
-static void udp_init(struct ip_vs_protocol *pp)
-{
-       IP_VS_INIT_HASH_TABLE(udp_apps);
-       pp->timeout_table = udp_timeouts;
-}
-
-static void udp_exit(struct ip_vs_protocol *pp)
-{
-}
-
-
-struct ip_vs_protocol ip_vs_protocol_udp = {
-       .name =                 "UDP",
-       .protocol =             IPPROTO_UDP,
-       .num_states =           IP_VS_UDP_S_LAST,
-       .dont_defrag =          0,
-       .init =                 udp_init,
-       .exit =                 udp_exit,
-       .conn_schedule =        udp_conn_schedule,
-       .conn_in_get =          udp_conn_in_get,
-       .conn_out_get =         udp_conn_out_get,
-       .snat_handler =         udp_snat_handler,
-       .dnat_handler =         udp_dnat_handler,
-       .csum_check =           udp_csum_check,
-       .state_transition =     udp_state_transition,
-       .state_name =           udp_state_name,
-       .register_app =         udp_register_app,
-       .unregister_app =       udp_unregister_app,
-       .app_conn_bind =        udp_app_conn_bind,
-       .debug_packet =         ip_vs_tcpudp_debug_packet,
-       .timeout_change =       NULL,
-       .set_state_timeout =    udp_set_state_timeout,
-};
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c

deleted file mode 100644 (file)

index a22195f..0000000
--- a/net/ipv4/ipvs/ip_vs_rr.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * IPVS:        Round-Robin Scheduling module
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Peter Kese <peter.kese@ijs.si>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Fixes/Changes:
- *     Wensong Zhang            :     changed the ip_vs_rr_schedule to return dest
- *     Julian Anastasov         :     fixed the NULL pointer access bug in debugging
- *     Wensong Zhang            :     changed some comestics things for debugging
- *     Wensong Zhang            :     changed for the d-linked destination list
- *     Wensong Zhang            :     added the ip_vs_rr_update_svc
- *     Wensong Zhang            :     added any dest with weight=0 is quiesced
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <net/ip_vs.h>
-
-
-static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
-{
-       svc->sched_data = &svc->destinations;
-       return 0;
-}
-
-
-static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
-{
-       svc->sched_data = &svc->destinations;
-       return 0;
-}
-
-
-/*
- * Round-Robin Scheduling
- */
-static struct ip_vs_dest *
-ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-       struct list_head *p, *q;
-       struct ip_vs_dest *dest;
-
-       IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
-
-       write_lock(&svc->sched_lock);
-       p = (struct list_head *)svc->sched_data;
-       p = p->next;
-       q = p;
-       do {
-               /* skip list head */
-               if (q == &svc->destinations) {
-                       q = q->next;
-                       continue;
-               }
-
-               dest = list_entry(q, struct ip_vs_dest, n_list);
-               if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
-                   atomic_read(&dest->weight) > 0)
-                       /* HIT */
-                       goto out;
-               q = q->next;
-       } while (q != p);
-       write_unlock(&svc->sched_lock);
-       return NULL;
-
-  out:
-       svc->sched_data = q;
-       write_unlock(&svc->sched_lock);
-       IP_VS_DBG_BUF(6, "RR: server %s:%u "
-                     "activeconns %d refcnt %d weight %d\n",
-                     IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
-                     atomic_read(&dest->activeconns),
-                     atomic_read(&dest->refcnt), atomic_read(&dest->weight));
-
-       return dest;
-}
-
-
-static struct ip_vs_scheduler ip_vs_rr_scheduler = {
-       .name =                 "rr",                   /* name */
-       .refcnt =               ATOMIC_INIT(0),
-       .module =               THIS_MODULE,
-       .n_list =               LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-       .supports_ipv6 =        1,
-#endif
-       .init_service =         ip_vs_rr_init_svc,
-       .update_service =       ip_vs_rr_update_svc,
-       .schedule =             ip_vs_rr_schedule,
-};
-
-static int __init ip_vs_rr_init(void)
-{
-       return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
-}
-
-static void __exit ip_vs_rr_cleanup(void)
-{
-       unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
-}
-
-module_init(ip_vs_rr_init);
-module_exit(ip_vs_rr_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c

deleted file mode 100644 (file)

index a46ad9e..0000000
--- a/net/ipv4/ipvs/ip_vs_sched.c
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * IPVS         An implementation of the IP virtual server support for the
- *              LINUX operating system.  IPVS is now implemented as a module
- *              over the Netfilter framework. IPVS can be used to build a
- *              high-performance and highly available server based on a
- *              cluster of servers.
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Peter Kese <peter.kese@ijs.si>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <asm/string.h>
-#include <linux/kmod.h>
-#include <linux/sysctl.h>
-
-#include <net/ip_vs.h>
-
-/*
- *  IPVS scheduler list
- */
-static LIST_HEAD(ip_vs_schedulers);
-
-/* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_sched_lock);
-
-
-/*
- *  Bind a service with a scheduler
- */
-int ip_vs_bind_scheduler(struct ip_vs_service *svc,
-                        struct ip_vs_scheduler *scheduler)
-{
-       int ret;
-
-       if (svc == NULL) {
-               IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
-               return -EINVAL;
-       }
-       if (scheduler == NULL) {
-               IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
-               return -EINVAL;
-       }
-
-       svc->scheduler = scheduler;
-
-       if (scheduler->init_service) {
-               ret = scheduler->init_service(svc);
-               if (ret) {
-                       IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
-                       return ret;
-               }
-       }
-
-       return 0;
-}
-
-
-/*
- *  Unbind a service with its scheduler
- */
-int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
-{
-       struct ip_vs_scheduler *sched;
-
-       if (svc == NULL) {
-               IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
-               return -EINVAL;
-       }
-
-       sched = svc->scheduler;
-       if (sched == NULL) {
-               IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
-               return -EINVAL;
-       }
-
-       if (sched->done_service) {
-               if (sched->done_service(svc) != 0) {
-                       IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
-                       return -EINVAL;
-               }
-       }
-
-       svc->scheduler = NULL;
-       return 0;
-}
-
-
-/*
- *  Get scheduler in the scheduler list by name
- */
-static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
-{
-       struct ip_vs_scheduler *sched;
-
-       IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
-                 sched_name);
-
-       read_lock_bh(&__ip_vs_sched_lock);
-
-       list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
-               /*
-                * Test and get the modules atomically
-                */
-               if (sched->module && !try_module_get(sched->module)) {
-                       /*
-                        * This scheduler is just deleted
-                        */
-                       continue;
-               }
-               if (strcmp(sched_name, sched->name)==0) {
-                       /* HIT */
-                       read_unlock_bh(&__ip_vs_sched_lock);
-                       return sched;
-               }
-               if (sched->module)
-                       module_put(sched->module);
-       }
-
-       read_unlock_bh(&__ip_vs_sched_lock);
-       return NULL;
-}
-
-
-/*
- *  Lookup scheduler and try to load it if it doesn't exist
- */
-struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
-{
-       struct ip_vs_scheduler *sched;
-
-       /*
-        *  Search for the scheduler by sched_name
-        */
-       sched = ip_vs_sched_getbyname(sched_name);
-
-       /*
-        *  If scheduler not found, load the module and search again
-        */
-       if (sched == NULL) {
-               request_module("ip_vs_%s", sched_name);
-               sched = ip_vs_sched_getbyname(sched_name);
-       }
-
-       return sched;
-}
-
-void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
-{
-       if (scheduler->module)
-               module_put(scheduler->module);
-}
-
-
-/*
- *  Register a scheduler in the scheduler list
- */
-int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
-{
-       struct ip_vs_scheduler *sched;
-
-       if (!scheduler) {
-               IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
-               return -EINVAL;
-       }
-
-       if (!scheduler->name) {
-               IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
-               return -EINVAL;
-       }
-
-       /* increase the module use count */
-       ip_vs_use_count_inc();
-
-       write_lock_bh(&__ip_vs_sched_lock);
-
-       if (!list_empty(&scheduler->n_list)) {
-               write_unlock_bh(&__ip_vs_sched_lock);
-               ip_vs_use_count_dec();
-               IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
-                         "already linked\n", scheduler->name);
-               return -EINVAL;
-       }
-
-       /*
-        *  Make sure that the scheduler with this name doesn't exist
-        *  in the scheduler list.
-        */
-       list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
-               if (strcmp(scheduler->name, sched->name) == 0) {
-                       write_unlock_bh(&__ip_vs_sched_lock);
-                       ip_vs_use_count_dec();
-                       IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
-                                       "already existed in the system\n",
-                                       scheduler->name);
-                       return -EINVAL;
-               }
-       }
-       /*
-        *      Add it into the d-linked scheduler list
-        */
-       list_add(&scheduler->n_list, &ip_vs_schedulers);
-       write_unlock_bh(&__ip_vs_sched_lock);
-
-       IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
-
-       return 0;
-}
-
-
-/*
- *  Unregister a scheduler from the scheduler list
- */
-int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
-{
-       if (!scheduler) {
-               IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
-               return -EINVAL;
-       }
-
-       write_lock_bh(&__ip_vs_sched_lock);
-       if (list_empty(&scheduler->n_list)) {
-               write_unlock_bh(&__ip_vs_sched_lock);
-               IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
-                         "is not in the list. failed\n", scheduler->name);
-               return -EINVAL;
-       }
-
-       /*
-        *      Remove it from the d-linked scheduler list
-        */
-       list_del(&scheduler->n_list);
-       write_unlock_bh(&__ip_vs_sched_lock);
-
-       /* decrease the module use count */
-       ip_vs_use_count_dec();
-
-       IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
-
-       return 0;
-}
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c

deleted file mode 100644 (file)

index 7d2f22f..0000000
--- a/net/ipv4/ipvs/ip_vs_sed.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * IPVS:        Shortest Expected Delay scheduling module
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-/*
- * The SED algorithm attempts to minimize each job's expected delay until
- * completion. The expected delay that the job will experience is
- * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
- * jobs on the ith server and Ui is the fixed service rate (weight) of
- * the ith server. The SED algorithm adopts a greedy policy in which each
- * job does what is in its own best interest, i.e. it joins the queue that
- * would minimize its expected delay of completion.
- *
- * See the following paper for more information:
- * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
- * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
- * pages 986-994, 1988.
- *
- * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
- *
- * The difference between SED and WLC is that SED includes the incoming
- * job in the cost function (the increment of 1). SED may outperform
- * WLC, while scheduling big jobs under larger heterogeneous systems
- * (the server weight varies a lot).
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <net/ip_vs.h>
-
-
-static inline unsigned int
-ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
-{
-       /*
-        * We only use the active connection number in the cost
-        * calculation here.
-        */
-       return atomic_read(&dest->activeconns) + 1;
-}
-
-
-/*
- *     Shortest Expected Delay scheduling
- */
-static struct ip_vs_dest *
-ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-       struct ip_vs_dest *dest, *least;
-       unsigned int loh, doh;
-
-       IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
-
-       /*
-        * We calculate the load of each dest server as follows:
-        *      (server expected overhead) / dest->weight
-        *
-        * Remember -- no floats in kernel mode!!!
-        * The comparison of h1*w2 > h2*w1 is equivalent to that of
-        *                h1/w1 > h2/w2
-        * if every weight is larger than zero.
-        *
-        * The server with weight=0 is quiesced and will not receive any
-        * new connections.
-        */
-
-       list_for_each_entry(dest, &svc->destinations, n_list) {
-               if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
-                   atomic_read(&dest->weight) > 0) {
-                       least = dest;
-                       loh = ip_vs_sed_dest_overhead(least);
-                       goto nextstage;
-               }
-       }
-       return NULL;
-
-       /*
-        *    Find the destination with the least load.
-        */
-  nextstage:
-       list_for_each_entry_continue(dest, &svc->destinations, n_list) {
-               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-                       continue;
-               doh = ip_vs_sed_dest_overhead(dest);
-               if (loh * atomic_read(&dest->weight) >
-                   doh * atomic_read(&least->weight)) {
-                       least = dest;
-                       loh = doh;
-               }
-       }
-
-       IP_VS_DBG_BUF(6, "SED: server %s:%u "
-                     "activeconns %d refcnt %d weight %d overhead %d\n",
-                     IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
-                     atomic_read(&least->activeconns),
-                     atomic_read(&least->refcnt),
-                     atomic_read(&least->weight), loh);
-
-       return least;
-}
-
-
-static struct ip_vs_scheduler ip_vs_sed_scheduler =
-{
-       .name =                 "sed",
-       .refcnt =               ATOMIC_INIT(0),
-       .module =               THIS_MODULE,
-       .n_list =               LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-       .supports_ipv6 =        1,
-#endif
-       .schedule =             ip_vs_sed_schedule,
-};
-
-
-static int __init ip_vs_sed_init(void)
-{
-       return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
-}
-
-static void __exit ip_vs_sed_cleanup(void)
-{
-       unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
-}
-
-module_init(ip_vs_sed_init);
-module_exit(ip_vs_sed_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c

deleted file mode 100644 (file)

index 1d96de2..0000000
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * IPVS:        Source Hashing scheduling module
- *
- * Authors:     Wensong Zhang <wensong@gnuchina.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-/*
- * The sh algorithm is to select server by the hash key of source IP
- * address. The pseudo code is as follows:
- *
- *       n <- servernode[src_ip];
- *       if (n is dead) OR
- *          (n is overloaded) or (n.weight <= 0) then
- *                 return NULL;
- *
- *       return n;
- *
- * Note that servernode is a 256-bucket hash table that maps the hash
- * index derived from packet source IP address to the current server
- * array. If the sh scheduler is used in cache cluster, it is good to
- * combine it with cache_bypass feature. When the statically assigned
- * server is dead or overloaded, the load balancer can bypass the cache
- * server and send requests to the original server directly.
- *
- */
-
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- *      IPVS SH bucket
- */
-struct ip_vs_sh_bucket {
-       struct ip_vs_dest       *dest;          /* real server (cache) */
-};
-
-/*
- *     for IPVS SH entry hash table
- */
-#ifndef CONFIG_IP_VS_SH_TAB_BITS
-#define CONFIG_IP_VS_SH_TAB_BITS        8
-#endif
-#define IP_VS_SH_TAB_BITS               CONFIG_IP_VS_SH_TAB_BITS
-#define IP_VS_SH_TAB_SIZE               (1 << IP_VS_SH_TAB_BITS)
-#define IP_VS_SH_TAB_MASK               (IP_VS_SH_TAB_SIZE - 1)
-
-
-/*
- *     Returns hash value for IPVS SH entry
- */
-static inline unsigned ip_vs_sh_hashkey(__be32 addr)
-{
-       return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
-}
-
-
-/*
- *      Get ip_vs_dest associated with supplied parameters.
- */
-static inline struct ip_vs_dest *
-ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr)
-{
-       return (tbl[ip_vs_sh_hashkey(addr)]).dest;
-}
-
-
-/*
- *      Assign all the hash buckets of the specified table with the service.
- */
-static int
-ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
-{
-       int i;
-       struct ip_vs_sh_bucket *b;
-       struct list_head *p;
-       struct ip_vs_dest *dest;
-
-       b = tbl;
-       p = &svc->destinations;
-       for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
-               if (list_empty(p)) {
-                       b->dest = NULL;
-               } else {
-                       if (p == &svc->destinations)
-                               p = p->next;
-
-                       dest = list_entry(p, struct ip_vs_dest, n_list);
-                       atomic_inc(&dest->refcnt);
-                       b->dest = dest;
-
-                       p = p->next;
-               }
-               b++;
-       }
-       return 0;
-}
-
-
-/*
- *      Flush all the hash buckets of the specified table.
- */
-static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
-{
-       int i;
-       struct ip_vs_sh_bucket *b;
-
-       b = tbl;
-       for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
-               if (b->dest) {
-                       atomic_dec(&b->dest->refcnt);
-                       b->dest = NULL;
-               }
-               b++;
-       }
-}
-
-
-static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
-{
-       struct ip_vs_sh_bucket *tbl;
-
-       /* allocate the SH table for this service */
-       tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
-                     GFP_ATOMIC);
-       if (tbl == NULL) {
-               IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
-               return -ENOMEM;
-       }
-       svc->sched_data = tbl;
-       IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
-                 "current service\n",
-                 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
-
-       /* assign the hash buckets with the updated service */
-       ip_vs_sh_assign(tbl, svc);
-
-       return 0;
-}
-
-
-static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
-{
-       struct ip_vs_sh_bucket *tbl = svc->sched_data;
-
-       /* got to clean up hash buckets here */
-       ip_vs_sh_flush(tbl);
-
-       /* release the table itself */
-       kfree(svc->sched_data);
-       IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
-                 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
-
-       return 0;
-}
-
-
-static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
-{
-       struct ip_vs_sh_bucket *tbl = svc->sched_data;
-
-       /* got to clean up hash buckets here */
-       ip_vs_sh_flush(tbl);
-
-       /* assign the hash buckets with the updated service */
-       ip_vs_sh_assign(tbl, svc);
-
-       return 0;
-}
-
-
-/*
- *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
- *      consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
-{
-       return dest->flags & IP_VS_DEST_F_OVERLOAD;
-}
-
-
-/*
- *      Source Hashing scheduling
- */
-static struct ip_vs_dest *
-ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-       struct ip_vs_dest *dest;
-       struct ip_vs_sh_bucket *tbl;
-       struct iphdr *iph = ip_hdr(skb);
-
-       IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
-
-       tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
-       dest = ip_vs_sh_get(tbl, iph->saddr);
-       if (!dest
-           || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
-           || atomic_read(&dest->weight) <= 0
-           || is_overloaded(dest)) {
-               return NULL;
-       }
-
-       IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
-                 "--> server %u.%u.%u.%u:%d\n",
-                 NIPQUAD(iph->saddr),
-                 NIPQUAD(dest->addr.ip),
-                 ntohs(dest->port));
-
-       return dest;
-}
-
-
-/*
- *      IPVS SH Scheduler structure
- */
-static struct ip_vs_scheduler ip_vs_sh_scheduler =
-{
-       .name =                 "sh",
-       .refcnt =               ATOMIC_INIT(0),
-       .module =               THIS_MODULE,
-       .n_list  =              LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
-#ifdef CONFIG_IP_VS_IPV6
-       .supports_ipv6 =        0,
-#endif
-       .init_service =         ip_vs_sh_init_svc,
-       .done_service =         ip_vs_sh_done_svc,
-       .update_service =       ip_vs_sh_update_svc,
-       .schedule =             ip_vs_sh_schedule,
-};
-
-
-static int __init ip_vs_sh_init(void)
-{
-       return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
-}
-
-
-static void __exit ip_vs_sh_cleanup(void)
-{
-       unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
-}
-
-
-module_init(ip_vs_sh_init);
-module_exit(ip_vs_sh_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c

deleted file mode 100644 (file)

index de5e7e1..0000000
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ /dev/null
@@ -1,942 +0,0 @@
-/*
- * IPVS         An implementation of the IP virtual server support for the
- *              LINUX operating system.  IPVS is now implemented as a module
- *              over the NetFilter framework. IPVS can be used to build a
- *              high-performance and highly available server based on a
- *              cluster of servers.
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- * ip_vs_sync:  sync connection info from master load balancer to backups
- *              through multicast
- *
- * Changes:
- *     Alexandre Cassen        :       Added master & backup support at a time.
- *     Alexandre Cassen        :       Added SyncID support for incoming sync
- *                                     messages filtering.
- *     Justin Ossevoort        :       Fix endian problem on sync message size.
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/inetdevice.h>
-#include <linux/net.h>
-#include <linux/completion.h>
-#include <linux/delay.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/igmp.h>                 /* for ip_mc_join_group */
-#include <linux/udp.h>
-#include <linux/err.h>
-#include <linux/kthread.h>
-#include <linux/wait.h>
-#include <linux/kernel.h>
-
-#include <net/ip.h>
-#include <net/sock.h>
-
-#include <net/ip_vs.h>
-
-#define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
-#define IP_VS_SYNC_PORT  8848          /* multicast port */
-
-
-/*
- *     IPVS sync connection entry
- */
-struct ip_vs_sync_conn {
-       __u8                    reserved;
-
-       /* Protocol, addresses and port numbers */
-       __u8                    protocol;       /* Which protocol (TCP/UDP) */
-       __be16                  cport;
-       __be16                  vport;
-       __be16                  dport;
-       __be32                  caddr;          /* client address */
-       __be32                  vaddr;          /* virtual address */
-       __be32                  daddr;          /* destination address */
-
-       /* Flags and state transition */
-       __be16                  flags;          /* status flags */
-       __be16                  state;          /* state info */
-
-       /* The sequence options start here */
-};
-
-struct ip_vs_sync_conn_options {
-       struct ip_vs_seq        in_seq;         /* incoming seq. struct */
-       struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
-};
-
-struct ip_vs_sync_thread_data {
-       struct socket *sock;
-       char *buf;
-};
-
-#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn))
-#define FULL_CONN_SIZE  \
-(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
-
-
-/*
-  The master multicasts messages to the backup load balancers in the
-  following format.
-
-       0                   1                   2                   3
-       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-      |  Count Conns  |    SyncID     |            Size               |
-      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-      |                                                               |
-      |                    IPVS Sync Connection (1)                   |
-      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-      |                            .                                  |
-      |                            .                                  |
-      |                            .                                  |
-      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-      |                                                               |
-      |                    IPVS Sync Connection (n)                   |
-      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*/
-
-#define SYNC_MESG_HEADER_LEN   4
-#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
-
-struct ip_vs_sync_mesg {
-       __u8                    nr_conns;
-       __u8                    syncid;
-       __u16                   size;
-
-       /* ip_vs_sync_conn entries start here */
-};
-
-/* the maximum length of sync (sending/receiving) message */
-static int sync_send_mesg_maxlen;
-static int sync_recv_mesg_maxlen;
-
-struct ip_vs_sync_buff {
-       struct list_head        list;
-       unsigned long           firstuse;
-
-       /* pointers for the message data */
-       struct ip_vs_sync_mesg  *mesg;
-       unsigned char           *head;
-       unsigned char           *end;
-};
-
-
-/* the sync_buff list head and the lock */
-static LIST_HEAD(ip_vs_sync_queue);
-static DEFINE_SPINLOCK(ip_vs_sync_lock);
-
-/* current sync_buff for accepting new conn entries */
-static struct ip_vs_sync_buff   *curr_sb = NULL;
-static DEFINE_SPINLOCK(curr_sb_lock);
-
-/* ipvs sync daemon state */
-volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
-volatile int ip_vs_master_syncid = 0;
-volatile int ip_vs_backup_syncid = 0;
-
-/* multicast interface name */
-char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-
-/* sync daemon tasks */
-static struct task_struct *sync_master_thread;
-static struct task_struct *sync_backup_thread;
-
-/* multicast addr */
-static struct sockaddr_in mcast_addr = {
-       .sin_family             = AF_INET,
-       .sin_port               = __constant_htons(IP_VS_SYNC_PORT),
-       .sin_addr.s_addr        = __constant_htonl(IP_VS_SYNC_GROUP),
-};
-
-
-static inline struct ip_vs_sync_buff *sb_dequeue(void)
-{
-       struct ip_vs_sync_buff *sb;
-
-       spin_lock_bh(&ip_vs_sync_lock);
-       if (list_empty(&ip_vs_sync_queue)) {
-               sb = NULL;
-       } else {
-               sb = list_entry(ip_vs_sync_queue.next,
-                               struct ip_vs_sync_buff,
-                               list);
-               list_del(&sb->list);
-       }
-       spin_unlock_bh(&ip_vs_sync_lock);
-
-       return sb;
-}
-
-static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
-{
-       struct ip_vs_sync_buff *sb;
-
-       if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
-               return NULL;
-
-       if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
-               kfree(sb);
-               return NULL;
-       }
-       sb->mesg->nr_conns = 0;
-       sb->mesg->syncid = ip_vs_master_syncid;
-       sb->mesg->size = 4;
-       sb->head = (unsigned char *)sb->mesg + 4;
-       sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
-       sb->firstuse = jiffies;
-       return sb;
-}
-
-static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
-{
-       kfree(sb->mesg);
-       kfree(sb);
-}
-
-static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
-{
-       spin_lock(&ip_vs_sync_lock);
-       if (ip_vs_sync_state & IP_VS_STATE_MASTER)
-               list_add_tail(&sb->list, &ip_vs_sync_queue);
-       else
-               ip_vs_sync_buff_release(sb);
-       spin_unlock(&ip_vs_sync_lock);
-}
-
-/*
- *     Get the current sync buffer if it has been created for more
- *     than the specified time or the specified time is zero.
- */
-static inline struct ip_vs_sync_buff *
-get_curr_sync_buff(unsigned long time)
-{
-       struct ip_vs_sync_buff *sb;
-
-       spin_lock_bh(&curr_sb_lock);
-       if (curr_sb && (time == 0 ||
-                       time_before(jiffies - curr_sb->firstuse, time))) {
-               sb = curr_sb;
-               curr_sb = NULL;
-       } else
-               sb = NULL;
-       spin_unlock_bh(&curr_sb_lock);
-       return sb;
-}
-
-
-/*
- *      Add an ip_vs_conn information into the current sync_buff.
- *      Called by ip_vs_in.
- */
-void ip_vs_sync_conn(struct ip_vs_conn *cp)
-{
-       struct ip_vs_sync_mesg *m;
-       struct ip_vs_sync_conn *s;
-       int len;
-
-       spin_lock(&curr_sb_lock);
-       if (!curr_sb) {
-               if (!(curr_sb=ip_vs_sync_buff_create())) {
-                       spin_unlock(&curr_sb_lock);
-                       IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
-                       return;
-               }
-       }
-
-       len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
-               SIMPLE_CONN_SIZE;
-       m = curr_sb->mesg;
-       s = (struct ip_vs_sync_conn *)curr_sb->head;
-
-       /* copy members */
-       s->protocol = cp->protocol;
-       s->cport = cp->cport;
-       s->vport = cp->vport;
-       s->dport = cp->dport;
-       s->caddr = cp->caddr.ip;
-       s->vaddr = cp->vaddr.ip;
-       s->daddr = cp->daddr.ip;
-       s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
-       s->state = htons(cp->state);
-       if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
-               struct ip_vs_sync_conn_options *opt =
-                       (struct ip_vs_sync_conn_options *)&s[1];
-               memcpy(opt, &cp->in_seq, sizeof(*opt));
-       }
-
-       m->nr_conns++;
-       m->size += len;
-       curr_sb->head += len;
-
-       /* check if there is a space for next one */
-       if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
-               sb_queue_tail(curr_sb);
-               curr_sb = NULL;
-       }
-       spin_unlock(&curr_sb_lock);
-
-       /* synchronize its controller if it has one */
-       if (cp->control)
-               ip_vs_sync_conn(cp->control);
-}
-
-
-/*
- *      Process received multicast message and create the corresponding
- *      ip_vs_conn entries.
- */
-static void ip_vs_process_message(const char *buffer, const size_t buflen)
-{
-       struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
-       struct ip_vs_sync_conn *s;
-       struct ip_vs_sync_conn_options *opt;
-       struct ip_vs_conn *cp;
-       struct ip_vs_protocol *pp;
-       struct ip_vs_dest *dest;
-       char *p;
-       int i;
-
-       if (buflen < sizeof(struct ip_vs_sync_mesg)) {
-               IP_VS_ERR_RL("sync message header too short\n");
-               return;
-       }
-
-       /* Convert size back to host byte order */
-       m->size = ntohs(m->size);
-
-       if (buflen != m->size) {
-               IP_VS_ERR_RL("bogus sync message size\n");
-               return;
-       }
-
-       /* SyncID sanity check */
-       if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
-               IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
-                         m->syncid);
-               return;
-       }
-
-       p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
-       for (i=0; i<m->nr_conns; i++) {
-               unsigned flags, state;
-
-               if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
-                       IP_VS_ERR_RL("bogus conn in sync message\n");
-                       return;
-               }
-               s = (struct ip_vs_sync_conn *) p;
-               flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
-               flags &= ~IP_VS_CONN_F_HASHED;
-               if (flags & IP_VS_CONN_F_SEQ_MASK) {
-                       opt = (struct ip_vs_sync_conn_options *)&s[1];
-                       p += FULL_CONN_SIZE;
-                       if (p > buffer+buflen) {
-                               IP_VS_ERR_RL("bogus conn options in sync message\n");
-                               return;
-                       }
-               } else {
-                       opt = NULL;
-                       p += SIMPLE_CONN_SIZE;
-               }
-
-               state = ntohs(s->state);
-               if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
-                       pp = ip_vs_proto_get(s->protocol);
-                       if (!pp) {
-                               IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
-                                       s->protocol);
-                               continue;
-                       }
-                       if (state >= pp->num_states) {
-                               IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
-                                       pp->name, state);
-                               continue;
-                       }
-               } else {
-                       /* protocol in templates is not used for state/timeout */
-                       pp = NULL;
-                       if (state > 0) {
-                               IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
-                                       state);
-                               state = 0;
-                       }
-               }
-
-               if (!(flags & IP_VS_CONN_F_TEMPLATE))
-                       cp = ip_vs_conn_in_get(AF_INET, s->protocol,
-                                              (union nf_inet_addr *)&s->caddr,
-                                              s->cport,
-                                              (union nf_inet_addr *)&s->vaddr,
-                                              s->vport);
-               else
-                       cp = ip_vs_ct_in_get(AF_INET, s->protocol,
-                                            (union nf_inet_addr *)&s->caddr,
-                                            s->cport,
-                                            (union nf_inet_addr *)&s->vaddr,
-                                            s->vport);
-               if (!cp) {
-                       /*
-                        * Find the appropriate destination for the connection.
-                        * If it is not found the connection will remain unbound
-                        * but still handled.
-                        */
-                       dest = ip_vs_find_dest(AF_INET,
-                                              (union nf_inet_addr *)&s->daddr,
-                                              s->dport,
-                                              (union nf_inet_addr *)&s->vaddr,
-                                              s->vport,
-                                              s->protocol);
-                       /*  Set the appropriate activity flag */
-                       if (s->protocol == IPPROTO_TCP) {
-                               if (state != IP_VS_TCP_S_ESTABLISHED)
-                                       flags |= IP_VS_CONN_F_INACTIVE;
-                               else
-                                       flags &= ~IP_VS_CONN_F_INACTIVE;
-                       }
-                       cp = ip_vs_conn_new(AF_INET, s->protocol,
-                                           (union nf_inet_addr *)&s->caddr,
-                                           s->cport,
-                                           (union nf_inet_addr *)&s->vaddr,
-                                           s->vport,
-                                           (union nf_inet_addr *)&s->daddr,
-                                           s->dport,
-                                           flags, dest);
-                       if (dest)
-                               atomic_dec(&dest->refcnt);
-                       if (!cp) {
-                               IP_VS_ERR("ip_vs_conn_new failed\n");
-                               return;
-                       }
-               } else if (!cp->dest) {
-                       dest = ip_vs_try_bind_dest(cp);
-                       if (dest)
-                               atomic_dec(&dest->refcnt);
-               } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
-                          (cp->state != state)) {
-                       /* update active/inactive flag for the connection */
-                       dest = cp->dest;
-                       if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
-                               (state != IP_VS_TCP_S_ESTABLISHED)) {
-                               atomic_dec(&dest->activeconns);
-                               atomic_inc(&dest->inactconns);
-                               cp->flags |= IP_VS_CONN_F_INACTIVE;
-                       } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
-                               (state == IP_VS_TCP_S_ESTABLISHED)) {
-                               atomic_inc(&dest->activeconns);
-                               atomic_dec(&dest->inactconns);
-                               cp->flags &= ~IP_VS_CONN_F_INACTIVE;
-                       }
-               }
-
-               if (opt)
-                       memcpy(&cp->in_seq, opt, sizeof(*opt));
-               atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
-               cp->state = state;
-               cp->old_state = cp->state;
-               /*
-                * We can not recover the right timeout for templates
-                * in all cases, we can not find the right fwmark
-                * virtual service. If needed, we can do it for
-                * non-fwmark persistent services.
-                */
-               if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
-                       cp->timeout = pp->timeout_table[state];
-               else
-                       cp->timeout = (3*60*HZ);
-               ip_vs_conn_put(cp);
-       }
-}
-
-
-/*
- *      Setup loopback of outgoing multicasts on a sending socket
- */
-static void set_mcast_loop(struct sock *sk, u_char loop)
-{
-       struct inet_sock *inet = inet_sk(sk);
-
-       /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
-       lock_sock(sk);
-       inet->mc_loop = loop ? 1 : 0;
-       release_sock(sk);
-}
-
-/*
- *      Specify TTL for outgoing multicasts on a sending socket
- */
-static void set_mcast_ttl(struct sock *sk, u_char ttl)
-{
-       struct inet_sock *inet = inet_sk(sk);
-
-       /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
-       lock_sock(sk);
-       inet->mc_ttl = ttl;
-       release_sock(sk);
-}
-
-/*
- *      Specify default interface for outgoing multicasts
- */
-static int set_mcast_if(struct sock *sk, char *ifname)
-{
-       struct net_device *dev;
-       struct inet_sock *inet = inet_sk(sk);
-
-       if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
-               return -ENODEV;
-
-       if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
-               return -EINVAL;
-
-       lock_sock(sk);
-       inet->mc_index = dev->ifindex;
-       /*  inet->mc_addr  = 0; */
-       release_sock(sk);
-
-       return 0;
-}
-
-
-/*
- *     Set the maximum length of sync message according to the
- *     specified interface's MTU.
- */
-static int set_sync_mesg_maxlen(int sync_state)
-{
-       struct net_device *dev;
-       int num;
-
-       if (sync_state == IP_VS_STATE_MASTER) {
-               if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
-                       return -ENODEV;
-
-               num = (dev->mtu - sizeof(struct iphdr) -
-                      sizeof(struct udphdr) -
-                      SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
-               sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
-                       SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
-               IP_VS_DBG(7, "setting the maximum length of sync sending "
-                         "message %d.\n", sync_send_mesg_maxlen);
-       } else if (sync_state == IP_VS_STATE_BACKUP) {
-               if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
-                       return -ENODEV;
-
-               sync_recv_mesg_maxlen = dev->mtu -
-                       sizeof(struct iphdr) - sizeof(struct udphdr);
-               IP_VS_DBG(7, "setting the maximum length of sync receiving "
-                         "message %d.\n", sync_recv_mesg_maxlen);
-       }
-
-       return 0;
-}
-
-
-/*
- *      Join a multicast group.
- *      the group is specified by a class D multicast address 224.0.0.0/4
- *      in the in_addr structure passed in as a parameter.
- */
-static int
-join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
-{
-       struct ip_mreqn mreq;
-       struct net_device *dev;
-       int ret;
-
-       memset(&mreq, 0, sizeof(mreq));
-       memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
-
-       if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
-               return -ENODEV;
-       if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
-               return -EINVAL;
-
-       mreq.imr_ifindex = dev->ifindex;
-
-       lock_sock(sk);
-       ret = ip_mc_join_group(sk, &mreq);
-       release_sock(sk);
-
-       return ret;
-}
-
-
-/*
- *      Bind @sock to the address inet_select_addr() picks for the interface
- *      named @ifname, with port 0.
- *      Returns -ENODEV if the interface is unknown, otherwise the result of
- *      the socket's bind operation.
- */
-static int bind_mcastif_addr(struct socket *sock, char *ifname)
-{
-       struct sockaddr_in local;
-       struct net_device *ifdev;
-       __be32 src;
-
-       ifdev = __dev_get_by_name(&init_net, ifname);
-       if (ifdev == NULL)
-               return -ENODEV;
-
-       src = inet_select_addr(ifdev, 0, RT_SCOPE_UNIVERSE);
-       if (src == 0)
-               IP_VS_ERR("You probably need to specify IP address on "
-                         "multicast interface.\n");
-
-       IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
-                 ifname, NIPQUAD(src));
-
-       /* a missing address is only reported above; the bind still runs */
-       local.sin_family      = AF_INET;
-       local.sin_port        = 0;
-       local.sin_addr.s_addr = src;
-
-       return sock->ops->bind(sock, (struct sockaddr *)&local, sizeof(local));
-}
-
-/*
- *      Create the UDP socket used for sending sync messages: outbound
- *      multicast interface set to ip_vs_master_mcast_ifn, multicast loop
- *      set to 0, multicast TTL set to 1, bound to that interface's address
- *      and connected to mcast_addr.
- *      Returns the socket, or an ERR_PTR() on failure.
- */
-static struct socket * make_send_sock(void)
-{
-       struct socket *sock;
-       int err;
-
-       err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
-       if (err < 0) {
-               IP_VS_ERR("Error during creation of socket; terminating\n");
-               return ERR_PTR(err);
-       }
-
-       err = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
-       if (err < 0) {
-               IP_VS_ERR("Error setting outbound mcast interface\n");
-               goto release;
-       }
-
-       set_mcast_loop(sock->sk, 0);
-       set_mcast_ttl(sock->sk, 1);
-
-       err = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
-       if (err < 0) {
-               IP_VS_ERR("Error binding address of the mcast interface\n");
-               goto release;
-       }
-
-       err = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
-                                sizeof(struct sockaddr), 0);
-       if (err < 0) {
-               IP_VS_ERR("Error connecting to the multicast addr\n");
-               goto release;
-       }
-
-       return sock;
-
-  release:
-       /* every failure after creation gives the socket back */
-       sock_release(sock);
-       return ERR_PTR(err);
-}
-
-
-/*
- *      Create the UDP socket used for receiving sync messages: address
- *      reuse enabled, bound to mcast_addr and joined to that group on
- *      ip_vs_backup_mcast_ifn.
- *      Returns the socket, or an ERR_PTR() on failure.
- */
-static struct socket * make_receive_sock(void)
-{
-       struct socket *sock;
-       int err;
-
-       err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
-       if (err < 0) {
-               IP_VS_ERR("Error during creation of socket; terminating\n");
-               return ERR_PTR(err);
-       }
-
-       /* same effect as the REUSEADDR option set from user space */
-       sock->sk->sk_reuse = 1;
-
-       err = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
-                             sizeof(struct sockaddr));
-       if (err < 0) {
-               IP_VS_ERR("Error binding to the multicast addr\n");
-               goto release;
-       }
-
-       /* subscribe to the group on the backup interface */
-       err = join_mcast_group(sock->sk,
-                              (struct in_addr *) &mcast_addr.sin_addr,
-                              ip_vs_backup_mcast_ifn);
-       if (err < 0) {
-               IP_VS_ERR("Error joining to the multicast group\n");
-               goto release;
-       }
-
-       return sock;
-
-  release:
-       sock_release(sock);
-       return ERR_PTR(err);
-}
-
-
-/*
- *      Send @length bytes from @buffer on @sock with MSG_DONTWAIT and
- *      MSG_NOSIGNAL set.  Returns whatever kernel_sendmsg() returns.
- */
-static int
-ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
-{
-       struct msghdr   msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
-       struct kvec     vec = {
-               .iov_base = (void *)buffer,
-               .iov_len  = length,
-       };
-       int             sent;
-
-       EnterFunction(7);
-       sent = kernel_sendmsg(sock, &msg, &vec, 1, (size_t)(length));
-       LeaveFunction(7);
-
-       return sent;
-}
-
-/*
- *      Send one sync message on @sock.  msg->size is converted to network
- *      byte order in place before sending; a short or failed send is only
- *      logged.
- */
-static void
-ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
-{
-       /* keep the host-order length: it is the byte count to send */
-       const int host_len = msg->size;
-
-       msg->size = htons(msg->size);
-
-       if (ip_vs_send_async(sock, (char *)msg, host_len) == host_len)
-               return;
-
-       IP_VS_ERR("ip_vs_send_async error\n");
-}
-
-/*
- *      Receive one message from @sock into @buffer (at most @buflen bytes).
- *      Returns the number of bytes received, or -1 if kernel_recvmsg()
- *      reported an error.
- */
-static int
-ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
-{
-       struct msghdr           msg = {NULL,};
-       struct kvec             iov;
-       int                     len;
-
-       EnterFunction(7);
-
-       /* Receive a packet */
-       iov.iov_base     = buffer;
-       iov.iov_len      = (size_t)buflen;
-
-       len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
-
-       /* fold every error to -1 here instead of returning early, so the
-          LeaveFunction() trace is emitted on the failure path as well */
-       if (len < 0)
-               len = -1;
-
-       LeaveFunction(7);
-       return len;
-}
-
-
-/*
- *      Kernel thread body for the MASTER role.
- *      @data is the struct ip_vs_sync_thread_data handed over by the
- *      starter; this thread releases its socket and frees it on exit.
- *      Loops until kthread_should_stop(), sending queued sync buffers on
- *      tinfo->sock, and always returns 0.
- */
-static int sync_thread_master(void *data)
-{
-       struct ip_vs_sync_thread_data *tinfo = data;
-       struct ip_vs_sync_buff *sb;
-
-       IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
-                  "syncid = %d\n",
-                  ip_vs_master_mcast_ifn, ip_vs_master_syncid);
-
-       while (!kthread_should_stop()) {
-               /* send everything that is queued right now */
-               while ((sb = sb_dequeue())) {
-                       ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
-                       ip_vs_sync_buff_release(sb);
-               }
-
-               /* check if entries stay in curr_sb for 2 seconds */
-               sb = get_curr_sync_buff(2 * HZ);
-               if (sb) {
-                       ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
-                       ip_vs_sync_buff_release(sb);
-               }
-
-               /* sleep for up to HZ jiffies before the next pass */
-               schedule_timeout_interruptible(HZ);
-       }
-
-       /* clean up the sync_buff queue */
-       while ((sb=sb_dequeue())) {
-               ip_vs_sync_buff_release(sb);
-       }
-
-       /* clean up the current sync_buff */
-       if ((sb = get_curr_sync_buff(0))) {
-               ip_vs_sync_buff_release(sb);
-       }
-
-       /* release the sending multicast socket */
-       sock_release(tinfo->sock);
-       kfree(tinfo);
-
-       return 0;
-}
-
-
-/*
- *      Kernel thread body for the BACKUP role.
- *      @data is the struct ip_vs_sync_thread_data handed over by the
- *      starter; this thread releases its socket and frees both the receive
- *      buffer and the structure on exit.
- *      Loops until kthread_should_stop(), feeding every received message
- *      to ip_vs_process_message(), and always returns 0.
- */
-static int sync_thread_backup(void *data)
-{
-       struct ip_vs_sync_thread_data *tinfo = data;
-       int len;
-
-       IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
-                  "syncid = %d\n",
-                  ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
-
-       while (!kthread_should_stop()) {
-               /* sleep until the receive queue is non-empty or a stop
-                  has been requested */
-               wait_event_interruptible(*tinfo->sock->sk->sk_sleep,
-                        !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
-                        || kthread_should_stop());
-
-               /* do we have data now? */
-               while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
-                       len = ip_vs_receive(tinfo->sock, tinfo->buf,
-                                       sync_recv_mesg_maxlen);
-                       if (len <= 0) {
-                               IP_VS_ERR("receiving message error\n");
-                               break;
-                       }
-
-                       /* disable bottom half, because it accesses the data
-                          shared by softirq while getting/creating conns */
-                       local_bh_disable();
-                       ip_vs_process_message(tinfo->buf, len);
-                       local_bh_enable();
-               }
-       }
-
-       /* release the receiving multicast socket */
-       sock_release(tinfo->sock);
-       kfree(tinfo->buf);
-       kfree(tinfo);
-
-       return 0;
-}
-
-
-/*
- *      Start the sync daemon thread for @state (IP_VS_STATE_MASTER or
- *      IP_VS_STATE_BACKUP) on the interface named @mcast_ifn with @syncid.
- *
- *      Returns 0 on success, -EEXIST if that role's thread is already
- *      recorded, -EINVAL for any other @state, -ENOMEM when an allocation
- *      fails, or the error from socket setup, from the message size
- *      computation, or from kthread_run().
- */
-int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
-{
-       struct ip_vs_sync_thread_data *tinfo;
-       struct task_struct **realtask, *task;
-       struct socket *sock;
-       char *name, *buf = NULL;
-       int (*threadfn)(void *data);
-       int result = -ENOMEM;
-
-       IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
-       IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
-                 sizeof(struct ip_vs_sync_conn));
-
-       if (state == IP_VS_STATE_MASTER) {
-               if (sync_master_thread)
-                       return -EEXIST;
-
-               strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
-                       sizeof(ip_vs_master_mcast_ifn));
-               ip_vs_master_syncid = syncid;
-               realtask = &sync_master_thread;
-               name = "ipvs_syncmaster";
-               threadfn = sync_thread_master;
-               sock = make_send_sock();
-       } else if (state == IP_VS_STATE_BACKUP) {
-               if (sync_backup_thread)
-                       return -EEXIST;
-
-               strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
-                       sizeof(ip_vs_backup_mcast_ifn));
-               ip_vs_backup_syncid = syncid;
-               realtask = &sync_backup_thread;
-               name = "ipvs_syncbackup";
-               threadfn = sync_thread_backup;
-               sock = make_receive_sock();
-       } else {
-               return -EINVAL;
-       }
-
-       if (IS_ERR(sock)) {
-               result = PTR_ERR(sock);
-               goto out;
-       }
-
-       /* do not go on with a stale size limit if the interface can no
-          longer be resolved */
-       result = set_sync_mesg_maxlen(state);
-       if (result < 0)
-               goto outsocket;
-
-       /* the allocation failures below all report -ENOMEM */
-       result = -ENOMEM;
-
-       if (state == IP_VS_STATE_BACKUP) {
-               buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
-               if (!buf)
-                       goto outsocket;
-       }
-
-       tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
-       if (!tinfo)
-               goto outbuf;
-
-       /* once the thread runs it releases sock and frees buf and tinfo */
-       tinfo->sock = sock;
-       tinfo->buf = buf;
-
-       task = kthread_run(threadfn, tinfo, name);
-       if (IS_ERR(task)) {
-               result = PTR_ERR(task);
-               goto outtinfo;
-       }
-
-       /* mark as active */
-       *realtask = task;
-       ip_vs_sync_state |= state;
-
-       /* increase the module use count */
-       ip_vs_use_count_inc();
-
-       return 0;
-
-outtinfo:
-       kfree(tinfo);
-outbuf:
-       kfree(buf);
-outsocket:
-       sock_release(sock);
-out:
-       return result;
-}
-
-
-/*
- *      Stop the sync daemon thread for @state.
- *      Returns 0 on success, -ESRCH if no thread is recorded for that role,
- *      or -EINVAL if @state is neither MASTER nor BACKUP.
- */
-int stop_sync_thread(int state)
-{
-       IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
-
-       switch (state) {
-       case IP_VS_STATE_MASTER:
-               if (sync_master_thread == NULL)
-                       return -ESRCH;
-
-               IP_VS_INFO("stopping master sync thread %d ...\n",
-                          task_pid_nr(sync_master_thread));
-
-               /*
-                * Clearing the flag under the lock synchronizes with
-                * sb_queue_tail(), so that no sync buffers are added to
-                * the queue while the master sync daemon is being stopped.
-                */
-               spin_lock_bh(&ip_vs_sync_lock);
-               ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
-               spin_unlock_bh(&ip_vs_sync_lock);
-
-               kthread_stop(sync_master_thread);
-               sync_master_thread = NULL;
-               break;
-
-       case IP_VS_STATE_BACKUP:
-               if (sync_backup_thread == NULL)
-                       return -ESRCH;
-
-               IP_VS_INFO("stopping backup sync thread %d ...\n",
-                          task_pid_nr(sync_backup_thread));
-
-               ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
-               kthread_stop(sync_backup_thread);
-               sync_backup_thread = NULL;
-               break;
-
-       default:
-               return -EINVAL;
-       }
-
-       /* drop the module use count taken when the thread was started */
-       ip_vs_use_count_dec();
-
-       return 0;
-}
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c

deleted file mode 100644 (file)

index 8c596e7..0000000
--- a/net/ipv4/ipvs/ip_vs_wlc.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * IPVS:        Weighted Least-Connection Scheduling module
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Peter Kese <peter.kese@ijs.si>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *     Wensong Zhang            :     changed the ip_vs_wlc_schedule to return dest
- *     Wensong Zhang            :     changed to use the inactconns in scheduling
- *     Wensong Zhang            :     changed some cosmetic things for debugging
- *     Wensong Zhang            :     changed for the d-linked destination list
- *     Wensong Zhang            :     added the ip_vs_wlc_update_svc
- *     Wensong Zhang            :     added any dest with weight=0 is quiesced
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- *     Estimate the load of @dest.  An active connection is taken to cost
- *     256 times as much as an inactive one (a rough factor, not a measured
- *     one), giving:
- *                dest->activeconns*256 + dest->inactconns
- */
-static inline unsigned int
-ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
-{
-       int active = atomic_read(&dest->activeconns);
-       int inactive = atomic_read(&dest->inactconns);
-
-       return (active << 8) + inactive;
-}
-
-
-/*
- *     Weighted Least Connection scheduling
- *
- *     Returns the destination of @svc with the lowest overhead/weight
- *     ratio among those not flagged IP_VS_DEST_F_OVERLOAD, or NULL when
- *     no non-overloaded destination has a positive weight.
- */
-static struct ip_vs_dest *
-ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-       struct ip_vs_dest *dest, *least = NULL;
-       unsigned int loh = 0, doh;
-
-       IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
-
-       /*
-        * The load of a server is (dest overhead) / dest->weight.
-        *
-        * No floats in kernel mode, so h1/w1 > h2/w2 is evaluated as
-        * h1*w2 > h2*w1, which is equivalent as long as every weight is
-        * larger than zero.
-        *
-        * A server with weight=0 is quiesced and will not receive any
-        * new connections.
-        */
-       list_for_each_entry(dest, &svc->destinations, n_list) {
-               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
-                       continue;
-
-               if (least == NULL) {
-                       /* the first server with a positive weight seeds
-                          the comparison */
-                       if (atomic_read(&dest->weight) > 0) {
-                               least = dest;
-                               loh = ip_vs_wlc_dest_overhead(least);
-                       }
-                       continue;
-               }
-
-               /* keep whichever of the two has the smaller load */
-               doh = ip_vs_wlc_dest_overhead(dest);
-               if (loh * atomic_read(&dest->weight) >
-                   doh * atomic_read(&least->weight)) {
-                       least = dest;
-                       loh = doh;
-               }
-       }
-
-       if (least == NULL)
-               return NULL;
-
-       IP_VS_DBG_BUF(6, "WLC: server %s:%u "
-                     "activeconns %d refcnt %d weight %d overhead %d\n",
-                     IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
-                     atomic_read(&least->activeconns),
-                     atomic_read(&least->refcnt),
-                     atomic_read(&least->weight), loh);
-
-       return least;
-}
-
-
-/* Scheduler descriptor for "wlc"; only the schedule hook is provided. */
-static struct ip_vs_scheduler ip_vs_wlc_scheduler = {
-       .name           = "wlc",
-       .schedule       = ip_vs_wlc_schedule,
-#ifdef CONFIG_IP_VS_IPV6
-       .supports_ipv6  = 1,
-#endif
-       .module         = THIS_MODULE,
-       .refcnt         = ATOMIC_INIT(0),
-       .n_list         = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
-};
-
-
-/* Module load: register the "wlc" scheduler and pass its result back. */
-static int __init ip_vs_wlc_init(void)
-{
-       int ret;
-
-       ret = register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
-       return ret;
-}
-
-/* Module unload: unregister the "wlc" scheduler. */
-static void __exit ip_vs_wlc_cleanup(void)
-{
-       unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
-}
-
-module_init(ip_vs_wlc_init);
-module_exit(ip_vs_wlc_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c

deleted file mode 100644 (file)

index 7ea92fe..0000000
--- a/net/ipv4/ipvs/ip_vs_wrr.c
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * IPVS:        Weighted Round-Robin Scheduling module
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *     Wensong Zhang            :     changed the ip_vs_wrr_schedule to return dest
- *     Wensong Zhang            :     changed some cosmetic things for debugging
- *     Wensong Zhang            :     changed for the d-linked destination list
- *     Wensong Zhang            :     added the ip_vs_wrr_update_svc
- *     Julian Anastasov         :     fixed the bug of returning destination
- *                                    with weight 0 when all weights are zero
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/net.h>
-
-#include <net/ip_vs.h>
-
-/*
- * current destination pointer for weighted round-robin scheduling
- *
- * One instance is kept per service in svc->sched_data.
- */
-struct ip_vs_wrr_mark {
-       struct list_head *cl;   /* current list head */
-       int cw;                 /* current weight */
-       int mw;                 /* maximum weight */
-       int di;                 /* decreasing interval */
-};
-
-
-/*
- *    Greatest common divisor of @a and @b (Euclid's algorithm), used on
- *    server weights.  @b must not be zero.
- */
-static int gcd(int a, int b)
-{
-       int rem;
-
-       for (rem = a % b; rem != 0; rem = a % b) {
-               a = b;
-               b = rem;
-       }
-       return b;
-}
-
-/*
- *    Get the gcd of all positive destination weights of @svc.
- *    Returns 1 when no destination has a positive weight.
- */
-static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
-{
-       struct ip_vs_dest *dest;
-       int divisor = 0;
-       int w;
-
-       list_for_each_entry(dest, &svc->destinations, n_list) {
-               w = atomic_read(&dest->weight);
-               if (w <= 0)
-                       continue;
-
-               /* the first positive weight starts the running gcd */
-               divisor = (divisor > 0) ? gcd(w, divisor) : w;
-       }
-
-       if (divisor == 0)
-               return 1;
-       return divisor;
-}
-
-
-/*
- *    Get the maximum weight of the service destinations.
- *    Returns 0 when the list is empty or no weight is positive.
- */
-static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
-{
-       struct ip_vs_dest *dest;
-       int weight;
-       int max_weight = 0;
-
-       list_for_each_entry(dest, &svc->destinations, n_list) {
-               /* read the atomic once, so the value compared is the value
-                  stored even if the weight changes in between */
-               weight = atomic_read(&dest->weight);
-               if (weight > max_weight)
-                       max_weight = weight;
-       }
-
-       return max_weight;
-}
-
-
-/*
- *    Allocate and initialise the WRR mark for @svc and store it in
- *    svc->sched_data.  Returns 0, or -ENOMEM if the allocation fails.
- */
-static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
-{
-       struct ip_vs_wrr_mark *mark;
-
-       mark = kmalloc(sizeof(*mark), GFP_ATOMIC);
-       if (!mark) {
-               IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
-               return -ENOMEM;
-       }
-
-       /* start at the list head with no current weight selected yet */
-       mark->cw = 0;
-       mark->cl = &svc->destinations;
-       mark->di = ip_vs_wrr_gcd_weight(svc);
-       mark->mw = ip_vs_wrr_max_weight(svc);
-
-       svc->sched_data = mark;
-       return 0;
-}
-
-
-/*
- *    Free the WRR mark stored in svc->sched_data.  Always returns 0.
- */
-static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
-{
-       struct ip_vs_wrr_mark *mark = svc->sched_data;
-
-       kfree(mark);
-
-       return 0;
-}
-
-
-/*
- *    Recompute the WRR mark of @svc from its current destination list:
- *    the cursor goes back to the list head, and the current weight is
- *    reset if it now exceeds the maximum weight.  Always returns 0.
- */
-static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
-{
-       struct ip_vs_wrr_mark *mark = svc->sched_data;
-       const int max_w = ip_vs_wrr_max_weight(svc);
-       const int step = ip_vs_wrr_gcd_weight(svc);
-
-       mark->cl = &svc->destinations;
-       mark->mw = max_w;
-       mark->di = step;
-
-       if (mark->cw > max_w)
-               mark->cw = 0;
-
-       return 0;
-}
-
-
-/*
- *    Weighted Round-Robin Scheduling
- *
- *    Advances the per-service cursor (mark->cl) through svc->destinations
- *    while holding svc->sched_lock for writing.  Each time the cursor is
- *    at the list head, mark->cw is lowered by mark->di and reset to
- *    mark->mw once it drops to zero or below.
- *
- *    Returns the first destination reached that is not flagged
- *    IP_VS_DEST_F_OVERLOAD and whose weight is >= mark->cw, or NULL when
- *    the list is empty, when the maximum weight is 0, or when the walk is
- *    back at its starting point with mark->cw == mark->di and nothing
- *    matched.
- */
-static struct ip_vs_dest *
-ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
-{
-       struct ip_vs_dest *dest;
-       struct ip_vs_wrr_mark *mark = svc->sched_data;
-       struct list_head *p;
-
-       IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
-
-       /*
-        * This loop will always terminate, because mark->cw in (0, max_weight]
-        * and at least one server has its weight equal to max_weight.
-        */
-       write_lock(&svc->sched_lock);
-       /* remember where this walk started, to detect a fruitless cycle */
-       p = mark->cl;
-       while (1) {
-               if (mark->cl == &svc->destinations) {
-                       /* it is at the head of the destination list */
-
-                       if (mark->cl == mark->cl->next) {
-                               /* no dest entry */
-                               dest = NULL;
-                               goto out;
-                       }
-
-                       /* step to the first entry and lower the current
-                          weight by the decreasing interval */
-                       mark->cl = svc->destinations.next;
-                       mark->cw -= mark->di;
-                       if (mark->cw <= 0) {
-                               mark->cw = mark->mw;
-                               /*
-                                * Still zero, which means no available servers.
-                                */
-                               if (mark->cw == 0) {
-                                       mark->cl = &svc->destinations;
-                                       IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
-                                                  "no available servers\n");
-                                       dest = NULL;
-                                       goto out;
-                               }
-                       }
-               } else
-                       mark->cl = mark->cl->next;
-
-               if (mark->cl != &svc->destinations) {
-                       /* not at the head of the list */
-                       dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
-                       if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
-                           atomic_read(&dest->weight) >= mark->cw) {
-                               /* got it */
-                               break;
-                       }
-               }
-
-               if (mark->cl == p && mark->cw == mark->di) {
-                       /* back to the start, and no dest is found.
-                          It is only possible when all dests are OVERLOADED */
-                       dest = NULL;
-                       goto out;
-               }
-       }
-
-       IP_VS_DBG_BUF(6, "WRR: server %s:%u "
-                     "activeconns %d refcnt %d weight %d\n",
-                     IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
-                     atomic_read(&dest->activeconns),
-                     atomic_read(&dest->refcnt),
-                     atomic_read(&dest->weight));
-
-  out:
-       write_unlock(&svc->sched_lock);
-       return dest;
-}
-
-
-/*
- * Scheduler descriptor for "wrr"; it provides the init/done/update
- * service hooks in addition to the schedule hook.
- */
-static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
-       .name           = "wrr",
-       .schedule       = ip_vs_wrr_schedule,
-       .init_service   = ip_vs_wrr_init_svc,
-       .update_service = ip_vs_wrr_update_svc,
-       .done_service   = ip_vs_wrr_done_svc,
-#ifdef CONFIG_IP_VS_IPV6
-       .supports_ipv6  = 1,
-#endif
-       .module         = THIS_MODULE,
-       .refcnt         = ATOMIC_INIT(0),
-       .n_list         = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
-};
-
-/* Module load: register the "wrr" scheduler and pass its result back. */
-static int __init ip_vs_wrr_init(void)
-{
-       int ret;
-
-       ret = register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
-       return ret;
-}
-
-/* Module unload: unregister the "wrr" scheduler. */
-static void __exit ip_vs_wrr_cleanup(void)
-{
-       unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
-}
-
-module_init(ip_vs_wrr_init);
-module_exit(ip_vs_wrr_cleanup);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c

deleted file mode 100644 (file)

index 02ddc2b..0000000
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ /dev/null
@@ -1,1004 +0,0 @@
-/*
- * ip_vs_xmit.c: various packet transmitters for IPVS
- *
- * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
- *              Julian Anastasov <ja@ssi.bg>
- *
- *              This program is free software; you can redistribute it and/or
- *              modify it under the terms of the GNU General Public License
- *              as published by the Free Software Foundation; either version
- *              2 of the License, or (at your option) any later version.
- *
- * Changes:
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/tcp.h>                  /* for tcphdr */
-#include <net/ip.h>
-#include <net/tcp.h>                    /* for csum_tcpudp_magic */
-#include <net/udp.h>
-#include <net/icmp.h>                   /* for icmp_send */
-#include <net/route.h>                  /* for ip_route_output */
-#include <net/ipv6.h>
-#include <net/ip6_route.h>
-#include <linux/icmpv6.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-/*
- *      Destination cache to speed up outgoing route lookup
- */
-/*
- * Store @dst and @rtos as the cached route of @dest and release the entry
- * that was cached before.
- */
-static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
-{
-       struct dst_entry *prev = dest->dst_cache;
-
-       dest->dst_rtos = rtos;
-       dest->dst_cache = dst;
-
-       /* the previous entry is released only after it has been replaced */
-       dst_release(prev);
-}
-
-/*
- * Return the cached route of @dest with a reference held, or NULL if
- * nothing is cached or the cached entry was dropped here.
- *
- * An entry is re-validated through dst->ops->check() when it is marked
- * obsolete, or when @dest is AF_INET and @rtos differs from the cached
- * dst_rtos; it is dropped if that check returns NULL.
- */
-static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
-{
-       struct dst_entry *cached = dest->dst_cache;
-       int suspect;
-
-       if (cached == NULL)
-               return NULL;
-
-       suspect = cached->obsolete ||
-                 (dest->af == AF_INET && rtos != dest->dst_rtos);
-       if (suspect && cached->ops->check(cached, cookie) == NULL) {
-               dest->dst_cache = NULL;
-               dst_release(cached);
-               return NULL;
-       }
-
-       dst_hold(cached);
-       return cached;
-}
-
-/*
- * Get the IPv4 route for sending packets of connection @cp with TOS @rtos.
- *
- * When @cp has a destination, its cached route is used if
- * __ip_vs_dst_check() accepts it; otherwise a fresh lookup towards
- * dest->addr.ip is done and a clone of the result is stored in the cache,
- * all under dest->dst_lock.  Without a destination the lookup goes to
- * cp->daddr.ip and nothing is cached.
- *
- * Returns the route, or NULL if the route lookup fails.
- */
-static struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
-{
-       struct rtable *rt;                      /* Route to the other host */
-       struct ip_vs_dest *dest = cp->dest;
-
-       if (dest) {
-               spin_lock(&dest->dst_lock);
-               if (!(rt = (struct rtable *)
-                     __ip_vs_dst_check(dest, rtos, 0))) {
-                       /* cache miss: look the route up again */
-                       struct flowi fl = {
-                               .oif = 0,
-                               .nl_u = {
-                                       .ip4_u = {
-                                               .daddr = dest->addr.ip,
-                                               .saddr = 0,
-                                               .tos = rtos, } },
-                       };
-
-                       if (ip_route_output_key(&init_net, &rt, &fl)) {
-                               spin_unlock(&dest->dst_lock);
-                               IP_VS_DBG_RL("ip_route_output error, "
-                                            "dest: %u.%u.%u.%u\n",
-                                            NIPQUAD(dest->addr.ip));
-                               return NULL;
-                       }
-                       /* the cache gets the dst_clone() result; rt itself
-                          is what this function returns */
-                       __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
-                       IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
-                                 NIPQUAD(dest->addr.ip),
-                                 atomic_read(&rt->u.dst.__refcnt), rtos);
-               }
-               spin_unlock(&dest->dst_lock);
-       } else {
-               /* no destination attached: uncached lookup by cp->daddr */
-               struct flowi fl = {
-                       .oif = 0,
-                       .nl_u = {
-                               .ip4_u = {
-                                       .daddr = cp->daddr.ip,
-                                       .saddr = 0,
-                                       .tos = rtos, } },
-               };
-
-               if (ip_route_output_key(&init_net, &rt, &fl)) {
-                       IP_VS_DBG_RL("ip_route_output error, dest: "
-                                    "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip));
-                       return NULL;
-               }
-       }
-
-       return rt;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-/*
- * IPv6 counterpart of __ip_vs_get_out_rt(): get the route for sending
- * packets of connection @cp.
- *
- * When @cp has a destination, its cached route is used if
- * __ip_vs_dst_check() accepts it (rtos and cookie are passed as 0);
- * otherwise ip6_route_output() is asked for dest->addr.in6 and a clone of
- * the result is stored in the cache, all under dest->dst_lock.  Without a
- * destination the lookup goes to cp->daddr.in6 and nothing is cached.
- *
- * Returns the route, or NULL if ip6_route_output() returns NULL.
- */
-static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
-{
-       struct rt6_info *rt;                    /* Route to the other host */
-       struct ip_vs_dest *dest = cp->dest;
-
-       if (dest) {
-               spin_lock(&dest->dst_lock);
-               rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
-               if (!rt) {
-                       /* cache miss: look the route up again */
-                       struct flowi fl = {
-                               .oif = 0,
-                               .nl_u = {
-                                       .ip6_u = {
-                                               .daddr = dest->addr.in6,
-                                               .saddr = {
-                                                       .s6_addr32 =
-                                                               { 0, 0, 0, 0 },
-                                               },
-                                       },
-                               },
-                       };
-
-                       rt = (struct rt6_info *)ip6_route_output(&init_net,
-                                                                NULL, &fl);
-                       if (!rt) {
-                               spin_unlock(&dest->dst_lock);
-                               IP_VS_DBG_RL("ip6_route_output error, "
-                                            "dest: " NIP6_FMT "\n",
-                                            NIP6(dest->addr.in6));
-                               return NULL;
-                       }
-                       /* the cache gets the dst_clone() result; rt itself
-                          is what this function returns */
-                       __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst));
-                       IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n",
-                                 NIP6(dest->addr.in6),
-                                 atomic_read(&rt->u.dst.__refcnt));
-               }
-               spin_unlock(&dest->dst_lock);
-       } else {
-               /* no destination attached: uncached lookup by cp->daddr */
-               struct flowi fl = {
-                       .oif = 0,
-                       .nl_u = {
-                               .ip6_u = {
-                                       .daddr = cp->daddr.in6,
-                                       .saddr = {
-                                               .s6_addr32 = { 0, 0, 0, 0 },
-                                       },
-                               },
-                       },
-               };
-
-               rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-               if (!rt) {
-                       IP_VS_DBG_RL("ip6_route_output error, dest: "
-                                    NIP6_FMT "\n", NIP6(cp->daddr.in6));
-                       return NULL;
-               }
-       }
-
-       return rt;
-}
-#endif
-
-
-/*
- *     Release dest->dst_cache before a dest is removed: the cache pointer
- *     is cleared first, then the old entry is released.
- */
-void
-ip_vs_dst_reset(struct ip_vs_dest *dest)
-{
-       struct dst_entry *prev = dest->dst_cache;
-
-       dest->dst_cache = NULL;
-       dst_release(prev);
-}
-
-/*
- * Hand @skb to the stack for transmission over route @rt: set its
- * ipvs_property flag, call skb_forward_csum(), then pass it through the
- * NF_INET_LOCAL_OUT hook of protocol family @pf with the route's device
- * as output device and dst_output() as the continuation.
- */
-#define IP_VS_XMIT(pf, skb, rt)                                \
-do {                                                   \
-       (skb)->ipvs_property = 1;                       \
-       skb_forward_csum(skb);                          \
-       NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,     \
-               (rt)->u.dst.dev, dst_output);           \
-} while (0)
-
-
-/*
- *      NULL transmitter: leaves the packet untouched and returns NF_ACCEPT.
- *      @skb, @cp and @pp are all unused.
- */
-int
-ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-               struct ip_vs_protocol *pp)
-{
-       /* skb is not modified, so no pskb pointer is needed */
-       return NF_ACCEPT;
-}
-
-
-/*
- *      Bypass transmitter
- *      Let packets bypass the destination when the destination is not
- *      available, it may be only used in transparent cache cluster.
- */
-int
-ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-                 struct ip_vs_protocol *pp)
-{
-       struct rtable *rt;                      /* Route to the other host */
-       struct iphdr  *iph = ip_hdr(skb);
-       u8     tos = iph->tos;
-       int    mtu;
-       struct flowi fl = {
-               .oif = 0,
-               .nl_u = {
-                       .ip4_u = {
-                               .daddr = iph->daddr,
-                               .saddr = 0,
-                               .tos = RT_TOS(tos), } },
-       };
-
-       EnterFunction(10);
-
-       if (ip_route_output_key(&init_net, &rt, &fl)) {
-               IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
-                            "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
-               goto tx_error_icmp;
-       }
-
-       /* MTU checking */
-       mtu = dst_mtu(&rt->u.dst);
-       if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
-               ip_rt_put(rt);
-               icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-               IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
-               goto tx_error;
-       }
-
-       /*
-        * Call ip_send_check because we are not sure it is called
-        * after ip_defrag. Is copy-on-write needed?
-        */
-       if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
-               ip_rt_put(rt);
-               return NF_STOLEN;
-       }
-       ip_send_check(ip_hdr(skb));
-
-       /* drop old route */
-       dst_release(skb->dst);
-       skb->dst = &rt->u.dst;
-
-       /* Another hack: avoid icmp_send in ip_fragment */
-       skb->local_df = 1;
-
-       IP_VS_XMIT(PF_INET, skb, rt);
-
-       LeaveFunction(10);
-       return NF_STOLEN;
-
- tx_error_icmp:
-       dst_link_failure(skb);
- tx_error:
-       kfree_skb(skb);
-       LeaveFunction(10);
-       return NF_STOLEN;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-int
-ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-                    struct ip_vs_protocol *pp)
-{
-       struct rt6_info *rt;                    /* Route to the other host */
-       struct ipv6hdr  *iph = ipv6_hdr(skb);
-       int    mtu;
-       struct flowi fl = {
-               .oif = 0,
-               .nl_u = {
-                       .ip6_u = {
-                               .daddr = iph->daddr,
-                               .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
-       };
-
-       EnterFunction(10);
-
-       rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-       if (!rt) {
-               IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, "
-                            "dest: " NIP6_FMT "\n", NIP6(iph->daddr));
-               goto tx_error_icmp;
-       }
-
-       /* MTU checking */
-       mtu = dst_mtu(&rt->u.dst);
-       if (skb->len > mtu) {
-               dst_release(&rt->u.dst);
-               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-               IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n");
-               goto tx_error;
-       }
-
-       /*
-        * Call ip_send_check because we are not sure it is called
-        * after ip_defrag. Is copy-on-write needed?
-        */
-       skb = skb_share_check(skb, GFP_ATOMIC);
-       if (unlikely(skb == NULL)) {
-               dst_release(&rt->u.dst);
-               return NF_STOLEN;
-       }
-
-       /* drop old route */
-       dst_release(skb->dst);
-       skb->dst = &rt->u.dst;
-
-       /* Another hack: avoid icmp_send in ip_fragment */
-       skb->local_df = 1;
-
-       IP_VS_XMIT(PF_INET6, skb, rt);
-
-       LeaveFunction(10);
-       return NF_STOLEN;
-
- tx_error_icmp:
-       dst_link_failure(skb);
- tx_error:
-       kfree_skb(skb);
-       LeaveFunction(10);
-       return NF_STOLEN;
-}
-#endif
-
-/*
- *      NAT transmitter (only for outside-to-inside nat forwarding)
- *      Not used for related ICMP
- */
-int
-ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-              struct ip_vs_protocol *pp)
-{
-       struct rtable *rt;              /* Route to the other host */
-       int mtu;
-       struct iphdr *iph = ip_hdr(skb);
-
-       EnterFunction(10);
-
-       /* check if it is a connection of no-client-port */
-       if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
-               __be16 _pt, *p;
-               p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
-               if (p == NULL)
-                       goto tx_error;
-               ip_vs_conn_fill_cport(cp, *p);
-               IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
-       }
-
-       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
-               goto tx_error_icmp;
-
-       /* MTU checking */
-       mtu = dst_mtu(&rt->u.dst);
-       if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
-               ip_rt_put(rt);
-               icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-               IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
-               goto tx_error;
-       }
-
-       /* copy-on-write the packet before mangling it */
-       if (!skb_make_writable(skb, sizeof(struct iphdr)))
-               goto tx_error_put;
-
-       if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
-               goto tx_error_put;
-
-       /* drop old route */
-       dst_release(skb->dst);
-       skb->dst = &rt->u.dst;
-
-       /* mangle the packet */
-       if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
-               goto tx_error;
-       ip_hdr(skb)->daddr = cp->daddr.ip;
-       ip_send_check(ip_hdr(skb));
-
-       IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
-
-       /* FIXME: when application helper enlarges the packet and the length
-          is larger than the MTU of outgoing device, there will be still
-          MTU problem. */
-
-       /* Another hack: avoid icmp_send in ip_fragment */
-       skb->local_df = 1;
-
-       IP_VS_XMIT(PF_INET, skb, rt);
-
-       LeaveFunction(10);
-       return NF_STOLEN;
-
-  tx_error_icmp:
-       dst_link_failure(skb);
-  tx_error:
-       LeaveFunction(10);
-       kfree_skb(skb);
-       return NF_STOLEN;
-  tx_error_put:
-       ip_rt_put(rt);
-       goto tx_error;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-int
-ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-                 struct ip_vs_protocol *pp)
-{
-       struct rt6_info *rt;            /* Route to the other host */
-       int mtu;
-
-       EnterFunction(10);
-
-       /* check if it is a connection of no-client-port */
-       if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
-               __be16 _pt, *p;
-               p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
-                                      sizeof(_pt), &_pt);
-               if (p == NULL)
-                       goto tx_error;
-               ip_vs_conn_fill_cport(cp, *p);
-               IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
-       }
-
-       rt = __ip_vs_get_out_rt_v6(cp);
-       if (!rt)
-               goto tx_error_icmp;
-
-       /* MTU checking */
-       mtu = dst_mtu(&rt->u.dst);
-       if (skb->len > mtu) {
-               dst_release(&rt->u.dst);
-               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
-                                "ip_vs_nat_xmit_v6(): frag needed for");
-               goto tx_error;
-       }
-
-       /* copy-on-write the packet before mangling it */
-       if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
-               goto tx_error_put;
-
-       if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
-               goto tx_error_put;
-
-       /* drop old route */
-       dst_release(skb->dst);
-       skb->dst = &rt->u.dst;
-
-       /* mangle the packet */
-       if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
-               goto tx_error;
-       ipv6_hdr(skb)->daddr = cp->daddr.in6;
-
-       IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
-
-       /* FIXME: when application helper enlarges the packet and the length
-          is larger than the MTU of outgoing device, there will be still
-          MTU problem. */
-
-       /* Another hack: avoid icmp_send in ip_fragment */
-       skb->local_df = 1;
-
-       IP_VS_XMIT(PF_INET6, skb, rt);
-
-       LeaveFunction(10);
-       return NF_STOLEN;
-
-tx_error_icmp:
-       dst_link_failure(skb);
-tx_error:
-       LeaveFunction(10);
-       kfree_skb(skb);
-       return NF_STOLEN;
-tx_error_put:
-       dst_release(&rt->u.dst);
-       goto tx_error;
-}
-#endif
-
-
-/*
- *   IP Tunneling transmitter
- *
- *   This function encapsulates the packet in a new IP packet, its
- *   destination will be set to cp->daddr. Most code of this function
- *   is taken from ipip.c.
- *
- *   It is used in VS/TUN cluster. The load balancer selects a real
- *   server from a cluster based on a scheduling algorithm,
- *   encapsulates the request packet and forwards it to the selected
- *   server. For example, all real servers are configured with
- *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
- *   the encapsulated packet, it will decapsulate the packet, processe
- *   the request and return the response packets directly to the client
- *   without passing the load balancer. This can greatly increase the
- *   scalability of virtual server.
- *
- *   Used for ANY protocol
- */
-int
-ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-                 struct ip_vs_protocol *pp)
-{
-       struct rtable *rt;                      /* Route to the other host */
-       struct net_device *tdev;                /* Device to other host */
-       struct iphdr  *old_iph = ip_hdr(skb);
-       u8     tos = old_iph->tos;
-       __be16 df = old_iph->frag_off;
-       sk_buff_data_t old_transport_header = skb->transport_header;
-       struct iphdr  *iph;                     /* Our new IP header */
-       unsigned int max_headroom;              /* The extra header space needed */
-       int    mtu;
-
-       EnterFunction(10);
-
-       if (skb->protocol != htons(ETH_P_IP)) {
-               IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
-                            "ETH_P_IP: %d, skb protocol: %d\n",
-                            htons(ETH_P_IP), skb->protocol);
-               goto tx_error;
-       }
-
-       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
-               goto tx_error_icmp;
-
-       tdev = rt->u.dst.dev;
-
-       mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
-       if (mtu < 68) {
-               ip_rt_put(rt);
-               IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
-               goto tx_error;
-       }
-       if (skb->dst)
-               skb->dst->ops->update_pmtu(skb->dst, mtu);
-
-       df |= (old_iph->frag_off & htons(IP_DF));
-
-       if ((old_iph->frag_off & htons(IP_DF))
-           && mtu < ntohs(old_iph->tot_len)) {
-               icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-               ip_rt_put(rt);
-               IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
-               goto tx_error;
-       }
-
-       /*
-        * Okay, now see if we can stuff it in the buffer as-is.
-        */
-       max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
-
-       if (skb_headroom(skb) < max_headroom
-           || skb_cloned(skb) || skb_shared(skb)) {
-               struct sk_buff *new_skb =
-                       skb_realloc_headroom(skb, max_headroom);
-               if (!new_skb) {
-                       ip_rt_put(rt);
-                       kfree_skb(skb);
-                       IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
-                       return NF_STOLEN;
-               }
-               kfree_skb(skb);
-               skb = new_skb;
-               old_iph = ip_hdr(skb);
-       }
-
-       skb->transport_header = old_transport_header;
-
-       /* fix old IP header checksum */
-       ip_send_check(old_iph);
-
-       skb_push(skb, sizeof(struct iphdr));
-       skb_reset_network_header(skb);
-       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-
-       /* drop old route */
-       dst_release(skb->dst);
-       skb->dst = &rt->u.dst;
-
-       /*
-        *      Push down and install the IPIP header.
-        */
-       iph                     =       ip_hdr(skb);
-       iph->version            =       4;
-       iph->ihl                =       sizeof(struct iphdr)>>2;
-       iph->frag_off           =       df;
-       iph->protocol           =       IPPROTO_IPIP;
-       iph->tos                =       tos;
-       iph->daddr              =       rt->rt_dst;
-       iph->saddr              =       rt->rt_src;
-       iph->ttl                =       old_iph->ttl;
-       ip_select_ident(iph, &rt->u.dst, NULL);
-
-       /* Another hack: avoid icmp_send in ip_fragment */
-       skb->local_df = 1;
-
-       ip_local_out(skb);
-
-       LeaveFunction(10);
-
-       return NF_STOLEN;
-
-  tx_error_icmp:
-       dst_link_failure(skb);
-  tx_error:
-       kfree_skb(skb);
-       LeaveFunction(10);
-       return NF_STOLEN;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-int
-ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-                    struct ip_vs_protocol *pp)
-{
-       struct rt6_info *rt;            /* Route to the other host */
-       struct net_device *tdev;        /* Device to other host */
-       struct ipv6hdr  *old_iph = ipv6_hdr(skb);
-       sk_buff_data_t old_transport_header = skb->transport_header;
-       struct ipv6hdr  *iph;           /* Our new IP header */
-       unsigned int max_headroom;      /* The extra header space needed */
-       int    mtu;
-
-       EnterFunction(10);
-
-       if (skb->protocol != htons(ETH_P_IPV6)) {
-               IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, "
-                            "ETH_P_IPV6: %d, skb protocol: %d\n",
-                            htons(ETH_P_IPV6), skb->protocol);
-               goto tx_error;
-       }
-
-       rt = __ip_vs_get_out_rt_v6(cp);
-       if (!rt)
-               goto tx_error_icmp;
-
-       tdev = rt->u.dst.dev;
-
-       mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr);
-       /* TODO IPv6: do we need this check in IPv6? */
-       if (mtu < 1280) {
-               dst_release(&rt->u.dst);
-               IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n");
-               goto tx_error;
-       }
-       if (skb->dst)
-               skb->dst->ops->update_pmtu(skb->dst, mtu);
-
-       if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
-               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-               dst_release(&rt->u.dst);
-               IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n");
-               goto tx_error;
-       }
-
-       /*
-        * Okay, now see if we can stuff it in the buffer as-is.
-        */
-       max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
-
-       if (skb_headroom(skb) < max_headroom
-           || skb_cloned(skb) || skb_shared(skb)) {
-               struct sk_buff *new_skb =
-                       skb_realloc_headroom(skb, max_headroom);
-               if (!new_skb) {
-                       dst_release(&rt->u.dst);
-                       kfree_skb(skb);
-                       IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n");
-                       return NF_STOLEN;
-               }
-               kfree_skb(skb);
-               skb = new_skb;
-               old_iph = ipv6_hdr(skb);
-       }
-
-       skb->transport_header = old_transport_header;
-
-       skb_push(skb, sizeof(struct ipv6hdr));
-       skb_reset_network_header(skb);
-       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-
-       /* drop old route */
-       dst_release(skb->dst);
-       skb->dst = &rt->u.dst;
-
-       /*
-        *      Push down and install the IPIP header.
-        */
-       iph                     =       ipv6_hdr(skb);
-       iph->version            =       6;
-       iph->nexthdr            =       IPPROTO_IPV6;
-       iph->payload_len        =       old_iph->payload_len + sizeof(old_iph);
-       iph->priority           =       old_iph->priority;
-       memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
-       iph->daddr              =       rt->rt6i_dst.addr;
-       iph->saddr              =       cp->vaddr.in6; /* rt->rt6i_src.addr; */
-       iph->hop_limit          =       old_iph->hop_limit;
-
-       /* Another hack: avoid icmp_send in ip_fragment */
-       skb->local_df = 1;
-
-       ip6_local_out(skb);
-
-       LeaveFunction(10);
-
-       return NF_STOLEN;
-
-tx_error_icmp:
-       dst_link_failure(skb);
-tx_error:
-       kfree_skb(skb);
-       LeaveFunction(10);
-       return NF_STOLEN;
-}
-#endif
-
-
-/*
- *      Direct Routing transmitter
- *      Used for ANY protocol
- */
-int
-ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-             struct ip_vs_protocol *pp)
-{
-       struct rtable *rt;                      /* Route to the other host */
-       struct iphdr  *iph = ip_hdr(skb);
-       int    mtu;
-
-       EnterFunction(10);
-
-       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
-               goto tx_error_icmp;
-
-       /* MTU checking */
-       mtu = dst_mtu(&rt->u.dst);
-       if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
-               icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-               ip_rt_put(rt);
-               IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
-               goto tx_error;
-       }
-
-       /*
-        * Call ip_send_check because we are not sure it is called
-        * after ip_defrag. Is copy-on-write needed?
-        */
-       if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
-               ip_rt_put(rt);
-               return NF_STOLEN;
-       }
-       ip_send_check(ip_hdr(skb));
-
-       /* drop old route */
-       dst_release(skb->dst);
-       skb->dst = &rt->u.dst;
-
-       /* Another hack: avoid icmp_send in ip_fragment */
-       skb->local_df = 1;
-
-       IP_VS_XMIT(PF_INET, skb, rt);
-
-       LeaveFunction(10);
-       return NF_STOLEN;
-
-  tx_error_icmp:
-       dst_link_failure(skb);
-  tx_error:
-       kfree_skb(skb);
-       LeaveFunction(10);
-       return NF_STOLEN;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-int
-ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-                struct ip_vs_protocol *pp)
-{
-       struct rt6_info *rt;                    /* Route to the other host */
-       int    mtu;
-
-       EnterFunction(10);
-
-       rt = __ip_vs_get_out_rt_v6(cp);
-       if (!rt)
-               goto tx_error_icmp;
-
-       /* MTU checking */
-       mtu = dst_mtu(&rt->u.dst);
-       if (skb->len > mtu) {
-               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-               dst_release(&rt->u.dst);
-               IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n");
-               goto tx_error;
-       }
-
-       /*
-        * Call ip_send_check because we are not sure it is called
-        * after ip_defrag. Is copy-on-write needed?
-        */
-       skb = skb_share_check(skb, GFP_ATOMIC);
-       if (unlikely(skb == NULL)) {
-               dst_release(&rt->u.dst);
-               return NF_STOLEN;
-       }
-
-       /* drop old route */
-       dst_release(skb->dst);
-       skb->dst = &rt->u.dst;
-
-       /* Another hack: avoid icmp_send in ip_fragment */
-       skb->local_df = 1;
-
-       IP_VS_XMIT(PF_INET6, skb, rt);
-
-       LeaveFunction(10);
-       return NF_STOLEN;
-
-tx_error_icmp:
-       dst_link_failure(skb);
-tx_error:
-       kfree_skb(skb);
-       LeaveFunction(10);
-       return NF_STOLEN;
-}
-#endif
-
-
-/*
- *     ICMP packet transmitter
- *     called by the ip_vs_in_icmp
- */
-int
-ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
-               struct ip_vs_protocol *pp, int offset)
-{
-       struct rtable   *rt;    /* Route to the other host */
-       int mtu;
-       int rc;
-
-       EnterFunction(10);
-
-       /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
-          forwarded directly here, because there is no need to
-          translate address/port back */
-       if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
-               if (cp->packet_xmit)
-                       rc = cp->packet_xmit(skb, cp, pp);
-               else
-                       rc = NF_ACCEPT;
-               /* do not touch skb anymore */
-               atomic_inc(&cp->in_pkts);
-               goto out;
-       }
-
-       /*
-        * mangle and send the packet here (only for VS/NAT)
-        */
-
-       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
-               goto tx_error_icmp;
-
-       /* MTU checking */
-       mtu = dst_mtu(&rt->u.dst);
-       if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
-               ip_rt_put(rt);
-               icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
-               IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
-               goto tx_error;
-       }
-
-       /* copy-on-write the packet before mangling it */
-       if (!skb_make_writable(skb, offset))
-               goto tx_error_put;
-
-       if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
-               goto tx_error_put;
-
-       /* drop the old route when skb is not shared */
-       dst_release(skb->dst);
-       skb->dst = &rt->u.dst;
-
-       ip_vs_nat_icmp(skb, pp, cp, 0);
-
-       /* Another hack: avoid icmp_send in ip_fragment */
-       skb->local_df = 1;
-
-       IP_VS_XMIT(PF_INET, skb, rt);
-
-       rc = NF_STOLEN;
-       goto out;
-
-  tx_error_icmp:
-       dst_link_failure(skb);
-  tx_error:
-       dev_kfree_skb(skb);
-       rc = NF_STOLEN;
-  out:
-       LeaveFunction(10);
-       return rc;
-  tx_error_put:
-       ip_rt_put(rt);
-       goto tx_error;
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-int
-ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
-               struct ip_vs_protocol *pp, int offset)
-{
-       struct rt6_info *rt;    /* Route to the other host */
-       int mtu;
-       int rc;
-
-       EnterFunction(10);
-
-       /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
-          forwarded directly here, because there is no need to
-          translate address/port back */
-       if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
-               if (cp->packet_xmit)
-                       rc = cp->packet_xmit(skb, cp, pp);
-               else
-                       rc = NF_ACCEPT;
-               /* do not touch skb anymore */
-               atomic_inc(&cp->in_pkts);
-               goto out;
-       }
-
-       /*
-        * mangle and send the packet here (only for VS/NAT)
-        */
-
-       rt = __ip_vs_get_out_rt_v6(cp);
-       if (!rt)
-               goto tx_error_icmp;
-
-       /* MTU checking */
-       mtu = dst_mtu(&rt->u.dst);
-       if (skb->len > mtu) {
-               dst_release(&rt->u.dst);
-               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
-               IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
-               goto tx_error;
-       }
-
-       /* copy-on-write the packet before mangling it */
-       if (!skb_make_writable(skb, offset))
-               goto tx_error_put;
-
-       if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
-               goto tx_error_put;
-
-       /* drop the old route when skb is not shared */
-       dst_release(skb->dst);
-       skb->dst = &rt->u.dst;
-
-       ip_vs_nat_icmp_v6(skb, pp, cp, 0);
-
-       /* Another hack: avoid icmp_send in ip_fragment */
-       skb->local_df = 1;
-
-       IP_VS_XMIT(PF_INET6, skb, rt);
-
-       rc = NF_STOLEN;
-       goto out;
-
-tx_error_icmp:
-       dst_link_failure(skb);
-tx_error:
-       dev_kfree_skb(skb);
-       rc = NF_STOLEN;
-out:
-       LeaveFunction(10);
-       return rc;
-tx_error_put:
-       dst_release(&rt->u.dst);
-       goto tx_error;
-}
-#endif
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig

index ee898e74808d32f59821e780fc617d9e18af85d2..73f9378e1bfe3e051012f5bb690c23b4e9794392 100644 (file)
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -838,3 +838,5 @@ config NETFILTER_XT_MATCH_HASHLIMIT
  
  endmenu
  
+source "net/netfilter/ipvs/Kconfig"
+
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile

index 3bd2cc556aea6fabd720921f30b6c7b1c5c8e3a3..cf75055be830daed6ea06e05d02f24dbe252d8a3 100644 (file)
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -83,3 +83,6 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
  obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
  obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o
  obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
+
+# IPVS
+obj-$(CONFIG_IP_VS) += ipvs/
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig

new file mode 100644 (file)

index 0000000..de6004d
--- /dev/null
+++ b/net/netfilter/ipvs/Kconfig
@@ -0,0 +1,239 @@
+#
+# IP Virtual Server configuration
+#
+menuconfig IP_VS
+       tristate "IP virtual server support (EXPERIMENTAL)"
+       depends on NETFILTER
+       ---help---
+         IP Virtual Server support will let you build a high-performance
+         virtual server based on cluster of two or more real servers. This
+         option must be enabled for at least one of the clustered computers
+         that will take care of intercepting incoming connections to a
+         single IP address and scheduling them to real servers.
+
+         Three request dispatching techniques are implemented: virtual
+         server via NAT, virtual server via tunneling and virtual server
+         via direct routing. Several scheduling algorithms can be used
+         to choose which server a connection is directed to, so that
+         load balancing can be achieved among the servers.  For more
+         information and its administration program, please visit the
+         following URL: <http://www.linuxvirtualserver.org/>.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
+if IP_VS
+
+config IP_VS_IPV6
+       bool "IPv6 support for IPVS (DANGEROUS)"
+       depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
+       ---help---
+         Add IPv6 support to IPVS. This is incomplete and might be dangerous.
+
+         Say N if unsure.
+
+config IP_VS_DEBUG
+       bool "IP virtual server debugging"
+       ---help---
+         Say Y here if you want to get additional messages useful in
+         debugging the IP virtual server code. You can change the debug
+         level in /proc/sys/net/ipv4/vs/debug_level
+
+config IP_VS_TAB_BITS
+       int "IPVS connection table size (the Nth power of 2)"
+       range 8 20
+       default 12
+       ---help---
+         The IPVS connection hash table uses the chaining scheme to handle
+         hash collisions. Using a big IPVS connection hash table will greatly
+         reduce conflicts when there are hundreds of thousands of connections
+         in the hash table.
+
+         Note the table size must be a power of 2. The table size will be
+         2 raised to the power of the number you enter. The number must be
+         from 8 to 20; the default is 12, which means the table size is
+         4096. Don't choose too small a number, otherwise performance will
+         suffer. You can adapt the table size yourself, according to your
+         virtual server application. It is good to set the table size not
+         far below the number of connections per second multiplied by the
+         average time a connection stays in the table.  For example, if your
+         virtual server gets 200 connections per second and a connection
+         stays in the connection table for 200 seconds on average, the table
+         size should be not far below 200x200, so it is good to set the
+         table size to 32768 (2**15).
+
+         Also note that each connection effectively occupies 128 bytes and
+         each hash entry uses 8 bytes, so you can estimate how much memory is
+         needed for your box.
+
+comment "IPVS transport protocol load balancing support"
+
+config IP_VS_PROTO_TCP
+       bool "TCP load balancing support"
+       ---help---
+         This option enables support for load balancing TCP transport
+         protocol. Say Y if unsure.
+
+config IP_VS_PROTO_UDP
+       bool "UDP load balancing support"
+       ---help---
+         This option enables support for load balancing UDP transport
+         protocol. Say Y if unsure.
+
+config IP_VS_PROTO_AH_ESP
+       bool
+       depends on UNDEFINED
+
+config IP_VS_PROTO_ESP
+       bool "ESP load balancing support"
+       select IP_VS_PROTO_AH_ESP
+       ---help---
+         This option enables support for load balancing ESP (Encapsulation
+         Security Payload) transport protocol. Say Y if unsure.
+
+config IP_VS_PROTO_AH
+       bool "AH load balancing support"
+       select IP_VS_PROTO_AH_ESP
+       ---help---
+         This option enables support for load balancing AH (Authentication
+         Header) transport protocol. Say Y if unsure.
+
+comment "IPVS scheduler"
+
+config IP_VS_RR
+       tristate "round-robin scheduling"
+       ---help---
+         The round-robin scheduling algorithm simply directs network
+         connections to different real servers in a round-robin manner.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+ 
+config IP_VS_WRR
+        tristate "weighted round-robin scheduling" 
+       ---help---
+         The weighted round-robin scheduling algorithm directs network
+         connections to different real servers based on server weights
+         in a round-robin manner. Servers with higher weights receive
+         new connections before those with lower weights, servers with
+         higher weights get more connections than those with lower
+         weights, and servers with equal weights get equal connections.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
+config IP_VS_LC
+        tristate "least-connection scheduling"
+       ---help---
+         The least-connection scheduling algorithm directs network
+         connections to the server with the least number of active 
+         connections.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
+config IP_VS_WLC
+        tristate "weighted least-connection scheduling"
+       ---help---
+         The weighted least-connection scheduling algorithm directs network
+         connections to the server with the least active connections
+         normalized by the server weight.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
+config IP_VS_LBLC
+       tristate "locality-based least-connection scheduling"
+       ---help---
+         The locality-based least-connection scheduling algorithm is for
+         destination IP load balancing. It is usually used in cache cluster.
+         This algorithm usually directs packets destined for an IP address to
+         its server if the server is alive and not overloaded. If the server
+         is overloaded (its number of active connections is larger than its
+         weight) and there is a server in its half load, then the weighted
+         least-connection server is allocated to this IP address.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
+config  IP_VS_LBLCR
+       tristate "locality-based least-connection with replication scheduling"
+       ---help---
+         The locality-based least-connection with replication scheduling
+         algorithm is also for destination IP load balancing. It is
+         usually used in cache clusters. It differs from the LBLC scheduling
+         as follows: the load balancer maintains mappings from a target
+         to a set of server nodes that can serve the target. Requests for
+         a target are assigned to the least-connection node in the target's
+         server set. If all the nodes in the server set are overloaded,
+         it picks a least-connection node in the cluster and adds it
+         to the server set for the target. If the server set has not been
+         modified for the specified time, the most loaded node is removed
+         from the server set, in order to avoid a high degree of replication.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
+config IP_VS_DH
+       tristate "destination hashing scheduling"
+       ---help---
+         The destination hashing scheduling algorithm assigns network
+         connections to the servers through looking up a statically assigned
+         hash table by their destination IP addresses.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
+config IP_VS_SH
+       tristate "source hashing scheduling"
+       ---help---
+         The source hashing scheduling algorithm assigns network
+         connections to the servers through looking up a statically assigned
+         hash table by their source IP addresses.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
+config IP_VS_SED
+       tristate "shortest expected delay scheduling"
+       ---help---
+         The shortest expected delay scheduling algorithm assigns network
+         connections to the server with the shortest expected delay. The 
+         expected delay that the job will experience is (Ci + 1) / Ui if 
+         sent to the ith server, in which Ci is the number of connections
+         on the ith server and Ui is the fixed service rate (weight)
+         of the ith server.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
+config IP_VS_NQ
+       tristate "never queue scheduling"
+       ---help---
+         The never queue scheduling algorithm adopts a two-speed model.
+         When there is an idle server available, the job will be sent to
+         the idle server, instead of waiting for a fast one. When there
+         is no idle server available, the job will be sent to the server
+         that minimizes its expected delay (the Shortest Expected Delay
+         scheduling algorithm).
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
+comment 'IPVS application helper'
+
+config IP_VS_FTP
+       tristate "FTP protocol helper"
+        depends on IP_VS_PROTO_TCP
+       ---help---
+         FTP is a protocol that transfers IP address and/or port number in
+         the payload. In the virtual server via Network Address Translation,
+         the IP address and port number of real servers cannot be sent to
+         clients in ftp connections directly, so FTP protocol helper is
+         required for tracking the connection and mangling it back to that of
+         virtual service.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
+endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile

new file mode 100644 (file)

index 0000000..73a46fe
--- /dev/null
+++ b/net/netfilter/ipvs/Makefile
@@ -0,0 +1,33 @@
+#
+# Makefile for the IPVS (IP Virtual Server) modules.
+#
+
+# IPVS transport protocol load balancing support
+ip_vs_proto-objs-y :=
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
+
+ip_vs-objs :=  ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o        \
+               ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o                      \
+               ip_vs_est.o ip_vs_proto.o                                  \
+               $(ip_vs_proto-objs-y)
+
+
+# IPVS core
+obj-$(CONFIG_IP_VS) += ip_vs.o
+
+# IPVS schedulers
+obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
+obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
+obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
+obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
+obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
+obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
+obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
+obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
+obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+
+# IPVS application helpers
+obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c

new file mode 100644 (file)

index 0000000..201b8ea
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -0,0 +1,622 @@
+/*
+ * ip_vs_app.c: Application module support for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
+ * is that ip_vs_app module handles the reverse direction (incoming requests
+ * and outgoing responses).
+ *
+ *             IP_MASQ_APP application masquerading module
+ *
+ * Author:     Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <net/net_namespace.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+
+#include <net/ip_vs.h>
+
+EXPORT_SYMBOL(register_ip_vs_app);
+EXPORT_SYMBOL(unregister_ip_vs_app);
+EXPORT_SYMBOL(register_ip_vs_app_inc);
+
+/* ipvs application list head: every registered ip_vs_app is linked here
+ * through its a_list member. __ip_vs_app_mutex is taken around updates of
+ * this list and of the per-application incarnation lists, and it is held
+ * between the seq_file start and stop callbacks of the /proc reader. */
+static LIST_HEAD(ip_vs_app_list);
+static DEFINE_MUTEX(__ip_vs_app_mutex);
+
+
+/*
+ *     Pin the module that owns an ip_vs_app.
+ *     The result of try_module_get() is handed back unchanged.
+ */
+static inline int ip_vs_app_get(struct ip_vs_app *app)
+{
+       int pinned = try_module_get(app->module);
+
+       return pinned;
+}
+
+
+/*
+ *     Drop a reference on the module that owns an ip_vs_app
+ */
+static inline void ip_vs_app_put(struct ip_vs_app *app)
+{
+       struct module *owner = app->module;
+
+       module_put(owner);
+}
+
+
+/*
+ *     Allocate/initialize app incarnation and register it in proto apps.
+ *
+ *     An incarnation is a private copy (kmemdup) of @app bound to one
+ *     port. @port is given in host byte order and is stored in network
+ *     byte order in inc->port.
+ *
+ *     Returns 0 on success, -EPROTONOSUPPORT if @proto is unknown,
+ *     -EOPNOTSUPP if the protocol lacks an app (un)registration hook,
+ *     -ENOMEM on allocation failure, or the error from pp->register_app().
+ */
+static int
+ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
+{
+       struct ip_vs_protocol *pp;
+       struct ip_vs_app *inc;
+       int ret;
+
+       if (!(pp = ip_vs_proto_get(proto)))
+               return -EPROTONOSUPPORT;
+
+       /* register_app is called below, so it has to be present as well */
+       if (!pp->register_app || !pp->unregister_app)
+               return -EOPNOTSUPP;
+
+       inc = kmemdup(app, sizeof(*inc), GFP_KERNEL);
+       if (!inc)
+               return -ENOMEM;
+       /* the list linkage copied from @app must not be reused */
+       INIT_LIST_HEAD(&inc->p_list);
+       INIT_LIST_HEAD(&inc->incs_list);
+       inc->app = app;
+       inc->port = htons(port);
+       atomic_set(&inc->usecnt, 0);
+
+       if (app->timeouts) {
+               inc->timeout_table =
+                       ip_vs_create_timeout_table(app->timeouts,
+                                                  app->timeouts_size);
+               if (!inc->timeout_table) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+       }
+
+       ret = pp->register_app(inc);
+       if (ret)
+               goto out;
+
+       list_add(&inc->a_list, &app->incs_list);
+       /* inc->port is in network byte order; convert it for printing */
+       IP_VS_DBG(9, "%s application %s:%u registered\n",
+                 pp->name, inc->name, ntohs(inc->port));
+
+       return 0;
+
+  out:
+       kfree(inc->timeout_table);
+       kfree(inc);
+       return ret;
+}
+
+
+/*
+ *     Release app incarnation: unregister it from its protocol, unlink
+ *     it from the owning application's incs_list and free it.
+ *
+ *     NOTE(review): when ip_vs_proto_get() fails the function returns
+ *     early and the incarnation stays linked and allocated -- confirm
+ *     that a registered incarnation can never outlive its protocol.
+ */
+static void
+ip_vs_app_inc_release(struct ip_vs_app *inc)
+{
+       struct ip_vs_protocol *pp;
+
+       if (!(pp = ip_vs_proto_get(inc->protocol)))
+               return;
+
+       if (pp->unregister_app)
+               pp->unregister_app(inc);
+
+       /* inc->port is in network byte order; convert it for printing */
+       IP_VS_DBG(9, "%s App %s:%u unregistered\n",
+                 pp->name, inc->name, ntohs(inc->port));
+
+       list_del(&inc->a_list);
+
+       kfree(inc->timeout_table);
+       kfree(inc);
+}
+
+
+/*
+ *     Take a reference on an app incarnation (only called from softirq).
+ *     inc->usecnt is bumped first and rolled back again if the owning
+ *     application's module cannot be pinned. The result of
+ *     ip_vs_app_get() is returned; anything other than 1 means failure.
+ */
+int ip_vs_app_inc_get(struct ip_vs_app *inc)
+{
+       int got;
+
+       atomic_inc(&inc->usecnt);
+       got = ip_vs_app_get(inc->app);
+       if (unlikely(got != 1))
+               atomic_dec(&inc->usecnt);
+
+       return got;
+}
+
+
+/*
+ *     Drop a reference on an app incarnation (only called from timer
+ *     or net softirq): release the owning application's module first,
+ *     then decrement inc->usecnt.
+ */
+void ip_vs_app_inc_put(struct ip_vs_app *inc)
+{
+       struct ip_vs_app *owner = inc->app;
+
+       ip_vs_app_put(owner);
+       atomic_dec(&inc->usecnt);
+}
+
+
+/*
+ *     Register an application incarnation in protocol applications.
+ *     Wraps ip_vs_app_inc_new() in __ip_vs_app_mutex and passes its
+ *     return value through.
+ */
+int
+register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
+{
+       int ret;
+
+       mutex_lock(&__ip_vs_app_mutex);
+       ret = ip_vs_app_inc_new(app, proto, port);
+       mutex_unlock(&__ip_vs_app_mutex);
+
+       return ret;
+}
+
+
+/*
+ *     ip_vs_app registration routine: takes an ipvs module use count
+ *     and links @app into ip_vs_app_list. Always returns 0.
+ */
+int register_ip_vs_app(struct ip_vs_app *app)
+{
+       /* the use count is dropped again in unregister_ip_vs_app() */
+       ip_vs_use_count_inc();
+
+       mutex_lock(&__ip_vs_app_mutex);
+       list_add(&app->a_list, &ip_vs_app_list);
+       mutex_unlock(&__ip_vs_app_mutex);
+
+       return 0;
+}
+
+
+/*
+ *     ip_vs_app unregistration routine: releases every incarnation of
+ *     @app, unlinks @app from ip_vs_app_list and drops the ipvs module
+ *     use count.
+ *     We are sure there are no app incarnations attached to services
+ */
+void unregister_ip_vs_app(struct ip_vs_app *app)
+{
+       struct ip_vs_app *cur, *tmp;
+
+       mutex_lock(&__ip_vs_app_mutex);
+
+       /* _safe walk: ip_vs_app_inc_release() unlinks and frees entries */
+       list_for_each_entry_safe(cur, tmp, &app->incs_list, a_list)
+               ip_vs_app_inc_release(cur);
+
+       list_del(&app->a_list);
+
+       mutex_unlock(&__ip_vs_app_mutex);
+
+       /* decrease the module use count */
+       ip_vs_use_count_dec();
+}
+
+
+/*
+ *     Bind ip_vs_conn to its ip_vs_app (called by cp constructor).
+ *     Delegates to the protocol's app_conn_bind hook and returns its
+ *     result.
+ */
+int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
+{
+       int ret = pp->app_conn_bind(cp);
+
+       return ret;
+}
+
+
+/*
+ *     Unbind cp from application incarnation (called by cp destructor).
+ *     Does nothing when no incarnation is bound; otherwise runs the
+ *     optional unbind_conn and done_conn hooks, drops the incarnation
+ *     reference and clears cp->app.
+ */
+void ip_vs_unbind_app(struct ip_vs_conn *cp)
+{
+       struct ip_vs_app *inc = cp->app;
+
+       if (inc) {
+               if (inc->unbind_conn)
+                       inc->unbind_conn(inc, cp);
+               if (inc->done_conn)
+                       inc->done_conn(inc, cp);
+               ip_vs_app_inc_put(inc);
+               cp->app = NULL;
+       }
+}
+
+
+/*
+ *     Fixes th->seq based on ip_vs_seq info.
+ *
+ *     Sequence numbers after the most recent resized packet
+ *     (vseq->init_seq) are shifted by vseq->delta, all others by
+ *     vseq->previous_delta.
+ */
+static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+       __u32 seq = ntohl(th->seq);
+
+       /* both offsets are zero: nothing to adjust */
+       if (!vseq->delta && !vseq->previous_delta)
+               return;
+
+       if (!after(seq, vseq->init_seq)) {
+               th->seq = htonl(seq + vseq->previous_delta);
+               IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
+                         "(%d) to seq\n", vseq->previous_delta);
+               return;
+       }
+
+       th->seq = htonl(seq + vseq->delta);
+       IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
+                 vseq->delta);
+}
+
+
+/*
+ *     Fixes th->ack_seq based on ip_vs_seq info.
+ *
+ *     Acks beyond the most recent resized packet have vseq->delta
+ *     subtracted, all others vseq->previous_delta.
+ */
+static inline void
+vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+       __u32 ack_seq = ntohl(th->ack_seq);
+
+       /* both offsets are zero: nothing to adjust */
+       if (!vseq->delta && !vseq->previous_delta)
+               return;
+
+       /* ack_seq is the number of the octet expected next, so the
+          boundary to compare it with is init_seq+delta */
+       if (!after(ack_seq, vseq->init_seq+vseq->delta)) {
+               th->ack_seq = htonl(ack_seq - vseq->previous_delta);
+               IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
+                         "previous_delta (%d) from ack_seq\n",
+                         vseq->previous_delta);
+               return;
+       }
+
+       th->ack_seq = htonl(ack_seq - vseq->delta);
+       IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
+                 "(%d) from ack_seq\n", vseq->delta);
+}
+
+
+/*
+ *     Updates ip_vs_seq if pkt has been resized
+ *     Assumes already checked proto==IPPROTO_TCP and diff!=0.
+ *
+ *     The record is only touched when @flag is not yet set on the
+ *     connection or when @seq lies after the recorded init_seq.
+ */
+static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
+                                unsigned flag, __u32 seq, int diff)
+{
+       int first_resize;
+
+       /* cp->lock keeps the cp->flags update atomic */
+       spin_lock(&cp->lock);
+       first_resize = !(cp->flags & flag);
+       if (first_resize || after(seq, vseq->init_seq)) {
+               /* remember the old offset before accumulating the new one */
+               vseq->previous_delta = vseq->delta;
+               vseq->delta += diff;
+               vseq->init_seq = seq;
+               cp->flags |= flag;
+       }
+       spin_unlock(&cp->lock);
+}
+
+/*
+ *     TCP part of the output hook: applies pending seq/ack_seq fixes,
+ *     runs the application's pkt_out hook (if any) and records a new
+ *     offset when that hook changed the packet length.
+ *     Returns 0 if the header cannot be made writable or the hook
+ *     fails, 1 otherwise.
+ */
+static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
+                                 struct ip_vs_app *app)
+{
+       const unsigned int tcp_offset = ip_hdrlen(skb);
+       struct tcphdr *th;
+       __u32 orig_seq;
+       int diff;
+
+       if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+               return 0;
+
+       th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
+
+       /* keep the unmodified seq in case this pkt gets resized */
+       orig_seq = ntohl(th->seq);
+
+       /* fix seq stuff if flagged as so */
+       if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+               vs_fix_seq(&cp->out_seq, th);
+       if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+               vs_fix_ack_seq(&cp->in_seq, th);
+
+       /* private output hook, then seq bookkeeping if len has changed */
+       if (app->pkt_out != NULL) {
+               if (!app->pkt_out(app, cp, skb, &diff))
+                       return 0;
+
+               if (diff != 0)
+                       vs_seq_update(cp, &cp->out_seq,
+                                     IP_VS_CONN_F_OUT_SEQ, orig_seq, diff);
+       }
+
+       return 1;
+}
+
+/*
+ *     Output pkt hook. Will call bound ip_vs_app specific function
+ *     called by ipvs packet handler, assumes previously checked cp!=NULL
+ *     returns false if it can't handle packet (oom)
+ */
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+       struct ip_vs_app *app = cp->app;
+
+       /* no application module bound to this ip_vs_conn */
+       if (app == NULL)
+               return 1;
+
+       /* TCP is complicated */
+       if (cp->protocol == IPPROTO_TCP)
+               return app_tcp_pkt_out(cp, skb, app);
+
+       /* other protocols: just run the private output hook, if any */
+       return app->pkt_out ? app->pkt_out(app, cp, skb, NULL) : 1;
+}
+
+
+/*
+ *     TCP part of the input hook: applies pending seq/ack_seq fixes,
+ *     runs the application's pkt_in hook (if any) and records a new
+ *     offset when that hook changed the packet length.
+ *     Returns 0 if the header cannot be made writable or the hook
+ *     fails, 1 otherwise.
+ */
+static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
+                                struct ip_vs_app *app)
+{
+       const unsigned int tcp_offset = ip_hdrlen(skb);
+       struct tcphdr *th;
+       __u32 orig_seq;
+       int diff;
+
+       if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+               return 0;
+
+       th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
+
+       /* keep the unmodified seq in case this pkt gets resized */
+       orig_seq = ntohl(th->seq);
+
+       /* fix seq stuff if flagged as so */
+       if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+               vs_fix_seq(&cp->in_seq, th);
+       if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+               vs_fix_ack_seq(&cp->out_seq, th);
+
+       /* private input hook, then seq bookkeeping if len has changed */
+       if (app->pkt_in != NULL) {
+               if (!app->pkt_in(app, cp, skb, &diff))
+                       return 0;
+
+               if (diff != 0)
+                       vs_seq_update(cp, &cp->in_seq,
+                                     IP_VS_CONN_F_IN_SEQ, orig_seq, diff);
+       }
+
+       return 1;
+}
+
+/*
+ *     Input pkt hook. Will call bound ip_vs_app specific function
+ *     called by ipvs packet handler, assumes previously checked cp!=NULL.
+ *     returns false if can't handle packet (oom).
+ */
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+       struct ip_vs_app *app = cp->app;
+
+       /* no application module bound to this ip_vs_conn */
+       if (app == NULL)
+               return 1;
+
+       /* TCP is complicated */
+       if (cp->protocol == IPPROTO_TCP)
+               return app_tcp_pkt_in(cp, skb, app);
+
+       /* other protocols: just run the private input hook, if any */
+       return app->pkt_in ? app->pkt_in(app, cp, skb, NULL) : 1;
+}
+
+
+#ifdef CONFIG_PROC_FS
+/*
+ *     /proc/net/ip_vs_app entry function
+ */
+
+/*
+ *     Return the pos-th (0-based) incarnation, counting across the
+ *     incs_list of every registered application in list order, or
+ *     NULL when there are not that many.
+ */
+static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
+{
+       struct ip_vs_app *app, *inc;
+
+       list_for_each_entry(app, &ip_vs_app_list, a_list)
+               list_for_each_entry(inc, &app->incs_list, a_list) {
+                       if (pos == 0)
+                               return inc;
+                       pos--;
+               }
+
+       return NULL;
+}
+
+/*
+ *     seq_file start: takes __ip_vs_app_mutex (released again in
+ *     ip_vs_app_seq_stop) and positions the iterator. Position 0 is
+ *     the header token; position n maps to incarnation n-1.
+ */
+static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
+{
+       mutex_lock(&__ip_vs_app_mutex);
+
+       if (*pos == 0)
+               return SEQ_START_TOKEN;
+
+       return ip_vs_app_idx(*pos - 1);
+}
+
+/*
+ *     seq_file next: advance from @v to the following incarnation,
+ *     first within the same application and then to the first
+ *     incarnation of the next application that has one. Returns NULL
+ *     at the end of ip_vs_app_list.
+ */
+static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       struct ip_vs_app *inc, *app;
+       struct list_head *e;
+
+       ++*pos;
+       if (v == SEQ_START_TOKEN)
+               return ip_vs_app_idx(0);
+
+       inc = v;
+       app = inc->app;
+
+       /* more incarnations of the same application? */
+       e = inc->a_list.next;
+       if (e != &app->incs_list)
+               return list_entry(e, struct ip_vs_app, a_list);
+
+       /* go on to next application, skipping those without incarnations */
+       for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
+               app = list_entry(e, struct ip_vs_app, a_list);
+               if (!list_empty(&app->incs_list))
+                       return list_entry(app->incs_list.next,
+                                         struct ip_vs_app, a_list);
+       }
+
+       return NULL;
+}
+
+/*
+ *     seq_file stop: releases the mutex taken in ip_vs_app_seq_start
+ */
+static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
+{
+       mutex_unlock(&__ip_vs_app_mutex);
+}
+
+/*
+ *     seq_file show: prints the column header for the start token,
+ *     otherwise one line for the incarnation (protocol name, port in
+ *     host byte order, use count, name). Always returns 0.
+ */
+static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
+{
+       const struct ip_vs_app *inc;
+
+       if (v == SEQ_START_TOKEN) {
+               seq_puts(seq, "prot port    usecnt name\n");
+               return 0;
+       }
+
+       inc = v;
+       seq_printf(seq, "%-3s  %-7u %-6d %-17s\n",
+                  ip_vs_proto_name(inc->protocol),
+                  ntohs(inc->port),
+                  atomic_read(&inc->usecnt),
+                  inc->name);
+
+       return 0;
+}
+
+/* seq_file iterator used by ip_vs_app_open() */
+static const struct seq_operations ip_vs_app_seq_ops = {
+       .show  = ip_vs_app_seq_show,
+       .start = ip_vs_app_seq_start,
+       .next  = ip_vs_app_seq_next,
+       .stop  = ip_vs_app_seq_stop,
+};
+
+/*
+ *     open() handler: attaches the ip_vs_app seq_file iterator to @file
+ *     and returns seq_open()'s result.
+ */
+static int ip_vs_app_open(struct inode *inode, struct file *file)
+{
+       int ret = seq_open(file, &ip_vs_app_seq_ops);
+
+       return ret;
+}
+
+/* file operations of the "ip_vs_app" proc entry (see ip_vs_app_init) */
+static const struct file_operations ip_vs_app_fops = {
+       .owner   = THIS_MODULE,
+       .open    = ip_vs_app_open,
+       .release = seq_release,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+};
+#endif
+
+
+/*
+ *     Replace a segment of data with a new segment
+ *
+ *     The o_len bytes at o_buf (which points into the skb data) are
+ *     replaced by the n_len bytes at n_buf; the data following the old
+ *     segment is moved accordingly and the IP total length is updated.
+ *     Returns 0 on success or -ENOMEM if the skb could not be expanded.
+ */
+int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
+                     char *o_buf, int o_len, char *n_buf, int n_len)
+{
+       int delta;
+       int start;
+       int tail_len;
+
+       EnterFunction(9);
+
+       delta = n_len - o_len;
+       start = o_buf - (char *)skb->data;
+       /* number of bytes that follow o_buf+o_len in the skb data */
+       tail_len = skb->len - (start + o_len);
+
+       if (delta <= 0) {
+               /* same size or shrinking: close the gap, then trim */
+               memmove(o_buf + n_len, o_buf + o_len, tail_len);
+               memcpy(o_buf, n_buf, n_len);
+               skb_trim(skb, skb->len + delta);
+       } else if (delta <= skb_tailroom(skb)) {
+               /* growing and the existing tailroom is sufficient */
+               skb_put(skb, delta);
+               memmove(o_buf + n_len, o_buf + o_len, tail_len);
+               memcpy(o_buf, n_buf, n_len);
+       } else {
+               /* growing beyond the tailroom: expand the skb first and
+                  address the data by offset from skb->data afterwards */
+               if (pskb_expand_head(skb, skb_headroom(skb), delta, pri))
+                       return -ENOMEM;
+               skb_put(skb, delta);
+               memmove(skb->data + start + n_len,
+                       skb->data + start + o_len, tail_len);
+               skb_copy_to_linear_data_offset(skb, start, n_buf, n_len);
+       }
+
+       /* must update the iph total length here */
+       ip_hdr(skb)->tot_len = htons(skb->len);
+
+       LeaveFunction(9);
+       return 0;
+}
+
+
+int __init ip_vs_app_init(void)
+{
+       /* we will replace it with proc_net_ipvs_create() soon.
+        * Creates the "ip_vs_app" proc entry in init_net, served by
+        * ip_vs_app_fops. NOTE(review): the return value of
+        * proc_net_fops_create() is not checked and 0 is always
+        * returned -- confirm that a failed creation may be ignored. */
+       proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
+       return 0;
+}
+
+
+/*
+ *     Remove the "ip_vs_app" proc entry created by ip_vs_app_init()
+ */
+void ip_vs_app_cleanup(void)
+{
+       proc_net_remove(&init_net, "ip_vs_app");
+}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c

new file mode 100644 (file)

index 0000000..9a24332
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -0,0 +1,1110 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others. Many code here is taken from IP MASQ code of kernel 2.2.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/net.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>             /* for proc_net_* */
+#include <linux/seq_file.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#include <net/net_namespace.h>
+#include <net/ip_vs.h>
+
+
+/*
+ *  Connection hash table: for input and output packets lookups of IPVS.
+ *  An array of list heads indexed by the value of ip_vs_conn_hashkey();
+ *  connections are chained through their c_list member.
+ */
+static struct list_head *ip_vs_conn_tab;
+
+/*  SLAB cache for IPVS connections */
+static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
+
+/*  counter for current IPVS connections */
+static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
+
+/*  counter for no client port connections; while it is non-zero
+ *  ip_vs_conn_in_get() retries a failed lookup with client port 0 */
+static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
+
+/* random value for IPVS connection hash; passed as the initial value
+ * to the jhash calls in ip_vs_conn_hashkey() */
+static unsigned int ip_vs_conn_rnd;
+
+/*
+ *  Fine locking granularity for big connection hash table.
+ *  The table is guarded by an array of CT_LOCKARRAY_SIZE rwlocks; the
+ *  ct_*_lock*() helpers select one with the low CT_LOCKARRAY_BITS bits
+ *  of the hash key. Each lock sits in its own struct aligned to
+ *  SMP_CACHE_BYTES.
+ */
+#define CT_LOCKARRAY_BITS  4
+#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
+#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
+
+struct ip_vs_aligned_lock
+{
+       rwlock_t        l;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+/* lock array for conn table, indexed by (key & CT_LOCKARRAY_MASK) */
+static struct ip_vs_aligned_lock
+__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
+
+/* Read-lock the conn table lock selected by @key */
+static inline void ct_read_lock(unsigned key)
+{
+       struct ip_vs_aligned_lock *slot;
+
+       slot = &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
+       read_lock(&slot->l);
+}
+
+/* Release the read lock selected by @key */
+static inline void ct_read_unlock(unsigned key)
+{
+       struct ip_vs_aligned_lock *slot;
+
+       slot = &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
+       read_unlock(&slot->l);
+}
+
+/* Write-lock the conn table lock selected by @key */
+static inline void ct_write_lock(unsigned key)
+{
+       struct ip_vs_aligned_lock *slot;
+
+       slot = &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
+       write_lock(&slot->l);
+}
+
+/* Release the write lock selected by @key */
+static inline void ct_write_unlock(unsigned key)
+{
+       struct ip_vs_aligned_lock *slot;
+
+       slot = &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
+       write_unlock(&slot->l);
+}
+
+/* Read-lock the conn table lock selected by @key, _bh variant */
+static inline void ct_read_lock_bh(unsigned key)
+{
+       struct ip_vs_aligned_lock *slot;
+
+       slot = &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
+       read_lock_bh(&slot->l);
+}
+
+/* Release the read lock selected by @key, _bh variant */
+static inline void ct_read_unlock_bh(unsigned key)
+{
+       struct ip_vs_aligned_lock *slot;
+
+       slot = &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
+       read_unlock_bh(&slot->l);
+}
+
+/* Write-lock the conn table lock selected by @key, _bh variant */
+static inline void ct_write_lock_bh(unsigned key)
+{
+       struct ip_vs_aligned_lock *slot;
+
+       slot = &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
+       write_lock_bh(&slot->l);
+}
+
+/* Release the write lock selected by @key, _bh variant */
+static inline void ct_write_unlock_bh(unsigned key)
+{
+       struct ip_vs_aligned_lock *slot;
+
+       slot = &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK];
+       write_unlock_bh(&slot->l);
+}
+
+
+/*
+ *     Returns hash value for IPVS connection entry
+ */
+static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
+                                      const union nf_inet_addr *addr,
+                                      __be16 port)
+{
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
+                                   (__force u32)port, proto, ip_vs_conn_rnd)
+                       & IP_VS_CONN_TAB_MASK;
+#endif
+       return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
+                           ip_vs_conn_rnd)
+               & IP_VS_CONN_TAB_MASK;
+}
+
+
+/*
+ *     Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
+ *     A successful insert sets IP_VS_CONN_F_HASHED and takes a
+ *     reference on cp. Returns 1 on success, 0 if cp was already
+ *     hashed.
+ */
+static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
+{
+       unsigned hash;
+       int ret;
+
+       /* Hash by protocol, client address and port */
+       hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+
+       ct_write_lock(hash);
+
+       if (cp->flags & IP_VS_CONN_F_HASHED) {
+               IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
+                         "called from %p\n", __builtin_return_address(0));
+               ret = 0;
+       } else {
+               list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
+               cp->flags |= IP_VS_CONN_F_HASHED;
+               atomic_inc(&cp->refcnt);
+               ret = 1;
+       }
+
+       ct_write_unlock(hash);
+
+       return ret;
+}
+
+
+/*
+ *     UNhashes ip_vs_conn from ip_vs_conn_tab.
+ *     A successful removal clears IP_VS_CONN_F_HASHED and drops the
+ *     reference taken when hashing. Returns 1 on success, 0 if cp was
+ *     not hashed.
+ */
+static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
+{
+       unsigned hash;
+       int ret = 0;
+
+       /* same key as in ip_vs_conn_hash(), so the same lock is taken */
+       hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+
+       ct_write_lock(hash);
+
+       if (cp->flags & IP_VS_CONN_F_HASHED) {
+               list_del(&cp->c_list);
+               cp->flags &= ~IP_VS_CONN_F_HASHED;
+               atomic_dec(&cp->refcnt);
+               ret = 1;
+       }
+
+       ct_write_unlock(hash);
+
+       return ret;
+}
+
+
+/*
+ *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ *  Called for pkts coming from OUTside-to-INside.
+ *     s_addr, s_port: pkt source address (foreign host)
+ *     d_addr, d_port: pkt dest address (load balancer)
+ *
+ *  A lookup with s_port == 0 only matches entries flagged
+ *  IP_VS_CONN_F_NO_CPORT, a lookup with a non-zero s_port only matches
+ *  entries without that flag. A reference is taken on the returned
+ *  entry; NULL is returned when nothing matches.
+ */
+static inline struct ip_vs_conn *__ip_vs_conn_in_get
+(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
+ const union nf_inet_addr *d_addr, __be16 d_port)
+{
+       unsigned hash;
+       struct ip_vs_conn *cp, *hit = NULL;
+
+       hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+
+       ct_read_lock(hash);
+
+       list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+               if (cp->af != af || protocol != cp->protocol)
+                       continue;
+               if (s_port != cp->cport || d_port != cp->vport)
+                       continue;
+               /* port 0 pairs with NO_CPORT entries and only with them */
+               if (!s_port == !(cp->flags & IP_VS_CONN_F_NO_CPORT))
+                       continue;
+               if (!ip_vs_addr_equal(af, s_addr, &cp->caddr) ||
+                   !ip_vs_addr_equal(af, d_addr, &cp->vaddr))
+                       continue;
+
+               /* HIT */
+               atomic_inc(&cp->refcnt);
+               hit = cp;
+               break;
+       }
+
+       ct_read_unlock(hash);
+
+       return hit;
+}
+
+/*
+ *  Look up the connection for an OUTside-to-INside packet. If the exact
+ *  lookup misses and connections without a client port exist, the
+ *  lookup is repeated with client port 0.
+ */
+struct ip_vs_conn *ip_vs_conn_in_get
+(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
+ const union nf_inet_addr *d_addr, __be16 d_port)
+{
+       struct ip_vs_conn *cp =
+               __ip_vs_conn_in_get(af, protocol, s_addr, s_port,
+                                   d_addr, d_port);
+
+       if (cp == NULL && atomic_read(&ip_vs_conn_no_cport_cnt) != 0)
+               cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
+                                        d_port);
+
+       IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
+                     ip_vs_proto_name(protocol),
+                     IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
+                     IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+                     cp ? "hit" : "not hit");
+
+       return cp;
+}
+
+/*
+ * Get reference to connection template: like the inbound connection
+ * lookup, but only entries flagged IP_VS_CONN_F_TEMPLATE match.
+ * Returns NULL when no template matches.
+ */
+struct ip_vs_conn *ip_vs_ct_in_get
+(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
+ const union nf_inet_addr *d_addr, __be16 d_port)
+{
+       unsigned hash;
+       struct ip_vs_conn *cp, *hit = NULL;
+
+       hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+
+       ct_read_lock(hash);
+
+       list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+               if (cp->af != af || protocol != cp->protocol)
+                       continue;
+               if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
+                       continue;
+               if (s_port != cp->cport || d_port != cp->vport)
+                       continue;
+               if (!ip_vs_addr_equal(af, s_addr, &cp->caddr) ||
+                   !ip_vs_addr_equal(af, d_addr, &cp->vaddr))
+                       continue;
+
+               /* HIT */
+               atomic_inc(&cp->refcnt);
+               hit = cp;
+               break;
+       }
+
+       ct_read_unlock(hash);
+
+       IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
+                     ip_vs_proto_name(protocol),
+                     IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
+                     IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+                     hit ? "hit" : "not hit");
+
+       return hit;
+}
+
+/*
+ *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ *  Called for pkts coming from inside-to-OUTside.
+ *     s_addr, s_port: pkt source address (inside host)
+ *     d_addr, d_port: pkt dest address (foreign host)
+ *
+ *  The table is keyed by the client side, so the packet's destination
+ *  is hashed and compared with caddr/cport while its source is compared
+ *  with daddr/dport. A reference is taken on the returned entry.
+ */
+struct ip_vs_conn *ip_vs_conn_out_get
+(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
+ const union nf_inet_addr *d_addr, __be16 d_port)
+{
+       unsigned hash;
+       struct ip_vs_conn *cp, *hit = NULL;
+
+       /* check for "full" addressed entries */
+       hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
+
+       ct_read_lock(hash);
+
+       list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+               if (cp->af != af || protocol != cp->protocol)
+                       continue;
+               if (d_port != cp->cport || s_port != cp->dport)
+                       continue;
+               if (!ip_vs_addr_equal(af, d_addr, &cp->caddr) ||
+                   !ip_vs_addr_equal(af, s_addr, &cp->daddr))
+                       continue;
+
+               /* HIT */
+               atomic_inc(&cp->refcnt);
+               hit = cp;
+               break;
+       }
+
+       ct_read_unlock(hash);
+
+       IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
+                     ip_vs_proto_name(protocol),
+                     IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
+                     IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+                     hit ? "hit" : "not hit");
+
+       return hit;
+}
+
+
+/*
+ *      Release a connection: re-arm its timer so that it expires
+ *      cp->timeout jiffies from now, then hand the entry to
+ *      __ip_vs_conn_put().
+ */
+void ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+       unsigned long expires = jiffies + cp->timeout;
+
+       /* restart the expiry countdown from the current time */
+       mod_timer(&cp->timer, expires);
+
+       __ip_vs_conn_put(cp);
+}
+
+
+/*
+ *     Fill a no_client_port connection with a client port number.
+ *
+ *     The entry is taken out of the hash table, given its client port
+ *     under cp->lock (only if IP_VS_CONN_F_NO_CPORT is still set), and
+ *     then hashed again.  Nothing happens when ip_vs_conn_unhash()
+ *     returns zero.
+ */
+void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
+{
+       if (!ip_vs_conn_unhash(cp))
+               return;
+
+       spin_lock(&cp->lock);
+       if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
+               /* one fewer entry is waiting for its client port */
+               atomic_dec(&ip_vs_conn_no_cport_cnt);
+               cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+               cp->cport = cport;
+       }
+       spin_unlock(&cp->lock);
+
+       /* hash again now that cport may have changed */
+       ip_vs_conn_hash(cp);
+}
+
+
+/*
+ *     Bind a connection entry with the corresponding packet_xmit.
+ *     Called by ip_vs_conn_new.
+ *
+ *     The IPv4 transmit routine is chosen from the forwarding method
+ *     given by IP_VS_FWD_METHOD(cp).  The switch has no default case,
+ *     so cp->packet_xmit is left unchanged for any other value.
+ */
+static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
+{
+       switch (IP_VS_FWD_METHOD(cp)) {
+       case IP_VS_CONN_F_MASQ:
+               cp->packet_xmit = ip_vs_nat_xmit;
+               break;
+
+       case IP_VS_CONN_F_TUNNEL:
+               cp->packet_xmit = ip_vs_tunnel_xmit;
+               break;
+
+       case IP_VS_CONN_F_DROUTE:
+               cp->packet_xmit = ip_vs_dr_xmit;
+               break;
+
+       case IP_VS_CONN_F_LOCALNODE:
+               cp->packet_xmit = ip_vs_null_xmit;
+               break;
+
+       case IP_VS_CONN_F_BYPASS:
+               cp->packet_xmit = ip_vs_bypass_xmit;
+               break;
+       }
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ *     IPv6 counterpart of ip_vs_bind_xmit: choose the transmit routine
+ *     from IP_VS_FWD_METHOD(cp).  Every method maps to a *_v6 routine
+ *     except IP_VS_CONN_F_LOCALNODE, which uses the same ip_vs_null_xmit
+ *     as the IPv4 variant.  Only compiled when CONFIG_IP_VS_IPV6 is set.
+ */
+static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
+{
+       switch (IP_VS_FWD_METHOD(cp)) {
+       case IP_VS_CONN_F_MASQ:
+               cp->packet_xmit = ip_vs_nat_xmit_v6;
+               break;
+
+       case IP_VS_CONN_F_TUNNEL:
+               cp->packet_xmit = ip_vs_tunnel_xmit_v6;
+               break;
+
+       case IP_VS_CONN_F_DROUTE:
+               cp->packet_xmit = ip_vs_dr_xmit_v6;
+               break;
+
+       case IP_VS_CONN_F_LOCALNODE:
+               cp->packet_xmit = ip_vs_null_xmit;
+               break;
+
+       case IP_VS_CONN_F_BYPASS:
+               cp->packet_xmit = ip_vs_bypass_xmit_v6;
+               break;
+       }
+}
+#endif
+
+
+/* Sum of the destination's active and inactive connection counters */
+static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
+{
+       int active = atomic_read(&dest->activeconns);
+       int inactive = atomic_read(&dest->inactconns);
+
+       return active + inactive;
+}
+
+/*
+ *     Bind a connection entry with a virtual service destination
+ *     Called just after a new connection entry is created.
+ *
+ *     Takes a reference on dest, merges dest->conn_flags into cp->flags,
+ *     records dest in cp->dest, bumps the matching per-destination
+ *     connection counter and sets IP_VS_DEST_F_OVERLOAD once the
+ *     upper threshold is reached.  A NULL dest is a no-op.
+ */
+static inline void
+ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
+{
+       /* if dest is NULL, then return directly */
+       if (!dest)
+               return;
+
+       /* Increase the refcnt counter of the dest */
+       atomic_inc(&dest->refcnt);
+
+       /* Bind with the destination and its corresponding transmitter */
+       if ((cp->flags & IP_VS_CONN_F_SYNC) &&
+           (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
+               /* if the connection is not template and is created
+                * by sync, preserve the activity flag: the destination's
+                * IP_VS_CONN_F_INACTIVE bit is masked out so that only the
+                * value already present in cp->flags counts.
+                */
+               cp->flags |= atomic_read(&dest->conn_flags) &
+                            (~IP_VS_CONN_F_INACTIVE);
+       else
+               cp->flags |= atomic_read(&dest->conn_flags);
+       cp->dest = dest;
+
+       IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
+                     "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+                     "dest->refcnt:%d\n",
+                     ip_vs_proto_name(cp->protocol),
+                     IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+                     IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+                     IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
+                     ip_vs_fwd_tag(cp), cp->state,
+                     cp->flags, atomic_read(&cp->refcnt),
+                     atomic_read(&dest->refcnt));
+
+       /* Update the connection counters */
+       if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+               /* It is a normal connection.  A synced connection that is
+                  not flagged inactive is counted as active; every other
+                  normal connection is counted as inactive (e.g. TCP
+                  SYNRECV state or another protocol's inactive state) */
+               if ((cp->flags & IP_VS_CONN_F_SYNC) &&
+                   (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
+                       atomic_inc(&dest->activeconns);
+               else
+                       atomic_inc(&dest->inactconns);
+       } else {
+               /* It is a persistent connection/template, so increase
+                  the persistent connection counter */
+               atomic_inc(&dest->persistconns);
+       }
+
+       /* u_threshold == 0 disables the overload check */
+       if (dest->u_threshold != 0 &&
+           ip_vs_dest_totalconns(dest) >= dest->u_threshold)
+               dest->flags |= IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Look up a destination for a connection that has none bound yet and
+ * bind the connection to whatever ip_vs_find_dest() returns.
+ *
+ * Returns the looked-up destination, or NULL when cp is NULL or
+ * already has a destination.
+ */
+struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
+{
+       struct ip_vs_dest *dest;
+
+       /* nothing to do without a connection or when already bound */
+       if (!cp || cp->dest)
+               return NULL;
+
+       dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
+                              &cp->vaddr, cp->vport,
+                              cp->protocol);
+       ip_vs_bind_dest(cp, dest);
+
+       return dest;
+}
+
+
+/*
+ *     Unbind a connection entry with its VS destination
+ *     Called by the ip_vs_conn_expire function.
+ *
+ *     Decrements the per-destination counter that matches the
+ *     connection's flags, re-evaluates IP_VS_DEST_F_OVERLOAD and drops
+ *     the reference on the destination.  Does nothing when cp->dest is
+ *     NULL.  cp->dest itself is not cleared here.
+ */
+static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
+{
+       struct ip_vs_dest *dest = cp->dest;
+
+       if (!dest)
+               return;
+
+       IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
+                     "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+                     "dest->refcnt:%d\n",
+                     ip_vs_proto_name(cp->protocol),
+                     IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+                     IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+                     IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
+                     ip_vs_fwd_tag(cp), cp->state,
+                     cp->flags, atomic_read(&cp->refcnt),
+                     atomic_read(&dest->refcnt));
+
+       /* Update the connection counters */
+       if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+               /* It is a normal connection, so decrease the inactconns
+                  or activeconns counter */
+               if (cp->flags & IP_VS_CONN_F_INACTIVE) {
+                       atomic_dec(&dest->inactconns);
+               } else {
+                       atomic_dec(&dest->activeconns);
+               }
+       } else {
+               /* It is a persistent connection/template, so decrease
+                  the persistent connection counter */
+               atomic_dec(&dest->persistconns);
+       }
+
+       /*
+        * Clear the overload flag:
+        *  - with a lower threshold set, once the total drops below it;
+        *  - else with an upper threshold set, once the total drops
+        *    below three quarters of it;
+        *  - else (no thresholds) unconditionally.
+        */
+       if (dest->l_threshold != 0) {
+               if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
+                       dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+       } else if (dest->u_threshold != 0) {
+               if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
+                       dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+       } else {
+               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+                       dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+       }
+
+       /*
+        * Simply decrease the refcnt of the dest, because the
+        * dest will be either in service's destination list
+        * or in the trash.
+        */
+       atomic_dec(&dest->refcnt);
+}
+
+
+/*
+ *     Checking if the destination of a connection template is available.
+ *     If available, return 1, otherwise invalidate this connection
+ *     template and return 0.
+ *
+ *     The destination counts as unavailable when it is NULL, lacks
+ *     IP_VS_DEST_F_AVAILABLE, or has weight 0 while
+ *     sysctl_ip_vs_expire_quiescent_template is non-zero.
+ *     In the unavailable case the template's refcnt is decremented here.
+ */
+int ip_vs_check_template(struct ip_vs_conn *ct)
+{
+       struct ip_vs_dest *dest = ct->dest;
+
+       /*
+        * Checking the dest server status.
+        */
+       if ((dest == NULL) ||
+           !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
+           (sysctl_ip_vs_expire_quiescent_template &&
+            (atomic_read(&dest->weight) == 0))) {
+               IP_VS_DBG_BUF(9, "check_template: dest not available for "
+                             "protocol %s s:%s:%d v:%s:%d "
+                             "-> d:%s:%d\n",
+                             ip_vs_proto_name(ct->protocol),
+                             IP_VS_DBG_ADDR(ct->af, &ct->caddr),
+                             ntohs(ct->cport),
+                             IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
+                             ntohs(ct->vport),
+                             IP_VS_DBG_ADDR(ct->af, &ct->daddr),
+                             ntohs(ct->dport));
+
+               /*
+                * Invalidate the connection template: rewrite its ports
+                * (vport/dport 0xffff, cport 0) and hash it again under
+                * the new key.  A vport of 0xffff marks a template that
+                * was already invalidated, so it is skipped.
+                */
+               if (ct->vport != htons(0xffff)) {
+                       if (ip_vs_conn_unhash(ct)) {
+                               ct->dport = htons(0xffff);
+                               ct->vport = htons(0xffff);
+                               ct->cport = 0;
+                               ip_vs_conn_hash(ct);
+                       }
+               }
+
+               /*
+                * Simply decrease the refcnt of the template,
+                * don't restart its timer.
+                */
+               atomic_dec(&ct->refcnt);
+               return 0;
+       }
+       return 1;
+}
+
+/*
+ *     Timer callback of a connection entry; data is the entry's address
+ *     (ip_vs_conn_new registers it that way with setup_timer()).
+ *
+ *     The entry is freed when it controls no other connection, could be
+ *     unhashed and nobody else holds a reference.  Otherwise it is put
+ *     back through ip_vs_conn_put(), which re-arms the timer with the
+ *     60*HZ timeout assigned at the top of this function.
+ */
+static void ip_vs_conn_expire(unsigned long data)
+{
+       struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+
+       /* retry interval used if the entry cannot be freed right now */
+       cp->timeout = 60*HZ;
+
+       /*
+        *      hey, I'm using it
+        */
+       atomic_inc(&cp->refcnt);
+
+       /*
+        *      do I control anybody?
+        */
+       if (atomic_read(&cp->n_control))
+               goto expire_later;
+
+       /*
+        *      unhash it if it is hashed in the conn table
+        */
+       if (!ip_vs_conn_unhash(cp))
+               goto expire_later;
+
+       /*
+        *      refcnt==1 implies I'm the only one referrer
+        */
+       if (likely(atomic_read(&cp->refcnt) == 1)) {
+               /* delete the timer if it is activated by other users */
+               if (timer_pending(&cp->timer))
+                       del_timer(&cp->timer);
+
+               /* does anybody control me? */
+               if (cp->control)
+                       ip_vs_control_del(cp);
+
+               if (unlikely(cp->app != NULL))
+                       ip_vs_unbind_app(cp);
+               ip_vs_unbind_dest(cp);
+               /* keep the global counters in step with ip_vs_conn_new */
+               if (cp->flags & IP_VS_CONN_F_NO_CPORT)
+                       atomic_dec(&ip_vs_conn_no_cport_cnt);
+               atomic_dec(&ip_vs_conn_count);
+
+               kmem_cache_free(ip_vs_conn_cachep, cp);
+               return;
+       }
+
+       /* hash it back to the table */
+       ip_vs_conn_hash(cp);
+
+  expire_later:
+       IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
+                 atomic_read(&cp->refcnt)-1,
+                 atomic_read(&cp->n_control));
+
+       /* re-arms the timer with cp->timeout and hands the entry back */
+       ip_vs_conn_put(cp);
+}
+
+
+/*
+ *     Make a connection's timer fire as soon as possible: if
+ *     del_timer() reports that it removed the timer, re-arm it for
+ *     the current jiffies value.  Otherwise nothing is done.
+ */
+void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
+{
+       if (!del_timer(&cp->timer))
+               return;
+
+       mod_timer(&cp->timer, jiffies);
+}
+
+
+/*
+ *     Create a new connection entry and hash it into the ip_vs_conn_tab
+ *
+ *     af:             address family of the three addresses
+ *     proto:          protocol number, also used to look up the
+ *                     ip_vs_protocol for application binding
+ *     caddr/cport:    stored as cp->caddr / cp->cport
+ *     vaddr/vport:    stored as cp->vaddr / cp->vport
+ *     daddr/dport:    stored as cp->daddr / cp->dport
+ *     flags:          initial connection flags
+ *     dest:           destination to bind, may be NULL
+ *
+ *     Returns the new entry with refcnt set to 1, or NULL when the
+ *     allocation fails.  The allocation uses GFP_ATOMIC.
+ */
+struct ip_vs_conn *
+ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
+              const union nf_inet_addr *vaddr, __be16 vport,
+              const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
+              struct ip_vs_dest *dest)
+{
+       struct ip_vs_conn *cp;
+       struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+
+       cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
+       if (cp == NULL) {
+               IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
+               return NULL;
+       }
+
+       INIT_LIST_HEAD(&cp->c_list);
+       /* ip_vs_conn_expire receives cp as its data argument */
+       setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
+       cp->af             = af;
+       cp->protocol       = proto;
+       ip_vs_addr_copy(af, &cp->caddr, caddr);
+       cp->cport          = cport;
+       ip_vs_addr_copy(af, &cp->vaddr, vaddr);
+       cp->vport          = vport;
+       ip_vs_addr_copy(af, &cp->daddr, daddr);
+       cp->dport          = dport;
+       cp->flags          = flags;
+       spin_lock_init(&cp->lock);
+
+       /*
+        * Set the entry is referenced by the current thread before hashing
+        * it in the table, so that other thread run ip_vs_random_dropentry
+        * but cannot drop this entry.
+        */
+       atomic_set(&cp->refcnt, 1);
+
+       atomic_set(&cp->n_control, 0);
+       atomic_set(&cp->in_pkts, 0);
+
+       /* global accounting; undone in ip_vs_conn_expire */
+       atomic_inc(&ip_vs_conn_count);
+       if (flags & IP_VS_CONN_F_NO_CPORT)
+               atomic_inc(&ip_vs_conn_no_cport_cnt);
+
+       /* Bind the connection with a destination server */
+       ip_vs_bind_dest(cp, dest);
+
+       /* Set its state and timeout */
+       cp->state = 0;
+       cp->timeout = 3*HZ;
+
+       /* Bind its packet transmitter */
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               ip_vs_bind_xmit_v6(cp);
+       else
+#endif
+               ip_vs_bind_xmit(cp);
+
+       /* only bind an application when the protocol has any registered */
+       if (unlikely(pp && atomic_read(&pp->appcnt)))
+               ip_vs_bind_app(cp, pp);
+
+       /* Hash it in the ip_vs_conn_tab finally */
+       ip_vs_conn_hash(cp);
+
+       return cp;
+}
+
+
+/*
+ *     /proc/net/ip_vs_conn entries
+ */
+#ifdef CONFIG_PROC_FS
+
+/*
+ *     Return the connection at zero-based position pos, counting through
+ *     all buckets of ip_vs_conn_tab in index order, or NULL when the
+ *     table holds fewer entries.
+ *
+ *     On success the read lock of the entry's bucket is left held and
+ *     seq->private points at that bucket's list head; the lock is
+ *     released later by ip_vs_conn_seq_next or ip_vs_conn_seq_stop.
+ */
+static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
+{
+       int idx;
+       struct ip_vs_conn *cp;
+
+       for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+               ct_read_lock_bh(idx);
+               list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+                       if (pos-- == 0) {
+                               /* return with the bucket lock still held */
+                               seq->private = &ip_vs_conn_tab[idx];
+                               return cp;
+                       }
+               }
+               ct_read_unlock_bh(idx);
+       }
+
+       return NULL;
+}
+
+/*
+ *     seq_file start callback.  Position 0 yields SEQ_START_TOKEN (the
+ *     header line); position N > 0 yields connection N - 1 as found by
+ *     ip_vs_conn_array().  seq->private is reset first so that no
+ *     bucket is recorded as locked unless ip_vs_conn_array() sets it.
+ */
+static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
+{
+       seq->private = NULL;
+
+       if (!*pos)
+               return SEQ_START_TOKEN;
+
+       return ip_vs_conn_array(seq, *pos - 1);
+}
+
+/*
+ *     seq_file next callback: advance *pos and return the entry that
+ *     follows v, or NULL at the end of the table.
+ *
+ *     While iterating, seq->private points at the list head of the
+ *     bucket whose read lock is currently held.  Moving on to another
+ *     bucket releases that lock and takes the lock of the next
+ *     non-empty bucket; at the end seq->private is set to NULL with no
+ *     lock held.
+ */
+static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       struct ip_vs_conn *cp = v;
+       struct list_head *e, *l = seq->private;
+       int idx;
+
+       ++*pos;
+       if (v == SEQ_START_TOKEN)
+               return ip_vs_conn_array(seq, 0);
+
+       /* more on same hash chain? */
+       if ((e = cp->c_list.next) != l)
+               return list_entry(e, struct ip_vs_conn, c_list);
+
+       /* bucket index recovered from the list head's position in the table */
+       idx = l - ip_vs_conn_tab;
+       ct_read_unlock_bh(idx);
+
+       while (++idx < IP_VS_CONN_TAB_SIZE) {
+               ct_read_lock_bh(idx);
+               /* the loop body returns at once: only the first entry is used */
+               list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+                       seq->private = &ip_vs_conn_tab[idx];
+                       return cp;
+               }
+               ct_read_unlock_bh(idx);
+       }
+       seq->private = NULL;
+       return NULL;
+}
+
+/*
+ *     seq_file stop callback: release the read lock of the bucket that
+ *     seq->private points at, if any.
+ */
+static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
+{
+       struct list_head *bucket = seq->private;
+
+       if (!bucket)
+               return;
+
+       ct_read_unlock_bh(bucket - ip_vs_conn_tab);
+}
+
+/*
+ *     seq_file show callback for "ip_vs_conn": print the column header
+ *     for SEQ_START_TOKEN, otherwise one line for the connection v.
+ *     Ports are printed in host byte order as hex; IPv4 addresses as
+ *     8 hex digits, IPv6 addresses with NIP6_FMT.  The last column is
+ *     (timer.expires - jiffies) / HZ.  Always returns 0.
+ *
+ *     NOTE(review): the expiry column is printed with %lu, so if
+ *     timer.expires is already behind jiffies the difference presumably
+ *     wraps to a very large value - confirm.
+ */
+static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
+{
+
+       if (v == SEQ_START_TOKEN)
+               seq_puts(seq,
+   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
+       else {
+               const struct ip_vs_conn *cp = v;
+
+#ifdef CONFIG_IP_VS_IPV6
+               if (cp->af == AF_INET6)
+                       seq_printf(seq,
+                               "%-3s " NIP6_FMT " %04X " NIP6_FMT
+                               " %04X " NIP6_FMT " %04X %-11s %7lu\n",
+                               ip_vs_proto_name(cp->protocol),
+                               NIP6(cp->caddr.in6), ntohs(cp->cport),
+                               NIP6(cp->vaddr.in6), ntohs(cp->vport),
+                               NIP6(cp->daddr.in6), ntohs(cp->dport),
+                               ip_vs_state_name(cp->protocol, cp->state),
+                               (cp->timer.expires-jiffies)/HZ);
+               else
+#endif
+                       seq_printf(seq,
+                               "%-3s %08X %04X %08X %04X"
+                               " %08X %04X %-11s %7lu\n",
+                               ip_vs_proto_name(cp->protocol),
+                               ntohl(cp->caddr.ip), ntohs(cp->cport),
+                               ntohl(cp->vaddr.ip), ntohs(cp->vport),
+                               ntohl(cp->daddr.ip), ntohs(cp->dport),
+                               ip_vs_state_name(cp->protocol, cp->state),
+                               (cp->timer.expires-jiffies)/HZ);
+       }
+       return 0;
+}
+
+/* seq_file iterator used for the "ip_vs_conn" proc entry */
+static const struct seq_operations ip_vs_conn_seq_ops = {
+       .start = ip_vs_conn_seq_start,
+       .next  = ip_vs_conn_seq_next,
+       .stop  = ip_vs_conn_seq_stop,
+       .show  = ip_vs_conn_seq_show,
+};
+
+/* open handler: attach ip_vs_conn_seq_ops to the file via seq_open() */
+static int ip_vs_conn_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &ip_vs_conn_seq_ops);
+}
+
+/* file operations registered for "ip_vs_conn" in ip_vs_conn_init */
+static const struct file_operations ip_vs_conn_fops = {
+       .owner   = THIS_MODULE,
+       .open    = ip_vs_conn_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
+/*
+ *     Text for the "Origin" column: "SYNC" when IP_VS_CONN_F_SYNC is set
+ *     in flags, "LOCAL" otherwise.
+ */
+static const char *ip_vs_origin_name(unsigned flags)
+{
+       return (flags & IP_VS_CONN_F_SYNC) ? "SYNC" : "LOCAL";
+}
+
+/*
+ *     seq_file show callback for "ip_vs_conn_sync".  Same layout as
+ *     ip_vs_conn_seq_show plus an "Origin" column, produced by
+ *     ip_vs_origin_name(), placed before the expiry column.
+ *     Always returns 0.
+ */
+static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
+{
+
+       if (v == SEQ_START_TOKEN)
+               seq_puts(seq,
+   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");
+       else {
+               const struct ip_vs_conn *cp = v;
+
+#ifdef CONFIG_IP_VS_IPV6
+               if (cp->af == AF_INET6)
+                       seq_printf(seq,
+                               "%-3s " NIP6_FMT " %04X " NIP6_FMT
+                               " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n",
+                               ip_vs_proto_name(cp->protocol),
+                               NIP6(cp->caddr.in6), ntohs(cp->cport),
+                               NIP6(cp->vaddr.in6), ntohs(cp->vport),
+                               NIP6(cp->daddr.in6), ntohs(cp->dport),
+                               ip_vs_state_name(cp->protocol, cp->state),
+                               ip_vs_origin_name(cp->flags),
+                               (cp->timer.expires-jiffies)/HZ);
+               else
+#endif
+                       seq_printf(seq,
+                               "%-3s %08X %04X %08X %04X "
+                               "%08X %04X %-11s %-6s %7lu\n",
+                               ip_vs_proto_name(cp->protocol),
+                               ntohl(cp->caddr.ip), ntohs(cp->cport),
+                               ntohl(cp->vaddr.ip), ntohs(cp->vport),
+                               ntohl(cp->daddr.ip), ntohs(cp->dport),
+                               ip_vs_state_name(cp->protocol, cp->state),
+                               ip_vs_origin_name(cp->flags),
+                               (cp->timer.expires-jiffies)/HZ);
+       }
+       return 0;
+}
+
+/*
+ * seq_file iterator for "ip_vs_conn_sync": shares start/next/stop with
+ * ip_vs_conn_seq_ops and differs only in the show callback.
+ */
+static const struct seq_operations ip_vs_conn_sync_seq_ops = {
+       .start = ip_vs_conn_seq_start,
+       .next  = ip_vs_conn_seq_next,
+       .stop  = ip_vs_conn_seq_stop,
+       .show  = ip_vs_conn_sync_seq_show,
+};
+
+/* open handler: attach ip_vs_conn_sync_seq_ops to the file via seq_open() */
+static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &ip_vs_conn_sync_seq_ops);
+}
+
+/* file operations registered for "ip_vs_conn_sync" in ip_vs_conn_init */
+static const struct file_operations ip_vs_conn_sync_fops = {
+       .owner   = THIS_MODULE,
+       .open    = ip_vs_conn_sync_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
+#endif
+
+
+/*
+ *      Randomly drop connection entries before running out of memory
+ *
+ *      Decide whether the connection cp should be dropped.  Returns 1
+ *      to drop and 0 to keep.  An entry whose in_pkts count is i (1..8)
+ *      is dropped once per todrop_rate[i] calls made for that count;
+ *      entries with in_pkts of 0 or outside [0, 8] are never dropped.
+ */
+static inline int todrop_entry(struct ip_vs_conn *cp)
+{
+       /*
+        * The drop rate array needs tuning for real environments.
+        * Called from timer bh only => no locking
+        *
+        * Both tables are explicitly signed: the countdown below relies
+        * on the counter going negative.  With a plain char that is
+        * unsigned on the target, decrementing 0 would wrap to 255 and
+        * delay the first drop for each slot by 255 calls.
+        */
+       static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+       static signed char todrop_counter[9] = {0};
+       int i;
+
+       /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
+          This will leave enough time for normal connection to get
+          through.  (timer.expires is set to arm-time + cp->timeout by
+          ip_vs_conn_put, so this tests whether fewer than 60 seconds
+          have passed since the timer was last armed that way.) */
+       if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
+               return 0;
+
+       /* Don't drop the entry if its number of incoming packets is not
+          located in [0, 8] */
+       i = atomic_read(&cp->in_pkts);
+       if (i > 8 || i < 0) return 0;
+
+       /* rate 0 means "never drop" for this packet count */
+       if (!todrop_rate[i]) return 0;
+       /* count down; only when the counter runs out is an entry dropped */
+       if (--todrop_counter[i] > 0) return 0;
+
+       todrop_counter[i] = todrop_rate[i];
+       return 1;
+}
+
+/*
+ * Called from keventd and must protect itself from softirqs
+ *
+ * Picks IP_VS_CONN_TAB_SIZE/32 random buckets and, in each one, forces
+ * early expiry (ip_vs_conn_expire_now) of selected connections:
+ *  - templates are always skipped;
+ *  - TCP connections in SYN_RECV or SYNACK state are always expired;
+ *  - TCP connections in ESTABLISHED state and all non-TCP connections
+ *    are expired only when todrop_entry() says so;
+ *  - TCP connections in any other state are skipped.
+ * The controlling connection (cp->control), if any, is expired too.
+ */
+void ip_vs_random_dropentry(void)
+{
+       int idx;
+       struct ip_vs_conn *cp;
+
+       /*
+        * Randomly scan 1/32 of the whole table every second
+        */
+       for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
+               unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
+
+               /*
+                *  Lock is actually needed in this loop.
+                */
+               ct_write_lock_bh(hash);
+
+               list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+                       if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+                               /* connection template */
+                               continue;
+
+                       if (cp->protocol == IPPROTO_TCP) {
+                               switch(cp->state) {
+                               case IP_VS_TCP_S_SYN_RECV:
+                               case IP_VS_TCP_S_SYNACK:
+                                       /* always expired */
+                                       break;
+
+                               case IP_VS_TCP_S_ESTABLISHED:
+                                       if (todrop_entry(cp))
+                                               break;
+                                       continue;
+
+                               default:
+                                       continue;
+                               }
+                       } else {
+                               if (!todrop_entry(cp))
+                                       continue;
+                       }
+
+                       IP_VS_DBG(4, "del connection\n");
+                       ip_vs_conn_expire_now(cp);
+                       if (cp->control) {
+                               IP_VS_DBG(4, "del conn template\n");
+                               ip_vs_conn_expire_now(cp->control);
+                       }
+               }
+               ct_write_unlock_bh(hash);
+       }
+}
+
+
+/*
+ *      Flush all the connection entries in the ip_vs_conn_tab
+ *
+ *      Every hashed connection (and its controlling connection, if any)
+ *      is forced to expire via ip_vs_conn_expire_now().  The whole table
+ *      is rescanned, yielding the CPU in between, until ip_vs_conn_count
+ *      reaches zero.
+ */
+static void ip_vs_conn_flush(void)
+{
+       int idx;
+       struct ip_vs_conn *cp;
+
+  flush_again:
+       for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
+               /*
+                *  Lock is actually needed in this loop.
+                */
+               ct_write_lock_bh(idx);
+
+               list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+
+                       IP_VS_DBG(4, "del connection\n");
+                       ip_vs_conn_expire_now(cp);
+                       if (cp->control) {
+                               IP_VS_DBG(4, "del conn template\n");
+                               ip_vs_conn_expire_now(cp->control);
+                       }
+               }
+               ct_write_unlock_bh(idx);
+       }
+
+       /* the counter may be non-zero, because maybe some conn entries
+          are run by slow timer handler or unhashed but still referred */
+       if (atomic_read(&ip_vs_conn_count) != 0) {
+               schedule();
+               goto flush_again;
+       }
+}
+
+
+/*
+ *      Set up the connection subsystem: allocate the hash table and the
+ *      ip_vs_conn slab cache, initialise the bucket list heads and the
+ *      lock array, register the two proc entries and seed the hash
+ *      random value.
+ *
+ *      Returns 0 on success or -ENOMEM when either allocation fails
+ *      (the table is freed again if only the cache creation fails).
+ */
+int __init ip_vs_conn_init(void)
+{
+       int idx;
+
+       /*
+        * Allocate the connection hash table and initialize its list heads
+        */
+       ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
+       if (!ip_vs_conn_tab)
+               return -ENOMEM;
+
+       /* Allocate ip_vs_conn slab cache */
+       ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+                                             sizeof(struct ip_vs_conn), 0,
+                                             SLAB_HWCACHE_ALIGN, NULL);
+       if (!ip_vs_conn_cachep) {
+               vfree(ip_vs_conn_tab);
+               return -ENOMEM;
+       }
+
+       IP_VS_INFO("Connection hash table configured "
+                  "(size=%d, memory=%ldKbytes)\n",
+                  IP_VS_CONN_TAB_SIZE,
+                  (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
+       IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
+                 sizeof(struct ip_vs_conn));
+
+       for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+               INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
+       }
+
+       for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
+               rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
+       }
+
+       /* NOTE(review): the results of both registrations are ignored, so
+          a failure to create a proc entry goes unnoticed - confirm that
+          this is intended */
+       proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
+       proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+
+       /* calculate the random value for connection hash */
+       get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
+
+       return 0;
+}
+
+
+/*
+ *      Tear down what ip_vs_conn_init set up: flush every connection,
+ *      destroy the slab cache, remove both proc entries and free the
+ *      hash table.
+ */
+void ip_vs_conn_cleanup(void)
+{
+       /* flush all the connection entries first */
+       ip_vs_conn_flush();
+
+       /* Release the empty cache */
+       kmem_cache_destroy(ip_vs_conn_cachep);
+       proc_net_remove(&init_net, "ip_vs_conn");
+       proc_net_remove(&init_net, "ip_vs_conn_sync");
+       vfree(ip_vs_conn_tab);
+}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c

new file mode 100644 (file)

index 0000000..958abf3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -0,0 +1,1542 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others.
+ *
+ * Changes:
+ *     Paul `Rusty' Russell            properly handle non-linear skbs
+ *     Harald Welte                    don't use nfcache
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/icmp.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>                   /* for icmp_send */
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <net/ip6_checksum.h>           /* for csum_ipv6_magic */
+#include <linux/netfilter_ipv6.h>
+#endif
+
+#include <net/ip_vs.h>
+
+
+/*
+ * Symbols exported from IPVS for use by other kernel modules.  Two of
+ * them are only exported when the matching config option is enabled.
+ */
+EXPORT_SYMBOL(register_ip_vs_scheduler);
+EXPORT_SYMBOL(unregister_ip_vs_scheduler);
+EXPORT_SYMBOL(ip_vs_skb_replace);
+EXPORT_SYMBOL(ip_vs_proto_name);
+EXPORT_SYMBOL(ip_vs_conn_new);
+EXPORT_SYMBOL(ip_vs_conn_in_get);
+EXPORT_SYMBOL(ip_vs_conn_out_get);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
+#endif
+EXPORT_SYMBOL(ip_vs_conn_put);
+#ifdef CONFIG_IP_VS_DEBUG
+EXPORT_SYMBOL(ip_vs_get_debug_level);
+#endif
+
+
+/* ID used in ICMP lookups (arguments are parenthesized for safe expansion) */
+#define icmp_id(icmph)          (((icmph)->un).echo.id)
+#define icmpv6_id(icmph)        ((icmph)->icmp6_dataun.u_echo.identifier)
+
+/*
+ * Map an IP protocol number to a printable name.
+ *
+ * Known protocols yield a string literal.  Any other value is formatted
+ * as "IP_<number>" into a single static buffer, so that result is
+ * overwritten by the next call that takes the default branch and the
+ * function is not reentrant for unknown protocols.
+ */
+const char *ip_vs_proto_name(unsigned proto)
+{
+       static char buf[20];
+
+       switch (proto) {
+       case IPPROTO_IP:
+               return "IP";
+       case IPPROTO_UDP:
+               return "UDP";
+       case IPPROTO_TCP:
+               return "TCP";
+       case IPPROTO_ICMP:
+               return "ICMP";
+#ifdef CONFIG_IP_VS_IPV6
+       case IPPROTO_ICMPV6:
+               return "ICMPv6";
+#endif
+       default:
+               /* proto is unsigned: print it with %u and bound the write */
+               snprintf(buf, sizeof(buf), "IP_%u", proto);
+               return buf;
+       }
+}
+
+/*
+ * Initialize each of the `rows' list heads in `table' as an empty list,
+ * walking from the last slot down to the first.  Nothing is touched when
+ * `rows' is zero or negative.
+ */
+void ip_vs_init_hash_table(struct list_head *table, int rows)
+{
+       int i;
+
+       for (i = rows - 1; i >= 0; i--)
+               INIT_LIST_HEAD(&table[i]);
+}
+
+/*
+ * Account one incoming packet for connection cp: bump the inpkts and
+ * inbytes counters of the destination, of the destination's service and
+ * of the global ip_vs_stats, each under its own stats lock.  Nothing is
+ * counted when cp has no destination or the destination does not carry
+ * IP_VS_DEST_F_AVAILABLE.
+ */
+static inline void
+ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+       struct ip_vs_dest *dest = cp->dest;
+
+       if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE))
+               return;
+
+       /* per-destination counters */
+       spin_lock(&dest->stats.lock);
+       dest->stats.ustats.inpkts++;
+       dest->stats.ustats.inbytes += skb->len;
+       spin_unlock(&dest->stats.lock);
+
+       /* per-service counters */
+       spin_lock(&dest->svc->stats.lock);
+       dest->svc->stats.ustats.inpkts++;
+       dest->svc->stats.ustats.inbytes += skb->len;
+       spin_unlock(&dest->svc->stats.lock);
+
+       /* global counters */
+       spin_lock(&ip_vs_stats.lock);
+       ip_vs_stats.ustats.inpkts++;
+       ip_vs_stats.ustats.inbytes += skb->len;
+       spin_unlock(&ip_vs_stats.lock);
+}
+
+
+/*
+ * Account one outgoing packet for connection cp: bump the outpkts and
+ * outbytes counters of the destination, of the destination's service and
+ * of the global ip_vs_stats, each under its own stats lock.  Nothing is
+ * counted when cp has no destination or the destination does not carry
+ * IP_VS_DEST_F_AVAILABLE.
+ */
+static inline void
+ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+       struct ip_vs_dest *dest = cp->dest;
+
+       if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE))
+               return;
+
+       /* per-destination counters */
+       spin_lock(&dest->stats.lock);
+       dest->stats.ustats.outpkts++;
+       dest->stats.ustats.outbytes += skb->len;
+       spin_unlock(&dest->stats.lock);
+
+       /* per-service counters */
+       spin_lock(&dest->svc->stats.lock);
+       dest->svc->stats.ustats.outpkts++;
+       dest->svc->stats.ustats.outbytes += skb->len;
+       spin_unlock(&dest->svc->stats.lock);
+
+       /* global counters */
+       spin_lock(&ip_vs_stats.lock);
+       ip_vs_stats.ustats.outpkts++;
+       ip_vs_stats.ustats.outbytes += skb->len;
+       spin_unlock(&ip_vs_stats.lock);
+}
+
+
+/*
+ * Account one new connection: increment the conns counter of cp's
+ * destination, of the service svc and of the global ip_vs_stats, each
+ * under its own stats lock.  cp->dest is dereferenced unconditionally.
+ */
+static inline void
+ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
+{
+       struct ip_vs_dest *dest = cp->dest;
+
+       /* per-destination counter */
+       spin_lock(&dest->stats.lock);
+       dest->stats.ustats.conns++;
+       spin_unlock(&dest->stats.lock);
+
+       /* per-service counter */
+       spin_lock(&svc->stats.lock);
+       svc->stats.ustats.conns++;
+       spin_unlock(&svc->stats.lock);
+
+       /* global counter */
+       spin_lock(&ip_vs_stats.lock);
+       ip_vs_stats.ustats.conns++;
+       spin_unlock(&ip_vs_stats.lock);
+}
+
+
+/*
+ * Run the protocol's state_transition() hook for this packet and return
+ * its result.  A protocol without such a hook yields 0.
+ */
+static inline int
+ip_vs_set_state(struct ip_vs_conn *cp, int direction,
+               const struct sk_buff *skb,
+               struct ip_vs_protocol *pp)
+{
+       if (likely(pp->state_transition))
+               return pp->state_transition(cp, direction, skb, pp);
+
+       return 0;
+}
+
+
+/*
+ *  IPVS persistent scheduling function
+ *  It creates a connection entry according to its template if exists,
+ *  or selects a server and creates a connection entry plus a template.
+ *  Locking: we are svc user (svc->refcnt), so we hold all dests too
+ *  Protocols supported: TCP, UDP
+ *
+ *  svc:   the virtual service the packet is scheduled for
+ *  skb:   the packet; its network header supplies addresses and protocol
+ *  ports: ports[0] is the packet's source port, ports[1] its destination
+ *         port, both in network byte order
+ *
+ *  Returns the new connection entry, or NULL when no template applies,
+ *  the scheduler finds no destination, or ip_vs_conn_new() fails.
+ */
+static struct ip_vs_conn *
+ip_vs_sched_persist(struct ip_vs_service *svc,
+                   const struct sk_buff *skb,
+                   __be16 ports[2])
+{
+       struct ip_vs_conn *cp = NULL;
+       struct ip_vs_iphdr iph;
+       struct ip_vs_dest *dest;
+       struct ip_vs_conn *ct;
+       __be16  dport;                  /* destination port to forward */
+       union nf_inet_addr snet;        /* source network of the client,
+                                          after masking */
+
+       ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+
+       /* Mask saddr with the netmask to adjust template granularity */
+#ifdef CONFIG_IP_VS_IPV6
+       if (svc->af == AF_INET6)
+               ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
+       else
+#endif
+               snet.ip = iph.saddr.ip & svc->netmask;
+
+       IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
+                     "mnet %s\n",
+                     IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
+                     IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
+                     IP_VS_DBG_ADDR(svc->af, &snet));
+
+       /*
+        * As far as we know, FTP is a very complicated network protocol, and
+        * it uses control connection and data connections. For active FTP,
+        * FTP server initialize data connection to the client, its source port
+        * is often 20. For passive FTP, FTP server tells the clients the port
+        * that it passively listens to,  and the client issues the data
+        * connection. In the tunneling or direct routing mode, the load
+        * balancer is on the client-to-server half of connection, the port
+        * number is unknown to the load balancer. So, a conn template like
+        * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
+        * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
+        * is created for other persistent services.
+        */
+       if (ports[1] == svc->port) {
+               /* Check if a template already exists */
+               if (svc->port != FTPPORT)
+                       ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
+                                            &iph.daddr, ports[1]);
+               else
+                       ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
+                                            &iph.daddr, 0);
+
+               if (!ct || !ip_vs_check_template(ct)) {
+                       /*
+                        * No template found or the dest of the connection
+                        * template is not available.
+                        */
+                       dest = svc->scheduler->schedule(svc, skb);
+                       if (dest == NULL) {
+                               IP_VS_DBG(1, "p-schedule: no dest found.\n");
+                               return NULL;
+                       }
+
+                       /*
+                        * Create a template like <protocol,caddr,0,
+                        * vaddr,vport,daddr,dport> for non-ftp service,
+                        * and <protocol,caddr,0,vaddr,0,daddr,0>
+                        * for ftp service.
+                        */
+                       if (svc->port != FTPPORT)
+                               ct = ip_vs_conn_new(svc->af, iph.protocol,
+                                                   &snet, 0,
+                                                   &iph.daddr,
+                                                   ports[1],
+                                                   &dest->addr, dest->port,
+                                                   IP_VS_CONN_F_TEMPLATE,
+                                                   dest);
+                       else
+                               ct = ip_vs_conn_new(svc->af, iph.protocol,
+                                                   &snet, 0,
+                                                   &iph.daddr, 0,
+                                                   &dest->addr, 0,
+                                                   IP_VS_CONN_F_TEMPLATE,
+                                                   dest);
+                       if (ct == NULL)
+                               return NULL;
+
+                       /* a fresh template takes its timeout from the service */
+                       ct->timeout = svc->timeout;
+               } else {
+                       /* set destination with the found template */
+                       dest = ct->dest;
+               }
+               /* forward to the port configured on the destination */
+               dport = dest->port;
+       } else {
+               /*
+                * Note: persistent fwmark-based services and persistent
+                * port zero service are handled here.
+                * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+                * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
+                */
+               if (svc->fwmark) {
+                       /* the fwmark stands in for the virtual address */
+                       union nf_inet_addr fwmark = {
+                               .all = { 0, 0, 0, htonl(svc->fwmark) }
+                       };
+
+                       ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
+                                            &fwmark, 0);
+               } else
+                       ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
+                                            &iph.daddr, 0);
+
+               if (!ct || !ip_vs_check_template(ct)) {
+                       /*
+                        * If it is not persistent port zero, return NULL,
+                        * otherwise create a connection template.
+                        */
+                       if (svc->port)
+                               return NULL;
+
+                       dest = svc->scheduler->schedule(svc, skb);
+                       if (dest == NULL) {
+                               IP_VS_DBG(1, "p-schedule: no dest found.\n");
+                               return NULL;
+                       }
+
+                       /*
+                        * Create a template according to the service
+                        */
+                       if (svc->fwmark) {
+                               union nf_inet_addr fwmark = {
+                                       .all = { 0, 0, 0, htonl(svc->fwmark) }
+                               };
+
+                               ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
+                                                   &snet, 0,
+                                                   &fwmark, 0,
+                                                   &dest->addr, 0,
+                                                   IP_VS_CONN_F_TEMPLATE,
+                                                   dest);
+                       } else
+                               ct = ip_vs_conn_new(svc->af, iph.protocol,
+                                                   &snet, 0,
+                                                   &iph.daddr, 0,
+                                                   &dest->addr, 0,
+                                                   IP_VS_CONN_F_TEMPLATE,
+                                                   dest);
+                       if (ct == NULL)
+                               return NULL;
+
+                       /* a fresh template takes its timeout from the service */
+                       ct->timeout = svc->timeout;
+               } else {
+                       /* set destination with the found template */
+                       dest = ct->dest;
+               }
+               /* keep the destination port the packet was addressed to */
+               dport = ports[1];
+       }
+
+       /*
+        *    Create a new connection according to the template
+        */
+       cp = ip_vs_conn_new(svc->af, iph.protocol,
+                           &iph.saddr, ports[0],
+                           &iph.daddr, ports[1],
+                           &dest->addr, dport,
+                           0,
+                           dest);
+       if (cp == NULL) {
+               /* give back the template obtained or created above */
+               ip_vs_conn_put(ct);
+               return NULL;
+       }
+
+       /*
+        *    Add its control
+        */
+       ip_vs_control_add(cp, ct);
+       /*
+        * NOTE(review): presumably ip_vs_control_add() accounts for cp's
+        * link to the template on its own, so the put below only drops the
+        * reference held by this function - confirm against ip_vs_conn.c.
+        */
+       ip_vs_conn_put(ct);
+
+       ip_vs_conn_stats(cp, svc);
+       return cp;
+}
+
+
+/*
+ *  IPVS main scheduling function
+ *  It selects a server according to the virtual service, and
+ *  creates a connection entry.
+ *  Protocols supported: TCP, UDP
+ *
+ *  svc: the virtual service the packet is scheduled for
+ *  skb: the packet; the two 16-bit ports are read right after the
+ *       network header (source port first, then destination port)
+ *
+ *  Returns the new connection entry, or NULL when the ports cannot be
+ *  read, the packet does not match the service port, the scheduler finds
+ *  no destination, or ip_vs_conn_new() fails.  Persistent services are
+ *  handed over to ip_vs_sched_persist().
+ */
+struct ip_vs_conn *
+ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+       struct ip_vs_conn *cp = NULL;
+       struct ip_vs_iphdr iph;
+       struct ip_vs_dest *dest;
+       __be16 _ports[2], *pptr;
+
+       ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+       /* _ports is only the bounce buffer; always go through pptr */
+       pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+       if (pptr == NULL)
+               return NULL;
+
+       /*
+        *    Persistent service
+        */
+       if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+               return ip_vs_sched_persist(svc, skb, pptr);
+
+       /*
+        *    Non-persistent service
+        */
+       if (!svc->fwmark && pptr[1] != svc->port) {
+               if (!svc->port)
+                       IP_VS_ERR("Schedule: port zero only supported "
+                                 "in persistent services, "
+                                 "check your ipvs configuration\n");
+               return NULL;
+       }
+
+       dest = svc->scheduler->schedule(svc, skb);
+       if (dest == NULL) {
+               IP_VS_DBG(1, "Schedule: no dest found.\n");
+               return NULL;
+       }
+
+       /*
+        *    Create a connection entry.
+        *    A destination port of zero means: keep the packet's own
+        *    destination port.
+        */
+       cp = ip_vs_conn_new(svc->af, iph.protocol,
+                           &iph.saddr, pptr[0],
+                           &iph.daddr, pptr[1],
+                           &dest->addr, dest->port ? dest->port : pptr[1],
+                           0,
+                           dest);
+       if (cp == NULL)
+               return NULL;
+
+       IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
+                     "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
+                     ip_vs_fwd_tag(cp),
+                     IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
+                     IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
+                     IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
+                     cp->flags, atomic_read(&cp->refcnt));
+
+       ip_vs_conn_stats(cp, svc);
+       return cp;
+}
+
+
+/*
+ *  Pass or drop the packet.
+ *  Called by ip_vs_in, when the virtual service is available but
+ *  no destination is available for a new connection.
+ *
+ *  The reference on svc is released here on every path.  The address
+ *  family is copied into a local up front so that svc is never
+ *  dereferenced after ip_vs_service_put().
+ *
+ *  Returns a netfilter verdict: NF_DROP, NF_ACCEPT, or whatever the
+ *  connection's packet_xmit() returns on the cache_bypass path.
+ */
+int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
+               struct ip_vs_protocol *pp)
+{
+       __be16 _ports[2], *pptr;
+       struct ip_vs_iphdr iph;
+       int unicast;
+       int af = svc->af;       /* still needed after the service is put */
+
+       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+       pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+       if (pptr == NULL) {
+               ip_vs_service_put(svc);
+               return NF_DROP;
+       }
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
+       else
+#endif
+               unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
+
+       /* if it is fwmark-based service, the cache_bypass sysctl is up
+          and the destination is a non-local unicast, then create
+          a cache_bypass connection entry */
+       if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
+               int ret;
+               struct ip_vs_conn *cp;
+               union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
+
+               ip_vs_service_put(svc);
+
+               /* create a new connection entry */
+               IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
+               cp = ip_vs_conn_new(af, iph.protocol,
+                                   &iph.saddr, pptr[0],
+                                   &iph.daddr, pptr[1],
+                                   &daddr, 0,
+                                   IP_VS_CONN_F_BYPASS,
+                                   NULL);
+               if (cp == NULL)
+                       return NF_DROP;
+
+               /* statistics */
+               ip_vs_in_stats(cp, skb);
+
+               /* set state */
+               ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+
+               /* transmit the first SYN packet */
+               ret = cp->packet_xmit(skb, cp, pp);
+               /* do not touch skb anymore */
+
+               atomic_inc(&cp->in_pkts);
+               ip_vs_conn_put(cp);
+               return ret;
+       }
+
+       /*
+        * When the virtual ftp service is presented, packets destined
+        * for other services on the VIP may get here (except services
+        * listed in the ipvs table), pass the packets, because it is
+        * not ipvs job to decide to drop the packets.
+        */
+       if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
+               ip_vs_service_put(svc);
+               return NF_ACCEPT;
+       }
+
+       ip_vs_service_put(svc);
+
+       /*
+        * Notify the client that the destination is unreachable, and
+        * release the socket buffer.
+        * Since it is in IP layer, the TCP socket is not actually
+        * created, the TCP RST packet cannot be sent, instead that
+        * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
+        */
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0,
+                           skb->dev);
+       else
+#endif
+               icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+       return NF_DROP;
+}
+
+
+/*
+ *      Netfilter hook for the NF_INET_POST_ROUTING chain, registered
+ *      ahead of NF_IP_PRI_NAT_SRC and used for VS/NAT.
+ *      Packets that IPVS marked with skb->ipvs_property stop traversing
+ *      the chain here, so that iptable_nat does not mangle them; every
+ *      other packet is accepted and continues down the chain.
+ */
+static unsigned int ip_vs_post_routing(unsigned int hooknum,
+                                      struct sk_buff *skb,
+                                      const struct net_device *in,
+                                      const struct net_device *out,
+                                      int (*okfn)(struct sk_buff *))
+{
+       /* The packet was sent from IPVS, exit this chain */
+       if (skb->ipvs_property)
+               return NF_STOP;
+
+       return NF_ACCEPT;
+}
+
+/*
+ * Checksum the packet data from `offset' to the end of the skb and
+ * return the folded 16-bit result.
+ */
+__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
+{
+       __wsum sum = skb_checksum(skb, offset, skb->len - offset, 0);
+
+       return csum_fold(sum);
+}
+
+/*
+ * Hand the skb to ip_defrag().  When that returns 0 the IPv4 header
+ * checksum is recomputed; any other return value is passed back to the
+ * caller untouched.
+ */
+static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+       int err;
+
+       err = ip_defrag(skb, user);
+       if (err)
+               return err;
+
+       ip_send_check(ip_hdr(skb));
+       return 0;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * IPv6 placeholder for ip_vs_gather_frags(): no reassembly is done, the
+ * arguments are ignored and 0 is always returned.
+ */
+static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
+{
+       /* TODO IPv6: Find out what to do here for IPv6 */
+       return 0;
+}
+#endif
+
+/*
+ * Rewrite an IPv4 ICMP message for VS/NAT: fix up the outer IP header,
+ * the IP header quoted inside the ICMP payload and, for quoted TCP/UDP,
+ * one port, then recompute the checksums.
+ * The caller must already have made the packet sufficiently writable.
+ * - inout: 1=in->out, 0=out->in
+ */
+void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
+                   struct ip_vs_conn *cp, int inout)
+{
+       struct iphdr *outer = ip_hdr(skb);
+       unsigned int icmp_off = outer->ihl * 4;
+       struct icmphdr *icmph;
+       struct iphdr *inner;
+
+       icmph = (struct icmphdr *)(skb_network_header(skb) + icmp_off);
+       inner = (struct iphdr *)(icmph + 1);
+
+       /* the quoted header carries the addresses in reverse direction */
+       if (inout) {
+               outer->saddr = cp->vaddr.ip;
+               inner->daddr = cp->vaddr.ip;
+       } else {
+               outer->daddr = cp->daddr.ip;
+               inner->saddr = cp->daddr.ip;
+       }
+       ip_send_check(outer);
+       ip_send_check(inner);
+
+       /* the TCP/UDP port */
+       if (inner->protocol == IPPROTO_TCP || inner->protocol == IPPROTO_UDP) {
+               __be16 *ports = (void *)inner + inner->ihl * 4;
+
+               if (inout)
+                       ports[1] = cp->vport;
+               else
+                       ports[0] = cp->dport;
+       }
+
+       /* And finally the ICMP checksum */
+       icmph->checksum = 0;
+       icmph->checksum = ip_vs_checksum_complete(skb, icmp_off);
+       skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+       if (inout)
+               IP_VS_DBG_PKT(11, pp, skb, (void *)inner - (void *)outer,
+                       "Forwarding altered outgoing ICMP");
+       else
+               IP_VS_DBG_PKT(11, pp, skb, (void *)inner - (void *)outer,
+                       "Forwarding altered incoming ICMP");
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * IPv6 counterpart of ip_vs_nat_icmp(): rewrite the addresses of the
+ * outer IPv6 header and of the IPv6 header quoted inside the ICMPv6
+ * message, patch one port for quoted TCP/UDP, then recompute and store
+ * the ICMPv6 checksum.
+ * - inout: 1=in->out, 0=out->in
+ * NOTE(review): the headers are accessed through direct pointers at
+ * fixed offsets, so this presumably relies on the caller having made
+ * the data writable and on there being no extension headers - confirm.
+ */
+void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
+                   struct ip_vs_conn *cp, int inout)
+{
+       struct ipv6hdr *iph      = ipv6_hdr(skb);
+       unsigned int icmp_offset = sizeof(struct ipv6hdr);
+       unsigned int icmp_len    = skb->len - icmp_offset;
+       struct icmp6hdr *icmph   = (struct icmp6hdr *)(skb_network_header(skb) +
+                                                     icmp_offset);
+       struct ipv6hdr *ciph     = (struct ipv6hdr *)(icmph + 1);
+
+       if (inout) {
+               iph->saddr = cp->vaddr.in6;
+               ciph->daddr = cp->vaddr.in6;
+       } else {
+               iph->daddr = cp->daddr.in6;
+               ciph->saddr = cp->daddr.in6;
+       }
+
+       /* the TCP/UDP port */
+       if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) {
+               __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
+
+               if (inout)
+                       ports[1] = cp->vport;
+               else
+                       ports[0] = cp->dport;
+       }
+
+       /*
+        * And finally the ICMP checksum.  Unlike ICMPv4, the ICMPv6
+        * checksum also covers an IPv6 pseudo-header (addresses, length
+        * and next-header value), so fold that in on top of the sum over
+        * the message itself, using the already rewritten outer addresses,
+        * and store the result in the header.
+        */
+       icmph->icmp6_cksum = 0;
+       icmph->icmp6_cksum = csum_ipv6_magic(&iph->saddr, &iph->daddr,
+                                            icmp_len, IPPROTO_ICMPV6,
+                                            skb_checksum(skb, icmp_offset,
+                                                         icmp_len, 0));
+       skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+       if (inout)
+               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+                       "Forwarding altered outgoing ICMPv6");
+       else
+               IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+                       "Forwarding altered incoming ICMPv6");
+}
+#endif
+
+/* Handle relevant response ICMP messages - forward to the right
+ * destination host. Used for NAT and local client.
+ *
+ * af:       address family of the packet (AF_INET or AF_INET6)
+ * snet:     source address, only used in the failed-checksum debug message
+ * protocol: protocol of the packet quoted inside the ICMP message
+ * cp:       connection the quoted packet belongs to; its reference is
+ *           dropped here with __ip_vs_conn_put() on every path
+ * offset:   offset handed in by the caller; the callers in this file pass
+ *           the offset just past the quoted IP header
+ * ihl:      offset at which checksum verification starts; the callers in
+ *           this file pass the length of the outer IP header
+ *
+ * Returns NF_ACCEPT after a successful rewrite, NF_DROP otherwise.
+ */
+static int handle_response_icmp(int af, struct sk_buff *skb,
+                               union nf_inet_addr *snet,
+                               __u8 protocol, struct ip_vs_conn *cp,
+                               struct ip_vs_protocol *pp,
+                               unsigned int offset, unsigned int ihl)
+{
+       unsigned int verdict = NF_DROP;
+
+       /* only logged: processing continues even for a non-zero method */
+       if (IP_VS_FWD_METHOD(cp) != 0) {
+               IP_VS_ERR("shouldn't reach here, because the box is on the "
+                         "half connection in the tun/dr module.\n");
+       }
+
+       /* Ensure the checksum is correct */
+       if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
+               /* Failed checksum! */
+               IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
+                             IP_VS_DBG_ADDR(af, snet));
+               goto out;
+       }
+
+       /* for TCP/UDP the two 16-bit ports must be writable as well */
+       if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol)
+               offset += 2 * sizeof(__u16);
+       if (!skb_make_writable(skb, offset))
+               goto out;
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               ip_vs_nat_icmp_v6(skb, pp, cp, 1);
+       else
+#endif
+               ip_vs_nat_icmp(skb, pp, cp, 1);
+
+       /* do the statistics and put it back */
+       ip_vs_out_stats(cp, skb);
+
+       skb->ipvs_property = 1;
+       verdict = NF_ACCEPT;
+
+out:
+       __ip_vs_conn_put(cp);
+
+       return verdict;
+}
+
+/*
+ *     Handle ICMP messages in the inside-to-outside direction (outgoing).
+ *     Find any that might be relevant, check against existing connections.
+ *     Currently handles error types - unreachable, quench, ttl exceeded.
+ *
+ *     *related is set to 1 on entry and reset to 0 only when the ICMP
+ *     type is none of the handled error types.
+ *     Returns a netfilter verdict (NF_ACCEPT, NF_DROP or NF_STOLEN), or
+ *     the result of handle_response_icmp() when a connection matches.
+ */
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
+{
+       struct iphdr *iph;
+       struct icmphdr  _icmph, *ic;
+       struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
+       struct ip_vs_iphdr ciph;
+       struct ip_vs_conn *cp;
+       struct ip_vs_protocol *pp;
+       unsigned int offset, ihl;
+       union nf_inet_addr snet;
+
+       *related = 1;
+
+       /* reassemble IP fragments */
+       if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+               if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+                       return NF_STOLEN;
+       }
+
+       /* (re)load the header pointer only after the defragmentation step */
+       iph = ip_hdr(skb);
+       offset = ihl = iph->ihl * 4;
+       ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+       if (ic == NULL)
+               return NF_DROP;
+
+       IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
+                 ic->type, ntohs(icmp_id(ic)),
+                 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+
+       /*
+        * Work through seeing if this is for us.
+        * These checks are supposed to be in an order that means easy
+        * things are checked first to speed up processing.... however
+        * this means that some packets will manage to get a long way
+        * down this stack and then be rejected, but that's life.
+        */
+       if ((ic->type != ICMP_DEST_UNREACH) &&
+           (ic->type != ICMP_SOURCE_QUENCH) &&
+           (ic->type != ICMP_TIME_EXCEEDED)) {
+               *related = 0;
+               return NF_ACCEPT;
+       }
+
+       /* Now find the contained IP header */
+       offset += sizeof(_icmph);
+       cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+       if (cih == NULL)
+               return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+       pp = ip_vs_proto_get(cih->protocol);
+       if (!pp)
+               return NF_ACCEPT;
+
+       /* Is the embedded protocol header present? */
+       if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
+                    pp->dont_defrag))
+               return NF_ACCEPT;
+
+       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
+
+       /* step over the quoted IP header */
+       offset += cih->ihl * 4;
+
+       ip_vs_fill_iphdr(AF_INET, cih, &ciph);
+       /* The embedded headers contain source and dest in reverse order */
+       cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
+       if (!cp)
+               return NF_ACCEPT;
+
+       snet.ip = iph->saddr;
+       return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
+                                   pp, offset, ihl);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ *     IPv6 counterpart of ip_vs_out_icmp(): handle outgoing ICMPv6 error
+ *     messages (destination unreachable, packet too big, time exceeded)
+ *     and check the quoted packet against existing connections.
+ *
+ *     *related is set to 1 on entry and reset to 0 only when the ICMPv6
+ *     type is none of the handled error types.
+ *     Returns a netfilter verdict (NF_ACCEPT, NF_DROP or NF_STOLEN), or
+ *     the result of handle_response_icmp() when a connection matches.
+ *
+ *     NOTE(review): all offsets assume a bare struct ipv6hdr directly
+ *     followed by the next protocol, i.e. extension headers do not appear
+ *     to be skipped - confirm.
+ */
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
+{
+       struct ipv6hdr *iph;
+       struct icmp6hdr _icmph, *ic;
+       struct ipv6hdr  _ciph, *cih;    /* The ip header contained
+                                          within the ICMP */
+       struct ip_vs_iphdr ciph;
+       struct ip_vs_conn *cp;
+       struct ip_vs_protocol *pp;
+       unsigned int offset;
+       union nf_inet_addr snet;
+
+       *related = 1;
+
+       /* reassemble IP fragments */
+       if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+               if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
+                       return NF_STOLEN;
+       }
+
+       iph = ipv6_hdr(skb);
+       offset = sizeof(struct ipv6hdr);
+       ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+       if (ic == NULL)
+               return NF_DROP;
+
+       IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
+                 ic->icmp6_type, ntohs(icmpv6_id(ic)),
+                 NIP6(iph->saddr), NIP6(iph->daddr));
+
+       /*
+        * Work through seeing if this is for us.
+        * These checks are supposed to be in an order that means easy
+        * things are checked first to speed up processing.... however
+        * this means that some packets will manage to get a long way
+        * down this stack and then be rejected, but that's life.
+        */
+       if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
+           (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
+           (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
+               *related = 0;
+               return NF_ACCEPT;
+       }
+
+       /* Now find the contained IP header */
+       offset += sizeof(_icmph);
+       cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+       if (cih == NULL)
+               return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+       pp = ip_vs_proto_get(cih->nexthdr);
+       if (!pp)
+               return NF_ACCEPT;
+
+       /* Is the embedded protocol header present? */
+       /* TODO: we don't support fragmentation at the moment anyways */
+       if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
+               return NF_ACCEPT;
+
+       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
+
+       /* step over the quoted IPv6 header */
+       offset += sizeof(struct ipv6hdr);
+
+       ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
+       /* The embedded headers contain source and dest in reverse order */
+       cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
+       if (!cp)
+               return NF_ACCEPT;
+
+       ipv6_addr_copy(&snet.in6, &iph->saddr);
+       return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
+                                   pp, offset, sizeof(struct ipv6hdr));
+}
+#endif
+
+/*
+ * Return the RST flag of the TCP header found nh_len bytes into the
+ * skb, or 0 when that header cannot be read.
+ */
+static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
+{
+       struct tcphdr _tcph;
+       struct tcphdr *th;
+
+       th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
+
+       return th ? th->rst : 0;
+}
+
+/* Handle response packets: rewrite addresses and send away...
+ * Used for NAT and local client.
+ *
+ * af:  address family of the packet (AF_INET or AF_INET6)
+ * pp:  protocol handler; its snat_handler, if any, mangles the packet
+ * cp:  connection the packet belongs to; its reference is dropped here
+ *      with ip_vs_conn_put() on every path
+ * ihl: number of leading bytes that must be made writable
+ *
+ * Returns NF_ACCEPT after a successful rewrite.  On failure the skb is
+ * freed here and NF_STOLEN is returned.
+ */
+static unsigned int
+handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+               struct ip_vs_conn *cp, int ihl)
+{
+       IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
+
+       if (!skb_make_writable(skb, ihl))
+               goto drop;
+
+       /* mangle the packet */
+       if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
+               goto drop;
+
+       /* the source address becomes the virtual address of the connection */
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               ipv6_hdr(skb)->saddr = cp->vaddr.in6;
+       else
+#endif
+       {
+               ip_hdr(skb)->saddr = cp->vaddr.ip;
+               ip_send_check(ip_hdr(skb));
+       }
+
+       /* For policy routing, packets originating from this
+        * machine itself may be routed differently to packets
+        * passing through.  We want this packet to be routed as
+        * if it came from this machine itself.  So re-compute
+        * the routing information.
+        */
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6) {
+               if (ip6_route_me_harder(skb) != 0)
+                       goto drop;
+       } else
+#endif
+               if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+                       goto drop;
+
+       IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
+
+       ip_vs_out_stats(cp, skb);
+       ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
+       ip_vs_conn_put(cp);
+
+       /* mark the packet as handled by IPVS for the later hooks */
+       skb->ipvs_property = 1;
+
+       LeaveFunction(11);
+       return NF_ACCEPT;
+
+drop:
+       ip_vs_conn_put(cp);
+       /* the skb is freed here, hence NF_STOLEN instead of NF_DROP */
+       kfree_skb(skb);
+       return NF_STOLEN;
+}
+
+/*
+ *     It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
+ *     Check if outgoing packet belongs to the established ip_vs_conn.
+ *
+ *     Netfilter hook function; in, out and okfn are not used.
+ *     Returns a netfilter verdict: NF_ACCEPT when the packet is not ours
+ *     (or was already handled by IPVS), NF_DROP after sending a port
+ *     unreachable to a real server without a connection entry, NF_STOLEN
+ *     when fragment gathering took the skb, or whatever the ICMP handler
+ *     or handle_response() returns.
+ */
+static unsigned int
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
+         const struct net_device *in, const struct net_device *out,
+         int (*okfn)(struct sk_buff *))
+{
+       struct ip_vs_iphdr iph;
+       struct ip_vs_protocol *pp;
+       struct ip_vs_conn *cp;
+       int af;
+
+       EnterFunction(11);
+
+       /* anything that is not ETH_P_IP is treated as IPv6 */
+       af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
+
+       /* flag set by handle_response(): packet was already processed */
+       if (skb->ipvs_property)
+               return NF_ACCEPT;
+
+       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6) {
+               if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+                       int related, verdict = ip_vs_out_icmp_v6(skb, &related);
+
+                       if (related)
+                               return verdict;
+                       /* not related: re-read the header and carry on */
+                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+               }
+       } else
+#endif
+               if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+                       int related, verdict = ip_vs_out_icmp(skb, &related);
+
+                       if (related)
+                               return verdict;
+                       /* not related: re-read the header and carry on */
+                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+               }
+
+       pp = ip_vs_proto_get(iph.protocol);
+       if (unlikely(!pp))
+               return NF_ACCEPT;
+
+       /* reassemble IP fragments */
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6) {
+               /*
+                * NOTE(review): this arm repeats the ICMPv6 dispatch done
+                * above and performs no reassembly; only the IPv4 arm below
+                * gathers fragments.  Looks redundant - confirm before
+                * removing.
+                */
+               if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+                       int related, verdict = ip_vs_out_icmp_v6(skb, &related);
+
+                       if (related)
+                               return verdict;
+
+                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+               }
+       } else
+#endif
+               if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
+                            !pp->dont_defrag)) {
+                       if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+                               return NF_STOLEN;
+
+                       /* re-read the header after gathering fragments */
+                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+               }
+
+       /*
+        * Check if the packet belongs to an existing entry
+        */
+       cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
+
+       if (unlikely(!cp)) {
+               /* optional (sysctl) ICMP notification, TCP and UDP only */
+               if (sysctl_ip_vs_nat_icmp_send &&
+                   (pp->protocol == IPPROTO_TCP ||
+                    pp->protocol == IPPROTO_UDP)) {
+                       __be16 _ports[2], *pptr;
+
+                       /* the two 16-bit ports right after the IP header */
+                       pptr = skb_header_pointer(skb, iph.len,
+                                                 sizeof(_ports), _ports);
+                       if (pptr == NULL)
+                               return NF_ACCEPT;       /* Not for me */
+                       /* pptr[0] is the source port of this packet */
+                       if (ip_vs_lookup_real_service(af, iph.protocol,
+                                                     &iph.saddr,
+                                                     pptr[0])) {
+                               /*
+                                * Notify the real server: there is no
+                                * existing entry if it is not RST
+                                * packet or not TCP packet.
+                                */
+                               if (iph.protocol != IPPROTO_TCP
+                                   || !is_tcp_reset(skb, iph.len)) {
+#ifdef CONFIG_IP_VS_IPV6
+                                       if (af == AF_INET6)
+                                               icmpv6_send(skb,
+                                                           ICMPV6_DEST_UNREACH,
+                                                           ICMPV6_PORT_UNREACH,
+                                                           0, skb->dev);
+                                       else
+#endif
+                                               icmp_send(skb,
+                                                         ICMP_DEST_UNREACH,
+                                                         ICMP_PORT_UNREACH, 0);
+                                       return NF_DROP;
+                               }
+                       }
+               }
+               IP_VS_DBG_PKT(12, pp, skb, 0,
+                             "packet continues traversal as normal");
+               return NF_ACCEPT;
+       }
+
+       /* handle_response() drops the connection reference taken above */
+       return handle_response(af, skb, pp, cp, iph.len);
+}
+
+
+/*
+ *     Handle ICMP messages in the outside-to-inside direction (incoming).
+ *     Find any that might be relevant, check against existing connections,
+ *     forward to the right destination host if relevant.
+ *     Currently handles error types - unreachable, quench, ttl exceeded.
+ *
+ *     skb:     the ICMP packet (IPv4)
+ *     related: set to 1 on entry and cleared to 0 only when the ICMP type
+ *              is not one of the handled error types
+ *     hooknum: netfilter hook we were called from; selects the defrag
+ *              user (IP_DEFRAG_VS_IN on LOCAL_IN, IP_DEFRAG_VS_FWD
+ *              otherwise)
+ *
+ *     Returns a netfilter verdict.
+ */
+static int
+ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
+{
+       struct iphdr *iph;
+       struct icmphdr  _icmph, *ic;
+       struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
+       struct ip_vs_iphdr ciph;
+       struct ip_vs_conn *cp;
+       struct ip_vs_protocol *pp;
+       unsigned int offset, ihl, verdict;
+       union nf_inet_addr snet;
+
+       *related = 1;
+
+       /* reassemble IP fragments */
+       if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+               if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
+                                           IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
+                       return NF_STOLEN;
+       }
+
+       /* read the header only now, after any fragment gathering */
+       iph = ip_hdr(skb);
+       offset = ihl = iph->ihl * 4;
+       ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+       if (ic == NULL)
+               return NF_DROP;
+
+       IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
+                 ic->type, ntohs(icmp_id(ic)),
+                 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+
+       /*
+        * Work through seeing if this is for us.
+        * These checks are supposed to be in an order that means easy
+        * things are checked first to speed up processing.... however
+        * this means that some packets will manage to get a long way
+        * down this stack and then be rejected, but that's life.
+        */
+       if ((ic->type != ICMP_DEST_UNREACH) &&
+           (ic->type != ICMP_SOURCE_QUENCH) &&
+           (ic->type != ICMP_TIME_EXCEEDED)) {
+               *related = 0;
+               return NF_ACCEPT;
+       }
+
+       /* Now find the contained IP header */
+       offset += sizeof(_icmph);
+       cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+       if (cih == NULL)
+               return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+       pp = ip_vs_proto_get(cih->protocol);
+       if (!pp)
+               return NF_ACCEPT;
+
+       /* Is the embedded protocol header present? */
+       if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
+                    pp->dont_defrag))
+               return NF_ACCEPT;
+
+       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
+
+       /* offset now points just past the embedded IP header */
+       offset += cih->ihl * 4;
+
+       ip_vs_fill_iphdr(AF_INET, cih, &ciph);
+       /* The embedded headers contain source and dest in reverse order */
+       cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
+       if (!cp) {
+               /* The packet could also belong to a local client */
+               cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
+               if (cp) {
+                       snet.ip = iph->saddr;
+                       return handle_response_icmp(AF_INET, skb, &snet,
+                                                   cih->protocol, cp, pp,
+                                                   offset, ihl);
+               }
+               return NF_ACCEPT;
+       }
+
+       /* default verdict for the checksum-failure path below */
+       verdict = NF_DROP;
+
+       /* Ensure the checksum is correct */
+       if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
+               /* Failed checksum! */
+               IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
+                         NIPQUAD(iph->saddr));
+               goto out;
+       }
+
+       /* do the statistics and put it back */
+       ip_vs_in_stats(cp, skb);
+       /* for TCP/UDP also step over the two 16-bit ports */
+       if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
+               offset += 2 * sizeof(__u16);
+       verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
+       /* do not touch skb anymore */
+
+  out:
+       __ip_vs_conn_put(cp);
+
+       return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ *     IPv6 counterpart of ip_vs_in_icmp(): handle incoming ICMPv6 error
+ *     messages (destination unreachable, packet too big, time exceeded)
+ *     and hand those that match an existing connection to the transmitter.
+ *
+ *     skb:     the ICMPv6 packet
+ *     related: set to 1 on entry and cleared to 0 only when the ICMPv6
+ *              type is not one of the handled error types
+ *     hooknum: netfilter hook we were called from; selects the defrag
+ *              user (IP_DEFRAG_VS_IN on LOCAL_IN, IP_DEFRAG_VS_FWD
+ *              otherwise)
+ *
+ *     Returns a netfilter verdict.  Unlike ip_vs_in_icmp(), no checksum
+ *     verification is performed here.
+ */
+static int
+ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
+{
+       struct ipv6hdr *iph;
+       struct icmp6hdr _icmph, *ic;
+       struct ipv6hdr  _ciph, *cih;    /* The ip header contained
+                                          within the ICMP */
+       struct ip_vs_iphdr ciph;
+       struct ip_vs_conn *cp;
+       struct ip_vs_protocol *pp;
+       unsigned int offset, verdict;
+       union nf_inet_addr snet;
+
+       *related = 1;
+
+       /* reassemble IP fragments */
+       if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+               if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
+                                              IP_DEFRAG_VS_IN :
+                                              IP_DEFRAG_VS_FWD))
+                       return NF_STOLEN;
+       }
+
+       /* read the header only now, after any fragment gathering */
+       iph = ipv6_hdr(skb);
+       /* the ICMPv6 header is looked up directly behind the fixed header */
+       offset = sizeof(struct ipv6hdr);
+       ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+       if (ic == NULL)
+               return NF_DROP;
+
+       IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
+                 ic->icmp6_type, ntohs(icmpv6_id(ic)),
+                 NIP6(iph->saddr), NIP6(iph->daddr));
+
+       /*
+        * Work through seeing if this is for us.
+        * These checks are supposed to be in an order that means easy
+        * things are checked first to speed up processing.... however
+        * this means that some packets will manage to get a long way
+        * down this stack and then be rejected, but that's life.
+        */
+       if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
+           (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
+           (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
+               *related = 0;
+               return NF_ACCEPT;
+       }
+
+       /* Now find the contained IP header */
+       offset += sizeof(_icmph);
+       cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+       if (cih == NULL)
+               return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+       pp = ip_vs_proto_get(cih->nexthdr);
+       if (!pp)
+               return NF_ACCEPT;
+
+       /* Is the embedded protocol header present? */
+       /* TODO: we don't support fragmentation at the moment anyways */
+       if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
+               return NF_ACCEPT;
+
+       IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
+
+       /* offset now points just past the embedded fixed IPv6 header */
+       offset += sizeof(struct ipv6hdr);
+
+       ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
+       /* The embedded headers contain source and dest in reverse order */
+       cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
+       if (!cp) {
+               /* The packet could also belong to a local client */
+               cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
+               if (cp) {
+                       ipv6_addr_copy(&snet.in6, &iph->saddr);
+                       return handle_response_icmp(AF_INET6, skb, &snet,
+                                                   cih->nexthdr,
+                                                   cp, pp, offset,
+                                                   sizeof(struct ipv6hdr));
+               }
+               return NF_ACCEPT;
+       }
+
+       /* overwritten unconditionally by the transmit call below */
+       verdict = NF_DROP;
+
+       /* do the statistics and put it back */
+       ip_vs_in_stats(cp, skb);
+       /* for TCP/UDP also step over the two 16-bit ports */
+       if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
+               offset += 2 * sizeof(__u16);
+       verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
+       /* do not touch skb anymore */
+
+       __ip_vs_conn_put(cp);
+
+       return verdict;
+}
+#endif
+
+
+/*
+ *     Check if it's for virtual services, look it up,
+ *     and send it on its way...
+ */
+static unsigned int
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
+        const struct net_device *in, const struct net_device *out,
+        int (*okfn)(struct sk_buff *))
+{
+       struct ip_vs_iphdr iph;
+       struct ip_vs_protocol *pp;
+       struct ip_vs_conn *cp;
+       int ret, restart, af;
+
+       af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
+
+       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+       /*
+        *      Big tappo: only PACKET_HOST, including loopback for local client
+        *      Don't handle local packets on IPv6 for now
+        */
+       if (unlikely(skb->pkt_type != PACKET_HOST)) {
+               IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
+                             skb->pkt_type,
+                             iph.protocol,
+                             IP_VS_DBG_ADDR(af, &iph.daddr));
+               return NF_ACCEPT;
+       }
+
+       if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+               int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
+
+               if (related)
+                       return verdict;
+               ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+       }
+
+       /* Protocol supported? */
+       pp = ip_vs_proto_get(iph.protocol);
+       if (unlikely(!pp))
+               return NF_ACCEPT;
+
+       /*
+        * Check if the packet belongs to an existing connection entry
+        */
+       cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
+
+       if (unlikely(!cp)) {
+               int v;
+
+               /* For local client packets, it could be a response */
+               cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
+               if (cp)
+                       return handle_response(af, skb, pp, cp, iph.len);
+
+               if (!pp->conn_schedule(af, skb, pp, &v, &cp))
+                       return v;
+       }
+
+       if (unlikely(!cp)) {
+               /* sorry, all this trouble for a no-hit :) */
+               IP_VS_DBG_PKT(12, pp, skb, 0,
+                             "packet continues traversal as normal");
+               return NF_ACCEPT;
+       }
+
+       IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
+
+       /* Check the server status */
+       if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+               /* the destination server is not available */
+
+               if (sysctl_ip_vs_expire_nodest_conn) {
+                       /* try to expire the connection immediately */
+                       ip_vs_conn_expire_now(cp);
+               }
+               /* don't restart its timer, and silently
+                  drop the packet. */
+               __ip_vs_conn_put(cp);
+               return NF_DROP;
+       }
+
+       ip_vs_in_stats(cp, skb);
+       restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+       if (cp->packet_xmit)
+               ret = cp->packet_xmit(skb, cp, pp);
+               /* do not touch skb anymore */
+       else {
+               IP_VS_DBG_RL("warning: packet_xmit is null");
+               ret = NF_ACCEPT;
+       }
+
+       /* Increase its packet counter and check if it is needed
+        * to be synchronized
+        *
+        * Sync connection if it is about to close to
+        * encorage the standby servers to update the connections timeout
+        */
+       atomic_inc(&cp->in_pkts);
+       if (af == AF_INET &&
+           (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+           (((cp->protocol != IPPROTO_TCP ||
+              cp->state == IP_VS_TCP_S_ESTABLISHED) &&
+             (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
+              == sysctl_ip_vs_sync_threshold[0])) ||
+            ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
+             ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
+              (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
+              (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
+               ip_vs_sync_conn(cp);
+       cp->old_state = cp->state;
+
+       ip_vs_conn_put(cp);
+       return ret;
+}
+
+
+/*
+ *     Hooked at the NF_INET_FORWARD chain to catch ICMP related packets
+ *     destined for 0.0.0.0/0.
+ *     With a fwmark-based virtual service (e.g. a transparent cache
+ *     cluster) TCP packets can be marked and routed to ip_vs_in, but ICMP
+ *     destined for 0.0.0.0/0 cannot easily be marked and sent to
+ *     ip_vs_in_icmp.  So they are picked up here and passed on to
+ *     ip_vs_in_icmp; everything that is not ICMP is accepted untouched.
+ */
+static unsigned int
+ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
+                  const struct net_device *in, const struct net_device *out,
+                  int (*okfn)(struct sk_buff *))
+{
+       int related;    /* filled in by ip_vs_in_icmp, not examined here */
+
+       if (ip_hdr(skb)->protocol == IPPROTO_ICMP)
+               return ip_vs_in_icmp(skb, &related, hooknum);
+
+       return NF_ACCEPT;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ *     IPv6 variant of the forward-chain ICMP hook: ICMPv6 packets are
+ *     passed to ip_vs_in_icmp_v6, everything else is accepted untouched.
+ */
+static unsigned int
+ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
+                     const struct net_device *in, const struct net_device *out,
+                     int (*okfn)(struct sk_buff *))
+{
+       int related;    /* filled in by ip_vs_in_icmp_v6, not examined here */
+
+       if (ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6)
+               return ip_vs_in_icmp_v6(skb, &related, hooknum);
+
+       return NF_ACCEPT;
+}
+#endif
+
+
+/*
+ *     Netfilter hooks installed by ip_vs_init() and removed again by
+ *     ip_vs_cleanup().  The first four entries serve IPv4 (PF_INET); the
+ *     same four are repeated for PF_INET6 when CONFIG_IP_VS_IPV6 is set.
+ *     ip_vs_in, ip_vs_out and ip_vs_post_routing are shared by both
+ *     families, only the forward-chain ICMP hook has a separate IPv6
+ *     function.
+ */
+static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+       /* After packet filtering, forward packet through VS/DR, VS/TUN,
+        * or VS/NAT(change destination), so that filtering rules can be
+        * applied to IPVS. */
+       {
+               .hook           = ip_vs_in,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 100,
+       },
+       /* After packet filtering, change source only for VS/NAT */
+       {
+               .hook           = ip_vs_out,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 100,
+       },
+       /* After packet filtering (but before ip_vs_out_icmp), catch icmp
+        * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+       {
+               .hook           = ip_vs_forward_icmp,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 99,   /* ip_vs_out on this chain uses 100 */
+       },
+       /* Before the netfilter connection tracking, exit from POST_ROUTING */
+       {
+               .hook           = ip_vs_post_routing,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_POST_ROUTING,
+               .priority       = NF_IP_PRI_NAT_SRC-1,
+       },
+#ifdef CONFIG_IP_VS_IPV6
+       /* After packet filtering, forward packet through VS/DR, VS/TUN,
+        * or VS/NAT(change destination), so that filtering rules can be
+        * applied to IPVS. */
+       {
+               .hook           = ip_vs_in,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET6,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 100,
+       },
+       /* After packet filtering, change source only for VS/NAT */
+       {
+               .hook           = ip_vs_out,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET6,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 100,
+       },
+       /* After packet filtering (but before ip_vs_out_icmp), catch icmp
+        * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+       {
+               .hook           = ip_vs_forward_icmp_v6,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET6,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 99,   /* ip_vs_out on this chain uses 100 */
+       },
+       /* Before the netfilter connection tracking, exit from POST_ROUTING */
+       {
+               .hook           = ip_vs_post_routing,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET6,
+               .hooknum        = NF_INET_POST_ROUTING,
+               .priority       = NF_IP6_PRI_NAT_SRC-1,
+       },
+#endif
+};
+
+
+/*
+ *     Initialize IP Virtual Server
+ *
+ *     Brings the subsystems up in order: estimator, control, protocols,
+ *     application helpers, connection table, and finally the netfilter
+ *     hooks.  When a checked step fails, everything set up before it is
+ *     torn down in reverse order through the cleanup labels.
+ *
+ *     Returns the (non-negative) result of nf_register_hooks() on
+ *     success, or the negative error code of the step that failed.
+ */
+static int __init ip_vs_init(void)
+{
+       int ret;
+
+       /* no result is checked for this step */
+       ip_vs_estimator_init();
+
+       ret = ip_vs_control_init();
+       if (ret < 0) {
+               IP_VS_ERR("can't setup control.\n");
+               goto cleanup_estimator;
+       }
+
+       /* no result is checked for this step */
+       ip_vs_protocol_init();
+
+       ret = ip_vs_app_init();
+       if (ret < 0) {
+               IP_VS_ERR("can't setup application helper.\n");
+               goto cleanup_protocol;
+       }
+
+       ret = ip_vs_conn_init();
+       if (ret < 0) {
+               IP_VS_ERR("can't setup connection table.\n");
+               goto cleanup_app;
+       }
+
+       /* hooks go in last, once everything they rely on is ready */
+       ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+       if (ret < 0) {
+               IP_VS_ERR("can't register hooks.\n");
+               goto cleanup_conn;
+       }
+
+       IP_VS_INFO("ipvs loaded.\n");
+       return ret;
+
+       /* unwind ladder: each label undoes one step and falls through */
+  cleanup_conn:
+       ip_vs_conn_cleanup();
+  cleanup_app:
+       ip_vs_app_cleanup();
+  cleanup_protocol:
+       ip_vs_protocol_cleanup();
+       ip_vs_control_cleanup();
+  cleanup_estimator:
+       ip_vs_estimator_cleanup();
+       return ret;
+}
+
+/*
+ *     Module exit: unregister the netfilter hooks first, then tear the
+ *     subsystems down in the reverse of the order ip_vs_init() set them up.
+ */
+static void __exit ip_vs_cleanup(void)
+{
+       nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+       ip_vs_conn_cleanup();
+       ip_vs_app_cleanup();
+       ip_vs_protocol_cleanup();
+       ip_vs_control_cleanup();
+       ip_vs_estimator_cleanup();
+       IP_VS_INFO("ipvs unloaded.\n");
+}
+
+/* module entry and exit points, and the module's license tag */
+module_init(ip_vs_init);
+module_exit(ip_vs_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c

new file mode 100644 (file)

index 0000000..0302cf3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -0,0 +1,3443 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the NetFilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/swap.h>
+#include <linux/seq_file.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/mutex.h>
+
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#endif
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/genetlink.h>
+
+#include <asm/uaccess.h>
+
+#include <net/ip_vs.h>
+
+/* mutex for IPVS sockopts. And, [gs]etsockopt may sleep. */
+static DEFINE_MUTEX(__ip_vs_mutex);
+
+/* lock for service table */
+static DEFINE_RWLOCK(__ip_vs_svc_lock);
+
+/* lock for table with the real services */
+static DEFINE_RWLOCK(__ip_vs_rs_lock);
+
+/* lock for state and timeout tables */
+static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
+
+/* lock for drop entry handling */
+static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
+
+/* lock for drop packet handling */
+static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
+
+/* 1/rate drop and drop-entry variables */
+int ip_vs_drop_rate = 0;
+int ip_vs_drop_counter = 0;
+/* non-zero while the defense work handler should drop entries */
+static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
+
+/* number of virtual services */
+static int ip_vs_num_services = 0;
+
+/* sysctl variables */
+/* defense strategy modes 0..3, interpreted by update_defense_level() */
+static int sysctl_ip_vs_drop_entry = 0;
+static int sysctl_ip_vs_drop_packet = 0;
+static int sysctl_ip_vs_secure_tcp = 0;
+/* available-memory threshold, in pages, below which defenses engage */
+static int sysctl_ip_vs_amemthresh = 1024;
+/* fixed drop rate used when drop_packet is in mode 3 */
+static int sysctl_ip_vs_am_droprate = 10;
+int sysctl_ip_vs_cache_bypass = 0;
+int sysctl_ip_vs_expire_nodest_conn = 0;
+int sysctl_ip_vs_expire_quiescent_template = 0;
+int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
+int sysctl_ip_vs_nat_icmp_send = 0;
+
+
+#ifdef CONFIG_IP_VS_DEBUG
+/* current debug level; only present with CONFIG_IP_VS_DEBUG */
+static int sysctl_ip_vs_debug_level = 0;
+
+/* Accessor for the file-local debug level variable. */
+int ip_vs_get_debug_level(void)
+{
+       return sysctl_ip_vs_debug_level;
+}
+#endif
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way?
+ *
+ * Decide whether an IPv6 address is local by routing towards it and
+ * checking whether the resulting route goes out through a loopback
+ * device.
+ *
+ * Returns 1 if the address is local, 0 otherwise.
+ */
+static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
+{
+       struct dst_entry *dst;
+       struct rt6_info *rt;
+       struct flowi fl = {
+               .oif = 0,
+               .nl_u = {
+                       .ip6_u = {
+                               .daddr = *addr,
+                               .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
+       };
+       int is_local;
+
+       dst = ip6_route_output(&init_net, NULL, &fl);
+       rt = (struct rt6_info *)dst;
+
+       /*
+        * A lookup that failed is presumably reported through dst->error
+        * rather than a NULL return (confirm against ip6_route_output());
+        * such an entry must never make the address look local.
+        */
+       is_local = rt && !dst->error && rt->rt6i_dev &&
+                  (rt->rt6i_dev->flags & IFF_LOOPBACK);
+
+       /*
+        * The route lookup hands back a referenced entry; release it on
+        * every path so the reference does not leak.  dst_release()
+        * accepts NULL.
+        */
+       dst_release(dst);
+
+       return is_local;
+}
+#endif
+/*
+ *     update_defense_level is called from keventd and from sysctl,
+ *     so it needs to protect itself from softirqs
+ *
+ *     Re-evaluates the three defense strategies (drop_entry, drop_packet,
+ *     secure_tcp) against the currently available memory.  Each strategy
+ *     is driven by a sysctl mode:
+ *       0 - strategy off
+ *       1 - automatic, currently idle; switches to 2 when memory is short
+ *       2 - automatic, currently active; switches back to 1 when memory
+ *           is no longer short
+ *       3 - strategy always on
+ *     Note that modes 1 and 2 are written back into the sysctl variables.
+ */
+static void update_defense_level(void)
+{
+       struct sysinfo i;
+       /* secure_tcp mode seen on the previous call */
+       static int old_secure_tcp = 0;
+       int availmem;
+       int nomem;
+       /* stays -1 unless the secure_tcp on/off state has to be switched */
+       int to_change = -1;
+
+       /* we only count free and buffered memory (in pages) */
+       si_meminfo(&i);
+       availmem = i.freeram + i.bufferram;
+       /* however in linux 2.5 the i.bufferram is total page cache size,
+          we need adjust it */
+       /* si_swapinfo(&i); */
+       /* availmem = availmem - (i.totalswap - i.freeswap); */
+
+       /* memory counts as short once it falls below the sysctl threshold */
+       nomem = (availmem < sysctl_ip_vs_amemthresh);
+
+       /* bottom halves stay disabled for all three locked sections below */
+       local_bh_disable();
+
+       /* drop_entry */
+       spin_lock(&__ip_vs_dropentry_lock);
+       switch (sysctl_ip_vs_drop_entry) {
+       case 0:
+               atomic_set(&ip_vs_dropentry, 0);
+               break;
+       case 1:
+               if (nomem) {
+                       atomic_set(&ip_vs_dropentry, 1);
+                       sysctl_ip_vs_drop_entry = 2;
+               } else {
+                       atomic_set(&ip_vs_dropentry, 0);
+               }
+               break;
+       case 2:
+               if (nomem) {
+                       atomic_set(&ip_vs_dropentry, 1);
+               } else {
+                       atomic_set(&ip_vs_dropentry, 0);
+                       sysctl_ip_vs_drop_entry = 1;
+               };
+               break;
+       case 3:
+               atomic_set(&ip_vs_dropentry, 1);
+               break;
+       }
+       spin_unlock(&__ip_vs_dropentry_lock);
+
+       /* drop_packet */
+       spin_lock(&__ip_vs_droppacket_lock);
+       switch (sysctl_ip_vs_drop_packet) {
+       case 0:
+               ip_vs_drop_rate = 0;
+               break;
+       case 1:
+               if (nomem) {
+                       /*
+                        * nomem guarantees availmem < amemthresh, so the
+                        * divisor is positive; the rate shrinks towards 1
+                        * as available memory shrinks.
+                        */
+                       ip_vs_drop_rate = ip_vs_drop_counter
+                               = sysctl_ip_vs_amemthresh /
+                               (sysctl_ip_vs_amemthresh-availmem);
+                       sysctl_ip_vs_drop_packet = 2;
+               } else {
+                       ip_vs_drop_rate = 0;
+               }
+               break;
+       case 2:
+               if (nomem) {
+                       /* same formula as in mode 1 */
+                       ip_vs_drop_rate = ip_vs_drop_counter
+                               = sysctl_ip_vs_amemthresh /
+                               (sysctl_ip_vs_amemthresh-availmem);
+               } else {
+                       ip_vs_drop_rate = 0;
+                       sysctl_ip_vs_drop_packet = 1;
+               }
+               break;
+       case 3:
+               ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
+               break;
+       }
+       spin_unlock(&__ip_vs_droppacket_lock);
+
+       /* secure_tcp */
+       write_lock(&__ip_vs_securetcp_lock);
+       switch (sysctl_ip_vs_secure_tcp) {
+       case 0:
+               if (old_secure_tcp >= 2)
+                       to_change = 0;
+               break;
+       case 1:
+               if (nomem) {
+                       if (old_secure_tcp < 2)
+                               to_change = 1;
+                       sysctl_ip_vs_secure_tcp = 2;
+               } else {
+                       if (old_secure_tcp >= 2)
+                               to_change = 0;
+               }
+               break;
+       case 2:
+               if (nomem) {
+                       if (old_secure_tcp < 2)
+                               to_change = 1;
+               } else {
+                       if (old_secure_tcp >= 2)
+                               to_change = 0;
+                       sysctl_ip_vs_secure_tcp = 1;
+               }
+               break;
+       case 3:
+               if (old_secure_tcp < 2)
+                       to_change = 1;
+               break;
+       }
+       old_secure_tcp = sysctl_ip_vs_secure_tcp;
+       /*
+        * to_change only gates the call; the value passed on is derived
+        * from the (possibly just updated) sysctl mode.
+        */
+       if (to_change >= 0)
+               ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
+       write_unlock(&__ip_vs_securetcp_lock);
+
+       local_bh_enable();
+}
+
+
+/*
+ *     Timer for checking the defense
+ *
+ *     The period is parenthesized so the macro expands safely inside
+ *     larger expressions (e.g. next to '/' or '%').
+ */
+#define DEFENSE_TIMER_PERIOD   (1*HZ)
+static void defense_work_handler(struct work_struct *work);
+static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
+
+/*
+ *     Periodic work: refresh the defense levels, drop random connection
+ *     entries while the drop_entry defense is active, then re-arm itself
+ *     to run again after DEFENSE_TIMER_PERIOD.  The work argument is
+ *     unused.
+ */
+static void defense_work_handler(struct work_struct *work)
+{
+       update_defense_level();
+       if (atomic_read(&ip_vs_dropentry))
+               ip_vs_random_dropentry();
+
+       schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+}
+
+/*
+ *     Take a reference on this module.
+ *     Returns the result of try_module_get() unchanged.
+ */
+int ip_vs_use_count_inc(void)
+{
+       int got = try_module_get(THIS_MODULE);
+
+       return got;
+}
+
+void
+ip_vs_use_count_dec(void)      /* drop a reference on this module */
+{
+       module_put(THIS_MODULE);        /* counterpart of ip_vs_use_count_inc() */
+}
+
+
+/*
+ *     Hash table: for virtual service lookups
+ *
+ *     Both service tables below have IP_VS_SVC_TAB_SIZE
+ *     (1 << IP_VS_SVC_TAB_BITS) buckets; IP_VS_SVC_TAB_MASK reduces a
+ *     hash key to a bucket index.
+ */
+#define IP_VS_SVC_TAB_BITS 8
+#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
+#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
+
+/* the service table hashed by <protocol, addr, port> */
+static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+/* the service table hashed by fwmark */
+static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
+
+/*
+ *     Hash table: for real service lookups
+ *
+ *     ip_vs_rtable has IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) buckets;
+ *     IP_VS_RTAB_MASK reduces a hash key to a bucket index.
+ */
+#define IP_VS_RTAB_BITS 4
+#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
+#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
+
+static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
+
+/*
+ *     Trash for destinations
+ *     (destinations unlinked from their service while other references
+ *     to them remain; linked through their n_list member)
+ */
+static LIST_HEAD(ip_vs_dest_trash);
+
+/*
+ *     FTP & NULL virtual service counters
+ *     (number of services whose port is FTPPORT, respectively 0;
+ *     ip_vs_service_get() reads them before trying its fallback lookups)
+ */
+static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
+static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
+
+
+/*
+ *     Compute the ip_vs_svc_table bucket index for <proto, addr, port>.
+ *     An IPv6 address is first folded to 32 bits by XOR-ing its four
+ *     words; the mixed key is masked down to the table size.
+ */
+static __inline__ unsigned
+ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
+                 __be16 port)
+{
+       unsigned host_port = ntohs(port);
+       __be32 folded;
+       unsigned key;
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               folded = addr->ip6[0] ^ addr->ip6[1] ^
+                        addr->ip6[2] ^ addr->ip6[3];
+       else
+#endif
+               folded = addr->ip;
+
+       key = proto ^ ntohl(folded);
+       key ^= host_port >> IP_VS_SVC_TAB_BITS;
+       key ^= host_port;
+
+       return key & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ *     Returns hash value of fwmark for virtual service lookup:
+ *     the mark masked with IP_VS_SVC_TAB_MASK, i.e. its low
+ *     IP_VS_SVC_TAB_BITS bits, used as the ip_vs_svc_fwm_table index.
+ */
+static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
+{
+       return fwmark & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ *     Link a service into one of the two service tables:
+ *     ip_vs_svc_fwm_table (via f_list) when it has a fwmark, otherwise
+ *     ip_vs_svc_table (via s_list) keyed by <proto,addr,port>.
+ *     The table reference is accounted in svc->refcnt.
+ *     Should be called with locked tables.
+ *     Returns 1 on success, 0 if the service was already hashed.
+ */
+static int ip_vs_svc_hash(struct ip_vs_service *svc)
+{
+       struct list_head *node;
+       struct list_head *bucket;
+       unsigned hash;
+
+       if (svc->flags & IP_VS_SVC_F_HASHED) {
+               IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
+                         "called from %p\n", __builtin_return_address(0));
+               return 0;
+       }
+
+       if (svc->fwmark != 0) {
+               /* fwmark services live in ip_vs_svc_fwm_table */
+               hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
+               node = &svc->f_list;
+               bucket = &ip_vs_svc_fwm_table[hash];
+       } else {
+               /* everything else is keyed by <protocol,addr,port> */
+               hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr,
+                                        svc->port);
+               node = &svc->s_list;
+               bucket = &ip_vs_svc_table[hash];
+       }
+       list_add(node, bucket);
+
+       svc->flags |= IP_VS_SVC_F_HASHED;
+       /* the svc table now references the service */
+       atomic_inc(&svc->refcnt);
+       return 1;
+}
+
+
+/*
+ *     Remove a service from whichever table ip_vs_svc_hash() put it in
+ *     and give back the table's reference on it.
+ *     Should be called with locked tables.
+ *     Returns 1 on success, 0 if the service was not hashed.
+ */
+static int ip_vs_svc_unhash(struct ip_vs_service *svc)
+{
+       if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
+               IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
+                         "called from %p\n", __builtin_return_address(0));
+               return 0;
+       }
+
+       /* f_list is the fwmark-table linkage, s_list the <proto,addr,port> one */
+       list_del(svc->fwmark != 0 ? &svc->f_list : &svc->s_list);
+
+       svc->flags &= ~IP_VS_SVC_F_HASHED;
+       atomic_dec(&svc->refcnt);
+       return 1;
+}
+
+
+/*
+ *     Find the service matching {af,proto,addr,port} in ip_vs_svc_table.
+ *     On a hit the service's usecnt is incremented before it is returned;
+ *     NULL is returned when nothing matches.
+ */
+static inline struct ip_vs_service *
+__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
+                   __be16 vport)
+{
+       struct ip_vs_service *svc;
+       unsigned hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport);
+
+       /* only "full" addressed entries are stored in this table */
+       list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list) {
+               if (svc->af != af)
+                       continue;
+               if (!ip_vs_addr_equal(af, &svc->addr, vaddr))
+                       continue;
+               if (svc->port != vport || svc->protocol != protocol)
+                       continue;
+
+               /* HIT */
+               atomic_inc(&svc->usecnt);
+               return svc;
+       }
+
+       return NULL;
+}
+
+
+/*
+ *     Find the service matching {af,fwmark} in ip_vs_svc_fwm_table.
+ *     On a hit the service's usecnt is incremented before it is returned;
+ *     NULL is returned when nothing matches.
+ */
+static inline struct ip_vs_service *
+__ip_vs_svc_fwm_get(int af, __u32 fwmark)
+{
+       struct ip_vs_service *svc;
+       unsigned hash = ip_vs_svc_fwm_hashkey(fwmark);
+
+       list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
+               if (svc->fwmark != fwmark || svc->af != af)
+                       continue;
+
+               /* HIT */
+               atomic_inc(&svc->usecnt);
+               return svc;
+       }
+
+       return NULL;
+}
+
+/*
+ *     Look up a virtual service while holding __ip_vs_svc_lock for reading.
+ *     Lookup order:
+ *       1. the fwmark table, only when fwmark is non-zero;
+ *       2. the exact <protocol,addr,port> entry;
+ *       3. for TCP, the FTPPORT entry when the port may be an FTP data port;
+ *       4. the catch-all (port zero) entry.
+ *     Steps 3 and 4 are only tried while the matching service counter is
+ *     non-zero.  A returned service had its usecnt raised by the helper
+ *     that found it; NULL means no service matched.
+ */
+struct ip_vs_service *
+ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
+                 const union nf_inet_addr *vaddr, __be16 vport)
+{
+       struct ip_vs_service *svc = NULL;
+
+       read_lock(&__ip_vs_svc_lock);
+
+       /* fwmark-addressed services are checked first */
+       if (fwmark)
+               svc = __ip_vs_svc_fwm_get(af, fwmark);
+
+       if (svc == NULL) {
+               /* "full" addressed entry */
+               svc = __ip_vs_service_get(af, protocol, vaddr, vport);
+
+               /* the packet might belong to an FTP data connection */
+               if (svc == NULL &&
+                   protocol == IPPROTO_TCP &&
+                   atomic_read(&ip_vs_ftpsvc_counter) &&
+                   (vport == FTPDATA || ntohs(vport) >= PROT_SOCK))
+                       svc = __ip_vs_service_get(af, protocol, vaddr,
+                                                 FTPPORT);
+
+               /* last resort: the catch-all port (port zero) */
+               if (svc == NULL && atomic_read(&ip_vs_nullsvc_counter))
+                       svc = __ip_vs_service_get(af, protocol, vaddr, 0);
+       }
+
+       read_unlock(&__ip_vs_svc_lock);
+
+       IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
+                     fwmark, ip_vs_proto_name(protocol),
+                     IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
+                     svc ? "hit" : "not hit");
+
+       return svc;
+}
+
+
+static inline void
+__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)   /* attach dest to svc */
+{
+       atomic_inc(&svc->refcnt);       /* the dest holds a reference on its service */
+       dest->svc = svc;
+}
+
+static inline void
+__ip_vs_unbind_svc(struct ip_vs_dest *dest)    /* detach dest from its service */
+{
+       struct ip_vs_service *svc = dest->svc;
+
+       dest->svc = NULL;
+       if (atomic_dec_and_test(&svc->refcnt))  /* that was the last reference: free svc */
+               kfree(svc);
+}
+
+
+/*
+ *     Compute the ip_vs_rtable bucket index for a real service <addr,port>.
+ *     An IPv6 address is first folded to 32 bits by XOR-ing its four
+ *     words; the mixed key is masked down to the table size.
+ */
+static inline unsigned ip_vs_rs_hashkey(int af,
+                                           const union nf_inet_addr *addr,
+                                           __be16 port)
+{
+       unsigned host_port = ntohs(port);
+       __be32 folded;
+       unsigned key;
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               folded = addr->ip6[0] ^ addr->ip6[1] ^
+                        addr->ip6[2] ^ addr->ip6[3];
+       else
+#endif
+               folded = addr->ip;
+
+       key = ntohl(folded);
+       key ^= host_port >> IP_VS_RTAB_BITS;
+       key ^= host_port;
+
+       return key & IP_VS_RTAB_MASK;
+}
+
+/*
+ *     Link an ip_vs_dest into ip_vs_rtable, in the bucket selected by
+ *     its <af,addr,port>.
+ *     should be called with locked tables.
+ *     Returns 1 if the dest was linked, 0 if its d_list was already in use.
+ */
+static int ip_vs_rs_hash(struct ip_vs_dest *dest)
+{
+       struct list_head *bucket;
+       unsigned hash;
+
+       /* a non-empty d_list means the dest is already linked somewhere */
+       if (!list_empty(&dest->d_list))
+               return 0;
+
+       /* the key is built from the real service's own parameters */
+       hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
+       bucket = &ip_vs_rtable[hash];
+       list_add(&dest->d_list, bucket);
+
+       return 1;
+}
+
+/*
+ *     Unlink an ip_vs_dest from ip_vs_rtable, if it is linked.
+ *     should be called with locked tables.
+ *     Always returns 1.
+ */
+static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
+{
+       /* nothing to do for a dest that is not in the table */
+       if (list_empty(&dest->d_list))
+               return 1;
+
+       /* unlink, then re-initialize so d_list reads as empty again */
+       list_del(&dest->d_list);
+       INIT_LIST_HEAD(&dest->d_list);
+
+       return 1;
+}
+
+/*
+ *     Look up a real service by <af,addr,port> in ip_vs_rtable.
+ *     An entry matches when its protocol equals 'protocol' or when it
+ *     carries a non-zero vfwmark.  The first matching entry is returned,
+ *     NULL if there is none.  The table is walked under the read side of
+ *     __ip_vs_rs_lock, which is released before returning.
+ */
+struct ip_vs_dest *
+ip_vs_lookup_real_service(int af, __u16 protocol,
+                         const union nf_inet_addr *daddr,
+                         __be16 dport)
+{
+       struct ip_vs_dest *dest;
+       struct ip_vs_dest *found = NULL;
+       unsigned hash;
+
+       hash = ip_vs_rs_hashkey(af, daddr, dport);
+
+       read_lock(&__ip_vs_rs_lock);
+       list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
+               /* "full" addressed entries only */
+               if (dest->af != af ||
+                   !ip_vs_addr_equal(af, &dest->addr, daddr) ||
+                   dest->port != dport)
+                       continue;
+               if (dest->protocol != protocol && !dest->vfwmark)
+                       continue;
+
+               /* HIT */
+               found = dest;
+               break;
+       }
+       read_unlock(&__ip_vs_rs_lock);
+
+       return found;
+}
+
+/*
+ *     Find the destination with the given {addr,port} on the service's
+ *     own destination list.  Returns NULL when the service has none.
+ */
+static struct ip_vs_dest *
+ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
+                 __be16 dport)
+{
+       struct ip_vs_dest *dest;
+
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+               if (dest->af != svc->af)
+                       continue;
+               if (!ip_vs_addr_equal(svc->af, &dest->addr, daddr))
+                       continue;
+               if (dest->port != dport)
+                       continue;
+
+               /* HIT */
+               return dest;
+       }
+
+       return NULL;
+}
+
+/*
+ * Find destination by {daddr,dport,vaddr,protocol}
+ * Created to be used in ip_vs_process_message() in
+ * the backup synchronization daemon. It finds the
+ * destination to be bound to the received connection
+ * on the backup.
+ *
+ * ip_vs_lookup_real_service() looked promising, but
+ * seems not working as expected.
+ *
+ * The service is looked up without a fwmark and released again before
+ * returning.  A destination that is found has its refcnt incremented;
+ * NULL is returned when either the service or the destination is missing.
+ */
+struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
+                                  __be16 dport,
+                                  const union nf_inet_addr *vaddr,
+                                  __be16 vport, __u16 protocol)
+{
+       struct ip_vs_dest *dest = NULL;
+       struct ip_vs_service *svc;
+
+       svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
+       if (svc) {
+               dest = ip_vs_lookup_dest(svc, daddr, dport);
+               if (dest)
+                       atomic_inc(&dest->refcnt);
+               ip_vs_service_put(svc);
+       }
+
+       return dest;
+}
+
+/*
+ *  Lookup dest by {svc,addr,port} in the destination trash.
+ *  The destination trash is used to hold the destinations that are removed
+ *  from the service table but are still referenced by some conn entries.
+ *  The reason to add the destination trash is when the dest is temporary
+ *  down (either by administrator or by monitor program), the dest can be
+ *  picked back from the trash, the remaining connections to the dest can
+ *  continue, and the counting information of the dest is also useful for
+ *  scheduling.
+ *
+ *  A matching entry is returned still linked on the trash list; the
+ *  caller is the one that unlinks it.  As a side effect, every
+ *  non-matching entry visited before the hit (or all of them on a miss)
+ *  whose refcnt is 1 is unlinked and freed.
+ *  Returns NULL when no entry matches.
+ */
+static struct ip_vs_dest *
+ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
+                    __be16 dport)
+{
+       struct ip_vs_dest *dest, *nxt;
+
+       /*
+        * Find the destination in trash
+        * (the _safe iterator is needed because entries may be unlinked
+        * and freed inside the loop)
+        */
+       list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+               IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
+                             "dest->refcnt=%d\n",
+                             dest->vfwmark,
+                             IP_VS_DBG_ADDR(svc->af, &dest->addr),
+                             ntohs(dest->port),
+                             atomic_read(&dest->refcnt));
+               if (dest->af == svc->af &&
+                   ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
+                   dest->port == dport &&
+                   dest->vfwmark == svc->fwmark &&
+                   dest->protocol == svc->protocol &&
+                   (svc->fwmark ||     /* fwmark services: virtual addr/port are not compared */
+                    (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
+                     dest->vport == svc->port))) {
+                       /* HIT */
+                       return dest;
+               }
+
+               /*
+                * Try to purge the destination from trash if not referenced
+                * (a refcnt of 1 is the reference taken when the dest was
+                * moved into the trash)
+                */
+               if (atomic_read(&dest->refcnt) == 1) {
+                       IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
+                                     "from trash\n",
+                                     dest->vfwmark,
+                                     IP_VS_DBG_ADDR(svc->af, &dest->addr),
+                                     ntohs(dest->port));
+                       list_del(&dest->n_list);
+                       ip_vs_dst_reset(dest);
+                       __ip_vs_unbind_svc(dest);       /* may free the service it was bound to */
+                       kfree(dest);
+               }
+       }
+
+       return NULL;
+}
+
+
+/*
+ *  Clean up all the destinations in the trash
+ *  Called by the ip_vs_control_cleanup()
+ *
+ *  When the ip_vs_control_cleanup is activated by ipvs module exit,
+ *  the service tables must have been flushed and all the connections
+ *  are expired, and the refcnt of each destination in the trash must
+ *  be 1, so we simply release them here.
+ */
+static void ip_vs_trash_cleanup(void)
+{
+       struct ip_vs_dest *dest, *nxt;
+
+       list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {        /* _safe: entries are freed in the loop */
+               list_del(&dest->n_list);
+               ip_vs_dst_reset(dest);
+               __ip_vs_unbind_svc(dest);       /* drops the dest's reference on its service */
+               kfree(dest);    /* freed without checking refcnt, per the assumption above */
+       }
+}
+
+
+static void
+ip_vs_zero_stats(struct ip_vs_stats *stats)    /* reset the user-visible counters and the estimator */
+{
+       spin_lock_bh(&stats->lock);     /* both resets happen under stats->lock */
+
+       memset(&stats->ustats, 0, sizeof(stats->ustats));
+       ip_vs_zero_estimator(stats);
+
+       spin_unlock_bh(&stats->lock);
+}
+
+/*
+ *     Update a destination in the given service
+ *
+ *     Applies the user-supplied settings in 'udest' to 'dest': weight,
+ *     connection flags and thresholds.  A destination whose address is
+ *     local gets its forwarding method replaced by IP_VS_CONN_F_LOCALNODE.
+ *     Destinations whose forwarding-method bits are zero are hashed into
+ *     ip_vs_rtable; all others get IP_VS_CONN_F_NOOUTPUT.  The dest is
+ *     bound to 'svc' (re-bound, with its stats zeroed, if it was bound to
+ *     a different service) and marked available.
+ */
+static void
+__ip_vs_update_dest(struct ip_vs_service *svc,
+                   struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
+{
+       int conn_flags;
+
+       /* set the weight and the flags */
+       atomic_set(&dest->weight, udest->weight);
+       conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
+
+       /* check if local node and update the flags */
+#ifdef CONFIG_IP_VS_IPV6
+       if (svc->af == AF_INET6) {
+               if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
+                       conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
+                               | IP_VS_CONN_F_LOCALNODE;
+               }
+       } else  /* the IPv4 check below is the else-branch when IPv6 is compiled in */
+#endif
+               if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
+                       conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
+                               | IP_VS_CONN_F_LOCALNODE;
+               }
+
+       /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
+       if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
+               conn_flags |= IP_VS_CONN_F_NOOUTPUT;
+       } else {
+               /*
+                *    Put the real service in ip_vs_rtable if not present.
+                *    For now only for NAT!
+                */
+               write_lock_bh(&__ip_vs_rs_lock);
+               ip_vs_rs_hash(dest);    /* returns 0 without linking if already hashed */
+               write_unlock_bh(&__ip_vs_rs_lock);
+       }
+       atomic_set(&dest->conn_flags, conn_flags);
+
+       /* bind the service */
+       if (!dest->svc) {
+               __ip_vs_bind_svc(dest, svc);
+       } else {
+               if (dest->svc != svc) { /* dest was bound to another service: move it and reset its stats */
+                       __ip_vs_unbind_svc(dest);
+                       ip_vs_zero_stats(&dest->stats);
+                       __ip_vs_bind_svc(dest, svc);
+               }
+       }
+
+       /* set the dest status flags */
+       dest->flags |= IP_VS_DEST_F_AVAILABLE;
+
+       /* clear OVERLOAD when the new upper threshold is 0 or above the old one */
+       if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
+               dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+       dest->u_threshold = udest->u_threshold;
+       dest->l_threshold = udest->l_threshold;
+}
+
+
+/*
+ *     Create a destination for the given service
+ *
+ *     Validates the address type of udest->addr, allocates a zeroed
+ *     ip_vs_dest (GFP_ATOMIC), copies the service's identity
+ *     (af, protocol, virtual addr/port, fwmark) and the real addr/port
+ *     into it, applies the user settings through __ip_vs_update_dest()
+ *     and starts an estimator for its stats.
+ *
+ *     On success returns 0 and stores the new dest in *dest_p; its refcnt
+ *     is still 0 and it is not yet on any destination list.
+ *     Returns -EINVAL for an unacceptable address, -ENOMEM if the
+ *     allocation fails; *dest_p is not written in either case.
+ */
+static int
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
+              struct ip_vs_dest **dest_p)
+{
+       struct ip_vs_dest *dest;
+       unsigned atype;
+
+       EnterFunction(2);
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (svc->af == AF_INET6) {
+               atype = ipv6_addr_type(&udest->addr.in6);
+               if ((!(atype & IPV6_ADDR_UNICAST) ||    /* reject non-unicast or link-local ... */
+                       atype & IPV6_ADDR_LINKLOCAL) &&
+                       !__ip_vs_addr_is_local_v6(&udest->addr.in6))    /* ... unless the address is local */
+                       return -EINVAL;
+       } else
+#endif
+       {
+               atype = inet_addr_type(&init_net, udest->addr.ip);
+               if (atype != RTN_LOCAL && atype != RTN_UNICAST) /* IPv4: only local or unicast addresses */
+                       return -EINVAL;
+       }
+
+       dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
+       if (dest == NULL) {
+               IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
+               return -ENOMEM;
+       }
+
+       dest->af = svc->af;
+       dest->protocol = svc->protocol;
+       dest->vaddr = svc->addr;
+       dest->vport = svc->port;
+       dest->vfwmark = svc->fwmark;
+       ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
+       dest->port = udest->port;
+
+       atomic_set(&dest->activeconns, 0);
+       atomic_set(&dest->inactconns, 0);
+       atomic_set(&dest->persistconns, 0);
+       atomic_set(&dest->refcnt, 0);   /* ip_vs_add_dest() takes the first reference */
+
+       INIT_LIST_HEAD(&dest->d_list);  /* empty d_list is what ip_vs_rs_hash() tests for */
+       spin_lock_init(&dest->dst_lock);
+       spin_lock_init(&dest->stats.lock);
+       __ip_vs_update_dest(svc, dest, udest);
+       ip_vs_new_estimator(&dest->stats);
+
+       *dest_p = dest;
+
+       LeaveFunction(2);
+       return 0;
+}
+
+
+/*
+ *     Add a destination into an existing service
+ *
+ *     The destination is taken from the trash when a matching entry from
+ *     the same service is still there; otherwise a new one is allocated
+ *     with ip_vs_new_dest().  Either way it is then linked into
+ *     svc->destinations under the write side of __ip_vs_svc_lock, after
+ *     waiting for all other users of the service to go away, and the
+ *     scheduler's update_service hook (if any) is called.
+ *
+ *     Returns 0 on success, -ERANGE for a negative weight or a lower
+ *     threshold above the upper one, -EEXIST if the service already has
+ *     this destination, or the error returned by ip_vs_new_dest().
+ */
+static int
+ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+       struct ip_vs_dest *dest;
+       union nf_inet_addr daddr;
+       __be16 dport = udest->port;
+       int ret;
+
+       EnterFunction(2);
+
+       if (udest->weight < 0) {
+               IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
+               return -ERANGE;
+       }
+
+       if (udest->l_threshold > udest->u_threshold) {
+               IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
+                         "upper threshold\n");
+               return -ERANGE;
+       }
+
+       ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
+
+       /*
+        * Check if the dest already exists in the list
+        */
+       dest = ip_vs_lookup_dest(svc, &daddr, dport);
+
+       if (dest != NULL) {
+               IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
+               return -EEXIST;
+       }
+
+       /*
+        * Check if the dest already exists in the trash and
+        * is from the same service
+        */
+       dest = ip_vs_trash_get_dest(svc, &daddr, dport);
+
+       if (dest != NULL) {
+               IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
+                             "dest->refcnt=%d, service %u/%s:%u\n",
+                             IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
+                             atomic_read(&dest->refcnt),
+                             dest->vfwmark,
+                             IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
+                             ntohs(dest->vport));
+
+               __ip_vs_update_dest(svc, dest, udest);
+
+               /*
+                * Get the destination from the trash; the reference the
+                * trash held on it becomes the service list's reference.
+                */
+               list_del(&dest->n_list);
+
+               ip_vs_new_estimator(&dest->stats);
+       } else {
+               /*
+                * Allocate and initialize the dest structure
+                */
+               ret = ip_vs_new_dest(svc, udest, &dest);
+               if (ret)
+                       return ret;
+
+               /* reference held by the service's destination list */
+               atomic_inc(&dest->refcnt);
+       }
+
+       /*
+        * Add the dest entry into the list.  This tail is shared by the
+        * trash and the fresh-allocation paths.
+        */
+       write_lock_bh(&__ip_vs_svc_lock);
+
+       /*
+        * Wait until all other svc users go away.
+        */
+       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+       list_add(&dest->n_list, &svc->destinations);
+       svc->num_dests++;
+
+       /* call the update_service function of its scheduler */
+       if (svc->scheduler->update_service)
+               svc->scheduler->update_service(svc);
+
+       write_unlock_bh(&__ip_vs_svc_lock);
+
+       LeaveFunction(2);
+
+       return 0;
+}
+
+
+/*
+ *     Edit a destination in the given service
+ *
+ *     Looks the destination up by udest's {addr,port}, applies the new
+ *     settings with __ip_vs_update_dest() and then lets the scheduler
+ *     refresh itself under the write side of __ip_vs_svc_lock.
+ *     Returns 0 on success, -ERANGE for a negative weight or a lower
+ *     threshold above the upper one, -ENOENT if the service has no such
+ *     destination.
+ */
+static int
+ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+       union nf_inet_addr daddr;
+       struct ip_vs_dest *dest;
+
+       EnterFunction(2);
+
+       if (udest->weight < 0) {
+               IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
+               return -ERANGE;
+       }
+
+       if (udest->l_threshold > udest->u_threshold) {
+               IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
+                         "upper threshold\n");
+               return -ERANGE;
+       }
+
+       /* search the service's destination list */
+       ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
+       dest = ip_vs_lookup_dest(svc, &daddr, udest->port);
+       if (!dest) {
+               IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
+               return -ENOENT;
+       }
+
+       __ip_vs_update_dest(svc, dest, udest);
+
+       write_lock_bh(&__ip_vs_svc_lock);
+
+       /* no other user of the service may be active past this point */
+       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+       /* the server weight may have changed: tell the scheduler */
+       if (svc->scheduler->update_service)
+               svc->scheduler->update_service(svc);
+
+       write_unlock_bh(&__ip_vs_svc_lock);
+
+       LeaveFunction(2);
+
+       return 0;
+}
+
+
+/*
+ *     Delete a destination (must be already unlinked from the service)
+ *
+ *     Stops the dest's estimator and removes it from ip_vs_rtable.  The
+ *     dest is freed only if dropping one reference brings its refcnt to
+ *     zero; otherwise it is parked on ip_vs_dest_trash, which takes a
+ *     reference of its own.
+ */
+static void __ip_vs_del_dest(struct ip_vs_dest *dest)
+{
+       ip_vs_kill_estimator(&dest->stats);
+
+       /*
+        *  Remove it from the d-linked list with the real services.
+        */
+       write_lock_bh(&__ip_vs_rs_lock);
+       ip_vs_rs_unhash(dest);
+       write_unlock_bh(&__ip_vs_rs_lock);
+
+       /*
+        *  Decrease the refcnt of the dest, and free the dest
+        *  if nobody refers to it (refcnt=0). Otherwise, throw
+        *  the destination into the trash.
+        */
+       if (atomic_dec_and_test(&dest->refcnt)) {
+               ip_vs_dst_reset(dest);
+               /* simply decrease svc->refcnt here, let the caller check
+                  and release the service if nobody refers to it.
+                  Only user context can release destination and service,
+                  and only one user context can update virtual service at a
+                  time, so the operation here is OK */
+               atomic_dec(&dest->svc->refcnt);
+               kfree(dest);
+       } else {
+               IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
+                             "dest->refcnt=%d\n",
+                             IP_VS_DBG_ADDR(dest->af, &dest->addr),
+                             ntohs(dest->port),
+                             atomic_read(&dest->refcnt));
+               list_add(&dest->n_list, &ip_vs_dest_trash);
+               atomic_inc(&dest->refcnt);      /* the trash list's own reference */
+       }
+}
+
+
+/*
+ *     Unlink a destination from the given service
+ *
+ *     Marks the dest unavailable, takes it off the service's destination
+ *     list and adjusts num_dests.  When 'svcupd' is non-zero the
+ *     scheduler's update_service hook, if it has one, is called.
+ */
+static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
+                               struct ip_vs_dest *dest,
+                               int svcupd)
+{
+       dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
+
+       /* drop it from the service's destination list */
+       list_del(&dest->n_list);
+       svc->num_dests--;
+
+       /* let the scheduler know, if the caller asked for it */
+       if (svcupd) {
+               if (svc->scheduler->update_service)
+                       svc->scheduler->update_service(svc);
+       }
+}
+
+
+/*
+ *     Delete a destination server in the given service
+ *
+ *     The destination is unlinked from the service under the write side
+ *     of __ip_vs_svc_lock, after all other users of the service have gone
+ *     away, and is deleted once the lock has been released.
+ *     Returns 0 on success, -ENOENT if the service has no such destination.
+ */
+static int
+ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+       struct ip_vs_dest *dest;
+
+       EnterFunction(2);
+
+       dest = ip_vs_lookup_dest(svc, &udest->addr, udest->port);
+       if (!dest) {
+               IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
+               return -ENOENT;
+       }
+
+       write_lock_bh(&__ip_vs_svc_lock);
+
+       /* no other user of the service may be active past this point */
+       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+       /* take the dest off the service and refresh the scheduler */
+       __ip_vs_unlink_dest(svc, dest, 1);
+
+       write_unlock_bh(&__ip_vs_svc_lock);
+
+       /* free it, or move it to the trash if it is still referenced */
+       __ip_vs_del_dest(dest);
+
+       LeaveFunction(2);
+
+       return 0;
+}
+
+
+/*
+ *     Add a service into the service hash table
+ *
+ *     Takes a module reference, looks up the scheduler named in
+ *     u->sched_name, allocates and fills a new ip_vs_service from 'u',
+ *     binds the scheduler, starts a stats estimator and hashes the
+ *     service into the service table.
+ *
+ *     On success returns 0 and stores the service in *svc_p; the module
+ *     reference stays held on behalf of the service.
+ *     On failure every resource acquired here is released again and one
+ *     of the following is returned:
+ *       -ENOPROTOOPT   the module reference could not be taken
+ *       -ENOENT        scheduler not found
+ *       -EAFNOSUPPORT  IPv6 service, scheduler without IPv6 support
+ *       -EINVAL        IPv6 netmask outside 1..128
+ *       -ENOMEM        allocation failed
+ *       or the error from ip_vs_bind_scheduler().
+ */
+static int
+ip_vs_add_service(struct ip_vs_service_user_kern *u,
+                 struct ip_vs_service **svc_p)
+{
+       int ret = 0;
+       struct ip_vs_scheduler *sched = NULL;
+       struct ip_vs_service *svc = NULL;
+
+       /*
+        * increase the module use count; without the reference nothing
+        * may be put later on, so bail out before touching anything else
+        */
+       if (!ip_vs_use_count_inc())
+               return -ENOPROTOOPT;
+
+       /* Lookup the scheduler by 'u->sched_name' */
+       sched = ip_vs_scheduler_get(u->sched_name);
+       if (sched == NULL) {
+               IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
+                          u->sched_name);
+               ret = -ENOENT;
+               goto out_mod_dec;
+       }
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (u->af == AF_INET6) {
+               if (!sched->supports_ipv6) {
+                       ret = -EAFNOSUPPORT;
+                       goto out_err;
+               }
+               if ((u->netmask < 1) || (u->netmask > 128)) {
+                       ret = -EINVAL;
+                       goto out_err;
+               }
+       }
+#endif
+
+       svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
+       if (svc == NULL) {
+               IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
+               ret = -ENOMEM;
+               goto out_err;
+       }
+
+       /* I'm the first user of the service */
+       atomic_set(&svc->usecnt, 1);
+       atomic_set(&svc->refcnt, 0);
+
+       svc->af = u->af;
+       svc->protocol = u->protocol;
+       ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
+       svc->port = u->port;
+       svc->fwmark = u->fwmark;
+       svc->flags = u->flags;
+       svc->timeout = u->timeout * HZ;
+       svc->netmask = u->netmask;
+
+       INIT_LIST_HEAD(&svc->destinations);
+       rwlock_init(&svc->sched_lock);
+       spin_lock_init(&svc->stats.lock);
+
+       /* Bind the scheduler */
+       ret = ip_vs_bind_scheduler(svc, sched);
+       if (ret)
+               goto out_err;
+       /* the bound service now accounts for the scheduler reference */
+       sched = NULL;
+
+       /* Update the virtual service counters */
+       if (svc->port == FTPPORT)
+               atomic_inc(&ip_vs_ftpsvc_counter);
+       else if (svc->port == 0)
+               atomic_inc(&ip_vs_nullsvc_counter);
+
+       ip_vs_new_estimator(&svc->stats);
+
+       /* Count only IPv4 services for old get/setsockopt interface */
+       if (svc->af == AF_INET)
+               ip_vs_num_services++;
+
+       /* Hash the service into the service table */
+       write_lock_bh(&__ip_vs_svc_lock);
+       ip_vs_svc_hash(svc);
+       write_unlock_bh(&__ip_vs_svc_lock);
+
+       *svc_p = svc;
+       return 0;
+
+  out_err:
+       if (svc != NULL) {
+               if (svc->scheduler)
+                       ip_vs_unbind_scheduler(svc);
+               if (svc->inc) {
+                       local_bh_disable();
+                       ip_vs_app_inc_put(svc->inc);
+                       local_bh_enable();
+               }
+               kfree(svc);
+       }
+       ip_vs_scheduler_put(sched);
+
+  out_mod_dec:
+       /* decrease the module use count */
+       ip_vs_use_count_dec();
+
+       return ret;
+}
+
+
+/*
+ *     Edit a service and bind it with a new scheduler
+ *
+ *     Updates the service's flags, timeout and netmask from 'u' and, if
+ *     the scheduler named in u->sched_name differs from the one currently
+ *     bound, swaps schedulers under the write side of __ip_vs_svc_lock.
+ *
+ *     'old_sched' always names the scheduler whose reference is dropped
+ *     on exit: the replaced scheduler after a successful swap, otherwise
+ *     the one obtained by the lookup at the top.
+ *
+ *     Returns 0 on success, -ENOENT if the scheduler is not found,
+ *     -EAFNOSUPPORT / -EINVAL for the IPv6 checks, or the error from
+ *     unbinding/binding a scheduler.  Note that flags, timeout and
+ *     netmask have already been updated when a scheduler swap fails.
+ */
+static int
+ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
+{
+       struct ip_vs_scheduler *sched, *old_sched;
+       int ret = 0;
+
+       /*
+        * Lookup the scheduler, by 'u->sched_name'
+        */
+       sched = ip_vs_scheduler_get(u->sched_name);
+       if (sched == NULL) {
+               IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
+                          u->sched_name);
+               return -ENOENT;
+       }
+       old_sched = sched;      /* early error exits must put the scheduler just looked up */
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (u->af == AF_INET6) {
+               if (!sched->supports_ipv6) {
+                       ret = -EAFNOSUPPORT;
+                       goto out;
+               }
+               if ((u->netmask < 1) || (u->netmask > 128)) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+#endif
+
+       write_lock_bh(&__ip_vs_svc_lock);
+
+       /*
+        * Wait until all other svc users go away.
+        */
+       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+       /*
+        * Set the flags and timeout value
+        * (IP_VS_SVC_F_HASHED is OR-ed back in so the user-supplied
+        * flags cannot clear it)
+        */
+       svc->flags = u->flags | IP_VS_SVC_F_HASHED;
+       svc->timeout = u->timeout * HZ;
+       svc->netmask = u->netmask;
+
+       old_sched = svc->scheduler;     /* if unchanged, this equals sched and the extra lookup reference is put below */
+       if (sched != old_sched) {
+               /*
+                * Unbind the old scheduler
+                */
+               if ((ret = ip_vs_unbind_scheduler(svc))) {
+                       old_sched = sched;      /* swap failed: put the new scheduler instead */
+                       goto out_unlock;
+               }
+
+               /*
+                * Bind the new scheduler
+                */
+               if ((ret = ip_vs_bind_scheduler(svc, sched))) {
+                       /*
+                        * If ip_vs_bind_scheduler fails, restore the old
+                        * scheduler.
+                        * The main reason of failure is out of memory.
+                        *
+                        * The question is if the old scheduler can be
+                        * restored all the time. TODO: if it cannot be
+                        * restored some time, we must delete the service,
+                        * otherwise the system may crash.
+                        */
+                       ip_vs_bind_scheduler(svc, old_sched);   /* result is not checked, see TODO above */
+                       old_sched = sched;      /* swap failed: put the new scheduler instead */
+                       goto out_unlock;
+               }
+       }
+
+  out_unlock:
+       write_unlock_bh(&__ip_vs_svc_lock);
+#ifdef CONFIG_IP_VS_IPV6
+  out: /* label only exists when the IPv6 checks that jump to it are compiled in */
+#endif
+
+       if (old_sched)
+               ip_vs_scheduler_put(old_sched);
+
+       return ret;
+}
+
+
+/*
+ *     Delete a service from the service list
+ *     - The service must be unlinked, unlocked and not referenced!
+ *     - We are called under _bh lock
+ *
+ *     Teardown order: adjust the IPv4 service count, kill the stats
+ *     estimator, unbind and release the scheduler, release the bound
+ *     app inc, unlink and delete every destination, adjust the
+ *     FTP/null-port service counters, free the service structure if
+ *     its refcnt is zero, and finally drop one module use count.
+ */
+static void __ip_vs_del_service(struct ip_vs_service *svc)
+{
+       struct ip_vs_dest *dest, *nxt;
+       struct ip_vs_scheduler *old_sched;
+
+       /* Count only IPv4 services for old get/setsockopt interface */
+       if (svc->af == AF_INET)
+               ip_vs_num_services--;
+
+       ip_vs_kill_estimator(&svc->stats);
+
+       /*
+        * Unbind scheduler.  The pointer is sampled before the unbind so
+        * that the reference can still be released afterwards.
+        */
+       old_sched = svc->scheduler;
+       ip_vs_unbind_scheduler(svc);
+       if (old_sched)
+               ip_vs_scheduler_put(old_sched);
+
+       /* Unbind app inc */
+       if (svc->inc) {
+               ip_vs_app_inc_put(svc->inc);
+               svc->inc = NULL;
+       }
+
+       /*
+        *    Unlink the whole destination list
+        *    (the _safe iterator is used because entries are unlinked
+        *    while walking)
+        */
+       list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
+               __ip_vs_unlink_dest(svc, dest, 0);
+               __ip_vs_del_dest(dest);
+       }
+
+       /*
+        *    Update the virtual service counters
+        */
+       if (svc->port == FTPPORT)
+               atomic_dec(&ip_vs_ftpsvc_counter);
+       else if (svc->port == 0)
+               atomic_dec(&ip_vs_nullsvc_counter);
+
+       /*
+        *    Free the service if nobody refers to it
+        *    NOTE(review): when refcnt is non-zero the structure is not
+        *    freed here; presumably the last reference holder frees it -
+        *    confirm.
+        */
+       if (atomic_read(&svc->refcnt) == 0)
+               kfree(svc);
+
+       /* decrease the module use count */
+       ip_vs_use_count_dec();
+}
+
+/*
+ *     Unhash a virtual service and destroy it.
+ *     Returns -EEXIST when no service is given, 0 otherwise.
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+       if (!svc)
+               return -EEXIST;
+
+       write_lock_bh(&__ip_vs_svc_lock);
+
+       /* Remove it from the service hash table */
+       ip_vs_svc_unhash(svc);
+
+       /* Wait until the use count has dropped to at most one */
+       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+       __ip_vs_del_service(svc);
+
+       write_unlock_bh(&__ip_vs_svc_lock);
+       return 0;
+}
+
+
+/*
+ *     Unhash one service, wait until its use count has dropped to zero
+ *     and delete it, all under the service table write lock.
+ */
+static void ip_vs_flush_one(struct ip_vs_service *svc)
+{
+       write_lock_bh(&__ip_vs_svc_lock);
+       ip_vs_svc_unhash(svc);
+       IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+       __ip_vs_del_service(svc);
+       write_unlock_bh(&__ip_vs_svc_lock);
+}
+
+/*
+ *     Flush all the virtual services
+ *     Always returns 0.
+ */
+static int ip_vs_flush(void)
+{
+       struct ip_vs_service *cur, *tmp;
+       int bucket;
+
+       /* First the table hashed by <protocol,addr,port> ... */
+       for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++)
+               list_for_each_entry_safe(cur, tmp,
+                                        &ip_vs_svc_table[bucket], s_list)
+                       ip_vs_flush_one(cur);
+
+       /* ... then the table hashed by fwmark */
+       for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++)
+               list_for_each_entry_safe(cur, tmp,
+                                        &ip_vs_svc_fwm_table[bucket], f_list)
+                       ip_vs_flush_one(cur);
+
+       return 0;
+}
+
+
+/*
+ *     Zero counters in a service or all services
+ */
+static int ip_vs_zero_service(struct ip_vs_service *svc)
+{
+       struct ip_vs_dest *d;
+
+       write_lock_bh(&__ip_vs_svc_lock);
+
+       /* per-destination statistics first, then the service's own */
+       list_for_each_entry(d, &svc->destinations, n_list)
+               ip_vs_zero_stats(&d->stats);
+       ip_vs_zero_stats(&svc->stats);
+
+       write_unlock_bh(&__ip_vs_svc_lock);
+
+       return 0;
+}
+
+/* Zero the counters of every hashed service and the global ip_vs_stats. */
+static int ip_vs_zero_all(void)
+{
+       struct ip_vs_service *cur;
+       int bucket;
+
+       /* services in the <protocol,addr,port> hash table */
+       for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++)
+               list_for_each_entry(cur, &ip_vs_svc_table[bucket], s_list)
+                       ip_vs_zero_service(cur);
+
+       /* services in the fwmark hash table */
+       for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++)
+               list_for_each_entry(cur, &ip_vs_svc_fwm_table[bucket], f_list)
+                       ip_vs_zero_service(cur);
+
+       ip_vs_zero_stats(&ip_vs_stats);
+
+       return 0;
+}
+
+
+/*
+ *     sysctl handler for integer settings whose accepted range is 0..3.
+ *     A write that changes the value to something outside that range is
+ *     undone; a write that changes it to an in-range value triggers
+ *     update_defense_level().  Returns the result of proc_dointvec().
+ */
+static int
+proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
+                    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       int *valp = table->data;
+       int saved = *valp;
+       int ret;
+
+       ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+
+       /* reads, and writes that left the value unchanged, need no action */
+       if (!write || *valp == saved)
+               return ret;
+
+       if (*valp < 0 || *valp > 3)
+               *valp = saved;          /* out of range: roll back */
+       else
+               update_defense_level();
+
+       return ret;
+}
+
+
+/*
+ *     sysctl handler for a pair of ints.  After a write the pair is kept
+ *     only if 0 <= valp[0] < valp[1]; otherwise the previous pair is put
+ *     back.  Returns the result of proc_dointvec().
+ */
+static int
+proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
+                      void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       int *valp = table->data;
+       int saved[2];
+       int ret;
+
+       /* snapshot both values so a rejected write can be undone */
+       memcpy(saved, valp, sizeof(saved));
+
+       ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       if (!write)
+               return ret;
+
+       if (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])
+               memcpy(valp, saved, sizeof(saved));
+
+       return ret;
+}
+
+
+/*
+ *     IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ *
+ *     Every entry is mode 0644.  drop_entry, drop_packet and secure_tcp
+ *     are handled by proc_do_defense_mode, sync_threshold by
+ *     proc_do_sync_threshold (its maxlen is the size of the whole
+ *     sysctl_ip_vs_sync_threshold object), everything else by
+ *     proc_dointvec.  debug_level exists only with CONFIG_IP_VS_DEBUG.
+ *     The timeout_* entries sit inside "#if 0" and are not compiled.
+ *     The table is terminated by an entry with ctl_name 0.
+ */
+
+static struct ctl_table vs_vars[] = {
+       {
+               .procname       = "amemthresh",
+               .data           = &sysctl_ip_vs_amemthresh,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+#ifdef CONFIG_IP_VS_DEBUG
+       {
+               .procname       = "debug_level",
+               .data           = &sysctl_ip_vs_debug_level,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+#endif
+       {
+               .procname       = "am_droprate",
+               .data           = &sysctl_ip_vs_am_droprate,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       {
+               .procname       = "drop_entry",
+               .data           = &sysctl_ip_vs_drop_entry,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_do_defense_mode,
+       },
+       {
+               .procname       = "drop_packet",
+               .data           = &sysctl_ip_vs_drop_packet,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_do_defense_mode,
+       },
+       {
+               .procname       = "secure_tcp",
+               .data           = &sysctl_ip_vs_secure_tcp,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_do_defense_mode,
+       },
+#if 0
+       {
+               .procname       = "timeout_established",
+               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+       {
+               .procname       = "timeout_synsent",
+               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+       {
+               .procname       = "timeout_synrecv",
+               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+       {
+               .procname       = "timeout_finwait",
+               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+       {
+               .procname       = "timeout_timewait",
+               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+       {
+               .procname       = "timeout_close",
+               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+       {
+               .procname       = "timeout_closewait",
+               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+       {
+               .procname       = "timeout_lastack",
+               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+       {
+               .procname       = "timeout_listen",
+               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+       {
+               .procname       = "timeout_synack",
+               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+       {
+               .procname       = "timeout_udp",
+               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+       {
+               .procname       = "timeout_icmp",
+               .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+#endif
+       {
+               .procname       = "cache_bypass",
+               .data           = &sysctl_ip_vs_cache_bypass,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       {
+               .procname       = "expire_nodest_conn",
+               .data           = &sysctl_ip_vs_expire_nodest_conn,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       {
+               .procname       = "expire_quiescent_template",
+               .data           = &sysctl_ip_vs_expire_quiescent_template,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       {
+               .procname       = "sync_threshold",
+               .data           = &sysctl_ip_vs_sync_threshold,
+               .maxlen         = sizeof(sysctl_ip_vs_sync_threshold),
+               .mode           = 0644,
+               .proc_handler   = &proc_do_sync_threshold,
+       },
+       {
+               .procname       = "nat_icmp_send",
+               .data           = &sysctl_ip_vs_nat_icmp_send,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       { .ctl_name = 0 }
+};
+
+/*
+ *     sysctl directory path net -> ipv4 -> vs, terminated by an empty
+ *     entry.  Exported (GPL-only) so that code outside this file can
+ *     reference the same path.
+ */
+const struct ctl_path net_vs_ctl_path[] = {
+       { .procname = "net", .ctl_name = CTL_NET, },
+       { .procname = "ipv4", .ctl_name = NET_IPV4, },
+       { .procname = "vs", },
+       { }
+};
+EXPORT_SYMBOL_GPL(net_vs_ctl_path);
+
+/* NOTE(review): presumably holds the sysctl registration handle - confirm */
+static struct ctl_table_header * sysctl_header;
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ *     Per-open cursor for the service listing seq_file: remembers where
+ *     the entry most recently returned by the iterator was found.
+ */
+struct ip_vs_iter {
+       struct list_head *table;        /* ip_vs_svc_table or ip_vs_svc_fwm_table */
+       int bucket;                     /* bucket index inside that table */
+};
+
+/*
+ *     Write the contents of the VS rule table to a PROCfs file.
+ *     (It is kept just for backward compatibility)
+ */
+static inline const char *ip_vs_fwd_name(unsigned flags)
+{
+       /* only the forwarding-method bits of the connection flags matter */
+       const unsigned fwd = flags & IP_VS_CONN_F_FWD_MASK;
+
+       if (fwd == IP_VS_CONN_F_LOCALNODE)
+               return "Local";
+       if (fwd == IP_VS_CONN_F_TUNNEL)
+               return "Tunnel";
+       if (fwd == IP_VS_CONN_F_DROUTE)
+               return "Route";
+
+       /* any other value is reported as "Masq" */
+       return "Masq";
+}
+
+
+/*
+ *     Return the service at zero-based position 'pos', counting through
+ *     the protocol hash table first and the fwmark hash table second.
+ *     The table and bucket of the hit are stored in the seq_file's
+ *     private iterator.  Returns NULL when 'pos' is past the last entry.
+ */
+static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
+{
+       struct ip_vs_iter *iter = seq->private;
+       struct ip_vs_service *cur;
+       int b;
+
+       /* services hashed by protocol */
+       for (b = 0; b < IP_VS_SVC_TAB_SIZE; b++) {
+               list_for_each_entry(cur, &ip_vs_svc_table[b], s_list) {
+                       if (pos != 0) {
+                               pos--;
+                               continue;
+                       }
+                       iter->table = ip_vs_svc_table;
+                       iter->bucket = b;
+                       return cur;
+               }
+       }
+
+       /* not there: carry on counting through the fwmark table */
+       for (b = 0; b < IP_VS_SVC_TAB_SIZE; b++) {
+               list_for_each_entry(cur, &ip_vs_svc_fwm_table[b], f_list) {
+                       if (pos != 0) {
+                               pos--;
+                               continue;
+                       }
+                       iter->table = ip_vs_svc_fwm_table;
+                       iter->bucket = b;
+                       return cur;
+               }
+       }
+
+       return NULL;
+}
+
+/*
+ *     seq_file ->start: take the service table read lock and return the
+ *     element for *pos (position 0 is the start token, position N > 0 is
+ *     service number N - 1).
+ */
+static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
+__acquires(__ip_vs_svc_lock)
+{
+       read_lock_bh(&__ip_vs_svc_lock);
+
+       if (*pos == 0)
+               return SEQ_START_TOKEN;
+
+       return ip_vs_info_array(seq, *pos - 1);
+}
+
+
+/*
+ *     seq_file ->next: advance *pos and return the service following 'v',
+ *     or NULL at the end.  Uses the table/bucket cursor kept in
+ *     seq->private: first the rest of the current bucket, then later
+ *     buckets of the same table, and after the protocol table the
+ *     fwmark table from its first bucket.
+ */
+static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       struct list_head *e;
+       struct ip_vs_iter *iter;
+       struct ip_vs_service *svc;
+
+       ++*pos;
+       /* after the start token comes the very first service */
+       if (v == SEQ_START_TOKEN)
+               return ip_vs_info_array(seq,0);
+
+       svc = v;
+       iter = seq->private;
+
+       if (iter->table == ip_vs_svc_table) {
+               /* next service in table hashed by protocol */
+               if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
+                       return list_entry(e, struct ip_vs_service, s_list);
+
+
+               /* bucket exhausted: return the first entry of the next
+                * non-empty bucket (the loop body runs at most once) */
+               while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+                       list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
+                                           s_list) {
+                               return svc;
+                       }
+               }
+
+               /* protocol table done; -1 so the pre-increment below
+                * starts the fwmark scan at bucket 0 */
+               iter->table = ip_vs_svc_fwm_table;
+               iter->bucket = -1;
+               goto scan_fwmark;
+       }
+
+       /* next service in the table hashed by fwmark */
+       if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
+               return list_entry(e, struct ip_vs_service, f_list);
+
+ scan_fwmark:
+       /* first entry of the next non-empty fwmark bucket, if any */
+       while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+               list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
+                                   f_list)
+                       return svc;
+       }
+
+       return NULL;
+}
+
+/* seq_file ->stop: drop the read lock taken in ip_vs_info_seq_start(). */
+static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
+__releases(__ip_vs_svc_lock)
+{
+       read_unlock_bh(&__ip_vs_svc_lock);
+}
+
+
+/*
+ *     seq_file ->show.  For the start token print the version banner and
+ *     the two column-header lines.  For a service print one line
+ *     (protocol/address/port or "FWM" and fwmark, then scheduler name and,
+ *     for persistent services, timeout and netmask) followed by one line
+ *     per destination.  Always returns 0.
+ */
+static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
+{
+       if (v == SEQ_START_TOKEN) {
+               seq_printf(seq,
+                       "IP Virtual Server version %d.%d.%d (size=%d)\n",
+                       NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+               seq_puts(seq,
+                        "Prot LocalAddress:Port Scheduler Flags\n");
+               seq_puts(seq,
+                        "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
+       } else {
+               const struct ip_vs_service *svc = v;
+               const struct ip_vs_iter *iter = seq->private;
+               const struct ip_vs_dest *dest;
+
+               /* the iterator's table tells which kind of service this is */
+               if (iter->table == ip_vs_svc_table) {
+#ifdef CONFIG_IP_VS_IPV6
+                       if (svc->af == AF_INET6)
+                               seq_printf(seq, "%s  [" NIP6_FMT "]:%04X %s ",
+                                          ip_vs_proto_name(svc->protocol),
+                                          NIP6(svc->addr.in6),
+                                          ntohs(svc->port),
+                                          svc->scheduler->name);
+                       else
+#endif
+                               seq_printf(seq, "%s  %08X:%04X %s ",
+                                          ip_vs_proto_name(svc->protocol),
+                                          ntohl(svc->addr.ip),
+                                          ntohs(svc->port),
+                                          svc->scheduler->name);
+               } else {
+                       seq_printf(seq, "FWM  %08X %s ",
+                                  svc->fwmark, svc->scheduler->name);
+               }
+
+               /*
+                * NOTE(review): for AF_INET6 services the netmask is
+                * validated as a 1..128 value in ip_vs_edit_service(), so
+                * the ntohl() below looks IPv4-specific - confirm.
+                */
+               if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+                       seq_printf(seq, "persistent %d %08X\n",
+                               svc->timeout,
+                               ntohl(svc->netmask));
+               else
+                       seq_putc(seq, '\n');
+
+               /* one "  -> ..." line per destination of this service */
+               list_for_each_entry(dest, &svc->destinations, n_list) {
+#ifdef CONFIG_IP_VS_IPV6
+                       if (dest->af == AF_INET6)
+                               seq_printf(seq,
+                                          "  -> [" NIP6_FMT "]:%04X"
+                                          "      %-7s %-6d %-10d %-10d\n",
+                                          NIP6(dest->addr.in6),
+                                          ntohs(dest->port),
+                                          ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+                                          atomic_read(&dest->weight),
+                                          atomic_read(&dest->activeconns),
+                                          atomic_read(&dest->inactconns));
+                       else
+#endif
+                               seq_printf(seq,
+                                          "  -> %08X:%04X      "
+                                          "%-7s %-6d %-10d %-10d\n",
+                                          ntohl(dest->addr.ip),
+                                          ntohs(dest->port),
+                                          ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+                                          atomic_read(&dest->weight),
+                                          atomic_read(&dest->activeconns),
+                                          atomic_read(&dest->inactconns));
+
+               }
+       }
+       return 0;
+}
+
+/* seq_file callbacks for the service listing; start locks, stop unlocks. */
+static const struct seq_operations ip_vs_info_seq_ops = {
+       .start = ip_vs_info_seq_start,
+       .next  = ip_vs_info_seq_next,
+       .stop  = ip_vs_info_seq_stop,
+       .show  = ip_vs_info_seq_show,
+};
+
+/*
+ *     Open handler: set up the seq_file with ip_vs_info_seq_ops and a
+ *     private area of sizeof(struct ip_vs_iter) bytes, which the
+ *     callbacks use as their cursor via seq->private.
+ */
+static int ip_vs_info_open(struct inode *inode, struct file *file)
+{
+       return seq_open_private(file, &ip_vs_info_seq_ops,
+                       sizeof(struct ip_vs_iter));
+}
+
+/*
+ *     File operations for the service listing.  release is
+ *     seq_release_private to match the seq_open_private() call in
+ *     ip_vs_info_open().
+ */
+static const struct file_operations ip_vs_info_fops = {
+       .owner   = THIS_MODULE,
+       .open    = ip_vs_info_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release_private,
+};
+
+#endif
+
+/*
+ *     Global IPVS statistics object.  Only the spinlock gets an explicit
+ *     initializer; the remaining members are zero-initialized.
+ */
+struct ip_vs_stats ip_vs_stats = {
+       .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
+};
+
+#ifdef CONFIG_PROC_FS
+/*
+ *     Print the global statistics: a two-line header and the cumulative
+ *     totals, then a one-line header and the current rates, all in hex.
+ *     The values are read under ip_vs_stats.lock.  Always returns 0.
+ */
+static int ip_vs_stats_show(struct seq_file *seq, void *v)
+{
+       /* header for the totals; the ruler marks the column widths */
+/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
+       seq_puts(seq,
+                "   Total Incoming Outgoing         Incoming         Outgoing\n");
+       seq_puts(seq,
+                "   Conns  Packets  Packets            Bytes            Bytes\n");
+
+       spin_lock_bh(&ip_vs_stats.lock);
+
+       seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n",
+                  ip_vs_stats.ustats.conns,
+                  ip_vs_stats.ustats.inpkts,
+                  ip_vs_stats.ustats.outpkts,
+                  (unsigned long long) ip_vs_stats.ustats.inbytes,
+                  (unsigned long long) ip_vs_stats.ustats.outbytes);
+
+       /* header for the rates, same column layout */
+/*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+       seq_puts(seq,
+                " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
+       seq_printf(seq, "%8X %8X %8X %16X %16X\n",
+                  ip_vs_stats.ustats.cps,
+                  ip_vs_stats.ustats.inpps,
+                  ip_vs_stats.ustats.outpps,
+                  ip_vs_stats.ustats.inbps,
+                  ip_vs_stats.ustats.outbps);
+
+       spin_unlock_bh(&ip_vs_stats.lock);
+
+       return 0;
+}
+
+/* Open handler: single-record seq_file rendered by ip_vs_stats_show(). */
+static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, ip_vs_stats_show, NULL);
+}
+
+/*
+ *     File operations for the statistics file.  release is
+ *     single_release to match the single_open() call in
+ *     ip_vs_stats_seq_open().
+ */
+static const struct file_operations ip_vs_stats_fops = {
+       .owner = THIS_MODULE,
+       .open = ip_vs_stats_seq_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release,
+};
+
+#endif
+
+/* Largest timeout in seconds whose product with HZ still fits a signed int */
+#define IP_VS_MAX_SET_TIMEOUT  (0x7FFFFFFF / HZ)
+
+/*
+ *     Set timeout values for tcp tcpfin udp in the timeout_table.
+ *
+ *     Each value is given in seconds and stored multiplied by HZ; a
+ *     value of 0 leaves the corresponding timeout unchanged.  If any of
+ *     the three values is negative or larger than IP_VS_MAX_SET_TIMEOUT
+ *     (so that the multiplication by HZ would overflow), nothing is
+ *     changed and -EINVAL is returned.  Returns 0 on success.
+ */
+static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
+{
+       /* validate everything up front so a bad request changes nothing */
+       if (u->tcp_timeout < 0 || u->tcp_timeout > IP_VS_MAX_SET_TIMEOUT ||
+           u->tcp_fin_timeout < 0 ||
+           u->tcp_fin_timeout > IP_VS_MAX_SET_TIMEOUT ||
+           u->udp_timeout < 0 || u->udp_timeout > IP_VS_MAX_SET_TIMEOUT)
+               return -EINVAL;
+
+       IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
+                 u->tcp_timeout,
+                 u->tcp_fin_timeout,
+                 u->udp_timeout);
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+       if (u->tcp_timeout) {
+               ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
+                       = u->tcp_timeout * HZ;
+       }
+
+       if (u->tcp_fin_timeout) {
+               ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
+                       = u->tcp_fin_timeout * HZ;
+       }
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_UDP
+       if (u->udp_timeout) {
+               ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
+                       = u->udp_timeout * HZ;
+       }
+#endif
+       return 0;
+}
+
+
+/*
+ *     Argument sizes for the set-sockopt commands.  SET_CMDID() turns a
+ *     command number into a zero-based index by subtracting
+ *     IP_VS_BASE_CTL.  MAX_ARG_LEN is the service + destination size;
+ *     NOTE(review): this assumes no other argument structure is larger -
+ *     confirm.
+ */
+#define SET_CMDID(cmd)         (cmd - IP_VS_BASE_CTL)
+#define SERVICE_ARG_LEN                (sizeof(struct ip_vs_service_user))
+#define SVCDEST_ARG_LEN                (sizeof(struct ip_vs_service_user) +    \
+                                sizeof(struct ip_vs_dest_user))
+#define TIMEOUT_ARG_LEN                (sizeof(struct ip_vs_timeout_user))
+#define DAEMON_ARG_LEN         (sizeof(struct ip_vs_daemon_user))
+#define MAX_ARG_LEN            SVCDEST_ARG_LEN
+
+/*
+ *     Exact argument length required by each set command, indexed by
+ *     SET_CMDID(cmd).  Slots without an initializer are 0.
+ */
+static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
+       [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
+       [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
+       [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
+       [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
+       [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
+       [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
+       [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
+       [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
+       [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
+       [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
+       [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
+};
+
+/*
+ *     Fill the internal service description 'usvc' from the compat
+ *     structure 'usvc_compat'.  The address family is fixed to AF_INET
+ *     and the address goes into the .ip member.
+ */
+static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
+                                 struct ip_vs_service_user *usvc_compat)
+{
+       /* what identifies the service */
+       usvc->af = AF_INET;
+       usvc->addr.ip = usvc_compat->addr;
+       usvc->port = usvc_compat->port;
+       usvc->protocol = usvc_compat->protocol;
+       usvc->fwmark = usvc_compat->fwmark;
+
+       /* how it is configured */
+       usvc->netmask = usvc_compat->netmask;
+       usvc->timeout = usvc_compat->timeout;
+       usvc->flags = usvc_compat->flags;
+
+       /* the name is referenced in place rather than duplicated */
+       usvc->sched_name = usvc_compat->sched_name;
+}
+
+/*
+ *     Fill the internal destination description 'udest' from the compat
+ *     structure 'udest_compat'; the address goes into the .ip member.
+ */
+static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
+                                  struct ip_vs_dest_user *udest_compat)
+{
+       /* where the destination is */
+       udest->port = udest_compat->port;
+       udest->addr.ip = udest_compat->addr;
+
+       /* how it is used */
+       udest->weight = udest_compat->weight;
+       udest->conn_flags = udest_compat->conn_flags;
+       udest->l_threshold = udest_compat->l_threshold;
+       udest->u_threshold = udest_compat->u_threshold;
+}
+
+/*
+ *     Handler for the IPVS set-sockopt commands.
+ *
+ *     Requires CAP_NET_ADMIN (-EPERM otherwise).  'cmd' must lie in
+ *     [IP_VS_BASE_CTL, IP_VS_SO_SET_MAX] and 'len' must equal the entry
+ *     of set_arglen[] for that command (-EINVAL otherwise).  The
+ *     argument is copied into a stack buffer (-EFAULT on failure) and
+ *     the command is executed under __ip_vs_mutex with the module use
+ *     count raised.  Returns 0 or a negative errno.
+ */
+static int
+do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+       int ret;
+       unsigned char arg[MAX_ARG_LEN];
+       struct ip_vs_service_user *usvc_compat;
+       struct ip_vs_service_user_kern usvc;
+       struct ip_vs_service *svc;
+       struct ip_vs_dest_user *udest_compat;
+       struct ip_vs_dest_user_kern udest;
+
+       if (!capable(CAP_NET_ADMIN))
+               return -EPERM;
+
+       /*
+        * Never index set_arglen[] with an out-of-range command and never
+        * copy more than the stack buffer can hold, independently of what
+        * the caller has already checked.
+        */
+       if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
+               return -EINVAL;
+       if (len > MAX_ARG_LEN)
+               return -EINVAL;
+
+       if (len != set_arglen[SET_CMDID(cmd)]) {
+               IP_VS_ERR("set_ctl: len %u != %u\n",
+                         len, set_arglen[SET_CMDID(cmd)]);
+               return -EINVAL;
+       }
+
+       if (copy_from_user(arg, user, len) != 0)
+               return -EFAULT;
+
+       /* increase the module use count */
+       ip_vs_use_count_inc();
+
+       if (mutex_lock_interruptible(&__ip_vs_mutex)) {
+               ret = -ERESTARTSYS;
+               goto out_dec;
+       }
+
+       /* commands that do not operate on a single service */
+       if (cmd == IP_VS_SO_SET_FLUSH) {
+               /* Flush the virtual service */
+               ret = ip_vs_flush();
+               goto out_unlock;
+       } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
+               /* Set timeout values for (tcp tcpfin udp) */
+               ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
+               goto out_unlock;
+       } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
+               struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+               ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
+               goto out_unlock;
+       } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
+               struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+               ret = stop_sync_thread(dm->state);
+               goto out_unlock;
+       }
+
+       /* the destination part, if present, directly follows the service */
+       usvc_compat = (struct ip_vs_service_user *)arg;
+       udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
+
+       /* We only use the new structs internally, so copy userspace compat
+        * structs to extended internal versions.
+        * For service-only commands the destination bytes of arg were not
+        * supplied by the caller; udest is only consumed by the *DEST
+        * commands below. */
+       ip_vs_copy_usvc_compat(&usvc, usvc_compat);
+       ip_vs_copy_udest_compat(&udest, udest_compat);
+
+       if (cmd == IP_VS_SO_SET_ZERO) {
+               /* if no service address is set, zero counters in all */
+               if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
+                       ret = ip_vs_zero_all();
+                       goto out_unlock;
+               }
+       }
+
+       /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
+       if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) {
+               IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
+                         usvc.protocol, NIPQUAD(usvc.addr.ip),
+                         ntohs(usvc.port), usvc.sched_name);
+               ret = -EFAULT;
+               goto out_unlock;
+       }
+
+       /* Lookup the exact service by <protocol, addr, port> or fwmark */
+       if (usvc.fwmark == 0)
+               svc = __ip_vs_service_get(usvc.af, usvc.protocol,
+                                         &usvc.addr, usvc.port);
+       else
+               svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+
+       /* every command except ADD needs an existing, matching service */
+       if (cmd != IP_VS_SO_SET_ADD
+           && (svc == NULL || svc->protocol != usvc.protocol)) {
+               ret = -ESRCH;
+               goto out_unlock;
+       }
+
+       switch (cmd) {
+       case IP_VS_SO_SET_ADD:
+               if (svc != NULL)
+                       ret = -EEXIST;
+               else
+                       ret = ip_vs_add_service(&usvc, &svc);
+               break;
+       case IP_VS_SO_SET_EDIT:
+               ret = ip_vs_edit_service(svc, &usvc);
+               break;
+       case IP_VS_SO_SET_DEL:
+               ret = ip_vs_del_service(svc);
+               /* on success skip the ip_vs_service_put() below */
+               if (!ret)
+                       goto out_unlock;
+               break;
+       case IP_VS_SO_SET_ZERO:
+               ret = ip_vs_zero_service(svc);
+               break;
+       case IP_VS_SO_SET_ADDDEST:
+               ret = ip_vs_add_dest(svc, &udest);
+               break;
+       case IP_VS_SO_SET_EDITDEST:
+               ret = ip_vs_edit_dest(svc, &udest);
+               break;
+       case IP_VS_SO_SET_DELDEST:
+               ret = ip_vs_del_dest(svc, &udest);
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+       if (svc)
+               ip_vs_service_put(svc);
+
+  out_unlock:
+       mutex_unlock(&__ip_vs_mutex);
+  out_dec:
+       /* decrease the module use count */
+       ip_vs_use_count_dec();
+
+       return ret;
+}
+
+
+/*
+ *     Copy sizeof(*dst) bytes of src->ustats into 'dst' while holding
+ *     src->lock.
+ */
+static void
+ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
+{
+       spin_lock_bh(&src->lock);
+       memcpy(dst, &src->ustats, sizeof(*dst));
+       spin_unlock_bh(&src->lock);
+}
+
+/*
+ *     Fill the entry 'dst' from the service 'src': identity,
+ *     configuration, scheduler name, destination count and statistics.
+ *     Only the .ip member of the address is copied, and the timeout is
+ *     divided by HZ.
+ */
+static void
+ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
+{
+       /* identity */
+       dst->fwmark = src->fwmark;
+       dst->protocol = src->protocol;
+       dst->addr = src->addr.ip;
+       dst->port = src->port;
+
+       /* configuration */
+       dst->flags = src->flags;
+       dst->netmask = src->netmask;
+       dst->timeout = src->timeout / HZ;
+       strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
+
+       /* current state */
+       dst->num_dests = src->num_dests;
+       ip_vs_copy_stats(&dst->stats, &src->stats);
+}
+
+/*
+ *     Copy up to get->num_services AF_INET service entries to the user
+ *     array uptr->entrytable, walking the protocol hash table first and
+ *     the fwmark hash table second.  Services of other address families
+ *     are skipped and do not count.  Returns 0, or -EFAULT as soon as a
+ *     copy_to_user() fails.
+ */
+static inline int
+__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
+                           struct ip_vs_get_services __user *uptr)
+{
+       int idx, count=0;
+       struct ip_vs_service *svc;
+       struct ip_vs_service_entry entry;
+       int ret = 0;
+
+       for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+               list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+                       /* Only expose IPv4 entries to old interface */
+                       if (svc->af != AF_INET)
+                               continue;
+
+                       /* requested number reached: stop, not an error */
+                       if (count >= get->num_services)
+                               goto out;
+                       /* clear first so unset bytes are not copied out */
+                       memset(&entry, 0, sizeof(entry));
+                       ip_vs_copy_service(&entry, svc);
+                       if (copy_to_user(&uptr->entrytable[count],
+                                        &entry, sizeof(entry))) {
+                               ret = -EFAULT;
+                               goto out;
+                       }
+                       count++;
+               }
+       }
+
+       /* same procedure for the fwmark table; count carries over */
+       for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+               list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+                       /* Only expose IPv4 entries to old interface */
+                       if (svc->af != AF_INET)
+                               continue;
+
+                       if (count >= get->num_services)
+                               goto out;
+                       memset(&entry, 0, sizeof(entry));
+                       ip_vs_copy_service(&entry, svc);
+                       if (copy_to_user(&uptr->entrytable[count],
+                                        &entry, sizeof(entry))) {
+                               ret = -EFAULT;
+                               goto out;
+                       }
+                       count++;
+               }
+       }
+  out:
+       return ret;
+}
+
+/*
+ *     Copy the destinations of one IPv4 service into the entry table of
+ *     a GET_DESTS request.
+ *
+ *     The service is looked up by get->fwmark when that is non-zero,
+ *     otherwise by <get->protocol, get->addr, get->port>. At most
+ *     get->num_dests entries are written to uptr->entrytable[].
+ *
+ *     Returns 0 on success, -ESRCH when the service does not exist and
+ *     -EFAULT when a copy to user space fails.
+ */
+static inline int
+__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
+                        struct ip_vs_get_dests __user *uptr)
+{
+       struct ip_vs_service *svc;
+       union nf_inet_addr addr = { .ip = get->addr };
+       int ret = 0;
+
+       if (get->fwmark)
+               svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
+       else
+               svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
+                                         get->port);
+
+       if (svc) {
+               int count = 0;
+               struct ip_vs_dest *dest;
+               struct ip_vs_dest_entry entry;
+
+               list_for_each_entry(dest, &svc->destinations, n_list) {
+                       if (count >= get->num_dests)
+                               break;
+
+                       /*
+                        * Clear the whole on-stack entry first so that no
+                        * padding or unassigned bytes carry kernel stack
+                        * contents to user space (the service variant
+                        * already does the same).
+                        */
+                       memset(&entry, 0, sizeof(entry));
+                       entry.addr = dest->addr.ip;
+                       entry.port = dest->port;
+                       entry.conn_flags = atomic_read(&dest->conn_flags);
+                       entry.weight = atomic_read(&dest->weight);
+                       entry.u_threshold = dest->u_threshold;
+                       entry.l_threshold = dest->l_threshold;
+                       entry.activeconns = atomic_read(&dest->activeconns);
+                       entry.inactconns = atomic_read(&dest->inactconns);
+                       entry.persistconns = atomic_read(&dest->persistconns);
+                       ip_vs_copy_stats(&entry.stats, &dest->stats);
+                       if (copy_to_user(&uptr->entrytable[count],
+                                        &entry, sizeof(entry))) {
+                               ret = -EFAULT;
+                               break;
+                       }
+                       count++;
+               }
+               /* drop the reference taken by the lookup */
+               ip_vs_service_put(svc);
+       } else
+               ret = -ESRCH;
+       return ret;
+}
+
+/*
+ *     Report the current protocol timeouts in @u.
+ *
+ *     The TCP and UDP members are only assigned when the corresponding
+ *     protocol support is compiled in, so the structure is cleared first:
+ *     callers copy it to user space and must never expose uninitialised
+ *     stack memory. Each timeout_table value is divided by HZ.
+ */
+static inline void
+__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
+{
+       memset(u, 0, sizeof(*u));
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+       u->tcp_timeout =
+               ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+       u->tcp_fin_timeout =
+               ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+       u->udp_timeout =
+               ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+#endif
+}
+
+
+/*
+ * Argument sizes of the getsockopt() commands.
+ *
+ * GET_CMDID() turns a sockopt number into an index relative to
+ * IP_VS_BASE_CTL. The GET_*_ARG_LEN macros give the size of the fixed part
+ * of each command's argument; the daemon command uses two
+ * ip_vs_daemon_user records.
+ */
+#define GET_CMDID(cmd)         (cmd - IP_VS_BASE_CTL)
+#define GET_INFO_ARG_LEN       (sizeof(struct ip_vs_getinfo))
+#define GET_SERVICES_ARG_LEN   (sizeof(struct ip_vs_get_services))
+#define GET_SERVICE_ARG_LEN    (sizeof(struct ip_vs_service_entry))
+#define GET_DESTS_ARG_LEN      (sizeof(struct ip_vs_get_dests))
+#define GET_TIMEOUT_ARG_LEN    (sizeof(struct ip_vs_timeout_user))
+#define GET_DAEMON_ARG_LEN     (sizeof(struct ip_vs_daemon_user) * 2)
+
+/*
+ * Per-command argument length, indexed by GET_CMDID(). do_ip_vs_get_ctl()
+ * rejects a request whose buffer is shorter than the entry and copies
+ * exactly this many bytes from user space. The version entry (64) equals
+ * the size of the string buffer used by do_ip_vs_get_ctl().
+ * NOTE(review): the element type is unsigned char and do_ip_vs_get_ctl()
+ * copies into a 128 byte buffer, so every entry has to stay within those
+ * limits - confirm whenever one of the structures grows.
+ */
+static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
+       [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
+       [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
+       [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
+       [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
+       [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
+       [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
+       [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
+};
+
+/*
+ *     getsockopt() handler of the IPVS control interface.
+ *
+ *     @sk:    socket the call was made on (not used here)
+ *     @cmd:   one of the IP_VS_SO_GET_* commands
+ *     @user:  user buffer, used for both the request and the reply
+ *     @len:   length of @user; updated for IP_VS_SO_GET_VERSION only
+ *
+ *     The fixed part of the argument (get_arglen[] bytes) is copied to an
+ *     on-stack buffer, then the command is executed under __ip_vs_mutex.
+ *
+ *     Returns 0 on success, -EPERM without CAP_NET_ADMIN, -EINVAL for an
+ *     unknown command or a bad length, -EFAULT on a failed user copy,
+ *     -ESRCH when the requested service does not exist and -ERESTARTSYS
+ *     when waiting for the mutex was interrupted.
+ */
+static int
+do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+       unsigned char arg[128];
+       int copylen;
+       int ret = 0;
+
+       if (!capable(CAP_NET_ADMIN))
+               return -EPERM;
+
+       /* never index get_arglen[] with an out-of-range command */
+       if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
+               return -EINVAL;
+
+       /* the fixed part of the argument has to fit into arg[] */
+       copylen = get_arglen[GET_CMDID(cmd)];
+       if (copylen > (int)sizeof(arg))
+               return -EINVAL;
+
+       /* signed comparison: a negative *len is rejected here as well */
+       if (*len < copylen) {
+               IP_VS_ERR("get_ctl: len %u < %u\n",
+                         *len, copylen);
+               return -EINVAL;
+       }
+
+       if (copy_from_user(arg, user, copylen) != 0)
+               return -EFAULT;
+
+       if (mutex_lock_interruptible(&__ip_vs_mutex))
+               return -ERESTARTSYS;
+
+       switch (cmd) {
+       case IP_VS_SO_GET_VERSION:
+       {
+               char buf[64];
+
+               /* bounded formatting: never write past buf[] */
+               snprintf(buf, sizeof(buf),
+                        "IP Virtual Server version %d.%d.%d (size=%d)",
+                        NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+               if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+               *len = strlen(buf)+1;
+       }
+       break;
+
+       case IP_VS_SO_GET_INFO:
+       {
+               struct ip_vs_getinfo info;
+               info.version = IP_VS_VERSION_CODE;
+               info.size = IP_VS_CONN_TAB_SIZE;
+               info.num_services = ip_vs_num_services;
+               if (copy_to_user(user, &info, sizeof(info)) != 0)
+                       ret = -EFAULT;
+       }
+       break;
+
+       case IP_VS_SO_GET_SERVICES:
+       {
+               struct ip_vs_get_services *get;
+               int size;
+
+               get = (struct ip_vs_get_services *)arg;
+               /*
+                * Reject counts whose total size cannot be represented in
+                * an int; otherwise the truncated size could match *len
+                * although the user buffer is far too small.
+                */
+               if (get->num_services >
+                   (INT_MAX - sizeof(*get)) /
+                   sizeof(struct ip_vs_service_entry)) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               size = sizeof(*get) +
+                       sizeof(struct ip_vs_service_entry) * get->num_services;
+               if (*len != size) {
+                       IP_VS_ERR("length: %u != %u\n", *len, size);
+                       ret = -EINVAL;
+                       goto out;
+               }
+               ret = __ip_vs_get_service_entries(get, user);
+       }
+       break;
+
+       case IP_VS_SO_GET_SERVICE:
+       {
+               struct ip_vs_service_entry *entry;
+               struct ip_vs_service *svc;
+               union nf_inet_addr addr;
+
+               /* the request is answered in place inside arg[] */
+               entry = (struct ip_vs_service_entry *)arg;
+               addr.ip = entry->addr;
+               if (entry->fwmark)
+                       svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
+               else
+                       svc = __ip_vs_service_get(AF_INET, entry->protocol,
+                                                 &addr, entry->port);
+               if (svc) {
+                       ip_vs_copy_service(entry, svc);
+                       if (copy_to_user(user, entry, sizeof(*entry)) != 0)
+                               ret = -EFAULT;
+                       ip_vs_service_put(svc);
+               } else
+                       ret = -ESRCH;
+       }
+       break;
+
+       case IP_VS_SO_GET_DESTS:
+       {
+               struct ip_vs_get_dests *get;
+               int size;
+
+               get = (struct ip_vs_get_dests *)arg;
+               /* same overflow guard as for IP_VS_SO_GET_SERVICES */
+               if (get->num_dests >
+                   (INT_MAX - sizeof(*get)) /
+                   sizeof(struct ip_vs_dest_entry)) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               size = sizeof(*get) +
+                       sizeof(struct ip_vs_dest_entry) * get->num_dests;
+               if (*len != size) {
+                       IP_VS_ERR("length: %u != %u\n", *len, size);
+                       ret = -EINVAL;
+                       goto out;
+               }
+               ret = __ip_vs_get_dest_entries(get, user);
+       }
+       break;
+
+       case IP_VS_SO_GET_TIMEOUT:
+       {
+               struct ip_vs_timeout_user t;
+
+               /*
+                * Members of a protocol that is not compiled in are not
+                * assigned by __ip_vs_get_timeouts(); clear the structure
+                * so no stack contents reach user space.
+                */
+               memset(&t, 0, sizeof(t));
+               __ip_vs_get_timeouts(&t);
+               if (copy_to_user(user, &t, sizeof(t)) != 0)
+                       ret = -EFAULT;
+       }
+       break;
+
+       case IP_VS_SO_GET_DAEMON:
+       {
+               /* d[0] describes the master daemon, d[1] the backup */
+               struct ip_vs_daemon_user d[2];
+
+               memset(&d, 0, sizeof(d));
+               if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
+                       d[0].state = IP_VS_STATE_MASTER;
+                       strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
+                       d[0].syncid = ip_vs_master_syncid;
+               }
+               if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
+                       d[1].state = IP_VS_STATE_BACKUP;
+                       strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
+                       d[1].syncid = ip_vs_backup_syncid;
+               }
+               if (copy_to_user(user, &d, sizeof(d)) != 0)
+                       ret = -EFAULT;
+       }
+       break;
+
+       default:
+               ret = -EINVAL;
+       }
+
+  out:
+       mutex_unlock(&__ip_vs_mutex);
+       return ret;
+}
+
+
+/*
+ * Socket option registration for the IPVS control interface: PF_INET
+ * sockets, with do_ip_vs_set_ctl() handling the IP_VS_SO_SET_* range and
+ * do_ip_vs_get_ctl() handling the IP_VS_SO_GET_* range.
+ * NOTE(review): both optmax values are <MAX>+1, presumably because the
+ * sockopt core treats the upper bound as exclusive - confirm.
+ */
+static struct nf_sockopt_ops ip_vs_sockopts = {
+       .pf             = PF_INET,
+       .set_optmin     = IP_VS_BASE_CTL,
+       .set_optmax     = IP_VS_SO_SET_MAX+1,
+       .set            = do_ip_vs_set_ctl,
+       .get_optmin     = IP_VS_BASE_CTL,
+       .get_optmax     = IP_VS_SO_GET_MAX+1,
+       .get            = do_ip_vs_get_ctl,
+       .owner          = THIS_MODULE,
+};
+
+/*
+ * Generic Netlink interface
+ */
+
+/*
+ * IPVS genetlink family.
+ *
+ * maxattr is the highest top-level *attribute* number, i.e.
+ * IPVS_CMD_ATTR_MAX - the value that sizes ip_vs_cmd_policy[] and that
+ * ip_vs_genl_dump_dests() passes to nlmsg_parse(). It must not be the
+ * command maximum (IPVS_CMD_MAX): with a larger maxattr, attribute types
+ * beyond IPVS_CMD_ATTR_MAX would be validated against entries past the
+ * end of the policy array.
+ */
+static struct genl_family ip_vs_genl_family = {
+       .id             = GENL_ID_GENERATE,
+       .hdrsize        = 0,
+       .name           = IPVS_GENL_NAME,
+       .version        = IPVS_GENL_VERSION,
+       .maxattr        = IPVS_CMD_ATTR_MAX,
+};
+
+/*
+ * Policy used for first-level command attributes.
+ *
+ * SERVICE, DEST and DAEMON are containers whose contents are parsed with
+ * ip_vs_svc_policy, ip_vs_dest_policy and ip_vs_daemon_policy; the three
+ * timeout attributes are plain 32 bit values read by
+ * ip_vs_genl_set_config().
+ */
+static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
+       [IPVS_CMD_ATTR_SERVICE]         = { .type = NLA_NESTED },
+       [IPVS_CMD_ATTR_DEST]            = { .type = NLA_NESTED },
+       [IPVS_CMD_ATTR_DAEMON]          = { .type = NLA_NESTED },
+       [IPVS_CMD_ATTR_TIMEOUT_TCP]     = { .type = NLA_U32 },
+       [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
+       [IPVS_CMD_ATTR_TIMEOUT_UDP]     = { .type = NLA_U32 },
+};
+
+/*
+ * Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON.
+ *
+ * Applied by ip_vs_genl_set_cmd() for the NEW_DAEMON / DEL_DAEMON
+ * commands. The interface name is a NUL terminated string limited by
+ * IP_VS_IFNAME_MAXLEN.
+ */
+static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
+       [IPVS_DAEMON_ATTR_STATE]        = { .type = NLA_U32 },
+       [IPVS_DAEMON_ATTR_MCAST_IFN]    = { .type = NLA_NUL_STRING,
+                                           .len = IP_VS_IFNAME_MAXLEN },
+       [IPVS_DAEMON_ATTR_SYNC_ID]      = { .type = NLA_U32 },
+};
+
+/*
+ * Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE.
+ *
+ * Applied by ip_vs_genl_parse_service().
+ * NOTE(review): ADDR and FLAGS are NLA_BINARY; the netlink core is
+ * understood to treat .len as a maximum for that type, so payloads shorter
+ * than the full union / struct pass validation - confirm against the
+ * nla_policy documentation when touching the parser.
+ */
+static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
+       [IPVS_SVC_ATTR_AF]              = { .type = NLA_U16 },
+       [IPVS_SVC_ATTR_PROTOCOL]        = { .type = NLA_U16 },
+       [IPVS_SVC_ATTR_ADDR]            = { .type = NLA_BINARY,
+                                           .len = sizeof(union nf_inet_addr) },
+       [IPVS_SVC_ATTR_PORT]            = { .type = NLA_U16 },
+       [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
+       [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
+                                           .len = IP_VS_SCHEDNAME_MAXLEN },
+       [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
+                                           .len = sizeof(struct ip_vs_flags) },
+       [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
+       [IPVS_SVC_ATTR_NETMASK]         = { .type = NLA_U32 },
+       [IPVS_SVC_ATTR_STATS]           = { .type = NLA_NESTED },
+};
+
+/*
+ * Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST.
+ *
+ * Applied by ip_vs_genl_parse_dest(), which reads ADDR, PORT, FWD_METHOD,
+ * WEIGHT, U_THRESH and L_THRESH. The connection counters and STATS are
+ * emitted by ip_vs_genl_fill_dest() and are not read by that parser.
+ */
+static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
+       [IPVS_DEST_ATTR_ADDR]           = { .type = NLA_BINARY,
+                                           .len = sizeof(union nf_inet_addr) },
+       [IPVS_DEST_ATTR_PORT]           = { .type = NLA_U16 },
+       [IPVS_DEST_ATTR_FWD_METHOD]     = { .type = NLA_U32 },
+       [IPVS_DEST_ATTR_WEIGHT]         = { .type = NLA_U32 },
+       [IPVS_DEST_ATTR_U_THRESH]       = { .type = NLA_U32 },
+       [IPVS_DEST_ATTR_L_THRESH]       = { .type = NLA_U32 },
+       [IPVS_DEST_ATTR_ACTIVE_CONNS]   = { .type = NLA_U32 },
+       [IPVS_DEST_ATTR_INACT_CONNS]    = { .type = NLA_U32 },
+       [IPVS_DEST_ATTR_PERSIST_CONNS]  = { .type = NLA_U32 },
+       [IPVS_DEST_ATTR_STATS]          = { .type = NLA_NESTED },
+};
+
+/*
+ * Append the counters of @stats to @skb as a nested attribute of type
+ * @container_type.
+ *
+ * The counters are read with stats->lock held. There is no explicit goto
+ * in this function: the nla_put_failure label is reached from inside the
+ * NLA_PUT_*() macros, so it has to release the lock before cancelling the
+ * nest.
+ *
+ * Returns 0 on success or -EMSGSIZE when the attributes do not fit.
+ */
+static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
+                                struct ip_vs_stats *stats)
+{
+       struct nlattr *nl_stats = nla_nest_start(skb, container_type);
+       if (!nl_stats)
+               return -EMSGSIZE;
+
+       spin_lock_bh(&stats->lock);
+
+       /* byte counters are 64 bit, all other counters and rates 32 bit */
+       NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
+       NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
+       NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
+       NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
+       NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
+       NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
+       NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
+       NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
+       NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
+       NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
+
+       spin_unlock_bh(&stats->lock);
+
+       nla_nest_end(skb, nl_stats);
+
+       return 0;
+
+nla_put_failure:
+       /* reached with stats->lock still held */
+       spin_unlock_bh(&stats->lock);
+       nla_nest_cancel(skb, nl_stats);
+       return -EMSGSIZE;
+}
+
+/*
+ * Append a description of @svc to @skb as a nested IPVS_CMD_ATTR_SERVICE
+ * attribute.
+ *
+ * A fwmark service is identified by its fwmark only; any other service by
+ * protocol, address and port. The NLA_PUT*() macros jump to
+ * nla_put_failure when an attribute does not fit.
+ *
+ * Returns 0 on success or -EMSGSIZE when the attributes do not fit.
+ */
+static int ip_vs_genl_fill_service(struct sk_buff *skb,
+                                  struct ip_vs_service *svc)
+{
+       struct nlattr *nl_service;
+       /* report the current flags with every mask bit set */
+       struct ip_vs_flags flags = { .flags = svc->flags,
+                                    .mask = ~0 };
+
+       nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
+       if (!nl_service)
+               return -EMSGSIZE;
+
+       NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
+
+       if (svc->fwmark) {
+               NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
+       } else {
+               NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
+               /* the whole address union is sent, whatever the family */
+               NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
+               NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
+       }
+
+       NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
+       NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
+       /* svc->timeout is divided by HZ before it is reported */
+       NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
+       NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
+
+       if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
+               goto nla_put_failure;
+
+       nla_nest_end(skb, nl_service);
+
+       return 0;
+
+nla_put_failure:
+       /* discard everything added since nla_nest_start() */
+       nla_nest_cancel(skb, nl_service);
+       return -EMSGSIZE;
+}
+
+/*
+ * Add one IPVS_CMD_NEW_SERVICE message describing @svc to the dump @skb.
+ *
+ * Returns the result of genlmsg_end() on success or -EMSGSIZE when the
+ * message does not fit; in that case the partial message is cancelled.
+ */
+static int ip_vs_genl_dump_service(struct sk_buff *skb,
+                                  struct ip_vs_service *svc,
+                                  struct netlink_callback *cb)
+{
+       void *msg_head = genlmsg_put(skb, NETLINK_CB(cb->skb).pid,
+                                    cb->nlh->nlmsg_seq, &ip_vs_genl_family,
+                                    NLM_F_MULTI, IPVS_CMD_NEW_SERVICE);
+
+       if (msg_head == NULL)
+               return -EMSGSIZE;
+
+       if (ip_vs_genl_fill_service(skb, svc) >= 0)
+               return genlmsg_end(skb, msg_head);
+
+       /* the attributes did not fit: drop the half-built message */
+       genlmsg_cancel(skb, msg_head);
+       return -EMSGSIZE;
+}
+
+/*
+ * Netlink dump callback listing all services.
+ *
+ * cb->args[0] holds the number of services already dumped by earlier
+ * invocations; that many entries are skipped before dumping resumes. The
+ * ip_vs_svc_table[] lists are walked before the ip_vs_svc_fwm_table[]
+ * lists, all under __ip_vs_mutex.
+ *
+ * Returns skb->len.
+ */
+static int ip_vs_genl_dump_services(struct sk_buff *skb,
+                                   struct netlink_callback *cb)
+{
+       struct ip_vs_service *svc;
+       int resume_at = cb->args[0];
+       int seen = 0;
+       int bucket;
+
+       mutex_lock(&__ip_vs_mutex);
+
+       for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+               list_for_each_entry(svc, &ip_vs_svc_table[bucket], s_list) {
+                       seen++;
+                       if (seen <= resume_at)
+                               continue;
+                       if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+                               /* not dumped: retry it on the next call */
+                               seen--;
+                               goto out_unlock;
+                       }
+               }
+       }
+
+       for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+               list_for_each_entry(svc, &ip_vs_svc_fwm_table[bucket], f_list) {
+                       seen++;
+                       if (seen <= resume_at)
+                               continue;
+                       if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+                               /* not dumped: retry it on the next call */
+                               seen--;
+                               goto out_unlock;
+                       }
+               }
+       }
+
+out_unlock:
+       mutex_unlock(&__ip_vs_mutex);
+       cb->args[0] = seen;
+
+       return skb->len;
+}
+
+static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
+                                   struct nlattr *nla, int full_entry)
+{
+       struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
+       struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+
+       /* Parse mandatory identifying service fields first */
+       if (nla == NULL ||
+           nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
+               return -EINVAL;
+
+       nla_af          = attrs[IPVS_SVC_ATTR_AF];
+       nla_protocol    = attrs[IPVS_SVC_ATTR_PROTOCOL];
+       nla_addr        = attrs[IPVS_SVC_ATTR_ADDR];
+       nla_port        = attrs[IPVS_SVC_ATTR_PORT];
+       nla_fwmark      = attrs[IPVS_SVC_ATTR_FWMARK];
+
+       if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
+               return -EINVAL;
+
+       usvc->af = nla_get_u16(nla_af);
+#ifdef CONFIG_IP_VS_IPV6
+       if (usvc->af != AF_INET && usvc->af != AF_INET6)
+#else
+       if (usvc->af != AF_INET)
+#endif
+               return -EAFNOSUPPORT;
+
+       if (nla_fwmark) {
+               usvc->protocol = IPPROTO_TCP;
+               usvc->fwmark = nla_get_u32(nla_fwmark);
+       } else {
+               usvc->protocol = nla_get_u16(nla_protocol);
+               nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
+               usvc->port = nla_get_u16(nla_port);
+               usvc->fwmark = 0;
+       }
+
+       /* If a full entry was requested, check for the additional fields */
+       if (full_entry) {
+               struct nlattr *nla_sched, *nla_flags, *nla_timeout,
+                             *nla_netmask;
+               struct ip_vs_flags flags;
+               struct ip_vs_service *svc;
+
+               nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
+               nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
+               nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
+               nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
+
+               if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
+                       return -EINVAL;
+
+               nla_memcpy(&flags, nla_flags, sizeof(flags));
+
+               /* prefill flags from service if it already exists */
+               if (usvc->fwmark)
+                       svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
+               else
+                       svc = __ip_vs_service_get(usvc->af, usvc->protocol,
+                                                 &usvc->addr, usvc->port);
+               if (svc) {
+                       usvc->flags = svc->flags;
+                       ip_vs_service_put(svc);
+               } else
+                       usvc->flags = 0;
+
+               /* set new flags from userland */
+               usvc->flags = (usvc->flags & ~flags.mask) |
+                             (flags.flags & flags.mask);
+               usvc->sched_name = nla_data(nla_sched);
+               usvc->timeout = nla_get_u32(nla_timeout);
+               usvc->netmask = nla_get_u32(nla_netmask);
+       }
+
+       return 0;
+}
+
+/*
+ * Look up the service identified by the nested service attribute @nla.
+ *
+ * Returns the result of the fwmark or <protocol, addr, port> lookup
+ * (NULL when there is no such service) or an ERR_PTR() carrying the
+ * error reported by ip_vs_genl_parse_service().
+ */
+static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
+{
+       struct ip_vs_service_user_kern usvc;
+       int err = ip_vs_genl_parse_service(&usvc, nla, 0);
+
+       if (err)
+               return ERR_PTR(err);
+
+       if (!usvc.fwmark)
+               return __ip_vs_service_get(usvc.af, usvc.protocol,
+                                          &usvc.addr, usvc.port);
+
+       return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+}
+
+/*
+ * Append a description of @dest to @skb as a nested IPVS_CMD_ATTR_DEST
+ * attribute: address, port, forwarding method, weight, thresholds,
+ * connection counters and statistics.
+ *
+ * The NLA_PUT*() macros jump to nla_put_failure when an attribute does
+ * not fit.
+ *
+ * Returns 0 on success or -EMSGSIZE when the attributes do not fit.
+ */
+static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
+{
+       struct nlattr *nl_dest;
+
+       nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
+       if (!nl_dest)
+               return -EMSGSIZE;
+
+       /* the whole address union is sent, whatever the family */
+       NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
+       NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
+
+       /* only the forwarding-method bits of conn_flags are exported */
+       NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
+                   atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
+       NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
+       NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
+       NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
+       NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
+                   atomic_read(&dest->activeconns));
+       NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
+                   atomic_read(&dest->inactconns));
+       NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
+                   atomic_read(&dest->persistconns));
+
+       if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
+               goto nla_put_failure;
+
+       nla_nest_end(skb, nl_dest);
+
+       return 0;
+
+nla_put_failure:
+       /* discard everything added since nla_nest_start() */
+       nla_nest_cancel(skb, nl_dest);
+       return -EMSGSIZE;
+}
+
+/*
+ * Add one IPVS_CMD_NEW_DEST message describing @dest to the dump @skb.
+ *
+ * Returns the result of genlmsg_end() on success or -EMSGSIZE when the
+ * message does not fit; in that case the partial message is cancelled.
+ */
+static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
+                               struct netlink_callback *cb)
+{
+       void *msg_head = genlmsg_put(skb, NETLINK_CB(cb->skb).pid,
+                                    cb->nlh->nlmsg_seq, &ip_vs_genl_family,
+                                    NLM_F_MULTI, IPVS_CMD_NEW_DEST);
+
+       if (msg_head == NULL)
+               return -EMSGSIZE;
+
+       if (ip_vs_genl_fill_dest(skb, dest) >= 0)
+               return genlmsg_end(skb, msg_head);
+
+       /* the attributes did not fit: drop the half-built message */
+       genlmsg_cancel(skb, msg_head);
+       return -EMSGSIZE;
+}
+
+/*
+ * Netlink dump callback listing the destinations of one service.
+ *
+ * The service is taken from the IPVS_CMD_ATTR_SERVICE attribute of the
+ * request. cb->args[0] holds the number of destinations dumped by earlier
+ * invocations. When the request cannot be parsed or the service is not
+ * found nothing is added to @skb.
+ *
+ * Returns skb->len.
+ */
+static int ip_vs_genl_dump_dests(struct sk_buff *skb,
+                                struct netlink_callback *cb)
+{
+       struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+       struct ip_vs_service *svc;
+       struct ip_vs_dest *dest;
+       int resume_at = cb->args[0];
+       int seen = 0;
+
+       mutex_lock(&__ip_vs_mutex);
+
+       /* Try to find the service for which to dump destinations */
+       if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
+                       IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy) != 0)
+               goto out_unlock;
+
+       svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
+       if (svc == NULL || IS_ERR(svc))
+               goto out_unlock;
+
+       /* Dump the destinations, skipping those already sent */
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+               seen++;
+               if (seen <= resume_at)
+                       continue;
+               if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
+                       /* not dumped: retry it on the next call */
+                       seen--;
+                       break;
+               }
+       }
+
+       cb->args[0] = seen;
+       ip_vs_service_put(svc);
+
+out_unlock:
+       mutex_unlock(&__ip_vs_mutex);
+
+       return skb->len;
+}
+
+/*
+ * Parse a nested IPVS_CMD_ATTR_DEST attribute into @udest.
+ *
+ * @udest:      result, cleared before any field is filled in
+ * @nla:        the nested destination attribute, may be NULL
+ * @full_entry: when non-zero, forwarding method, weight and both
+ *              thresholds are required in addition to address and port
+ *
+ * Returns 0 on success or -EINVAL for a missing or malformed attribute.
+ */
+static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
+                                struct nlattr *nla, int full_entry)
+{
+       struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
+       struct nlattr *nla_addr, *nla_port;
+
+       /* Parse mandatory identifying destination fields first */
+       if (nla == NULL ||
+           nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
+               return -EINVAL;
+
+       nla_addr        = attrs[IPVS_DEST_ATTR_ADDR];
+       nla_port        = attrs[IPVS_DEST_ATTR_PORT];
+
+       if (!(nla_addr && nla_port))
+               return -EINVAL;
+
+       /*
+        * Callers pass an uninitialised on-stack structure. Clear it so
+        * that the tail of an address shorter than the union and the
+        * optional members of a !full_entry parse never hold stale stack
+        * data.
+        */
+       memset(udest, 0, sizeof(*udest));
+
+       nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
+       udest->port = nla_get_u16(nla_port);
+
+       /* If a full entry was requested, check for the additional fields */
+       if (full_entry) {
+               struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
+                             *nla_l_thresh;
+
+               nla_fwd         = attrs[IPVS_DEST_ATTR_FWD_METHOD];
+               nla_weight      = attrs[IPVS_DEST_ATTR_WEIGHT];
+               nla_u_thresh    = attrs[IPVS_DEST_ATTR_U_THRESH];
+               nla_l_thresh    = attrs[IPVS_DEST_ATTR_L_THRESH];
+
+               if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
+                       return -EINVAL;
+
+               /* only the forwarding-method bits are accepted */
+               udest->conn_flags = nla_get_u32(nla_fwd)
+                                   & IP_VS_CONN_F_FWD_MASK;
+               udest->weight = nla_get_u32(nla_weight);
+               udest->u_threshold = nla_get_u32(nla_u_thresh);
+               udest->l_threshold = nla_get_u32(nla_l_thresh);
+       }
+
+       return 0;
+}
+
+/*
+ * Append a sync daemon description (state, multicast interface name and
+ * sync id) to @skb as a nested IPVS_CMD_ATTR_DAEMON attribute.
+ *
+ * The NLA_PUT*() macros jump to nla_put_failure when an attribute does
+ * not fit.
+ * NOTE(review): @state and @syncid are declared __be32 but are emitted
+ * with NLA_PUT_U32 and the callers in this file pass IP_VS_STATE_*
+ * constants and the ip_vs_*_syncid variables - the endian annotation looks
+ * unintended, confirm.
+ *
+ * Returns 0 on success or -EMSGSIZE when the attributes do not fit.
+ */
+static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
+                                 const char *mcast_ifn, __be32 syncid)
+{
+       struct nlattr *nl_daemon;
+
+       nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
+       if (!nl_daemon)
+               return -EMSGSIZE;
+
+       NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
+       NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
+       NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
+
+       nla_nest_end(skb, nl_daemon);
+
+       return 0;
+
+nla_put_failure:
+       /* discard everything added since nla_nest_start() */
+       nla_nest_cancel(skb, nl_daemon);
+       return -EMSGSIZE;
+}
+
+/*
+ * Add one IPVS_CMD_NEW_DAEMON message describing a sync daemon to the
+ * dump @skb.
+ *
+ * Returns the result of genlmsg_end() on success or -EMSGSIZE when the
+ * message does not fit; in that case the partial message is cancelled.
+ */
+static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
+                                 const char *mcast_ifn, __be32 syncid,
+                                 struct netlink_callback *cb)
+{
+       void *msg_head = genlmsg_put(skb, NETLINK_CB(cb->skb).pid,
+                                    cb->nlh->nlmsg_seq, &ip_vs_genl_family,
+                                    NLM_F_MULTI, IPVS_CMD_NEW_DAEMON);
+
+       if (msg_head == NULL)
+               return -EMSGSIZE;
+
+       if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid) == 0)
+               return genlmsg_end(skb, msg_head);
+
+       /* the attributes did not fit: drop the half-built message */
+       genlmsg_cancel(skb, msg_head);
+       return -EMSGSIZE;
+}
+
+/*
+ * Netlink dump callback listing the running sync daemons.
+ *
+ * cb->args[0] and cb->args[1] record that the master respectively the
+ * backup daemon has already been dumped by an earlier invocation.
+ *
+ * Returns skb->len.
+ */
+static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
+                                  struct netlink_callback *cb)
+{
+       mutex_lock(&__ip_vs_mutex);
+
+       if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && cb->args[0] == 0) {
+               if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
+                                          ip_vs_master_mcast_ifn,
+                                          ip_vs_master_syncid, cb) < 0)
+                       goto out_unlock;
+
+               /* master entry is done */
+               cb->args[0] = 1;
+       }
+
+       if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && cb->args[1] == 0) {
+               if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
+                                          ip_vs_backup_mcast_ifn,
+                                          ip_vs_backup_syncid, cb) < 0)
+                       goto out_unlock;
+
+               /* backup entry is done */
+               cb->args[1] = 1;
+       }
+
+out_unlock:
+       mutex_unlock(&__ip_vs_mutex);
+
+       return skb->len;
+}
+
+/*
+ * Start a sync daemon from parsed IPVS_DAEMON_ATTR_* attributes.
+ * State, multicast interface name and sync id are all mandatory.
+ *
+ * Returns -EINVAL when one of them is missing, otherwise the result of
+ * start_sync_thread().
+ */
+static int ip_vs_genl_new_daemon(struct nlattr **attrs)
+{
+       struct nlattr *nla_state = attrs[IPVS_DAEMON_ATTR_STATE];
+       struct nlattr *nla_ifn = attrs[IPVS_DAEMON_ATTR_MCAST_IFN];
+       struct nlattr *nla_syncid = attrs[IPVS_DAEMON_ATTR_SYNC_ID];
+
+       if (!nla_state || !nla_ifn || !nla_syncid)
+               return -EINVAL;
+
+       return start_sync_thread(nla_get_u32(nla_state),
+                                nla_data(nla_ifn),
+                                nla_get_u32(nla_syncid));
+}
+
+/*
+ * Stop the sync daemon selected by the mandatory IPVS_DAEMON_ATTR_STATE
+ * attribute.
+ *
+ * Returns -EINVAL when the attribute is missing, otherwise the result of
+ * stop_sync_thread().
+ */
+static int ip_vs_genl_del_daemon(struct nlattr **attrs)
+{
+       struct nlattr *nla_state = attrs[IPVS_DAEMON_ATTR_STATE];
+
+       if (nla_state == NULL)
+               return -EINVAL;
+
+       return stop_sync_thread(nla_get_u32(nla_state));
+}
+
+/*
+ * Apply the timeout attributes of an IPVS_CMD_SET_CONFIG request.
+ *
+ * The current timeouts are read first; every timeout attribute present in
+ * @attrs then replaces the corresponding value before the whole set is
+ * handed to ip_vs_set_timeout().
+ *
+ * Returns the result of ip_vs_set_timeout().
+ */
+static int ip_vs_genl_set_config(struct nlattr **attrs)
+{
+       struct nlattr *nla_tcp = attrs[IPVS_CMD_ATTR_TIMEOUT_TCP];
+       struct nlattr *nla_tcp_fin = attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN];
+       struct nlattr *nla_udp = attrs[IPVS_CMD_ATTR_TIMEOUT_UDP];
+       struct ip_vs_timeout_user t;
+
+       /* start from the values currently in effect */
+       __ip_vs_get_timeouts(&t);
+
+       if (nla_tcp != NULL)
+               t.tcp_timeout = nla_get_u32(nla_tcp);
+
+       if (nla_tcp_fin != NULL)
+               t.tcp_fin_timeout = nla_get_u32(nla_tcp_fin);
+
+       if (nla_udp != NULL)
+               t.udp_timeout = nla_get_u32(nla_udp);
+
+       return ip_vs_set_timeout(&t);
+}
+
+static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+       struct ip_vs_service *svc = NULL;
+       struct ip_vs_service_user_kern usvc;
+       struct ip_vs_dest_user_kern udest;
+       int ret = 0, cmd;
+       int need_full_svc = 0, need_full_dest = 0;
+
+       cmd = info->genlhdr->cmd;
+
+       mutex_lock(&__ip_vs_mutex);
+
+       if (cmd == IPVS_CMD_FLUSH) {
+               ret = ip_vs_flush();
+               goto out;
+       } else if (cmd == IPVS_CMD_SET_CONFIG) {
+               ret = ip_vs_genl_set_config(info->attrs);
+               goto out;
+       } else if (cmd == IPVS_CMD_NEW_DAEMON ||
+                  cmd == IPVS_CMD_DEL_DAEMON) {
+
+               struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
+
+               if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
+                   nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
+                                    info->attrs[IPVS_CMD_ATTR_DAEMON],
+                                    ip_vs_daemon_policy)) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               if (cmd == IPVS_CMD_NEW_DAEMON)
+                       ret = ip_vs_genl_new_daemon(daemon_attrs);
+               else
+                       ret = ip_vs_genl_del_daemon(daemon_attrs);
+               goto out;
+       } else if (cmd == IPVS_CMD_ZERO &&
+                  !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
+               ret = ip_vs_zero_all();
+               goto out;
+       }
+
+       /* All following commands require a service argument, so check if we
+        * received a valid one. We need a full service specification when
+        * adding / editing a service. Only identifying members otherwise. */
+       if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
+               need_full_svc = 1;
+
+       ret = ip_vs_genl_parse_service(&usvc,
+                                      info->attrs[IPVS_CMD_ATTR_SERVICE],
+                                      need_full_svc);
+       if (ret)
+               goto out;
+
+       /* Lookup the exact service by <protocol, addr, port> or fwmark */
+       if (usvc.fwmark == 0)
+               svc = __ip_vs_service_get(usvc.af, usvc.protocol,
+                                         &usvc.addr, usvc.port);
+       else
+               svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+
+       /* Unless we're adding a new service, the service must already exist */
+       if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
+               ret = -ESRCH;
+               goto out;
+       }
+
+       /* Destination commands require a valid destination argument. For
+        * adding / editing a destination, we need a full destination
+        * specification. */
+       if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
+           cmd == IPVS_CMD_DEL_DEST) {
+               if (cmd != IPVS_CMD_DEL_DEST)
+                       need_full_dest = 1;
+
+               ret = ip_vs_genl_parse_dest(&udest,
+                                           info->attrs[IPVS_CMD_ATTR_DEST],
+                                           need_full_dest);
+               if (ret)
+                       goto out;
+       }
+
+       switch (cmd) {
+       case IPVS_CMD_NEW_SERVICE:
+               if (svc == NULL)
+                       ret = ip_vs_add_service(&usvc, &svc);
+               else
+                       ret = -EEXIST;
+               break;
+       case IPVS_CMD_SET_SERVICE:
+               ret = ip_vs_edit_service(svc, &usvc);
+               break;
+       case IPVS_CMD_DEL_SERVICE:
+               ret = ip_vs_del_service(svc);
+               break;
+       case IPVS_CMD_NEW_DEST:
+               ret = ip_vs_add_dest(svc, &udest);
+               break;
+       case IPVS_CMD_SET_DEST:
+               ret = ip_vs_edit_dest(svc, &udest);
+               break;
+       case IPVS_CMD_DEL_DEST:
+               ret = ip_vs_del_dest(svc, &udest);
+               break;
+       case IPVS_CMD_ZERO:
+               ret = ip_vs_zero_service(svc);
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+out:
+       if (svc)
+               ip_vs_service_put(svc);
+       mutex_unlock(&__ip_vs_mutex);
+
+       return ret;
+}
+
+/*
+ * Generic Netlink "doit" handler for the read-only commands
+ * IPVS_CMD_GET_SERVICE, IPVS_CMD_GET_CONFIG and IPVS_CMD_GET_INFO.
+ *
+ * Builds a reply message whose command is the matching NEW/SET command
+ * and unicasts it to the requester. Returns the genlmsg_unicast() result
+ * on success, -EINVAL for an unknown command, -ENOMEM if the reply cannot
+ * be allocated, -EMSGSIZE if it does not fit, or the service lookup error.
+ */
+static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+       struct sk_buff *msg;
+       void *reply;
+       int ret, cmd, reply_cmd;
+
+       cmd = info->genlhdr->cmd;
+
+       /* Map the request onto the command used in the reply header */
+       switch (cmd) {
+       case IPVS_CMD_GET_SERVICE:
+               reply_cmd = IPVS_CMD_NEW_SERVICE;
+               break;
+       case IPVS_CMD_GET_INFO:
+               reply_cmd = IPVS_CMD_SET_INFO;
+               break;
+       case IPVS_CMD_GET_CONFIG:
+               reply_cmd = IPVS_CMD_SET_CONFIG;
+               break;
+       default:
+               IP_VS_ERR("unknown Generic Netlink command\n");
+               return -EINVAL;
+       }
+
+       msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!msg)
+               return -ENOMEM;
+
+       mutex_lock(&__ip_vs_mutex);
+
+       reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
+       if (reply == NULL)
+               goto nla_put_failure;
+
+       switch (cmd) {
+       case IPVS_CMD_GET_SERVICE:
+       {
+               struct ip_vs_service *svc;
+
+               svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
+               if (IS_ERR(svc)) {
+                       ret = PTR_ERR(svc);
+                       goto out_err;
+               }
+               if (!svc) {
+                       ret = -ESRCH;
+                       goto out_err;
+               }
+
+               /* Drop the lookup reference whether or not the fill worked */
+               ret = ip_vs_genl_fill_service(msg, svc);
+               ip_vs_service_put(svc);
+               if (ret)
+                       goto nla_put_failure;
+
+               break;
+       }
+
+       case IPVS_CMD_GET_CONFIG:
+       {
+               struct ip_vs_timeout_user t;
+
+               __ip_vs_get_timeouts(&t);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+               NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
+               NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
+                           t.tcp_fin_timeout);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+               NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
+#endif
+
+               break;
+       }
+
+       case IPVS_CMD_GET_INFO:
+               NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
+               NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
+                           IP_VS_CONN_TAB_SIZE);
+               break;
+       }
+
+       genlmsg_end(msg, reply);
+       ret = genlmsg_unicast(msg, info->snd_pid);
+       goto out;
+
+nla_put_failure:
+       IP_VS_ERR("not enough space in Netlink message\n");
+       ret = -EMSGSIZE;
+
+out_err:
+       nlmsg_free(msg);
+out:
+       mutex_unlock(&__ip_vs_mutex);
+
+       return ret;
+}
+
+
+/*
+ * Generic Netlink operations exported by the IPVS family.
+ *
+ * Every command requires GENL_ADMIN_PERM. Modifying commands are routed
+ * to ip_vs_genl_set_cmd(), single-object queries to ip_vs_genl_get_cmd(),
+ * and list queries to the matching ip_vs_genl_dump_*() callback.
+ * The entries are registered one by one in ip_vs_genl_register().
+ */
+static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
+       {
+               .cmd    = IPVS_CMD_NEW_SERVICE,
+               .flags  = GENL_ADMIN_PERM,
+               .policy = ip_vs_cmd_policy,
+               .doit   = ip_vs_genl_set_cmd,
+       },
+       {
+               .cmd    = IPVS_CMD_SET_SERVICE,
+               .flags  = GENL_ADMIN_PERM,
+               .policy = ip_vs_cmd_policy,
+               .doit   = ip_vs_genl_set_cmd,
+       },
+       {
+               .cmd    = IPVS_CMD_DEL_SERVICE,
+               .flags  = GENL_ADMIN_PERM,
+               .policy = ip_vs_cmd_policy,
+               .doit   = ip_vs_genl_set_cmd,
+       },
+       {
+               .cmd    = IPVS_CMD_GET_SERVICE,
+               .flags  = GENL_ADMIN_PERM,
+               .doit   = ip_vs_genl_get_cmd,
+               .dumpit = ip_vs_genl_dump_services,
+               .policy = ip_vs_cmd_policy,
+       },
+       {
+               .cmd    = IPVS_CMD_NEW_DEST,
+               .flags  = GENL_ADMIN_PERM,
+               .policy = ip_vs_cmd_policy,
+               .doit   = ip_vs_genl_set_cmd,
+       },
+       {
+               .cmd    = IPVS_CMD_SET_DEST,
+               .flags  = GENL_ADMIN_PERM,
+               .policy = ip_vs_cmd_policy,
+               .doit   = ip_vs_genl_set_cmd,
+       },
+       {
+               .cmd    = IPVS_CMD_DEL_DEST,
+               .flags  = GENL_ADMIN_PERM,
+               .policy = ip_vs_cmd_policy,
+               .doit   = ip_vs_genl_set_cmd,
+       },
+       {
+               .cmd    = IPVS_CMD_GET_DEST,
+               .flags  = GENL_ADMIN_PERM,
+               .policy = ip_vs_cmd_policy,
+               .dumpit = ip_vs_genl_dump_dests,
+       },
+       {
+               .cmd    = IPVS_CMD_NEW_DAEMON,
+               .flags  = GENL_ADMIN_PERM,
+               .policy = ip_vs_cmd_policy,
+               .doit   = ip_vs_genl_set_cmd,
+       },
+       {
+               .cmd    = IPVS_CMD_DEL_DAEMON,
+               .flags  = GENL_ADMIN_PERM,
+               .policy = ip_vs_cmd_policy,
+               .doit   = ip_vs_genl_set_cmd,
+       },
+       {
+               .cmd    = IPVS_CMD_GET_DAEMON,
+               .flags  = GENL_ADMIN_PERM,
+               .dumpit = ip_vs_genl_dump_daemons,
+       },
+       {
+               .cmd    = IPVS_CMD_SET_CONFIG,
+               .flags  = GENL_ADMIN_PERM,
+               .policy = ip_vs_cmd_policy,
+               .doit   = ip_vs_genl_set_cmd,
+       },
+       {
+               .cmd    = IPVS_CMD_GET_CONFIG,
+               .flags  = GENL_ADMIN_PERM,
+               .doit   = ip_vs_genl_get_cmd,
+       },
+       {
+               .cmd    = IPVS_CMD_GET_INFO,
+               .flags  = GENL_ADMIN_PERM,
+               .doit   = ip_vs_genl_get_cmd,
+       },
+       {
+               .cmd    = IPVS_CMD_ZERO,
+               .flags  = GENL_ADMIN_PERM,
+               .policy = ip_vs_cmd_policy,
+               .doit   = ip_vs_genl_set_cmd,
+       },
+       {
+               .cmd    = IPVS_CMD_FLUSH,
+               .flags  = GENL_ADMIN_PERM,
+               .doit   = ip_vs_genl_set_cmd,
+       },
+};
+
+/*
+ * Register the IPVS Generic Netlink family and every entry of
+ * ip_vs_genl_ops. If any operation fails to register, the family is
+ * unregistered again and the error code is returned. Returns 0 on success.
+ */
+static int __init ip_vs_genl_register(void)
+{
+       int err, n;
+
+       err = genl_register_family(&ip_vs_genl_family);
+       if (err)
+               return err;
+
+       for (n = 0; n < ARRAY_SIZE(ip_vs_genl_ops); n++) {
+               err = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[n]);
+               if (err) {
+                       /* roll back the family registration done above */
+                       genl_unregister_family(&ip_vs_genl_family);
+                       return err;
+               }
+       }
+
+       return 0;
+}
+
+/* Undo ip_vs_genl_register() by unregistering the IPVS netlink family. */
+static void ip_vs_genl_unregister(void)
+{
+       genl_unregister_family(&ip_vs_genl_family);
+}
+
+/* End of Generic Netlink interface definitions */
+
+
+/*
+ * Set up the IPVS control plane: sockopt and Generic Netlink interfaces,
+ * /proc entries, sysctl table, the service and real-server hash tables,
+ * the global rate estimator and the periodic defense work.
+ *
+ * Returns 0 on success, or the error from sockopt / netlink registration
+ * (the sockopt registration is rolled back if netlink registration fails).
+ */
+int __init ip_vs_control_init(void)
+{
+       int err;
+       int i;
+
+       EnterFunction(2);
+
+       err = nf_register_sockopt(&ip_vs_sockopts);
+       if (err) {
+               IP_VS_ERR("cannot register sockopt.\n");
+               return err;
+       }
+
+       err = ip_vs_genl_register();
+       if (err) {
+               IP_VS_ERR("cannot register Generic Netlink interface.\n");
+               nf_unregister_sockopt(&ip_vs_sockopts);
+               return err;
+       }
+
+       /* NOTE(review): results of the proc/sysctl registrations below are
+        * not checked here */
+       proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
+       proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
+
+       sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
+
+       /* Every bucket of the three hash tables starts as an empty list */
+       for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+               INIT_LIST_HEAD(&ip_vs_svc_table[i]);
+               INIT_LIST_HEAD(&ip_vs_svc_fwm_table[i]);
+       }
+       for (i = 0; i < IP_VS_RTAB_SIZE; i++)
+               INIT_LIST_HEAD(&ip_vs_rtable[i]);
+
+       ip_vs_new_estimator(&ip_vs_stats);
+
+       /* Arm the defense timer */
+       schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+
+       LeaveFunction(2);
+       return 0;
+}
+
+
+/*
+ * Tear down everything ip_vs_control_init() set up, in reverse order of
+ * creation: defense work, global estimator, sysctl table, /proc entries,
+ * Generic Netlink family and finally the sockopt interface.
+ */
+void ip_vs_control_cleanup(void)
+{
+       EnterFunction(2);
+       ip_vs_trash_cleanup();
+       /* stop the self-rearming defense work before the rest goes away */
+       cancel_rearming_delayed_work(&defense_work);
+       cancel_work_sync(&defense_work.work);
+       ip_vs_kill_estimator(&ip_vs_stats);
+       unregister_sysctl_table(sysctl_header);
+       proc_net_remove(&init_net, "ip_vs_stats");
+       proc_net_remove(&init_net, "ip_vs");
+       ip_vs_genl_unregister();
+       nf_unregister_sockopt(&ip_vs_sockopts);
+       LeaveFunction(2);
+}
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c

new file mode 100644 (file)

index 0000000..a16943f
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -0,0 +1,261 @@
+/*
+ * IPVS:        Destination Hashing scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              Inspired by the consistent hashing scheduler patch from
+ *              Thomas Proell <proellt@gmx.de>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The dh algorithm is to select server by the hash key of destination IP
+ * address. The pseudo code is as follows:
+ *
+ *       n <- servernode[dest_ip];
+ *       if (n is dead) OR
+ *          (n is overloaded) OR (n.weight <= 0) then
+ *                 return NULL;
+ *
+ *       return n;
+ *
+ * Note that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet destination IP address to the current server
+ * array. If the dh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ */
+
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *      IPVS DH bucket: one slot of the per-service destination hash table.
+ *      A slot holds a counted reference to a real server, or NULL.
+ */
+struct ip_vs_dh_bucket {
+       struct ip_vs_dest       *dest;          /* real server (cache) */
+};
+
+/*
+ *     Size of the IPVS DH hash table: 2^IP_VS_DH_TAB_BITS buckets
+ *     (8 bits, i.e. 256 buckets, unless CONFIG_IP_VS_DH_TAB_BITS is set).
+ */
+#ifndef CONFIG_IP_VS_DH_TAB_BITS
+#define CONFIG_IP_VS_DH_TAB_BITS        8
+#endif
+#define IP_VS_DH_TAB_BITS               CONFIG_IP_VS_DH_TAB_BITS
+#define IP_VS_DH_TAB_SIZE               (1 << IP_VS_DH_TAB_BITS)
+#define IP_VS_DH_TAB_MASK               (IP_VS_DH_TAB_SIZE - 1)
+
+
+/*
+ *     Map an IPv4 address (network byte order) to a DH table index:
+ *     multiply the host-order address by 2654435761 and keep the low
+ *     IP_VS_DH_TAB_BITS bits.
+ */
+static inline unsigned ip_vs_dh_hashkey(__be32 addr)
+{
+       unsigned long hash = ntohl(addr)*2654435761UL;
+
+       return hash & IP_VS_DH_TAB_MASK;
+}
+
+
+/*
+ *      Look up the destination stored in the bucket that "addr" hashes to.
+ *      Returns NULL if that bucket is empty.
+ */
+static inline struct ip_vs_dest *
+ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr)
+{
+       struct ip_vs_dh_bucket *bucket = tbl + ip_vs_dh_hashkey(addr);
+
+       return bucket->dest;
+}
+
+
+/*
+ *      Fill every bucket of the table from the service's destination list,
+ *      walking the list round-robin and taking one reference on a
+ *      destination per bucket it is stored in. With no destinations all
+ *      buckets are set to NULL. Always returns 0.
+ */
+static int
+ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
+{
+       struct list_head *head = &svc->destinations;
+       struct list_head *pos = head;
+       int n;
+
+       for (n = 0; n < IP_VS_DH_TAB_SIZE; n++) {
+               struct ip_vs_dh_bucket *bucket = &tbl[n];
+               struct ip_vs_dest *dest;
+
+               if (list_empty(pos)) {
+                       bucket->dest = NULL;
+                       continue;
+               }
+
+               /* the list head is not a destination: step over it */
+               if (pos == head)
+                       pos = pos->next;
+
+               dest = list_entry(pos, struct ip_vs_dest, n_list);
+               atomic_inc(&dest->refcnt);
+               bucket->dest = dest;
+
+               pos = pos->next;
+       }
+       return 0;
+}
+
+
+/*
+ *      Empty every bucket of the table, dropping the reference that
+ *      ip_vs_dh_assign() took for each stored destination.
+ */
+static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
+{
+       int n;
+
+       for (n = 0; n < IP_VS_DH_TAB_SIZE; n++) {
+               struct ip_vs_dh_bucket *bucket = &tbl[n];
+
+               if (!bucket->dest)
+                       continue;
+
+               atomic_dec(&bucket->dest->refcnt);
+               bucket->dest = NULL;
+       }
+}
+
+
+/*
+ *      Scheduler init_service hook: allocate the DH table for the service,
+ *      store it in svc->sched_data and populate it from the current
+ *      destinations. Returns 0, or -ENOMEM if the table cannot be allocated.
+ */
+static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
+{
+       struct ip_vs_dh_bucket *table;
+
+       /* allocate the DH table for this service */
+       table = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
+                       GFP_ATOMIC);
+       if (!table) {
+               IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
+               return -ENOMEM;
+       }
+
+       svc->sched_data = table;
+       IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
+                 "current service\n",
+                 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+       /* fill the buckets from the service's destination list */
+       ip_vs_dh_assign(table, svc);
+
+       return 0;
+}
+
+
+/*
+ *      Scheduler done_service hook: release the destination references held
+ *      by the DH table and free the table itself. Always returns 0.
+ */
+static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
+{
+       struct ip_vs_dh_bucket *table = svc->sched_data;
+
+       /* drop the per-bucket destination references first */
+       ip_vs_dh_flush(table);
+
+       /* then free the table memory */
+       kfree(svc->sched_data);
+       IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
+                 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+       return 0;
+}
+
+
+/*
+ *      Scheduler update_service hook: rebuild the DH table by flushing it
+ *      and reassigning the buckets from the service's current destination
+ *      list. Always returns 0.
+ */
+static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
+{
+       struct ip_vs_dh_bucket *table = svc->sched_data;
+
+       /* release the old bucket contents */
+       ip_vs_dh_flush(table);
+
+       /* repopulate from the updated destination list */
+       ip_vs_dh_assign(table, svc);
+
+       return 0;
+}
+
+
+/*
+ *      A destination counts as overloaded when IP_VS_DEST_F_OVERLOAD is
+ *      set in its flags. Returns non-zero if overloaded, 0 otherwise.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+       return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ *      Destination hashing scheduling: pick the real server stored in the
+ *      bucket selected by the packet's destination IP address.
+ *      Returns NULL when the bucket is empty or its server is unavailable,
+ *      has a weight <= 0, or is overloaded.
+ */
+static struct ip_vs_dest *
+ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+       struct iphdr *iph = ip_hdr(skb);
+       struct ip_vs_dh_bucket *table;
+       struct ip_vs_dest *server;
+
+       IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
+
+       table = (struct ip_vs_dh_bucket *)svc->sched_data;
+       server = ip_vs_dh_get(table, iph->daddr);
+
+       /* reject unusable servers; checks run in the original order */
+       if (!server)
+               return NULL;
+       if (!(server->flags & IP_VS_DEST_F_AVAILABLE))
+               return NULL;
+       if (atomic_read(&server->weight) <= 0)
+               return NULL;
+       if (is_overloaded(server))
+               return NULL;
+
+       IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
+                 "--> server %u.%u.%u.%u:%d\n",
+                 NIPQUAD(iph->daddr),
+                 NIPQUAD(server->addr.ip),
+                 ntohs(server->port));
+
+       return server;
+}
+
+
+/*
+ *      IPVS DH Scheduler structure: registers the scheduler under the name
+ *      "dh" and wires up the init/done/update/schedule callbacks above.
+ *      IPv6 is marked as unsupported when CONFIG_IP_VS_IPV6 is enabled.
+ */
+static struct ip_vs_scheduler ip_vs_dh_scheduler =
+{
+       .name =                 "dh",
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       .n_list =               LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+       .supports_ipv6 =        0,
+#endif
+       .init_service =         ip_vs_dh_init_svc,
+       .done_service =         ip_vs_dh_done_svc,
+       .update_service =       ip_vs_dh_update_svc,
+       .schedule =             ip_vs_dh_schedule,
+};
+
+
+/* Module entry point: register the "dh" scheduler with IPVS. */
+static int __init ip_vs_dh_init(void)
+{
+       return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+/* Module exit point: unregister the "dh" scheduler from IPVS. */
+static void __exit ip_vs_dh_cleanup(void)
+{
+       unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+module_init(ip_vs_dh_init);
+module_exit(ip_vs_dh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c

new file mode 100644 (file)

index 0000000..2eb2860
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -0,0 +1,166 @@
+/*
+ * ip_vs_est.c: simple rate estimator for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/sysctl.h>
+#include <linux/list.h>
+
+#include <net/ip_vs.h>
+
+/*
+  This code is to estimate rate in a shorter interval (such as 8
+  seconds) for virtual services and real servers. To measure the rate over
+  a long interval, it is easy to implement a user level daemon which
+  periodically reads those statistical counters and measures the rate.
+
+  Currently, the measurement is activated by slow timer handler. Hope
+  this measurement will not introduce too much load.
+
+  We measure rate during the last 8 seconds every 2 seconds:
+
+    avgrate = avgrate*(1-W) + rate*W
+
+    where W = 2^(-2)
+
+  NOTES.
+
+  * The stored value for average bps is scaled by 2^5, so that maximal
+    rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
+
+  * A lot of code is taken from net/sched/estimator.c
+ */
+
+
+static void estimation_timer(unsigned long arg);
+
+static LIST_HEAD(est_list);
+static DEFINE_SPINLOCK(est_lock);
+static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
+
+/*
+ * Timer callback, re-armed every 2 seconds. For every registered
+ * estimator it takes the counter deltas since the previous run, folds
+ * them into the scaled moving averages (avg += (rate - avg) / 4) and
+ * publishes the rounded, unscaled rates in the owning stats structure.
+ */
+static void estimation_timer(unsigned long arg)
+{
+       struct ip_vs_estimator *e;
+
+       spin_lock(&est_lock);
+       list_for_each_entry(e, &est_list, list) {
+               struct ip_vs_stats *s;
+               u32 n_conns;
+               u32 n_inpkts, n_outpkts;
+               u64 n_inbytes, n_outbytes;
+               u32 rate;
+
+               s = container_of(e, struct ip_vs_stats, est);
+
+               spin_lock(&s->lock);
+
+               /* snapshot the current counters */
+               n_conns = s->ustats.conns;
+               n_inpkts = s->ustats.inpkts;
+               n_outpkts = s->ustats.outpkts;
+               n_inbytes = s->ustats.inbytes;
+               n_outbytes = s->ustats.outbytes;
+
+               /* connections/s: scaled by 2^10, but divided 2 seconds */
+               rate = (n_conns - e->last_conns)<<9;
+               e->last_conns = n_conns;
+               e->cps += ((long)rate - (long)e->cps)>>2;
+               s->ustats.cps = (e->cps+0x1FF)>>10;
+
+               /* incoming packets/s */
+               rate = (n_inpkts - e->last_inpkts)<<9;
+               e->last_inpkts = n_inpkts;
+               e->inpps += ((long)rate - (long)e->inpps)>>2;
+               s->ustats.inpps = (e->inpps+0x1FF)>>10;
+
+               /* outgoing packets/s */
+               rate = (n_outpkts - e->last_outpkts)<<9;
+               e->last_outpkts = n_outpkts;
+               e->outpps += ((long)rate - (long)e->outpps)>>2;
+               s->ustats.outpps = (e->outpps+0x1FF)>>10;
+
+               /* incoming bytes/s: scaled by 2^5, divided by 2 seconds */
+               rate = (n_inbytes - e->last_inbytes)<<4;
+               e->last_inbytes = n_inbytes;
+               e->inbps += ((long)rate - (long)e->inbps)>>2;
+               s->ustats.inbps = (e->inbps+0xF)>>5;
+
+               /* outgoing bytes/s */
+               rate = (n_outbytes - e->last_outbytes)<<4;
+               e->last_outbytes = n_outbytes;
+               e->outbps += ((long)rate - (long)e->outbps)>>2;
+               s->ustats.outbps = (e->outbps+0xF)>>5;
+
+               spin_unlock(&s->lock);
+       }
+       spin_unlock(&est_lock);
+
+       mod_timer(&est_timer, jiffies + 2*HZ);
+}
+
+/*
+ * Initialise the estimator embedded in "stats" from the current counters
+ * and rates, then add it to the list walked by estimation_timer().
+ */
+void ip_vs_new_estimator(struct ip_vs_stats *stats)
+{
+       struct ip_vs_estimator *e = &stats->est;
+
+       INIT_LIST_HEAD(&e->list);
+
+       /* baseline counters for the first delta computation */
+       e->last_conns = stats->ustats.conns;
+       e->last_inpkts = stats->ustats.inpkts;
+       e->last_outpkts = stats->ustats.outpkts;
+       e->last_inbytes = stats->ustats.inbytes;
+       e->last_outbytes = stats->ustats.outbytes;
+
+       /* seed the scaled averages: 2^10 for cps/pps, 2^5 for bps */
+       e->cps = stats->ustats.cps<<10;
+       e->inpps = stats->ustats.inpps<<10;
+       e->outpps = stats->ustats.outpps<<10;
+       e->inbps = stats->ustats.inbps<<5;
+       e->outbps = stats->ustats.outbps<<5;
+
+       spin_lock_bh(&est_lock);
+       list_add(&e->list, &est_list);
+       spin_unlock_bh(&est_lock);
+}
+
+/*
+ * Remove the estimator embedded in "stats" from the estimator list so
+ * that estimation_timer() no longer updates it.
+ */
+void ip_vs_kill_estimator(struct ip_vs_stats *stats)
+{
+       spin_lock_bh(&est_lock);
+       list_del(&stats->est.list);
+       spin_unlock_bh(&est_lock);
+}
+
+/*
+ * Reset all estimator state of "stats" to zero.
+ * The caller must hold the stats->lock lock.
+ */
+void ip_vs_zero_estimator(struct ip_vs_stats *stats)
+{
+       struct ip_vs_estimator *e = &stats->est;
+
+       /* previous counter snapshots */
+       e->last_conns = 0;
+       e->last_inpkts = 0;
+       e->last_outpkts = 0;
+       e->last_inbytes = 0;
+       e->last_outbytes = 0;
+
+       /* scaled moving averages */
+       e->cps = 0;
+       e->inpps = 0;
+       e->outpps = 0;
+       e->inbps = 0;
+       e->outbps = 0;
+}
+
+/* Start the estimation timer; first run is 2 seconds from now. Returns 0. */
+int __init ip_vs_estimator_init(void)
+{
+       mod_timer(&est_timer, jiffies + 2 * HZ);
+       return 0;
+}
+
+/* Stop the estimation timer, waiting for a running handler to finish. */
+void ip_vs_estimator_cleanup(void)
+{
+       del_timer_sync(&est_timer);
+}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c

new file mode 100644 (file)

index 0000000..2e7dbd8
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -0,0 +1,410 @@
+/*
+ * ip_vs_ftp.c: IPVS ftp application module
+ *
+ * Authors:    Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * Changes:
+ *
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License
+ *     as published by the Free Software Foundation; either version
+ *     2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
+ * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
+ *
+ *             IP_MASQ_FTP ftp masquerading module
+ *
+ * Version:    @(#)ip_masq_ftp.c 0.04   02/05/96
+ *
+ * Author:     Wouter Gadeyne
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <asm/unaligned.h>
+
+#include <net/ip_vs.h>
+
+
+#define SERVER_STRING "227 Entering Passive Mode ("
+#define CLIENT_STRING "PORT "
+
+
+/*
+ * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by the helper.
+ * The first port is set to the default FTP control port (21); the list
+ * can be overridden with the "ports" module parameter.
+ */
+static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
+module_param_array(ports, ushort, NULL, 0);
+MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
+
+
+/*
+ * Dummy variable: only its address is used. ip_vs_ftp_in() stores it in
+ * cp->app_data when it sees a PASV command, and ip_vs_ftp_out() compares
+ * cp->app_data against it to know a PASV response is expected.
+ */
+static int ip_vs_ftp_pasv;
+
+
+/* Connection-init hook: does nothing and always returns 0. */
+static int
+ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+       return 0;
+}
+
+
+/* Connection-done hook: does nothing and always returns 0. */
+static int
+ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+       return 0;
+}
+
+
+/*
+ * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
+ * with the "pattern" and terminated with the "term" character.
+ * <addr,port> is in network order.
+ *
+ * "data" points at the payload and "data_limit" one byte past its end;
+ * no byte at or beyond data_limit is ever read.
+ *
+ * On success *start and *end delimit the "a,b,c,d,p1,p2" text (the
+ * terminator itself is at *end).
+ *
+ * Returns:
+ *      1  address and port were parsed into *addr and *port
+ *      0  the payload does not start with "pattern"
+ *     -1  partial pattern match, missing terminator or malformed numbers
+ */
+static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
+                                 const char *pattern, size_t plen, char term,
+                                 __be32 *addr, __be16 *port,
+                                 char **start, char **end)
+{
+       unsigned char p[6];
+       int i = 0;
+
+       /*
+        * The length check below compares a signed pointer difference with
+        * the unsigned plen, so a payload pointer beyond the limit would
+        * pass it. Treat that as "no match".
+        */
+       if (data > data_limit)
+               return 0;
+
+       if (data_limit - data < plen) {
+               /* check if there is partial match */
+               if (strnicmp(data, pattern, data_limit - data) == 0)
+                       return -1;
+               else
+                       return 0;
+       }
+
+       if (strnicmp(data, pattern, plen) != 0) {
+               return 0;
+       }
+       *start = data + plen;
+
+       /*
+        * Find the terminator. The bound is tested before the byte is
+        * read, so the byte at data_limit is never dereferenced.
+        */
+       for (data = *start; ; data++) {
+               if (data == data_limit)
+                       return -1;
+               if (*data == term)
+                       break;
+       }
+       *end = data;
+
+       /* parse six comma-separated decimal fields into p[0..5] */
+       memset(p, 0, sizeof(p));
+       for (data = *start; data != *end; data++) {
+               if (*data >= '0' && *data <= '9') {
+                       p[i] = p[i]*10 + *data - '0';
+               } else if (*data == ',' && i < 5) {
+                       i++;
+               } else {
+                       /* unexpected character */
+                       return -1;
+               }
+       }
+
+       /* exactly five commas, i.e. six fields, are required */
+       if (i != 5)
+               return -1;
+
+       *addr = get_unaligned((__be32 *)p);
+       *port = get_unaligned((__be16 *)(p + 4));
+       return 1;
+}
+
+
+/*
+ * Look at outgoing ftp packets to catch the response to a PASV command
+ * from the server (inside-to-outside).
+ * When we see one, we build a connection entry with the client address,
+ * client port 0 (unknown at the moment), the server address and the
+ * server port.  Mark the current connection entry as a control channel
+ * of the new entry. All this work is just so that the data connection
+ * can be scheduled to the right server later.
+ *
+ * The outgoing packet should be something like
+ *   "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
+ * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ *
+ * *diff is set on every path: 0 when the payload length is unchanged,
+ * otherwise the change in payload length caused by the rewrite.
+ *
+ * Returns 1 if the packet may continue, 0 on failure (the skb could not
+ * be made writable, no connection entry could be created, or the payload
+ * replacement failed).
+ */
+static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
+                        struct sk_buff *skb, int *diff)
+{
+       struct iphdr *iph;
+       struct tcphdr *th;
+       char *data, *data_limit;
+       char *start, *end;
+       union nf_inet_addr from;
+       __be16 port;
+       struct ip_vs_conn *n_cp;
+       char buf[24];           /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
+       unsigned buf_len;
+       int ret;
+
+       /*
+        * Set *diff before any early return, so that the IPv6 no-op path
+        * below does not leave it uninitialised.
+        */
+       *diff = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+       /* This application helper doesn't work with IPv6 yet,
+        * so turn this into a no-op for IPv6 packets
+        */
+       if (cp->af == AF_INET6)
+               return 1;
+#endif
+
+       /* Only useful for established sessions */
+       if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+               return 1;
+
+       /* Linear packets are much easier to deal with. */
+       if (!skb_make_writable(skb, skb->len))
+               return 0;
+
+       if (cp->app_data == &ip_vs_ftp_pasv) {
+               iph = ip_hdr(skb);
+               th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+               data = (char *)th + (th->doff << 2);
+               data_limit = skb_tail_pointer(skb);
+
+               if (ip_vs_ftp_get_addrport(data, data_limit,
+                                          SERVER_STRING,
+                                          sizeof(SERVER_STRING)-1, ')',
+                                          &from.ip, &port,
+                                          &start, &end) != 1)
+                       return 1;
+
+               IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> "
+                         "%u.%u.%u.%u:%d detected\n",
+                         NIPQUAD(from.ip), ntohs(port),
+                         NIPQUAD(cp->caddr.ip), 0);
+
+               /*
+                * Now update or create a connection entry for it
+                */
+               n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
+                                         &cp->caddr, 0);
+               if (!n_cp) {
+                       n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
+                                             &cp->caddr, 0,
+                                             &cp->vaddr, port,
+                                             &from, port,
+                                             IP_VS_CONN_F_NO_CPORT,
+                                             cp->dest);
+                       if (!n_cp)
+                               return 0;
+
+                       /* add its controller */
+                       ip_vs_control_add(n_cp, cp);
+               }
+
+               /*
+                * Replace the old passive address with the new one
+                */
+               from.ip = n_cp->vaddr.ip;
+               port = n_cp->vport;
+               sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip),
+                       (ntohs(port)>>8)&255, ntohs(port)&255);
+               buf_len = strlen(buf);
+
+               /*
+                * Calculate required delta-offset to keep TCP happy
+                */
+               *diff = buf_len - (end-start);
+
+               if (*diff == 0) {
+                       /* simply replace it with new passive address */
+                       memcpy(start, buf, buf_len);
+                       ret = 1;
+               } else {
+                       ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start,
+                                         end-start, buf, buf_len);
+               }
+
+               /* the PASV response has been handled: clear the marker */
+               cp->app_data = NULL;
+               ip_vs_tcp_conn_listen(n_cp);
+               ip_vs_conn_put(n_cp);
+               return ret;
+       }
+       return 1;
+}
+
+
+/*
+ * Look at incoming ftp packets to catch the PASV/PORT command
+ * (outside-to-inside).
+ *
+ * The incoming packet having the PORT command should be something like
+ *      "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
+ * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
+ * In this case, we create a connection entry using the client address and
+ * port, so that the active ftp data connection from the server can reach
+ * the client.
+ *
+ * *diff is set to 0 on every return path (incoming packets are never
+ * resized here), including the IPv6 no-op path.
+ *
+ * Returns 0 if the skb cannot be made writable or the data connection
+ * entry cannot be created, 1 in all other cases.
+ */
+static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
+                       struct sk_buff *skb, int *diff)
+{
+       struct iphdr *iph;
+       struct tcphdr *th;
+       char *data, *data_start, *data_limit;
+       char *start, *end;
+       union nf_inet_addr to;
+       __be16 port;
+       struct ip_vs_conn *n_cp;
+
+       /*
+        * No diff required for incoming packets.  Set it before any early
+        * return so that the out-parameter is defined on every path.
+        */
+       *diff = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+       /* This application helper doesn't work with IPv6 yet,
+        * so turn this into a no-op for IPv6 packets
+        */
+       if (cp->af == AF_INET6)
+               return 1;
+#endif
+
+       /* Only useful for established sessions */
+       if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+               return 1;
+
+       /* Linear packets are much easier to deal with. */
+       if (!skb_make_writable(skb, skb->len))
+               return 0;
+
+       /*
+        * Detecting whether it is passive
+        */
+       iph = ip_hdr(skb);
+       th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+       /* Since there may be OPTIONS in the TCP packet and the HLEN is
+          the length of the header in 32-bit multiples, it is accurate
+          to calculate data address by th+HLEN*4 */
+       data = data_start = (char *)th + (th->doff << 2);
+       data_limit = skb_tail_pointer(skb);
+
+       /* scan the whole payload for a "PASV\r\n" command */
+       while (data <= data_limit - 6) {
+               if (strnicmp(data, "PASV\r\n", 6) == 0) {
+                       /* Passive mode on */
+                       IP_VS_DBG(7, "got PASV at %td of %td\n",
+                                 data - data_start,
+                                 data_limit - data_start);
+                       cp->app_data = &ip_vs_ftp_pasv;
+                       return 1;
+               }
+               data++;
+       }
+
+       /*
+        * To support virtual FTP server, the scenario is as follows:
+        *       FTP client ----> Load Balancer ----> FTP server
+        * First detect the port number in the application data,
+        * then create a new connection entry for the coming data
+        * connection.
+        */
+       if (ip_vs_ftp_get_addrport(data_start, data_limit,
+                                  CLIENT_STRING, sizeof(CLIENT_STRING)-1,
+                                  '\r', &to.ip, &port,
+                                  &start, &end) != 1)
+               return 1;
+
+       IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n",
+                 NIPQUAD(to.ip), ntohs(port));
+
+       /* Passive mode off */
+       cp->app_data = NULL;
+
+       /*
+        * Now update or create a connection entry for it
+        */
+       IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
+                 ip_vs_proto_name(iph->protocol),
+                 NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0);
+
+       /* the data connection uses (control port - 1) on the virtual side */
+       n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
+                                &to, port,
+                                &cp->vaddr, htons(ntohs(cp->vport)-1));
+       if (!n_cp) {
+               n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
+                                     &to, port,
+                                     &cp->vaddr, htons(ntohs(cp->vport)-1),
+                                     &cp->daddr, htons(ntohs(cp->dport)-1),
+                                     0,
+                                     cp->dest);
+               if (!n_cp)
+                       return 0;
+
+               /* add its controller */
+               ip_vs_control_add(n_cp, cp);
+       }
+
+       /*
+        *      Move tunnel to listen state
+        */
+       ip_vs_tcp_conn_listen(n_cp);
+       ip_vs_conn_put(n_cp);
+
+       return 1;
+}
+
+
+/*
+ * Application helper descriptor for FTP: a TCP helper named "ftp" with
+ * connection init/done hooks and packet hooks for both directions.
+ * No bind/unbind hooks are provided.
+ */
+static struct ip_vs_app ip_vs_ftp = {
+       .name =         "ftp",
+       .type =         IP_VS_APP_TYPE_FTP,
+       .protocol =     IPPROTO_TCP,
+       .module =       THIS_MODULE,
+       .incs_list =    LIST_HEAD_INIT(ip_vs_ftp.incs_list),
+       .init_conn =    ip_vs_ftp_init_conn,
+       .done_conn =    ip_vs_ftp_done_conn,
+       .bind_conn =    NULL,
+       .unbind_conn =  NULL,
+       .pkt_out =      ip_vs_ftp_out,
+       .pkt_in =       ip_vs_ftp_in,
+};
+
+
+/*
+ *     ip_vs_ftp initialization: register the ftp helper, then one
+ *     incarnation per non-zero entry of ports[].  If an incarnation
+ *     fails to register, the helper is unregistered again and that
+ *     error is returned.
+ */
+static int __init ip_vs_ftp_init(void)
+{
+       struct ip_vs_app *app = &ip_vs_ftp;
+       int idx, err;
+
+       err = register_ip_vs_app(app);
+       if (err)
+               return err;
+
+       for (idx = 0; idx < IP_VS_APP_MAX_PORTS; idx++) {
+               /* unused slot */
+               if (ports[idx] == 0)
+                       continue;
+
+               err = register_ip_vs_app_inc(app, app->protocol, ports[idx]);
+               if (err) {
+                       /* roll back the helper registration */
+                       unregister_ip_vs_app(app);
+                       return err;
+               }
+
+               IP_VS_INFO("%s: loaded support on port[%d] = %d\n",
+                          app->name, idx, ports[idx]);
+       }
+
+       return 0;
+}
+
+
+/*
+ *     ip_vs_ftp finish: unregister the ftp application helper that
+ *     ip_vs_ftp_init() registered.
+ */
+static void __exit ip_vs_ftp_exit(void)
+{
+       unregister_ip_vs_app(&ip_vs_ftp);
+}
+
+
+module_init(ip_vs_ftp_init);
+module_exit(ip_vs_ftp_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c

new file mode 100644 (file)

index 0000000..6ecef35
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -0,0 +1,555 @@
+/*
+ * IPVS:        Locality-Based Least-Connection scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Martin Hamilton         :    fixed the terrible locking bugs
+ *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
+ *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
+ *     Wensong Zhang           :    added doing full expiration check to
+ *                                   collect stale entries of 24+ hours when
+ *                                   no partial expire check in a half hour
+ *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
+ *                                   to avoid the possible race between timer
+ *                                   handler and del_timer thread in SMP
+ *
+ */
+
+/*
+ * The lblc algorithm is as follows (pseudo code):
+ *
+ *       if cachenode[dest_ip] is null then
+ *               n, cachenode[dest_ip] <- {weighted least-conn node};
+ *       else
+ *               n <- cachenode[dest_ip];
+ *               if (n is dead) OR
+ *                  (n.conns>n.weight AND
+ *                   there is a node m with m.conns<m.weight/2) then
+ *                 n, cachenode[dest_ip] <- {weighted least-conn node};
+ *
+ *       return n;
+ *
+ * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
+ * me to write this module.
+ */
+
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/jiffies.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *    It is for garbage collection of stale IPVS lblc entries,
+ *    when the table is full: the periodic timer is re-armed every
+ *    CHECK_EXPIRE_INTERVAL (60 seconds) and the partial check frees
+ *    entries not used for ENTRY_TIMEOUT (6 minutes).
+ */
+#define CHECK_EXPIRE_INTERVAL   (60*HZ)
+#define ENTRY_TIMEOUT           (6*60*HZ)
+
+/*
+ *    It is for full expiration check.
+ *    When there is no partial expiration check (garbage collection)
+ *    in a half hour, do a full expiration check to collect stale
+ *    entries that haven't been touched for a day.
+ *    The threshold is kept in jiffies and is exported through the
+ *    sysctl table below.
+ */
+#define COUNT_FOR_FULL_EXPIRATION   30
+static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
+
+
+/*
+ *     for IPVS lblc entry hash table
+ *     (2^IP_VS_LBLC_TAB_BITS buckets; 10 bits, i.e. 1024 buckets, unless
+ *     CONFIG_IP_VS_LBLC_TAB_BITS is already defined)
+ */
+#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
+#define CONFIG_IP_VS_LBLC_TAB_BITS      10
+#endif
+#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
+#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
+#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
+
+
+/*
+ *      IPVS lblc entry represents an association between destination
+ *      IP address and its destination server
+ */
+struct ip_vs_lblc_entry {
+       struct list_head        list;           /* link in a hash bucket */
+       __be32                  addr;           /* destination IP address */
+       struct ip_vs_dest       *dest;          /* real server (cache), refcnt held */
+       unsigned long           lastuse;        /* last used time (jiffies) */
+};
+
+
+/*
+ *      IPVS lblc hash table
+ */
+struct ip_vs_lblc_table {
+       struct list_head        bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
+       atomic_t                entries;        /* number of entries */
+       int                     max_size;       /* entry count above which the timer collects */
+       struct timer_list       periodic_timer; /* collect stale entries */
+       int                     rover;          /* last bucket visited by an expire check */
+       int                     counter;        /* timer runs counted towards a full check */
+};
+
+
+/*
+ *      IPVS LBLC sysctl table
+ *
+ *      Exposes sysctl_ip_vs_lblc_expiration as "lblc_expiration"
+ *      (mode 0644).  The variable holds jiffies; access goes through
+ *      proc_dointvec_jiffies.  The table ends with an empty entry.
+ */
+
+static ctl_table vs_vars_table[] = {
+       {
+               .procname       = "lblc_expiration",
+               .data           = &sysctl_ip_vs_lblc_expiration,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+       { .ctl_name = 0 }
+};
+
+static struct ctl_table_header * sysctl_header;
+
+/*
+ *     Unlink an entry from its hash bucket, drop its reference on the
+ *     cached destination and free the entry.  The table's entry counter
+ *     is not touched here; the callers in this file decrement
+ *     tbl->entries themselves.
+ */
+static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
+{
+       list_del(&en->list);
+       /*
+        * We don't kfree dest because it is referred to either by its
+        * service or the trash dest list.
+        */
+       atomic_dec(&en->dest->refcnt);
+       kfree(en);
+}
+
+
+/*
+ *     Map a destination address to a bucket index of the LBLC table:
+ *     multiplicative hash of the host-order address, masked to the
+ *     table size.
+ */
+static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
+{
+       unsigned long scrambled = ntohl(addr) * 2654435761UL;
+
+       return scrambled & IP_VS_LBLC_TAB_MASK;
+}
+
+
+/*
+ *     Hash an entry in the ip_vs_lblc_table: add it to the bucket
+ *     selected by its address and bump the table's entry counter.
+ *     Returns nothing.
+ */
+static void
+ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
+{
+       unsigned hash = ip_vs_lblc_hashkey(en->addr);
+
+       list_add(&en->list, &tbl->bucket[hash]);
+       atomic_inc(&tbl->entries);
+}
+
+
+/*
+ *  Look up the ip_vs_lblc_entry for a destination address.  Returns the
+ *  entry, or NULL if the address is not cached.  Called under read lock.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
+{
+       struct list_head *head = &tbl->bucket[ip_vs_lblc_hashkey(addr)];
+       struct ip_vs_lblc_entry *cur;
+
+       list_for_each_entry(cur, head, list) {
+               if (cur->addr != addr)
+                       continue;
+               return cur;
+       }
+
+       return NULL;
+}
+
+
+/*
+ * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
+ * address to a server. Called under write lock.
+ *
+ * An existing entry is re-pointed at dest (moving the reference count from
+ * the old destination to the new one); otherwise a new entry is allocated
+ * and hashed.  Returns the entry, or NULL if allocation fails.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr,
+              struct ip_vs_dest *dest)
+{
+       struct ip_vs_lblc_entry *entry = ip_vs_lblc_get(tbl, daddr);
+
+       if (entry) {
+               /* already cached: switch destination if it changed */
+               if (entry->dest != dest) {
+                       atomic_dec(&entry->dest->refcnt);
+                       atomic_inc(&dest->refcnt);
+                       entry->dest = dest;
+               }
+               return entry;
+       }
+
+       entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+       if (!entry) {
+               IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
+               return NULL;
+       }
+
+       entry->addr = daddr;
+       entry->lastuse = jiffies;
+
+       /* the entry holds a reference on its destination */
+       atomic_inc(&dest->refcnt);
+       entry->dest = dest;
+
+       ip_vs_lblc_hash(tbl, entry);
+
+       return entry;
+}
+
+
+/*
+ *      Flush all the entries of the specified table.
+ *      Every bucket is emptied and tbl->entries is decremented once per
+ *      freed entry.  No lock is taken here.
+ */
+static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
+{
+       struct ip_vs_lblc_entry *en, *nxt;
+       int i;
+
+       for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
+               /* _safe variant: en is freed inside the loop body */
+               list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+                       ip_vs_lblc_free(en);
+                       atomic_dec(&tbl->entries);
+               }
+       }
+}
+
+
+/*
+ *      Full expiration check: visit every bucket once, starting after
+ *      tbl->rover, and free the entries whose lastuse is at least
+ *      sysctl_ip_vs_lblc_expiration jiffies in the past.
+ *      svc->sched_lock is write-locked per bucket, not across the walk.
+ */
+static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
+{
+       struct ip_vs_lblc_table *tbl = svc->sched_data;
+       struct ip_vs_lblc_entry *en, *nxt;
+       unsigned long now = jiffies;
+       int i, j;
+
+       for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+               /* advance with wrap-around */
+               j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+               write_lock(&svc->sched_lock);
+               list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+                       /* not expired yet: keep it */
+                       if (time_before(now,
+                                       en->lastuse + sysctl_ip_vs_lblc_expiration))
+                               continue;
+
+                       ip_vs_lblc_free(en);
+                       atomic_dec(&tbl->entries);
+               }
+               write_unlock(&svc->sched_lock);
+       }
+       /* remember where this walk stopped */
+       tbl->rover = j;
+}
+
+
+/*
+ *      Periodical timer handler for IPVS lblc table
+ *      It is used to collect stale entries when the number of entries
+ *      exceeds the maximum size of the table.
+ *
+ *      Fixme: we probably need more complicated algorithm to collect
+ *             entries that have not been used for a long time even
+ *             if the number of entries doesn't exceed the maximum size
+ *             of the table.
+ *      The full expiration check is for this purpose now.
+ *
+ *      data is the struct ip_vs_service pointer passed to setup_timer().
+ *      The handler re-arms the timer CHECK_EXPIRE_INTERVAL jiffies ahead
+ *      on every path before returning.
+ */
+static void ip_vs_lblc_check_expire(unsigned long data)
+{
+       struct ip_vs_service *svc = (struct ip_vs_service *) data;
+       struct ip_vs_lblc_table *tbl = svc->sched_data;
+       unsigned long now = jiffies;
+       int goal;
+       int i, j;
+       struct ip_vs_lblc_entry *en, *nxt;
+
+       if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+               /* do full expiration check */
+               ip_vs_lblc_full_check(svc);
+               tbl->counter = 1;
+               goto out;
+       }
+
+       /* table within limits: nothing to collect, just count this run */
+       if (atomic_read(&tbl->entries) <= tbl->max_size) {
+               tbl->counter++;
+               goto out;
+       }
+
+       /* aim to free 4/3 of the excess, capped at half of max_size */
+       goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+       if (goal > tbl->max_size/2)
+               goal = tbl->max_size/2;
+
+       for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+               j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+               write_lock(&svc->sched_lock);
+               list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+                       /* used within ENTRY_TIMEOUT: keep it */
+                       if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
+                               continue;
+
+                       ip_vs_lblc_free(en);
+                       atomic_dec(&tbl->entries);
+                       goal--;
+               }
+               write_unlock(&svc->sched_lock);
+               /* goal is only tested between buckets */
+               if (goal <= 0)
+                       break;
+       }
+       tbl->rover = j;
+
+  out:
+       mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+
+/*
+ *      Scheduler init hook for a service: allocate the LBLC table, store
+ *      it in svc->sched_data, initialize all of its fields and start the
+ *      periodic garbage collection timer.
+ *      Returns 0 on success or -ENOMEM if the table cannot be allocated.
+ */
+static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
+{
+       int i;
+       struct ip_vs_lblc_table *tbl;
+
+       /*
+        *    Allocate the ip_vs_lblc_table for this service
+        */
+       tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
+       if (tbl == NULL) {
+               IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
+               return -ENOMEM;
+       }
+       svc->sched_data = tbl;
+       IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
+                 "current service\n", sizeof(*tbl));
+
+       /*
+        *    Initialize the hash buckets
+        */
+       for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
+               INIT_LIST_HEAD(&tbl->bucket[i]);
+       }
+       /*
+        *    kmalloc() does not zero the table, so the entry counter must
+        *    be set explicitly before anything reads or updates it.
+        */
+       atomic_set(&tbl->entries, 0);
+       tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
+       tbl->rover = 0;
+       tbl->counter = 1;
+
+       /*
+        *    Hook periodic timer for garbage collection
+        */
+       setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
+                       (unsigned long)svc);
+       mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+
+       return 0;
+}
+
+
+/*
+ *      Scheduler teardown hook for a service: stop the periodic timer,
+ *      free every cached entry, then free the table.  Always returns 0.
+ */
+static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+{
+       struct ip_vs_lblc_table *tbl = svc->sched_data;
+
+       /* remove periodic timer */
+       del_timer_sync(&tbl->periodic_timer);
+
+       /* got to clean up table entries here */
+       ip_vs_lblc_flush(tbl);
+
+       /* release the table itself */
+       kfree(tbl);
+       /* sizeof(*tbl) is a compile-time value; tbl is not dereferenced */
+       IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
+                 sizeof(*tbl));
+
+       return 0;
+}
+
+
+/*
+ *      Weighted least-connection selection over svc->destinations.
+ *      Destinations flagged IP_VS_DEST_F_OVERLOAD are skipped.  Returns
+ *      NULL if no non-overloaded destination with weight > 0 exists.
+ *      The iph argument is not used by this function.
+ */
+static inline struct ip_vs_dest *
+__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+       struct ip_vs_dest *dest, *least;
+       int loh, doh;
+
+       /*
+        * We think the overhead of processing active connections is fifty
+        * times higher than that of inactive connections in average. (This
+        * fifty times might not be accurate, we will change it later.) We
+        * use the following formula to estimate the overhead:
+        *                dest->activeconns*50 + dest->inactconns
+        * and the load:
+        *                (dest overhead) / dest->weight
+        *
+        * Remember -- no floats in kernel mode!!!
+        * The comparison of h1*w2 > h2*w1 is equivalent to that of
+        *                h1/w1 > h2/w2
+        * if every weight is larger than zero.
+        *
+        * The server with weight=0 is quiesced and will not receive any
+        * new connection.
+        */
+       /* first pass: find the first usable destination as the baseline */
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+                       continue;
+               if (atomic_read(&dest->weight) > 0) {
+                       least = dest;
+                       loh = atomic_read(&least->activeconns) * 50
+                               + atomic_read(&least->inactconns);
+                       goto nextstage;
+               }
+       }
+       return NULL;
+
+       /*
+        *    Find the destination with the least load.
+        *    The walk continues after the baseline found above.
+        */
+  nextstage:
+       list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+                       continue;
+
+               doh = atomic_read(&dest->activeconns) * 50
+                       + atomic_read(&dest->inactconns);
+               if (loh * atomic_read(&dest->weight) >
+                   doh * atomic_read(&least->weight)) {
+                       least = dest;
+                       loh = doh;
+               }
+       }
+
+       IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
+                 "activeconns %d refcnt %d weight %d overhead %d\n",
+                 NIPQUAD(least->addr.ip), ntohs(least->port),
+                 atomic_read(&least->activeconns),
+                 atomic_read(&least->refcnt),
+                 atomic_read(&least->weight), loh);
+
+       return least;
+}
+
+
+/*
+ *   Returns 1 when dest has more active connections than its weight AND
+ *   some destination of the service has fewer than weight/2 active
+ *   connections; returns 0 otherwise.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+       struct ip_vs_dest *d;
+
+       /* not above its own weight: cannot be overloaded */
+       if (atomic_read(&dest->activeconns) <= atomic_read(&dest->weight))
+               return 0;
+
+       /* look for any destination running below half of its weight */
+       list_for_each_entry(d, &svc->destinations, n_list) {
+               if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight))
+                       return 1;
+       }
+
+       return 0;
+}
+
+
+/*
+ *    Locality-Based (weighted) Least-Connection scheduling
+ *
+ *    Uses the cached destination for the packet's destination address
+ *    when it is available, has a weight and is not overloaded; otherwise
+ *    picks a destination with __ip_vs_lblc_schedule() and caches it.
+ *    Returns the chosen destination, or NULL if none is available.
+ */
+static struct ip_vs_dest *
+ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+       struct ip_vs_lblc_table *tbl = svc->sched_data;
+       struct iphdr *iph = ip_hdr(skb);
+       struct ip_vs_dest *dest = NULL;
+       struct ip_vs_lblc_entry *en;
+
+       IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
+
+       /* First look in our cache */
+       read_lock(&svc->sched_lock);
+       en = ip_vs_lblc_get(tbl, iph->daddr);
+       if (en) {
+               /* We only hold a read lock, but this is atomic */
+               en->lastuse = jiffies;
+
+               /*
+                * If the destination is not available, i.e. it's in the trash,
+                * we must ignore it, as it may be removed from under our feet,
+                * if someone drops our reference count. Our caller only makes
+                * sure that destinations, that are not in the trash, are not
+                * moved to the trash, while we are scheduling. But anyone can
+                * free up entries from the trash at any time.
+                */
+
+               if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
+                       dest = en->dest;
+       }
+       read_unlock(&svc->sched_lock);
+
+       /* If the destination has a weight and is not overloaded, use it */
+       if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+               goto out;
+
+       /* No cache entry or it is invalid, time to schedule */
+       dest = __ip_vs_lblc_schedule(svc, iph);
+       if (!dest) {
+               IP_VS_DBG(1, "no destination available\n");
+               return NULL;
+       }
+
+       /* If we fail to create a cache entry, we'll just use the valid dest */
+       write_lock(&svc->sched_lock);
+       ip_vs_lblc_new(tbl, iph->daddr, dest);
+       write_unlock(&svc->sched_lock);
+
+out:
+       IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
+                 "--> server %u.%u.%u.%u:%d\n",
+                 NIPQUAD(iph->daddr),
+                 NIPQUAD(dest->addr.ip),
+                 ntohs(dest->port));
+
+       return dest;
+}
+
+
+/*
+ *      IPVS LBLC Scheduler structure
+ *
+ *      Registered under the name "lblc" by ip_vs_lblc_init().  When
+ *      CONFIG_IP_VS_IPV6 is set, the scheduler declares no IPv6 support.
+ */
+static struct ip_vs_scheduler ip_vs_lblc_scheduler =
+{
+       .name =                 "lblc",
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       .n_list =               LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+       .supports_ipv6 =        0,
+#endif
+       .init_service =         ip_vs_lblc_init_svc,
+       .done_service =         ip_vs_lblc_done_svc,
+       .schedule =             ip_vs_lblc_schedule,
+};
+
+
+/*
+ *      Module init: register the sysctl table, then the scheduler.  If
+ *      the scheduler cannot be registered, the sysctl table is
+ *      unregistered again and the error is returned.  The result of
+ *      register_sysctl_paths() itself is not checked here.
+ */
+static int __init ip_vs_lblc_init(void)
+{
+       int ret;
+
+       sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+       ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+       if (ret)
+               unregister_sysctl_table(sysctl_header);
+       return ret;
+}
+
+
+/*
+ *      Module exit: unregister the sysctl table and the scheduler that
+ *      ip_vs_lblc_init() registered.
+ */
+static void __exit ip_vs_lblc_cleanup(void)
+{
+       unregister_sysctl_table(sysctl_header);
+       unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+}
+
+
+module_init(ip_vs_lblc_init);
+module_exit(ip_vs_lblc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c

new file mode 100644 (file)

index 0000000..1f75ea8
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -0,0 +1,755 @@
+/*
+ * IPVS:        Locality-Based Least-Connection with Replication scheduler
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Julian Anastasov        :    Added the missing (dest->weight>0)
+ *                                  condition in the ip_vs_dest_set_max.
+ *
+ */
+
+/*
+ * The lblc/r algorithm is as follows (pseudo code):
+ *
+ *       if serverSet[dest_ip] is null then
+ *               n, serverSet[dest_ip] <- {weighted least-conn node};
+ *       else
+ *               n <- {least-conn (alive) node in serverSet[dest_ip]};
+ *               if (n is null) OR
+ *                  (n.conns>n.weight AND
+ *                   there is a node m with m.conns<m.weight/2) then
+ *                   n <- {weighted least-conn node};
+ *                   add n to serverSet[dest_ip];
+ *               if |serverSet[dest_ip]| > 1 AND
+ *                   now - serverSet[dest_ip].lastMod > T then
+ *                   m <- {most conn node in serverSet[dest_ip]};
+ *                   remove m from serverSet[dest_ip];
+ *       if serverSet[dest_ip] changed then
+ *               serverSet[dest_ip].lastMod <- now;
+ *
+ *       return n;
+ *
+ */
+
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/jiffies.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <net/net_namespace.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *    It is for garbage collection of stale IPVS lblcr entries,
+ *    when the table is full.
+ *    CHECK_EXPIRE_INTERVAL is 60 seconds and ENTRY_TIMEOUT is 6 minutes,
+ *    both expressed in jiffies.
+ */
+#define CHECK_EXPIRE_INTERVAL   (60*HZ)
+#define ENTRY_TIMEOUT           (6*60*HZ)
+
+/*
+ *    It is for full expiration check.
+ *    When there is no partial expiration check (garbage collection)
+ *    in a half hour, do a full expiration check to collect stale
+ *    entries that haven't been touched for a day.
+ *    The threshold is kept in jiffies and is exported through the
+ *    sysctl table below.
+ */
+#define COUNT_FOR_FULL_EXPIRATION   30
+static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
+
+
+/*
+ *     for IPVS lblcr entry hash table
+ *     (2^IP_VS_LBLCR_TAB_BITS buckets; 10 bits, i.e. 1024 buckets, unless
+ *     CONFIG_IP_VS_LBLCR_TAB_BITS is already defined)
+ */
+#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
+#define CONFIG_IP_VS_LBLCR_TAB_BITS      10
+#endif
+#define IP_VS_LBLCR_TAB_BITS     CONFIG_IP_VS_LBLCR_TAB_BITS
+#define IP_VS_LBLCR_TAB_SIZE     (1 << IP_VS_LBLCR_TAB_BITS)
+#define IP_VS_LBLCR_TAB_MASK     (IP_VS_LBLCR_TAB_SIZE - 1)
+
+
+/*
+ *      IPVS destination set structure and operations
+ *
+ *      A set is a singly linked list of ip_vs_dest_list elements; each
+ *      element holds a reference count on its destination.
+ */
+struct ip_vs_dest_list {
+       struct ip_vs_dest_list  *next;          /* list link */
+       struct ip_vs_dest       *dest;          /* destination server */
+};
+
+struct ip_vs_dest_set {
+       atomic_t                size;           /* set size */
+       unsigned long           lastmod;        /* last modified time (jiffies) */
+       struct ip_vs_dest_list  *list;          /* destination list */
+       rwlock_t                lock;           /* lock for this list */
+};
+
+
+/*
+ *      Add dest to the set unless it is already a member.  Returns the
+ *      new list element, or NULL if dest was already in the set or the
+ *      element could not be allocated.  On success the set's size and
+ *      lastmod are updated and a reference on dest is taken.
+ */
+static struct ip_vs_dest_list *
+ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+       struct ip_vs_dest_list *cur, *node;
+
+       /* refuse duplicates */
+       cur = set->list;
+       while (cur != NULL) {
+               if (cur->dest == dest)
+                       return NULL;
+               cur = cur->next;
+       }
+
+       node = kmalloc(sizeof(*node), GFP_ATOMIC);
+       if (!node) {
+               IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
+               return NULL;
+       }
+
+       /* the new element holds a reference on the destination */
+       node->dest = dest;
+       atomic_inc(&dest->refcnt);
+
+       /* push it onto the head of the list */
+       node->next = set->list;
+       set->list = node;
+       atomic_inc(&set->size);
+       set->lastmod = jiffies;
+
+       return node;
+}
+
+/*
+ *      Remove dest from the set if it is a member: unlink its element,
+ *      update size and lastmod, drop the reference on dest and free the
+ *      element.  Does nothing if dest is not in the set.
+ */
+static void
+ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+       struct ip_vs_dest_list *e, **ep;
+
+       /* ep points at the link that refers to e, so unlinking is *ep = next */
+       for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
+               if (e->dest == dest) {
+                       /* HIT */
+                       *ep = e->next;
+                       atomic_dec(&set->size);
+                       set->lastmod = jiffies;
+                       atomic_dec(&e->dest->refcnt);
+                       kfree(e);
+                       break;
+               }
+               ep = &e->next;
+       }
+}
+
+/*
+ *      Remove every element from the set under the set's write lock,
+ *      dropping the reference each element holds on its destination.
+ *      set->size and set->lastmod are not updated here.
+ */
+static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
+{
+       struct ip_vs_dest_list *e, **ep;
+
+       write_lock(&set->lock);
+       /* ep stays at the list head; each pass unlinks the first element */
+       for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
+               *ep = e->next;
+               /*
+                * We don't kfree dest because it is referred to either
+                * by its service or by the trash dest list.
+                */
+               atomic_dec(&e->dest->refcnt);
+               kfree(e);
+       }
+       write_unlock(&set->lock);
+}
+
+/*
+ * get weighted least-connection node in the destination set
+ *
+ * Overhead is activeconns*50 + inactconns, compared relative to weight.
+ * Destinations flagged IP_VS_DEST_F_OVERLOAD or lacking
+ * IP_VS_DEST_F_AVAILABLE are never selected.  Returns NULL if set is
+ * NULL or no element qualifies as a starting candidate.
+ */
+static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
+{
+       register struct ip_vs_dest_list *e;
+       struct ip_vs_dest *dest, *least;
+       int loh, doh;
+
+       if (set == NULL)
+               return NULL;
+
+       /* select the first destination server, whose weight > 0 */
+       for (e=set->list; e!=NULL; e=e->next) {
+               least = e->dest;
+               if (least->flags & IP_VS_DEST_F_OVERLOAD)
+                       continue;
+
+               if ((atomic_read(&least->weight) > 0)
+                   && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
+                       loh = atomic_read(&least->activeconns) * 50
+                               + atomic_read(&least->inactconns);
+                       goto nextstage;
+               }
+       }
+       return NULL;
+
+       /* find the destination with the weighted least load */
+  nextstage:
+       /* continue after the starting candidate found above */
+       for (e=e->next; e!=NULL; e=e->next) {
+               dest = e->dest;
+               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+                       continue;
+
+               doh = atomic_read(&dest->activeconns) * 50
+                       + atomic_read(&dest->inactconns);
+               /* loh/lw > doh/dw, cross-multiplied to avoid division */
+               if ((loh * atomic_read(&dest->weight) >
+                    doh * atomic_read(&least->weight))
+                   && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+                       least = dest;
+                       loh = doh;
+               }
+       }
+
+       IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
+                 "activeconns %d refcnt %d weight %d overhead %d\n",
+                 NIPQUAD(least->addr.ip), ntohs(least->port),
+                 atomic_read(&least->activeconns),
+                 atomic_read(&least->refcnt),
+                 atomic_read(&least->weight), loh);
+       return least;
+}
+
+
+/*
+ * get weighted most-connection node in the destination set
+ *
+ * Overhead is activeconns*50 + inactconns, compared relative to weight.
+ * Only destinations with weight > 0 are considered; the OVERLOAD and
+ * AVAILABLE flags are not examined here.  Returns NULL if set is NULL
+ * or no element has weight > 0.
+ */
+static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
+{
+       register struct ip_vs_dest_list *e;
+       struct ip_vs_dest *dest, *most;
+       int moh, doh;
+
+       if (set == NULL)
+               return NULL;
+
+       /* select the first destination server, whose weight > 0 */
+       for (e=set->list; e!=NULL; e=e->next) {
+               most = e->dest;
+               if (atomic_read(&most->weight) > 0) {
+                       moh = atomic_read(&most->activeconns) * 50
+                               + atomic_read(&most->inactconns);
+                       goto nextstage;
+               }
+       }
+       return NULL;
+
+       /* find the destination with the weighted most load */
+  nextstage:
+       /* continue after the starting candidate found above */
+       for (e=e->next; e!=NULL; e=e->next) {
+               dest = e->dest;
+               doh = atomic_read(&dest->activeconns) * 50
+                       + atomic_read(&dest->inactconns);
+               /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
+               if ((moh * atomic_read(&dest->weight) <
+                    doh * atomic_read(&most->weight))
+                   && (atomic_read(&dest->weight) > 0)) {
+                       most = dest;
+                       moh = doh;
+               }
+       }
+
+       IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
+                 "activeconns %d refcnt %d weight %d overhead %d\n",
+                 NIPQUAD(most->addr.ip), ntohs(most->port),
+                 atomic_read(&most->activeconns),
+                 atomic_read(&most->refcnt),
+                 atomic_read(&most->weight), moh);
+       return most;
+}
+
+
+/*
+ *      IPVS lblcr entry represents an association between destination
+ *      IP address and its destination server set
+ */
+struct ip_vs_lblcr_entry {
+       struct list_head        list;           /* link in a hash bucket */
+       __be32                   addr;           /* destination IP address */
+       struct ip_vs_dest_set   set;            /* destination server set */
+       unsigned long           lastuse;        /* last used time */
+};
+
+
+/*
+ *      IPVS lblcr hash table
+ *
+ *      Entries are chained in bucket[] at the index computed by
+ *      ip_vs_lblcr_hashkey(); entries counts the hashed entries.
+ */
+struct ip_vs_lblcr_table {
+       struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
+       atomic_t                entries;        /* number of entries */
+       int                     max_size;       /* maximum size of entries */
+       struct timer_list       periodic_timer; /* collect stale entries */
+       int                     rover;          /* rover for expire check */
+       int                     counter;        /* counter for no expire */
+};
+
+
+/*
+ *      IPVS LBLCR sysctl table: publishes sysctl_ip_vs_lblcr_expiration
+ *      as "lblcr_expiration", handled by proc_dointvec_jiffies.
+ */
+static ctl_table vs_vars_table[] = {
+       {
+               .procname       = "lblcr_expiration",
+               .data           = &sysctl_ip_vs_lblcr_expiration,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_jiffies,
+       },
+       { .ctl_name = 0 }       /* terminating entry */
+};
+
+static struct ctl_table_header * sysctl_header; /* set in ip_vs_lblcr_init() */
+
+static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
+{
+       list_del(&en->list);                    /* unlink from its bucket */
+       ip_vs_dest_set_eraseall(&en->set);      /* empty the entry's dest set */
+       kfree(en);              /* tbl->entries is left to the caller */
+}
+
+
+/* Multiplicative hash of the host-order address, masked to a bucket index */
+static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
+{
+       unsigned long host_addr = ntohl(addr);
+
+       return (host_addr * 2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+}
+
+
+/*
+ *     Link an entry into its hash bucket of the ip_vs_lblcr_table and
+ *     bump the table's entry counter. Nothing is returned.
+ */
+static void
+ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
+{
+       struct list_head *bucket = &tbl->bucket[ip_vs_lblcr_hashkey(en->addr)];
+
+       list_add(&en->list, bucket);
+       atomic_inc(&tbl->entries);
+}
+
+
+/*
+ *  Look up the cache entry whose address equals addr; NULL when the bucket
+ *  holds no match. The caller holds svc->sched_lock (read or write).
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
+{
+       struct list_head *chain = &tbl->bucket[ip_vs_lblcr_hashkey(addr)];
+       struct ip_vs_lblcr_entry *cur;
+
+       list_for_each_entry(cur, chain, list) {
+               if (cur->addr == addr)
+                       return cur;
+       }
+       return NULL;
+}
+
+
+/*
+ * Create or update an ip_vs_lblcr_entry, which maps a destination IP address
+ * to a set of servers. Called under write lock. Returns NULL if kmalloc fails.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl,  __be32 daddr,
+               struct ip_vs_dest *dest)
+{
+       struct ip_vs_lblcr_entry *en;
+
+       en = ip_vs_lblcr_get(tbl, daddr);       /* reuse an existing entry */
+       if (!en) {
+               en = kmalloc(sizeof(*en), GFP_ATOMIC);
+               if (!en) {
+                       IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
+                       return NULL;
+               }
+
+               en->addr = daddr;
+               en->lastuse = jiffies;
+
+               /* initialize its dest set */
+               atomic_set(&(en->set.size), 0);
+               en->set.list = NULL;
+               rwlock_init(&en->set.lock); /* set.lastmod left to insert()? confirm */
+
+               ip_vs_lblcr_hash(tbl, en);      /* publish in the table */
+       }
+
+       write_lock(&en->set.lock);
+       ip_vs_dest_set_insert(&en->set, dest);
+       write_unlock(&en->set.lock);
+
+       return en;
+}
+
+
+/*
+ *      Free every entry in every bucket of the table. The entries
+ *      counter is not adjusted here.
+ */
+static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
+{
+       struct ip_vs_lblcr_entry *cur, *tmp;
+       int b;
+
+       /* Runs unlocked: it is only used while the service is torn down. */
+       for (b = 0; b < IP_VS_LBLCR_TAB_SIZE; b++) {
+               list_for_each_entry_safe(cur, tmp, &tbl->bucket[b], list)
+                       ip_vs_lblcr_free(cur);
+       }
+}
+
+/* Scan all buckets and free entries idle for the sysctl expiration time. */
+static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
+{
+       struct ip_vs_lblcr_table *tbl = svc->sched_data;
+       unsigned long now = jiffies;
+       int i, j;
+       struct ip_vs_lblcr_entry *en, *nxt;
+
+       for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+               j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+               write_lock(&svc->sched_lock);   /* held per bucket only */
+               list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+                       if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
+                                      now))
+                               continue;       /* not expired yet */
+
+                       ip_vs_lblcr_free(en);
+                       atomic_dec(&tbl->entries);
+               }
+               write_unlock(&svc->sched_lock);
+       }
+       tbl->rover = j;
+}
+
+
+/*
+ *      Periodical timer handler for IPVS lblcr table
+ *      It is used to collect stale entries when the number of entries
+ *      exceeds the maximum size of the table.
+ *
+ *      Fixme: we probably need more complicated algorithm to collect
+ *             entries that have not been used for a long time even
+ *             if the number of entries doesn't exceed the maximum size
+ *             of the table.
+ *      The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblcr_check_expire(unsigned long data)
+{
+       struct ip_vs_service *svc = (struct ip_vs_service *) data;
+       struct ip_vs_lblcr_table *tbl = svc->sched_data;
+       unsigned long now = jiffies;
+       int goal;
+       int i, j;
+       struct ip_vs_lblcr_entry *en, *nxt;
+
+       if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+               /* do full expiration check */
+               ip_vs_lblcr_full_check(svc);
+               tbl->counter = 1;       /* restart the count to the next full check */
+               goto out;
+       }
+
+       if (atomic_read(&tbl->entries) <= tbl->max_size) {
+               tbl->counter++;         /* nothing to evict on this run */
+               goto out;
+       }
+
+       goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; /* 4/3 of excess */
+       if (goal > tbl->max_size/2)     /* capped at half of max_size */
+               goal = tbl->max_size/2;
+
+       for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+               j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+               write_lock(&svc->sched_lock);
+               list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+                       if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
+                               continue;       /* used recently: keep */
+
+                       ip_vs_lblcr_free(en);
+                       atomic_dec(&tbl->entries);
+                       goal--;
+               }
+               write_unlock(&svc->sched_lock);
+               if (goal <= 0)          /* checked once per bucket, not per entry */
+                       break;
+       }
+       tbl->rover = j;
+
+  out:         /* every path re-arms the timer */
+       mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
+{
+       int i;
+       struct ip_vs_lblcr_table *tbl;
+
+       /*
+        *    Allocate the ip_vs_lblcr_table for this service (-ENOMEM if not)
+        */
+       tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
+       if (tbl == NULL) {
+               IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
+               return -ENOMEM;
+       }
+       svc->sched_data = tbl;
+       IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
+                 "current service\n", sizeof(*tbl));
+
+       /*
+        *    Initialize buckets and counters; kmalloc() does not zero them
+        */
+       for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++)
+               INIT_LIST_HEAD(&tbl->bucket[i]);
+       atomic_set(&tbl->entries, 0);
+       tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
+       tbl->rover = 0;
+       tbl->counter = 1;
+
+       /*
+        *    Hook periodic timer for garbage collection
+        */
+       setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
+                       (unsigned long)svc);
+       mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+
+       return 0;
+}
+
+/* Service teardown: stop the timer first, then free the entries and table. */
+static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+{
+       struct ip_vs_lblcr_table *tbl = svc->sched_data;
+
+       /* remove periodic timer */
+       del_timer_sync(&tbl->periodic_timer);
+
+       /* got to clean up table entries here */
+       ip_vs_lblcr_flush(tbl);
+
+       /* release the table itself */
+       kfree(tbl);
+       IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
+                 sizeof(*tbl));        /* sizeof only, tbl is not read */
+
+       return 0;
+}
+
+/* Return the least-loaded usable dest of svc, or NULL; iph is not used. */
+static inline struct ip_vs_dest *
+__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+       struct ip_vs_dest *dest, *least;
+       int loh, doh;
+
+       /*
+        * We think the overhead of processing active connections is fifty
+        * times higher than that of inactive connections in average. (This
+        * fifty times might not be accurate, we will change it later.) We
+        * use the following formula to estimate the overhead:
+        *                dest->activeconns*50 + dest->inactconns
+        * and the load:
+        *                (dest overhead) / dest->weight
+        *
+        * Remember -- no floats in kernel mode!!!
+        * The comparison of h1*w2 > h2*w1 is equivalent to that of
+        *                h1/w1 > h2/w2
+        * if every weight is larger than zero.
+        *
+        * The server with weight=0 is quiesced and will not receive any
+        * new connection.
+        */
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+                       continue;
+
+               if (atomic_read(&dest->weight) > 0) {
+                       least = dest;   /* first candidate; later ones compared below */
+                       loh = atomic_read(&least->activeconns) * 50
+                               + atomic_read(&least->inactconns);
+                       goto nextstage;
+               }
+       }
+       return NULL;    /* no non-overloaded dest with weight > 0 */
+
+       /*
+        *    Find the destination with the least load.
+        */
+  nextstage:
+       list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+                       continue;
+
+               doh = atomic_read(&dest->activeconns) * 50
+                       + atomic_read(&dest->inactconns);
+               if (loh * atomic_read(&dest->weight) >
+                   doh * atomic_read(&least->weight)) {
+                       least = dest;
+                       loh = doh;
+               }
+       }
+
+       IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
+                 "activeconns %d refcnt %d weight %d overhead %d\n",
+                 NIPQUAD(least->addr.ip), ntohs(least->port),
+                 atomic_read(&least->activeconns),
+                 atomic_read(&least->refcnt),
+                 atomic_read(&least->weight), loh);
+
+       return least;
+}
+
+
+/*
+ *   Return 1 when dest has more active connections than its weight while
+ *   some server of svc runs below half of its own weight; otherwise 0.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+       struct ip_vs_dest *peer;
+
+       if (atomic_read(&dest->activeconns) <= atomic_read(&dest->weight))
+               return 0;
+
+       list_for_each_entry(peer, &svc->destinations, n_list) {
+               if (atomic_read(&peer->activeconns)*2
+                   < atomic_read(&peer->weight))
+                       return 1;
+       }
+       return 0;
+}
+
+
+/*
+ *    Locality-Based Least-Connection with Replication (LBLCR) scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+       struct ip_vs_lblcr_table *tbl = svc->sched_data;
+       struct iphdr *iph = ip_hdr(skb);
+       struct ip_vs_dest *dest = NULL;
+       struct ip_vs_lblcr_entry *en;
+
+       IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
+
+       /* First look in our cache */
+       read_lock(&svc->sched_lock);
+       en = ip_vs_lblcr_get(tbl, iph->daddr);
+       if (en) {
+               /* We only hold a read lock, but this is atomic */
+               en->lastuse = jiffies;
+
+               /* Get the least loaded destination */
+               read_lock(&en->set.lock);
+               dest = ip_vs_dest_set_min(&en->set);
+               read_unlock(&en->set.lock);
+
+               /* More than one destination + enough time passed by, cleanup */
+               if (atomic_read(&en->set.size) > 1 &&
+                               time_after(jiffies, en->set.lastmod +
+                               sysctl_ip_vs_lblcr_expiration)) {
+                       struct ip_vs_dest *m;
+
+                       write_lock(&en->set.lock);
+                       m = ip_vs_dest_set_max(&en->set);
+                       if (m)
+                               ip_vs_dest_set_erase(&en->set, m);
+                       write_unlock(&en->set.lock);
+               }
+
+               /* If the destination is not overloaded, use it */
+               if (dest && !is_overloaded(dest, svc)) {
+                       read_unlock(&svc->sched_lock);
+                       goto out;
+               }
+
+               /* The cache entry is invalid, time to schedule */
+               dest = __ip_vs_lblcr_schedule(svc, iph);
+               if (!dest) {
+                       IP_VS_DBG(1, "no destination available\n");
+                       read_unlock(&svc->sched_lock);
+                       return NULL;
+               }
+
+               /* Update our cache entry */
+               write_lock(&en->set.lock);
+               ip_vs_dest_set_insert(&en->set, dest);
+               write_unlock(&en->set.lock);
+       }
+       read_unlock(&svc->sched_lock);
+
+       if (dest)       /* an existing entry was refreshed above */
+               goto out;
+
+       /* No cache entry, time to schedule */
+       dest = __ip_vs_lblcr_schedule(svc, iph);
+       if (!dest) {
+               IP_VS_DBG(1, "no destination available\n");
+               return NULL;
+       }
+
+       /* If we fail to create a cache entry, we'll just use the valid dest */
+       write_lock(&svc->sched_lock);
+       ip_vs_lblcr_new(tbl, iph->daddr, dest);
+       write_unlock(&svc->sched_lock);
+
+out:
+       IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
+                 "--> server %u.%u.%u.%u:%d\n",
+                 NIPQUAD(iph->daddr),
+                 NIPQUAD(dest->addr.ip),
+                 ntohs(dest->port));
+
+       return dest;
+}
+
+
+/*
+ *      IPVS LBLCR Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
+{
+       .name =                 "lblcr",
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       .n_list =               LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+       .supports_ipv6 =        0,      /* the scheduler keys on iph->daddr (IPv4) */
+#endif
+       .init_service =         ip_vs_lblcr_init_svc,
+       .done_service =         ip_vs_lblcr_done_svc,
+       .schedule =             ip_vs_lblcr_schedule,
+};
+
+/* Module init. NOTE(review): a NULL sysctl_header goes unchecked here. */
+static int __init ip_vs_lblcr_init(void)
+{
+       int ret;
+
+       sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+       ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+       if (ret)        /* scheduler not registered: undo the sysctl table */
+               unregister_sysctl_table(sysctl_header);
+       return ret;
+}
+
+/* Module exit: drop the sysctl table, then unregister the scheduler. */
+static void __exit ip_vs_lblcr_cleanup(void)
+{
+       unregister_sysctl_table(sysctl_header);
+       unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+}
+
+
+module_init(ip_vs_lblcr_init);
+module_exit(ip_vs_lblcr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c

new file mode 100644 (file)

index 0000000..b69f808
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -0,0 +1,103 @@
+/*
+ * IPVS:        Least-Connection Scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Wensong Zhang            :     added the ip_vs_lc_update_svc
+ *     Wensong Zhang            :     added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline unsigned int
+ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
+{
+       /*
+        * Overhead estimate: an active connection is assumed to cost 256
+        * times as much as an inactive one (a rough guess that may change):
+        *                dest->activeconns*256 + dest->inactconns
+        */
+       int active = atomic_read(&dest->activeconns);
+       int inactive = atomic_read(&dest->inactconns);
+
+       return (active << 8) + inactive;
+}
+
+
+/*
+ *     Least Connection scheduling; returns NULL if no server is usable
+ */
+static struct ip_vs_dest *
+ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+       struct ip_vs_dest *dest, *least = NULL;
+       unsigned int loh = 0, doh;
+
+       IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
+
+       /*
+        * Simply select the server with the least overhead, i.e.
+        *        (activeconns<<8) + inactconns
+        * skipping overloaded servers and those whose weight is zero.
+        * A zero weight means the server is quiesced: its existing
+        * connections are still served, but it gets no new ones.
+        */
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+               if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
+                   atomic_read(&dest->weight) == 0)
+                       continue;
+               doh = ip_vs_lc_dest_overhead(dest);
+               if (!least || doh < loh) {
+                       least = dest;
+                       loh = doh;
+               }
+       }
+
+       if (least)
+               IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d "
+                             "inactconns %d\n",
+                             IP_VS_DBG_ADDR(svc->af, &least->addr),
+                             ntohs(least->port),
+                             atomic_read(&least->activeconns),
+                             atomic_read(&least->inactconns));
+
+       return least;
+}
+
+/* IPVS LC scheduler descriptor: only the schedule hook is provided. */
+static struct ip_vs_scheduler ip_vs_lc_scheduler = {
+       .name =                 "lc",
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       .n_list =               LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+       .supports_ipv6 =        1,
+#endif
+       .schedule =             ip_vs_lc_schedule,
+};
+
+/* Module init: register the "lc" scheduler and pass on the result. */
+static int __init ip_vs_lc_init(void)
+{
+       return register_ip_vs_scheduler(&ip_vs_lc_scheduler);
+}
+
+static void __exit ip_vs_lc_cleanup(void)
+{
+       unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); /* undo ip_vs_lc_init() */
+}
+
+module_init(ip_vs_lc_init);
+module_exit(ip_vs_lc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c

new file mode 100644 (file)

index 0000000..9a2d803
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -0,0 +1,138 @@
+/*
+ * IPVS:        Never Queue scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The NQ algorithm adopts a two-speed model. When there is an idle server
+ * available, the job will be sent to the idle server, instead of waiting
+ * for a fast one. When there is no idle server available, the job will be
+ * sent to the server that minimizes its expected delay (The Shortest
+ * Expected Delay scheduling algorithm).
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
+ *
+ * The difference between NQ and SED is that NQ can improve overall
+ * system utilization.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline unsigned int
+ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
+{
+       /* Cost is the number of active connections plus one;
+        * inactive connections are not part of the calculation. */
+       int active = atomic_read(&dest->activeconns);
+
+       return active + 1;
+}
+
+
+/*
+ *     Never Queue scheduling; returns NULL if no server is usable
+ */
+static struct ip_vs_dest *
+ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+       struct ip_vs_dest *dest, *least = NULL;
+       unsigned int loh = 0, doh;
+
+       IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
+
+       /*
+        * We calculate the load of each dest server as follows:
+        *      (server expected overhead) / dest->weight
+        *
+        * Remember -- no floats in kernel mode!!!
+        * The comparison of h1*w2 > h2*w1 is equivalent to that of
+        *                h1/w1 > h2/w2
+        * if every weight is larger than zero.
+        *
+        * The server with weight=0 is quiesced and will not receive any
+        * new connections.
+        */
+
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+
+               if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
+                   !atomic_read(&dest->weight))
+                       continue;
+
+               doh = ip_vs_nq_dest_overhead(dest);
+
+               /* return the server directly if it is idle */
+               if (atomic_read(&dest->activeconns) == 0) {
+                       least = dest;
+                       loh = doh;
+                       goto out;
+               }
+
+               if (!least ||
+                   (loh * atomic_read(&dest->weight) >
+                    doh * atomic_read(&least->weight))) {
+                       least = dest;
+                       loh = doh;
+               }
+       }
+
+       if (!least)
+               return NULL;
+
+  out:
+       IP_VS_DBG_BUF(6, "NQ: server %s:%u "
+                     "activeconns %d refcnt %d weight %d overhead %d\n",
+                     IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+                     atomic_read(&least->activeconns),
+                     atomic_read(&least->refcnt),
+                     atomic_read(&least->weight), loh);
+
+       return least;
+}
+
+/* IPVS NQ scheduler descriptor: only the schedule hook is provided. */
+static struct ip_vs_scheduler ip_vs_nq_scheduler =
+{
+       .name =                 "nq",
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       .n_list =               LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+       .supports_ipv6 =        1,
+#endif
+       .schedule =             ip_vs_nq_schedule,
+};
+
+/* Module init: register the "nq" scheduler and pass on the result. */
+static int __init ip_vs_nq_init(void)
+{
+       return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+static void __exit ip_vs_nq_cleanup(void)
+{
+       unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); /* undo ip_vs_nq_init() */
+}
+
+module_init(ip_vs_nq_init);
+module_exit(ip_vs_nq_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c

new file mode 100644 (file)

index 0000000..0791f9e
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -0,0 +1,288 @@
+/*
+ * ip_vs_proto.c: transport protocol load balancing support for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS protocols can only be registered/unregistered when the ipvs
+ * module is loaded/unloaded, so no lock is needed in accessing the
+ * ipvs protocol table.
+ */
+
+#define IP_VS_PROTO_TAB_SIZE           32      /* must be power of 2 */
+#define IP_VS_PROTO_HASH(proto)                ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
+/* Table of registered protocols; colliding entries chain through ->next. */
+static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
+
+
+/*
+ *     register an ipvs protocol: push it on its hash chain, then run ->init
+ */
+static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+       struct ip_vs_protocol **head;
+
+       head = &ip_vs_proto_table[IP_VS_PROTO_HASH(pp->protocol)];
+       pp->next = *head;
+       *head = pp;
+       if (pp->init)
+               pp->init(pp);
+
+       return 0;
+}
+
+
+/*
+ *     unregister an ipvs protocol: unlink it, run ->exit; -ESRCH if absent
+ */
+static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+       struct ip_vs_protocol **link;
+
+       link = &ip_vs_proto_table[IP_VS_PROTO_HASH(pp->protocol)];
+       while (*link && *link != pp)
+               link = &(*link)->next;
+       if (!*link)
+               return -ESRCH;
+
+       /* found: splice pp out of the chain before calling its exit hook */
+       *link = pp->next;
+       if (pp->exit)
+               pp->exit(pp);
+
+       return 0;
+}
+
+
+/*
+ *     get ip_vs_protocol object by its proto.
+ *     Returns NULL when no such protocol is registered.
+ */
+struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
+{
+       struct ip_vs_protocol *cur;
+
+       /* walk the collision chain until the protocol number matches */
+       cur = ip_vs_proto_table[IP_VS_PROTO_HASH(proto)];
+       while (cur && cur->protocol != proto)
+               cur = cur->next;
+
+       return cur;
+}
+
+
+/*
+ *     Invoke the ->timeout_change hook of every registered protocol
+ */
+void ip_vs_protocol_timeout_change(int flags)
+{
+       struct ip_vs_protocol *cur;
+       int slot;
+
+       for (slot = 0; slot < IP_VS_PROTO_TAB_SIZE; slot++)
+               for (cur = ip_vs_proto_table[slot]; cur; cur = cur->next) {
+                       if (!cur->timeout_change)
+                               continue;
+                       cur->timeout_change(cur, flags);
+               }
+}
+
+/* Return a kmemdup() copy (GFP_ATOMIC) of the first size bytes of table. */
+int *
+ip_vs_create_timeout_table(int *table, int size)
+{
+       return kmemdup(table, size, GFP_ATOMIC);
+}
+
+
+/*
+ *     Set timeout value (stored as to * HZ) for the state called name.
+ *     -EINVAL on a NULL/zero argument, -ENOENT if name is not in names.
+ */
+int
+ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
+{
+       int idx = 0;
+
+       if (table == NULL || name == NULL || to == 0)
+               return -EINVAL;
+
+       while (idx < num && strcmp(names[idx], name) != 0)
+               idx++;
+       if (idx >= num)
+               return -ENOENT;
+       table[idx] = to * HZ;
+       return 0;
+}
+
+/* Name of a protocol state; "NONE"/"ERR!" when the protocol has no namer. */
+const char * ip_vs_state_name(__u16 proto, int state)
+{
+       struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+
+       if (pp && pp->state_name)
+               return pp->state_name(state);
+       return (proto == IPPROTO_IP) ? "NONE" : "ERR!";
+}
+
+/* Log msg plus a one-line IPv4 summary (addresses, ports if readable). */
+static void
+ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
+                            const struct sk_buff *skb,
+                            int offset,
+                            const char *msg)
+{
+       char buf[128];
+       struct iphdr _iph, *ih;
+
+       ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+       if (ih == NULL)
+               sprintf(buf, "%s TRUNCATED", pp->name);
+       else if (ih->frag_off & htons(IP_OFFSET))       /* later fragment */
+               sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
+                       pp->name, NIPQUAD(ih->saddr),
+                       NIPQUAD(ih->daddr));
+       else {
+               __be16 _ports[2], *pptr;
+
+               pptr = skb_header_pointer(skb, offset + ih->ihl*4,
+                                         sizeof(_ports), _ports);
+               if (pptr == NULL)
+                       sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
+                               pp->name,
+                               NIPQUAD(ih->saddr),
+                               NIPQUAD(ih->daddr));
+               else
+                       sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
+                               pp->name,
+                               NIPQUAD(ih->saddr),
+                               ntohs(pptr[0]),
+                               NIPQUAD(ih->daddr),
+                               ntohs(pptr[1]));
+       }
+
+       printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static void
+ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
+                            const struct sk_buff *skb,
+                            int offset,
+                            const char *msg)
+{
+       char buf[192];
+       struct ipv6hdr _iph, *ih;
+
+       ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+       if (ih == NULL)
+               sprintf(buf, "%s TRUNCATED", pp->name);
+       else if (ih->nexthdr == IPPROTO_FRAGMENT) /* only the first nexthdr is checked */
+               sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag",
+                       pp->name, NIP6(ih->saddr),
+                       NIP6(ih->daddr));
+       else {
+               __be16 _ports[2], *pptr;
+
+               /* ports are read right after the fixed IPv6 header */
+               pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
+                                         sizeof(_ports), _ports);
+               if (pptr == NULL)
+                       sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT,
+                               pp->name,
+                               NIP6(ih->saddr),
+                               NIP6(ih->daddr));
+               else
+                       sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u",
+                               pp->name,
+                               NIP6(ih->saddr),
+                               ntohs(pptr[0]),
+                               NIP6(ih->daddr),
+                               ntohs(pptr[1]));
+       }
+
+       printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+#endif /* CONFIG_IP_VS_IPV6 */
+
+/* Use the IPv6 printer for ETH_P_IPV6 skbs, otherwise the IPv4 one. */
+void
+ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
+                         const struct sk_buff *skb,
+                         int offset,
+                         const char *msg)
+{
+#ifdef CONFIG_IP_VS_IPV6
+       if (skb->protocol == htons(ETH_P_IPV6))
+               ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
+       else
+#endif /* without IPv6 support every packet takes the v4 path */
+               ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
+}
+
+/* Register the configured protocols and log their names; always returns 0. */
+int __init ip_vs_protocol_init(void)
+{
+       char protocols[64];
+#define REGISTER_PROTOCOL(p)                   \
+       do {                                    \
+               register_ip_vs_protocol(p);     \
+               strcat(protocols, ", ");        \
+               strcat(protocols, (p)->name);   \
+       } while (0)
+
+       protocols[0] = '\0';            /* strcat() needs a terminated buffer */
+       protocols[2] = '\0';            /* &protocols[2] is "" if none registers */
+#ifdef CONFIG_IP_VS_PROTO_TCP
+       REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+       REGISTER_PROTOCOL(&ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+       REGISTER_PROTOCOL(&ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+       REGISTER_PROTOCOL(&ip_vs_protocol_esp);
+#endif
+       IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); /* skips ", " */
+
+       return 0;
+}
+
+
+void ip_vs_protocol_cleanup(void)
+{
+       struct ip_vs_protocol *head;
+       int slot;
+
+       /* unregistering unlinks the head, so loop until the bucket drains */
+       for (slot = 0; slot < IP_VS_PROTO_TAB_SIZE; slot++) {
+               while ((head = ip_vs_proto_table[slot]))
+                       unregister_ip_vs_protocol(head);
+       }
+}
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c

new file mode 100644 (file)

index 0000000..80ab0c8
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -0,0 +1,235 @@
+/*
+ * ip_vs_proto_ah_esp.c:       AH/ESP IPSec load balancing support for IPVS
+ *
+ * Authors:    Julian Anastasov <ja@ssi.bg>, February 2002
+ *             Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/* TODO:
+
+struct isakmp_hdr {
+       __u8            icookie[8];
+       __u8            rcookie[8];
+       __u8            np;
+       __u8            version;
+       __u8            xchgtype;
+       __u8            flags;
+       __u32           msgid;
+       __u32           length;
+};
+
+*/
+
+#define PORT_ISAKMP    500     /* port used (as UDP src and dst) for the conn lookups below */
+
+
+/*
+ * conn_in_get hook shared by AH and ESP.
+ *
+ * No AH/ESP specific key is used: the lookup is done for a UDP connection
+ * between the packet's two addresses with both ports set to PORT_ISAKMP.
+ * With "inverse" set, source and destination are swapped for the lookup.
+ * Returns the connection found, or NULL after emitting a debug message.
+ */
+static struct ip_vs_conn *
+ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+                  const struct ip_vs_iphdr *iph, unsigned int proto_off,
+                  int inverse)
+{
+       struct ip_vs_conn *conn;
+
+       if (unlikely(inverse))
+               conn = ip_vs_conn_in_get(af, IPPROTO_UDP,
+                                        &iph->daddr,
+                                        htons(PORT_ISAKMP),
+                                        &iph->saddr,
+                                        htons(PORT_ISAKMP));
+       else
+               conn = ip_vs_conn_in_get(af, IPPROTO_UDP,
+                                        &iph->saddr,
+                                        htons(PORT_ISAKMP),
+                                        &iph->daddr,
+                                        htons(PORT_ISAKMP));
+       if (conn)
+               return conn;
+
+       /*
+        * We are not sure if the packet is from our
+        * service, so our conn_schedule hook should return NF_ACCEPT
+        */
+       IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
+                     "%s%s %s->%s\n",
+                     inverse ? "ICMP+" : "",
+                     pp->name,
+                     IP_VS_DBG_ADDR(af, &iph->saddr),
+                     IP_VS_DBG_ADDR(af, &iph->daddr));
+
+       return NULL;
+}
+
+
+/*
+ * conn_out_get hook shared by AH and ESP.
+ *
+ * Mirrors the inbound lookup: a UDP connection with both ports equal to
+ * PORT_ISAKMP is searched between the packet's addresses, with the two
+ * addresses swapped when "inverse" is set.  Returns the connection, or
+ * NULL after emitting a debug message.
+ */
+static struct ip_vs_conn *
+ah_esp_conn_out_get(int af, const struct sk_buff *skb,
+                   struct ip_vs_protocol *pp,
+                   const struct ip_vs_iphdr *iph,
+                   unsigned int proto_off,
+                   int inverse)
+{
+       struct ip_vs_conn *conn;
+
+       if (unlikely(inverse))
+               conn = ip_vs_conn_out_get(af, IPPROTO_UDP,
+                                         &iph->daddr,
+                                         htons(PORT_ISAKMP),
+                                         &iph->saddr,
+                                         htons(PORT_ISAKMP));
+       else
+               conn = ip_vs_conn_out_get(af, IPPROTO_UDP,
+                                         &iph->saddr,
+                                         htons(PORT_ISAKMP),
+                                         &iph->daddr,
+                                         htons(PORT_ISAKMP));
+       if (conn)
+               return conn;
+
+       IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
+                     "%s%s %s->%s\n",
+                     inverse ? "ICMP+" : "",
+                     pp->name,
+                     IP_VS_DBG_ADDR(af, &iph->saddr),
+                     IP_VS_DBG_ADDR(af, &iph->daddr));
+
+       return NULL;
+}
+
+
+/*
+ * conn_schedule hook shared by AH and ESP.  No connection is ever created
+ * here: *verdict is set to NF_ACCEPT and 0 is returned; *cpp is left
+ * untouched.
+ */
+static int
+ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+                    int *verdict, struct ip_vs_conn **cpp)
+{
+       /*
+        * AH/ESP is only related traffic. Pass the packet to IP stack.
+        */
+       *verdict = NF_ACCEPT;
+       return 0;
+}
+
+
+/*
+ * Log "<msg>: <proto> <saddr>-><daddr>" for the IPv4 header located at
+ * "offset" in the skb, or "<proto> TRUNCATED" when that header cannot be
+ * obtained from the skb.
+ */
+static void
+ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+                      int offset, const char *msg)
+{
+       struct iphdr hdr_copy, *iph;
+       char line[256];
+
+       iph = skb_header_pointer(skb, offset, sizeof(hdr_copy), &hdr_copy);
+       if (iph != NULL)
+               sprintf(line, "%s %u.%u.%u.%u->%u.%u.%u.%u",
+                       pp->name, NIPQUAD(iph->saddr),
+                       NIPQUAD(iph->daddr));
+       else
+               sprintf(line, "%s TRUNCATED", pp->name);
+
+       printk(KERN_DEBUG "IPVS: %s: %s\n", msg, line);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * IPv6 counterpart of the v4 debug printer: logs the source and
+ * destination addresses of the IPv6 header found at "offset", or
+ * "<proto> TRUNCATED" when that header cannot be obtained from the skb.
+ */
+static void
+ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+                      int offset, const char *msg)
+{
+       struct ipv6hdr hdr_copy, *ip6h;
+       char line[256];
+
+       ip6h = skb_header_pointer(skb, offset, sizeof(hdr_copy), &hdr_copy);
+       if (ip6h != NULL)
+               sprintf(line, "%s " NIP6_FMT "->" NIP6_FMT,
+                       pp->name, NIP6(ip6h->saddr),
+                       NIP6(ip6h->daddr));
+       else
+               sprintf(line, "%s TRUNCATED", pp->name);
+
+       printk(KERN_DEBUG "IPVS: %s: %s\n", msg, line);
+}
+#endif
+
+/*
+ * debug_packet hook shared by AH and ESP: picks the IPv6 printer for
+ * ETH_P_IPV6 skbs (only when CONFIG_IP_VS_IPV6 is set) and the IPv4
+ * printer for everything else.
+ */
+static void
+ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+                   int offset, const char *msg)
+{
+#ifdef CONFIG_IP_VS_IPV6
+       if (skb->protocol == htons(ETH_P_IPV6)) {
+               ah_esp_debug_packet_v6(pp, skb, offset, msg);
+               return;
+       }
+#endif
+       ah_esp_debug_packet_v4(pp, skb, offset, msg);
+}
+
+
+/* init hook shared by AH and ESP; intentionally a no-op. */
+static void ah_esp_init(struct ip_vs_protocol *pp)
+{
+       /* nothing to do now */
+}
+
+
+/* exit hook shared by AH and ESP; intentionally a no-op. */
+static void ah_esp_exit(struct ip_vs_protocol *pp)
+{
+       /* nothing to do now */
+}
+
+
+#ifdef CONFIG_IP_VS_PROTO_AH
+/*
+ * Protocol descriptor for AH.  Only the lookup, schedule, debug and
+ * init/exit hooks are provided (all shared with ESP); the NAT, checksum,
+ * state and application hooks are explicitly left NULL.
+ */
+struct ip_vs_protocol ip_vs_protocol_ah = {
+       .name =                 "AH",
+       .protocol =             IPPROTO_AH,
+       .num_states =           1,
+       .dont_defrag =          1,
+       .init =                 ah_esp_init,
+       .exit =                 ah_esp_exit,
+       .conn_schedule =        ah_esp_conn_schedule,
+       .conn_in_get =          ah_esp_conn_in_get,
+       .conn_out_get =         ah_esp_conn_out_get,
+       .snat_handler =         NULL,
+       .dnat_handler =         NULL,
+       .csum_check =           NULL,
+       .state_transition =     NULL,
+       .register_app =         NULL,
+       .unregister_app =       NULL,
+       .app_conn_bind =        NULL,
+       .debug_packet =         ah_esp_debug_packet,
+       .timeout_change =       NULL,           /* ISAKMP */
+       .set_state_timeout =    NULL,
+};
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_ESP
+/*
+ * Protocol descriptor for ESP.  Uses the same ah_esp_* hooks as the AH
+ * descriptor; .set_state_timeout is not listed here and is therefore
+ * zero-initialised, which matches the explicit NULL used for AH.
+ */
+struct ip_vs_protocol ip_vs_protocol_esp = {
+       .name =                 "ESP",
+       .protocol =             IPPROTO_ESP,
+       .num_states =           1,
+       .dont_defrag =          1,
+       .init =                 ah_esp_init,
+       .exit =                 ah_esp_exit,
+       .conn_schedule =        ah_esp_conn_schedule,
+       .conn_in_get =          ah_esp_conn_in_get,
+       .conn_out_get =         ah_esp_conn_out_get,
+       .snat_handler =         NULL,
+       .dnat_handler =         NULL,
+       .csum_check =           NULL,
+       .state_transition =     NULL,
+       .register_app =         NULL,
+       .unregister_app =       NULL,
+       .app_conn_bind =        NULL,
+       .debug_packet =         ah_esp_debug_packet,
+       .timeout_change =       NULL,           /* ISAKMP */
+};
+#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c

new file mode 100644 (file)

index 0000000..dd4566e
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -0,0 +1,732 @@
+/*
+ * ip_vs_proto_tcp.c:  TCP load balancing support for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>                  /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h>                    /* for csum_tcpudp_magic */
+#include <net/ip6_checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * conn_in_get hook for TCP: read the two 16-bit port fields at
+ * "proto_off" and look the connection up by (protocol, source address and
+ * port, destination address and port).  With "inverse" set, the source
+ * and destination pairs are swapped.  Returns NULL when the ports cannot
+ * be read from the skb.
+ */
+static struct ip_vs_conn *
+tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+               const struct ip_vs_iphdr *iph, unsigned int proto_off,
+               int inverse)
+{
+       __be16 port_buf[2], *ports;
+
+       ports = skb_header_pointer(skb, proto_off, sizeof(port_buf), port_buf);
+       if (ports == NULL)
+               return NULL;
+
+       if (unlikely(inverse))
+               return ip_vs_conn_in_get(af, iph->protocol,
+                                        &iph->daddr, ports[1],
+                                        &iph->saddr, ports[0]);
+
+       return ip_vs_conn_in_get(af, iph->protocol,
+                                &iph->saddr, ports[0],
+                                &iph->daddr, ports[1]);
+}
+
+/*
+ * conn_out_get hook for TCP: same key extraction as the inbound lookup,
+ * but the search goes through ip_vs_conn_out_get().  With "inverse" set,
+ * the source and destination pairs are swapped.  Returns NULL when the
+ * ports cannot be read from the skb.
+ */
+static struct ip_vs_conn *
+tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+                const struct ip_vs_iphdr *iph, unsigned int proto_off,
+                int inverse)
+{
+       __be16 port_buf[2], *ports;
+
+       ports = skb_header_pointer(skb, proto_off, sizeof(port_buf), port_buf);
+       if (ports == NULL)
+               return NULL;
+
+       if (unlikely(inverse))
+               return ip_vs_conn_out_get(af, iph->protocol,
+                                         &iph->daddr, ports[1],
+                                         &iph->saddr, ports[0]);
+
+       return ip_vs_conn_out_get(af, iph->protocol,
+                                 &iph->saddr, ports[0],
+                                 &iph->daddr, ports[1]);
+}
+
+
+/*
+ * conn_schedule hook for TCP.
+ *
+ * Returns 0 with *verdict set when the caller should stop processing the
+ * packet (TCP header unreadable, overload drop, or scheduling failed and
+ * ip_vs_leave() decided the verdict).  Returns 1 otherwise; in that case
+ * *cpp has been set only if the packet was a SYN for a known service and
+ * a connection was scheduled.
+ */
+static int
+tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+                 int *verdict, struct ip_vs_conn **cpp)
+{
+       struct ip_vs_service *svc;
+       struct tcphdr hdr_copy, *tcph;
+       struct ip_vs_iphdr iph;
+
+       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+       tcph = skb_header_pointer(skb, iph.len, sizeof(hdr_copy), &hdr_copy);
+       if (tcph == NULL) {
+               *verdict = NF_DROP;
+               return 0;
+       }
+
+       /* Only a SYN may create a new connection entry. */
+       if (!tcph->syn)
+               return 1;
+
+       svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
+                               tcph->dest);
+       if (!svc)
+               return 1;
+
+       if (ip_vs_todrop()) {
+               /*
+                * It seems that we are very loaded.
+                * We have to drop this packet :(
+                */
+               ip_vs_service_put(svc);
+               *verdict = NF_DROP;
+               return 0;
+       }
+
+       /*
+        * Let the virtual server select a real server for the
+        * incoming connection, and create a connection entry.
+        */
+       *cpp = ip_vs_schedule(svc, skb);
+       if (!*cpp) {
+               /* svc is handed to ip_vs_leave(); no put on this path */
+               *verdict = ip_vs_leave(svc, skb, pp);
+               return 0;
+       }
+
+       ip_vs_service_put(svc);
+       return 1;
+}
+
+
+static inline void
+tcp_fast_csum_update(int af, struct tcphdr *tcph,
+                    const union nf_inet_addr *oldip,
+                    const union nf_inet_addr *newip,
+                    __be16 oldport, __be16 newport)
+{
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               tcph->check =
+                       csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+                                        ip_vs_check_diff2(oldport, newport,
+                                               ~csum_unfold(tcph->check))));
+       else
+#endif
+       tcph->check =
+               csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+                                ip_vs_check_diff2(oldport, newport,
+                                               ~csum_unfold(tcph->check))));
+}
+
+
+static inline void
+tcp_partial_csum_update(int af, struct tcphdr *tcph,
+                    const union nf_inet_addr *oldip,
+                    const union nf_inet_addr *newip,
+                    __be16 oldlen, __be16 newlen)
+{
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               tcph->check =
+                       csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+                                        ip_vs_check_diff2(oldlen, newlen,
+                                               ~csum_unfold(tcph->check))));
+       else
+#endif
+       tcph->check =
+               csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+                               ip_vs_check_diff2(oldlen, newlen,
+                                               ~csum_unfold(tcph->check))));
+}
+
+
+/*
+ * snat_handler hook for TCP: rewrite the source port of a reply packet
+ * to the virtual port (cp->vport) and bring the TCP checksum up to date.
+ *
+ * Returns 0 when the skb cannot be made writable, or when an application
+ * helper is bound and either the checksum check or the helper fails;
+ * returns 1 on success.
+ *
+ * Checksum handling depends on skb->ip_summed and on cp->app:
+ *  - CHECKSUM_PARTIAL: only the pseudo-header part is adjusted
+ *    (address daddr -> vaddr, old length -> new length);
+ *  - no application helper: incremental update for address and port;
+ *  - otherwise: the whole segment is re-summed.
+ */
+static int
+tcp_snat_handler(struct sk_buff *skb,
+                struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+       struct tcphdr *tcph;
+       unsigned int tcphoff;
+       int oldlen;
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (cp->af == AF_INET6)
+               tcphoff = sizeof(struct ipv6hdr);
+       else
+#endif
+               tcphoff = ip_hdrlen(skb);
+       /* Remember the TCP length before a helper may resize the skb. */
+       oldlen = skb->len - tcphoff;
+
+       /* csum_check requires unshared skb */
+       if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+               return 0;
+
+       if (unlikely(cp->app != NULL)) {
+               /* Some checks before mangling */
+               if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+                       return 0;
+
+               /* Call application helper if needed */
+               if (!ip_vs_app_pkt_out(cp, skb))
+                       return 0;
+       }
+
+       /* Header pointer is taken only now: the skb may have changed. */
+       tcph = (void *)skb_network_header(skb) + tcphoff;
+       tcph->source = cp->vport;
+
+       /* Adjust TCP checksums */
+       if (skb->ip_summed == CHECKSUM_PARTIAL) {
+               /*
+                * The helper takes 16-bit big-endian lengths, so convert
+                * with htons(); htonl() would be truncated by the __be16
+                * parameters and lose the length.
+                */
+               tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+                                       htons(oldlen),
+                                       htons(skb->len - tcphoff));
+       } else if (!cp->app) {
+               /* Only port and addr are changed, do fast csum update */
+               tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+                                    cp->dport, cp->vport);
+               if (skb->ip_summed == CHECKSUM_COMPLETE)
+                       skb->ip_summed = CHECKSUM_NONE;
+       } else {
+               /* full checksum calculation */
+               tcph->check = 0;
+               skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+               if (cp->af == AF_INET6)
+                       tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
+                                                     &cp->caddr.in6,
+                                                     skb->len - tcphoff,
+                                                     cp->protocol, skb->csum);
+               else
+#endif
+                       tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
+                                                       cp->caddr.ip,
+                                                       skb->len - tcphoff,
+                                                       cp->protocol,
+                                                       skb->csum);
+
+               IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+                         pp->name, tcph->check,
+                         (char*)&(tcph->check) - (char*)tcph);
+       }
+       return 1;
+}
+
+
+/*
+ * dnat_handler hook for TCP: rewrite the destination port of a request
+ * packet to the real server port (cp->dport) and bring the TCP checksum
+ * up to date.
+ *
+ * Returns 0 when the skb cannot be made writable, or when an application
+ * helper is bound and either the checksum check or the helper fails;
+ * returns 1 on success.
+ *
+ * Checksum handling depends on skb->ip_summed and on cp->app:
+ *  - CHECKSUM_PARTIAL: only the pseudo-header part is adjusted
+ *    (address vaddr -> daddr, old length -> new length);
+ *  - no application helper: incremental update for address and port;
+ *  - otherwise: the whole segment is re-summed and the skb is marked
+ *    CHECKSUM_UNNECESSARY.
+ */
+static int
+tcp_dnat_handler(struct sk_buff *skb,
+                struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+       struct tcphdr *tcph;
+       unsigned int tcphoff;
+       int oldlen;
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (cp->af == AF_INET6)
+               tcphoff = sizeof(struct ipv6hdr);
+       else
+#endif
+               tcphoff = ip_hdrlen(skb);
+       /* Remember the TCP length before a helper may resize the skb. */
+       oldlen = skb->len - tcphoff;
+
+       /* csum_check requires unshared skb */
+       if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+               return 0;
+
+       if (unlikely(cp->app != NULL)) {
+               /* Some checks before mangling */
+               if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+                       return 0;
+
+               /*
+                *      Attempt ip_vs_app call.
+                *      It will fix ip_vs_conn and iph ack_seq stuff
+                */
+               if (!ip_vs_app_pkt_in(cp, skb))
+                       return 0;
+       }
+
+       /* Header pointer is taken only now: the skb may have changed. */
+       tcph = (void *)skb_network_header(skb) + tcphoff;
+       tcph->dest = cp->dport;
+
+       /*
+        *      Adjust TCP checksums
+        */
+       if (skb->ip_summed == CHECKSUM_PARTIAL) {
+               /*
+                * DNAT replaces vaddr with daddr, so vaddr is the "old"
+                * address here (same order as the fast update below).
+                * Lengths are 16-bit big-endian: use htons(), htonl()
+                * would be truncated by the __be16 parameters.
+                */
+               tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
+                                       htons(oldlen),
+                                       htons(skb->len - tcphoff));
+       } else if (!cp->app) {
+               /* Only port and addr are changed, do fast csum update */
+               tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
+                                    cp->vport, cp->dport);
+               if (skb->ip_summed == CHECKSUM_COMPLETE)
+                       skb->ip_summed = CHECKSUM_NONE;
+       } else {
+               /* full checksum calculation */
+               tcph->check = 0;
+               skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+               if (cp->af == AF_INET6)
+                       tcph->check = csum_ipv6_magic(&cp->caddr.in6,
+                                                     &cp->daddr.in6,
+                                                     skb->len - tcphoff,
+                                                     cp->protocol, skb->csum);
+               else
+#endif
+                       tcph->check = csum_tcpudp_magic(cp->caddr.ip,
+                                                       cp->daddr.ip,
+                                                       skb->len - tcphoff,
+                                                       cp->protocol,
+                                                       skb->csum);
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+       }
+       return 1;
+}
+
+
+/*
+ * csum_check hook for TCP: verify the TCP checksum of the packet.
+ *
+ * Returns 1 when the checksum is good or when skb->ip_summed says no
+ * verification is needed, 0 (after a rate-limited debug message) when
+ * the pseudo-header check fails.
+ */
+static int
+tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+       unsigned int tcphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               tcphoff = sizeof(struct ipv6hdr);
+       else
+#endif
+               tcphoff = ip_hdrlen(skb);
+
+       switch (skb->ip_summed) {
+       case CHECKSUM_NONE:
+               /* Nothing summed yet: sum the TCP segment into skb->csum */
+               skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+               /* fall through: skb->csum is now usable as for COMPLETE */
+       case CHECKSUM_COMPLETE:
+#ifdef CONFIG_IP_VS_IPV6
+               if (af == AF_INET6) {
+                       /* non-zero result == checksum mismatch */
+                       if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+                                           &ipv6_hdr(skb)->daddr,
+                                           skb->len - tcphoff,
+                                           ipv6_hdr(skb)->nexthdr,
+                                           skb->csum)) {
+                               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+                                                "Failed checksum for");
+                               return 0;
+                       }
+               } else
+#endif
+                       /* non-zero result == checksum mismatch */
+                       if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+                                             ip_hdr(skb)->daddr,
+                                             skb->len - tcphoff,
+                                             ip_hdr(skb)->protocol,
+                                             skb->csum)) {
+                               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+                                                "Failed checksum for");
+                               return 0;
+                       }
+               break;
+       default:
+               /* No need to checksum. */
+               break;
+       }
+
+       return 1;
+}
+
+
+/*
+ * Row offsets into the tcp_states[] / tcp_states_dos[] tables.  Each
+ * direction owns four consecutive rows (syn, fin, ack, rst) and
+ * set_tcp_state() adds the tcp_state_idx() result to one of these.
+ */
+#define TCP_DIR_INPUT          0
+#define TCP_DIR_OUTPUT         4
+#define TCP_DIR_INPUT_ONLY     8
+
+/* Maps an IP_VS_DIR_* direction to its row offset above. */
+static const int tcp_state_off[IP_VS_DIR_LAST] = {
+       [IP_VS_DIR_INPUT]               =       TCP_DIR_INPUT,
+       [IP_VS_DIR_OUTPUT]              =       TCP_DIR_OUTPUT,
+       [IP_VS_DIR_INPUT_ONLY]          =       TCP_DIR_INPUT_ONLY,
+};
+
+/*
+ *     Timeout table[state]
+ *
+ *     Values are multiples of HZ.  ip_vs_tcp_init() installs this array
+ *     as the protocol's timeout_table, and set_tcp_state() copies the
+ *     entry for the new state into cp->timeout.
+ */
+static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+       [IP_VS_TCP_S_NONE]              =       2*HZ,
+       [IP_VS_TCP_S_ESTABLISHED]       =       15*60*HZ,
+       [IP_VS_TCP_S_SYN_SENT]          =       2*60*HZ,
+       [IP_VS_TCP_S_SYN_RECV]          =       1*60*HZ,
+       [IP_VS_TCP_S_FIN_WAIT]          =       2*60*HZ,
+       [IP_VS_TCP_S_TIME_WAIT]         =       2*60*HZ,
+       [IP_VS_TCP_S_CLOSE]             =       10*HZ,
+       [IP_VS_TCP_S_CLOSE_WAIT]        =       60*HZ,
+       [IP_VS_TCP_S_LAST_ACK]          =       30*HZ,
+       [IP_VS_TCP_S_LISTEN]            =       2*60*HZ,
+       [IP_VS_TCP_S_SYNACK]            =       120*HZ,
+       [IP_VS_TCP_S_LAST]              =       2*HZ,
+};
+
+/*
+ * Printable name of each TCP state, indexed by IP_VS_TCP_S_*.  Read by
+ * tcp_state_name() and passed to ip_vs_set_state_timeout() by
+ * tcp_set_state_timeout().
+ */
+static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
+       [IP_VS_TCP_S_NONE]              =       "NONE",
+       [IP_VS_TCP_S_ESTABLISHED]       =       "ESTABLISHED",
+       [IP_VS_TCP_S_SYN_SENT]          =       "SYN_SENT",
+       [IP_VS_TCP_S_SYN_RECV]          =       "SYN_RECV",
+       [IP_VS_TCP_S_FIN_WAIT]          =       "FIN_WAIT",
+       [IP_VS_TCP_S_TIME_WAIT]         =       "TIME_WAIT",
+       [IP_VS_TCP_S_CLOSE]             =       "CLOSE",
+       [IP_VS_TCP_S_CLOSE_WAIT]        =       "CLOSE_WAIT",
+       [IP_VS_TCP_S_LAST_ACK]          =       "LAST_ACK",
+       [IP_VS_TCP_S_LISTEN]            =       "LISTEN",
+       [IP_VS_TCP_S_SYNACK]            =       "SYNACK",
+       /* tcp_state_name() returns "ERR!" before ever indexing this slot */
+       [IP_VS_TCP_S_LAST]              =       "BUG!",
+};
+
+/* Short aliases so the transition tables below fit on one line per row. */
+#define sNO IP_VS_TCP_S_NONE
+#define sES IP_VS_TCP_S_ESTABLISHED
+#define sSS IP_VS_TCP_S_SYN_SENT
+#define sSR IP_VS_TCP_S_SYN_RECV
+#define sFW IP_VS_TCP_S_FIN_WAIT
+#define sTW IP_VS_TCP_S_TIME_WAIT
+#define sCL IP_VS_TCP_S_CLOSE
+#define sCW IP_VS_TCP_S_CLOSE_WAIT
+#define sLA IP_VS_TCP_S_LAST_ACK
+#define sLI IP_VS_TCP_S_LISTEN
+#define sSA IP_VS_TCP_S_SYNACK
+
+/* One transition-table row: the next state, indexed by the current state. */
+struct tcp_states_t {
+       int next_state[IP_VS_TCP_S_LAST];
+};
+
+/*
+ * state_name hook for TCP: printable name of a connection state.
+ * Returns "ERR!" for values at or above IP_VS_TCP_S_LAST and "?" when the
+ * name table has no entry for the state.
+ */
+static const char * tcp_state_name(int state)
+{
+       const char *name;
+
+       if (state >= IP_VS_TCP_S_LAST)
+               return "ERR!";
+
+       name = tcp_state_name_table[state];
+       if (!name)
+               name = "?";
+       return name;
+}
+
+/*
+ * Default transition table.  Row = direction offset (TCP_DIR_*) plus the
+ * tcp_state_idx() of the packet (syn=0, fin=1, ack=2, rst=3); column =
+ * current connection state; entry = next state.
+ */
+static struct tcp_states_t tcp_states [] = {
+/*     INPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
+
+/*     OUTPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
+/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/*     INPUT-ONLY */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+/*
+ * Alternative transition table with the same layout as tcp_states[].
+ * tcp_timeout_change() makes it the active table when bit 0 of its flags
+ * argument (secure_tcp) is set.
+ */
+static struct tcp_states_t tcp_states_dos [] = {
+/*     INPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
+/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+
+/*     OUTPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
+/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/*     INPUT-ONLY */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA        */
+/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+/* Table currently consulted by set_tcp_state(); see tcp_timeout_change(). */
+static struct tcp_states_t *tcp_state_table = tcp_states;
+
+
+/*
+ * timeout_change hook for TCP: bit 0 of "flags" (secure_tcp) selects which
+ * transition table becomes the active tcp_state_table.
+ */
+static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
+{
+       /*
+       ** FIXME: change secure_tcp to independent sysctl var
+       ** or make it per-service or per-app because it is valid
+       ** for most if not for all of the applications. Something
+       ** like "capabilities" (flags) for each object.
+       */
+       if (flags & 1)          /* secure_tcp */
+               tcp_state_table = tcp_states_dos;
+       else
+               tcp_state_table = tcp_states;
+}
+
+/*
+ * set_state_timeout hook for TCP: forwards to the generic helper with this
+ * protocol's timeout table, state count and state-name table, and returns
+ * whatever the helper returns.
+ */
+static int
+tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+{
+       return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
+                                      tcp_state_name_table, sname, to);
+}
+
+/*
+ * Map the flags of a TCP header to a row index within one direction of
+ * the transition tables: syn=0, fin=1, ack=2, rst=3.  When several flags
+ * are set the precedence is rst, syn, fin, ack.  Returns -1 when none of
+ * the four flags is set.
+ */
+static inline int tcp_state_idx(struct tcphdr *th)
+{
+       return th->rst ? 3 :
+              th->syn ? 0 :
+              th->fin ? 1 :
+              th->ack ? 2 : -1;
+}
+
+/*
+ * Advance the TCP state of connection cp for a packet with header th seen
+ * in the given direction, keep the destination's active/inactive
+ * connection counters in step, and reload cp->timeout for the new state.
+ *
+ * The only caller in this file, tcp_state_transition(), holds cp->lock
+ * around the call.
+ */
+static inline void
+set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+             int direction, struct tcphdr *th)
+{
+       int state_idx;
+       /* Used as the new state when the packet carries no known flag */
+       int new_state = IP_VS_TCP_S_CLOSE;
+       int state_off = tcp_state_off[direction];
+
+       /*
+        *    Update state offset to INPUT_ONLY if necessary
+        *    or delete NO_OUTPUT flag if output packet detected
+        */
+       if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+               if (state_off == TCP_DIR_OUTPUT)
+                       cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+               else
+                       state_off = TCP_DIR_INPUT_ONLY;
+       }
+
+       if ((state_idx = tcp_state_idx(th)) < 0) {
+               IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
+               goto tcp_state_out;
+       }
+
+       /* row = direction offset + flag index, column = current state */
+       new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
+
+  tcp_state_out:
+       if (new_state != cp->state) {
+               struct ip_vs_dest *dest = cp->dest;
+
+               IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
+                             "%s:%d state: %s->%s conn->refcnt:%d\n",
+                             pp->name,
+                             ((state_off == TCP_DIR_OUTPUT) ?
+                              "output " : "input "),
+                             th->syn ? 'S' : '.',
+                             th->fin ? 'F' : '.',
+                             th->ack ? 'A' : '.',
+                             th->rst ? 'R' : '.',
+                             IP_VS_DBG_ADDR(cp->af, &cp->daddr),
+                             ntohs(cp->dport),
+                             IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+                             ntohs(cp->cport),
+                             tcp_state_name(cp->state),
+                             tcp_state_name(new_state),
+                             atomic_read(&cp->refcnt));
+
+               /*
+                * A connection counts as active only while ESTABLISHED;
+                * IP_VS_CONN_F_INACTIVE records which counter it is in.
+                */
+               if (dest) {
+                       if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+                           (new_state != IP_VS_TCP_S_ESTABLISHED)) {
+                               atomic_dec(&dest->activeconns);
+                               atomic_inc(&dest->inactconns);
+                               cp->flags |= IP_VS_CONN_F_INACTIVE;
+                       } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+                                  (new_state == IP_VS_TCP_S_ESTABLISHED)) {
+                               atomic_inc(&dest->activeconns);
+                               atomic_dec(&dest->inactconns);
+                               cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+                       }
+               }
+       }
+
+       /* Store the new state and load its timeout in one step */
+       cp->timeout = pp->timeout_table[cp->state = new_state];
+}
+
+
+/*
+ *     state_transition hook for TCP: locate the TCP header behind the
+ *     IP header and run the state machine on it under cp->lock.
+ *     Returns 0 when the TCP header cannot be read, 1 otherwise.
+ */
+static int
+tcp_state_transition(struct ip_vs_conn *cp, int direction,
+                    const struct sk_buff *skb,
+                    struct ip_vs_protocol *pp)
+{
+       struct tcphdr hdr_copy, *tcph;
+       int ip_hdr_len;
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (cp->af != AF_INET)
+               ip_hdr_len = sizeof(struct ipv6hdr);
+       else
+#endif
+               ip_hdr_len = ip_hdrlen(skb);
+
+       tcph = skb_header_pointer(skb, ip_hdr_len, sizeof(hdr_copy),
+                                 &hdr_copy);
+       if (tcph == NULL)
+               return 0;
+
+       spin_lock(&cp->lock);
+       set_tcp_state(pp, cp, direction, tcph);
+       spin_unlock(&cp->lock);
+
+       return 1;
+}
+
+
+/*
+ *     Hash table for TCP application incarnations
+ *
+ *     Buckets are selected with tcp_app_hashkey(port).  The register,
+ *     unregister and bind functions below take tcp_app_lock around
+ *     every walk or modification of a bucket list.
+ */
+#define        TCP_APP_TAB_BITS        4
+#define        TCP_APP_TAB_SIZE        (1 << TCP_APP_TAB_BITS)
+#define        TCP_APP_TAB_MASK        (TCP_APP_TAB_SIZE - 1)
+
+static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
+static DEFINE_SPINLOCK(tcp_app_lock);
+
+/*
+ * Bucket index for a port: the raw 16-bit port value (no byte swap) is
+ * XORed with itself shifted right by TCP_APP_TAB_BITS and masked down to
+ * the table size.
+ */
+static inline __u16 tcp_app_hashkey(__be16 port)
+{
+       u16 raw = (__force u16)port;
+
+       return ((raw >> TCP_APP_TAB_BITS) ^ raw) & TCP_APP_TAB_MASK;
+}
+
+
+/*
+ * register_app hook for TCP: add an application incarnation to the port
+ * hash and bump the protocol's application counter.  Returns -EEXIST when
+ * an incarnation for the same port is already in the bucket, 0 on success.
+ */
+static int tcp_register_app(struct ip_vs_app *inc)
+{
+       struct ip_vs_app *cur;
+       __be16 port = inc->port;
+       __u16 bucket = tcp_app_hashkey(port);
+       int err = 0;
+
+       spin_lock_bh(&tcp_app_lock);
+
+       list_for_each_entry(cur, &tcp_apps[bucket], p_list) {
+               if (cur->port == port) {
+                       err = -EEXIST;
+                       break;
+               }
+       }
+
+       if (!err) {
+               list_add(&inc->p_list, &tcp_apps[bucket]);
+               atomic_inc(&ip_vs_protocol_tcp.appcnt);
+       }
+
+       spin_unlock_bh(&tcp_app_lock);
+       return err;
+}
+
+
+/*
+ * unregister_app hook for TCP: under tcp_app_lock, decrement the
+ * protocol's application counter and unlink the incarnation from its
+ * hash bucket.
+ */
+static void
+tcp_unregister_app(struct ip_vs_app *inc)
+{
+       spin_lock_bh(&tcp_app_lock);
+       atomic_dec(&ip_vs_protocol_tcp.appcnt);
+       list_del(&inc->p_list);
+       spin_unlock_bh(&tcp_app_lock);
+}
+
+
+/*
+ * app_conn_bind hook for TCP: if the connection uses the MASQ forwarding
+ * method and an application incarnation is registered for its virtual
+ * port, bind that incarnation to the connection (cp->app) and run the
+ * incarnation's init_conn callback if it has one.
+ *
+ * Returns the init_conn result when that callback ran, 0 in every other
+ * case (not MASQ, no matching incarnation, reference not obtained, or no
+ * init_conn callback).
+ */
+static int
+tcp_app_conn_bind(struct ip_vs_conn *cp)
+{
+       int hash;
+       struct ip_vs_app *inc;
+       int result = 0;
+
+       /* Default binding: bind app only for NAT */
+       if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+               return 0;
+
+       /* Lookup application incarnations and bind the right one */
+       hash = tcp_app_hashkey(cp->vport);
+
+       spin_lock(&tcp_app_lock);
+       list_for_each_entry(inc, &tcp_apps[hash], p_list) {
+               if (inc->port == cp->vport) {
+                       /* No reference: leave via the unlock after the loop */
+                       if (unlikely(!ip_vs_app_inc_get(inc)))
+                               break;
+                       /* Lock is dropped here, before init_conn is called */
+                       spin_unlock(&tcp_app_lock);
+
+                       IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
+                                     "%s:%u to app %s on port %u\n",
+                                     __func__,
+                                     IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+                                     ntohs(cp->cport),
+                                     IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+                                     ntohs(cp->vport),
+                                     inc->name, ntohs(inc->port));
+
+                       cp->app = inc;
+                       if (inc->init_conn)
+                               result = inc->init_conn(inc, cp);
+                       /* Skip the unlock below: already unlocked above */
+                       goto out;
+               }
+       }
+       spin_unlock(&tcp_app_lock);
+
+  out:
+       return result;
+}
+
+
+/*
+ *     Set LISTEN timeout. (ip_vs_conn_put will setup timer)
+ *
+ *     Under cp->lock, puts the connection into IP_VS_TCP_S_LISTEN and
+ *     loads the LISTEN entry of the TCP timeout table into cp->timeout.
+ */
+void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
+{
+       spin_lock(&cp->lock);
+       cp->state = IP_VS_TCP_S_LISTEN;
+       cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
+       spin_unlock(&cp->lock);
+}
+
+
+/*
+ * init hook for TCP: set up the application hash table and install
+ * tcp_timeouts as the protocol's timeout table.
+ */
+static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
+{
+       /* presumably initialises every bucket list head -- macro not shown here */
+       IP_VS_INIT_HASH_TABLE(tcp_apps);
+       pp->timeout_table = tcp_timeouts;
+}
+
+
+/* exit hook for TCP; nothing to release, so the body is empty. */
+static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
+{
+}
+
+
+/*
+ * Protocol descriptor for TCP.  Every hook is implemented in this file
+ * except debug_packet, which uses the shared TCP/UDP printer.
+ */
+struct ip_vs_protocol ip_vs_protocol_tcp = {
+       .name =                 "TCP",
+       .protocol =             IPPROTO_TCP,
+       .num_states =           IP_VS_TCP_S_LAST,
+       .dont_defrag =          0,
+       .appcnt =               ATOMIC_INIT(0),
+       .init =                 ip_vs_tcp_init,
+       .exit =                 ip_vs_tcp_exit,
+       .register_app =         tcp_register_app,
+       .unregister_app =       tcp_unregister_app,
+       .conn_schedule =        tcp_conn_schedule,
+       .conn_in_get =          tcp_conn_in_get,
+       .conn_out_get =         tcp_conn_out_get,
+       .snat_handler =         tcp_snat_handler,
+       .dnat_handler =         tcp_dnat_handler,
+       .csum_check =           tcp_csum_check,
+       .state_name =           tcp_state_name,
+       .state_transition =     tcp_state_transition,
+       .app_conn_bind =        tcp_app_conn_bind,
+       .debug_packet =         ip_vs_tcpudp_debug_packet,
+       .timeout_change =       tcp_timeout_change,
+       .set_state_timeout =    tcp_set_state_timeout,
+};
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c

new file mode 100644 (file)

index 0000000..6eb6039
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -0,0 +1,533 @@
+/*
+ * ip_vs_proto_udp.c:  UDP load balancing support for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/udp.h>
+
+#include <net/ip_vs.h>
+#include <net/ip.h>
+#include <net/ip6_checksum.h>
+
+/* Look up the connection for a UDP packet with ip_vs_conn_in_get();
+ * returns NULL when the port pair cannot be read from the skb. */
+static struct ip_vs_conn *
+udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+               const struct ip_vs_iphdr *iph, unsigned int proto_off,
+               int inverse)
+{
+       __be16 _ports[2], *pptr;
+
+       /* pptr[0] and pptr[1] are the first two 16-bit words at proto_off */
+       pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+       if (pptr == NULL)
+               return NULL;
+
+       /* inverse lookup: source and destination trade places in the key */
+       if (unlikely(inverse))
+               return ip_vs_conn_in_get(af, iph->protocol,
+                                        &iph->daddr, pptr[1],
+                                        &iph->saddr, pptr[0]);
+
+       return ip_vs_conn_in_get(af, iph->protocol,
+                                &iph->saddr, pptr[0],
+                                &iph->daddr, pptr[1]);
+}
+
+
+/* Look up the connection for a UDP packet with ip_vs_conn_out_get();
+ * returns NULL when the port pair cannot be read from the skb. */
+static struct ip_vs_conn *
+udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+                const struct ip_vs_iphdr *iph, unsigned int proto_off,
+                int inverse)
+{
+       __be16 _ports[2], *pptr;
+
+       /* pptr[0] and pptr[1] are the first two 16-bit words at proto_off */
+       pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+       if (pptr == NULL)
+               return NULL;
+
+       /* inverse lookup: source and destination trade places in the key */
+       if (unlikely(inverse))
+               return ip_vs_conn_out_get(af, iph->protocol,
+                                         &iph->daddr, pptr[1],
+                                         &iph->saddr, pptr[0]);
+
+       return ip_vs_conn_out_get(af, iph->protocol,
+                                 &iph->saddr, pptr[0],
+                                 &iph->daddr, pptr[1]);
+}
+
+/* Try to schedule a UDP packet; returns 0 once *verdict is set, otherwise 1. */
+static int
+udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+                 int *verdict, struct ip_vs_conn **cpp)
+{
+       struct ip_vs_service *svc;
+       struct udphdr _udph, *uh;
+       struct ip_vs_iphdr iph;
+
+       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+       uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
+       if (uh == NULL) {
+               *verdict = NF_DROP;
+               return 0;
+       }
+
+       svc = ip_vs_service_get(af, skb->mark, iph.protocol,
+                               &iph.daddr, uh->dest);
+       if (svc) {
+               if (ip_vs_todrop()) {
+                       /*
+                        * It seems that we are very loaded.
+                        * We have to drop this packet :(
+                        */
+                       ip_vs_service_put(svc);
+                       *verdict = NF_DROP;
+                       return 0;
+               }
+
+               /* Select a real server and create a connection entry.
+                * NOTE(review): the failure path below does not call
+                * ip_vs_service_put(); presumably ip_vs_leave() drops the
+                * svc reference - confirm. */
+               *cpp = ip_vs_schedule(svc, skb);
+               if (!*cpp) {
+                       *verdict = ip_vs_leave(svc, skb, pp);
+                       return 0;
+               }
+               ip_vs_service_put(svc);
+       }
+       return 1;       /* *cpp is untouched if no service matched */
+}
+
+/* Incrementally patch uhdr->check for a changed address and port pair. */
+static inline void
+udp_fast_csum_update(int af, struct udphdr *uhdr,
+                    const union nf_inet_addr *oldip,
+                    const union nf_inet_addr *newip,
+                    __be16 oldport, __be16 newport)
+{
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               uhdr->check =
+                       csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+                                        ip_vs_check_diff2(oldport, newport,
+                                               ~csum_unfold(uhdr->check))));
+       else
+#endif
+               uhdr->check =
+                       csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+                                        ip_vs_check_diff2(oldport, newport,
+                                               ~csum_unfold(uhdr->check))));
+       if (!uhdr->check)
+               uhdr->check = CSUM_MANGLED_0;   /* never store a zero checksum */
+}
+
+static inline void
+udp_partial_csum_update(int af, struct udphdr *uhdr,
+                    const union nf_inet_addr *oldip,
+                    const union nf_inet_addr *newip,
+                    __be16 oldlen, __be16 newlen)
+{
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               uhdr->check =
+                       csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+                                        ip_vs_check_diff2(oldlen, newlen,
+                                               ~csum_unfold(uhdr->check))));
+       else
+#endif
+       uhdr->check =
+               csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+                               ip_vs_check_diff2(oldlen, newlen,
+                                               ~csum_unfold(uhdr->check))));
+}
+
+/* SNAT: set the UDP source port to cp->vport and fix the checksum; 0 = fail. */
+static int
+udp_snat_handler(struct sk_buff *skb,
+                struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+       struct udphdr *udph;
+       unsigned int udphoff;
+       int oldlen;
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (cp->af == AF_INET6)
+               udphoff = sizeof(struct ipv6hdr);
+       else
+#endif
+               udphoff = ip_hdrlen(skb);
+       oldlen = skb->len - udphoff;    /* UDP length before any app mangling */
+
+       /* csum_check requires unshared skb */
+       if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+               return 0;
+
+       if (unlikely(cp->app != NULL)) {
+               /* Some checks before mangling */
+               if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+                       return 0;
+
+               /*
+                *      Call application helper if needed
+                */
+               if (!ip_vs_app_pkt_out(cp, skb))
+                       return 0;
+       }
+
+       udph = (void *)skb_network_header(skb) + udphoff;
+       udph->source = cp->vport;
+
+       /*
+        *      Adjust UDP checksums (lengths are passed as __be16)
+        */
+       if (skb->ip_summed == CHECKSUM_PARTIAL) {
+               udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+                                       htons(oldlen),  /* not htonl() */
+                                       htons(skb->len - udphoff));
+       } else if (!cp->app && (udph->check != 0)) {
+               /* Only port and addr are changed, do fast csum update */
+               udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+                                    cp->dport, cp->vport);
+               if (skb->ip_summed == CHECKSUM_COMPLETE)
+                       skb->ip_summed = CHECKSUM_NONE;
+       } else {
+               /* full checksum calculation */
+               udph->check = 0;
+               skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+               if (cp->af == AF_INET6)
+                       udph->check = csum_ipv6_magic(&cp->vaddr.in6,
+                                                     &cp->caddr.in6,
+                                                     skb->len - udphoff,
+                                                     cp->protocol, skb->csum);
+               else
+#endif
+                       udph->check = csum_tcpudp_magic(cp->vaddr.ip,
+                                                       cp->caddr.ip,
+                                                       skb->len - udphoff,
+                                                       cp->protocol,
+                                                       skb->csum);
+               if (udph->check == 0)
+                       udph->check = CSUM_MANGLED_0;
+               IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+                         pp->name, udph->check,
+                         (char*)&(udph->check) - (char*)udph);
+       }
+       return 1;
+}
+
+/* DNAT: set the UDP dest port to cp->dport and fix the checksum; 0 = fail. */
+static int
+udp_dnat_handler(struct sk_buff *skb,
+                struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+       struct udphdr *udph;
+       unsigned int udphoff;
+       int oldlen;
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (cp->af == AF_INET6)
+               udphoff = sizeof(struct ipv6hdr);
+       else
+#endif
+               udphoff = ip_hdrlen(skb);
+       oldlen = skb->len - udphoff;    /* UDP length before any app mangling */
+
+       /* csum_check requires unshared skb */
+       if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+               return 0;
+
+       if (unlikely(cp->app != NULL)) {
+               /* Some checks before mangling */
+               if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+                       return 0;
+
+               /*
+                *      Attempt ip_vs_app call.
+                *      It will fix ip_vs_conn
+                */
+               if (!ip_vs_app_pkt_in(cp, skb))
+                       return 0;
+       }
+
+       udph = (void *)skb_network_header(skb) + udphoff;
+       udph->dest = cp->dport;
+
+       /*
+        *      Adjust UDP checksums (address goes from vaddr to daddr)
+        */
+       if (skb->ip_summed == CHECKSUM_PARTIAL) {
+               udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
+                                       htons(oldlen),  /* __be16 lengths */
+                                       htons(skb->len - udphoff));
+       } else if (!cp->app && (udph->check != 0)) {
+               /* Only port and addr are changed, do fast csum update */
+               udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
+                                    cp->vport, cp->dport);
+               if (skb->ip_summed == CHECKSUM_COMPLETE)
+                       skb->ip_summed = CHECKSUM_NONE;
+       } else {
+               /* full checksum calculation */
+               udph->check = 0;
+               skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+               if (cp->af == AF_INET6)
+                       udph->check = csum_ipv6_magic(&cp->caddr.in6,
+                                                     &cp->daddr.in6,
+                                                     skb->len - udphoff,
+                                                     cp->protocol, skb->csum);
+               else
+#endif
+                       udph->check = csum_tcpudp_magic(cp->caddr.ip,
+                                                       cp->daddr.ip,
+                                                       skb->len - udphoff,
+                                                       cp->protocol,
+                                                       skb->csum);
+               if (udph->check == 0)
+                       udph->check = CSUM_MANGLED_0;
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+       }
+       return 1;
+}
+
+/* Verify the UDP checksum of skb: returns 1 when acceptable, 0 otherwise. */
+static int
+udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+       struct udphdr _udph, *uh;
+       unsigned int udphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6)
+               udphoff = sizeof(struct ipv6hdr);
+       else
+#endif
+               udphoff = ip_hdrlen(skb);
+
+       uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
+       if (uh == NULL)
+               return 0;
+
+       if (uh->check != 0) {           /* a zero checksum field is not verified */
+               switch (skb->ip_summed) {
+               case CHECKSUM_NONE:
+                       skb->csum = skb_checksum(skb, udphoff,
+                                                skb->len - udphoff, 0);
+               case CHECKSUM_COMPLETE:         /* CHECKSUM_NONE falls through */
+#ifdef CONFIG_IP_VS_IPV6
+                       if (af == AF_INET6) {
+                               if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+                                                   &ipv6_hdr(skb)->daddr,
+                                                   skb->len - udphoff,
+                                                   ipv6_hdr(skb)->nexthdr,
+                                                   skb->csum)) {
+                                       IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+                                                        "Failed checksum for");
+                                       return 0;
+                               }
+                       } else
+#endif
+                               if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+                                                     ip_hdr(skb)->daddr,
+                                                     skb->len - udphoff,
+                                                     ip_hdr(skb)->protocol,
+                                                     skb->csum)) {
+                                       IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+                                                        "Failed checksum for");
+                                       return 0;
+                               }
+                       break;
+               default:
+                       /* No need to checksum. */
+                       break;
+               }
+       }
+       return 1;
+}
+
+
+/*
+ *     Note: the caller guarantees that only one of register_app,
+ *     unregister_app or app_conn_bind is called each time.
+ */
+
+#define        UDP_APP_TAB_BITS        4       /* log2 of the bucket count */
+#define        UDP_APP_TAB_SIZE        (1 << UDP_APP_TAB_BITS) /* 16 buckets */
+#define        UDP_APP_TAB_MASK        (UDP_APP_TAB_SIZE - 1)  /* index mask */
+
+static struct list_head udp_apps[UDP_APP_TAB_SIZE];    /* hashed by port */
+static DEFINE_SPINLOCK(udp_app_lock);                  /* guards udp_apps */
+
+static inline __u16 udp_app_hashkey(__be16 port)
+{
+       u16 raw = (__force u16)port;    /* port bits as stored, no byte swap */
+       return (raw ^ (raw >> UDP_APP_TAB_BITS)) & UDP_APP_TAB_MASK;
+}
+
+/* Add inc to the UDP app table; -EEXIST if its port is already registered. */
+static int udp_register_app(struct ip_vs_app *inc)
+{
+       struct ip_vs_app *i;
+       __u16 hash;
+       __be16 port = inc->port;
+       int ret = 0;
+
+       hash = udp_app_hashkey(port);
+
+       /* the duplicate scan and the insert happen under one lock hold */
+       spin_lock_bh(&udp_app_lock);
+       list_for_each_entry(i, &udp_apps[hash], p_list) {
+               if (i->port == port) {
+                       ret = -EEXIST;
+                       goto out;
+               }
+       }
+       list_add(&inc->p_list, &udp_apps[hash]);
+       atomic_inc(&ip_vs_protocol_udp.appcnt);
+
+  out:
+       spin_unlock_bh(&udp_app_lock);
+       return ret;
+}
+
+/* Unlink inc from the UDP app table and decrement the protocol's appcnt. */
+static void
+udp_unregister_app(struct ip_vs_app *inc)
+{
+       spin_lock_bh(&udp_app_lock);
+       atomic_dec(&ip_vs_protocol_udp.appcnt);
+       list_del(&inc->p_list);
+       spin_unlock_bh(&udp_app_lock);
+}
+
+/* Bind cp to the app registered on cp->vport, if any (NAT forwarding only). */
+static int udp_app_conn_bind(struct ip_vs_conn *cp)
+{
+       int hash;
+       struct ip_vs_app *inc;
+       int result = 0;
+
+       /* Default binding: bind app only for NAT */
+       if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+               return 0;
+
+       /* Lookup application incarnations and bind the right one */
+       hash = udp_app_hashkey(cp->vport);
+
+       spin_lock(&udp_app_lock);
+       list_for_each_entry(inc, &udp_apps[hash], p_list) {
+               if (inc->port == cp->vport) {
+                       if (unlikely(!ip_vs_app_inc_get(inc)))
+                               break;  /* inc_get failed: leave cp unbound */
+                       spin_unlock(&udp_app_lock);
+
+                       IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
+                                     "%s:%u to app %s on port %u\n",
+                                     __func__,
+                                     IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+                                     ntohs(cp->cport),
+                                     IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+                                     ntohs(cp->vport),
+                                     inc->name, ntohs(inc->port));
+
+                       cp->app = inc;
+                       if (inc->init_conn)     /* optional per-connection hook */
+                               result = inc->init_conn(inc, cp);
+                       goto out;       /* lock was already dropped above */
+               }
+       }
+       spin_unlock(&udp_app_lock);
+
+  out:
+       return result;
+}
+
+/* Per-state tables indexed by IP_VS_UDP_S_*: timeouts (HZ ticks) and names. */
+static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+       [IP_VS_UDP_S_NORMAL]            =       5*60*HZ,        /* 5 minutes */
+       [IP_VS_UDP_S_LAST]              =       2*HZ,           /* 2 seconds */
+};
+
+static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
+       [IP_VS_UDP_S_NORMAL]            =       "UDP",
+       [IP_VS_UDP_S_LAST]              =       "BUG!",
+};
+
+/* Hand pp's timeout table and the UDP name table to the generic setter. */
+static int
+udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+{
+       return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
+                                      udp_state_name_table, sname, to);
+}
+
+static const char * udp_state_name(int state)
+{
+       /* "ERR!" from IP_VS_UDP_S_LAST upwards, "?" for an unnamed table slot */
+       return (state >= IP_VS_UDP_S_LAST) ? "ERR!" :
+              udp_state_name_table[state] ? udp_state_name_table[state] : "?";
+}
+/* Reload cp->timeout from the IP_VS_UDP_S_NORMAL slot; always returns 1. */
+static int
+udp_state_transition(struct ip_vs_conn *cp, int direction,
+                    const struct sk_buff *skb,
+                    struct ip_vs_protocol *pp)
+{
+       cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
+       return 1;       /* direction and skb are not consulted */
+}
+/* .init hook: set up the udp_apps table and point pp at udp_timeouts. */
+static void udp_init(struct ip_vs_protocol *pp)
+{
+       IP_VS_INIT_HASH_TABLE(udp_apps);
+       pp->timeout_table = udp_timeouts;
+}
+/* .exit hook of ip_vs_protocol_udp; the body is empty. */
+static void udp_exit(struct ip_vs_protocol *pp)
+{
+}
+
+/* UDP protocol descriptor: name, protocol number and its callbacks. */
+struct ip_vs_protocol ip_vs_protocol_udp = {
+       .name =                 "UDP",
+       .protocol =             IPPROTO_UDP,
+       .num_states =           IP_VS_UDP_S_LAST,
+       .dont_defrag =          0,
+       .init =                 udp_init,
+       .exit =                 udp_exit,
+       .conn_schedule =        udp_conn_schedule,
+       .conn_in_get =          udp_conn_in_get,
+       .conn_out_get =         udp_conn_out_get,
+       .snat_handler =         udp_snat_handler,
+       .dnat_handler =         udp_dnat_handler,
+       .csum_check =           udp_csum_check,
+       .state_transition =     udp_state_transition,
+       .state_name =           udp_state_name,
+       .register_app =         udp_register_app,
+       .unregister_app =       udp_unregister_app,
+       .app_conn_bind =        udp_app_conn_bind,
+       .debug_packet =         ip_vs_tcpudp_debug_packet,
+       .timeout_change =       NULL,   /* no timeout_change callback for UDP */
+       .set_state_timeout =    udp_set_state_timeout,
+};
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c

new file mode 100644 (file)

index 0000000..a22195f
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -0,0 +1,112 @@
+/*
+ * IPVS:        Round-Robin Scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Fixes/Changes:
+ *     Wensong Zhang            :     changed the ip_vs_rr_schedule to return dest
+ *     Julian Anastasov         :     fixed the NULL pointer access bug in debugging
+ *     Wensong Zhang            :     changed some cosmetic things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_rr_update_svc
+ *     Wensong Zhang            :     added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/* init_service hook: start the cursor (sched_data) at the dest list head. */
+static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
+{
+       svc->sched_data = &svc->destinations;
+       return 0;
+}
+
+/* update_service hook: move the cursor back to the dest list head. */
+static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
+{
+       svc->sched_data = &svc->destinations;
+       return 0;
+}
+
+
+/*
+ * Round-Robin Scheduling: return the next usable dest after the cursor
+ */
+static struct ip_vs_dest *
+ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+       struct list_head *p, *q;
+       struct ip_vs_dest *dest;
+
+       IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
+
+       write_lock(&svc->sched_lock);
+       p = (struct list_head *)svc->sched_data;
+       p = p->next;            /* start one past the stored cursor */
+       q = p;
+       do {
+               /* skip list head */
+               if (q == &svc->destinations) {
+                       q = q->next;
+                       continue;
+               }
+
+               dest = list_entry(q, struct ip_vs_dest, n_list);
+               if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+                   atomic_read(&dest->weight) > 0)
+                       /* HIT */
+                       goto out;
+               q = q->next;
+       } while (q != p);       /* stop once the walk is back at its start */
+       write_unlock(&svc->sched_lock);
+       return NULL;            /* no dest passed the overload/weight test */
+
+  out:
+       svc->sched_data = q;    /* the pick becomes the new cursor */
+       write_unlock(&svc->sched_lock);
+       IP_VS_DBG_BUF(6, "RR: server %s:%u "
+                     "activeconns %d refcnt %d weight %d\n",
+                     IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
+                     atomic_read(&dest->activeconns),
+                     atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+
+       return dest;
+}
+
+/* Scheduler descriptor for "rr"; n_list starts self-linked (not on a list). */
+static struct ip_vs_scheduler ip_vs_rr_scheduler = {
+       .name =                 "rr",                   /* name */
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       .n_list =               LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+       .supports_ipv6 =        1,
+#endif
+       .init_service =         ip_vs_rr_init_svc,
+       .update_service =       ip_vs_rr_update_svc,
+       .schedule =             ip_vs_rr_schedule,
+};
+/* Module init: register the "rr" scheduler and pass its result through. */
+static int __init ip_vs_rr_init(void)
+{
+       return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+/* Module exit: unregister the "rr" scheduler; the result is not checked. */
+static void __exit ip_vs_rr_cleanup(void)
+{
+       unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+/* Module entry point, exit point and licence declaration. */
+module_init(ip_vs_rr_init);
+module_exit(ip_vs_rr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c

new file mode 100644 (file)

index 0000000..a46ad9e
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -0,0 +1,251 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+/*
+ *  IPVS scheduler list
+ */
+static LIST_HEAD(ip_vs_schedulers);
+
+/* lock for the scheduler list */
+static DEFINE_RWLOCK(__ip_vs_sched_lock);
+
+
+/*
+ *  Bind a service with a scheduler (0, -EINVAL or init_service's error)
+ */
+int ip_vs_bind_scheduler(struct ip_vs_service *svc,
+                        struct ip_vs_scheduler *scheduler)
+{
+       int ret;
+
+       if (svc == NULL) {
+               IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
+               return -EINVAL;
+       }
+       if (scheduler == NULL) {
+               IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
+               return -EINVAL;
+       }
+
+       svc->scheduler = scheduler;     /* set before init_service() runs */
+
+       if (scheduler->init_service) {  /* the hook is optional */
+               ret = scheduler->init_service(svc);
+               if (ret) {
+                       IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
+                       return ret;     /* NOTE(review): svc->scheduler stays set */
+               }
+       }
+
+       return 0;
+}
+
+
+/*
+ *  Detach a service from its scheduler. Returns 0, or -EINVAL when svc is
+ *  NULL, is not bound, or the scheduler's done_service() hook fails.
+ */
+int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
+{
+       struct ip_vs_scheduler *bound;
+
+       if (!svc) {
+               IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
+               return -EINVAL;
+       }
+
+       bound = svc->scheduler;
+       if (!bound) {
+               IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
+               return -EINVAL;
+       }
+
+       /* the hook is optional; a non-zero result leaves the binding alone */
+       if (bound->done_service && bound->done_service(svc) != 0) {
+               IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
+               return -EINVAL;
+       }
+
+       svc->scheduler = NULL;
+       return 0;
+}
+
+
+/*
+ *  Get scheduler in the scheduler list by name (holds its module on a hit)
+ */
+static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
+{
+       struct ip_vs_scheduler *sched;
+
+       IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
+                 sched_name);
+
+       read_lock_bh(&__ip_vs_sched_lock);
+
+       list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+               /*
+                * Test and get the modules atomically
+                */
+               if (sched->module && !try_module_get(sched->module)) {
+                       /*
+                        * This scheduler is just deleted
+                        */
+                       continue;
+               }
+               if (strcmp(sched_name, sched->name)==0) {
+                       /* HIT */
+                       read_unlock_bh(&__ip_vs_sched_lock);
+                       return sched;   /* module reference stays with caller */
+               }
+               if (sched->module)      /* name mismatch: give the ref back */
+                       module_put(sched->module);
+       }
+
+       read_unlock_bh(&__ip_vs_sched_lock);
+       return NULL;
+}
+
+
+/*
+ *  Find a registered scheduler by name. When the first lookup misses,
+ *  request the "ip_vs_<name>" module and look once more. Returns the
+ *  scheduler, or NULL if it is still not registered after the load attempt.
+ */
+struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
+{
+       struct ip_vs_scheduler *found = ip_vs_sched_getbyname(sched_name);
+
+       /* fast path: the scheduler is already on the list */
+       if (found != NULL)
+               return found;
+
+       /*
+        *  request_module()'s own result is not checked:
+        *  only the second lookup decides what the caller gets back
+        */
+       request_module("ip_vs_%s", sched_name);
+       found = ip_vs_sched_getbyname(sched_name);
+
+       return found;
+}
+/* Release the scheduler's module reference, if it has a module at all. */
+void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
+{
+       if (scheduler->module)
+               module_put(scheduler->module);
+}
+
+
+/*
+ *  Register a scheduler in the scheduler list (0 on success, else -EINVAL)
+ */
+int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+       struct ip_vs_scheduler *sched;
+
+       if (!scheduler) {
+               IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
+               return -EINVAL;
+       }
+
+       if (!scheduler->name) {
+               IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
+               return -EINVAL;
+       }
+
+       /* increase the module use count */
+       ip_vs_use_count_inc();  /* undone on both error paths below */
+
+       write_lock_bh(&__ip_vs_sched_lock);
+
+       if (!list_empty(&scheduler->n_list)) {  /* n_list must be self-linked */
+               write_unlock_bh(&__ip_vs_sched_lock);
+               ip_vs_use_count_dec();
+               IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
+                         "already linked\n", scheduler->name);
+               return -EINVAL;
+       }
+
+       /*
+        *  Make sure that the scheduler with this name doesn't exist
+        *  in the scheduler list.
+        */
+       list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+               if (strcmp(scheduler->name, sched->name) == 0) {
+                       write_unlock_bh(&__ip_vs_sched_lock);
+                       ip_vs_use_count_dec();
+                       IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
+                                       "already existed in the system\n",
+                                       scheduler->name);
+                       return -EINVAL;
+               }
+       }
+       /*
+        *      Add it into the d-linked scheduler list
+        */
+       list_add(&scheduler->n_list, &ip_vs_schedulers);
+       write_unlock_bh(&__ip_vs_sched_lock);
+
+       IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
+
+       return 0;
+}
+
+
+/*
+ *  Unregister a scheduler from the scheduler list (0 on success, else -EINVAL)
+ */
+int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+       if (!scheduler) {
+               IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
+               return -EINVAL;
+       }
+
+       write_lock_bh(&__ip_vs_sched_lock);
+       if (list_empty(&scheduler->n_list)) {   /* self-linked: never added */
+               write_unlock_bh(&__ip_vs_sched_lock);
+               IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
+                         "is not in the list. failed\n", scheduler->name);
+               return -EINVAL;
+       }
+
+       /*
+        *      Remove it from the d-linked scheduler list
+        */
+       list_del(&scheduler->n_list);
+       write_unlock_bh(&__ip_vs_sched_lock);
+
+       /* decrease the module use count */
+       ip_vs_use_count_dec();  /* pairs with the inc done at registration */
+
+       IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
+
+       return 0;
+}
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c

new file mode 100644 (file)

index 0000000..7d2f22f
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -0,0 +1,140 @@
+/*
+ * IPVS:        Shortest Expected Delay scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The SED algorithm attempts to minimize each job's expected delay until
+ * completion. The expected delay that the job will experience is
+ * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
+ * jobs on the ith server and Ui is the fixed service rate (weight) of
+ * the ith server. The SED algorithm adopts a greedy policy that each does
+ * what is in its own best interest, i.e. to join the queue which would
+ * minimize its expected delay of completion.
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
+ *
+ * The difference between SED and WLC is that SED includes the incoming
+ * job in the cost function (the increment of 1). SED may outperform
+ * WLC, while scheduling big jobs under larger heterogeneous systems
+ * (the server weight varies a lot).
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline unsigned int
+ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
+{
+       /*
+        * SED cost of a destination: its active connection count plus
+        * one for the job that is about to be scheduled.  Inactive
+        * connections do not enter the cost.
+        */
+       unsigned int active = atomic_read(&dest->activeconns);
+
+       return active + 1;
+}
+
+
+/*
+ *     Shortest Expected Delay scheduling
+ *
+ *     Returns the destination of @svc with the smallest
+ *     (activeconns + 1) / weight among destinations that are not
+ *     flagged IP_VS_DEST_F_OVERLOAD and have a positive weight, or
+ *     NULL when no such destination exists.  @skb is not used.
+ */
+static struct ip_vs_dest *
+ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+       struct ip_vs_dest *dest, *least;
+       unsigned int loh, doh;
+
+       IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
+
+       /*
+        * We calculate the load of each dest server as follows:
+        *      (server expected overhead) / dest->weight
+        *
+        * Remember -- no floats in kernel mode!!!
+        * The comparison of h1*w2 > h2*w1 is equivalent to that of
+        *                h1/w1 > h2/w2
+        * if every weight is larger than zero.
+        *
+        * The server with weight=0 is quiesced and will not receive any
+        * new connections.
+        */
+
+       /* Stage 1: the first usable destination seeds the minimum. */
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+               if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+                   atomic_read(&dest->weight) > 0) {
+                       least = dest;
+                       loh = ip_vs_sed_dest_overhead(least);
+                       goto nextstage;
+               }
+       }
+       return NULL;
+
+       /*
+        *    Find the destination with the least load.
+        */
+  nextstage:
+       list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+                       continue;
+               doh = ip_vs_sed_dest_overhead(dest);
+               /*
+                * Cross-multiply in 64 bits: the product of an overhead
+                * and a weight can exceed 32 bits, and a wrapped product
+                * would pick the wrong server.  A dest with weight 0
+                * yields a left-hand side of 0 and so is never selected.
+                */
+               if ((__s64)loh * atomic_read(&dest->weight) >
+                   (__s64)doh * atomic_read(&least->weight)) {
+                       least = dest;
+                       loh = doh;
+               }
+       }
+
+       IP_VS_DBG_BUF(6, "SED: server %s:%u "
+                     "activeconns %d refcnt %d weight %d overhead %d\n",
+                     IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+                     atomic_read(&least->activeconns),
+                     atomic_read(&least->refcnt),
+                     atomic_read(&least->weight), loh);
+
+       return least;
+}
+
+
+/*
+ *     IPVS SED Scheduler structure
+ *
+ *     Only the schedule callback is provided; no per-service
+ *     init/done/update hooks are set for this scheduler.
+ */
+static struct ip_vs_scheduler ip_vs_sed_scheduler =
+{
+       .name =                 "sed",
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       /* self-linked list node, i.e. list_empty() is true until linked */
+       .n_list =               LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+       .supports_ipv6 =        1,
+#endif
+       .schedule =             ip_vs_sed_schedule,
+};
+
+
+/* Module load: register the SED scheduler; the result is returned as is. */
+static int __init ip_vs_sed_init(void)
+{
+       return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+/* Module unload: unregister the SED scheduler (return value ignored). */
+static void __exit ip_vs_sed_cleanup(void)
+{
+       unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+module_init(ip_vs_sed_init);
+module_exit(ip_vs_sed_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c

new file mode 100644 (file)

index 0000000..1d96de2
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -0,0 +1,258 @@
+/*
+ * IPVS:        Source Hashing scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The sh algorithm is to select server by the hash key of source IP
+ * address. The pseudo code is as follows:
+ *
+ *       n <- servernode[src_ip];
+ *       if (n is dead) OR
+ *          (n is overloaded) or (n.weight <= 0) then
+ *                 return NULL;
+ *
+ *       return n;
+ *
+ * Note that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet source IP address to the current server
+ * array. If the sh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ */
+
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *      IPVS SH bucket
+ *
+ *      One slot of the per-service source-hash table.  dest is NULL
+ *      when no destination is assigned to the slot; a non-NULL dest
+ *      has had its refcnt incremented by ip_vs_sh_assign().
+ */
+struct ip_vs_sh_bucket {
+       struct ip_vs_dest       *dest;          /* real server (cache) */
+};
+
+/*
+ *     for IPVS SH entry hash table
+ *
+ *     The table has 2^IP_VS_SH_TAB_BITS buckets (8 bits, i.e. 256
+ *     buckets, unless CONFIG_IP_VS_SH_TAB_BITS is defined elsewhere).
+ *     IP_VS_SH_TAB_MASK reduces a hash value to a bucket index.
+ */
+#ifndef CONFIG_IP_VS_SH_TAB_BITS
+#define CONFIG_IP_VS_SH_TAB_BITS        8
+#endif
+#define IP_VS_SH_TAB_BITS               CONFIG_IP_VS_SH_TAB_BITS
+#define IP_VS_SH_TAB_SIZE               (1 << IP_VS_SH_TAB_BITS)
+#define IP_VS_SH_TAB_MASK               (IP_VS_SH_TAB_SIZE - 1)
+
+
+/*
+ *     Map an IPv4 address (network byte order) to a bucket index:
+ *     multiply the host-order address by 2654435761 and keep the low
+ *     IP_VS_SH_TAB_BITS bits.
+ */
+static inline unsigned ip_vs_sh_hashkey(__be32 addr)
+{
+       unsigned long product = ntohl(addr) * 2654435761UL;
+
+       return product & IP_VS_SH_TAB_MASK;
+}
+
+
+/*
+ *      Look up the destination stored in the bucket that @addr hashes
+ *      to.  May return NULL if the bucket is unassigned.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr)
+{
+       unsigned idx = ip_vs_sh_hashkey(addr);
+
+       return tbl[idx].dest;
+}
+
+
+/*
+ *      Fill every bucket of @tbl from the destination list of @svc,
+ *      cycling through the list as often as needed.  Each assignment
+ *      takes a reference on the destination.  With an empty list all
+ *      buckets are set to NULL.  Always returns 0.
+ */
+static int
+ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
+{
+       struct list_head *const head = &svc->destinations;
+       struct list_head *pos = head;
+       struct ip_vs_sh_bucket *slot;
+       struct ip_vs_dest *dest;
+
+       for (slot = tbl; slot < tbl + IP_VS_SH_TAB_SIZE; slot++) {
+               if (list_empty(pos)) {
+                       slot->dest = NULL;
+                       continue;
+               }
+
+               /* the list head itself is not a destination: step over it */
+               if (pos == head)
+                       pos = pos->next;
+
+               dest = list_entry(pos, struct ip_vs_dest, n_list);
+               atomic_inc(&dest->refcnt);
+               slot->dest = dest;
+
+               pos = pos->next;
+       }
+       return 0;
+}
+
+
+/*
+ *      Empty the table: drop the reference held by every assigned
+ *      bucket and reset its dest pointer to NULL.
+ */
+static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
+{
+       struct ip_vs_sh_bucket *slot;
+
+       for (slot = tbl; slot != tbl + IP_VS_SH_TAB_SIZE; slot++) {
+               if (!slot->dest)
+                       continue;
+
+               atomic_dec(&slot->dest->refcnt);
+               slot->dest = NULL;
+       }
+}
+
+
+/*
+ *      init_service hook: allocate the SH table for @svc, store it in
+ *      svc->sched_data and populate it from the current destinations.
+ *
+ *      Returns 0 on success or -ENOMEM if the table cannot be allocated.
+ */
+static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
+{
+       struct ip_vs_sh_bucket *tbl;
+
+       /* allocate the SH table for this service */
+       tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
+                     GFP_ATOMIC);
+       if (tbl == NULL) {
+               IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
+               return -ENOMEM;
+       }
+       svc->sched_data = tbl;
+       IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
+                 "current service\n",
+                 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+       /*
+        * assign the hash buckets with the updated service; this writes
+        * every bucket, so the table needs no separate zeroing
+        */
+       ip_vs_sh_assign(tbl, svc);
+
+       return 0;
+}
+
+
+/*
+ *      done_service hook: drop the destination references held by the
+ *      SH table of @svc and free the table.  Always returns 0.
+ *      svc->sched_data is left pointing at the freed table.
+ */
+static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
+{
+       struct ip_vs_sh_bucket *tbl = svc->sched_data;
+
+       /* got to clean up hash buckets here */
+       ip_vs_sh_flush(tbl);
+
+       /* release the table itself */
+       kfree(svc->sched_data);
+       IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
+                 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+       return 0;
+}
+
+
+/*
+ *      update_service hook: rebuild the SH table of @svc by releasing
+ *      all current bucket references and reassigning every bucket from
+ *      the service's destination list.  Always returns 0.
+ */
+static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
+{
+       struct ip_vs_sh_bucket *tbl = svc->sched_data;
+
+       /* got to clean up hash buckets here */
+       ip_vs_sh_flush(tbl);
+
+       /* assign the hash buckets with the updated service */
+       ip_vs_sh_assign(tbl, svc);
+
+       return 0;
+}
+
+
+/*
+ *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
+ *      consider that the server is overloaded here.
+ *
+ *      Returns the masked flag bit (non-zero when overloaded), not a
+ *      normalized 0/1 value.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+       return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ *      Source Hashing scheduling
+ *
+ *      Select the destination stored in the bucket that the packet's
+ *      IPv4 source address hashes to.  Returns NULL when the bucket is
+ *      empty or its destination is unavailable, has a weight <= 0, or
+ *      is overloaded.
+ */
+static struct ip_vs_dest *
+ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+       struct iphdr *iph = ip_hdr(skb);
+       struct ip_vs_sh_bucket *tbl;
+       struct ip_vs_dest *dest;
+
+       IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
+
+       tbl = svc->sched_data;
+       dest = ip_vs_sh_get(tbl, iph->saddr);
+
+       /* reject in the same order the checks must be evaluated */
+       if (!dest)
+               return NULL;
+       if (!(dest->flags & IP_VS_DEST_F_AVAILABLE))
+               return NULL;
+       if (atomic_read(&dest->weight) <= 0)
+               return NULL;
+       if (is_overloaded(dest))
+               return NULL;
+
+       IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
+                 "--> server %u.%u.%u.%u:%d\n",
+                 NIPQUAD(iph->saddr),
+                 NIPQUAD(dest->addr.ip),
+                 ntohs(dest->port));
+
+       return dest;
+}
+
+
+/*
+ *      IPVS SH Scheduler structure
+ *
+ *      Besides the schedule callback, SH installs per-service hooks
+ *      that allocate, free and rebuild the hash table kept in
+ *      svc->sched_data.
+ */
+static struct ip_vs_scheduler ip_vs_sh_scheduler =
+{
+       .name =                 "sh",
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       .n_list  =              LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+       /* the hash key is the IPv4 source address read via ip_hdr() */
+       .supports_ipv6 =        0,
+#endif
+       .init_service =         ip_vs_sh_init_svc,
+       .done_service =         ip_vs_sh_done_svc,
+       .update_service =       ip_vs_sh_update_svc,
+       .schedule =             ip_vs_sh_schedule,
+};
+
+
+/* Module load: register the SH scheduler; the result is returned as is. */
+static int __init ip_vs_sh_init(void)
+{
+       return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+/* Module unload: unregister the SH scheduler (return value ignored). */
+static void __exit ip_vs_sh_cleanup(void)
+{
+       unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+module_init(ip_vs_sh_init);
+module_exit(ip_vs_sh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c

new file mode 100644 (file)

index 0000000..de5e7e1
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -0,0 +1,942 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the NetFilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * ip_vs_sync:  sync connection info from master load balancer to backups
+ *              through multicast
+ *
+ * Changes:
+ *     Alexandre Cassen        :       Added master & backup support at a time.
+ *     Alexandre Cassen        :       Added SyncID support for incoming sync
+ *                                     messages filtering.
+ *     Justin Ossevoort        :       Fix endian problem on sync message size.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/inetdevice.h>
+#include <linux/net.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/igmp.h>                 /* for ip_mc_join_group */
+#include <linux/udp.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/kernel.h>
+
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <net/ip_vs.h>
+
+#define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
+#define IP_VS_SYNC_PORT  8848          /* multicast port */
+
+
+/*
+ *     IPVS sync connection entry
+ *
+ *     Fixed-size part of one connection record inside a sync message.
+ *     Ports and addresses are copied unchanged from struct ip_vs_conn
+ *     (IPv4 only); flags and state are stored with htons() by the
+ *     sender and read back with ntohs() by the receiver.
+ */
+struct ip_vs_sync_conn {
+       __u8                    reserved;
+
+       /* Protocol, addresses and port numbers */
+       __u8                    protocol;       /* Which protocol (TCP/UDP) */
+       __be16                  cport;
+       __be16                  vport;
+       __be16                  dport;
+       __be32                  caddr;          /* client address */
+       __be32                  vaddr;          /* virtual address */
+       __be32                  daddr;          /* destination address */
+
+       /* Flags and state transition */
+       __be16                  flags;          /* status flags */
+       __be16                  state;          /* state info */
+
+       /* The sequence options start here */
+};
+
+/*
+ *     Optional trailer of a connection entry.  It is present only when
+ *     the entry's flags have a bit of IP_VS_CONN_F_SEQ_MASK set, and it
+ *     is copied to/from the connection starting at cp->in_seq.
+ */
+struct ip_vs_sync_conn_options {
+       struct ip_vs_seq        in_seq;         /* incoming seq. struct */
+       struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
+};
+
+/* A socket paired with a buffer pointer; users are outside this excerpt. */
+struct ip_vs_sync_thread_data {
+       struct socket *sock;
+       char *buf;
+};
+
+/* Size of a connection entry without / with the sequence options. */
+#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn))
+#define FULL_CONN_SIZE  \
+(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
+
+
+/*
+  The master multicasts messages to the backup load balancers in the
+  following format.
+
+       0                   1                   2                   3
+       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |  Count Conns  |    SyncID     |            Size               |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                                                               |
+      |                    IPVS Sync Connection (1)                   |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                            .                                  |
+      |                            .                                  |
+      |                            .                                  |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                                                               |
+      |                    IPVS Sync Connection (n)                   |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+
+/* Length in bytes of struct ip_vs_sync_mesg, the sync message header. */
+#define SYNC_MESG_HEADER_LEN   4
+#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
+
+/*
+ *     Sync message header; the connection entries follow it directly.
+ */
+struct ip_vs_sync_mesg {
+       __u8                    nr_conns;       /* number of entries that follow */
+       __u8                    syncid;         /* sender's sync ID */
+       /*
+        * Total message length in bytes, header included.  The receiver
+        * converts it with ntohs() and requires it to equal the received
+        * length.
+        */
+       __u16                   size;
+
+       /* ip_vs_sync_conn entries start here */
+};
+
+/* the maximum length of sync (sending/receiving) message */
+static int sync_send_mesg_maxlen;
+static int sync_recv_mesg_maxlen;
+
+struct ip_vs_sync_buff {
+       struct list_head        list;
+       unsigned long           firstuse;
+
+       /* pointers for the message data */
+       struct ip_vs_sync_mesg  *mesg;
+       unsigned char           *head;
+       unsigned char           *end;
+};
+
+
+/* the sync_buff list head and the lock */
+static LIST_HEAD(ip_vs_sync_queue);
+static DEFINE_SPINLOCK(ip_vs_sync_lock);
+
+/* current sync_buff for accepting new conn entries */
+static struct ip_vs_sync_buff   *curr_sb = NULL;
+static DEFINE_SPINLOCK(curr_sb_lock);
+
+/* ipvs sync daemon state */
+volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
+volatile int ip_vs_master_syncid = 0;
+volatile int ip_vs_backup_syncid = 0;
+
+/* multicast interface name */
+char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+
+/* sync daemon tasks */
+static struct task_struct *sync_master_thread;
+static struct task_struct *sync_backup_thread;
+
+/* multicast addr */
+static struct sockaddr_in mcast_addr = {
+       .sin_family             = AF_INET,
+       .sin_port               = __constant_htons(IP_VS_SYNC_PORT),
+       .sin_addr.s_addr        = __constant_htonl(IP_VS_SYNC_GROUP),
+};
+
+
+/*
+ *     Detach and return the oldest buffer on ip_vs_sync_queue, or NULL
+ *     if the queue is empty.  The caller owns the returned buffer.
+ */
+static inline struct ip_vs_sync_buff *sb_dequeue(void)
+{
+       struct ip_vs_sync_buff *sb = NULL;
+
+       spin_lock_bh(&ip_vs_sync_lock);
+       if (!list_empty(&ip_vs_sync_queue)) {
+               sb = list_entry(ip_vs_sync_queue.next,
+                               struct ip_vs_sync_buff,
+                               list);
+               list_del(&sb->list);
+       }
+       spin_unlock_bh(&ip_vs_sync_lock);
+
+       return sb;
+}
+
+/*
+ *     Allocate an empty sync buffer whose message area is
+ *     sync_send_mesg_maxlen bytes long.
+ *
+ *     The header is initialised for zero connections, head points just
+ *     past the header and end just past the message area.  Both
+ *     allocations use GFP_ATOMIC.  Returns NULL if either fails.  The
+ *     message area beyond the header is not cleared.
+ */
+static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
+{
+       struct ip_vs_sync_buff *sb;
+
+       sb = kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC);
+       if (!sb)
+               return NULL;
+
+       sb->mesg = kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC);
+       if (!sb->mesg) {
+               kfree(sb);
+               return NULL;
+       }
+
+       sb->mesg->nr_conns = 0;
+       sb->mesg->syncid = ip_vs_master_syncid;
+       /* use the named header length rather than a bare 4 */
+       sb->mesg->size = SYNC_MESG_HEADER_LEN;
+       sb->head = (unsigned char *)sb->mesg + SYNC_MESG_HEADER_LEN;
+       sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+       sb->firstuse = jiffies;
+       return sb;
+}
+
+/* Free a sync buffer: first its message area, then the descriptor. */
+static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
+{
+       kfree(sb->mesg);
+       kfree(sb);
+}
+
+/*
+ *     Hand a filled buffer over for sending: append it to
+ *     ip_vs_sync_queue while the master state bit is set, otherwise
+ *     free it.  Either way the caller no longer owns @sb.
+ *
+ *     NOTE(review): this takes ip_vs_sync_lock with plain spin_lock()
+ *     while sb_dequeue() uses spin_lock_bh(); presumably callers run
+ *     with bottom halves already disabled -- confirm.
+ */
+static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
+{
+       spin_lock(&ip_vs_sync_lock);
+       if (ip_vs_sync_state & IP_VS_STATE_MASTER)
+               list_add_tail(&sb->list, &ip_vs_sync_queue);
+       else
+               ip_vs_sync_buff_release(sb);
+       spin_unlock(&ip_vs_sync_lock);
+}
+
+/*
+ *     Get the current sync buffer if it has been created for more
+ *     than the specified time or the specified time is zero.
+ *
+ *     @time is a duration in jiffies.  On success curr_sb is detached
+ *     (set to NULL) and ownership of the buffer passes to the caller;
+ *     otherwise NULL is returned and curr_sb is left untouched.
+ */
+static inline struct ip_vs_sync_buff *
+get_curr_sync_buff(unsigned long time)
+{
+       struct ip_vs_sync_buff *sb;
+
+       spin_lock_bh(&curr_sb_lock);
+       /*
+        * The buffer qualifies once its age (jiffies - firstuse) has
+        * reached @time.  The previous time_before() test was inverted:
+        * it handed out only buffers younger than @time.
+        */
+       if (curr_sb && (time == 0 ||
+                       time_after_eq(jiffies - curr_sb->firstuse, time))) {
+               sb = curr_sb;
+               curr_sb = NULL;
+       } else
+               sb = NULL;
+       spin_unlock_bh(&curr_sb_lock);
+       return sb;
+}
+
+
+/*
+ *      Add an ip_vs_conn information into the current sync_buff.
+ *      Called by ip_vs_in.
+ *
+ *      A new current buffer is created on demand; if that fails the
+ *      connection is not synced and an error is logged.  When the
+ *      buffer cannot hold another full-size entry it is passed to
+ *      sb_queue_tail().  If @cp has a controlling connection, that
+ *      connection is synced as well.
+ */
+void ip_vs_sync_conn(struct ip_vs_conn *cp)
+{
+       struct ip_vs_sync_mesg *m;
+       struct ip_vs_sync_conn *s;
+       int len;
+
+       spin_lock(&curr_sb_lock);
+       if (!curr_sb) {
+               if (!(curr_sb=ip_vs_sync_buff_create())) {
+                       spin_unlock(&curr_sb_lock);
+                       IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
+                       return;
+               }
+       }
+
+       len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
+               SIMPLE_CONN_SIZE;
+       m = curr_sb->mesg;
+       s = (struct ip_vs_sync_conn *)curr_sb->head;
+
+       /* copy members */
+       /*
+        * The message area comes from kmalloc() and is not cleared, so
+        * every byte of the entry must be written; otherwise stale heap
+        * contents would go out in the sync message.
+        */
+       s->reserved = 0;
+       s->protocol = cp->protocol;
+       s->cport = cp->cport;
+       s->vport = cp->vport;
+       s->dport = cp->dport;
+       s->caddr = cp->caddr.ip;
+       s->vaddr = cp->vaddr.ip;
+       s->daddr = cp->daddr.ip;
+       /* the HASHED bit is never put on the wire */
+       s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
+       s->state = htons(cp->state);
+       if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+               /* the sequence options sit directly behind the entry */
+               struct ip_vs_sync_conn_options *opt =
+                       (struct ip_vs_sync_conn_options *)&s[1];
+               memcpy(opt, &cp->in_seq, sizeof(*opt));
+       }
+
+       m->nr_conns++;
+       m->size += len;
+       curr_sb->head += len;
+
+       /* check if there is a space for next one */
+       if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
+               sb_queue_tail(curr_sb);
+               curr_sb = NULL;
+       }
+       spin_unlock(&curr_sb_lock);
+
+       /* synchronize its controller if it has */
+       if (cp->control)
+               ip_vs_sync_conn(cp->control);
+}
+
+
+/*
+ *      Process received multicast message and create the corresponding
+ *      ip_vs_conn entries.
+ *
+ *      @buffer holds one sync message of @buflen bytes.  The message is
+ *      dropped when it is shorter than the header, when its size field
+ *      does not equal @buflen, or when a backup syncid is configured
+ *      and the message carries a different one.  Parsing stops at the
+ *      first entry that would run past the end of the buffer.
+ *
+ *      Note that the size field is converted to host byte order in
+ *      place, i.e. the function writes to @buffer despite the const
+ *      qualifier.
+ */
+static void ip_vs_process_message(const char *buffer, const size_t buflen)
+{
+       struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
+       struct ip_vs_sync_conn *s;
+       struct ip_vs_sync_conn_options *opt;
+       struct ip_vs_conn *cp;
+       struct ip_vs_protocol *pp;
+       struct ip_vs_dest *dest;
+       char *p;
+       int i;
+
+       if (buflen < sizeof(struct ip_vs_sync_mesg)) {
+               IP_VS_ERR_RL("sync message header too short\n");
+               return;
+       }
+
+       /* Convert size back to host byte order */
+       m->size = ntohs(m->size);
+
+       if (buflen != m->size) {
+               IP_VS_ERR_RL("bogus sync message size\n");
+               return;
+       }
+
+       /* SyncID sanity check */
+       if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
+               IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
+                         m->syncid);
+               return;
+       }
+
+       /* p walks the entries; it always points at the next unread one */
+       p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
+       for (i=0; i<m->nr_conns; i++) {
+               unsigned flags, state;
+
+               if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
+                       IP_VS_ERR_RL("bogus conn in sync message\n");
+                       return;
+               }
+               s = (struct ip_vs_sync_conn *) p;
+               /* mark as synced; never take the HASHED bit from the wire */
+               flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
+               flags &= ~IP_VS_CONN_F_HASHED;
+               if (flags & IP_VS_CONN_F_SEQ_MASK) {
+                       opt = (struct ip_vs_sync_conn_options *)&s[1];
+                       p += FULL_CONN_SIZE;
+                       if (p > buffer+buflen) {
+                               IP_VS_ERR_RL("bogus conn options in sync message\n");
+                               return;
+                       }
+               } else {
+                       opt = NULL;
+                       p += SIMPLE_CONN_SIZE;
+               }
+
+               /*
+                * Validate the state.  p has already been advanced, so
+                * "continue" below skips just this entry.
+                */
+               state = ntohs(s->state);
+               if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+                       pp = ip_vs_proto_get(s->protocol);
+                       if (!pp) {
+                               IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
+                                       s->protocol);
+                               continue;
+                       }
+                       if (state >= pp->num_states) {
+                               IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
+                                       pp->name, state);
+                               continue;
+                       }
+               } else {
+                       /* protocol in templates is not used for state/timeout */
+                       pp = NULL;
+                       if (state > 0) {
+                               IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
+                                       state);
+                               state = 0;
+                       }
+               }
+
+               /* look for an existing connection or template */
+               if (!(flags & IP_VS_CONN_F_TEMPLATE))
+                       cp = ip_vs_conn_in_get(AF_INET, s->protocol,
+                                              (union nf_inet_addr *)&s->caddr,
+                                              s->cport,
+                                              (union nf_inet_addr *)&s->vaddr,
+                                              s->vport);
+               else
+                       cp = ip_vs_ct_in_get(AF_INET, s->protocol,
+                                            (union nf_inet_addr *)&s->caddr,
+                                            s->cport,
+                                            (union nf_inet_addr *)&s->vaddr,
+                                            s->vport);
+               if (!cp) {
+                       /*
+                        * Find the appropriate destination for the connection.
+                        * If it is not found the connection will remain unbound
+                        * but still handled.
+                        */
+                       dest = ip_vs_find_dest(AF_INET,
+                                              (union nf_inet_addr *)&s->daddr,
+                                              s->dport,
+                                              (union nf_inet_addr *)&s->vaddr,
+                                              s->vport,
+                                              s->protocol);
+                       /*  Set the appropriate activity flag */
+                       if (s->protocol == IPPROTO_TCP) {
+                               if (state != IP_VS_TCP_S_ESTABLISHED)
+                                       flags |= IP_VS_CONN_F_INACTIVE;
+                               else
+                                       flags &= ~IP_VS_CONN_F_INACTIVE;
+                       }
+                       cp = ip_vs_conn_new(AF_INET, s->protocol,
+                                           (union nf_inet_addr *)&s->caddr,
+                                           s->cport,
+                                           (union nf_inet_addr *)&s->vaddr,
+                                           s->vport,
+                                           (union nf_inet_addr *)&s->daddr,
+                                           s->dport,
+                                           flags, dest);
+                       /*
+                        * NOTE(review): presumably ip_vs_find_dest() returned
+                        * dest with an extra reference, released here -- confirm.
+                        */
+                       if (dest)
+                               atomic_dec(&dest->refcnt);
+                       if (!cp) {
+                               IP_VS_ERR("ip_vs_conn_new failed\n");
+                               return;
+                       }
+               } else if (!cp->dest) {
+                       /* existing but unbound connection: try to bind it now */
+                       dest = ip_vs_try_bind_dest(cp);
+                       if (dest)
+                               atomic_dec(&dest->refcnt);
+               } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
+                          (cp->state != state)) {
+                       /* update active/inactive flag for the connection */
+                       dest = cp->dest;
+                       if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+                               (state != IP_VS_TCP_S_ESTABLISHED)) {
+                               atomic_dec(&dest->activeconns);
+                               atomic_inc(&dest->inactconns);
+                               cp->flags |= IP_VS_CONN_F_INACTIVE;
+                       } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+                               (state == IP_VS_TCP_S_ESTABLISHED)) {
+                               atomic_inc(&dest->activeconns);
+                               atomic_dec(&dest->inactconns);
+                               cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+                       }
+               }
+
+               /* copy the sequence options, if any, starting at in_seq */
+               if (opt)
+                       memcpy(&cp->in_seq, opt, sizeof(*opt));
+               atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
+               cp->state = state;
+               cp->old_state = cp->state;
+               /*
+                * We can not recover the right timeout for templates
+                * in all cases, we can not find the right fwmark
+                * virtual service. If needed, we can do it for
+                * non-fwmark persistent services.
+                */
+               /* pp is non-NULL here whenever the TEMPLATE flag is clear */
+               if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
+                       cp->timeout = pp->timeout_table[state];
+               else
+                       cp->timeout = (3*60*HZ);
+               ip_vs_conn_put(cp);
+       }
+}
+
+
+/*
+ *      Enable (non-zero @loop) or disable (zero) loopback of outgoing
+ *      multicasts on a sending socket.
+ */
+static void set_mcast_loop(struct sock *sk, u_char loop)
+{
+       /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
+       lock_sock(sk);
+       inet_sk(sk)->mc_loop = !!loop;
+       release_sock(sk);
+}
+
+/*
+ *      Set the TTL used for outgoing multicasts on a sending socket.
+ */
+static void set_mcast_ttl(struct sock *sk, u_char ttl)
+{
+       /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
+       lock_sock(sk);
+       inet_sk(sk)->mc_ttl = ttl;
+       release_sock(sk);
+}
+
+/*
+ *      Specify the default interface for outgoing multicasts.
+ *
+ *      Returns 0 on success, -ENODEV if @ifname does not name a device
+ *      in init_net, or -EINVAL if the socket is already bound to a
+ *      different device.
+ */
+static int set_mcast_if(struct sock *sk, char *ifname)
+{
+       struct net_device *dev = __dev_get_by_name(&init_net, ifname);
+
+       if (dev == NULL)
+               return -ENODEV;
+
+       if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dev->ifindex)
+               return -EINVAL;
+
+       lock_sock(sk);
+       inet_sk(sk)->mc_index = dev->ifindex;
+       /*  inet->mc_addr  = 0; */
+       release_sock(sk);
+
+       return 0;
+}
+
+
+/*
+ *     Set the maximum length of sync message according to the
+ *     specified interface's MTU.
+ *
+ *     For IP_VS_STATE_MASTER this updates sync_send_mesg_maxlen from
+ *     the master multicast interface; for IP_VS_STATE_BACKUP it
+ *     updates sync_recv_mesg_maxlen from the backup interface.  Any
+ *     other @sync_state changes nothing and returns 0.
+ *
+ *     Returns 0 on success or -ENODEV if the interface does not exist.
+ */
+static int set_sync_mesg_maxlen(int sync_state)
+{
+       struct net_device *dev;
+       int num;
+
+       if (sync_state == IP_VS_STATE_MASTER) {
+               if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
+                       return -ENODEV;
+
+               /*
+                * Number of simple entries that fit into one packet.
+                * NOTE(review): the extra 20 bytes look like reserved
+                * headroom -- confirm what they account for.
+                */
+               num = (dev->mtu - sizeof(struct iphdr) -
+                      sizeof(struct udphdr) -
+                      SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
+               /* cap the count at what the 8-bit nr_conns field can hold */
+               sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+                       SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
+               IP_VS_DBG(7, "setting the maximum length of sync sending "
+                         "message %d.\n", sync_send_mesg_maxlen);
+       } else if (sync_state == IP_VS_STATE_BACKUP) {
+               if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
+                       return -ENODEV;
+
+               /* largest UDP payload that fits into the interface MTU */
+               sync_recv_mesg_maxlen = dev->mtu -
+                       sizeof(struct iphdr) - sizeof(struct udphdr);
+               IP_VS_DBG(7, "setting the maximum length of sync receiving "
+                         "message %d.\n", sync_recv_mesg_maxlen);
+       }
+
+       return 0;
+}
+
+
+/*
+ *      Join a multicast group.
+ *      the group is specified by a class D multicast address 224.0.0.0/4
+ *      in the in_addr structure passed in as a parameter.
+ *
+ *      Returns -ENODEV if @ifname does not name a device in init_net,
+ *      -EINVAL if the socket is bound to a different device, otherwise
+ *      the result of ip_mc_join_group().
+ */
+static int
+join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+{
+       struct ip_mreqn mreq;
+       struct net_device *dev;
+       int ret;
+
+       memset(&mreq, 0, sizeof(mreq));
+       memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
+
+       if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+               return -ENODEV;
+       if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+               return -EINVAL;
+
+       /* select the interface by index; imr_address stays zeroed */
+       mreq.imr_ifindex = dev->ifindex;
+
+       lock_sock(sk);
+       ret = ip_mc_join_group(sk, &mreq);
+       release_sock(sk);
+
+       return ret;
+}
+
+
+static int bind_mcastif_addr(struct socket *sock, char *ifname)
+{
+       struct net_device *dev;
+       __be32 addr;
+       struct sockaddr_in sin;
+
+       if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+               return -ENODEV;
+
+       addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+       if (!addr)
+               IP_VS_ERR("You probably need to specify IP address on "
+                         "multicast interface.\n");
+
+       IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
+                 ifname, NIPQUAD(addr));
+
+       /* Bind to that address; a zero addr is only logged above, not fatal */
+       sin.sin_family       = AF_INET;
+       sin.sin_addr.s_addr  = addr;
+       sin.sin_port         = 0;
+
+       return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
+}
+
+/*
+ *      Set up sending multicast socket over UDP
+ */
+static struct socket * make_send_sock(void)
+{
+       struct socket *sock;
+       int result;
+
+       /* First create a socket */
+       result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+       if (result < 0) {
+               IP_VS_ERR("Error during creation of socket; terminating\n");
+               return ERR_PTR(result);
+       }
+
+       result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
+       if (result < 0) {
+               IP_VS_ERR("Error setting outbound mcast interface\n");
+               goto error;
+       }
+
+       set_mcast_loop(sock->sk, 0);
+       set_mcast_ttl(sock->sk, 1);
+
+       result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
+       if (result < 0) {
+               IP_VS_ERR("Error binding address of the mcast interface\n");
+               goto error;
+       }
+
+       result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
+                       sizeof(struct sockaddr), 0);
+       if (result < 0) {
+               IP_VS_ERR("Error connecting to the multicast addr\n");
+               goto error;
+       }
+
+       return sock;
+
+  error:
+       sock_release(sock);
+       return ERR_PTR(result);
+}
+
+
+/*
+ *      Set up receiving multicast socket over UDP
+ */
+static struct socket * make_receive_sock(void)
+{
+       struct socket *sock;
+       int result;
+
+       /* First create a socket */
+       result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+       if (result < 0) {
+               IP_VS_ERR("Error during creation of socket; terminating\n");
+               return ERR_PTR(result);
+       }
+
+       /* it is equivalent to the REUSEADDR option in user-space */
+       sock->sk->sk_reuse = 1;
+
+       result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
+                       sizeof(struct sockaddr));
+       if (result < 0) {
+               IP_VS_ERR("Error binding to the multicast addr\n");
+               goto error;
+       }
+
+       /* join the multicast group */
+       result = join_mcast_group(sock->sk,
+                       (struct in_addr *) &mcast_addr.sin_addr,
+                       ip_vs_backup_mcast_ifn);
+       if (result < 0) {
+               IP_VS_ERR("Error joining to the multicast group\n");
+               goto error;
+       }
+
+       return sock;
+
+  error:
+       sock_release(sock);
+       return ERR_PTR(result);
+}
+
+
+static int
+ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
+{
+       struct msghdr   msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
+       struct kvec     iov;
+       int             len;
+
+       EnterFunction(7);
+       iov.iov_base     = (void *)buffer;
+       iov.iov_len      = length;
+
+       len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
+
+       LeaveFunction(7);
+       return len;
+}
+
+static void
+ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
+{
+       int msize;
+
+       msize = msg->size;
+
+       /* Put size in network byte order */
+       msg->size = htons(msg->size);
+
+       if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
+               IP_VS_ERR("ip_vs_send_async error\n");
+}
+
+/* Receive one datagram into buffer; returns its length or a negative errno */
+static int
+ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
+{
+       struct msghdr           msg = {NULL,};
+       struct kvec             iov;
+       int                     len;
+
+       EnterFunction(7);
+
+       /* Receive a packet */
+       iov.iov_base     = buffer;
+       iov.iov_len      = (size_t)buflen;
+
+       len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
+
+       /* A negative len is kernel_recvmsg()'s error code: hand it back
+          unchanged instead of -1, and always reach LeaveFunction() */
+       LeaveFunction(7);
+       return len;
+}
+
+
+static int sync_thread_master(void *data)
+{
+       struct ip_vs_sync_thread_data *tinfo = data;
+       struct ip_vs_sync_buff *sb;
+
+       IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
+                  "syncid = %d\n",
+                  ip_vs_master_mcast_ifn, ip_vs_master_syncid);
+
+       while (!kthread_should_stop()) {
+               while ((sb = sb_dequeue())) {
+                       ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
+                       ip_vs_sync_buff_release(sb);
+               }
+
+               /* check if entries stay in curr_sb for 2 seconds */
+               sb = get_curr_sync_buff(2 * HZ);
+               if (sb) {
+                       ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
+                       ip_vs_sync_buff_release(sb);
+               }
+
+               schedule_timeout_interruptible(HZ);
+       }
+
+       /* clean up the sync_buff queue */
+       while ((sb=sb_dequeue())) {
+               ip_vs_sync_buff_release(sb);
+       }
+
+       /* clean up the current sync_buff */
+       if ((sb = get_curr_sync_buff(0))) {
+               ip_vs_sync_buff_release(sb);
+       }
+
+       /* release the sending multicast socket */
+       sock_release(tinfo->sock);
+       kfree(tinfo);
+
+       return 0;
+}
+
+
+static int sync_thread_backup(void *data)
+{
+       struct ip_vs_sync_thread_data *tinfo = data;
+       int len;
+
+       IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
+                  "syncid = %d\n",
+                  ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
+
+       while (!kthread_should_stop()) {
+               wait_event_interruptible(*tinfo->sock->sk->sk_sleep,
+                        !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
+                        || kthread_should_stop());
+
+               /* drain every datagram that is queued right now */
+               while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
+                       len = ip_vs_receive(tinfo->sock, tinfo->buf,
+                                       sync_recv_mesg_maxlen);
+                       if (len <= 0) {
+                               IP_VS_ERR("receiving message error\n");
+                               break;
+                       }
+
+                       /* disable bottom half, because it accesses the data
+                          shared by softirq while getting/creating conns */
+                       local_bh_disable();
+                       ip_vs_process_message(tinfo->buf, len);
+                       local_bh_enable();
+               }
+       }
+
+       /* release the receiving multicast socket and the receive buffer */
+       sock_release(tinfo->sock);
+       kfree(tinfo->buf);
+       kfree(tinfo);
+
+       return 0;
+}
+
+
+/* Start the master or backup sync daemon on mcast_ifn; returns 0, -EEXIST if
+   it already runs, -EINVAL for an unknown state, or a setup error code. */
+int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
+{
+       struct ip_vs_sync_thread_data *tinfo;
+       struct task_struct **realtask, *task;
+       struct socket *sock;
+       char *name, *buf = NULL;
+       int (*threadfn)(void *data);
+       int result;
+
+       IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
+       IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
+                 sizeof(struct ip_vs_sync_conn));
+
+       if (state == IP_VS_STATE_MASTER) {
+               if (sync_master_thread)
+                       return -EEXIST;
+
+               strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
+                       sizeof(ip_vs_master_mcast_ifn));
+               ip_vs_master_syncid = syncid;
+               realtask = &sync_master_thread;
+               name = "ipvs_syncmaster";
+               threadfn = sync_thread_master;
+               sock = make_send_sock();
+       } else if (state == IP_VS_STATE_BACKUP) {
+               if (sync_backup_thread)
+                       return -EEXIST;
+
+               strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
+                       sizeof(ip_vs_backup_mcast_ifn));
+               ip_vs_backup_syncid = syncid;
+               realtask = &sync_backup_thread;
+               name = "ipvs_syncbackup";
+               threadfn = sync_thread_backup;
+               sock = make_receive_sock();
+       } else {
+               return -EINVAL;
+       }
+
+       if (IS_ERR(sock))
+               return PTR_ERR(sock);
+
+       /* do not size the buffers from a stale length if the lookup fails */
+       result = set_sync_mesg_maxlen(state);
+       if (result < 0)
+               goto outsocket;
+
+       result = -ENOMEM;
+       if (state == IP_VS_STATE_BACKUP) {
+               buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
+               if (!buf)
+                       goto outsocket;
+       }
+
+       tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+       if (!tinfo)
+               goto outbuf;
+       tinfo->sock = sock;
+       tinfo->buf = buf;
+
+       task = kthread_run(threadfn, tinfo, name);
+       if (IS_ERR(task)) {
+               result = PTR_ERR(task);
+               goto outtinfo;
+       }
+
+       /* mark as active and increase the module use count */
+       *realtask = task;
+       ip_vs_sync_state |= state;
+       ip_vs_use_count_inc();
+       return 0;
+
+outtinfo:
+       kfree(tinfo);
+outbuf:
+       kfree(buf);
+outsocket:
+       sock_release(sock);
+       return result;
+}
+
+
+int stop_sync_thread(int state)
+{
+       IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
+
+       if (state == IP_VS_STATE_MASTER) {
+               if (!sync_master_thread)
+                       return -ESRCH;
+
+               IP_VS_INFO("stopping master sync thread %d ...\n",
+                          task_pid_nr(sync_master_thread));
+
+               /*
+                * The lock synchronizes with sb_queue_tail(), so that we don't
+                * add sync buffers to the queue, when we are already in
+                * progress of stopping the master sync daemon.
+                */
+
+               spin_lock_bh(&ip_vs_sync_lock);
+               ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
+               spin_unlock_bh(&ip_vs_sync_lock);
+               kthread_stop(sync_master_thread);
+               sync_master_thread = NULL;
+       } else if (state == IP_VS_STATE_BACKUP) {
+               if (!sync_backup_thread)
+                       return -ESRCH;
+
+               IP_VS_INFO("stopping backup sync thread %d ...\n",
+                          task_pid_nr(sync_backup_thread));
+
+               ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
+               kthread_stop(sync_backup_thread);
+               sync_backup_thread = NULL;
+       } else {
+               return -EINVAL;
+       }
+
+       /* decrease the module use count */
+       ip_vs_use_count_dec();
+
+       return 0;
+}
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c

new file mode 100644 (file)

index 0000000..8c596e7
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -0,0 +1,128 @@
+/*
+ * IPVS:        Weighted Least-Connection Scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Wensong Zhang            :     changed the ip_vs_wlc_schedule to return dest
+ *     Wensong Zhang            :     changed to use the inactconns in scheduling
+ *     Wensong Zhang            :     changed some cosmetic things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_wlc_update_svc
+ *     Wensong Zhang            :     added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline unsigned int
+ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
+{
+       /*
+        * We think the overhead of processing active connections is 256
+        * times higher than that of inactive connections in average. (This
+        * 256 times might not be accurate, we will change it later) We
+        * use the following formula to estimate the overhead now:
+        *                dest->activeconns*256 + dest->inactconns
+        */
+       return (atomic_read(&dest->activeconns) << 8) +
+               atomic_read(&dest->inactconns);
+}
+
+
+/*
+ *     Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+       struct ip_vs_dest *dest, *least;
+       unsigned int loh, doh;
+
+       IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
+
+       /*
+        * We calculate the load of each dest server as follows:
+        *                (dest overhead) / dest->weight
+        *
+        * Remember -- no floats in kernel mode!!!
+        * The comparison of h1*w2 > h2*w1 is equivalent to that of
+        *                h1/w1 > h2/w2
+        * if every weight is larger than zero.
+        *
+        * The server with weight=0 is quiesced and will not receive any
+        * new connections.
+        */
+
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+               if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+                   atomic_read(&dest->weight) > 0) {
+                       least = dest;
+                       loh = ip_vs_wlc_dest_overhead(least);
+                       goto nextstage;
+               }
+       }
+       return NULL;
+
+       /* Find the least loaded dest.  A weight of 0 zeroes the left-hand
+        * product, so a quiesced dest never replaces the current candidate.
+        */
+  nextstage:
+       list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+               if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+                       continue;
+               doh = ip_vs_wlc_dest_overhead(dest);
+               if (loh * atomic_read(&dest->weight) >
+                   doh * atomic_read(&least->weight)) {
+                       least = dest;
+                       loh = doh;
+               }
+       }
+
+       IP_VS_DBG_BUF(6, "WLC: server %s:%u "
+                     "activeconns %d refcnt %d weight %d overhead %d\n",
+                     IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+                     atomic_read(&least->activeconns),
+                     atomic_read(&least->refcnt),
+                     atomic_read(&least->weight), loh);
+
+       return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wlc_scheduler =
+{
+       .name =                 "wlc",
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       .n_list =               LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+       .supports_ipv6 =        1,
+#endif
+       .schedule =             ip_vs_wlc_schedule,
+};
+
+
+static int __init ip_vs_wlc_init(void)
+{
+       return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+static void __exit ip_vs_wlc_cleanup(void)
+{
+       unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+module_init(ip_vs_wlc_init);
+module_exit(ip_vs_wlc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c

new file mode 100644 (file)

index 0000000..7ea92fe
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -0,0 +1,237 @@
+/*
+ * IPVS:        Weighted Round-Robin Scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Wensong Zhang            :     changed the ip_vs_wrr_schedule to return dest
+ *     Wensong Zhang            :     changed some cosmetic things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_wrr_update_svc
+ *     Julian Anastasov         :     fixed the bug of returning destination
+ *                                    with weight 0 when all weights are zero
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * current destination pointer for weighted round-robin scheduling
+ */
+struct ip_vs_wrr_mark {
+       struct list_head *cl;   /* current list head */
+       int cw;                 /* current weight */
+       int mw;                 /* maximum weight */
+       int di;                 /* decreasing interval */
+};
+
+
+/*
+ *    Euclid's algorithm: greatest common divisor of a and b (b != 0)
+ */
+static int gcd(int a, int b)
+{
+       int rem;
+
+       for (rem = a % b; rem != 0; rem = a % b) {
+               a = b;
+               b = rem;
+       }
+       return b;
+}
+
+static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
+{
+       struct ip_vs_dest *dest;
+       int weight;
+       int g = 0;
+
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+               weight = atomic_read(&dest->weight);
+               if (weight > 0) {
+                       if (g > 0)
+                               g = gcd(weight, g);
+                       else
+                               g = weight;
+               }
+       }
+       return g ? g : 1;
+}
+
+
+/*
+ *    Get the maximum weight of the service destinations.
+ */
+static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
+{
+       struct ip_vs_dest *dest;
+       int weight = 0;
+
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+               if (atomic_read(&dest->weight) > weight)
+                       weight = atomic_read(&dest->weight);
+       }
+
+       return weight;
+}
+
+
+static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
+{
+       struct ip_vs_wrr_mark *mark;
+
+       /*
+        *    Allocate the mark variable for WRR scheduling
+        */
+       mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
+       if (mark == NULL) {
+               IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
+               return -ENOMEM;
+       }
+       mark->cl = &svc->destinations;
+       mark->cw = 0;
+       mark->mw = ip_vs_wrr_max_weight(svc);
+       mark->di = ip_vs_wrr_gcd_weight(svc);
+       svc->sched_data = mark;
+
+       return 0;
+}
+
+
+static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+{
+       /*
+        *    Release the mark variable
+        */
+       kfree(svc->sched_data);
+
+       return 0;
+}
+
+
+static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
+{
+       struct ip_vs_wrr_mark *mark = svc->sched_data;
+
+       mark->cl = &svc->destinations;
+       mark->mw = ip_vs_wrr_max_weight(svc);
+       mark->di = ip_vs_wrr_gcd_weight(svc);
+       if (mark->cw > mark->mw)
+               mark->cw = 0;
+       return 0;
+}
+
+
+/*
+ *    Weighted Round-Robin Scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+       struct ip_vs_dest *dest;
+       struct ip_vs_wrr_mark *mark = svc->sched_data;
+       struct list_head *p;
+
+       IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
+
+       /*
+        * This loop will always terminate, because mark->cw in (0, max_weight]
+        * and at least one server has its weight equal to max_weight.
+        */
+       write_lock(&svc->sched_lock);
+       p = mark->cl;
+       while (1) {
+               if (mark->cl == &svc->destinations) {
+                       /* it is at the head of the destination list */
+
+                       if (mark->cl == mark->cl->next) {
+                               /* no dest entry */
+                               dest = NULL;
+                               goto out;
+                       }
+
+                       mark->cl = svc->destinations.next;
+                       mark->cw -= mark->di;
+                       if (mark->cw <= 0) {
+                               mark->cw = mark->mw;
+                               /*
+                                * Still zero, which means no available servers.
+                                */
+                               if (mark->cw == 0) {
+                                       mark->cl = &svc->destinations;
+                                       IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
+                                                  "no available servers\n");
+                                       dest = NULL;
+                                       goto out;
+                               }
+                       }
+               } else
+                       mark->cl = mark->cl->next;
+
+               if (mark->cl != &svc->destinations) {
+                       /* not at the head of the list */
+                       dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
+                       if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+                           atomic_read(&dest->weight) >= mark->cw) {
+                               /* got it */
+                               break;
+                       }
+               }
+
+               if (mark->cl == p && mark->cw == mark->di) {
+                       /* back to the start, and no dest is found.
+                          It is only possible when all dests are OVERLOADED */
+                       dest = NULL;
+                       goto out;
+               }
+       }
+
+       IP_VS_DBG_BUF(6, "WRR: server %s:%u "
+                     "activeconns %d refcnt %d weight %d\n",
+                     IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
+                     atomic_read(&dest->activeconns),
+                     atomic_read(&dest->refcnt),
+                     atomic_read(&dest->weight));
+
+  out:
+       write_unlock(&svc->sched_lock);
+       return dest;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
+       .name =                 "wrr",
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       .n_list =               LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
+#ifdef CONFIG_IP_VS_IPV6
+       .supports_ipv6 =        1,
+#endif
+       .init_service =         ip_vs_wrr_init_svc,
+       .done_service =         ip_vs_wrr_done_svc,
+       .update_service =       ip_vs_wrr_update_svc,
+       .schedule =             ip_vs_wrr_schedule,
+};
+
+static int __init ip_vs_wrr_init(void)
+{
+       return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ;
+}
+
+static void __exit ip_vs_wrr_cleanup(void)
+{
+       unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+}
+
+module_init(ip_vs_wrr_init);
+module_exit(ip_vs_wrr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c

new file mode 100644 (file)

index 0000000..02ddc2b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -0,0 +1,1004 @@
+/*
+ * ip_vs_xmit.c: various packet transmitters for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/tcp.h>                  /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h>                    /* for csum_tcpudp_magic */
+#include <net/udp.h>
+#include <net/icmp.h>                   /* for icmp_send */
+#include <net/route.h>                  /* for ip_route_output */
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *      Destination cache to speed up outgoing route lookup
+ */
+static inline void
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+{
+       struct dst_entry *old_dst;
+
+       old_dst = dest->dst_cache;
+       dest->dst_cache = dst;
+       dest->dst_rtos = rtos;
+       dst_release(old_dst);
+}
+
+static inline struct dst_entry *       /* held cached dst, or NULL */
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+{
+       struct dst_entry *dst = dest->dst_cache;
+
+       if (!dst)
+               return NULL;
+       if ((dst->obsolete
+            || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
+           dst->ops->check(dst, cookie) == NULL) {
+               dest->dst_cache = NULL; /* rejected by ->check(): forget it */
+               dst_release(dst);
+               return NULL;
+       }
+       dst_hold(dst);          /* extra reference for the caller */
+       return dst;
+}
+
+static struct rtable *
+__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+{
+       struct rtable *rt;                      /* Route to the other host */
+       struct ip_vs_dest *dest = cp->dest;
+
+       if (dest) {
+               spin_lock(&dest->dst_lock);
+               if (!(rt = (struct rtable *)
+                     __ip_vs_dst_check(dest, rtos, 0))) {
+                       struct flowi fl = {
+                               .oif = 0,
+                               .nl_u = {
+                                       .ip4_u = {
+                                               .daddr = dest->addr.ip,
+                                               .saddr = 0,
+                                               .tos = rtos, } },
+                       };
+
+                       if (ip_route_output_key(&init_net, &rt, &fl)) {
+                               spin_unlock(&dest->dst_lock);
+                               IP_VS_DBG_RL("ip_route_output error, "
+                                            "dest: %u.%u.%u.%u\n",
+                                            NIPQUAD(dest->addr.ip));
+                               return NULL;
+                       }
+                       __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
+                       IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
+                                 NIPQUAD(dest->addr.ip),
+                                 atomic_read(&rt->u.dst.__refcnt), rtos);
+               }
+               spin_unlock(&dest->dst_lock);
+       } else {
+               struct flowi fl = {
+                       .oif = 0,
+                       .nl_u = {
+                               .ip4_u = {
+                                       .daddr = cp->daddr.ip,
+                                       .saddr = 0,
+                                       .tos = rtos, } },
+               };
+
+               if (ip_route_output_key(&init_net, &rt, &fl)) {
+                       IP_VS_DBG_RL("ip_route_output error, dest: "
+                                    "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip));
+                       return NULL;
+               }
+       }
+
+       return rt;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static struct rt6_info *
+__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+{
+       struct rt6_info *rt;                    /* Route to the other host */
+       struct ip_vs_dest *dest = cp->dest;
+
+       if (dest) {
+               spin_lock(&dest->dst_lock);
+               rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+               if (!rt) {
+                       struct flowi fl = {
+                               .oif = 0,
+                               .nl_u = {
+                                       .ip6_u = {
+                                               .daddr = dest->addr.in6,
+                                               .saddr = {
+                                                       .s6_addr32 =
+                                                               { 0, 0, 0, 0 },
+                                               },
+                                       },
+                               },
+                       };
+
+                       rt = (struct rt6_info *)ip6_route_output(&init_net,
+                                                                NULL, &fl);
+                       if (!rt) {
+                               spin_unlock(&dest->dst_lock);
+                               IP_VS_DBG_RL("ip6_route_output error, "
+                                            "dest: " NIP6_FMT "\n",
+                                            NIP6(dest->addr.in6));
+                               return NULL;
+                       }
+                       __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst));
+                       IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n",
+                                 NIP6(dest->addr.in6),
+                                 atomic_read(&rt->u.dst.__refcnt));
+               }
+               spin_unlock(&dest->dst_lock);
+       } else {
+               struct flowi fl = {
+                       .oif = 0,
+                       .nl_u = {
+                               .ip6_u = {
+                                       .daddr = cp->daddr.in6,
+                                       .saddr = {
+                                               .s6_addr32 = { 0, 0, 0, 0 },
+                                       },
+                               },
+                       },
+               };
+
+               rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+               if (!rt) {
+                       IP_VS_DBG_RL("ip6_route_output error, dest: "
+                                    NIP6_FMT "\n", NIP6(cp->daddr.in6));
+                       return NULL;
+               }
+       }
+
+       return rt;
+}
+#endif
+
+
+/*
+ *     Release dest->dst_cache before a dest is removed
+ */
+void
+ip_vs_dst_reset(struct ip_vs_dest *dest)
+{
+       struct dst_entry *old_dst;
+
+       old_dst = dest->dst_cache;
+       dest->dst_cache = NULL;
+       dst_release(old_dst);
+}
+
+#define IP_VS_XMIT(pf, skb, rt)                                \
+do {                                                   \
+       (skb)->ipvs_property = 1;                       \
+       skb_forward_csum(skb);                          \
+       NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,     \
+               (rt)->u.dst.dev, dst_output);           \
+} while (0)
+
+
+/*
+ *      NULL transmitter (do nothing except return NF_ACCEPT)
+ *
+ *      None of skb, cp or pp is read here; the function only exists so
+ *      that it has the same signature as the other transmitters.
+ */
+int
+ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+               struct ip_vs_protocol *pp)
+{
+       /* we do not touch skb and do not need pskb ptr */
+       return NF_ACCEPT;
+}
+
+
+/*
+ *      Bypass transmitter
+ *      Let packets bypass the destination when the destination is not
+ *      available, it may be only used in transparent cache cluster.
+ *
+ *      The route is looked up for the packet's own iph->daddr; cp and pp
+ *      are not referenced.  Every return path yields NF_STOLEN.
+ */
+int
+ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+                 struct ip_vs_protocol *pp)
+{
+       struct rtable *rt;                      /* Route to the other host */
+       struct iphdr  *iph = ip_hdr(skb);
+       u8     tos = iph->tos;
+       int    mtu;
+       struct flowi fl = {
+               .oif = 0,
+               .nl_u = {
+                       .ip4_u = {
+                               .daddr = iph->daddr,
+                               .saddr = 0,
+                               .tos = RT_TOS(tos), } },
+       };
+
+       EnterFunction(10);
+
+       if (ip_route_output_key(&init_net, &rt, &fl)) {
+               IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
+                            "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
+               goto tx_error_icmp;
+       }
+
+       /* MTU checking: only a DF packet longer than the route MTU is refused */
+       mtu = dst_mtu(&rt->u.dst);
+       if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
+               ip_rt_put(rt);  /* rt will not be attached to the skb */
+               icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+               IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
+               goto tx_error;
+       }
+
+       /*
+        * Call ip_send_check because we are not sure it is called
+        * after ip_defrag. Is copy-on-write needed?
+        */
+       if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
+               ip_rt_put(rt);
+               return NF_STOLEN;       /* skb is NULL here: nothing to free */
+       }
+       ip_send_check(ip_hdr(skb));
+
+       /* drop old route */
+       dst_release(skb->dst);
+       skb->dst = &rt->u.dst;
+
+       /* Another hack: avoid icmp_send in ip_fragment */
+       skb->local_df = 1;
+
+       IP_VS_XMIT(PF_INET, skb, rt);
+
+       LeaveFunction(10);
+       return NF_STOLEN;
+
+ tx_error_icmp: /* reached when the route lookup failed */
+       dst_link_failure(skb);
+ tx_error:      /* common drop path: free the skb */
+       kfree_skb(skb);
+       LeaveFunction(10);
+       return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+                    struct ip_vs_protocol *pp)
+{      /* IPv6 bypass transmitter: routes by the packet's own daddr; cp/pp unused */
+       struct rt6_info *rt;                    /* Route to the other host */
+       struct ipv6hdr  *iph = ipv6_hdr(skb);
+       int    mtu;
+       struct flowi fl = {
+               .oif = 0,
+               .nl_u = {
+                       .ip6_u = {
+                               .daddr = iph->daddr,
+                               .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
+       };
+
+       EnterFunction(10);
+
+       rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+       if (!rt) {
+               IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, "
+                            "dest: " NIP6_FMT "\n", NIP6(iph->daddr));
+               goto tx_error_icmp;
+       }
+
+       /* MTU checking: any skb longer than the route MTU is refused (no DF test) */
+       mtu = dst_mtu(&rt->u.dst);
+       if (skb->len > mtu) {
+               dst_release(&rt->u.dst);        /* rt will not be attached to the skb */
+               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+               IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n");
+               goto tx_error;
+       }
+
+       /*
+        * Call ip_send_check because we are not sure it is called
+        * after ip_defrag. Is copy-on-write needed?
+        *
+        * NOTE(review): this wording matches the IPv4 transmitter; no
+        * ip_send_check call follows here, only skb_share_check.
+        */
+       skb = skb_share_check(skb, GFP_ATOMIC);
+       if (unlikely(skb == NULL)) {
+               dst_release(&rt->u.dst);
+               return NF_STOLEN;       /* skb is NULL here: nothing to free */
+       }
+
+       /* drop old route */
+       dst_release(skb->dst);
+       skb->dst = &rt->u.dst;
+
+       /* Another hack: avoid icmp_send in ip_fragment */
+       skb->local_df = 1;
+
+       IP_VS_XMIT(PF_INET6, skb, rt);
+
+       LeaveFunction(10);
+       return NF_STOLEN;
+
+ tx_error_icmp: /* reached when the route lookup failed */
+       dst_link_failure(skb);
+ tx_error:      /* common drop path: free the skb */
+       kfree_skb(skb);
+       LeaveFunction(10);
+       return NF_STOLEN;
+}
+#endif
+
+/*
+ *      NAT transmitter (only for outside-to-inside nat forwarding)
+ *      Not used for related ICMP
+ *
+ *      Steps: optionally learn the client port, get the route for cp,
+ *      check the MTU, make the skb writable, run pp->dnat_handler (if
+ *      set), rewrite the destination address to cp->daddr.ip, refresh
+ *      the IP header checksum and send through IP_VS_XMIT.
+ *      Every return path yields NF_STOLEN.
+ */
+int
+ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+              struct ip_vs_protocol *pp)
+{
+       struct rtable *rt;              /* Route to the other host */
+       int mtu;
+       struct iphdr *iph = ip_hdr(skb);
+
+       EnterFunction(10);
+
+       /* check if it is a connection of no-client-port; if so, the first
+          16 bits behind the IP header are passed to ip_vs_conn_fill_cport */
+       if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+               __be16 _pt, *p;
+               p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
+               if (p == NULL)
+                       goto tx_error;
+               ip_vs_conn_fill_cport(cp, *p);
+               IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+       }
+
+       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+               goto tx_error_icmp;
+
+       /* MTU checking: only a DF packet longer than the route MTU is refused */
+       mtu = dst_mtu(&rt->u.dst);
+       if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
+               ip_rt_put(rt);
+               icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+               IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
+               goto tx_error;
+       }
+
+       /* copy-on-write the packet before mangling it */
+       if (!skb_make_writable(skb, sizeof(struct iphdr)))
+               goto tx_error_put;
+
+       if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+               goto tx_error_put;
+
+       /* drop old route */
+       dst_release(skb->dst);
+       skb->dst = &rt->u.dst;
+
+       /* mangle the packet; rt is attached to skb->dst from here on, so a
+          failure goes to tx_error rather than tx_error_put */
+       if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
+               goto tx_error;
+       ip_hdr(skb)->daddr = cp->daddr.ip;
+       ip_send_check(ip_hdr(skb));
+
+       IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+
+       /* FIXME: when application helper enlarges the packet and the length
+          is larger than the MTU of outgoing device, there will be still
+          MTU problem. */
+
+       /* Another hack: avoid icmp_send in ip_fragment */
+       skb->local_df = 1;
+
+       IP_VS_XMIT(PF_INET, skb, rt);
+
+       LeaveFunction(10);
+       return NF_STOLEN;
+
+  tx_error_icmp:        /* reached when no route was obtained */
+       dst_link_failure(skb);
+  tx_error:             /* common drop path: free the skb */
+       LeaveFunction(10);
+       kfree_skb(skb);
+       return NF_STOLEN;
+  tx_error_put:         /* rt obtained but not yet attached to the skb */
+       ip_rt_put(rt);
+       goto tx_error;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+                 struct ip_vs_protocol *pp)
+{      /* IPv6 NAT transmitter: rewrites daddr to cp->daddr.in6; always NF_STOLEN */
+       struct rt6_info *rt;            /* Route to the other host */
+       int mtu;
+
+       EnterFunction(10);
+
+       /* check if it is a connection of no-client-port; the port is read
+          at a fixed offset of sizeof(struct ipv6hdr) into the skb */
+       if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+               __be16 _pt, *p;
+               p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
+                                      sizeof(_pt), &_pt);
+               if (p == NULL)
+                       goto tx_error;
+               ip_vs_conn_fill_cport(cp, *p);
+               IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+       }
+
+       rt = __ip_vs_get_out_rt_v6(cp);
+       if (!rt)
+               goto tx_error_icmp;
+
+       /* MTU checking: any skb longer than the route MTU is refused (no DF test) */
+       mtu = dst_mtu(&rt->u.dst);
+       if (skb->len > mtu) {
+               dst_release(&rt->u.dst);
+               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+               IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+                                "ip_vs_nat_xmit_v6(): frag needed for");
+               goto tx_error;
+       }
+
+       /* copy-on-write the packet before mangling it */
+       if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+               goto tx_error_put;
+
+       if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+               goto tx_error_put;
+
+       /* drop old route */
+       dst_release(skb->dst);
+       skb->dst = &rt->u.dst;
+
+       /* mangle the packet; rt is attached to skb->dst from here on, so a
+          failure goes to tx_error rather than tx_error_put */
+       if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
+               goto tx_error;
+       ipv6_hdr(skb)->daddr = cp->daddr.in6;
+
+       IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+
+       /* FIXME: when application helper enlarges the packet and the length
+          is larger than the MTU of outgoing device, there will be still
+          MTU problem. */
+
+       /* Another hack: avoid icmp_send in ip_fragment */
+       skb->local_df = 1;
+
+       IP_VS_XMIT(PF_INET6, skb, rt);
+
+       LeaveFunction(10);
+       return NF_STOLEN;
+
+tx_error_icmp:         /* reached when no route was obtained */
+       dst_link_failure(skb);
+tx_error:              /* common drop path: free the skb */
+       LeaveFunction(10);
+       kfree_skb(skb);
+       return NF_STOLEN;
+tx_error_put:          /* rt obtained but not yet attached to the skb */
+       dst_release(&rt->u.dst);
+       goto tx_error;
+}
+#endif
+
+
+/*
+ *   IP Tunneling transmitter
+ *
+ *   This function encapsulates the packet in a new IP packet, its
+ *   destination will be set to cp->daddr. Most code of this function
+ *   is taken from ipip.c.
+ *
+ *   It is used in VS/TUN cluster. The load balancer selects a real
+ *   server from a cluster based on a scheduling algorithm,
+ *   encapsulates the request packet and forwards it to the selected
+ *   server. For example, all real servers are configured with
+ *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
+ *   the encapsulated packet, it will decapsulate the packet, process
+ *   the request and return the response packets directly to the client
+ *   without passing the load balancer. This can greatly increase the
+ *   scalability of virtual server.
+ *
+ *   Used for ANY protocol
+ *
+ *   Every return path yields NF_STOLEN; pp is not referenced.
+ */
+int
+ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+                 struct ip_vs_protocol *pp)
+{
+       struct rtable *rt;                      /* Route to the other host */
+       struct net_device *tdev;                /* Device to other host */
+       struct iphdr  *old_iph = ip_hdr(skb);
+       u8     tos = old_iph->tos;
+       __be16 df = old_iph->frag_off;
+       sk_buff_data_t old_transport_header = skb->transport_header;
+       struct iphdr  *iph;                     /* Our new IP header */
+       unsigned int max_headroom;              /* The extra header space needed */
+       int    mtu;
+
+       EnterFunction(10);
+
+       if (skb->protocol != htons(ETH_P_IP)) {
+               IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
+                            "ETH_P_IP: %d, skb protocol: %d\n",
+                            htons(ETH_P_IP), skb->protocol);
+               goto tx_error;
+       }
+
+       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+               goto tx_error_icmp;
+
+       tdev = rt->u.dst.dev;
+
+       mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); /* route MTU minus the outer header */
+       if (mtu < 68) { /* 68: presumably the IPv4 minimum MTU - confirm */
+               ip_rt_put(rt);
+               IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
+               goto tx_error;
+       }
+       if (skb->dst)
+               skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+       df |= (old_iph->frag_off & htons(IP_DF)); /* NOTE(review): df was
+                * initialised from the whole old frag_off above, so this OR
+                * adds no bits - confirm intent */
+
+       if ((old_iph->frag_off & htons(IP_DF))
+           && mtu < ntohs(old_iph->tot_len)) {
+               icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+               ip_rt_put(rt);
+               IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
+               goto tx_error;
+       }
+
+       /*
+        * Okay, now see if we can stuff it in the buffer as-is.
+        * If headroom is short, or the skb is cloned or shared, a copy
+        * with enough headroom replaces it.
+        */
+       max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
+
+       if (skb_headroom(skb) < max_headroom
+           || skb_cloned(skb) || skb_shared(skb)) {
+               struct sk_buff *new_skb =
+                       skb_realloc_headroom(skb, max_headroom);
+               if (!new_skb) {
+                       ip_rt_put(rt);
+                       kfree_skb(skb);
+                       IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
+                       return NF_STOLEN;
+               }
+               kfree_skb(skb);                 /* the copy takes over */
+               skb = new_skb;
+               old_iph = ip_hdr(skb);          /* re-read from the new buffer */
+       }
+
+       skb->transport_header = old_transport_header;
+
+       /* fix old IP header checksum */
+       ip_send_check(old_iph);
+
+       skb_push(skb, sizeof(struct iphdr));
+       skb_reset_network_header(skb);
+       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+       /* drop old route */
+       dst_release(skb->dst);
+       skb->dst = &rt->u.dst;
+
+       /*
+        *      Push down and install the IPIP header.
+        *      Addresses come from the route; tos, ttl and frag_off are
+        *      carried over from the inner header.
+        */
+       iph                     =       ip_hdr(skb);
+       iph->version            =       4;
+       iph->ihl                =       sizeof(struct iphdr)>>2;
+       iph->frag_off           =       df;
+       iph->protocol           =       IPPROTO_IPIP;
+       iph->tos                =       tos;
+       iph->daddr              =       rt->rt_dst;
+       iph->saddr              =       rt->rt_src;
+       iph->ttl                =       old_iph->ttl;
+       ip_select_ident(iph, &rt->u.dst, NULL);
+
+       /* Another hack: avoid icmp_send in ip_fragment */
+       skb->local_df = 1;
+
+       ip_local_out(skb);
+
+       LeaveFunction(10);
+
+       return NF_STOLEN;
+
+  tx_error_icmp:        /* reached when no route was obtained */
+       dst_link_failure(skb);
+  tx_error:             /* common drop path: free the skb */
+       kfree_skb(skb);
+       LeaveFunction(10);
+       return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ *   IPv6 tunneling transmitter
+ *
+ *   Prepends a new IPv6 header (nexthdr IPPROTO_IPV6) to the packet and
+ *   sends it with ip6_local_out().  The outer destination comes from the
+ *   route returned by __ip_vs_get_out_rt_v6(cp), the outer source is
+ *   cp->vaddr.in6.  pp is not referenced.
+ *
+ *   Every return path yields NF_STOLEN.
+ */
+int
+ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+                    struct ip_vs_protocol *pp)
+{
+       struct rt6_info *rt;            /* Route to the other host */
+       struct net_device *tdev;        /* Device to other host */
+       struct ipv6hdr  *old_iph = ipv6_hdr(skb);
+       sk_buff_data_t old_transport_header = skb->transport_header;
+       struct ipv6hdr  *iph;           /* Our new IP header */
+       unsigned int max_headroom;      /* The extra header space needed */
+       int    mtu;
+
+       EnterFunction(10);
+
+       if (skb->protocol != htons(ETH_P_IPV6)) {
+               IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, "
+                            "ETH_P_IPV6: %d, skb protocol: %d\n",
+                            htons(ETH_P_IPV6), skb->protocol);
+               goto tx_error;
+       }
+
+       rt = __ip_vs_get_out_rt_v6(cp);
+       if (!rt)
+               goto tx_error_icmp;
+
+       tdev = rt->u.dst.dev;
+
+       /* route MTU minus the outer header that is about to be added */
+       mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr);
+       /* TODO IPv6: do we need this check in IPv6? */
+       if (mtu < 1280) {
+               dst_release(&rt->u.dst);
+               IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n");
+               goto tx_error;
+       }
+       if (skb->dst)
+               skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+       if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+               dst_release(&rt->u.dst);
+               IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n");
+               goto tx_error;
+       }
+
+       /*
+        * Okay, now see if we can stuff it in the buffer as-is.
+        * If headroom is short, or the skb is cloned or shared, a copy
+        * with enough headroom replaces it.
+        */
+       max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
+
+       if (skb_headroom(skb) < max_headroom
+           || skb_cloned(skb) || skb_shared(skb)) {
+               struct sk_buff *new_skb =
+                       skb_realloc_headroom(skb, max_headroom);
+               if (!new_skb) {
+                       dst_release(&rt->u.dst);
+                       kfree_skb(skb);
+                       IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n");
+                       return NF_STOLEN;
+               }
+               kfree_skb(skb);
+               skb = new_skb;
+               old_iph = ipv6_hdr(skb);        /* re-read from the new buffer */
+       }
+
+       skb->transport_header = old_transport_header;
+
+       skb_push(skb, sizeof(struct ipv6hdr));
+       skb_reset_network_header(skb);
+       memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+       /* drop old route */
+       dst_release(skb->dst);
+       skb->dst = &rt->u.dst;
+
+       /*
+        *      Push down and install the outer IPv6 header.
+        *
+        *      The outer payload is the whole inner packet: the inner
+        *      payload plus the inner header.  payload_len is big-endian,
+        *      so the sum is done in host order, and the size added is
+        *      that of the header structure, not of the pointer to it.
+        */
+       iph                     =       ipv6_hdr(skb);
+       iph->version            =       6;
+       iph->nexthdr            =       IPPROTO_IPV6;
+       iph->payload_len        =
+               htons(ntohs(old_iph->payload_len) + sizeof(*old_iph));
+       iph->priority           =       old_iph->priority;
+       memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
+       iph->daddr              =       rt->rt6i_dst.addr;
+       iph->saddr              =       cp->vaddr.in6; /* rt->rt6i_src.addr; */
+       iph->hop_limit          =       old_iph->hop_limit;
+
+       /* Another hack: avoid icmp_send in ip_fragment */
+       skb->local_df = 1;
+
+       ip6_local_out(skb);
+
+       LeaveFunction(10);
+
+       return NF_STOLEN;
+
+tx_error_icmp:         /* reached when no route was obtained */
+       dst_link_failure(skb);
+tx_error:              /* common drop path: free the skb */
+       kfree_skb(skb);
+       LeaveFunction(10);
+       return NF_STOLEN;
+}
+#endif
+
+
+/*
+ *      Direct Routing transmitter
+ *      Used for ANY protocol
+ *
+ *      The packet is sent over the route obtained for cp; apart from
+ *      ip_send_check() on the IP header it is not rewritten here.
+ *      pp is not referenced.  Every return path yields NF_STOLEN.
+ */
+int
+ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+             struct ip_vs_protocol *pp)
+{
+       struct rtable *rt;                      /* Route to the other host */
+       struct iphdr  *iph = ip_hdr(skb);
+       int    mtu;
+
+       EnterFunction(10);
+
+       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+               goto tx_error_icmp;
+
+       /* MTU checking: only a DF packet longer than the route MTU is refused */
+       mtu = dst_mtu(&rt->u.dst);
+       if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
+               icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+               ip_rt_put(rt);  /* rt will not be attached to the skb */
+               IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
+               goto tx_error;
+       }
+
+       /*
+        * Call ip_send_check because we are not sure it is called
+        * after ip_defrag. Is copy-on-write needed?
+        */
+       if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
+               ip_rt_put(rt);
+               return NF_STOLEN;       /* skb is NULL here: nothing to free */
+       }
+       ip_send_check(ip_hdr(skb));
+
+       /* drop old route */
+       dst_release(skb->dst);
+       skb->dst = &rt->u.dst;
+
+       /* Another hack: avoid icmp_send in ip_fragment */
+       skb->local_df = 1;
+
+       IP_VS_XMIT(PF_INET, skb, rt);
+
+       LeaveFunction(10);
+       return NF_STOLEN;
+
+  tx_error_icmp:        /* reached when no route was obtained */
+       dst_link_failure(skb);
+  tx_error:             /* common drop path: free the skb */
+       kfree_skb(skb);
+       LeaveFunction(10);
+       return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+                struct ip_vs_protocol *pp)
+{      /* IPv6 direct-routing transmitter: sends skb over cp's route; always NF_STOLEN */
+       struct rt6_info *rt;                    /* Route to the other host */
+       int    mtu;
+
+       EnterFunction(10);
+
+       rt = __ip_vs_get_out_rt_v6(cp);
+       if (!rt)
+               goto tx_error_icmp;
+
+       /* MTU checking: any skb longer than the route MTU is refused (no DF test) */
+       mtu = dst_mtu(&rt->u.dst);
+       if (skb->len > mtu) {
+               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+               dst_release(&rt->u.dst);        /* rt will not be attached to the skb */
+               IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n");
+               goto tx_error;
+       }
+
+       /*
+        * Call ip_send_check because we are not sure it is called
+        * after ip_defrag. Is copy-on-write needed?
+        *
+        * NOTE(review): this wording matches the IPv4 transmitter; no
+        * ip_send_check call follows here, only skb_share_check.
+        */
+       skb = skb_share_check(skb, GFP_ATOMIC);
+       if (unlikely(skb == NULL)) {
+               dst_release(&rt->u.dst);
+               return NF_STOLEN;       /* skb is NULL here: nothing to free */
+       }
+
+       /* drop old route */
+       dst_release(skb->dst);
+       skb->dst = &rt->u.dst;
+
+       /* Another hack: avoid icmp_send in ip_fragment */
+       skb->local_df = 1;
+
+       IP_VS_XMIT(PF_INET6, skb, rt);
+
+       LeaveFunction(10);
+       return NF_STOLEN;
+
+tx_error_icmp:         /* reached when no route was obtained */
+       dst_link_failure(skb);
+tx_error:              /* common drop path: free the skb */
+       kfree_skb(skb);
+       LeaveFunction(10);
+       return NF_STOLEN;
+}
+#endif
+
+
+/*
+ *     ICMP packet transmitter
+ *     called by the ip_vs_in_icmp
+ *
+ *     offset is the length passed to skb_make_writable() before
+ *     ip_vs_nat_icmp() rewrites the packet.
+ *
+ *     Returns whatever cp->packet_xmit returns (or NF_ACCEPT when it is
+ *     not set) for connections that are not VS/NAT; NF_STOLEN otherwise.
+ */
+int
+ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+               struct ip_vs_protocol *pp, int offset)
+{
+       struct rtable   *rt;    /* Route to the other host */
+       int mtu;
+       int rc;
+
+       EnterFunction(10);
+
+       /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+          forwarded directly here, because there is no need to
+          translate address/port back */
+       if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+               if (cp->packet_xmit)
+                       rc = cp->packet_xmit(skb, cp, pp);
+               else
+                       rc = NF_ACCEPT;
+               /* do not touch skb anymore */
+               atomic_inc(&cp->in_pkts);
+               goto out;
+       }
+
+       /*
+        * mangle and send the packet here (only for VS/NAT)
+        */
+
+       if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
+               goto tx_error_icmp;
+
+       /* MTU checking: only a DF packet longer than the route MTU is refused */
+       mtu = dst_mtu(&rt->u.dst);
+       if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
+               ip_rt_put(rt);
+               icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+               IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
+               goto tx_error;
+       }
+
+       /* copy-on-write the packet before mangling it */
+       if (!skb_make_writable(skb, offset))
+               goto tx_error_put;
+
+       if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+               goto tx_error_put;
+
+       /* drop the old route when skb is not shared */
+       dst_release(skb->dst);
+       skb->dst = &rt->u.dst;
+
+       ip_vs_nat_icmp(skb, pp, cp, 0);
+
+       /* Another hack: avoid icmp_send in ip_fragment */
+       skb->local_df = 1;
+
+       IP_VS_XMIT(PF_INET, skb, rt);
+
+       rc = NF_STOLEN;
+       goto out;
+
+  tx_error_icmp:        /* reached when no route was obtained */
+       dst_link_failure(skb);
+  tx_error:             /* common drop path: free the skb */
+       dev_kfree_skb(skb);
+       rc = NF_STOLEN;
+  out:                  /* single exit: rc was set on every path leading here */
+       LeaveFunction(10);
+       return rc;
+  tx_error_put:         /* rt obtained but not yet attached to the skb */
+       ip_rt_put(rt);
+       goto tx_error;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+               struct ip_vs_protocol *pp, int offset)
+{      /* IPv6 ICMP transmitter; offset is the length given to skb_make_writable() */
+       struct rt6_info *rt;    /* Route to the other host */
+       int mtu;
+       int rc;
+
+       EnterFunction(10);
+
+       /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+          forwarded directly here, because there is no need to
+          translate address/port back */
+       if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+               if (cp->packet_xmit)
+                       rc = cp->packet_xmit(skb, cp, pp);
+               else
+                       rc = NF_ACCEPT;
+               /* do not touch skb anymore */
+               atomic_inc(&cp->in_pkts);
+               goto out;
+       }
+
+       /*
+        * mangle and send the packet here (only for VS/NAT)
+        */
+
+       rt = __ip_vs_get_out_rt_v6(cp);
+       if (!rt)
+               goto tx_error_icmp;
+
+       /* MTU checking: any skb longer than the route MTU is refused (no DF test).
+          NOTE(review): the debug text below names ip_vs_in_icmp(), not this function */
+       mtu = dst_mtu(&rt->u.dst);
+       if (skb->len > mtu) {
+               dst_release(&rt->u.dst);
+               icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+               IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
+               goto tx_error;
+       }
+
+       /* copy-on-write the packet before mangling it */
+       if (!skb_make_writable(skb, offset))
+               goto tx_error_put;
+
+       if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+               goto tx_error_put;
+
+       /* drop the old route when skb is not shared */
+       dst_release(skb->dst);
+       skb->dst = &rt->u.dst;
+
+       ip_vs_nat_icmp_v6(skb, pp, cp, 0);
+
+       /* Another hack: avoid icmp_send in ip_fragment */
+       skb->local_df = 1;
+
+       IP_VS_XMIT(PF_INET6, skb, rt);
+
+       rc = NF_STOLEN;
+       goto out;
+
+tx_error_icmp:         /* reached when no route was obtained */
+       dst_link_failure(skb);
+tx_error:              /* common drop path: free the skb */
+       dev_kfree_skb(skb);
+       rc = NF_STOLEN;
+out:                   /* single exit: rc was set on every path leading here */
+       LeaveFunction(10);
+       return rc;
+tx_error_put:          /* rt obtained but not yet attached to the skb */
+       dst_release(&rt->u.dst);
+       goto tx_error;
+}
+#endif
author	Julius Volz <juliusv@google.com>
	Fri, 19 Sep 2008 10:32:57 +0000 (12:32 +0200)
committer	Simon Horman <horms@verge.net.au>
	Mon, 6 Oct 2008 21:38:24 +0000 (08:38 +1100)
net/ipv4/Kconfig		patch \| blob \| blame \| history
net/ipv4/Makefile		patch \| blob \| blame \| history
net/ipv4/ipvs/Kconfig	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/Makefile	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_app.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_conn.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_core.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_ctl.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_est.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_ftp.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_lblc.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_lblcr.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_lc.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_nq.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_proto.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_proto_ah_esp.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_proto_tcp.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_proto_udp.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_rr.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_sched.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_sed.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_sh.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_sync.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_wlc.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_wrr.c	[deleted file]	patch \| blob \| blame \| history
net/ipv4/ipvs/ip_vs_xmit.c	[deleted file]	patch \| blob \| blame \| history
net/netfilter/Kconfig		patch \| blob \| blame \| history
net/netfilter/Makefile		patch \| blob \| blame \| history
net/netfilter/ipvs/Kconfig	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/Makefile	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_app.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_conn.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_core.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_ctl.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_dh.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_est.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_ftp.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_lblc.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_lblcr.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_lc.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_nq.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_proto.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_proto_ah_esp.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_proto_tcp.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_proto_udp.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_rr.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_sched.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_sed.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_sh.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_sync.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_wlc.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_wrr.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_xmit.c	[new file with mode: 0644]	patch \| blob