Merge tag 'v3.10.105' into update
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / netfilter / ipvs / ip_vs_proto_tcp.c
CommitLineData
1da177e4
LT
/*
 * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 *
 *              Network name space (netns) aware.
 *              Global data moved to netns, i.e. struct netns_ipvs.
 *              The tcp_timeouts table has a copy per netns in a hash table
 *              per protocol (ip_vs_proto_data) and is handled by netns.
 */
19
9aada7ac
HE
20#define KMSG_COMPONENT "IPVS"
21#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
22
1da177e4
LT
23#include <linux/kernel.h>
24#include <linux/ip.h>
25#include <linux/tcp.h> /* for tcphdr */
26#include <net/ip.h>
27#include <net/tcp.h> /* for csum_tcpudp_magic */
63f2c046 28#include <net/ip6_checksum.h>
af1e1cf0 29#include <linux/netfilter.h>
1da177e4
LT
30#include <linux/netfilter_ipv4.h>
31
32#include <net/ip_vs.h>
33
1da177e4 34static int
9330419d 35tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
d4383f04
JDB
36 int *verdict, struct ip_vs_conn **cpp,
37 struct ip_vs_iphdr *iph)
1da177e4 38{
fc723250 39 struct net *net;
1da177e4
LT
40 struct ip_vs_service *svc;
41 struct tcphdr _tcph, *th;
42
d4383f04 43 th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
1da177e4
LT
44 if (th == NULL) {
45 *verdict = NF_DROP;
46 return 0;
47 }
fc723250 48 net = skb_net(skb);
190ecd27 49 /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
ceec4c38 50 rcu_read_lock();
1da177e4 51 if (th->syn &&
ceec4c38
JA
52 (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
53 &iph->daddr, th->dest))) {
190ecd27
JA
54 int ignored;
55
a0840e2e 56 if (ip_vs_todrop(net_ipvs(net))) {
1da177e4
LT
57 /*
58 * It seems that we are very loaded.
59 * We have to drop this packet :(
60 */
ceec4c38 61 rcu_read_unlock();
1da177e4
LT
62 *verdict = NF_DROP;
63 return 0;
64 }
65
66 /*
67 * Let the virtual server select a real server for the
68 * incoming connection, and create a connection entry.
69 */
d4383f04 70 *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
a5959d53
HS
71 if (!*cpp && ignored <= 0) {
72 if (!ignored)
d4383f04 73 *verdict = ip_vs_leave(svc, skb, pd, iph);
ceec4c38 74 else
a5959d53 75 *verdict = NF_DROP;
ceec4c38 76 rcu_read_unlock();
1da177e4
LT
77 return 0;
78 }
1da177e4 79 }
ceec4c38 80 rcu_read_unlock();
a5959d53 81 /* NF_ACCEPT */
1da177e4
LT
82 return 1;
83}
84
85
86static inline void
0bbdd42b
JV
87tcp_fast_csum_update(int af, struct tcphdr *tcph,
88 const union nf_inet_addr *oldip,
89 const union nf_inet_addr *newip,
014d730d 90 __be16 oldport, __be16 newport)
1da177e4 91{
0bbdd42b
JV
92#ifdef CONFIG_IP_VS_IPV6
93 if (af == AF_INET6)
94 tcph->check =
95 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
96 ip_vs_check_diff2(oldport, newport,
97 ~csum_unfold(tcph->check))));
98 else
99#endif
1da177e4 100 tcph->check =
0bbdd42b 101 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
f9214b26
AV
102 ip_vs_check_diff2(oldport, newport,
103 ~csum_unfold(tcph->check))));
1da177e4
LT
104}
105
106
503e81f6
SH
107static inline void
108tcp_partial_csum_update(int af, struct tcphdr *tcph,
109 const union nf_inet_addr *oldip,
110 const union nf_inet_addr *newip,
111 __be16 oldlen, __be16 newlen)
112{
113#ifdef CONFIG_IP_VS_IPV6
114 if (af == AF_INET6)
115 tcph->check =
5bc9068e 116 ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
503e81f6 117 ip_vs_check_diff2(oldlen, newlen,
5bc9068e 118 csum_unfold(tcph->check))));
503e81f6
SH
119 else
120#endif
121 tcph->check =
5bc9068e 122 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
503e81f6 123 ip_vs_check_diff2(oldlen, newlen,
5bc9068e 124 csum_unfold(tcph->check))));
503e81f6
SH
125}
126
127
1da177e4 128static int
d4383f04
JDB
129tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
130 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
1da177e4
LT
131{
132 struct tcphdr *tcph;
d4383f04 133 unsigned int tcphoff = iph->len;
503e81f6 134 int oldlen;
8b27b10f 135 int payload_csum = 0;
0bbdd42b
JV
136
137#ifdef CONFIG_IP_VS_IPV6
d4383f04 138 if (cp->af == AF_INET6 && iph->fragoffs)
63dca2c0 139 return 1;
0bbdd42b 140#endif
503e81f6 141 oldlen = skb->len - tcphoff;
1da177e4
LT
142
143 /* csum_check requires unshared skb */
3db05fea 144 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
1da177e4
LT
145 return 0;
146
147 if (unlikely(cp->app != NULL)) {
8b27b10f
JA
148 int ret;
149
1da177e4 150 /* Some checks before mangling */
0bbdd42b 151 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
1da177e4
LT
152 return 0;
153
154 /* Call application helper if needed */
8b27b10f 155 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
1da177e4 156 return 0;
8b27b10f
JA
157 /* ret=2: csum update is needed after payload mangling */
158 if (ret == 1)
159 oldlen = skb->len - tcphoff;
160 else
161 payload_csum = 1;
1da177e4
LT
162 }
163
0bbdd42b 164 tcph = (void *)skb_network_header(skb) + tcphoff;
1da177e4
LT
165 tcph->source = cp->vport;
166
167 /* Adjust TCP checksums */
503e81f6
SH
168 if (skb->ip_summed == CHECKSUM_PARTIAL) {
169 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
ca62059b
HH
170 htons(oldlen),
171 htons(skb->len - tcphoff));
8b27b10f 172 } else if (!payload_csum) {
1da177e4 173 /* Only port and addr are changed, do fast csum update */
0bbdd42b 174 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
1da177e4 175 cp->dport, cp->vport);
3db05fea 176 if (skb->ip_summed == CHECKSUM_COMPLETE)
8b27b10f
JA
177 skb->ip_summed = (cp->app && pp->csum_check) ?
178 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
1da177e4
LT
179 } else {
180 /* full checksum calculation */
181 tcph->check = 0;
3db05fea 182 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
0bbdd42b
JV
183#ifdef CONFIG_IP_VS_IPV6
184 if (cp->af == AF_INET6)
185 tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
186 &cp->caddr.in6,
187 skb->len - tcphoff,
188 cp->protocol, skb->csum);
189 else
190#endif
191 tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
192 cp->caddr.ip,
193 skb->len - tcphoff,
194 cp->protocol,
195 skb->csum);
8b27b10f 196 skb->ip_summed = CHECKSUM_UNNECESSARY;
0bbdd42b 197
1da177e4
LT
198 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
199 pp->name, tcph->check,
200 (char*)&(tcph->check) - (char*)tcph);
201 }
202 return 1;
203}
204
205
206static int
d4383f04
JDB
207tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
208 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
1da177e4
LT
209{
210 struct tcphdr *tcph;
d4383f04 211 unsigned int tcphoff = iph->len;
503e81f6 212 int oldlen;
8b27b10f 213 int payload_csum = 0;
0bbdd42b
JV
214
215#ifdef CONFIG_IP_VS_IPV6
d4383f04 216 if (cp->af == AF_INET6 && iph->fragoffs)
63dca2c0 217 return 1;
0bbdd42b 218#endif
503e81f6 219 oldlen = skb->len - tcphoff;
1da177e4
LT
220
221 /* csum_check requires unshared skb */
3db05fea 222 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
1da177e4
LT
223 return 0;
224
225 if (unlikely(cp->app != NULL)) {
8b27b10f
JA
226 int ret;
227
1da177e4 228 /* Some checks before mangling */
0bbdd42b 229 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
1da177e4
LT
230 return 0;
231
232 /*
233 * Attempt ip_vs_app call.
234 * It will fix ip_vs_conn and iph ack_seq stuff
235 */
8b27b10f 236 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
1da177e4 237 return 0;
8b27b10f
JA
238 /* ret=2: csum update is needed after payload mangling */
239 if (ret == 1)
240 oldlen = skb->len - tcphoff;
241 else
242 payload_csum = 1;
1da177e4
LT
243 }
244
0bbdd42b 245 tcph = (void *)skb_network_header(skb) + tcphoff;
1da177e4
LT
246 tcph->dest = cp->dport;
247
248 /*
249 * Adjust TCP checksums
250 */
503e81f6 251 if (skb->ip_summed == CHECKSUM_PARTIAL) {
5bc9068e 252 tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
ca62059b
HH
253 htons(oldlen),
254 htons(skb->len - tcphoff));
8b27b10f 255 } else if (!payload_csum) {
1da177e4 256 /* Only port and addr are changed, do fast csum update */
0bbdd42b 257 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
1da177e4 258 cp->vport, cp->dport);
3db05fea 259 if (skb->ip_summed == CHECKSUM_COMPLETE)
8b27b10f
JA
260 skb->ip_summed = (cp->app && pp->csum_check) ?
261 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
1da177e4
LT
262 } else {
263 /* full checksum calculation */
264 tcph->check = 0;
3db05fea 265 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
0bbdd42b
JV
266#ifdef CONFIG_IP_VS_IPV6
267 if (cp->af == AF_INET6)
268 tcph->check = csum_ipv6_magic(&cp->caddr.in6,
269 &cp->daddr.in6,
270 skb->len - tcphoff,
271 cp->protocol, skb->csum);
272 else
273#endif
274 tcph->check = csum_tcpudp_magic(cp->caddr.ip,
275 cp->daddr.ip,
276 skb->len - tcphoff,
277 cp->protocol,
278 skb->csum);
3db05fea 279 skb->ip_summed = CHECKSUM_UNNECESSARY;
1da177e4
LT
280 }
281 return 1;
282}
283
284
285static int
51ef348b 286tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
1da177e4 287{
51ef348b
JV
288 unsigned int tcphoff;
289
290#ifdef CONFIG_IP_VS_IPV6
291 if (af == AF_INET6)
292 tcphoff = sizeof(struct ipv6hdr);
293 else
294#endif
295 tcphoff = ip_hdrlen(skb);
1da177e4
LT
296
297 switch (skb->ip_summed) {
298 case CHECKSUM_NONE:
299 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
84fa7933 300 case CHECKSUM_COMPLETE:
51ef348b
JV
301#ifdef CONFIG_IP_VS_IPV6
302 if (af == AF_INET6) {
303 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
304 &ipv6_hdr(skb)->daddr,
305 skb->len - tcphoff,
306 ipv6_hdr(skb)->nexthdr,
307 skb->csum)) {
0d79641a 308 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
51ef348b
JV
309 "Failed checksum for");
310 return 0;
311 }
312 } else
313#endif
314 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
315 ip_hdr(skb)->daddr,
316 skb->len - tcphoff,
317 ip_hdr(skb)->protocol,
318 skb->csum)) {
0d79641a 319 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
51ef348b
JV
320 "Failed checksum for");
321 return 0;
322 }
1da177e4
LT
323 break;
324 default:
84fa7933 325 /* No need to checksum. */
1da177e4
LT
326 break;
327 }
328
329 return 1;
330}
331
332
333#define TCP_DIR_INPUT 0
334#define TCP_DIR_OUTPUT 4
335#define TCP_DIR_INPUT_ONLY 8
336
9b5b5cff 337static const int tcp_state_off[IP_VS_DIR_LAST] = {
1da177e4
LT
338 [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
339 [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
340 [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
341};
342
343/*
344 * Timeout table[state]
345 */
4a85b96c 346static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
1da177e4
LT
347 [IP_VS_TCP_S_NONE] = 2*HZ,
348 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
349 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
350 [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
351 [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
352 [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
353 [IP_VS_TCP_S_CLOSE] = 10*HZ,
354 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
355 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
356 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
357 [IP_VS_TCP_S_SYNACK] = 120*HZ,
358 [IP_VS_TCP_S_LAST] = 2*HZ,
359};
360
36cbd3dc 361static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
1da177e4
LT
362 [IP_VS_TCP_S_NONE] = "NONE",
363 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
364 [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
365 [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
366 [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
367 [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
368 [IP_VS_TCP_S_CLOSE] = "CLOSE",
369 [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
370 [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
371 [IP_VS_TCP_S_LISTEN] = "LISTEN",
372 [IP_VS_TCP_S_SYNACK] = "SYNACK",
373 [IP_VS_TCP_S_LAST] = "BUG!",
374};
375
d53b6609
MK
376static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = {
377 [IP_VS_TCP_S_NONE] = false,
378 [IP_VS_TCP_S_ESTABLISHED] = true,
379 [IP_VS_TCP_S_SYN_SENT] = true,
380 [IP_VS_TCP_S_SYN_RECV] = true,
381 [IP_VS_TCP_S_FIN_WAIT] = false,
382 [IP_VS_TCP_S_TIME_WAIT] = false,
383 [IP_VS_TCP_S_CLOSE] = false,
384 [IP_VS_TCP_S_CLOSE_WAIT] = false,
385 [IP_VS_TCP_S_LAST_ACK] = false,
386 [IP_VS_TCP_S_LISTEN] = false,
387 [IP_VS_TCP_S_SYNACK] = true,
388};
389
1da177e4
LT
390#define sNO IP_VS_TCP_S_NONE
391#define sES IP_VS_TCP_S_ESTABLISHED
392#define sSS IP_VS_TCP_S_SYN_SENT
393#define sSR IP_VS_TCP_S_SYN_RECV
394#define sFW IP_VS_TCP_S_FIN_WAIT
395#define sTW IP_VS_TCP_S_TIME_WAIT
396#define sCL IP_VS_TCP_S_CLOSE
397#define sCW IP_VS_TCP_S_CLOSE_WAIT
398#define sLA IP_VS_TCP_S_LAST_ACK
399#define sLI IP_VS_TCP_S_LISTEN
400#define sSA IP_VS_TCP_S_SYNACK
401
402struct tcp_states_t {
403 int next_state[IP_VS_TCP_S_LAST];
404};
405
406static const char * tcp_state_name(int state)
407{
408 if (state >= IP_VS_TCP_S_LAST)
409 return "ERR!";
410 return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
411}
412
d53b6609
MK
413static bool tcp_state_active(int state)
414{
415 if (state >= IP_VS_TCP_S_LAST)
416 return false;
417 return tcp_state_active_table[state];
418}
419
1da177e4
LT
420static struct tcp_states_t tcp_states [] = {
421/* INPUT */
422/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
423/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
424/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
425/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
426/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
427
428/* OUTPUT */
429/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
430/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
431/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
432/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
433/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
434
435/* INPUT-ONLY */
436/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
437/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
438/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
439/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
440/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
441};
442
443static struct tcp_states_t tcp_states_dos [] = {
444/* INPUT */
445/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
446/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
447/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
448/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
449/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
450
451/* OUTPUT */
452/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
453/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
454/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
455/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
456/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
457
458/* INPUT-ONLY */
459/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
460/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
461/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
462/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
463/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
464};
465
9330419d 466static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
1da177e4
LT
467{
468 int on = (flags & 1); /* secure_tcp */
469
470 /*
471 ** FIXME: change secure_tcp to independent sysctl var
472 ** or make it per-service or per-app because it is valid
473 ** for most if not for all of the applications. Something
474 ** like "capabilities" (flags) for each object.
475 */
9330419d 476 pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
1da177e4
LT
477}
478
1da177e4
LT
479static inline int tcp_state_idx(struct tcphdr *th)
480{
481 if (th->rst)
482 return 3;
483 if (th->syn)
484 return 0;
485 if (th->fin)
486 return 1;
487 if (th->ack)
488 return 2;
489 return -1;
490}
491
492static inline void
9330419d 493set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
1da177e4
LT
494 int direction, struct tcphdr *th)
495{
496 int state_idx;
497 int new_state = IP_VS_TCP_S_CLOSE;
498 int state_off = tcp_state_off[direction];
499
500 /*
501 * Update state offset to INPUT_ONLY if necessary
502 * or delete NO_OUTPUT flag if output packet detected
503 */
504 if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
505 if (state_off == TCP_DIR_OUTPUT)
506 cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
507 else
508 state_off = TCP_DIR_INPUT_ONLY;
509 }
510
511 if ((state_idx = tcp_state_idx(th)) < 0) {
512 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
513 goto tcp_state_out;
514 }
515
9330419d
HS
516 new_state =
517 pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
1da177e4
LT
518
519 tcp_state_out:
520 if (new_state != cp->state) {
521 struct ip_vs_dest *dest = cp->dest;
522
cfc78c5a
JV
523 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
524 "%s:%d state: %s->%s conn->refcnt:%d\n",
9330419d 525 pd->pp->name,
cfc78c5a
JV
526 ((state_off == TCP_DIR_OUTPUT) ?
527 "output " : "input "),
528 th->syn ? 'S' : '.',
529 th->fin ? 'F' : '.',
530 th->ack ? 'A' : '.',
531 th->rst ? 'R' : '.',
532 IP_VS_DBG_ADDR(cp->af, &cp->daddr),
533 ntohs(cp->dport),
534 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
535 ntohs(cp->cport),
536 tcp_state_name(cp->state),
537 tcp_state_name(new_state),
538 atomic_read(&cp->refcnt));
539
1da177e4
LT
540 if (dest) {
541 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
d53b6609 542 !tcp_state_active(new_state)) {
1da177e4
LT
543 atomic_dec(&dest->activeconns);
544 atomic_inc(&dest->inactconns);
545 cp->flags |= IP_VS_CONN_F_INACTIVE;
546 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
d53b6609 547 tcp_state_active(new_state)) {
1da177e4
LT
548 atomic_inc(&dest->activeconns);
549 atomic_dec(&dest->inactconns);
550 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
551 }
552 }
553 }
554
4a85b96c
HS
555 if (likely(pd))
556 cp->timeout = pd->timeout_table[cp->state = new_state];
557 else /* What to do ? */
558 cp->timeout = tcp_timeouts[cp->state = new_state];
1da177e4
LT
559}
560
1da177e4
LT
561/*
562 * Handle state transitions
563 */
4a516f11 564static void
1da177e4
LT
565tcp_state_transition(struct ip_vs_conn *cp, int direction,
566 const struct sk_buff *skb,
9330419d 567 struct ip_vs_proto_data *pd)
1da177e4
LT
568{
569 struct tcphdr _tcph, *th;
570
0bbdd42b
JV
571#ifdef CONFIG_IP_VS_IPV6
572 int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
573#else
574 int ihl = ip_hdrlen(skb);
575#endif
576
577 th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
1da177e4 578 if (th == NULL)
4a516f11 579 return;
1da177e4 580
ac69269a 581 spin_lock_bh(&cp->lock);
9330419d 582 set_tcp_state(pd, cp, direction, th);
ac69269a 583 spin_unlock_bh(&cp->lock);
1da177e4
LT
584}
585
75e7ce66 586static inline __u16 tcp_app_hashkey(__be16 port)
1da177e4 587{
75e7ce66
AV
588 return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
589 & TCP_APP_TAB_MASK;
1da177e4
LT
590}
591
592
ab8a5e84 593static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
1da177e4
LT
594{
595 struct ip_vs_app *i;
75e7ce66
AV
596 __u16 hash;
597 __be16 port = inc->port;
1da177e4 598 int ret = 0;
ab8a5e84
HS
599 struct netns_ipvs *ipvs = net_ipvs(net);
600 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
1da177e4
LT
601
602 hash = tcp_app_hashkey(port);
603
4a85b96c 604 list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
1da177e4
LT
605 if (i->port == port) {
606 ret = -EEXIST;
607 goto out;
608 }
609 }
363c97d7 610 list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
9bbac6a9 611 atomic_inc(&pd->appcnt);
1da177e4
LT
612
613 out:
1da177e4
LT
614 return ret;
615}
616
617
618static void
ab8a5e84 619tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
1da177e4 620{
ab8a5e84 621 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
4a85b96c 622
9bbac6a9 623 atomic_dec(&pd->appcnt);
363c97d7 624 list_del_rcu(&inc->p_list);
1da177e4
LT
625}
626
627
628static int
629tcp_app_conn_bind(struct ip_vs_conn *cp)
630{
6e67e586 631 struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
1da177e4
LT
632 int hash;
633 struct ip_vs_app *inc;
634 int result = 0;
635
636 /* Default binding: bind app only for NAT */
637 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
638 return 0;
639
640 /* Lookup application incarnations and bind the right one */
641 hash = tcp_app_hashkey(cp->vport);
642
363c97d7
JA
643 rcu_read_lock();
644 list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
1da177e4
LT
645 if (inc->port == cp->vport) {
646 if (unlikely(!ip_vs_app_inc_get(inc)))
647 break;
363c97d7 648 rcu_read_unlock();
1da177e4 649
1e3e238e 650 IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
cfc78c5a
JV
651 "%s:%u to app %s on port %u\n",
652 __func__,
653 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
654 ntohs(cp->cport),
655 IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
656 ntohs(cp->vport),
657 inc->name, ntohs(inc->port));
658
1da177e4
LT
659 cp->app = inc;
660 if (inc->init_conn)
661 result = inc->init_conn(inc, cp);
662 goto out;
663 }
664 }
363c97d7 665 rcu_read_unlock();
1da177e4
LT
666
667 out:
668 return result;
669}
670
671
672/*
673 * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
674 */
4a85b96c 675void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
1da177e4 676{
4a85b96c
HS
677 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
678
ac69269a 679 spin_lock_bh(&cp->lock);
1da177e4 680 cp->state = IP_VS_TCP_S_LISTEN;
4a85b96c
HS
681 cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
682 : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
ac69269a 683 spin_unlock_bh(&cp->lock);
1da177e4
LT
684}
685
4a85b96c
HS
686/* ---------------------------------------------
687 * timeouts is netns related now.
688 * ---------------------------------------------
689 */
582b8e3e 690static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
1da177e4 691{
4a85b96c 692 struct netns_ipvs *ipvs = net_ipvs(net);
1da177e4 693
4a85b96c 694 ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
4a85b96c
HS
695 pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
696 sizeof(tcp_timeouts));
582b8e3e
HS
697 if (!pd->timeout_table)
698 return -ENOMEM;
9330419d 699 pd->tcp_state_table = tcp_states;
582b8e3e 700 return 0;
4a85b96c 701}
1da177e4 702
4a85b96c 703static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
1da177e4 704{
4a85b96c 705 kfree(pd->timeout_table);
1da177e4
LT
706}
707
708
709struct ip_vs_protocol ip_vs_protocol_tcp = {
710 .name = "TCP",
711 .protocol = IPPROTO_TCP,
2ad17def 712 .num_states = IP_VS_TCP_S_LAST,
1da177e4 713 .dont_defrag = 0,
4a85b96c
HS
714 .init = NULL,
715 .exit = NULL,
716 .init_netns = __ip_vs_tcp_init,
717 .exit_netns = __ip_vs_tcp_exit,
1da177e4
LT
718 .register_app = tcp_register_app,
719 .unregister_app = tcp_unregister_app,
720 .conn_schedule = tcp_conn_schedule,
5c0d2374
SH
721 .conn_in_get = ip_vs_conn_in_get_proto,
722 .conn_out_get = ip_vs_conn_out_get_proto,
1da177e4
LT
723 .snat_handler = tcp_snat_handler,
724 .dnat_handler = tcp_dnat_handler,
725 .csum_check = tcp_csum_check,
726 .state_name = tcp_state_name,
727 .state_transition = tcp_state_transition,
728 .app_conn_bind = tcp_app_conn_bind,
729 .debug_packet = ip_vs_tcpudp_debug_packet,
730 .timeout_change = tcp_timeout_change,
1da177e4 731};