[TCP]: Move the tcp sock states to net/tcp_states.h
net/ipv4/tcp_ipv4.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller	:	New socket lookup architecture.
 *				This code is dedicated to John Dyson.
 *	David S. Miller :	Change semantics of established hash,
 *				half is devoted to TIME_WAIT sockets
 *				and the rest go in the other half.
 *	Andi Kleen :		Add support for syncookies and fixed
 *				some bugs: ip options weren't passed to
 *				the TCP layer, missed a check for an
 *				ACK bit.
 *	Andi Kleen :		Implemented fast path mtu discovery.
 *				Fixed many serious bugs in the
 *				request_sock handling and moved
 *				most of it into the af independent code.
 *				Added tail drop and some other bugfixes.
 *				Added new listen semantics.
 *	Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:	ip_dynaddr bits
 *	Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen	:	Fix new listen.
 *	Andi Kleen	:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock	= RW_LOCK_UNLOCKED,
	.lhash_users	= ATOMIC_INIT(0),
	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
	.port_rover	= 1024 - 1,
};

/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
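
/*
 * For example, on a typical system the range can be widened at runtime
 * through procfs (values taken from the comment above; a sketch, not
 * tuning advice):
 *
 *	echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 */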

static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
{
	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !tcp_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
				    sk2_rcv_saddr == sk_rcv_saddr)
					break;
			}
		}
	}
	return node != NULL;
}
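
/*
 * Reading the loop above: tcp_bind_conflict() returns non-zero when some
 * other owner of the bind bucket is bound to the same port on an
 * overlapping device and address, and SO_REUSEADDR rules do not permit
 * sharing (a socket in TCP_LISTEN never shares, whatever sk_reuse says).
 */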

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct inet_bind_hashbucket *head;
	struct hlist_node *node;
	struct inet_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_hashinfo.portalloc_lock);
		if (tcp_hashinfo.port_rover < low)
			rover = low;
		else
			rover = tcp_hashinfo.port_rover;
		do {
			rover++;
			if (rover > high)
				rover = low;
			head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
			spin_lock(&head->lock);
			inet_bind_bucket_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_hashinfo.port_rover = rover;
		spin_unlock(&tcp_hashinfo.portalloc_lock);

		/* Exhausted local port range during search?  It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (unlikely(remaining <= 0))
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
	} else {
		head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (tcp_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!inet_sk(sk)->bind_hash)
		inet_bind_hash(sk, tb, snum);
	BUG_TRAP(inet_sk(sk)->bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
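
/*
 * In short: with snum == 0 the code above advances the global port_rover
 * through sysctl_local_port_range and takes the first port whose bind
 * bucket chain is empty; with an explicit snum the bucket is looked up
 * (or created) and vetted with tcp_bind_conflict().
 */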

static void tcp_v4_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__inet_hash(&tcp_hashinfo, sk, 1);
		local_bh_enable();
	}
}

void tcp_unhash(struct sock *sk)
{
	rwlock_t *lock;

	if (sk_unhashed(sk))
		goto ende;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		inet_listen_wlock(&tcp_hashinfo);
		lock = &tcp_hashinfo.lhash_lock;
	} else {
		struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[sk->sk_hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
	}

	if (__sk_del_node_init(sk))
		sock_prot_dec_use(sk->sk_prot);
	write_unlock_bh(lock);

 ende:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_hashinfo.lhash_wait);
}

/* Don't inline this cruft.  There are some nice properties to
 * exploit here.  The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection.  So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
					     const u32 daddr,
					     const unsigned short hnum,
					     const int dif)
{
	struct sock *result = NULL, *sk;
	struct hlist_node *node;
	int score, hiscore;

	hiscore = -1;
	sk_for_each(sk, node, head) {
		struct inet_sock *inet = inet_sk(sk);

		if (inet->num == hnum && !ipv6_only_sock(sk)) {
			__u32 rcv_saddr = inet->rcv_saddr;

			score = (sk->sk_family == PF_INET ? 1 : 0);
			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score += 2;
			}
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result	= sk;
			}
		}
	}
	return result;
}
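
/*
 * Scoring in the lookup above: an AF_INET socket starts at 1, a matching
 * bound address adds 2, a matching bound device adds 2; a score of 5 is
 * therefore an exact match and ends the walk early.
 */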

/* Optimize the common listener case. */
static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
						  const unsigned short hnum,
						  const int dif)
{
	struct sock *sk = NULL;
	struct hlist_head *head;

	read_lock(&tcp_hashinfo.lhash_lock);
	head = &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		struct inet_sock *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    !sk->sk_bound_dev_if)
			goto sherry_cache;
		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&tcp_hashinfo.lhash_lock);
	return sk;
}

/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */
static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
						       const u16 sport,
						       const u32 daddr,
						       const u16 hnum,
						       const int dif)
{
	struct inet_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size);
	head = &tcp_hashinfo.ehash[hash];
	read_lock(&head->lock);
	sk_for_each(sk, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) {
		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(&head->lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}
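
/*
 * Layout relied upon above: tcp_hashinfo.ehash is really two tables of
 * ehash_size buckets each; the first half holds established sockets and
 * the second half (head + ehash_size) the TIME_WAIT buckets for the same
 * hash value.
 */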

static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
{
	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
						      daddr, hnum, dif);

	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
}

inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				  u16 dport, int dif)
{
	struct sock *sk;

	local_bh_disable();
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
	local_bh_enable();

	return sk;
}

EXPORT_SYMBOL_GPL(tcp_v4_lookup);

static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}

/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
{
	struct inet_sock *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
	const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
	struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
	struct sock *sk2;
	struct hlist_node *node;
	struct tcp_tw_bucket *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
		tw = (struct tcp_tw_bucket *)sk2;

		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_sock *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity.  Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap, i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one,
			   only the timestamp cache is held not per host,
			   but per port pair, and the TW bucket is used
			   as state holder.

			   If the TW bucket has already been destroyed we
			   fall back to VJ's scheme and use the initial
			   timestamp retrieved from the peer table.
			 */
			if (tw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tw->tw_ts_recent_stamp > 1))) {
				if ((tp->write_seq =
						tw->tw_snd_nxt + 65535 + 2) == 0)
					tp->write_seq = 1;
				tp->rx_opt.ts_recent	   = tw->tw_ts_recent;
				tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now.  Otherwise we will see
	 * a socket with a funny identity in the hash table. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		tcp_tw_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
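
/*
 * The reuse test above lets a TIME_WAIT identity be recycled either for
 * the caller that passes twp == NULL, or, when sysctl_tcp_tw_reuse is
 * set, once the last timestamp seen from the peer is more than a second
 * old; write_seq is then restarted safely above tw_snd_nxt.
 */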

static inline u32 connect_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
					 inet->dport);
}

/*
 * Bind a port for a connect operation and hash it.
 */
static inline int tcp_v4_hash_connect(struct sock *sk)
{
	const unsigned short snum = inet_sk(sk)->num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;

	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int range = high - low;
		int i;
		int port;
		static u32 hint;
		u32 offset = hint + connect_port_offset(sk);
		struct hlist_node *node;
		struct tcp_tw_bucket *tw = NULL;

		local_bh_disable();
		for (i = 1; i <= range; i++) {
			port = low + (i + offset) % range;
			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (tb->port == port) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									port,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			__inet_hash(&tcp_hashinfo, sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			tcp_tw_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
	tb = inet_sk(sk)->bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__inet_hash(&tcp_hashinfo, sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
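
/*
 * The ephemeral search above starts from a per-destination offset
 * (connect_port_offset() keys it to the future 4-tuple) plus a static
 * 'hint' that grows by the number of probes each successful bind took,
 * spreading successive connects across the local port range.
 */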

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table, when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
		 * when trying a new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
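
/*
 * Order of operations in tcp_v4_connect() above: route the destination
 * (honouring any source route), inherit or reset timestamp state, enter
 * SYN-SENT, bind and hash via tcp_v4_hash_connect(), commit the route,
 * pick the initial sequence number, then transmit the SYN through
 * tcp_connect().
 */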

static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}

static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
{
	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
}
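
/*
 * All SYNs from one remote address/port pair land in the same of the
 * TCP_SYNQ_HSIZE buckets, but the per-listener hash_rnd seed keeps the
 * bucket index unpredictable to a remote attacker.
 */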

static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
					      struct request_sock ***prevp,
					      __u16 rport,
					      __u32 raddr, __u32 laddr)
{
	struct listen_sock *lopt = tp->accept_queue.listen_opt;
	struct request_sock *req, **prev;

	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		const struct inet_request_sock *ireq = inet_rsk(req);

		if (ireq->rmt_port == rport &&
		    ireq->rmt_addr == raddr &&
		    ireq->loc_addr == laddr &&
		    TCP_INET_FAMILY(req->rsk_ops->family)) {
			BUG_TRAP(!req->sk);
			*prevp = prev;
			break;
		}
	}

	return req;
}

static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct listen_sock *lopt = tp->accept_queue.listen_opt;
	u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);

	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
	tcp_synq_added(sk);
}


/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route.  We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember the soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped.  This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
			   th->source, tcp_v4_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		tcp_tw_put((struct tcp_tw_bucket *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = tcp_v4_search_req(tp, &prev, th->dest,
					iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		tcp_synq_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can happen, e.g., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in the modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}
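
/*
 * Two cases above: with CHECKSUM_HW the device finishes the job, so only
 * the (inverted) pseudo-header sum is stored in th->check and the offset
 * of the check field in skb->csum; otherwise the header sum is folded in
 * software with the payload sum already accumulated in skb->csum.
 */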

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on the parameters
 *		that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst    = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len  = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}
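
/*
 * The tsopt[] words above encode the usual timestamp option layout: two
 * TCPOPT_NOPs for alignment, TCPOPT_TIMESTAMP with TCPOLEN_TIMESTAMP,
 * then the TSval (tcp_time_stamp) and the echoed TSecr (ts).
 */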

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;

	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);

	tcp_tw_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}

static struct dst_entry* tcp_v4_route_req(struct sock *sk,
					  struct request_sock *req)
{
	struct rtable *rt;
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct ip_options *opt = inet_rsk(req)->opt;
	struct flowi fl = { .oif = sk->sk_bound_dev_if,
			    .nl_u = { .ip4_u =
				      { .daddr = ((opt && opt->srr) ?
						  opt->faddr :
						  ireq->rmt_addr),
					.saddr = ireq->loc_addr,
					.tos = RT_CONN_FLAGS(sk) } },
			    .proto = IPPROTO_TCP,
			    .uli_u = { .ports =
				       { .sport = inet_sk(sk)->sport,
					 .dport = ireq->rmt_port } } };

	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	return &rt->u.dst;
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
			      struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	if (inet_rsk(req)->opt)
		kfree(inet_rsk(req)->opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}

/*
 *	Save and compile IPv4 options into the request_sock if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

struct request_sock_ops tcp_request_sock_ops = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer to SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
		goto drop;

	req = reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie) {
		tcp_clear_options(&tmp_opt);
		tmp_opt.saw_tstamp = 0;
	}

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on a web server,
		 * which contains information interesting only for windows'
		 * users) do not send their stamp in SYN. It is the easy case.
		 * We simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->opt = tcp_v4_save_options(sk, skb);
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
					      "request from %u.%u."
					      "%u.%u/%u\n",
					      NIPQUAD(saddr),
					      ntohs(skb->h.th->source)));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		reqsk_free(req);
	} else {
		tcp_v4_synq_add(sk, req);
	}
	return 0;

drop_and_free:
	reqsk_free(req);
drop:
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	return 0;
}
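
/*
 * Drop policy above, in brief: a full syn queue either switches to
 * syncookies or drops the SYN; a full accept backlog drops the SYN as
 * long as more than one young entry is already queued; every drop is
 * accounted as TCP_MIB_ATTEMPTFAILS.
 */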


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	sk_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->daddr	      = ireq->rmt_addr;
	newinet->rcv_saddr    = ireq->loc_addr;
	newinet->saddr	      = ireq->loc_addr;
	newinet->opt	      = ireq->opt;
	ireq->opt	      = NULL;
	newinet->mc_index     = tcp_v4_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__inet_hash(&tcp_hashinfo, newsk, 0);
	__inet_inherit_port(&tcp_hashinfo, sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
						     iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
					  th->source,
					  skb->nh.iph->daddr,
					  ntohs(th->dest),
					  tcp_v4_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		tcp_tw_put((struct tcp_tw_bucket *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}
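
/*
 * Heuristic above: segments of at most 76 bytes are checksummed
 * immediately (the data is about to be touched anyway, presumably making
 * that the cheaper choice); longer ones only get the pseudo-header sum
 * here, and full verification is deferred to tcp_checksum_complete().
 */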


/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
			goto reset;
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb))
				goto reset;
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
		goto reset;
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}

/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = skb->h.th;

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */
	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
	     tcp_v4_checksum_init(skb) < 0))
		goto bad_packet;

	th = skb->h.th;
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest),
			     tcp_v4_iif(skb));

	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
					   skb, th, skb->len)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
							  ntohs(th->dest),
							  tcp_v4_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
			tcp_tw_put((struct tcp_tw_bucket *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
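
/*
 * Delivery discipline in tcp_v4_rcv() above: when the socket is not held
 * by a user context the segment is either prequeued for a sleeping
 * reader or handled directly via tcp_v4_do_rcv(); otherwise it is parked
 * on the backlog and replayed when the owner releases the socket.
 */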

static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
	struct inet_sock *inet = inet_sk(sk);

	sin->sin_family		= AF_INET;
	sin->sin_addr.s_addr	= inet->daddr;
	sin->sin_port		= inet->dport;
}

/* VJ's idea. Save last timestamp seen from this destination
 * and hold it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter synchronized
 * state.
 */

int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
{
	struct inet_peer *peer = NULL;

	peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}

struct tcp_func ipv4_specific = {
	.queue_xmit	=	ip_queue_xmit,
	.send_check	=	tcp_v4_send_check,
	.rebuild_header	=	inet_sk_rebuild_header,
	.conn_request	=	tcp_v4_conn_request,
	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
	.remember_stamp	=	tcp_v4_remember_stamp,
	.net_header_len	=	sizeof(struct iphdr),
	.setsockopt	=	ip_setsockopt,
	.getsockopt	=	ip_getsockopt,
	.addr2sockaddr	=	v4_addr2sockaddr,
	.sockaddr_len	=	sizeof(struct sockaddr_in),
};

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	tp->rto  = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	tp->ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(tp);

	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_sk(sk)->bind_hash)
		inet_put_port(&tcp_hashinfo, sk);

	/*
	 * If sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}
1853
1854#ifdef CONFIG_PROC_FS
1855/* Proc filesystem TCP sock list dumping. */
1856
1857static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1858{
1859 return hlist_empty(head) ? NULL :
1860 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1861}
1862
1863static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1864{
1865 return tw->tw_node.next ?
1866 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1867}

static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_sock *tp;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		tp = tcp_sk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
	} else {
		tp = tcp_sk(sk);
		read_lock_bh(&tp->accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&tp->accept_queue))
			goto start_req;
		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		tp = tcp_sk(sk);
		read_lock_bh(&tp->accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&tp->accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
	}
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state* st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw;

		/* We can reschedule _before_ having picked the target: */
		cond_resched_softirq();

		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		tw_for_each(tw, node,
			    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
			if (tw->tw_family != st->family) {
				continue;
			}
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
1987
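/*
 * Advance within the established walk: continue the current chain,
 * fall over into the bucket's TIME_WAIT half once the established
 * chain is exhausted, then move on to the next bucket (dropping and
 * retaking the per-bucket lock).
 */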
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct tcp_tw_bucket *tw;
	struct hlist_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && tw->tw_family != st->family)
			tw = tw_next(tw);
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* We can reschedule between buckets: */
		cond_resched_softirq();

		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

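/*
 * tcp_get_idx() hands off between the two walks and their locking
 * schemes: the listening pass runs under inet_listen_lock(), the
 * established pass with BHs disabled plus the per-bucket ehash lock.
 * Whichever pass returns a non-NULL cursor leaves its lock held for
 * tcp_seq_stop() to release.
 */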
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num	  = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state *st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			inet_listen_unlock(&tcp_hashinfo);
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);

			read_unlock_bh(&tp->accept_queue.syn_wait_lock);
		}
		/* Fall through: the listen lock is held in both states. */
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}

static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family	 = afinfo->family;
	s->seq_ops.start = tcp_seq_start;
	s->seq_ops.next	 = tcp_seq_next;
	s->seq_ops.show	 = afinfo->seq_show;
	s->seq_ops.stop	 = tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

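/*
 * tcp_proc_register() fills in the caller-supplied file_operations
 * and creates /proc/net/<name>; the afinfo pointer is stashed in the
 * proc_dir_entry so that tcp_seq_open() can recover it via PDE().
 */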
int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner	  = afinfo->owner;
	afinfo->seq_fops->open	  = tcp_seq_open;
	afinfo->seq_fops->read	  = seq_read;
	afinfo->seq_fops->llseek  = seq_lseek;
	afinfo->seq_fops->release = seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}

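/*
 * Usage sketch (hypothetical names, modelled on tcp4_seq_afinfo
 * below; the IPv6 side registers itself the same way): a caller
 * provides a zeroed file_operations plus its own ->seq_show and lets
 * tcp_proc_register() wire up open/read/llseek/release:
 *
 *	static struct file_operations foo_seq_fops;
 *	static struct tcp_seq_afinfo foo_seq_afinfo = {
 *		.owner	  = THIS_MODULE,
 *		.name	  = "foo",		(creates /proc/net/foo)
 *		.family	  = AF_INET,
 *		.seq_show = foo_seq_show,	(hypothetical show routine)
 *		.seq_fops = &foo_seq_fops,
 *	};
 *
 *	tcp_proc_register(&foo_seq_afinfo);	at init time
 *	tcp_proc_unregister(&foo_seq_afinfo);	at exit time
 */
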
static void get_openreq4(struct sock *sk, struct request_sock *req,
			 char *tmpbuf, int i, int uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}

static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sp);
	struct inet_sock *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tp->timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		tp->retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}

static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}

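/*
 * Illustrative /proc/net/tcp row (made-up values) for a socket
 * listening on *:22, as it would appear under the header printed
 * above; "0A" is TCP_LISTEN and 0016 is port 22 in hex:
 *
 *   0: 00000000:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 1234 1 c0ffee00 300 0 0 2 -1
 */
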
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= tcp_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.rsk_prot		= &tcp_request_sock_ops,
};

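/*
 * tcp_prot is the IPv4 instance of the transport interface handed to
 * the socket layer (assumption: registered via proto_register() from
 * inet_init() in af_inet.c at boot).
 */
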
void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);

	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even
	 * see it, we do not wish this socket to see incoming
	 * packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}

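/*
 * The control socket above is used for replies the stack generates
 * itself (e.g. RSTs), often from softirq context where sleeping is
 * forbidden -- hence sk_allocation = GFP_ATOMIC, and the unhash so
 * that it can never match incoming packets.
 */
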
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);