net/ipv4/tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26 /*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
36 * ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
47 * coma.
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
53 */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/xfrm.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90 struct sk_buff *skb);
91
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 };
98
99 /*
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
102 * 32768-61000
103 */
104 int sysctl_local_port_range[2] = { 1024, 4999 };
105 int tcp_port_rover = 1024 - 1;
106
107 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
108 {
109 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
110 struct sock *sk2;
111 struct hlist_node *node;
112 int reuse = sk->sk_reuse;
113
114 sk_for_each_bound(sk2, node, &tb->owners) {
115 if (sk != sk2 &&
116 !tcp_v6_ipv6only(sk2) &&
117 (!sk->sk_bound_dev_if ||
118 !sk2->sk_bound_dev_if ||
119 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120 if (!reuse || !sk2->sk_reuse ||
121 sk2->sk_state == TCP_LISTEN) {
122 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
123 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124 sk2_rcv_saddr == sk_rcv_saddr)
125 break;
126 }
127 }
128 }
129 return node != NULL;
130 }
131
132 /* Obtain a reference to a local port for the given sock;
133 * if snum is zero it means select any available local port.
134 */
135 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
136 {
137 struct inet_bind_hashbucket *head;
138 struct hlist_node *node;
139 struct inet_bind_bucket *tb;
140 int ret;
141
142 local_bh_disable();
143 if (!snum) {
144 int low = sysctl_local_port_range[0];
145 int high = sysctl_local_port_range[1];
146 int remaining = (high - low) + 1;
147 int rover;
148
149 spin_lock(&tcp_portalloc_lock);
150 if (tcp_port_rover < low)
151 rover = low;
152 else
153 rover = tcp_port_rover;
154 do {
155 rover++;
156 if (rover > high)
157 rover = low;
158 head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)];
159 spin_lock(&head->lock);
160 inet_bind_bucket_for_each(tb, node, &head->chain)
161 if (tb->port == rover)
162 goto next;
163 break;
164 next:
165 spin_unlock(&head->lock);
166 } while (--remaining > 0);
167 tcp_port_rover = rover;
168 spin_unlock(&tcp_portalloc_lock);
169
170 /* Exhausted local port range during search? It is not
171 * possible for us to be holding one of the bind hash
172 * locks if this test triggers, because if 'remaining'
173 * drops to zero, we broke out of the do/while loop at
174 * the top level, not from the 'break;' statement.
175 */
176 ret = 1;
177 if (unlikely(remaining <= 0))
178 goto fail;
179
180 /* OK, here is the one we will use. HEAD is
181 * non-NULL and we hold its lock.
182 */
183 snum = rover;
184 } else {
185 head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
186 spin_lock(&head->lock);
187 inet_bind_bucket_for_each(tb, node, &head->chain)
188 if (tb->port == snum)
189 goto tb_found;
190 }
191 tb = NULL;
192 goto tb_not_found;
193 tb_found:
194 if (!hlist_empty(&tb->owners)) {
195 if (sk->sk_reuse > 1)
196 goto success;
197 if (tb->fastreuse > 0 &&
198 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
199 goto success;
200 } else {
201 ret = 1;
202 if (tcp_bind_conflict(sk, tb))
203 goto fail_unlock;
204 }
205 }
206 tb_not_found:
207 ret = 1;
208 if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL)
209 goto fail_unlock;
210 if (hlist_empty(&tb->owners)) {
211 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
212 tb->fastreuse = 1;
213 else
214 tb->fastreuse = 0;
215 } else if (tb->fastreuse &&
216 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
217 tb->fastreuse = 0;
218 success:
219 if (!inet_sk(sk)->bind_hash)
220 inet_bind_hash(sk, tb, snum);
221 BUG_TRAP(inet_sk(sk)->bind_hash == tb);
222 ret = 0;
223
224 fail_unlock:
225 spin_unlock(&head->lock);
226 fail:
227 local_bh_enable();
228 return ret;
229 }
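/*
 * A minimal userspace sketch of the rover search in tcp_v4_get_port() above,
 * with the bind-hash walk replaced by a hypothetical port_is_bound() predicate:
 * start one past the last rover value, wrap at the top of the range, and give
 * up once every port in [low, high] has been probed.
 *
 *	static int port_is_bound(int port);	// stand-in for the tcp_bhash walk
 *
 *	static int pick_ephemeral_port(int low, int high, int *rover)
 *	{
 *		int remaining = (high - low) + 1;
 *		int port = (*rover < low) ? low : *rover;
 *
 *		do {
 *			if (++port > high)
 *				port = low;		// wrap inside the range
 *			if (!port_is_bound(port)) {
 *				*rover = port;		// remember where we stopped
 *				return port;
 *			}
 *		} while (--remaining > 0);
 *
 *		return -1;				// local port range exhausted
 *	}
 */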
230
231 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
232 * Look, when several writers sleep and a reader wakes them up, all but one
233 * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
234 * this, _but_ remember, it adds useless work on UP machines (a wake up on each
235 * exclusive lock release). It should really be ifdefed.
236 */
237
238 void tcp_listen_wlock(void)
239 {
240 write_lock(&tcp_lhash_lock);
241
242 if (atomic_read(&tcp_lhash_users)) {
243 DEFINE_WAIT(wait);
244
245 for (;;) {
246 prepare_to_wait_exclusive(&tcp_lhash_wait,
247 &wait, TASK_UNINTERRUPTIBLE);
248 if (!atomic_read(&tcp_lhash_users))
249 break;
250 write_unlock_bh(&tcp_lhash_lock);
251 schedule();
252 write_lock_bh(&tcp_lhash_lock);
253 }
254
255 finish_wait(&tcp_lhash_wait, &wait);
256 }
257 }
258
259 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
260 {
261 struct hlist_head *list;
262 rwlock_t *lock;
263
264 BUG_TRAP(sk_unhashed(sk));
265 if (listen_possible && sk->sk_state == TCP_LISTEN) {
266 list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)];
267 lock = &tcp_lhash_lock;
268 tcp_listen_wlock();
269 } else {
270 sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size);
271 list = &tcp_ehash[sk->sk_hashent].chain;
272 lock = &tcp_ehash[sk->sk_hashent].lock;
273 write_lock(lock);
274 }
275 __sk_add_node(sk, list);
276 sock_prot_inc_use(sk->sk_prot);
277 write_unlock(lock);
278 if (listen_possible && sk->sk_state == TCP_LISTEN)
279 wake_up(&tcp_lhash_wait);
280 }
281
282 static void tcp_v4_hash(struct sock *sk)
283 {
284 if (sk->sk_state != TCP_CLOSE) {
285 local_bh_disable();
286 __tcp_v4_hash(sk, 1);
287 local_bh_enable();
288 }
289 }
290
291 void tcp_unhash(struct sock *sk)
292 {
293 rwlock_t *lock;
294
295 if (sk_unhashed(sk))
296 goto ende;
297
298 if (sk->sk_state == TCP_LISTEN) {
299 local_bh_disable();
300 tcp_listen_wlock();
301 lock = &tcp_lhash_lock;
302 } else {
303 struct inet_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
304 lock = &head->lock;
305 write_lock_bh(&head->lock);
306 }
307
308 if (__sk_del_node_init(sk))
309 sock_prot_dec_use(sk->sk_prot);
310 write_unlock_bh(lock);
311
312 ende:
313 if (sk->sk_state == TCP_LISTEN)
314 wake_up(&tcp_lhash_wait);
315 }
316
317 /* Don't inline this cruft. There are some nice properties to
318 * exploit here. The BSD API does not allow a listening TCP
319 * to specify the remote port nor the remote address for the
320 * connection. So always assume those are both wildcarded
321 * during the search since they can never be otherwise.
322 */
323 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
324 const u32 daddr,
325 const unsigned short hnum,
326 const int dif)
327 {
328 struct sock *result = NULL, *sk;
329 struct hlist_node *node;
330 int score, hiscore;
331
332 hiscore=-1;
333 sk_for_each(sk, node, head) {
334 struct inet_sock *inet = inet_sk(sk);
335
336 if (inet->num == hnum && !ipv6_only_sock(sk)) {
337 __u32 rcv_saddr = inet->rcv_saddr;
338
339 score = (sk->sk_family == PF_INET ? 1 : 0);
340 if (rcv_saddr) {
341 if (rcv_saddr != daddr)
342 continue;
343 score+=2;
344 }
345 if (sk->sk_bound_dev_if) {
346 if (sk->sk_bound_dev_if != dif)
347 continue;
348 score+=2;
349 }
350 if (score == 5)
351 return sk;
352 if (score > hiscore) {
353 hiscore = score;
354 result = sk;
355 }
356 }
357 }
358 return result;
359 }
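/*
 * The walk above can be read as a small scoring function: being an AF_INET
 * socket is worth 1 point, an exact local-address match and an exact
 * bound-device match are worth 2 points each, so a perfect listener scores 5
 * and ends the search early. A hedged sketch with the sock fields reduced to
 * plain parameters (names here are illustrative, not kernel identifiers):
 *
 *	// Returns -1 when the listener cannot match this packet at all,
 *	// otherwise the same 0..5 score used by __tcp_v4_lookup_listener().
 *	static int listener_score(int is_inet, unsigned int rcv_saddr,
 *				  unsigned int daddr, int bound_dev_if, int dif)
 *	{
 *		int score = is_inet ? 1 : 0;
 *
 *		if (rcv_saddr) {		// bound to a specific local address
 *			if (rcv_saddr != daddr)
 *				return -1;
 *			score += 2;
 *		}
 *		if (bound_dev_if) {		// bound to a specific device
 *			if (bound_dev_if != dif)
 *				return -1;
 *			score += 2;
 *		}
 *		return score;
 *	}
 */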
360
361 /* Optimize the common listener case. */
362 static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
363 const unsigned short hnum,
364 const int dif)
365 {
366 struct sock *sk = NULL;
367 struct hlist_head *head;
368
369 read_lock(&tcp_lhash_lock);
370 head = &tcp_listening_hash[inet_lhashfn(hnum)];
371 if (!hlist_empty(head)) {
372 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
373
374 if (inet->num == hnum && !sk->sk_node.next &&
375 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
376 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
377 !sk->sk_bound_dev_if)
378 goto sherry_cache;
379 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
380 }
381 if (sk) {
382 sherry_cache:
383 sock_hold(sk);
384 }
385 read_unlock(&tcp_lhash_lock);
386 return sk;
387 }
388
389 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
390 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
391 *
392 * Local BH must be disabled here.
393 */
394
395 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
396 const u16 sport,
397 const u32 daddr,
398 const u16 hnum,
399 const int dif)
400 {
401 struct inet_ehash_bucket *head;
402 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
403 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
404 struct sock *sk;
405 struct hlist_node *node;
406 /* Optimize here for a direct hit; only listening connections can
407 * have wildcards anyway.
408 */
409 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size);
410 head = &tcp_ehash[hash];
411 read_lock(&head->lock);
412 sk_for_each(sk, node, &head->chain) {
413 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
414 goto hit; /* You sunk my battleship! */
415 }
416
417 /* Must check for a TIME_WAIT'er before going to listener hash. */
418 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
419 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
420 goto hit;
421 }
422 sk = NULL;
423 out:
424 read_unlock(&head->lock);
425 return sk;
426 hit:
427 sock_hold(sk);
428 goto out;
429 }
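/*
 * TCP_IPV4_MATCH above compares the whole connection identity in at most two
 * machine words: both ports are packed into one 32-bit value and, on 64-bit
 * hosts, both addresses into one 64-bit cookie. A hedged userspace sketch of
 * the port packing only; the kernel macro's exact byte layout depends on
 * endianness, and combine_ports() here is an illustrative name:
 *
 *	#include <stdint.h>
 *
 *	// One compare instead of two: remote port and local port side by side.
 *	static inline uint32_t combine_ports(uint16_t sport, uint16_t hnum)
 *	{
 *		return ((uint32_t)hnum << 16) | (uint32_t)sport;
 *	}
 *
 *	static inline int ports_match(uint32_t packed, uint16_t sport, uint16_t hnum)
 *	{
 *		return packed == combine_ports(sport, hnum);
 *	}
 */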
430
431 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
432 u32 daddr, u16 hnum, int dif)
433 {
434 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
435 daddr, hnum, dif);
436
437 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
438 }
439
440 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
441 u16 dport, int dif)
442 {
443 struct sock *sk;
444
445 local_bh_disable();
446 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
447 local_bh_enable();
448
449 return sk;
450 }
451
452 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
453
454 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
455 {
456 return secure_tcp_sequence_number(skb->nh.iph->daddr,
457 skb->nh.iph->saddr,
458 skb->h.th->dest,
459 skb->h.th->source);
460 }
461
462 /* called with local bh disabled */
463 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
464 struct tcp_tw_bucket **twp)
465 {
466 struct inet_sock *inet = inet_sk(sk);
467 u32 daddr = inet->rcv_saddr;
468 u32 saddr = inet->daddr;
469 int dif = sk->sk_bound_dev_if;
470 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
471 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
472 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size);
473 struct inet_ehash_bucket *head = &tcp_ehash[hash];
474 struct sock *sk2;
475 struct hlist_node *node;
476 struct tcp_tw_bucket *tw;
477
478 write_lock(&head->lock);
479
480 /* Check TIME-WAIT sockets first. */
481 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
482 tw = (struct tcp_tw_bucket *)sk2;
483
484 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
485 struct tcp_sock *tp = tcp_sk(sk);
486
487 /* With PAWS, it is safe from the viewpoint
488 of data integrity. Even without PAWS it
489 is safe provided sequence spaces do not
490 overlap i.e. at data rates <= 80Mbit/sec.
491
492 Actually, the idea is close to VJ's one,
493 only timestamp cache is held not per host,
494 but per port pair and TW bucket is used
495 as state holder.
496
497 If TW bucket has been already destroyed we
498 fall back to VJ's scheme and use initial
499 timestamp retrieved from peer table.
500 */
501 if (tw->tw_ts_recent_stamp &&
502 (!twp || (sysctl_tcp_tw_reuse &&
503 xtime.tv_sec -
504 tw->tw_ts_recent_stamp > 1))) {
505 if ((tp->write_seq =
506 tw->tw_snd_nxt + 65535 + 2) == 0)
507 tp->write_seq = 1;
508 tp->rx_opt.ts_recent = tw->tw_ts_recent;
509 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
510 sock_hold(sk2);
511 goto unique;
512 } else
513 goto not_unique;
514 }
515 }
516 tw = NULL;
517
518 /* And established part... */
519 sk_for_each(sk2, node, &head->chain) {
520 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
521 goto not_unique;
522 }
523
524 unique:
525 /* Must record num and sport now. Otherwise we will see
526 * a socket with a funny identity in the hash table. */
527 inet->num = lport;
528 inet->sport = htons(lport);
529 sk->sk_hashent = hash;
530 BUG_TRAP(sk_unhashed(sk));
531 __sk_add_node(sk, &head->chain);
532 sock_prot_inc_use(sk->sk_prot);
533 write_unlock(&head->lock);
534
535 if (twp) {
536 *twp = tw;
537 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
538 } else if (tw) {
539 /* Silly. Should hash-dance instead... */
540 tcp_tw_deschedule(tw);
541 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
542
543 tcp_tw_put(tw);
544 }
545
546 return 0;
547
548 not_unique:
549 write_unlock(&head->lock);
550 return -EADDRNOTAVAIL;
551 }
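/*
 * The reuse test above boils down to: a TIME-WAIT bucket may be taken over
 * when it has a recorded timestamp and either the caller passed no *twp
 * out-parameter or tcp_tw_reuse is set and more than one second has passed
 * since the last timestamped segment; the new write_seq is then started just
 * above the old connection's (tw_snd_nxt + 65535 + 2). A hedged sketch of the
 * predicate alone, with the bucket fields reduced to parameters:
 *
 *	// ts_recent_stamp: time in seconds of the last timestamped segment.
 *	// no_twp: true when the caller did not ask for the bucket back.
 *	static int tw_may_be_reused(long now, long ts_recent_stamp,
 *				    int tw_reuse_sysctl, int no_twp)
 *	{
 *		if (!ts_recent_stamp)
 *			return 0;	// no PAWS protection, keep the bucket
 *		if (no_twp)
 *			return 1;
 *		return tw_reuse_sysctl && (now - ts_recent_stamp > 1);
 *	}
 */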
552
553 static inline u32 connect_port_offset(const struct sock *sk)
554 {
555 const struct inet_sock *inet = inet_sk(sk);
556
557 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
558 inet->dport);
559 }
560
561 /*
562 * Bind a port for a connect operation and hash it.
563 */
564 static inline int tcp_v4_hash_connect(struct sock *sk)
565 {
566 const unsigned short snum = inet_sk(sk)->num;
567 struct inet_bind_hashbucket *head;
568 struct inet_bind_bucket *tb;
569 int ret;
570
571 if (!snum) {
572 int low = sysctl_local_port_range[0];
573 int high = sysctl_local_port_range[1];
574 int range = high - low;
575 int i;
576 int port;
577 static u32 hint;
578 u32 offset = hint + connect_port_offset(sk);
579 struct hlist_node *node;
580 struct tcp_tw_bucket *tw = NULL;
581
582 local_bh_disable();
583 for (i = 1; i <= range; i++) {
584 port = low + (i + offset) % range;
585 head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)];
586 spin_lock(&head->lock);
587
588 /* Does not bother with rcv_saddr checks,
589 * because the established check is already
590 * unique enough.
591 */
592 inet_bind_bucket_for_each(tb, node, &head->chain) {
593 if (tb->port == port) {
594 BUG_TRAP(!hlist_empty(&tb->owners));
595 if (tb->fastreuse >= 0)
596 goto next_port;
597 if (!__tcp_v4_check_established(sk,
598 port,
599 &tw))
600 goto ok;
601 goto next_port;
602 }
603 }
604
605 tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port);
606 if (!tb) {
607 spin_unlock(&head->lock);
608 break;
609 }
610 tb->fastreuse = -1;
611 goto ok;
612
613 next_port:
614 spin_unlock(&head->lock);
615 }
616 local_bh_enable();
617
618 return -EADDRNOTAVAIL;
619
620 ok:
621 hint += i;
622
623 /* Head lock still held and bh's disabled */
624 inet_bind_hash(sk, tb, port);
625 if (sk_unhashed(sk)) {
626 inet_sk(sk)->sport = htons(port);
627 __tcp_v4_hash(sk, 0);
628 }
629 spin_unlock(&head->lock);
630
631 if (tw) {
632 tcp_tw_deschedule(tw);
633 tcp_tw_put(tw);
634 }
635
636 ret = 0;
637 goto out;
638 }
639
640 head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
641 tb = inet_sk(sk)->bind_hash;
642 spin_lock_bh(&head->lock);
643 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
644 __tcp_v4_hash(sk, 0);
645 spin_unlock_bh(&head->lock);
646 return 0;
647 } else {
648 spin_unlock(&head->lock);
649 /* No definite answer... Walk to established hash table */
650 ret = __tcp_v4_check_established(sk, snum, NULL);
651 out:
652 local_bh_enable();
653 return ret;
654 }
655 }
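/*
 * Unlike the plain rover in tcp_v4_get_port(), the connect-time search above
 * starts at a per-destination offset: a keyed hash of (local address, remote
 * address, remote port) plus a monotonically advancing hint, so different
 * destinations spread across the port range while repeated connects to one
 * destination keep walking forward. A hedged sketch of the probing order, with
 * the secure hash and the bind/established checks behind opaque helpers:
 *
 *	static unsigned int dest_offset(unsigned int saddr, unsigned int daddr,
 *					unsigned short dport);	// stands in for
 *								// secure_tcp_port_ephemeral()
 *
 *	static int connect_port_for(unsigned int saddr, unsigned int daddr,
 *				    unsigned short dport, int low, int high,
 *				    int (*try_port)(int port), unsigned int *hint)
 *	{
 *		int range = high - low;
 *		unsigned int offset = *hint + dest_offset(saddr, daddr, dport);
 *		int i;
 *
 *		for (i = 1; i <= range; i++) {
 *			int port = low + (i + offset) % range;	// same order as above
 *			if (try_port(port)) {
 *				*hint += i;	// the next connect resumes further on
 *				return port;
 *			}
 *		}
 *		return -1;			// -EADDRNOTAVAIL in the kernel
 *	}
 */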
656
657 /* This will initiate an outgoing connection. */
658 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
659 {
660 struct inet_sock *inet = inet_sk(sk);
661 struct tcp_sock *tp = tcp_sk(sk);
662 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
663 struct rtable *rt;
664 u32 daddr, nexthop;
665 int tmp;
666 int err;
667
668 if (addr_len < sizeof(struct sockaddr_in))
669 return -EINVAL;
670
671 if (usin->sin_family != AF_INET)
672 return -EAFNOSUPPORT;
673
674 nexthop = daddr = usin->sin_addr.s_addr;
675 if (inet->opt && inet->opt->srr) {
676 if (!daddr)
677 return -EINVAL;
678 nexthop = inet->opt->faddr;
679 }
680
681 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
682 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
683 IPPROTO_TCP,
684 inet->sport, usin->sin_port, sk);
685 if (tmp < 0)
686 return tmp;
687
688 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
689 ip_rt_put(rt);
690 return -ENETUNREACH;
691 }
692
693 if (!inet->opt || !inet->opt->srr)
694 daddr = rt->rt_dst;
695
696 if (!inet->saddr)
697 inet->saddr = rt->rt_src;
698 inet->rcv_saddr = inet->saddr;
699
700 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
701 /* Reset inherited state */
702 tp->rx_opt.ts_recent = 0;
703 tp->rx_opt.ts_recent_stamp = 0;
704 tp->write_seq = 0;
705 }
706
707 if (sysctl_tcp_tw_recycle &&
708 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
709 struct inet_peer *peer = rt_get_peer(rt);
710
711 /* VJ's idea. We save the last timestamp seen from
712 * the destination in the peer table when entering state TIME-WAIT,
713 * and initialize rx_opt.ts_recent from it when trying a new connection.
714 */
715
716 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
717 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
718 tp->rx_opt.ts_recent = peer->tcp_ts;
719 }
720 }
721
722 inet->dport = usin->sin_port;
723 inet->daddr = daddr;
724
725 tp->ext_header_len = 0;
726 if (inet->opt)
727 tp->ext_header_len = inet->opt->optlen;
728
729 tp->rx_opt.mss_clamp = 536;
730
731 /* Socket identity is still unknown (sport may be zero).
732 * However we set the state to SYN-SENT and, without releasing the socket
733 * lock, select a source port, enter ourselves into the hash tables and
734 * complete initialization after this.
735 */
736 tcp_set_state(sk, TCP_SYN_SENT);
737 err = tcp_v4_hash_connect(sk);
738 if (err)
739 goto failure;
740
741 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
742 if (err)
743 goto failure;
744
745 /* OK, now commit destination to socket. */
746 sk_setup_caps(sk, &rt->u.dst);
747
748 if (!tp->write_seq)
749 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
750 inet->daddr,
751 inet->sport,
752 usin->sin_port);
753
754 inet->id = tp->write_seq ^ jiffies;
755
756 err = tcp_connect(sk);
757 rt = NULL;
758 if (err)
759 goto failure;
760
761 return 0;
762
763 failure:
764 /* This unhashes the socket and releases the local port, if necessary. */
765 tcp_set_state(sk, TCP_CLOSE);
766 ip_rt_put(rt);
767 sk->sk_route_caps = 0;
768 inet->dport = 0;
769 return err;
770 }
771
772 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
773 {
774 return ((struct rtable *)skb->dst)->rt_iif;
775 }
776
777 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
778 {
779 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
780 }
781
782 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
783 struct request_sock ***prevp,
784 __u16 rport,
785 __u32 raddr, __u32 laddr)
786 {
787 struct listen_sock *lopt = tp->accept_queue.listen_opt;
788 struct request_sock *req, **prev;
789
790 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
791 (req = *prev) != NULL;
792 prev = &req->dl_next) {
793 const struct inet_request_sock *ireq = inet_rsk(req);
794
795 if (ireq->rmt_port == rport &&
796 ireq->rmt_addr == raddr &&
797 ireq->loc_addr == laddr &&
798 TCP_INET_FAMILY(req->rsk_ops->family)) {
799 BUG_TRAP(!req->sk);
800 *prevp = prev;
801 break;
802 }
803 }
804
805 return req;
806 }
807
808 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
809 {
810 struct tcp_sock *tp = tcp_sk(sk);
811 struct listen_sock *lopt = tp->accept_queue.listen_opt;
812 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
813
814 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
815 tcp_synq_added(sk);
816 }
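/*
 * tcp_v4_synq_hash() keys the per-listener SYN table on the remote address and
 * port plus a per-listener random value (lopt->hash_rnd), then masks down to
 * TCP_SYNQ_HSIZE buckets, which therefore has to be a power of two. A hedged
 * userspace sketch of the same shape; the bucket count and the jhash prototype
 * here are illustrative assumptions:
 *
 *	#include <stdint.h>
 *
 *	#define SYNQ_HSIZE 512		// power of two, so the mask below works
 *
 *	uint32_t jhash_2words(uint32_t a, uint32_t b, uint32_t initval);
 *
 *	static inline unsigned int synq_bucket(uint32_t raddr, uint16_t rport,
 *					       uint32_t rnd)
 *	{
 *		return jhash_2words(raddr, (uint32_t)rport, rnd) & (SYNQ_HSIZE - 1);
 *	}
 */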
817
818
819 /*
820 * This routine does path mtu discovery as defined in RFC1191.
821 */
822 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
823 u32 mtu)
824 {
825 struct dst_entry *dst;
826 struct inet_sock *inet = inet_sk(sk);
827 struct tcp_sock *tp = tcp_sk(sk);
828
829 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
830 * sent out by Linux are always < 576 bytes, so they should go through
831 * unfragmented).
832 */
833 if (sk->sk_state == TCP_LISTEN)
834 return;
835
836 /* We don't check in the dst entry whether PMTU discovery is forbidden
837 * on this route. We just assume that no packet-too-big packets
838 * are sent back when PMTU discovery is not active.
839 * There is a small race when the user changes this flag in the
840 * route, but I think that's acceptable.
841 */
842 if ((dst = __sk_dst_check(sk, 0)) == NULL)
843 return;
844
845 dst->ops->update_pmtu(dst, mtu);
846
847 /* Something is about to go wrong... Remember the soft error
848 * in case this connection is not able to recover.
849 */
850 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
851 sk->sk_err_soft = EMSGSIZE;
852
853 mtu = dst_mtu(dst);
854
855 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
856 tp->pmtu_cookie > mtu) {
857 tcp_sync_mss(sk, mtu);
858
859 /* Resend the TCP packet because it's
860 * clear that the old packet has been
861 * dropped. This is the new "fast" path mtu
862 * discovery.
863 */
864 tcp_simple_retransmit(sk);
865 } /* else let the usual retransmit timer handle it */
866 }
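/*
 * The decision above reduces to: clamp the cached route MTU via update_pmtu(),
 * record EMSGSIZE as a soft error when the reported MTU undercuts the route
 * while DF is set, and resynchronize the MSS (triggering an immediate
 * retransmit) only when the socket allows PMTU discovery and its cached
 * pmtu_cookie now exceeds the route MTU. A hedged sketch of that last check:
 *
 *	// pmtu_cookie: the MTU the socket's MSS was last derived from.
 *	static int must_resync_mss(int pmtudisc_dont, unsigned int pmtu_cookie,
 *				   unsigned int new_route_mtu)
 *	{
 *		if (pmtudisc_dont)		// IP_PMTUDISC_DONT: app opted out
 *			return 0;
 *		// Only ever shrink here; growth is picked up lazily elsewhere.
 *		return pmtu_cookie > new_route_mtu;
 *	}
 */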
867
868 /*
869 * This routine is called by the ICMP module when it gets some
870 * sort of error condition. If err < 0 then the socket should
871 * be closed and the error returned to the user. If err > 0
872 * it's just the icmp type << 8 | icmp code. After adjustment
873 * header points to the first 8 bytes of the tcp header. We need
874 * to find the appropriate port.
875 *
876 * The locking strategy used here is very "optimistic". When
877 * someone else accesses the socket the ICMP is just dropped
878 * and for some paths there is no check at all.
879 * A more general error queue to queue errors for later handling
880 * is probably better.
881 *
882 */
883
884 void tcp_v4_err(struct sk_buff *skb, u32 info)
885 {
886 struct iphdr *iph = (struct iphdr *)skb->data;
887 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
888 struct tcp_sock *tp;
889 struct inet_sock *inet;
890 int type = skb->h.icmph->type;
891 int code = skb->h.icmph->code;
892 struct sock *sk;
893 __u32 seq;
894 int err;
895
896 if (skb->len < (iph->ihl << 2) + 8) {
897 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
898 return;
899 }
900
901 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
902 th->source, tcp_v4_iif(skb));
903 if (!sk) {
904 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
905 return;
906 }
907 if (sk->sk_state == TCP_TIME_WAIT) {
908 tcp_tw_put((struct tcp_tw_bucket *)sk);
909 return;
910 }
911
912 bh_lock_sock(sk);
913 /* If too many ICMPs get dropped on busy
914 * servers this needs to be solved differently.
915 */
916 if (sock_owned_by_user(sk))
917 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
918
919 if (sk->sk_state == TCP_CLOSE)
920 goto out;
921
922 tp = tcp_sk(sk);
923 seq = ntohl(th->seq);
924 if (sk->sk_state != TCP_LISTEN &&
925 !between(seq, tp->snd_una, tp->snd_nxt)) {
926 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
927 goto out;
928 }
929
930 switch (type) {
931 case ICMP_SOURCE_QUENCH:
932 /* Just silently ignore these. */
933 goto out;
934 case ICMP_PARAMETERPROB:
935 err = EPROTO;
936 break;
937 case ICMP_DEST_UNREACH:
938 if (code > NR_ICMP_UNREACH)
939 goto out;
940
941 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
942 if (!sock_owned_by_user(sk))
943 do_pmtu_discovery(sk, iph, info);
944 goto out;
945 }
946
947 err = icmp_err_convert[code].errno;
948 break;
949 case ICMP_TIME_EXCEEDED:
950 err = EHOSTUNREACH;
951 break;
952 default:
953 goto out;
954 }
955
956 switch (sk->sk_state) {
957 struct request_sock *req, **prev;
958 case TCP_LISTEN:
959 if (sock_owned_by_user(sk))
960 goto out;
961
962 req = tcp_v4_search_req(tp, &prev, th->dest,
963 iph->daddr, iph->saddr);
964 if (!req)
965 goto out;
966
967 /* ICMPs are not backlogged, hence we cannot get
968 an established socket here.
969 */
970 BUG_TRAP(!req->sk);
971
972 if (seq != tcp_rsk(req)->snt_isn) {
973 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
974 goto out;
975 }
976
977 /*
978 * Still in SYN_RECV, just remove it silently.
979 * There is no good way to pass the error to the newly
980 * created socket, and POSIX does not want network
981 * errors returned from accept().
982 */
983 tcp_synq_drop(sk, req, prev);
984 goto out;
985
986 case TCP_SYN_SENT:
987 case TCP_SYN_RECV: /* Cannot happen normally.
988 It can, for example, if SYNs crossed.
989 */
990 if (!sock_owned_by_user(sk)) {
991 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
992 sk->sk_err = err;
993
994 sk->sk_error_report(sk);
995
996 tcp_done(sk);
997 } else {
998 sk->sk_err_soft = err;
999 }
1000 goto out;
1001 }
1002
1003 /* If we've already connected we will keep trying
1004 * until we time out, or the user gives up.
1005 *
1006 * RFC 1122 4.2.3.9 allows us to consider as hard errors
1007 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1008 * but it is obsoleted by PMTU discovery).
1009 *
1010 * Note that in the modern internet, where routing is unreliable
1011 * and broken firewalls sit in every dark corner sending random
1012 * errors ordered by their masters, even these two messages finally lose
1013 * their original sense (even Linux sends invalid PORT_UNREACHs).
1014 *
1015 * Now we are in compliance with RFCs.
1016 * --ANK (980905)
1017 */
1018
1019 inet = inet_sk(sk);
1020 if (!sock_owned_by_user(sk) && inet->recverr) {
1021 sk->sk_err = err;
1022 sk->sk_error_report(sk);
1023 } else { /* Only an error on timeout */
1024 sk->sk_err_soft = err;
1025 }
1026
1027 out:
1028 bh_unlock_sock(sk);
1029 sock_put(sk);
1030 }
1031
1032 /* This routine computes an IPv4 TCP checksum. */
1033 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1034 struct sk_buff *skb)
1035 {
1036 struct inet_sock *inet = inet_sk(sk);
1037
1038 if (skb->ip_summed == CHECKSUM_HW) {
1039 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1040 skb->csum = offsetof(struct tcphdr, check);
1041 } else {
1042 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1043 csum_partial((char *)th,
1044 th->doff << 2,
1045 skb->csum));
1046 }
1047 }
1048
1049 /*
1050 * This routine will send an RST to the other tcp.
1051 *
1052 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1053 * for the reset?
1054 * Answer: if a packet caused the RST, it is not for a socket
1055 * existing in our system; if it is matched to a socket,
1056 * it is just a duplicate segment or a bug in the other side's TCP.
1057 * So we build the reply based only on the parameters
1058 * that arrived with the segment.
1059 * Exception: precedence violation. We do not implement it in any case.
1060 */
1061
1062 static void tcp_v4_send_reset(struct sk_buff *skb)
1063 {
1064 struct tcphdr *th = skb->h.th;
1065 struct tcphdr rth;
1066 struct ip_reply_arg arg;
1067
1068 /* Never send a reset in response to a reset. */
1069 if (th->rst)
1070 return;
1071
1072 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1073 return;
1074
1075 /* Swap the send and the receive. */
1076 memset(&rth, 0, sizeof(struct tcphdr));
1077 rth.dest = th->source;
1078 rth.source = th->dest;
1079 rth.doff = sizeof(struct tcphdr) / 4;
1080 rth.rst = 1;
1081
1082 if (th->ack) {
1083 rth.seq = th->ack_seq;
1084 } else {
1085 rth.ack = 1;
1086 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1087 skb->len - (th->doff << 2));
1088 }
1089
1090 memset(&arg, 0, sizeof arg);
1091 arg.iov[0].iov_base = (unsigned char *)&rth;
1092 arg.iov[0].iov_len = sizeof rth;
1093 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1094 skb->nh.iph->saddr, /*XXX*/
1095 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1096 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1097
1098 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1099
1100 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1101 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1102 }
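/*
 * When the offending segment carried no ACK, the reset built above must itself
 * ACK everything the segment occupied so the peer will accept it: the sequence
 * number plus one unit each for SYN and FIN plus the payload length. A hedged
 * sketch of that arithmetic on plain values:
 *
 *	#include <stdint.h>
 *
 *	// seq and payload_len in host order; syn/fin are 0 or 1 as in the header.
 *	static uint32_t rst_ack_seq(uint32_t seq, int syn, int fin,
 *				    uint32_t payload_len)
 *	{
 *		// SYN and FIN each consume one sequence number besides the data.
 *		return seq + (uint32_t)syn + (uint32_t)fin + payload_len;
 *	}
 */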
1103
1104 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1105 outside of socket context, is certainly ugly. What can I do?
1106 */
1107
1108 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1109 u32 win, u32 ts)
1110 {
1111 struct tcphdr *th = skb->h.th;
1112 struct {
1113 struct tcphdr th;
1114 u32 tsopt[3];
1115 } rep;
1116 struct ip_reply_arg arg;
1117
1118 memset(&rep.th, 0, sizeof(struct tcphdr));
1119 memset(&arg, 0, sizeof arg);
1120
1121 arg.iov[0].iov_base = (unsigned char *)&rep;
1122 arg.iov[0].iov_len = sizeof(rep.th);
1123 if (ts) {
1124 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1125 (TCPOPT_TIMESTAMP << 8) |
1126 TCPOLEN_TIMESTAMP);
1127 rep.tsopt[1] = htonl(tcp_time_stamp);
1128 rep.tsopt[2] = htonl(ts);
1129 arg.iov[0].iov_len = sizeof(rep);
1130 }
1131
1132 /* Swap the send and the receive. */
1133 rep.th.dest = th->source;
1134 rep.th.source = th->dest;
1135 rep.th.doff = arg.iov[0].iov_len / 4;
1136 rep.th.seq = htonl(seq);
1137 rep.th.ack_seq = htonl(ack);
1138 rep.th.ack = 1;
1139 rep.th.window = htons(win);
1140
1141 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1142 skb->nh.iph->saddr, /*XXX*/
1143 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1144 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1145
1146 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1147
1148 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1149 }
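/*
 * The three words written into rep.tsopt above are the conventional way to
 * keep the 10-byte timestamp option 4-byte aligned: two NOPs, then kind 8
 * (TCPOPT_TIMESTAMP), length 10, then the 32-bit TSval and TSecr. A hedged
 * sketch of the same 12-byte encoding into a plain buffer:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *	#include <arpa/inet.h>
 *
 *	// Writes NOP, NOP, kind=8, len=10, TSval, TSecr (values in network order).
 *	static void encode_tcp_timestamp(uint8_t opt[12], uint32_t tsval, uint32_t tsecr)
 *	{
 *		uint32_t w;
 *
 *		opt[0] = 1;		// TCPOPT_NOP
 *		opt[1] = 1;		// TCPOPT_NOP
 *		opt[2] = 8;		// TCPOPT_TIMESTAMP
 *		opt[3] = 10;		// TCPOLEN_TIMESTAMP
 *		w = htonl(tsval);
 *		memcpy(opt + 4, &w, 4);
 *		w = htonl(tsecr);
 *		memcpy(opt + 8, &w, 4);
 *	}
 */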
1150
1151 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1152 {
1153 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1154
1155 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1156 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1157
1158 tcp_tw_put(tw);
1159 }
1160
1161 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1162 {
1163 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1164 req->ts_recent);
1165 }
1166
1167 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1168 struct request_sock *req)
1169 {
1170 struct rtable *rt;
1171 const struct inet_request_sock *ireq = inet_rsk(req);
1172 struct ip_options *opt = inet_rsk(req)->opt;
1173 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1174 .nl_u = { .ip4_u =
1175 { .daddr = ((opt && opt->srr) ?
1176 opt->faddr :
1177 ireq->rmt_addr),
1178 .saddr = ireq->loc_addr,
1179 .tos = RT_CONN_FLAGS(sk) } },
1180 .proto = IPPROTO_TCP,
1181 .uli_u = { .ports =
1182 { .sport = inet_sk(sk)->sport,
1183 .dport = ireq->rmt_port } } };
1184
1185 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1186 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1187 return NULL;
1188 }
1189 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1190 ip_rt_put(rt);
1191 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1192 return NULL;
1193 }
1194 return &rt->u.dst;
1195 }
1196
1197 /*
1198 * Send a SYN-ACK after having received an ACK.
1199 * This still operates on a request_sock only, not on a big
1200 * socket.
1201 */
1202 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1203 struct dst_entry *dst)
1204 {
1205 const struct inet_request_sock *ireq = inet_rsk(req);
1206 int err = -1;
1207 struct sk_buff * skb;
1208
1209 /* First, grab a route. */
1210 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1211 goto out;
1212
1213 skb = tcp_make_synack(sk, dst, req);
1214
1215 if (skb) {
1216 struct tcphdr *th = skb->h.th;
1217
1218 th->check = tcp_v4_check(th, skb->len,
1219 ireq->loc_addr,
1220 ireq->rmt_addr,
1221 csum_partial((char *)th, skb->len,
1222 skb->csum));
1223
1224 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1225 ireq->rmt_addr,
1226 ireq->opt);
1227 if (err == NET_XMIT_CN)
1228 err = 0;
1229 }
1230
1231 out:
1232 dst_release(dst);
1233 return err;
1234 }
1235
1236 /*
1237 * IPv4 request_sock destructor.
1238 */
1239 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1240 {
1241 if (inet_rsk(req)->opt)
1242 kfree(inet_rsk(req)->opt);
1243 }
1244
1245 static inline void syn_flood_warning(struct sk_buff *skb)
1246 {
1247 static unsigned long warntime;
1248
1249 if (time_after(jiffies, (warntime + HZ * 60))) {
1250 warntime = jiffies;
1251 printk(KERN_INFO
1252 "possible SYN flooding on port %d. Sending cookies.\n",
1253 ntohs(skb->h.th->dest));
1254 }
1255 }
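/*
 * The warning above is rate limited to one line per minute with nothing more
 * than a static jiffies stamp and time_after(), which stays correct across
 * jiffies wraparound. A hedged userspace sketch of the same pattern; the
 * time_after() here mirrors the kernel macro rather than reusing it:
 *
 *	#define time_after(a, b)	((long)((b) - (a)) < 0)
 *
 *	static int should_warn(unsigned long now_jiffies, unsigned long hz)
 *	{
 *		static unsigned long warntime;
 *
 *		if (time_after(now_jiffies, warntime + hz * 60)) {
 *			warntime = now_jiffies;
 *			return 1;		// at most one warning per minute
 *		}
 *		return 0;
 *	}
 */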
1256
1257 /*
1258 * Save and compile IPv4 options into the request_sock if needed.
1259 */
1260 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1261 struct sk_buff *skb)
1262 {
1263 struct ip_options *opt = &(IPCB(skb)->opt);
1264 struct ip_options *dopt = NULL;
1265
1266 if (opt && opt->optlen) {
1267 int opt_size = optlength(opt);
1268 dopt = kmalloc(opt_size, GFP_ATOMIC);
1269 if (dopt) {
1270 if (ip_options_echo(dopt, skb)) {
1271 kfree(dopt);
1272 dopt = NULL;
1273 }
1274 }
1275 }
1276 return dopt;
1277 }
1278
1279 struct request_sock_ops tcp_request_sock_ops = {
1280 .family = PF_INET,
1281 .obj_size = sizeof(struct tcp_request_sock),
1282 .rtx_syn_ack = tcp_v4_send_synack,
1283 .send_ack = tcp_v4_reqsk_send_ack,
1284 .destructor = tcp_v4_reqsk_destructor,
1285 .send_reset = tcp_v4_send_reset,
1286 };
1287
1288 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1289 {
1290 struct inet_request_sock *ireq;
1291 struct tcp_options_received tmp_opt;
1292 struct request_sock *req;
1293 __u32 saddr = skb->nh.iph->saddr;
1294 __u32 daddr = skb->nh.iph->daddr;
1295 __u32 isn = TCP_SKB_CB(skb)->when;
1296 struct dst_entry *dst = NULL;
1297 #ifdef CONFIG_SYN_COOKIES
1298 int want_cookie = 0;
1299 #else
1300 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1301 #endif
1302
1303 /* Never answer SYNs sent to broadcast or multicast */
1304 if (((struct rtable *)skb->dst)->rt_flags &
1305 (RTCF_BROADCAST | RTCF_MULTICAST))
1306 goto drop;
1307
1308 /* TW buckets are converted to open requests without
1309 * limitations; they conserve resources and the peer is
1310 * evidently a real one.
1311 */
1312 if (tcp_synq_is_full(sk) && !isn) {
1313 #ifdef CONFIG_SYN_COOKIES
1314 if (sysctl_tcp_syncookies) {
1315 want_cookie = 1;
1316 } else
1317 #endif
1318 goto drop;
1319 }
1320
1321 /* Accept backlog is full. If we have already queued enough
1322 * warm entries in the syn queue, drop the request. It is better than
1323 * clogging the syn queue with openreqs with exponentially increasing
1324 * timeouts.
1325 */
1326 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1327 goto drop;
1328
1329 req = reqsk_alloc(&tcp_request_sock_ops);
1330 if (!req)
1331 goto drop;
1332
1333 tcp_clear_options(&tmp_opt);
1334 tmp_opt.mss_clamp = 536;
1335 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1336
1337 tcp_parse_options(skb, &tmp_opt, 0);
1338
1339 if (want_cookie) {
1340 tcp_clear_options(&tmp_opt);
1341 tmp_opt.saw_tstamp = 0;
1342 }
1343
1344 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1345 /* Some OSes (unknown ones, but I see them on a web server which
1346 * contains information interesting only for Windows
1347 * users) do not send their timestamp in the SYN. It is the easy case:
1348 * we simply do not advertise TS support.
1349 */
1350 tmp_opt.saw_tstamp = 0;
1351 tmp_opt.tstamp_ok = 0;
1352 }
1353 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1354
1355 tcp_openreq_init(req, &tmp_opt, skb);
1356
1357 ireq = inet_rsk(req);
1358 ireq->loc_addr = daddr;
1359 ireq->rmt_addr = saddr;
1360 ireq->opt = tcp_v4_save_options(sk, skb);
1361 if (!want_cookie)
1362 TCP_ECN_create_request(req, skb->h.th);
1363
1364 if (want_cookie) {
1365 #ifdef CONFIG_SYN_COOKIES
1366 syn_flood_warning(skb);
1367 #endif
1368 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1369 } else if (!isn) {
1370 struct inet_peer *peer = NULL;
1371
1372 /* VJ's idea. We save the last timestamp seen
1373 * from the destination in the peer table when entering
1374 * state TIME-WAIT, and check against it before
1375 * accepting a new connection request.
1376 *
1377 * If "isn" is not zero, this request hit an alive
1378 * timewait bucket, so all the necessary checks
1379 * are made in the function processing the timewait state.
1380 */
1381 if (tmp_opt.saw_tstamp &&
1382 sysctl_tcp_tw_recycle &&
1383 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1384 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1385 peer->v4daddr == saddr) {
1386 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1387 (s32)(peer->tcp_ts - req->ts_recent) >
1388 TCP_PAWS_WINDOW) {
1389 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1390 dst_release(dst);
1391 goto drop_and_free;
1392 }
1393 }
1394 /* Kill the following clause, if you dislike this way. */
1395 else if (!sysctl_tcp_syncookies &&
1396 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1397 (sysctl_max_syn_backlog >> 2)) &&
1398 (!peer || !peer->tcp_ts_stamp) &&
1399 (!dst || !dst_metric(dst, RTAX_RTT))) {
1400 /* Without syncookies, the last quarter of the
1401 * backlog is filled with destinations
1402 * proven to be alive.
1403 * It means that we continue to communicate
1404 * only with destinations already remembered
1405 * at the moment of the synflood.
1406 */
1407 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1408 "request from %u.%u."
1409 "%u.%u/%u\n",
1410 NIPQUAD(saddr),
1411 ntohs(skb->h.th->source)));
1412 dst_release(dst);
1413 goto drop_and_free;
1414 }
1415
1416 isn = tcp_v4_init_sequence(sk, skb);
1417 }
1418 tcp_rsk(req)->snt_isn = isn;
1419
1420 if (tcp_v4_send_synack(sk, req, dst))
1421 goto drop_and_free;
1422
1423 if (want_cookie) {
1424 reqsk_free(req);
1425 } else {
1426 tcp_v4_synq_add(sk, req);
1427 }
1428 return 0;
1429
1430 drop_and_free:
1431 reqsk_free(req);
1432 drop:
1433 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1434 return 0;
1435 }
1436
1437
1438 /*
1439 * The three way handshake has completed - we got a valid synack -
1440 * now create the new socket.
1441 */
1442 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1443 struct request_sock *req,
1444 struct dst_entry *dst)
1445 {
1446 struct inet_request_sock *ireq;
1447 struct inet_sock *newinet;
1448 struct tcp_sock *newtp;
1449 struct sock *newsk;
1450
1451 if (sk_acceptq_is_full(sk))
1452 goto exit_overflow;
1453
1454 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1455 goto exit;
1456
1457 newsk = tcp_create_openreq_child(sk, req, skb);
1458 if (!newsk)
1459 goto exit;
1460
1461 sk_setup_caps(newsk, dst);
1462
1463 newtp = tcp_sk(newsk);
1464 newinet = inet_sk(newsk);
1465 ireq = inet_rsk(req);
1466 newinet->daddr = ireq->rmt_addr;
1467 newinet->rcv_saddr = ireq->loc_addr;
1468 newinet->saddr = ireq->loc_addr;
1469 newinet->opt = ireq->opt;
1470 ireq->opt = NULL;
1471 newinet->mc_index = tcp_v4_iif(skb);
1472 newinet->mc_ttl = skb->nh.iph->ttl;
1473 newtp->ext_header_len = 0;
1474 if (newinet->opt)
1475 newtp->ext_header_len = newinet->opt->optlen;
1476 newinet->id = newtp->write_seq ^ jiffies;
1477
1478 tcp_sync_mss(newsk, dst_mtu(dst));
1479 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1480 tcp_initialize_rcv_mss(newsk);
1481
1482 __tcp_v4_hash(newsk, 0);
1483 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1484
1485 return newsk;
1486
1487 exit_overflow:
1488 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1489 exit:
1490 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1491 dst_release(dst);
1492 return NULL;
1493 }
1494
1495 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1496 {
1497 struct tcphdr *th = skb->h.th;
1498 struct iphdr *iph = skb->nh.iph;
1499 struct tcp_sock *tp = tcp_sk(sk);
1500 struct sock *nsk;
1501 struct request_sock **prev;
1502 /* Find possible connection requests. */
1503 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1504 iph->saddr, iph->daddr);
1505 if (req)
1506 return tcp_check_req(sk, skb, req, prev);
1507
1508 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1509 th->source,
1510 skb->nh.iph->daddr,
1511 ntohs(th->dest),
1512 tcp_v4_iif(skb));
1513
1514 if (nsk) {
1515 if (nsk->sk_state != TCP_TIME_WAIT) {
1516 bh_lock_sock(nsk);
1517 return nsk;
1518 }
1519 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1520 return NULL;
1521 }
1522
1523 #ifdef CONFIG_SYN_COOKIES
1524 if (!th->rst && !th->syn && th->ack)
1525 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1526 #endif
1527 return sk;
1528 }
1529
1530 static int tcp_v4_checksum_init(struct sk_buff *skb)
1531 {
1532 if (skb->ip_summed == CHECKSUM_HW) {
1533 skb->ip_summed = CHECKSUM_UNNECESSARY;
1534 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1535 skb->nh.iph->daddr, skb->csum))
1536 return 0;
1537
1538 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1539 skb->ip_summed = CHECKSUM_NONE;
1540 }
1541 if (skb->len <= 76) {
1542 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1543 skb->nh.iph->daddr,
1544 skb_checksum(skb, 0, skb->len, 0)))
1545 return -1;
1546 skb->ip_summed = CHECKSUM_UNNECESSARY;
1547 } else {
1548 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1549 skb->nh.iph->saddr,
1550 skb->nh.iph->daddr, 0);
1551 }
1552 return 0;
1553 }
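/*
 * tcp_v4_check() used above folds the TCP pseudo-header (source address,
 * destination address, protocol, TCP length) into the one's-complement sum
 * over the TCP header and payload. A hedged, byte-at-a-time userspace sketch
 * of that sum; the kernel uses the optimized csum_* helpers instead:
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <arpa/inet.h>
 *
 *	static uint32_t sum16(const uint8_t *p, size_t len, uint32_t sum)
 *	{
 *		while (len > 1) {		// add 16-bit big-endian words
 *			sum += ((uint32_t)p[0] << 8) | p[1];
 *			p += 2;
 *			len -= 2;
 *		}
 *		if (len)			// odd trailing byte, zero padded
 *			sum += (uint32_t)p[0] << 8;
 *		return sum;
 *	}
 *
 *	// saddr/daddr in network order; segment covers TCP header plus payload.
 *	// Returns the checksum field ready to be stored in the header.
 *	static uint16_t tcp_checksum(uint32_t saddr, uint32_t daddr,
 *				     const uint8_t *segment, size_t len)
 *	{
 *		uint8_t pseudo[12];
 *		uint32_t sum;
 *
 *		memcpy(pseudo, &saddr, 4);	// src, dst, zero, proto, TCP length
 *		memcpy(pseudo + 4, &daddr, 4);
 *		pseudo[8] = 0;
 *		pseudo[9] = 6;			// IPPROTO_TCP
 *		pseudo[10] = (uint8_t)(len >> 8);
 *		pseudo[11] = (uint8_t)len;
 *
 *		sum = sum16(pseudo, sizeof(pseudo), 0);
 *		sum = sum16(segment, len, sum);
 *		while (sum >> 16)		// fold the carries back in
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return htons((uint16_t)~sum);
 *	}
 */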
1554
1555
1556 /* The socket must have its spinlock held when we get
1557 * here.
1558 *
1559 * We have a potential double-lock case here, so even when
1560 * doing backlog processing we use the BH locking scheme.
1561 * This is because we cannot sleep with the original spinlock
1562 * held.
1563 */
1564 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1565 {
1566 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1567 TCP_CHECK_TIMER(sk);
1568 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1569 goto reset;
1570 TCP_CHECK_TIMER(sk);
1571 return 0;
1572 }
1573
1574 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1575 goto csum_err;
1576
1577 if (sk->sk_state == TCP_LISTEN) {
1578 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1579 if (!nsk)
1580 goto discard;
1581
1582 if (nsk != sk) {
1583 if (tcp_child_process(sk, nsk, skb))
1584 goto reset;
1585 return 0;
1586 }
1587 }
1588
1589 TCP_CHECK_TIMER(sk);
1590 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1591 goto reset;
1592 TCP_CHECK_TIMER(sk);
1593 return 0;
1594
1595 reset:
1596 tcp_v4_send_reset(skb);
1597 discard:
1598 kfree_skb(skb);
1599 /* Be careful here. If this function gets more complicated and
1600 * gcc suffers from register pressure on the x86, sk (in %ebx)
1601 * might be destroyed here. This current version compiles correctly,
1602 * but you have been warned.
1603 */
1604 return 0;
1605
1606 csum_err:
1607 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1608 goto discard;
1609 }
1610
1611 /*
1612 * From tcp_input.c
1613 */
1614
1615 int tcp_v4_rcv(struct sk_buff *skb)
1616 {
1617 struct tcphdr *th;
1618 struct sock *sk;
1619 int ret;
1620
1621 if (skb->pkt_type != PACKET_HOST)
1622 goto discard_it;
1623
1624 /* Count it even if it's bad */
1625 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1626
1627 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1628 goto discard_it;
1629
1630 th = skb->h.th;
1631
1632 if (th->doff < sizeof(struct tcphdr) / 4)
1633 goto bad_packet;
1634 if (!pskb_may_pull(skb, th->doff * 4))
1635 goto discard_it;
1636
1637 /* An explanation is required here, I think.
1638 * Packet length and doff are validated by header prediction,
1639 * provided the case of th->doff==0 is eliminated.
1640 * So, we defer the checks. */
1641 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1642 tcp_v4_checksum_init(skb) < 0))
1643 goto bad_packet;
1644
1645 th = skb->h.th;
1646 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1647 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1648 skb->len - th->doff * 4);
1649 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1650 TCP_SKB_CB(skb)->when = 0;
1651 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1652 TCP_SKB_CB(skb)->sacked = 0;
1653
1654 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1655 skb->nh.iph->daddr, ntohs(th->dest),
1656 tcp_v4_iif(skb));
1657
1658 if (!sk)
1659 goto no_tcp_socket;
1660
1661 process:
1662 if (sk->sk_state == TCP_TIME_WAIT)
1663 goto do_time_wait;
1664
1665 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1666 goto discard_and_relse;
1667
1668 if (sk_filter(sk, skb, 0))
1669 goto discard_and_relse;
1670
1671 skb->dev = NULL;
1672
1673 bh_lock_sock(sk);
1674 ret = 0;
1675 if (!sock_owned_by_user(sk)) {
1676 if (!tcp_prequeue(sk, skb))
1677 ret = tcp_v4_do_rcv(sk, skb);
1678 } else
1679 sk_add_backlog(sk, skb);
1680 bh_unlock_sock(sk);
1681
1682 sock_put(sk);
1683
1684 return ret;
1685
1686 no_tcp_socket:
1687 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1688 goto discard_it;
1689
1690 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1691 bad_packet:
1692 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1693 } else {
1694 tcp_v4_send_reset(skb);
1695 }
1696
1697 discard_it:
1698 /* Discard frame. */
1699 kfree_skb(skb);
1700 return 0;
1701
1702 discard_and_relse:
1703 sock_put(sk);
1704 goto discard_it;
1705
1706 do_time_wait:
1707 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1708 tcp_tw_put((struct tcp_tw_bucket *) sk);
1709 goto discard_it;
1710 }
1711
1712 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1713 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1714 tcp_tw_put((struct tcp_tw_bucket *) sk);
1715 goto discard_it;
1716 }
1717 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1718 skb, th, skb->len)) {
1719 case TCP_TW_SYN: {
1720 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1721 ntohs(th->dest),
1722 tcp_v4_iif(skb));
1723 if (sk2) {
1724 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1725 tcp_tw_put((struct tcp_tw_bucket *)sk);
1726 sk = sk2;
1727 goto process;
1728 }
1729 /* Fall through to ACK */
1730 }
1731 case TCP_TW_ACK:
1732 tcp_v4_timewait_ack(sk, skb);
1733 break;
1734 case TCP_TW_RST:
1735 goto no_tcp_socket;
1736 case TCP_TW_SUCCESS:;
1737 }
1738 goto discard_it;
1739 }
1740
1741 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1742 {
1743 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1744 struct inet_sock *inet = inet_sk(sk);
1745
1746 sin->sin_family = AF_INET;
1747 sin->sin_addr.s_addr = inet->daddr;
1748 sin->sin_port = inet->dport;
1749 }
1750
1751 /* VJ's idea. Save the last timestamp seen from this destination
1752 * and hold it at least for the normal timewait interval, to use for duplicate
1753 * segment detection in subsequent connections before they enter the synchronized
1754 * state.
1755 */
1756
1757 int tcp_v4_remember_stamp(struct sock *sk)
1758 {
1759 struct inet_sock *inet = inet_sk(sk);
1760 struct tcp_sock *tp = tcp_sk(sk);
1761 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1762 struct inet_peer *peer = NULL;
1763 int release_it = 0;
1764
1765 if (!rt || rt->rt_dst != inet->daddr) {
1766 peer = inet_getpeer(inet->daddr, 1);
1767 release_it = 1;
1768 } else {
1769 if (!rt->peer)
1770 rt_bind_peer(rt, 1);
1771 peer = rt->peer;
1772 }
1773
1774 if (peer) {
1775 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1776 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1777 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1778 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1779 peer->tcp_ts = tp->rx_opt.ts_recent;
1780 }
1781 if (release_it)
1782 inet_putpeer(peer);
1783 return 1;
1784 }
1785
1786 return 0;
1787 }
1788
1789 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1790 {
1791 struct inet_peer *peer = NULL;
1792
1793 peer = inet_getpeer(tw->tw_daddr, 1);
1794
1795 if (peer) {
1796 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1797 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1798 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1799 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1800 peer->tcp_ts = tw->tw_ts_recent;
1801 }
1802 inet_putpeer(peer);
1803 return 1;
1804 }
1805
1806 return 0;
1807 }
1808
1809 struct tcp_func ipv4_specific = {
1810 .queue_xmit = ip_queue_xmit,
1811 .send_check = tcp_v4_send_check,
1812 .rebuild_header = inet_sk_rebuild_header,
1813 .conn_request = tcp_v4_conn_request,
1814 .syn_recv_sock = tcp_v4_syn_recv_sock,
1815 .remember_stamp = tcp_v4_remember_stamp,
1816 .net_header_len = sizeof(struct iphdr),
1817 .setsockopt = ip_setsockopt,
1818 .getsockopt = ip_getsockopt,
1819 .addr2sockaddr = v4_addr2sockaddr,
1820 .sockaddr_len = sizeof(struct sockaddr_in),
1821 };
1822
1823 /* NOTE: A lot of things are set to zero explicitly by the call to
1824 * sk_alloc(), so they need not be done here.
1825 */
1826 static int tcp_v4_init_sock(struct sock *sk)
1827 {
1828 struct tcp_sock *tp = tcp_sk(sk);
1829
1830 skb_queue_head_init(&tp->out_of_order_queue);
1831 tcp_init_xmit_timers(sk);
1832 tcp_prequeue_init(tp);
1833
1834 tp->rto = TCP_TIMEOUT_INIT;
1835 tp->mdev = TCP_TIMEOUT_INIT;
1836
1837 /* So many TCP implementations out there (incorrectly) count the
1838 * initial SYN frame in their delayed-ACK and congestion control
1839 * algorithms that we must have the following bandaid to talk
1840 * efficiently to them. -DaveM
1841 */
1842 tp->snd_cwnd = 2;
1843
1844 /* See draft-stevens-tcpca-spec-01 for discussion of the
1845 * initialization of these values.
1846 */
1847 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1848 tp->snd_cwnd_clamp = ~0;
1849 tp->mss_cache = 536;
1850
1851 tp->reordering = sysctl_tcp_reordering;
1852 tp->ca_ops = &tcp_init_congestion_ops;
1853
1854 sk->sk_state = TCP_CLOSE;
1855
1856 sk->sk_write_space = sk_stream_write_space;
1857 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1858
1859 tp->af_specific = &ipv4_specific;
1860
1861 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1862 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1863
1864 atomic_inc(&tcp_sockets_allocated);
1865
1866 return 0;
1867 }
1868
1869 int tcp_v4_destroy_sock(struct sock *sk)
1870 {
1871 struct tcp_sock *tp = tcp_sk(sk);
1872
1873 tcp_clear_xmit_timers(sk);
1874
1875 tcp_cleanup_congestion_control(tp);
1876
1877 /* Clean up the write buffer. */
1878 sk_stream_writequeue_purge(sk);
1879
1880 /* Cleans up our, hopefully empty, out_of_order_queue. */
1881 __skb_queue_purge(&tp->out_of_order_queue);
1882
1883 /* Clean the prequeue; it really must be empty. */
1884 __skb_queue_purge(&tp->ucopy.prequeue);
1885
1886 /* Clean up a referenced TCP bind bucket. */
1887 if (inet_sk(sk)->bind_hash)
1888 inet_put_port(&tcp_hashinfo, sk);
1889
1890 /*
1891 * If sendmsg cached page exists, toss it.
1892 */
1893 if (sk->sk_sndmsg_page) {
1894 __free_page(sk->sk_sndmsg_page);
1895 sk->sk_sndmsg_page = NULL;
1896 }
1897
1898 atomic_dec(&tcp_sockets_allocated);
1899
1900 return 0;
1901 }
1902
1903 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1904
1905 #ifdef CONFIG_PROC_FS
1906 /* Proc filesystem TCP sock list dumping. */
1907
1908 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1909 {
1910 return hlist_empty(head) ? NULL :
1911 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1912 }
1913
1914 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1915 {
1916 return tw->tw_node.next ?
1917 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1918 }
1919
1920 static void *listening_get_next(struct seq_file *seq, void *cur)
1921 {
1922 struct tcp_sock *tp;
1923 struct hlist_node *node;
1924 struct sock *sk = cur;
1925 struct tcp_iter_state* st = seq->private;
1926
1927 if (!sk) {
1928 st->bucket = 0;
1929 sk = sk_head(&tcp_listening_hash[0]);
1930 goto get_sk;
1931 }
1932
1933 ++st->num;
1934
1935 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1936 struct request_sock *req = cur;
1937
1938 tp = tcp_sk(st->syn_wait_sk);
1939 req = req->dl_next;
1940 while (1) {
1941 while (req) {
1942 if (req->rsk_ops->family == st->family) {
1943 cur = req;
1944 goto out;
1945 }
1946 req = req->dl_next;
1947 }
1948 if (++st->sbucket >= TCP_SYNQ_HSIZE)
1949 break;
1950 get_req:
1951 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1952 }
1953 sk = sk_next(st->syn_wait_sk);
1954 st->state = TCP_SEQ_STATE_LISTENING;
1955 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1956 } else {
1957 tp = tcp_sk(sk);
1958 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1959 if (reqsk_queue_len(&tp->accept_queue))
1960 goto start_req;
1961 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1962 sk = sk_next(sk);
1963 }
1964 get_sk:
1965 sk_for_each_from(sk, node) {
1966 if (sk->sk_family == st->family) {
1967 cur = sk;
1968 goto out;
1969 }
1970 tp = tcp_sk(sk);
1971 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1972 if (reqsk_queue_len(&tp->accept_queue)) {
1973 start_req:
1974 st->uid = sock_i_uid(sk);
1975 st->syn_wait_sk = sk;
1976 st->state = TCP_SEQ_STATE_OPENREQ;
1977 st->sbucket = 0;
1978 goto get_req;
1979 }
1980 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1981 }
1982 if (++st->bucket < INET_LHTABLE_SIZE) {
1983 sk = sk_head(&tcp_listening_hash[st->bucket]);
1984 goto get_sk;
1985 }
1986 cur = NULL;
1987 out:
1988 return cur;
1989 }
1990
1991 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1992 {
1993 void *rc = listening_get_next(seq, NULL);
1994
1995 while (rc && *pos) {
1996 rc = listening_get_next(seq, rc);
1997 --*pos;
1998 }
1999 return rc;
2000 }
2001
2002 static void *established_get_first(struct seq_file *seq)
2003 {
2004 struct tcp_iter_state* st = seq->private;
2005 void *rc = NULL;
2006
2007 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2008 struct sock *sk;
2009 struct hlist_node *node;
2010 struct tcp_tw_bucket *tw;
2011
2012 /* We can reschedule _before_ having picked the target: */
2013 cond_resched_softirq();
2014
2015 read_lock(&tcp_ehash[st->bucket].lock);
2016 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2017 if (sk->sk_family != st->family) {
2018 continue;
2019 }
2020 rc = sk;
2021 goto out;
2022 }
2023 st->state = TCP_SEQ_STATE_TIME_WAIT;
2024 tw_for_each(tw, node,
2025 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2026 if (tw->tw_family != st->family) {
2027 continue;
2028 }
2029 rc = tw;
2030 goto out;
2031 }
2032 read_unlock(&tcp_ehash[st->bucket].lock);
2033 st->state = TCP_SEQ_STATE_ESTABLISHED;
2034 }
2035 out:
2036 return rc;
2037 }
2038
2039 static void *established_get_next(struct seq_file *seq, void *cur)
2040 {
2041 struct sock *sk = cur;
2042 struct tcp_tw_bucket *tw;
2043 struct hlist_node *node;
2044 struct tcp_iter_state *st = seq->private;
2045
2046 ++st->num;
2047
2048 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2049 tw = cur;
2050 tw = tw_next(tw);
2051 get_tw:
2052 while (tw && tw->tw_family != st->family) {
2053 tw = tw_next(tw);
2054 }
2055 if (tw) {
2056 cur = tw;
2057 goto out;
2058 }
2059 read_unlock(&tcp_ehash[st->bucket].lock);
2060 st->state = TCP_SEQ_STATE_ESTABLISHED;
2061
2062 /* We can reschedule between buckets: */
2063 cond_resched_softirq();
2064
2065 if (++st->bucket < tcp_ehash_size) {
2066 read_lock(&tcp_ehash[st->bucket].lock);
2067 sk = sk_head(&tcp_ehash[st->bucket].chain);
2068 } else {
2069 cur = NULL;
2070 goto out;
2071 }
2072 } else
2073 sk = sk_next(sk);
2074
2075 sk_for_each_from(sk, node) {
2076 if (sk->sk_family == st->family)
2077 goto found;
2078 }
2079
2080 st->state = TCP_SEQ_STATE_TIME_WAIT;
2081 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2082 goto get_tw;
2083 found:
2084 cur = sk;
2085 out:
2086 return cur;
2087 }
2088
2089 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2090 {
2091 void *rc = established_get_first(seq);
2092
2093 while (rc && pos) {
2094 rc = established_get_next(seq, rc);
2095 --pos;
2096 }
2097 return rc;
2098 }
2099
2100 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2101 {
2102 void *rc;
2103 struct tcp_iter_state *st = seq->private;
2104
2105 tcp_listen_lock();
2106 st->state = TCP_SEQ_STATE_LISTENING;
2107 rc = listening_get_idx(seq, &pos);
2108
2109 if (!rc) {
2110 tcp_listen_unlock();
2111 local_bh_disable();
2112 st->state = TCP_SEQ_STATE_ESTABLISHED;
2113 rc = established_get_idx(seq, pos);
2114 }
2115
2116 return rc;
2117 }
2118
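/*
 * seq_file glue.  Note the locking contract between the callbacks:
 * in the LISTENING/OPENREQ states the listening hash is locked via
 * tcp_listen_lock() (plus the listener's syn_wait_lock in OPENREQ),
 * while in the ESTABLISHED/TIME_WAIT states an ehash bucket read lock
 * is held with BHs disabled.  tcp_seq_stop() releases whatever the
 * current state still holds.
 */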
2119 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2120 {
2121 struct tcp_iter_state *st = seq->private;
2122 st->state = TCP_SEQ_STATE_LISTENING;
2123 st->num = 0;
2124 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2125 }
2126
2127 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2128 {
2129 void *rc = NULL;
2130 struct tcp_iter_state *st;
2131
2132 if (v == SEQ_START_TOKEN) {
2133 rc = tcp_get_idx(seq, 0);
2134 goto out;
2135 }
2136 st = seq->private;
2137
2138 switch (st->state) {
2139 case TCP_SEQ_STATE_OPENREQ:
2140 case TCP_SEQ_STATE_LISTENING:
2141 rc = listening_get_next(seq, v);
2142 if (!rc) {
2143 tcp_listen_unlock();
2144 local_bh_disable();
2145 st->state = TCP_SEQ_STATE_ESTABLISHED;
2146 rc = established_get_first(seq);
2147 }
2148 break;
2149 case TCP_SEQ_STATE_ESTABLISHED:
2150 case TCP_SEQ_STATE_TIME_WAIT:
2151 rc = established_get_next(seq, v);
2152 break;
2153 }
2154 out:
2155 ++*pos;
2156 return rc;
2157 }
2158
2159 static void tcp_seq_stop(struct seq_file *seq, void *v)
2160 {
2161 struct tcp_iter_state *st = seq->private;
2162
2163 switch (st->state) {
2164 case TCP_SEQ_STATE_OPENREQ:
2165 if (v) {
2166 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2167 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2168 }
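/* fall through: the listening lock is released below */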
2169 case TCP_SEQ_STATE_LISTENING:
2170 if (v != SEQ_START_TOKEN)
2171 tcp_listen_unlock();
2172 break;
2173 case TCP_SEQ_STATE_TIME_WAIT:
2174 case TCP_SEQ_STATE_ESTABLISHED:
2175 if (v)
2176 read_unlock(&tcp_ehash[st->bucket].lock);
2177 local_bh_enable();
2178 break;
2179 }
2180 }
2181
2182 static int tcp_seq_open(struct inode *inode, struct file *file)
2183 {
2184 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2185 struct seq_file *seq;
2186 struct tcp_iter_state *s;
2187 int rc;
2188
2189 if (unlikely(afinfo == NULL))
2190 return -EINVAL;
2191
2192 s = kmalloc(sizeof(*s), GFP_KERNEL);
2193 if (!s)
2194 return -ENOMEM;
2195 memset(s, 0, sizeof(*s));
2196 s->family = afinfo->family;
2197 s->seq_ops.start = tcp_seq_start;
2198 s->seq_ops.next = tcp_seq_next;
2199 s->seq_ops.show = afinfo->seq_show;
2200 s->seq_ops.stop = tcp_seq_stop;
2201
2202 rc = seq_open(file, &s->seq_ops);
2203 if (rc)
2204 goto out_kfree;
2205 seq = file->private_data;
2206 seq->private = s;
2207 out:
2208 return rc;
2209 out_kfree:
2210 kfree(s);
2211 goto out;
2212 }
2213
2214 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2215 {
2216 int rc = 0;
2217 struct proc_dir_entry *p;
2218
2219 if (!afinfo)
2220 return -EINVAL;
2221 afinfo->seq_fops->owner = afinfo->owner;
2222 afinfo->seq_fops->open = tcp_seq_open;
2223 afinfo->seq_fops->read = seq_read;
2224 afinfo->seq_fops->llseek = seq_lseek;
2225 afinfo->seq_fops->release = seq_release_private;
2226
2227 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2228 if (p)
2229 p->data = afinfo;
2230 else
2231 rc = -ENOMEM;
2232 return rc;
2233 }
2234
2235 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2236 {
2237 if (!afinfo)
2238 return;
2239 proc_net_remove(afinfo->name);
2240 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2241 }
2242
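/*
 * A sketch of how an address family is expected to wire itself up with
 * tcp_proc_register()/tcp_proc_unregister().  The tcpX_* names are
 * hypothetical placeholders; compare with the real tcp4_seq_afinfo and
 * tcp4_proc_init() further down in this file.  Kept under #if 0, so it
 * is never compiled here.
 */
#if 0
static struct file_operations tcpX_seq_fops;

static int tcpX_seq_show(struct seq_file *seq, void *v)
{
	/* af-specific ->show() callback, invoked for every socket found */
	return 0;
}

static struct tcp_seq_afinfo tcpX_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcpX",		/* appears as /proc/net/tcpX */
	.family		= AF_INET,
	.seq_show	= tcpX_seq_show,
	.seq_fops	= &tcpX_seq_fops,
};

static int __init tcpX_proc_init(void)
{
	return tcp_proc_register(&tcpX_seq_afinfo);
}

static void tcpX_proc_exit(void)
{
	tcp_proc_unregister(&tcpX_seq_afinfo);
}
#endif
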
2243 static void get_openreq4(struct sock *sk, struct request_sock *req,
2244 char *tmpbuf, int i, int uid)
2245 {
2246 const struct inet_request_sock *ireq = inet_rsk(req);
2247 int ttd = req->expires - jiffies;
2248
2249 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2250 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2251 i,
2252 ireq->loc_addr,
2253 ntohs(inet_sk(sk)->sport),
2254 ireq->rmt_addr,
2255 ntohs(ireq->rmt_port),
2256 TCP_SYN_RECV,
2257 0, 0, /* could print option size, but that is af dependent. */
2258 1, /* timers active (only the expire timer) */
2259 jiffies_to_clock_t(ttd),
2260 req->retrans,
2261 uid,
2262 0, /* non-standard timer */
2263 0, /* open_requests have no inode */
2264 atomic_read(&sk->sk_refcnt),
2265 req);
2266 }
2267
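/*
 * Format one listening/established socket as a /proc/net/tcp row.
 * tx_queue is write_seq - snd_una (bytes not yet acknowledged),
 * rx_queue is rcv_nxt - copied_seq (bytes not yet read by the
 * application), and the "tr tm->when" pair encodes which timer is
 * pending (1 retransmit, 4 zero-window probe, 2 sk_timer) and how
 * long until it expires.
 */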
2268 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2269 {
2270 int timer_active;
2271 unsigned long timer_expires;
2272 struct tcp_sock *tp = tcp_sk(sp);
2273 struct inet_sock *inet = inet_sk(sp);
2274 unsigned int dest = inet->daddr;
2275 unsigned int src = inet->rcv_saddr;
2276 __u16 destp = ntohs(inet->dport);
2277 __u16 srcp = ntohs(inet->sport);
2278
2279 if (tp->pending == TCP_TIME_RETRANS) {
2280 timer_active = 1;
2281 timer_expires = tp->timeout;
2282 } else if (tp->pending == TCP_TIME_PROBE0) {
2283 timer_active = 4;
2284 timer_expires = tp->timeout;
2285 } else if (timer_pending(&sp->sk_timer)) {
2286 timer_active = 2;
2287 timer_expires = sp->sk_timer.expires;
2288 } else {
2289 timer_active = 0;
2290 timer_expires = jiffies;
2291 }
2292
2293 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2294 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2295 i, src, srcp, dest, destp, sp->sk_state,
2296 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2297 timer_active,
2298 jiffies_to_clock_t(timer_expires - jiffies),
2299 tp->retransmits,
2300 sock_i_uid(sp),
2301 tp->probes_out,
2302 sock_i_ino(sp),
2303 atomic_read(&sp->sk_refcnt), sp,
2304 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2305 tp->snd_cwnd,
2306 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2307 }
2308
2309 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2310 {
2311 unsigned int dest, src;
2312 __u16 destp, srcp;
2313 int ttd = tw->tw_ttd - jiffies;
2314
2315 if (ttd < 0)
2316 ttd = 0;
2317
2318 dest = tw->tw_daddr;
2319 src = tw->tw_rcv_saddr;
2320 destp = ntohs(tw->tw_dport);
2321 srcp = ntohs(tw->tw_sport);
2322
2323 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2324 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2325 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2326 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2327 atomic_read(&tw->tw_refcnt), tw);
2328 }
2329
2330 #define TMPSZ 150
2331
2332 static int tcp4_seq_show(struct seq_file *seq, void *v)
2333 {
2334 struct tcp_iter_state *st;
2335 char tmpbuf[TMPSZ + 1];
2336
2337 if (v == SEQ_START_TOKEN) {
2338 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2339 " sl local_address rem_address st tx_queue "
2340 "rx_queue tr tm->when retrnsmt uid timeout "
2341 "inode");
2342 goto out;
2343 }
2344 st = seq->private;
2345
2346 switch (st->state) {
2347 case TCP_SEQ_STATE_LISTENING:
2348 case TCP_SEQ_STATE_ESTABLISHED:
2349 get_tcp4_sock(v, tmpbuf, st->num);
2350 break;
2351 case TCP_SEQ_STATE_OPENREQ:
2352 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2353 break;
2354 case TCP_SEQ_STATE_TIME_WAIT:
2355 get_timewait4_sock(v, tmpbuf, st->num);
2356 break;
2357 }
2358 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2359 out:
2360 return 0;
2361 }
2362
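/*
 * A minimal user-space sketch of how the rows emitted by tcp4_seq_show()
 * are consumed: read /proc/net/tcp, skip the header line, and pull out
 * the address, state and inode columns (the layout follows the sprintf()
 * format in get_tcp4_sock() above).  Illustrative only; kept under #if 0
 * so it is never built as part of the kernel.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *fp = fopen("/proc/net/tcp", "r");

	if (!fp)
		return 1;
	if (!fgets(line, sizeof(line), fp)) {	/* skip the header row */
		fclose(fp);
		return 1;
	}
	while (fgets(line, sizeof(line), fp)) {
		unsigned int sl, state;
		char local[64], remote[64];
		unsigned long inode;

		if (sscanf(line, "%u: %63[0-9A-Fa-f:] %63[0-9A-Fa-f:] %x"
			   " %*x:%*x %*x:%*x %*x %*d %*d %lu",
			   &sl, local, remote, &state, &inode) == 5)
			printf("%s -> %s state %02X inode %lu\n",
			       local, remote, state, inode);
	}
	fclose(fp);
	return 0;
}
#endif
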
2363 static struct file_operations tcp4_seq_fops;
2364 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2365 .owner = THIS_MODULE,
2366 .name = "tcp",
2367 .family = AF_INET,
2368 .seq_show = tcp4_seq_show,
2369 .seq_fops = &tcp4_seq_fops,
2370 };
2371
2372 int __init tcp4_proc_init(void)
2373 {
2374 return tcp_proc_register(&tcp4_seq_afinfo);
2375 }
2376
2377 void tcp4_proc_exit(void)
2378 {
2379 tcp_proc_unregister(&tcp4_seq_afinfo);
2380 }
2381 #endif /* CONFIG_PROC_FS */
2382
2383 struct proto tcp_prot = {
2384 .name = "TCP",
2385 .owner = THIS_MODULE,
2386 .close = tcp_close,
2387 .connect = tcp_v4_connect,
2388 .disconnect = tcp_disconnect,
2389 .accept = tcp_accept,
2390 .ioctl = tcp_ioctl,
2391 .init = tcp_v4_init_sock,
2392 .destroy = tcp_v4_destroy_sock,
2393 .shutdown = tcp_shutdown,
2394 .setsockopt = tcp_setsockopt,
2395 .getsockopt = tcp_getsockopt,
2396 .sendmsg = tcp_sendmsg,
2397 .recvmsg = tcp_recvmsg,
2398 .backlog_rcv = tcp_v4_do_rcv,
2399 .hash = tcp_v4_hash,
2400 .unhash = tcp_unhash,
2401 .get_port = tcp_v4_get_port,
2402 .enter_memory_pressure = tcp_enter_memory_pressure,
2403 .sockets_allocated = &tcp_sockets_allocated,
2404 .memory_allocated = &tcp_memory_allocated,
2405 .memory_pressure = &tcp_memory_pressure,
2406 .sysctl_mem = sysctl_tcp_mem,
2407 .sysctl_wmem = sysctl_tcp_wmem,
2408 .sysctl_rmem = sysctl_tcp_rmem,
2409 .max_header = MAX_TCP_HEADER,
2410 .obj_size = sizeof(struct tcp_sock),
2411 .rsk_prot = &tcp_request_sock_ops,
2412 };
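
/*
 * This proto is registered from inet_init() (net/ipv4/af_inet.c) via
 * proto_register(&tcp_prot, 1), which also creates the tcp_sock slab
 * sized by .obj_size above; the inetsw[] protosw table then binds it
 * to SOCK_STREAM sockets.
 */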
2413
2416 void __init tcp_v4_init(struct net_proto_family *ops)
2417 {
2418 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2419 if (err < 0)
2420 panic("Failed to create the TCP control socket.\n");
2421 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2422 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2423
2424 /* Unhash it so that IP input processing does not even
2425 * see it; we do not want this socket to receive
2426 * incoming packets.
2427 */
2428 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2429 }
2430
2431 EXPORT_SYMBOL(ipv4_specific);
2432 EXPORT_SYMBOL(inet_bind_bucket_create);
2433 EXPORT_SYMBOL(tcp_hashinfo);
2434 EXPORT_SYMBOL(tcp_listen_wlock);
2435 EXPORT_SYMBOL(tcp_port_rover);
2436 EXPORT_SYMBOL(tcp_prot);
2437 EXPORT_SYMBOL(tcp_unhash);
2438 EXPORT_SYMBOL(tcp_v4_conn_request);
2439 EXPORT_SYMBOL(tcp_v4_connect);
2440 EXPORT_SYMBOL(tcp_v4_do_rcv);
2441 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2442 EXPORT_SYMBOL(tcp_v4_send_check);
2443 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2444
2445 #ifdef CONFIG_PROC_FS
2446 EXPORT_SYMBOL(tcp_proc_register);
2447 EXPORT_SYMBOL(tcp_proc_unregister);
2448 #endif
2449 EXPORT_SYMBOL(sysctl_local_port_range);
2450 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2451 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2452