Merge tag 'v3.10.106' into update
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / net / ipv4 / tcp_ipv4.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
1da177e4
LT
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
60236fdd 37 * request_sock handling and moved
1da177e4
LT
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
caa20d9a 40 * Added new listen semantics.
1da177e4
LT
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
afd46503 53#define pr_fmt(fmt) "TCP: " fmt
1da177e4 54
eb4dea58 55#include <linux/bottom_half.h>
1da177e4
LT
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
5a0e3ad6 64#include <linux/slab.h>
1da177e4 65
457c4cbc 66#include <net/net_namespace.h>
1da177e4 67#include <net/icmp.h>
304a1618 68#include <net/inet_hashtables.h>
1da177e4 69#include <net/tcp.h>
20380731 70#include <net/transp_v6.h>
1da177e4
LT
71#include <net/ipv6.h>
72#include <net/inet_common.h>
6d6ee43e 73#include <net/timewait_sock.h>
1da177e4 74#include <net/xfrm.h>
1a2449a8 75#include <net/netdma.h>
6e5714ea 76#include <net/secure_seq.h>
d1a4c0b3 77#include <net/tcp_memcontrol.h>
1da177e4
LT
78
79#include <linux/inet.h>
80#include <linux/ipv6.h>
81#include <linux/stddef.h>
82#include <linux/proc_fs.h>
83#include <linux/seq_file.h>
84
cfb6eeb4
YH
85#include <linux/crypto.h>
86#include <linux/scatterlist.h>
87
ab32ea5d
BH
88int sysctl_tcp_tw_reuse __read_mostly;
89int sysctl_tcp_low_latency __read_mostly;
4bc2f18b 90EXPORT_SYMBOL(sysctl_tcp_low_latency);
1da177e4 91
1da177e4 92
cfb6eeb4 93#ifdef CONFIG_TCP_MD5SIG
a915da9b 94static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 95 __be32 daddr, __be32 saddr, const struct tcphdr *th);
cfb6eeb4
YH
96#endif
97
5caea4ea 98struct inet_hashinfo tcp_hashinfo;
4bc2f18b 99EXPORT_SYMBOL(tcp_hashinfo);
1da177e4 100
cf533ea5 101static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
1da177e4 102{
eddc9ec5
ACM
103 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104 ip_hdr(skb)->saddr,
aa8223c7
ACM
105 tcp_hdr(skb)->dest,
106 tcp_hdr(skb)->source);
1da177e4
LT
107}
108
6d6ee43e
ACM
109int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110{
111 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 struct tcp_sock *tp = tcp_sk(sk);
113
114 /* With PAWS, it is safe from the viewpoint
115 of data integrity. Even without PAWS it is safe provided sequence
116 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
117
118 Actually, the idea is close to VJ's one, only timestamp cache is
119 held not per host, but per port pair and TW bucket is used as state
120 holder.
121
122 If TW bucket has been already destroyed we fall back to VJ's scheme
123 and use initial timestamp retrieved from peer table.
124 */
125 if (tcptw->tw_ts_recent_stamp &&
126 (twp == NULL || (sysctl_tcp_tw_reuse &&
9d729f72 127 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
6d6ee43e
ACM
128 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129 if (tp->write_seq == 0)
130 tp->write_seq = 1;
131 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
132 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133 sock_hold(sktw);
134 return 1;
135 }
136
137 return 0;
138}
6d6ee43e
ACM
139EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140
1da177e4
LT
141/* This will initiate an outgoing connection. */
142int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143{
2d7192d6 144 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
1da177e4
LT
145 struct inet_sock *inet = inet_sk(sk);
146 struct tcp_sock *tp = tcp_sk(sk);
dca8b089 147 __be16 orig_sport, orig_dport;
bada8adc 148 __be32 daddr, nexthop;
da905bd1 149 struct flowi4 *fl4;
2d7192d6 150 struct rtable *rt;
1da177e4 151 int err;
f6d8bd05 152 struct ip_options_rcu *inet_opt;
1da177e4
LT
153
154 if (addr_len < sizeof(struct sockaddr_in))
155 return -EINVAL;
156
157 if (usin->sin_family != AF_INET)
158 return -EAFNOSUPPORT;
159
160 nexthop = daddr = usin->sin_addr.s_addr;
f6d8bd05
ED
161 inet_opt = rcu_dereference_protected(inet->inet_opt,
162 sock_owned_by_user(sk));
163 if (inet_opt && inet_opt->opt.srr) {
1da177e4
LT
164 if (!daddr)
165 return -EINVAL;
f6d8bd05 166 nexthop = inet_opt->opt.faddr;
1da177e4
LT
167 }
168
dca8b089
DM
169 orig_sport = inet->inet_sport;
170 orig_dport = usin->sin_port;
da905bd1
DM
171 fl4 = &inet->cork.fl.u.ip4;
172 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
b23dd4fe
DM
173 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174 IPPROTO_TCP,
175 orig_sport, orig_dport, sk, true);
176 if (IS_ERR(rt)) {
177 err = PTR_ERR(rt);
178 if (err == -ENETUNREACH)
7be560d6 179 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
b23dd4fe 180 return err;
584bdf8c 181 }
1da177e4
LT
182
183 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184 ip_rt_put(rt);
185 return -ENETUNREACH;
186 }
187
f6d8bd05 188 if (!inet_opt || !inet_opt->opt.srr)
da905bd1 189 daddr = fl4->daddr;
1da177e4 190
c720c7e8 191 if (!inet->inet_saddr)
da905bd1 192 inet->inet_saddr = fl4->saddr;
c720c7e8 193 inet->inet_rcv_saddr = inet->inet_saddr;
1da177e4 194
c720c7e8 195 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
1da177e4
LT
196 /* Reset inherited state */
197 tp->rx_opt.ts_recent = 0;
198 tp->rx_opt.ts_recent_stamp = 0;
ee995283
PE
199 if (likely(!tp->repair))
200 tp->write_seq = 0;
1da177e4
LT
201 }
202
295ff7ed 203 if (tcp_death_row.sysctl_tw_recycle &&
81166dd6
DM
204 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
205 tcp_fetch_timewait_stamp(sk, &rt->dst);
1da177e4 206
c720c7e8
ED
207 inet->inet_dport = usin->sin_port;
208 inet->inet_daddr = daddr;
1da177e4 209
d83d8461 210 inet_csk(sk)->icsk_ext_hdr_len = 0;
f6d8bd05
ED
211 if (inet_opt)
212 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1da177e4 213
bee7ca9e 214 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
1da177e4
LT
215
216 /* Socket identity is still unknown (sport may be zero).
217 * However we set state to SYN-SENT and not releasing socket
218 * lock select source port, enter ourselves into the hash tables and
219 * complete initialization after this.
220 */
221 tcp_set_state(sk, TCP_SYN_SENT);
a7f5e7f1 222 err = inet_hash_connect(&tcp_death_row, sk);
1da177e4
LT
223 if (err)
224 goto failure;
225
da905bd1 226 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
b23dd4fe
DM
227 inet->inet_sport, inet->inet_dport, sk);
228 if (IS_ERR(rt)) {
229 err = PTR_ERR(rt);
230 rt = NULL;
1da177e4 231 goto failure;
b23dd4fe 232 }
1da177e4 233 /* OK, now commit destination to socket. */
bcd76111 234 sk->sk_gso_type = SKB_GSO_TCPV4;
d8d1f30b 235 sk_setup_caps(sk, &rt->dst);
6fa3eb70 236 printk(KERN_INFO "[socket_conn]IPV4 socket[%lu] sport:%u \n", SOCK_INODE(sk->sk_socket)->i_ino, ntohs(inet->inet_sport));
ee995283 237 if (!tp->write_seq && likely(!tp->repair))
c720c7e8
ED
238 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 inet->inet_daddr,
240 inet->inet_sport,
1da177e4
LT
241 usin->sin_port);
242
c720c7e8 243 inet->inet_id = tp->write_seq ^ jiffies;
1da177e4 244
2b916477 245 err = tcp_connect(sk);
ee995283 246
1da177e4
LT
247 rt = NULL;
248 if (err)
249 goto failure;
250
251 return 0;
252
253failure:
7174259e
ACM
254 /*
255 * This unhashes the socket and releases the local port,
256 * if necessary.
257 */
1da177e4
LT
258 tcp_set_state(sk, TCP_CLOSE);
259 ip_rt_put(rt);
260 sk->sk_route_caps = 0;
c720c7e8 261 inet->inet_dport = 0;
1da177e4
LT
262 return err;
263}
4bc2f18b 264EXPORT_SYMBOL(tcp_v4_connect);
1da177e4 265
1da177e4 266/*
563d34d0
ED
267 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268 * It can be called through tcp_release_cb() if socket was owned by user
269 * at the time tcp_v4_err() was called to handle ICMP message.
1da177e4 270 */
5f80f4d8 271void tcp_v4_mtu_reduced(struct sock *sk)
1da177e4
LT
272{
273 struct dst_entry *dst;
274 struct inet_sock *inet = inet_sk(sk);
563d34d0 275 u32 mtu = tcp_sk(sk)->mtu_info;
1da177e4 276
80d0a69f
DM
277 dst = inet_csk_update_pmtu(sk, mtu);
278 if (!dst)
1da177e4
LT
279 return;
280
1da177e4
LT
281 /* Something is about to be wrong... Remember soft error
282 * for the case, if this connection will not able to recover.
283 */
284 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 sk->sk_err_soft = EMSGSIZE;
286
287 mtu = dst_mtu(dst);
288
289 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
d83d8461 290 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
1da177e4
LT
291 tcp_sync_mss(sk, mtu);
292
293 /* Resend the TCP packet because it's
294 * clear that the old packet has been
295 * dropped. This is the new "fast" path mtu
296 * discovery.
297 */
298 tcp_simple_retransmit(sk);
299 } /* else let the usual retransmit timer handle it */
300}
5f80f4d8 301EXPORT_SYMBOL(tcp_v4_mtu_reduced);
1da177e4 302
55be7a9c
DM
303static void do_redirect(struct sk_buff *skb, struct sock *sk)
304{
305 struct dst_entry *dst = __sk_dst_check(sk, 0);
306
1ed5c48f 307 if (dst)
6700c270 308 dst->ops->redirect(dst, sk, skb);
55be7a9c
DM
309}
310
1da177e4
LT
311/*
312 * This routine is called by the ICMP module when it gets some
313 * sort of error condition. If err < 0 then the socket should
314 * be closed and the error returned to the user. If err > 0
315 * it's just the icmp type << 8 | icmp code. After adjustment
316 * header points to the first 8 bytes of the tcp header. We need
317 * to find the appropriate port.
318 *
319 * The locking strategy used here is very "optimistic". When
320 * someone else accesses the socket the ICMP is just dropped
321 * and for some paths there is no check at all.
322 * A more general error queue to queue errors for later handling
323 * is probably better.
324 *
325 */
326
4d1a2d9e 327void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
1da177e4 328{
b71d1d42 329 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
4d1a2d9e 330 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
f1ecd5d9 331 struct inet_connection_sock *icsk;
1da177e4
LT
332 struct tcp_sock *tp;
333 struct inet_sock *inet;
4d1a2d9e
DL
334 const int type = icmp_hdr(icmp_skb)->type;
335 const int code = icmp_hdr(icmp_skb)->code;
1da177e4 336 struct sock *sk;
f1ecd5d9 337 struct sk_buff *skb;
168a8f58 338 struct request_sock *req;
1da177e4 339 __u32 seq;
f1ecd5d9 340 __u32 remaining;
1da177e4 341 int err;
4d1a2d9e 342 struct net *net = dev_net(icmp_skb->dev);
1da177e4 343
4d1a2d9e 344 if (icmp_skb->len < (iph->ihl << 2) + 8) {
dcfc23ca 345 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
1da177e4
LT
346 return;
347 }
348
fd54d716 349 sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
4d1a2d9e 350 iph->saddr, th->source, inet_iif(icmp_skb));
1da177e4 351 if (!sk) {
dcfc23ca 352 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
1da177e4
LT
353 return;
354 }
355 if (sk->sk_state == TCP_TIME_WAIT) {
9469c7b4 356 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
357 return;
358 }
359
360 bh_lock_sock(sk);
361 /* If too many ICMPs get dropped on busy
362 * servers this needs to be solved differently.
563d34d0
ED
363 * We do take care of PMTU discovery (RFC1191) special case :
364 * we can receive locally generated ICMP messages while socket is held.
1da177e4 365 */
b74aa930
ED
366 if (sock_owned_by_user(sk)) {
367 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
368 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369 }
1da177e4
LT
370 if (sk->sk_state == TCP_CLOSE)
371 goto out;
372
97e3ecd1 373 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
374 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
375 goto out;
376 }
377
f1ecd5d9 378 icsk = inet_csk(sk);
1da177e4 379 tp = tcp_sk(sk);
168a8f58 380 req = tp->fastopen_rsk;
1da177e4
LT
381 seq = ntohl(th->seq);
382 if (sk->sk_state != TCP_LISTEN &&
168a8f58
JC
383 !between(seq, tp->snd_una, tp->snd_nxt) &&
384 (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
385 /* For a Fast Open socket, allow seq to be snt_isn. */
de0744af 386 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
1da177e4
LT
387 goto out;
388 }
389
390 switch (type) {
55be7a9c 391 case ICMP_REDIRECT:
29c4bf40
JM
392 if (!sock_owned_by_user(sk))
393 do_redirect(icmp_skb, sk);
55be7a9c 394 goto out;
1da177e4
LT
395 case ICMP_SOURCE_QUENCH:
396 /* Just silently ignore these. */
397 goto out;
398 case ICMP_PARAMETERPROB:
399 err = EPROTO;
400 break;
401 case ICMP_DEST_UNREACH:
402 if (code > NR_ICMP_UNREACH)
403 goto out;
404
405 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
0d4f0608
ED
406 /* We are not interested in TCP_LISTEN and open_requests
407 * (SYN-ACKs send out by Linux are always <576bytes so
408 * they should go through unfragmented).
409 */
410 if (sk->sk_state == TCP_LISTEN)
411 goto out;
412
563d34d0 413 tp->mtu_info = info;
144d56e9 414 if (!sock_owned_by_user(sk)) {
563d34d0 415 tcp_v4_mtu_reduced(sk);
144d56e9
ED
416 } else {
417 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
418 sock_hold(sk);
419 }
1da177e4
LT
420 goto out;
421 }
422
423 err = icmp_err_convert[code].errno;
f1ecd5d9
DL
424 /* check if icmp_skb allows revert of backoff
425 * (see draft-zimmermann-tcp-lcd) */
426 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
427 break;
428 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
429 !icsk->icsk_backoff)
430 break;
431
168a8f58
JC
432 /* XXX (TFO) - revisit the following logic for TFO */
433
8f49c270
DM
434 if (sock_owned_by_user(sk))
435 break;
436
f1ecd5d9 437 icsk->icsk_backoff--;
9ad7c049
JC
438 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
439 TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
f1ecd5d9
DL
440 tcp_bound_rto(sk);
441
442 skb = tcp_write_queue_head(sk);
443 BUG_ON(!skb);
444
445 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
446 tcp_time_stamp - TCP_SKB_CB(skb)->when);
447
448 if (remaining) {
449 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
6fa3eb70 450 remaining, sysctl_tcp_rto_max);
f1ecd5d9
DL
451 } else {
452 /* RTO revert clocked out retransmission.
453 * Will retransmit now */
454 tcp_retransmit_timer(sk);
455 }
456
1da177e4
LT
457 break;
458 case ICMP_TIME_EXCEEDED:
459 err = EHOSTUNREACH;
460 break;
461 default:
462 goto out;
463 }
464
168a8f58
JC
465 /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
466 * than following the TCP_SYN_RECV case and closing the socket,
467 * we ignore the ICMP error and keep trying like a fully established
468 * socket. Is this the right thing to do?
469 */
470 if (req && req->sk == NULL)
471 goto out;
472
1da177e4 473 switch (sk->sk_state) {
60236fdd 474 struct request_sock *req, **prev;
1da177e4
LT
475 case TCP_LISTEN:
476 if (sock_owned_by_user(sk))
477 goto out;
478
463c84b9
ACM
479 req = inet_csk_search_req(sk, &prev, th->dest,
480 iph->daddr, iph->saddr);
1da177e4
LT
481 if (!req)
482 goto out;
483
484 /* ICMPs are not backlogged, hence we cannot get
485 an established socket here.
486 */
547b792c 487 WARN_ON(req->sk);
1da177e4 488
2e6599cb 489 if (seq != tcp_rsk(req)->snt_isn) {
de0744af 490 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
1da177e4
LT
491 goto out;
492 }
493
494 /*
495 * Still in SYN_RECV, just remove it silently.
496 * There is no good way to pass the error to the newly
497 * created socket, and POSIX does not want network
498 * errors returned from accept().
499 */
463c84b9 500 inet_csk_reqsk_queue_drop(sk, req, prev);
848bf15f 501 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1da177e4
LT
502 goto out;
503
504 case TCP_SYN_SENT:
505 case TCP_SYN_RECV: /* Cannot happen.
168a8f58
JC
506 It can f.e. if SYNs crossed,
507 or Fast Open.
1da177e4
LT
508 */
509 if (!sock_owned_by_user(sk)) {
1da177e4
LT
510 sk->sk_err = err;
511
512 sk->sk_error_report(sk);
513
514 tcp_done(sk);
515 } else {
516 sk->sk_err_soft = err;
517 }
518 goto out;
519 }
520
521 /* If we've already connected we will keep trying
522 * until we time out, or the user gives up.
523 *
524 * rfc1122 4.2.3.9 allows to consider as hard errors
525 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
526 * but it is obsoleted by pmtu discovery).
527 *
528 * Note, that in modern internet, where routing is unreliable
529 * and in each dark corner broken firewalls sit, sending random
530 * errors ordered by their masters even this two messages finally lose
531 * their original sense (even Linux sends invalid PORT_UNREACHs)
532 *
533 * Now we are in compliance with RFCs.
534 * --ANK (980905)
535 */
536
537 inet = inet_sk(sk);
538 if (!sock_owned_by_user(sk) && inet->recverr) {
539 sk->sk_err = err;
540 sk->sk_error_report(sk);
541 } else { /* Only an error on timeout */
542 sk->sk_err_soft = err;
543 }
544
545out:
546 bh_unlock_sock(sk);
547 sock_put(sk);
548}
549
419f9f89
HX
550static void __tcp_v4_send_check(struct sk_buff *skb,
551 __be32 saddr, __be32 daddr)
1da177e4 552{
aa8223c7 553 struct tcphdr *th = tcp_hdr(skb);
1da177e4 554
84fa7933 555 if (skb->ip_summed == CHECKSUM_PARTIAL) {
419f9f89 556 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
663ead3b 557 skb->csum_start = skb_transport_header(skb) - skb->head;
ff1dcadb 558 skb->csum_offset = offsetof(struct tcphdr, check);
1da177e4 559 } else {
419f9f89 560 th->check = tcp_v4_check(skb->len, saddr, daddr,
07f0757a 561 csum_partial(th,
1da177e4
LT
562 th->doff << 2,
563 skb->csum));
564 }
565}
566
419f9f89 567/* This routine computes an IPv4 TCP checksum. */
bb296246 568void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
419f9f89 569{
cf533ea5 570 const struct inet_sock *inet = inet_sk(sk);
419f9f89
HX
571
572 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
573}
4bc2f18b 574EXPORT_SYMBOL(tcp_v4_send_check);
419f9f89 575
a430a43d
HX
576int tcp_v4_gso_send_check(struct sk_buff *skb)
577{
eddc9ec5 578 const struct iphdr *iph;
a430a43d
HX
579 struct tcphdr *th;
580
581 if (!pskb_may_pull(skb, sizeof(*th)))
582 return -EINVAL;
583
eddc9ec5 584 iph = ip_hdr(skb);
aa8223c7 585 th = tcp_hdr(skb);
a430a43d
HX
586
587 th->check = 0;
84fa7933 588 skb->ip_summed = CHECKSUM_PARTIAL;
419f9f89 589 __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
a430a43d
HX
590 return 0;
591}
592
1da177e4
LT
593/*
594 * This routine will send an RST to the other tcp.
595 *
596 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
597 * for reset.
598 * Answer: if a packet caused RST, it is not for a socket
599 * existing in our system, if it is matched to a socket,
600 * it is just duplicate segment or bug in other side's TCP.
601 * So that we build reply only basing on parameters
602 * arrived with segment.
603 * Exception: precedence violation. We do not implement it in any case.
604 */
605
cfb6eeb4 606static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
1da177e4 607{
cf533ea5 608 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
609 struct {
610 struct tcphdr th;
611#ifdef CONFIG_TCP_MD5SIG
714e85be 612 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
cfb6eeb4
YH
613#endif
614 } rep;
1da177e4 615 struct ip_reply_arg arg;
cfb6eeb4
YH
616#ifdef CONFIG_TCP_MD5SIG
617 struct tcp_md5sig_key *key;
658ddaaf
SL
618 const __u8 *hash_location = NULL;
619 unsigned char newhash[16];
620 int genhash;
621 struct sock *sk1 = NULL;
cfb6eeb4 622#endif
a86b1e30 623 struct net *net;
1da177e4
LT
624
625 /* Never send a reset in response to a reset. */
626 if (th->rst)
627 return;
628
511c3f92 629 if (skb_rtable(skb)->rt_type != RTN_LOCAL)
1da177e4
LT
630 return;
631
632 /* Swap the send and the receive. */
cfb6eeb4
YH
633 memset(&rep, 0, sizeof(rep));
634 rep.th.dest = th->source;
635 rep.th.source = th->dest;
636 rep.th.doff = sizeof(struct tcphdr) / 4;
637 rep.th.rst = 1;
1da177e4
LT
638
639 if (th->ack) {
cfb6eeb4 640 rep.th.seq = th->ack_seq;
1da177e4 641 } else {
cfb6eeb4
YH
642 rep.th.ack = 1;
643 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
644 skb->len - (th->doff << 2));
1da177e4
LT
645 }
646
7174259e 647 memset(&arg, 0, sizeof(arg));
cfb6eeb4
YH
648 arg.iov[0].iov_base = (unsigned char *)&rep;
649 arg.iov[0].iov_len = sizeof(rep.th);
650
651#ifdef CONFIG_TCP_MD5SIG
658ddaaf
SL
652 hash_location = tcp_parse_md5sig_option(th);
653 if (!sk && hash_location) {
654 /*
655 * active side is lost. Try to find listening socket through
656 * source port, and then find md5 key through listening socket.
657 * we are not loose security here:
658 * Incoming packet is checked with md5 hash with finding key,
659 * no RST generated if md5 hash doesn't match.
660 */
661 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
da5e3630
TH
662 &tcp_hashinfo, ip_hdr(skb)->saddr,
663 th->source, ip_hdr(skb)->daddr,
658ddaaf
SL
664 ntohs(th->source), inet_iif(skb));
665 /* don't send rst if it can't find key */
666 if (!sk1)
667 return;
668 rcu_read_lock();
669 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
670 &ip_hdr(skb)->saddr, AF_INET);
671 if (!key)
672 goto release_sk1;
673
674 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
675 if (genhash || memcmp(hash_location, newhash, 16) != 0)
676 goto release_sk1;
677 } else {
678 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
679 &ip_hdr(skb)->saddr,
680 AF_INET) : NULL;
681 }
682
cfb6eeb4
YH
683 if (key) {
684 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
685 (TCPOPT_NOP << 16) |
686 (TCPOPT_MD5SIG << 8) |
687 TCPOLEN_MD5SIG);
688 /* Update length and the length the header thinks exists */
689 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
690 rep.th.doff = arg.iov[0].iov_len / 4;
691
49a72dfb 692 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
78e645cb
IJ
693 key, ip_hdr(skb)->saddr,
694 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
695 }
696#endif
eddc9ec5
ACM
697 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
698 ip_hdr(skb)->saddr, /* XXX */
52cd5750 699 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1da177e4 700 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
88ef4a5a 701 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
e2446eaa 702 /* When socket is gone, all binding information is lost.
4c675258
AK
703 * routing might fail in this case. No choice here, if we choose to force
704 * input interface, we will misroute in case of asymmetric route.
e2446eaa 705 */
4c675258
AK
706 if (sk)
707 arg.bound_dev_if = sk->sk_bound_dev_if;
1da177e4 708
adf30907 709 net = dev_net(skb_dst(skb)->dev);
66b13d99 710 arg.tos = ip_hdr(skb)->tos;
6bed3166
ED
711 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
712 skb, ip_hdr(skb)->saddr,
70e73416 713 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
1da177e4 714
63231bdd
PE
715 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
716 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
658ddaaf
SL
717
718#ifdef CONFIG_TCP_MD5SIG
719release_sk1:
720 if (sk1) {
721 rcu_read_unlock();
722 sock_put(sk1);
723 }
724#endif
1da177e4
LT
725}
726
727/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
728 outside socket context is ugly, certainly. What can I do?
729 */
730
9501f972 731static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
ee684b6f 732 u32 win, u32 tsval, u32 tsecr, int oif,
88ef4a5a 733 struct tcp_md5sig_key *key,
66b13d99 734 int reply_flags, u8 tos)
1da177e4 735{
cf533ea5 736 const struct tcphdr *th = tcp_hdr(skb);
1da177e4
LT
737 struct {
738 struct tcphdr th;
714e85be 739 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
cfb6eeb4 740#ifdef CONFIG_TCP_MD5SIG
714e85be 741 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
cfb6eeb4
YH
742#endif
743 ];
1da177e4
LT
744 } rep;
745 struct ip_reply_arg arg;
adf30907 746 struct net *net = dev_net(skb_dst(skb)->dev);
1da177e4
LT
747
748 memset(&rep.th, 0, sizeof(struct tcphdr));
7174259e 749 memset(&arg, 0, sizeof(arg));
1da177e4
LT
750
751 arg.iov[0].iov_base = (unsigned char *)&rep;
752 arg.iov[0].iov_len = sizeof(rep.th);
ee684b6f 753 if (tsecr) {
cfb6eeb4
YH
754 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
755 (TCPOPT_TIMESTAMP << 8) |
756 TCPOLEN_TIMESTAMP);
ee684b6f
AV
757 rep.opt[1] = htonl(tsval);
758 rep.opt[2] = htonl(tsecr);
cb48cfe8 759 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
1da177e4
LT
760 }
761
762 /* Swap the send and the receive. */
763 rep.th.dest = th->source;
764 rep.th.source = th->dest;
765 rep.th.doff = arg.iov[0].iov_len / 4;
766 rep.th.seq = htonl(seq);
767 rep.th.ack_seq = htonl(ack);
768 rep.th.ack = 1;
769 rep.th.window = htons(win);
770
cfb6eeb4 771#ifdef CONFIG_TCP_MD5SIG
cfb6eeb4 772 if (key) {
ee684b6f 773 int offset = (tsecr) ? 3 : 0;
cfb6eeb4
YH
774
775 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
776 (TCPOPT_NOP << 16) |
777 (TCPOPT_MD5SIG << 8) |
778 TCPOLEN_MD5SIG);
779 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
780 rep.th.doff = arg.iov[0].iov_len/4;
781
49a72dfb 782 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
90b7e112
AL
783 key, ip_hdr(skb)->saddr,
784 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
785 }
786#endif
88ef4a5a 787 arg.flags = reply_flags;
eddc9ec5
ACM
788 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789 ip_hdr(skb)->saddr, /* XXX */
1da177e4
LT
790 arg.iov[0].iov_len, IPPROTO_TCP, 0);
791 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
9501f972
YH
792 if (oif)
793 arg.bound_dev_if = oif;
66b13d99 794 arg.tos = tos;
6bed3166
ED
795 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
796 skb, ip_hdr(skb)->saddr,
70e73416 797 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
1da177e4 798
63231bdd 799 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
1da177e4
LT
800}
801
802static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
803{
8feaf0c0 804 struct inet_timewait_sock *tw = inet_twsk(sk);
cfb6eeb4 805 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1da177e4 806
9501f972 807 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
7174259e 808 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
ee684b6f 809 tcp_time_stamp + tcptw->tw_ts_offset,
9501f972
YH
810 tcptw->tw_ts_recent,
811 tw->tw_bound_dev_if,
88ef4a5a 812 tcp_twsk_md5_key(tcptw),
66b13d99
ED
813 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
814 tw->tw_tos
9501f972 815 );
1da177e4 816
8feaf0c0 817 inet_twsk_put(tw);
1da177e4
LT
818}
819
6edafaaf 820static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
7174259e 821 struct request_sock *req)
1da177e4 822{
168a8f58
JC
823 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
824 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
825 */
826 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
827 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
1c50d3ae
ED
828 tcp_rsk(req)->rcv_nxt,
829 req->rcv_wnd >> inet_rsk(req)->rcv_wscale,
ee684b6f 830 tcp_time_stamp,
9501f972
YH
831 req->ts_recent,
832 0,
a915da9b
ED
833 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
834 AF_INET),
66b13d99
ED
835 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
836 ip_hdr(skb)->tos);
1da177e4
LT
837}
838
1da177e4 839/*
9bf1d83e 840 * Send a SYN-ACK after having received a SYN.
60236fdd 841 * This still operates on a request_sock only, not on a big
1da177e4
LT
842 * socket.
843 */
72659ecc
OP
844static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
845 struct request_sock *req,
7586eceb
ED
846 u16 queue_mapping,
847 bool nocache)
1da177e4 848{
2e6599cb 849 const struct inet_request_sock *ireq = inet_rsk(req);
6bd023f3 850 struct flowi4 fl4;
1da177e4
LT
851 int err = -1;
852 struct sk_buff * skb;
853
854 /* First, grab a route. */
ba3f7f04 855 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
fd80eb94 856 return -1;
1da177e4 857
1a2c6181 858 skb = tcp_make_synack(sk, dst, req, NULL);
1da177e4
LT
859
860 if (skb) {
419f9f89 861 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
1da177e4 862
fff32699 863 skb_set_queue_mapping(skb, queue_mapping);
2e6599cb
ACM
864 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
865 ireq->rmt_addr,
866 ireq->opt);
b9df3cb8 867 err = net_xmit_eval(err);
016818d0
NC
868 if (!tcp_rsk(req)->snt_synack && !err)
869 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1da177e4
LT
870 }
871
1da177e4
LT
872 return err;
873}
874
1a2c6181 875static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
fd80eb94 876{
1a2c6181 877 int res = tcp_v4_send_synack(sk, NULL, req, 0, false);
e6c022a4
ED
878
879 if (!res)
880 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
881 return res;
fd80eb94
DL
882}
883
1da177e4 884/*
60236fdd 885 * IPv4 request_sock destructor.
1da177e4 886 */
60236fdd 887static void tcp_v4_reqsk_destructor(struct request_sock *req)
1da177e4 888{
a51482bd 889 kfree(inet_rsk(req)->opt);
1da177e4
LT
890}
891
946cedcc 892/*
a2a385d6 893 * Return true if a syncookie should be sent
946cedcc 894 */
a2a385d6 895bool tcp_syn_flood_action(struct sock *sk,
946cedcc
ED
896 const struct sk_buff *skb,
897 const char *proto)
1da177e4 898{
946cedcc 899 const char *msg = "Dropping request";
a2a385d6 900 bool want_cookie = false;
946cedcc
ED
901 struct listen_sock *lopt;
902
903
1da177e4 904
2a1d4bd4 905#ifdef CONFIG_SYN_COOKIES
946cedcc 906 if (sysctl_tcp_syncookies) {
2a1d4bd4 907 msg = "Sending cookies";
a2a385d6 908 want_cookie = true;
946cedcc
ED
909 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
910 } else
80e40daa 911#endif
946cedcc
ED
912 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
913
914 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
915 if (!lopt->synflood_warned) {
916 lopt->synflood_warned = 1;
afd46503 917 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
946cedcc
ED
918 proto, ntohs(tcp_hdr(skb)->dest), msg);
919 }
920 return want_cookie;
2a1d4bd4 921}
946cedcc 922EXPORT_SYMBOL(tcp_syn_flood_action);
1da177e4
LT
923
924/*
60236fdd 925 * Save and compile IPv4 options into the request_sock if needed.
1da177e4 926 */
5dff747b 927static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
1da177e4 928{
f6d8bd05
ED
929 const struct ip_options *opt = &(IPCB(skb)->opt);
930 struct ip_options_rcu *dopt = NULL;
1da177e4
LT
931
932 if (opt && opt->optlen) {
f6d8bd05
ED
933 int opt_size = sizeof(*dopt) + opt->optlen;
934
1da177e4
LT
935 dopt = kmalloc(opt_size, GFP_ATOMIC);
936 if (dopt) {
f6d8bd05 937 if (ip_options_echo(&dopt->opt, skb)) {
1da177e4
LT
938 kfree(dopt);
939 dopt = NULL;
940 }
941 }
942 }
943 return dopt;
944}
945
cfb6eeb4
YH
946#ifdef CONFIG_TCP_MD5SIG
947/*
948 * RFC2385 MD5 checksumming requires a mapping of
949 * IP address->MD5 Key.
950 * We need to maintain these in the sk structure.
951 */
952
953/* Find the Key structure for an address. */
a915da9b
ED
954struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
955 const union tcp_md5_addr *addr,
956 int family)
cfb6eeb4
YH
957{
958 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 959 struct tcp_md5sig_key *key;
a915da9b 960 unsigned int size = sizeof(struct in_addr);
a8afca03 961 struct tcp_md5sig_info *md5sig;
cfb6eeb4 962
a8afca03
ED
963 /* caller either holds rcu_read_lock() or socket lock */
964 md5sig = rcu_dereference_check(tp->md5sig_info,
b4fb05ea
ED
965 sock_owned_by_user(sk) ||
966 lockdep_is_held(&sk->sk_lock.slock));
a8afca03 967 if (!md5sig)
cfb6eeb4 968 return NULL;
a915da9b
ED
969#if IS_ENABLED(CONFIG_IPV6)
970 if (family == AF_INET6)
971 size = sizeof(struct in6_addr);
972#endif
b67bfe0d 973 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
a915da9b
ED
974 if (key->family != family)
975 continue;
976 if (!memcmp(&key->addr, addr, size))
977 return key;
cfb6eeb4
YH
978 }
979 return NULL;
980}
a915da9b 981EXPORT_SYMBOL(tcp_md5_do_lookup);
cfb6eeb4
YH
982
983struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
984 struct sock *addr_sk)
985{
a915da9b
ED
986 union tcp_md5_addr *addr;
987
988 addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
989 return tcp_md5_do_lookup(sk, addr, AF_INET);
cfb6eeb4 990}
cfb6eeb4
YH
991EXPORT_SYMBOL(tcp_v4_md5_lookup);
992
f5b99bcd
AB
993static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
994 struct request_sock *req)
cfb6eeb4 995{
a915da9b
ED
996 union tcp_md5_addr *addr;
997
998 addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
999 return tcp_md5_do_lookup(sk, addr, AF_INET);
cfb6eeb4
YH
1000}
1001
1002/* This can be called on a newly created socket, from other files */
a915da9b
ED
1003int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1004 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
cfb6eeb4
YH
1005{
1006 /* Add Key to the list */
b0a713e9 1007 struct tcp_md5sig_key *key;
cfb6eeb4 1008 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1009 struct tcp_md5sig_info *md5sig;
cfb6eeb4 1010
c0353c7b 1011 key = tcp_md5_do_lookup(sk, addr, family);
cfb6eeb4
YH
1012 if (key) {
1013 /* Pre-existing entry - just update that one. */
a915da9b 1014 memcpy(key->key, newkey, newkeylen);
b0a713e9 1015 key->keylen = newkeylen;
a915da9b
ED
1016 return 0;
1017 }
260fcbeb 1018
a8afca03 1019 md5sig = rcu_dereference_protected(tp->md5sig_info,
98d2ffdc
ED
1020 sock_owned_by_user(sk) ||
1021 lockdep_is_held(&sk->sk_lock.slock));
a915da9b
ED
1022 if (!md5sig) {
1023 md5sig = kmalloc(sizeof(*md5sig), gfp);
1024 if (!md5sig)
cfb6eeb4 1025 return -ENOMEM;
cfb6eeb4 1026
a915da9b
ED
1027 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1028 INIT_HLIST_HEAD(&md5sig->head);
a8afca03 1029 rcu_assign_pointer(tp->md5sig_info, md5sig);
a915da9b 1030 }
cfb6eeb4 1031
5f3d9cb2 1032 key = sock_kmalloc(sk, sizeof(*key), gfp);
a915da9b
ED
1033 if (!key)
1034 return -ENOMEM;
1035 if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
5f3d9cb2 1036 sock_kfree_s(sk, key, sizeof(*key));
a915da9b 1037 return -ENOMEM;
cfb6eeb4 1038 }
a915da9b
ED
1039
1040 memcpy(key->key, newkey, newkeylen);
1041 key->keylen = newkeylen;
1042 key->family = family;
1043 memcpy(&key->addr, addr,
1044 (family == AF_INET6) ? sizeof(struct in6_addr) :
1045 sizeof(struct in_addr));
1046 hlist_add_head_rcu(&key->node, &md5sig->head);
cfb6eeb4
YH
1047 return 0;
1048}
a915da9b 1049EXPORT_SYMBOL(tcp_md5_do_add);
cfb6eeb4 1050
a915da9b 1051int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
cfb6eeb4
YH
1052{
1053 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1054 struct tcp_md5sig_key *key;
a8afca03 1055 struct tcp_md5sig_info *md5sig;
a915da9b 1056
c0353c7b 1057 key = tcp_md5_do_lookup(sk, addr, family);
a915da9b
ED
1058 if (!key)
1059 return -ENOENT;
1060 hlist_del_rcu(&key->node);
5f3d9cb2 1061 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 1062 kfree_rcu(key, rcu);
a8afca03
ED
1063 md5sig = rcu_dereference_protected(tp->md5sig_info,
1064 sock_owned_by_user(sk));
1065 if (hlist_empty(&md5sig->head))
a915da9b
ED
1066 tcp_free_md5sig_pool();
1067 return 0;
cfb6eeb4 1068}
a915da9b 1069EXPORT_SYMBOL(tcp_md5_do_del);
cfb6eeb4 1070
e0683e70 1071static void tcp_clear_md5_list(struct sock *sk)
cfb6eeb4
YH
1072{
1073 struct tcp_sock *tp = tcp_sk(sk);
a915da9b 1074 struct tcp_md5sig_key *key;
b67bfe0d 1075 struct hlist_node *n;
a8afca03 1076 struct tcp_md5sig_info *md5sig;
cfb6eeb4 1077
a8afca03
ED
1078 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1079
1080 if (!hlist_empty(&md5sig->head))
cfb6eeb4 1081 tcp_free_md5sig_pool();
b67bfe0d 1082 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
a915da9b 1083 hlist_del_rcu(&key->node);
5f3d9cb2 1084 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
a915da9b 1085 kfree_rcu(key, rcu);
cfb6eeb4
YH
1086 }
1087}
1088
7174259e
ACM
1089static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1090 int optlen)
cfb6eeb4
YH
1091{
1092 struct tcp_md5sig cmd;
1093 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
cfb6eeb4
YH
1094
1095 if (optlen < sizeof(cmd))
1096 return -EINVAL;
1097
7174259e 1098 if (copy_from_user(&cmd, optval, sizeof(cmd)))
cfb6eeb4
YH
1099 return -EFAULT;
1100
1101 if (sin->sin_family != AF_INET)
1102 return -EINVAL;
1103
a8afca03 1104 if (!cmd.tcpm_key || !cmd.tcpm_keylen)
a915da9b
ED
1105 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1106 AF_INET);
cfb6eeb4
YH
1107
1108 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1109 return -EINVAL;
1110
a915da9b
ED
1111 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1112 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1113 GFP_KERNEL);
cfb6eeb4
YH
1114}
1115
49a72dfb
AL
1116static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1117 __be32 daddr, __be32 saddr, int nbytes)
cfb6eeb4 1118{
cfb6eeb4 1119 struct tcp4_pseudohdr *bp;
49a72dfb 1120 struct scatterlist sg;
cfb6eeb4
YH
1121
1122 bp = &hp->md5_blk.ip4;
cfb6eeb4
YH
1123
1124 /*
49a72dfb 1125 * 1. the TCP pseudo-header (in the order: source IP address,
cfb6eeb4
YH
1126 * destination IP address, zero-padded protocol number, and
1127 * segment length)
1128 */
1129 bp->saddr = saddr;
1130 bp->daddr = daddr;
1131 bp->pad = 0;
076fb722 1132 bp->protocol = IPPROTO_TCP;
49a72dfb 1133 bp->len = cpu_to_be16(nbytes);
c7da57a1 1134
49a72dfb
AL
1135 sg_init_one(&sg, bp, sizeof(*bp));
1136 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1137}
1138
a915da9b 1139static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
318cf7aa 1140 __be32 daddr, __be32 saddr, const struct tcphdr *th)
49a72dfb
AL
1141{
1142 struct tcp_md5sig_pool *hp;
1143 struct hash_desc *desc;
1144
1145 hp = tcp_get_md5sig_pool();
1146 if (!hp)
1147 goto clear_hash_noput;
1148 desc = &hp->md5_desc;
1149
1150 if (crypto_hash_init(desc))
1151 goto clear_hash;
1152 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1153 goto clear_hash;
1154 if (tcp_md5_hash_header(hp, th))
1155 goto clear_hash;
1156 if (tcp_md5_hash_key(hp, key))
1157 goto clear_hash;
1158 if (crypto_hash_final(desc, md5_hash))
cfb6eeb4
YH
1159 goto clear_hash;
1160
cfb6eeb4 1161 tcp_put_md5sig_pool();
cfb6eeb4 1162 return 0;
49a72dfb 1163
cfb6eeb4
YH
1164clear_hash:
1165 tcp_put_md5sig_pool();
1166clear_hash_noput:
1167 memset(md5_hash, 0, 16);
49a72dfb 1168 return 1;
cfb6eeb4
YH
1169}
1170
49a72dfb 1171int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
318cf7aa
ED
1172 const struct sock *sk, const struct request_sock *req,
1173 const struct sk_buff *skb)
cfb6eeb4 1174{
49a72dfb
AL
1175 struct tcp_md5sig_pool *hp;
1176 struct hash_desc *desc;
318cf7aa 1177 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
1178 __be32 saddr, daddr;
1179
1180 if (sk) {
c720c7e8
ED
1181 saddr = inet_sk(sk)->inet_saddr;
1182 daddr = inet_sk(sk)->inet_daddr;
49a72dfb
AL
1183 } else if (req) {
1184 saddr = inet_rsk(req)->loc_addr;
1185 daddr = inet_rsk(req)->rmt_addr;
cfb6eeb4 1186 } else {
49a72dfb
AL
1187 const struct iphdr *iph = ip_hdr(skb);
1188 saddr = iph->saddr;
1189 daddr = iph->daddr;
cfb6eeb4 1190 }
49a72dfb
AL
1191
1192 hp = tcp_get_md5sig_pool();
1193 if (!hp)
1194 goto clear_hash_noput;
1195 desc = &hp->md5_desc;
1196
1197 if (crypto_hash_init(desc))
1198 goto clear_hash;
1199
1200 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1201 goto clear_hash;
1202 if (tcp_md5_hash_header(hp, th))
1203 goto clear_hash;
1204 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1205 goto clear_hash;
1206 if (tcp_md5_hash_key(hp, key))
1207 goto clear_hash;
1208 if (crypto_hash_final(desc, md5_hash))
1209 goto clear_hash;
1210
1211 tcp_put_md5sig_pool();
1212 return 0;
1213
1214clear_hash:
1215 tcp_put_md5sig_pool();
1216clear_hash_noput:
1217 memset(md5_hash, 0, 16);
1218 return 1;
cfb6eeb4 1219}
49a72dfb 1220EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
cfb6eeb4 1221
a2a385d6 1222static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
cfb6eeb4
YH
1223{
1224 /*
1225 * This gets called for each TCP segment that arrives
1226 * so we want to be efficient.
1227 * We have 3 drop cases:
1228 * o No MD5 hash and one expected.
1229 * o MD5 hash and we're not expecting one.
1230 * o MD5 hash and its wrong.
1231 */
cf533ea5 1232 const __u8 *hash_location = NULL;
cfb6eeb4 1233 struct tcp_md5sig_key *hash_expected;
eddc9ec5 1234 const struct iphdr *iph = ip_hdr(skb);
cf533ea5 1235 const struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4 1236 int genhash;
cfb6eeb4
YH
1237 unsigned char newhash[16];
1238
a915da9b
ED
1239 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1240 AF_INET);
7d5d5525 1241 hash_location = tcp_parse_md5sig_option(th);
cfb6eeb4 1242
cfb6eeb4
YH
1243 /* We've parsed the options - do we have a hash? */
1244 if (!hash_expected && !hash_location)
a2a385d6 1245 return false;
cfb6eeb4
YH
1246
1247 if (hash_expected && !hash_location) {
785957d3 1248 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
a2a385d6 1249 return true;
cfb6eeb4
YH
1250 }
1251
1252 if (!hash_expected && hash_location) {
785957d3 1253 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
a2a385d6 1254 return true;
cfb6eeb4
YH
1255 }
1256
1257 /* Okay, so this is hash_expected and hash_location -
1258 * so we need to calculate the checksum.
1259 */
49a72dfb
AL
1260 genhash = tcp_v4_md5_hash_skb(newhash,
1261 hash_expected,
1262 NULL, NULL, skb);
cfb6eeb4
YH
1263
1264 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
e87cc472
JP
1265 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1266 &iph->saddr, ntohs(th->source),
1267 &iph->daddr, ntohs(th->dest),
1268 genhash ? " tcp_v4_calc_md5_hash failed"
1269 : "");
a2a385d6 1270 return true;
cfb6eeb4 1271 }
a2a385d6 1272 return false;
cfb6eeb4
YH
1273}
1274
1275#endif
1276
72a3effa 1277struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1da177e4 1278 .family = PF_INET,
2e6599cb 1279 .obj_size = sizeof(struct tcp_request_sock),
72659ecc 1280 .rtx_syn_ack = tcp_v4_rtx_synack,
60236fdd
ACM
1281 .send_ack = tcp_v4_reqsk_send_ack,
1282 .destructor = tcp_v4_reqsk_destructor,
1da177e4 1283 .send_reset = tcp_v4_send_reset,
72659ecc 1284 .syn_ack_timeout = tcp_syn_ack_timeout,
1da177e4
LT
1285};
1286
cfb6eeb4 1287#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 1288static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
cfb6eeb4 1289 .md5_lookup = tcp_v4_reqsk_md5_lookup,
e3afe7b7 1290 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 1291};
b6332e6c 1292#endif
cfb6eeb4 1293
168a8f58
JC
1294static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1295 struct request_sock *req,
1296 struct tcp_fastopen_cookie *foc,
1297 struct tcp_fastopen_cookie *valid_foc)
1298{
1299 bool skip_cookie = false;
1300 struct fastopen_queue *fastopenq;
1301
1302 if (likely(!fastopen_cookie_present(foc))) {
1303 /* See include/net/tcp.h for the meaning of these knobs */
1304 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1305 ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1306 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1307 skip_cookie = true; /* no cookie to validate */
1308 else
1309 return false;
1310 }
1311 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1312 /* A FO option is present; bump the counter. */
1313 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1314
1315 /* Make sure the listener has enabled fastopen, and we don't
1316 * exceed the max # of pending TFO requests allowed before trying
1317 * to validating the cookie in order to avoid burning CPU cycles
1318 * unnecessarily.
1319 *
1320 * XXX (TFO) - The implication of checking the max_qlen before
1321 * processing a cookie request is that clients can't differentiate
1322 * between qlen overflow causing Fast Open to be disabled
1323 * temporarily vs a server not supporting Fast Open at all.
1324 */
1325 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1326 fastopenq == NULL || fastopenq->max_qlen == 0)
1327 return false;
1328
1329 if (fastopenq->qlen >= fastopenq->max_qlen) {
1330 struct request_sock *req1;
1331 spin_lock(&fastopenq->lock);
1332 req1 = fastopenq->rskq_rst_head;
1333 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1334 spin_unlock(&fastopenq->lock);
1335 NET_INC_STATS_BH(sock_net(sk),
1336 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1337 /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1338 foc->len = -1;
1339 return false;
1340 }
1341 fastopenq->rskq_rst_head = req1->dl_next;
1342 fastopenq->qlen--;
1343 spin_unlock(&fastopenq->lock);
1344 reqsk_free(req1);
1345 }
1346 if (skip_cookie) {
1347 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1348 return true;
1349 }
1350 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1351 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1352 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1353 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1354 memcmp(&foc->val[0], &valid_foc->val[0],
1355 TCP_FASTOPEN_COOKIE_SIZE) != 0)
1356 return false;
1357 valid_foc->len = -1;
1358 }
1359 /* Acknowledge the data received from the peer. */
1360 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1361 return true;
1362 } else if (foc->len == 0) { /* Client requesting a cookie */
1363 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1364 NET_INC_STATS_BH(sock_net(sk),
1365 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1366 } else {
1367 /* Client sent a cookie with wrong size. Treat it
1368 * the same as invalid and return a valid one.
1369 */
1370 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1371 }
1372 return false;
1373}
1374
1375static int tcp_v4_conn_req_fastopen(struct sock *sk,
1376 struct sk_buff *skb,
1377 struct sk_buff *skb_synack,
1a2c6181 1378 struct request_sock *req)
168a8f58
JC
1379{
1380 struct tcp_sock *tp = tcp_sk(sk);
1381 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1382 const struct inet_request_sock *ireq = inet_rsk(req);
1383 struct sock *child;
016818d0 1384 int err;
168a8f58 1385
e6c022a4
ED
1386 req->num_retrans = 0;
1387 req->num_timeout = 0;
168a8f58
JC
1388 req->sk = NULL;
1389
1390 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1391 if (child == NULL) {
1392 NET_INC_STATS_BH(sock_net(sk),
1393 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1394 kfree_skb(skb_synack);
1395 return -1;
1396 }
016818d0
NC
1397 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1398 ireq->rmt_addr, ireq->opt);
1399 err = net_xmit_eval(err);
1400 if (!err)
1401 tcp_rsk(req)->snt_synack = tcp_time_stamp;
168a8f58
JC
1402 /* XXX (TFO) - is it ok to ignore error and continue? */
1403
1404 spin_lock(&queue->fastopenq->lock);
1405 queue->fastopenq->qlen++;
1406 spin_unlock(&queue->fastopenq->lock);
1407
1408 /* Initialize the child socket. Have to fix some values to take
1409 * into account the child is a Fast Open socket and is created
1410 * only out of the bits carried in the SYN packet.
1411 */
1412 tp = tcp_sk(child);
1413
1414 tp->fastopen_rsk = req;
1415 /* Do a hold on the listner sk so that if the listener is being
1416 * closed, the child that has been accepted can live on and still
1417 * access listen_lock.
1418 */
1419 sock_hold(sk);
1420 tcp_rsk(req)->listener = sk;
1421
1422 /* RFC1323: The window in SYN & SYN/ACK segments is never
1423 * scaled. So correct it appropriately.
1424 */
1425 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
5bcc3fac 1426 tp->max_window = tp->snd_wnd;
168a8f58
JC
1427
1428 /* Activate the retrans timer so that SYNACK can be retransmitted.
1429 * The request socket is not added to the SYN table of the parent
1430 * because it's been added to the accept queue directly.
1431 */
1432 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
6fa3eb70 1433 TCP_TIMEOUT_INIT, sysctl_tcp_rto_max);
168a8f58
JC
1434
1435 /* Add the child socket directly into the accept queue */
1436 inet_csk_reqsk_queue_add(sk, req, child);
1437
1438 /* Now finish processing the fastopen child socket. */
1439 inet_csk(child)->icsk_af_ops->rebuild_header(child);
1440 tcp_init_congestion_control(child);
1441 tcp_mtup_init(child);
1442 tcp_init_buffer_space(child);
1443 tcp_init_metrics(child);
1444
1445 /* Queue the data carried in the SYN packet. We need to first
1446 * bump skb's refcnt because the caller will attempt to free it.
1447 *
1448 * XXX (TFO) - we honor a zero-payload TFO request for now.
1449 * (Any reason not to?)
1450 */
1451 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1452 /* Don't queue the skb if there is no payload in SYN.
1453 * XXX (TFO) - How about SYN+FIN?
1454 */
1455 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1456 } else {
1457 skb = skb_get(skb);
1458 skb_dst_drop(skb);
1459 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1460 skb_set_owner_r(skb, child);
1461 __skb_queue_tail(&child->sk_receive_queue, skb);
1462 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
6f73601e 1463 tp->syn_data_acked = 1;
168a8f58
JC
1464 }
1465 sk->sk_data_ready(sk, 0);
1466 bh_unlock_sock(child);
1467 sock_put(child);
1468 WARN_ON(req->sk == NULL);
1469 return 0;
1470}
1471
1da177e4
LT
1472int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1473{
1474 struct tcp_options_received tmp_opt;
60236fdd 1475 struct request_sock *req;
e6b4d113 1476 struct inet_request_sock *ireq;
4957faad 1477 struct tcp_sock *tp = tcp_sk(sk);
e6b4d113 1478 struct dst_entry *dst = NULL;
eddc9ec5
ACM
1479 __be32 saddr = ip_hdr(skb)->saddr;
1480 __be32 daddr = ip_hdr(skb)->daddr;
1da177e4 1481 __u32 isn = TCP_SKB_CB(skb)->when;
a2a385d6 1482 bool want_cookie = false;
168a8f58
JC
1483 struct flowi4 fl4;
1484 struct tcp_fastopen_cookie foc = { .len = -1 };
1485 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1486 struct sk_buff *skb_synack;
1487 int do_fastopen;
1da177e4
LT
1488
1489 /* Never answer to SYNs send to broadcast or multicast */
511c3f92 1490 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1da177e4
LT
1491 goto drop;
1492
1493 /* TW buckets are converted to open requests without
1494 * limitations, they conserve resources and peer is
1495 * evidently real one.
1496 */
463c84b9 1497 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
946cedcc
ED
1498 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1499 if (!want_cookie)
1500 goto drop;
1da177e4
LT
1501 }
1502
1503 /* Accept backlog is full. If we have already queued enough
1504 * of warm entries in syn queue, drop request. It is better than
1505 * clogging syn queue with openreqs with exponentially increasing
1506 * timeout.
1507 */
2aeef18d
NS
1508 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1509 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1da177e4 1510 goto drop;
2aeef18d 1511 }
1da177e4 1512
ce4a7d0d 1513 req = inet_reqsk_alloc(&tcp_request_sock_ops);
1da177e4
LT
1514 if (!req)
1515 goto drop;
1516
cfb6eeb4
YH
1517#ifdef CONFIG_TCP_MD5SIG
1518 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1519#endif
1520
1da177e4 1521 tcp_clear_options(&tmp_opt);
bee7ca9e 1522 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
4957faad 1523 tmp_opt.user_mss = tp->rx_opt.user_mss;
1a2c6181 1524 tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1da177e4 1525
4dfc2817 1526 if (want_cookie && !tmp_opt.saw_tstamp)
1da177e4 1527 tcp_clear_options(&tmp_opt);
1da177e4 1528
1da177e4 1529 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1da177e4
LT
1530 tcp_openreq_init(req, &tmp_opt, skb);
1531
bb5b7c11
DM
1532 ireq = inet_rsk(req);
1533 ireq->loc_addr = daddr;
1534 ireq->rmt_addr = saddr;
1535 ireq->no_srccheck = inet_sk(sk)->transparent;
5dff747b 1536 ireq->opt = tcp_v4_save_options(skb);
6fa3eb70 1537 ireq->ir_mark = inet_request_mark(sk, skb);
bb5b7c11 1538
284904aa 1539 if (security_inet_conn_request(sk, skb, req))
bb5b7c11 1540 goto drop_and_free;
284904aa 1541
172d69e6 1542 if (!want_cookie || tmp_opt.tstamp_ok)
5d134f1c 1543 TCP_ECN_create_request(req, skb, sock_net(sk));
1da177e4
LT
1544
1545 if (want_cookie) {
1da177e4 1546 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
172d69e6 1547 req->cookie_ts = tmp_opt.tstamp_ok;
1da177e4 1548 } else if (!isn) {
1da177e4
LT
1549 /* VJ's idea. We save last timestamp seen
1550 * from the destination in peer table, when entering
1551 * state TIME-WAIT, and check against it before
1552 * accepting new connection request.
1553 *
1554 * If "isn" is not zero, this request hit alive
1555 * timewait bucket, so that all the necessary checks
1556 * are made in the function processing timewait state.
1557 */
1558 if (tmp_opt.saw_tstamp &&
295ff7ed 1559 tcp_death_row.sysctl_tw_recycle &&
ba3f7f04 1560 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
81166dd6
DM
1561 fl4.daddr == saddr) {
1562 if (!tcp_peer_is_proven(req, dst, true)) {
de0744af 1563 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
7cd04fa7 1564 goto drop_and_release;
1da177e4
LT
1565 }
1566 }
1567 /* Kill the following clause, if you dislike this way. */
1568 else if (!sysctl_tcp_syncookies &&
463c84b9 1569 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1da177e4 1570 (sysctl_max_syn_backlog >> 2)) &&
81166dd6 1571 !tcp_peer_is_proven(req, dst, false)) {
1da177e4
LT
1572 /* Without syncookies last quarter of
1573 * backlog is filled with destinations,
1574 * proven to be alive.
1575 * It means that we continue to communicate
1576 * to destinations, already remembered
1577 * to the moment of synflood.
1578 */
afd46503 1579 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
673d57e7 1580 &saddr, ntohs(tcp_hdr(skb)->source));
7cd04fa7 1581 goto drop_and_release;
1da177e4
LT
1582 }
1583
a94f723d 1584 isn = tcp_v4_init_sequence(skb);
1da177e4 1585 }
2e6599cb 1586 tcp_rsk(req)->snt_isn = isn;
1da177e4 1587
168a8f58
JC
1588 if (dst == NULL) {
1589 dst = inet_csk_route_req(sk, &fl4, req);
1590 if (dst == NULL)
1591 goto drop_and_free;
1592 }
1593 do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1594
1595 /* We don't call tcp_v4_send_synack() directly because we need
1596 * to make sure a child socket can be created successfully before
1597 * sending back synack!
1598 *
1599 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1600 * (or better yet, call tcp_send_synack() in the child context
1601 * directly, but will have to fix bunch of other code first)
1602 * after syn_recv_sock() except one will need to first fix the
1603 * latter to remove its dependency on the current implementation
1604 * of tcp_v4_send_synack()->tcp_select_initial_window().
1605 */
1606 skb_synack = tcp_make_synack(sk, dst, req,
168a8f58
JC
1607 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1608
1609 if (skb_synack) {
1610 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1611 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1612 } else
1613 goto drop_and_free;
1614
1615 if (likely(!do_fastopen)) {
1616 int err;
1617 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1618 ireq->rmt_addr, ireq->opt);
1619 err = net_xmit_eval(err);
1620 if (err || want_cookie)
1621 goto drop_and_free;
1622
016818d0 1623 tcp_rsk(req)->snt_synack = tcp_time_stamp;
168a8f58
JC
1624 tcp_rsk(req)->listener = NULL;
1625 /* Add the request_sock to the SYN table */
1626 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1627 if (fastopen_cookie_present(&foc) && foc.len != 0)
1628 NET_INC_STATS_BH(sock_net(sk),
1629 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1a2c6181 1630 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
1da177e4
LT
1631 goto drop_and_free;
1632
1da177e4
LT
1633 return 0;
1634
7cd04fa7
DL
1635drop_and_release:
1636 dst_release(dst);
1da177e4 1637drop_and_free:
60236fdd 1638 reqsk_free(req);
1da177e4 1639drop:
848bf15f 1640 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1da177e4
LT
1641 return 0;
1642}
4bc2f18b 1643EXPORT_SYMBOL(tcp_v4_conn_request);
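/*
 * Illustrative sketch, not part of the original file: the "last quarter"
 * guard from tcp_v4_conn_request() above, reduced to its arithmetic.  The
 * helper name and the value 256 are hypothetical; the real limit comes
 * from sysctl_max_syn_backlog.
 */
static inline bool syn_backlog_under_pressure_example(int queue_len)
{
	int max_backlog = 256;	/* hypothetical sysctl_max_syn_backlog */

	/* Drop requests from unproven peers once fewer than a quarter of
	 * the slots remain free: with 256 slots, once more than 192
	 * requests are already pending.
	 */
	return max_backlog - queue_len < (max_backlog >> 2);
}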
1da177e4
LT
1644
1645
1646/*
1647 * The three way handshake has completed - we got a valid synack -
1648 * now create the new socket.
1649 */
1650struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
60236fdd 1651 struct request_sock *req,
1da177e4
LT
1652 struct dst_entry *dst)
1653{
2e6599cb 1654 struct inet_request_sock *ireq;
1da177e4
LT
1655 struct inet_sock *newinet;
1656 struct tcp_sock *newtp;
1657 struct sock *newsk;
cfb6eeb4
YH
1658#ifdef CONFIG_TCP_MD5SIG
1659 struct tcp_md5sig_key *key;
1660#endif
f6d8bd05 1661 struct ip_options_rcu *inet_opt;
1da177e4
LT
1662
1663 if (sk_acceptq_is_full(sk))
1664 goto exit_overflow;
1665
1da177e4
LT
1666 newsk = tcp_create_openreq_child(sk, req, skb);
1667 if (!newsk)
093d2823 1668 goto exit_nonewsk;
1da177e4 1669
bcd76111 1670 newsk->sk_gso_type = SKB_GSO_TCPV4;
fae6ef87 1671 inet_sk_rx_dst_set(newsk, skb);
1da177e4
LT
1672
1673 newtp = tcp_sk(newsk);
1674 newinet = inet_sk(newsk);
2e6599cb 1675 ireq = inet_rsk(req);
c720c7e8
ED
1676 newinet->inet_daddr = ireq->rmt_addr;
1677 newinet->inet_rcv_saddr = ireq->loc_addr;
1678 newinet->inet_saddr = ireq->loc_addr;
f6d8bd05
ED
1679 inet_opt = ireq->opt;
1680 rcu_assign_pointer(newinet->inet_opt, inet_opt);
2e6599cb 1681 ireq->opt = NULL;
463c84b9 1682 newinet->mc_index = inet_iif(skb);
eddc9ec5 1683 newinet->mc_ttl = ip_hdr(skb)->ttl;
4c507d28 1684 newinet->rcv_tos = ip_hdr(skb)->tos;
d83d8461 1685 inet_csk(newsk)->icsk_ext_hdr_len = 0;
f6d8bd05
ED
1686 if (inet_opt)
1687 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
c720c7e8 1688 newinet->inet_id = newtp->write_seq ^ jiffies;
1da177e4 1689
dfd25fff
ED
1690 if (!dst) {
1691 dst = inet_csk_route_child_sock(sk, newsk, req);
1692 if (!dst)
1693 goto put_and_exit;
1694 } else {
1695 /* syncookie case : see end of cookie_v4_check() */
1696 }
0e734419
DM
1697 sk_setup_caps(newsk, dst);
1698
5d424d5a 1699 tcp_mtup_init(newsk);
1da177e4 1700 tcp_sync_mss(newsk, dst_mtu(dst));
0dbaee3b 1701 newtp->advmss = dst_metric_advmss(dst);
f5fff5dc
TQ
1702 if (tcp_sk(sk)->rx_opt.user_mss &&
1703 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1704 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1705
1da177e4 1706 tcp_initialize_rcv_mss(newsk);
623df484 1707 tcp_synack_rtt_meas(newsk, req);
e6c022a4 1708 newtp->total_retrans = req->num_retrans;
1da177e4 1709
cfb6eeb4
YH
1710#ifdef CONFIG_TCP_MD5SIG
1711 /* Copy over the MD5 key from the original socket */
a915da9b
ED
1712 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1713 AF_INET);
c720c7e8 1714 if (key != NULL) {
cfb6eeb4
YH
1715 /*
1716 * We're using one, so create a matching key
1717 * on the newsk structure. If we fail to get
1718 * memory, then we end up not copying the key
1719 * across. Shucks.
1720 */
a915da9b
ED
1721 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1722 AF_INET, key->key, key->keylen, GFP_ATOMIC);
a465419b 1723 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
cfb6eeb4
YH
1724 }
1725#endif
1726
0e734419
DM
1727 if (__inet_inherit_port(sk, newsk) < 0)
1728 goto put_and_exit;
9327f705 1729 __inet_hash_nolisten(newsk, NULL);
1da177e4
LT
1730
1731 return newsk;
1732
1733exit_overflow:
de0744af 1734 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
093d2823
BS
1735exit_nonewsk:
1736 dst_release(dst);
1da177e4 1737exit:
de0744af 1738 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1da177e4 1739 return NULL;
0e734419 1740put_and_exit:
e337e24d
CP
1741 inet_csk_prepare_forced_close(newsk);
1742 tcp_done(newsk);
0e734419 1743 goto exit;
1da177e4 1744}
4bc2f18b 1745EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
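/*
 * Illustrative sketch, not part of the original file: the accept-queue
 * check that makes tcp_v4_syn_recv_sock() above bail out and count a
 * LINUX_MIB_LISTENOVERFLOWS event.  sk_acceptq_is_full() amounts to the
 * comparison below; the helper name is hypothetical.
 */
static inline bool accept_queue_full_example(const struct sock *sk)
{
	/* Connections completed but not yet accept()ed, compared against
	 * the backlog the listener asked for.
	 */
	return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
}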
1da177e4
LT
1746
1747static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1748{
aa8223c7 1749 struct tcphdr *th = tcp_hdr(skb);
eddc9ec5 1750 const struct iphdr *iph = ip_hdr(skb);
1da177e4 1751 struct sock *nsk;
60236fdd 1752 struct request_sock **prev;
1da177e4 1753 /* Find possible connection requests. */
463c84b9
ACM
1754 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1755 iph->saddr, iph->daddr);
1da177e4 1756 if (req)
8336886f 1757 return tcp_check_req(sk, skb, req, prev, false);
1da177e4 1758
3b1e0a65 1759 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
c67499c0 1760 th->source, iph->daddr, th->dest, inet_iif(skb));
1da177e4
LT
1761
1762 if (nsk) {
1763 if (nsk->sk_state != TCP_TIME_WAIT) {
1764 bh_lock_sock(nsk);
1765 return nsk;
1766 }
9469c7b4 1767 inet_twsk_put(inet_twsk(nsk));
1da177e4
LT
1768 return NULL;
1769 }
1770
1771#ifdef CONFIG_SYN_COOKIES
af9b4738 1772 if (!th->syn)
1da177e4
LT
1773 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1774#endif
1775 return sk;
1776}
1777
b51655b9 1778static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1da177e4 1779{
eddc9ec5
ACM
1780 const struct iphdr *iph = ip_hdr(skb);
1781
84fa7933 1782 if (skb->ip_summed == CHECKSUM_COMPLETE) {
eddc9ec5
ACM
1783 if (!tcp_v4_check(skb->len, iph->saddr,
1784 iph->daddr, skb->csum)) {
fb286bb2 1785 skb->ip_summed = CHECKSUM_UNNECESSARY;
1da177e4 1786 return 0;
fb286bb2 1787 }
1da177e4 1788 }
fb286bb2 1789
eddc9ec5 1790 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
fb286bb2
HX
1791 skb->len, IPPROTO_TCP, 0);
1792
1da177e4 1793 if (skb->len <= 76) {
fb286bb2 1794 return __skb_checksum_complete(skb);
1da177e4
LT
1795 }
1796 return 0;
1797}
1798
1799
1800/* The socket must have its spinlock held when we get
1801 * here.
1802 *
1803 * We have a potential double-lock case here, so even when
1804 * doing backlog processing we use the BH locking scheme.
1805 * This is because we cannot sleep with the original spinlock
1806 * held.
1807 */
1808int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1809{
cfb6eeb4
YH
1810 struct sock *rsk;
1811#ifdef CONFIG_TCP_MD5SIG
1812 /*
1813 * We really want to reject the packet as early as possible
1814 * if:
1815	 * o We're expecting an MD5-signed packet and it carries no MD5 TCP option
1816 * o There is an MD5 option and we're not expecting one
1817 */
7174259e 1818 if (tcp_v4_inbound_md5_hash(sk, skb))
cfb6eeb4
YH
1819 goto discard;
1820#endif
1821
1da177e4 1822 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
404e0a8b
ED
1823 struct dst_entry *dst = sk->sk_rx_dst;
1824
bdeab991 1825 sock_rps_save_rxhash(sk, skb);
404e0a8b 1826 if (dst) {
505fbcf0
ED
1827 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1828 dst->ops->check(dst, 0) == NULL) {
92101b3b
DM
1829 dst_release(dst);
1830 sk->sk_rx_dst = NULL;
1831 }
1832 }
aa8223c7 1833 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
cfb6eeb4 1834 rsk = sk;
1da177e4 1835 goto reset;
cfb6eeb4 1836 }
1da177e4
LT
1837 return 0;
1838 }
1839
ab6a5bb6 1840 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1da177e4
LT
1841 goto csum_err;
1842
1843 if (sk->sk_state == TCP_LISTEN) {
1844 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1845 if (!nsk)
1846 goto discard;
1847
1848 if (nsk != sk) {
bdeab991 1849 sock_rps_save_rxhash(nsk, skb);
cfb6eeb4
YH
1850 if (tcp_child_process(sk, nsk, skb)) {
1851 rsk = nsk;
1da177e4 1852 goto reset;
cfb6eeb4 1853 }
1da177e4
LT
1854 return 0;
1855 }
ca55158c 1856 } else
bdeab991 1857 sock_rps_save_rxhash(sk, skb);
ca55158c 1858
aa8223c7 1859 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
cfb6eeb4 1860 rsk = sk;
1da177e4 1861 goto reset;
cfb6eeb4 1862 }
1da177e4
LT
1863 return 0;
1864
1865reset:
cfb6eeb4 1866 tcp_v4_send_reset(rsk, skb);
1da177e4
LT
1867discard:
1868 kfree_skb(skb);
1869 /* Be careful here. If this function gets more complicated and
1870 * gcc suffers from register pressure on the x86, sk (in %ebx)
1871 * might be destroyed here. This current version compiles correctly,
1872 * but you have been warned.
1873 */
1874 return 0;
1875
1876csum_err:
6a5dc9e5 1877 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
63231bdd 1878 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1da177e4
LT
1879 goto discard;
1880}
4bc2f18b 1881EXPORT_SYMBOL(tcp_v4_do_rcv);
1da177e4 1882
160eb5a6 1883void tcp_v4_early_demux(struct sk_buff *skb)
41063e9d 1884{
41063e9d
DM
1885 const struct iphdr *iph;
1886 const struct tcphdr *th;
1887 struct sock *sk;
41063e9d 1888
41063e9d 1889 if (skb->pkt_type != PACKET_HOST)
160eb5a6 1890 return;
41063e9d 1891
45f00f99 1892 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
160eb5a6 1893 return;
41063e9d
DM
1894
1895 iph = ip_hdr(skb);
45f00f99 1896 th = tcp_hdr(skb);
41063e9d
DM
1897
1898 if (th->doff < sizeof(struct tcphdr) / 4)
160eb5a6 1899 return;
41063e9d 1900
45f00f99 1901 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
41063e9d 1902 iph->saddr, th->source,
7011d085 1903 iph->daddr, ntohs(th->dest),
9cb429d6 1904 skb->skb_iif);
41063e9d
DM
1905 if (sk) {
1906 skb->sk = sk;
1907 skb->destructor = sock_edemux;
1908 if (sk->sk_state != TCP_TIME_WAIT) {
1b946e38 1909 struct dst_entry *dst = ACCESS_ONCE(sk->sk_rx_dst);
505fbcf0 1910
41063e9d
DM
1911 if (dst)
1912 dst = dst_check(dst, 0);
92101b3b 1913 if (dst &&
505fbcf0 1914 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
92101b3b 1915 skb_dst_set_noref(skb, dst);
41063e9d
DM
1916 }
1917 }
41063e9d
DM
1918}
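/*
 * Illustrative sketch, not part of the original file: the two conditions
 * under which the rx dst cached by tcp_v4_early_demux() above is reused,
 * mirroring the checks there and in tcp_v4_do_rcv().  The helper name is
 * hypothetical.
 */
static inline bool cached_rx_dst_usable_example(const struct sock *sk,
						const struct sk_buff *skb,
						struct dst_entry *dst)
{
	/* The cached route must still be valid and the packet must arrive
	 * on the interface the cache entry was created for.
	 */
	return dst_check(dst, 0) != NULL &&
	       inet_sk(sk)->rx_dst_ifindex == skb->skb_iif;
}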
1919
b2fb4f54
ED
1920/* Packet is added to VJ-style prequeue for processing in process
1921 * context, if a reader task is waiting. Apparently, this exciting
1922 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1923 * failed somewhere. Latency? Burstiness? Well, at least now we will
1925	 * see why it failed. 8)8) --ANK
1925 *
1926 */
1927bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1928{
1929 struct tcp_sock *tp = tcp_sk(sk);
1930
1931 if (sysctl_tcp_low_latency || !tp->ucopy.task)
1932 return false;
1933
1934 if (skb->len <= tcp_hdrlen(skb) &&
1935 skb_queue_len(&tp->ucopy.prequeue) == 0)
1936 return false;
1937
58717686 1938 skb_dst_force(skb);
b2fb4f54
ED
1939 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1940 tp->ucopy.memory += skb->truesize;
1941 if (tp->ucopy.memory > sk->sk_rcvbuf) {
1942 struct sk_buff *skb1;
1943
1944 BUG_ON(sock_owned_by_user(sk));
1945
1946 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1947 sk_backlog_rcv(sk, skb1);
1948 NET_INC_STATS_BH(sock_net(sk),
1949 LINUX_MIB_TCPPREQUEUEDROPPED);
1950 }
1951
1952 tp->ucopy.memory = 0;
1953 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1954 wake_up_interruptible_sync_poll(sk_sleep(sk),
1955 POLLIN | POLLRDNORM | POLLRDBAND);
1956 if (!inet_csk_ack_scheduled(sk))
1957 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1958 (3 * tcp_rto_min(sk)) / 4,
6fa3eb70 1959 sysctl_tcp_rto_max);
b2fb4f54
ED
1960 }
1961 return true;
1962}
1963EXPORT_SYMBOL(tcp_prequeue);
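/*
 * Illustrative sketch, not part of the original file: the accounting
 * decision tcp_prequeue() above makes before draining the whole prequeue
 * through sk_backlog_rcv().  The parameter names are hypothetical.
 */
static inline bool prequeue_over_rcvbuf_example(unsigned int queued_memory,
						unsigned int skb_truesize,
						unsigned int rcvbuf)
{
	/* Each queued skb is charged at skb->truesize; once the running
	 * total exceeds sk->sk_rcvbuf the queue is processed immediately.
	 */
	return queued_memory + skb_truesize > rcvbuf;
}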
1964
56325d9f
ED
1965int tcp_filter(struct sock *sk, struct sk_buff *skb)
1966{
1967 struct tcphdr *th = (struct tcphdr *)skb->data;
1968 unsigned int eaten = skb->len;
1969 int err;
1970
1971 err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1972 if (!err) {
1973 eaten -= skb->len;
1974 TCP_SKB_CB(skb)->end_seq -= eaten;
1975 }
1976 return err;
1977}
1978EXPORT_SYMBOL(tcp_filter);
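/*
 * Illustrative sketch, not part of the original file: the bookkeeping
 * tcp_filter() above performs when a socket filter trims the payload.
 * For a hypothetical 100-byte segment cut back to its 20-byte header,
 * "eaten" is 80 and end_seq moves back by 80.  The helper name is
 * hypothetical.
 */
static inline u32 trimmed_end_seq_example(u32 end_seq, unsigned int old_len,
					  unsigned int new_len)
{
	unsigned int eaten = old_len - new_len;	/* bytes removed by the filter */

	return end_seq - eaten;
}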
1979
1da177e4
LT
1980/*
1981 * From tcp_input.c
1982 */
1983
1984int tcp_v4_rcv(struct sk_buff *skb)
1985{
eddc9ec5 1986 const struct iphdr *iph;
cf533ea5 1987 const struct tcphdr *th;
1da177e4
LT
1988 struct sock *sk;
1989 int ret;
a86b1e30 1990 struct net *net = dev_net(skb->dev);
1da177e4
LT
1991
1992 if (skb->pkt_type != PACKET_HOST)
1993 goto discard_it;
1994
1995 /* Count it even if it's bad */
63231bdd 1996 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1da177e4
LT
1997
1998 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1999 goto discard_it;
2000
aa8223c7 2001 th = tcp_hdr(skb);
1da177e4
LT
2002
2003 if (th->doff < sizeof(struct tcphdr) / 4)
2004 goto bad_packet;
2005 if (!pskb_may_pull(skb, th->doff * 4))
2006 goto discard_it;
2007
2008 /* An explanation is required here, I think.
2009 * Packet length and doff are validated by header prediction,
caa20d9a 2010	 * provided the case of th->doff == 0 is eliminated.
1da177e4 2011 * So, we defer the checks. */
60476372 2012 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
6a5dc9e5 2013 goto csum_error;
1da177e4 2014
aa8223c7 2015 th = tcp_hdr(skb);
eddc9ec5 2016 iph = ip_hdr(skb);
1da177e4
LT
2017 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2018 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2019 skb->len - th->doff * 4);
2020 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2021 TCP_SKB_CB(skb)->when = 0;
b82d1bb4 2022 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1da177e4
LT
2023 TCP_SKB_CB(skb)->sacked = 0;
2024
9a1f27c4 2025 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1da177e4
LT
2026 if (!sk)
2027 goto no_tcp_socket;
2028
bb134d5d
ED
2029process:
2030 if (sk->sk_state == TCP_TIME_WAIT)
2031 goto do_time_wait;
2032
6cce09f8
ED
2033 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2034 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 2035 goto discard_and_relse;
6cce09f8 2036 }
d218d111 2037
1da177e4
LT
2038 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2039 goto discard_and_relse;
b59c2701 2040 nf_reset(skb);
1da177e4 2041
56325d9f 2042 if (tcp_filter(sk, skb))
1da177e4 2043 goto discard_and_relse;
56325d9f
ED
2044 th = (const struct tcphdr *)skb->data;
2045 iph = ip_hdr(skb);
1da177e4
LT
2046
2047 skb->dev = NULL;
2048
c6366184 2049 bh_lock_sock_nested(sk);
1da177e4
LT
2050 ret = 0;
2051 if (!sock_owned_by_user(sk)) {
1a2449a8
CL
2052#ifdef CONFIG_NET_DMA
2053 struct tcp_sock *tp = tcp_sk(sk);
2054 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
a2bd1140 2055 tp->ucopy.dma_chan = net_dma_find_channel();
1a2449a8 2056 if (tp->ucopy.dma_chan)
1da177e4 2057 ret = tcp_v4_do_rcv(sk, skb);
1a2449a8
CL
2058 else
2059#endif
2060 {
2061 if (!tcp_prequeue(sk, skb))
ae8d7f88 2062 ret = tcp_v4_do_rcv(sk, skb);
1a2449a8 2063 }
da882c1f
ED
2064 } else if (unlikely(sk_add_backlog(sk, skb,
2065 sk->sk_rcvbuf + sk->sk_sndbuf))) {
6b03a53a 2066 bh_unlock_sock(sk);
6cce09f8 2067 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
6b03a53a
ZY
2068 goto discard_and_relse;
2069 }
1da177e4
LT
2070 bh_unlock_sock(sk);
2071
2072 sock_put(sk);
2073
2074 return ret;
2075
2076no_tcp_socket:
2077 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2078 goto discard_it;
2079
2080 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
6a5dc9e5
ED
2081csum_error:
2082 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1da177e4 2083bad_packet:
63231bdd 2084 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1da177e4 2085 } else {
cfb6eeb4 2086 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
2087 }
2088
2089discard_it:
2090 /* Discard frame. */
2091 kfree_skb(skb);
e905a9ed 2092 return 0;
1da177e4
LT
2093
2094discard_and_relse:
2095 sock_put(sk);
2096 goto discard_it;
2097
2098do_time_wait:
2099 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 2100 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
2101 goto discard_it;
2102 }
2103
6a5dc9e5 2104 if (skb->len < (th->doff << 2)) {
9469c7b4 2105 inet_twsk_put(inet_twsk(sk));
6a5dc9e5
ED
2106 goto bad_packet;
2107 }
2108 if (tcp_checksum_complete(skb)) {
2109 inet_twsk_put(inet_twsk(sk));
2110 goto csum_error;
1da177e4 2111 }
9469c7b4 2112 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 2113 case TCP_TW_SYN: {
c346dca1 2114 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
c67499c0 2115 &tcp_hashinfo,
da5e3630 2116 iph->saddr, th->source,
eddc9ec5 2117 iph->daddr, th->dest,
463c84b9 2118 inet_iif(skb));
1da177e4 2119 if (sk2) {
9469c7b4
YH
2120 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2121 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
2122 sk = sk2;
2123 goto process;
2124 }
2125 /* Fall through to ACK */
2126 }
2127 case TCP_TW_ACK:
2128 tcp_v4_timewait_ack(sk, skb);
2129 break;
2130 case TCP_TW_RST:
2131 goto no_tcp_socket;
2132 case TCP_TW_SUCCESS:;
2133 }
2134 goto discard_it;
2135}
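/*
 * Illustrative sketch, not part of the original file: how tcp_v4_rcv()
 * above derives TCP_SKB_CB(skb)->end_seq.  SYN and FIN each occupy one
 * unit of sequence space on top of the payload, so a pure data segment of
 * 100 bytes starting at seq 1000 ends at 1100.  The helper name is
 * hypothetical.
 */
static inline u32 tcp_end_seq_example(u32 seq, const struct tcphdr *th,
				      unsigned int skb_len)
{
	unsigned int payload = skb_len - th->doff * 4;

	return seq + th->syn + th->fin + payload;
}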
2136
ccb7c410
DM
2137static struct timewait_sock_ops tcp_timewait_sock_ops = {
2138 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2139 .twsk_unique = tcp_twsk_unique,
2140 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 2141};
1da177e4 2142
63d02d15 2143void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
2144{
2145 struct dst_entry *dst = skb_dst(skb);
2146
2147 dst_hold(dst);
2148 sk->sk_rx_dst = dst;
2149 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2150}
63d02d15 2151EXPORT_SYMBOL(inet_sk_rx_dst_set);
5d299f3d 2152
3b401a81 2153const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
2154 .queue_xmit = ip_queue_xmit,
2155 .send_check = tcp_v4_send_check,
2156 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 2157 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
2158 .conn_request = tcp_v4_conn_request,
2159 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
2160 .net_header_len = sizeof(struct iphdr),
2161 .setsockopt = ip_setsockopt,
2162 .getsockopt = ip_getsockopt,
2163 .addr2sockaddr = inet_csk_addr2sockaddr,
2164 .sockaddr_len = sizeof(struct sockaddr_in),
ab1e0a13 2165 .bind_conflict = inet_csk_bind_conflict,
3fdadf7d 2166#ifdef CONFIG_COMPAT
543d9cfe
ACM
2167 .compat_setsockopt = compat_ip_setsockopt,
2168 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 2169#endif
5f80f4d8 2170 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 2171};
4bc2f18b 2172EXPORT_SYMBOL(ipv4_specific);
1da177e4 2173
cfb6eeb4 2174#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 2175static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 2176 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 2177 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 2178 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 2179};
b6332e6c 2180#endif
cfb6eeb4 2181
1da177e4
LT
2182/* NOTE: A lot of things are set to zero explicitly by the call to
2183 * sk_alloc(), so they need not be done here.
2184 */
2185static int tcp_v4_init_sock(struct sock *sk)
2186{
6687e988 2187 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 2188
900f65d3 2189 tcp_init_sock(sk);
6fa3eb70 2190 icsk->icsk_MMSRB = 0;
1da177e4 2191
8292a17a 2192 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 2193
cfb6eeb4 2194#ifdef CONFIG_TCP_MD5SIG
ac807fa8 2195 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 2196#endif
1da177e4 2197
1da177e4
LT
2198 return 0;
2199}
2200
7d06b2e0 2201void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
2202{
2203 struct tcp_sock *tp = tcp_sk(sk);
2204
2205 tcp_clear_xmit_timers(sk);
2206
6687e988 2207 tcp_cleanup_congestion_control(sk);
317a76f9 2208
1da177e4 2209	 /* Clean up the write buffer. */
fe067e8a 2210 tcp_write_queue_purge(sk);
1da177e4
LT
2211
2212 /* Cleans up our, hopefully empty, out_of_order_queue. */
e905a9ed 2213 __skb_queue_purge(&tp->out_of_order_queue);
1da177e4 2214
cfb6eeb4
YH
2215#ifdef CONFIG_TCP_MD5SIG
2216 /* Clean up the MD5 key list, if any */
2217 if (tp->md5sig_info) {
a915da9b 2218 tcp_clear_md5_list(sk);
a8afca03 2219 kfree_rcu(tp->md5sig_info, rcu);
cfb6eeb4
YH
2220 tp->md5sig_info = NULL;
2221 }
2222#endif
2223
1a2449a8
CL
2224#ifdef CONFIG_NET_DMA
2225 /* Cleans up our sk_async_wait_queue */
e905a9ed 2226 __skb_queue_purge(&sk->sk_async_wait_queue);
1a2449a8
CL
2227#endif
2228
1da177e4
LT
2229	 /* Clean the prequeue; it really must be empty by now */
2230 __skb_queue_purge(&tp->ucopy.prequeue);
2231
2232 /* Clean up a referenced TCP bind bucket. */
463c84b9 2233 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 2234 inet_put_port(sk);
1da177e4 2235
168a8f58 2236 BUG_ON(tp->fastopen_rsk != NULL);
435cf559 2237
cf60af03
YC
2238 /* If socket is aborted during connect operation */
2239 tcp_free_fastopen_req(tp);
2240
180d8cd9 2241 sk_sockets_allocated_dec(sk);
d1a4c0b3 2242 sock_release_memcg(sk);
1da177e4 2243}
1da177e4
LT
2244EXPORT_SYMBOL(tcp_v4_destroy_sock);
2245
6fa3eb70
S
2246void tcp_v4_handle_retrans_time_by_uid(struct uid_err uid_e)
2247{
2248 unsigned int bucket;
2249 uid_t skuid = (uid_t)(uid_e.appuid);
2250	 struct inet_connection_sock *icsk = NULL;	/* set per socket in the loop below */
2251
2252
2253 for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) {
2254 struct hlist_nulls_node *node;
2255 struct sock *sk;
2256 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket);
2257
2258 spin_lock_bh(lock);
2259 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) {
2260
2261 if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT)
2262 continue;
2263 if (sock_flag(sk, SOCK_DEAD))
2264 continue;
2265
2266		 if (sk->sk_socket) {
2267			 if (SOCK_INODE(sk->sk_socket)->i_uid != skuid)
2268				 continue;
2269			 else
2270				 printk(KERN_INFO "[mmspb] tcp_v4_handle_retrans_time_by_uid socket uid(%d) match!\n",
2271					 SOCK_INODE(sk->sk_socket)->i_uid);
2272		 } else {
2273			 continue;
2274		 }
2275
2276 sock_hold(sk);
2277 spin_unlock_bh(lock);
2278
2279 local_bh_disable();
2280 bh_lock_sock(sk);
2281
2282		 /* update the socket's timeout value */
2283 icsk = inet_csk(sk);
2284 printk("[mmspb] tcp_v4_handle_retrans_time_by_uid update timer\n");
2285
2286 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + 2);
2287 icsk->icsk_rto = sysctl_tcp_rto_min * 30;
2288 icsk->icsk_MMSRB = 1;
2289
2290 bh_unlock_sock(sk);
2291 local_bh_enable();
2292 spin_lock_bh(lock);
2293 sock_put(sk);
2294
2295 }
2296 spin_unlock_bh(lock);
2297 }
2298
2299}
2300
2301
2302/*
2303 * tcp_v4_reset_connections_by_uid - destroy all sockets owned by the given uid
2304 */
2305void tcp_v4_reset_connections_by_uid(struct uid_err uid_e)
2306{
2307 unsigned int bucket;
2308 uid_t skuid = (uid_t)(uid_e.appuid);
2309
2310 for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) {
2311 struct hlist_nulls_node *node;
2312 struct sock *sk;
2313 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket);
2314
2315restart:
2316 spin_lock_bh(lock);
2317 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) {
2318
2319 if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT)
2320 continue;
2321 if (sock_flag(sk, SOCK_DEAD))
2322 continue;
2323
2324		 if (sk->sk_socket) {
2325			 if (SOCK_INODE(sk->sk_socket)->i_uid != skuid)
2326				 continue;
2327			 else
2328				 printk(KERN_INFO "SIOCKILLSOCK socket uid(%d) match!\n",
2329					 SOCK_INODE(sk->sk_socket)->i_uid);
2330		 } else {
2331			 continue;
2332		 }
2333
2334 sock_hold(sk);
2335 spin_unlock_bh(lock);
2336
2337 local_bh_disable();
2338 bh_lock_sock(sk);
2339 sk->sk_err = uid_e.errNum;
2340 printk(KERN_INFO "SIOCKILLSOCK set sk err == %d!! \n", sk->sk_err);
2341 sk->sk_error_report(sk);
2342
2343 tcp_done(sk);
2344 bh_unlock_sock(sk);
2345 local_bh_enable();
2346 sock_put(sk);
2347
2348 goto restart;
2349 }
2350 spin_unlock_bh(lock);
2351 }
2352}
2353
2354
1da177e4
LT
2355#ifdef CONFIG_PROC_FS
2356/* Proc filesystem TCP sock list dumping. */
2357
3ab5aee7 2358static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1da177e4 2359{
3ab5aee7 2360 return hlist_nulls_empty(head) ? NULL :
8feaf0c0 2361 list_entry(head->first, struct inet_timewait_sock, tw_node);
1da177e4
LT
2362}
2363
8feaf0c0 2364static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1da177e4 2365{
3ab5aee7
ED
2366 return !is_a_nulls(tw->tw_node.next) ?
2367 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1da177e4
LT
2368}
2369
a8b690f9
TH
2370/*
2371 * Get the next listener socket following cur. If cur is NULL, get the first socket
2372 * starting from bucket given in st->bucket; when st->bucket is zero the
2373 * very first socket in the hash table is returned.
2374 */
1da177e4
LT
2375static void *listening_get_next(struct seq_file *seq, void *cur)
2376{
463c84b9 2377 struct inet_connection_sock *icsk;
c25eb3bf 2378 struct hlist_nulls_node *node;
1da177e4 2379 struct sock *sk = cur;
5caea4ea 2380 struct inet_listen_hashbucket *ilb;
5799de0b 2381 struct tcp_iter_state *st = seq->private;
a4146b1b 2382 struct net *net = seq_file_net(seq);
1da177e4
LT
2383
2384 if (!sk) {
a8b690f9 2385 ilb = &tcp_hashinfo.listening_hash[st->bucket];
5caea4ea 2386 spin_lock_bh(&ilb->lock);
c25eb3bf 2387 sk = sk_nulls_head(&ilb->head);
a8b690f9 2388 st->offset = 0;
1da177e4
LT
2389 goto get_sk;
2390 }
5caea4ea 2391 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 2392 ++st->num;
a8b690f9 2393 ++st->offset;
1da177e4
LT
2394
2395 if (st->state == TCP_SEQ_STATE_OPENREQ) {
60236fdd 2396 struct request_sock *req = cur;
1da177e4 2397
72a3effa 2398 icsk = inet_csk(st->syn_wait_sk);
1da177e4
LT
2399 req = req->dl_next;
2400 while (1) {
2401 while (req) {
bdccc4ca 2402 if (req->rsk_ops->family == st->family) {
1da177e4
LT
2403 cur = req;
2404 goto out;
2405 }
2406 req = req->dl_next;
2407 }
72a3effa 2408 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1da177e4
LT
2409 break;
2410get_req:
463c84b9 2411 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1da177e4 2412 }
1bde5ac4 2413 sk = sk_nulls_next(st->syn_wait_sk);
1da177e4 2414 st->state = TCP_SEQ_STATE_LISTENING;
463c84b9 2415 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 2416 } else {
e905a9ed 2417 icsk = inet_csk(sk);
463c84b9
ACM
2418 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2419 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1da177e4 2420 goto start_req;
463c84b9 2421 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1bde5ac4 2422 sk = sk_nulls_next(sk);
1da177e4
LT
2423 }
2424get_sk:
c25eb3bf 2425 sk_nulls_for_each_from(sk, node) {
8475ef9f
PE
2426 if (!net_eq(sock_net(sk), net))
2427 continue;
2428 if (sk->sk_family == st->family) {
1da177e4
LT
2429 cur = sk;
2430 goto out;
2431 }
e905a9ed 2432 icsk = inet_csk(sk);
463c84b9
ACM
2433 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2434 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1da177e4
LT
2435start_req:
2436 st->uid = sock_i_uid(sk);
2437 st->syn_wait_sk = sk;
2438 st->state = TCP_SEQ_STATE_OPENREQ;
2439 st->sbucket = 0;
2440 goto get_req;
2441 }
463c84b9 2442 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 2443 }
5caea4ea 2444 spin_unlock_bh(&ilb->lock);
a8b690f9 2445 st->offset = 0;
0f7ff927 2446 if (++st->bucket < INET_LHTABLE_SIZE) {
5caea4ea
ED
2447 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2448 spin_lock_bh(&ilb->lock);
c25eb3bf 2449 sk = sk_nulls_head(&ilb->head);
1da177e4
LT
2450 goto get_sk;
2451 }
2452 cur = NULL;
2453out:
2454 return cur;
2455}
2456
2457static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2458{
a8b690f9
TH
2459 struct tcp_iter_state *st = seq->private;
2460 void *rc;
2461
2462 st->bucket = 0;
2463 st->offset = 0;
2464 rc = listening_get_next(seq, NULL);
1da177e4
LT
2465
2466 while (rc && *pos) {
2467 rc = listening_get_next(seq, rc);
2468 --*pos;
2469 }
2470 return rc;
2471}
2472
a2a385d6 2473static inline bool empty_bucket(struct tcp_iter_state *st)
6eac5604 2474{
3ab5aee7
ED
2475 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2476 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
6eac5604
AK
2477}
2478
a8b690f9
TH
2479/*
2480 * Get first established socket starting from bucket given in st->bucket.
2481 * If st->bucket is zero, the very first socket in the hash is returned.
2482 */
1da177e4
LT
2483static void *established_get_first(struct seq_file *seq)
2484{
5799de0b 2485 struct tcp_iter_state *st = seq->private;
a4146b1b 2486 struct net *net = seq_file_net(seq);
1da177e4
LT
2487 void *rc = NULL;
2488
a8b690f9
TH
2489 st->offset = 0;
2490 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 2491 struct sock *sk;
3ab5aee7 2492 struct hlist_nulls_node *node;
8feaf0c0 2493 struct inet_timewait_sock *tw;
9db66bdc 2494 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 2495
6eac5604
AK
2496 /* Lockless fast path for the common case of empty buckets */
2497 if (empty_bucket(st))
2498 continue;
2499
9db66bdc 2500 spin_lock_bh(lock);
3ab5aee7 2501 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
f40c8174 2502 if (sk->sk_family != st->family ||
878628fb 2503 !net_eq(sock_net(sk), net)) {
1da177e4
LT
2504 continue;
2505 }
2506 rc = sk;
2507 goto out;
2508 }
2509 st->state = TCP_SEQ_STATE_TIME_WAIT;
8feaf0c0 2510 inet_twsk_for_each(tw, node,
dbca9b27 2511 &tcp_hashinfo.ehash[st->bucket].twchain) {
28518fc1 2512 if (tw->tw_family != st->family ||
878628fb 2513 !net_eq(twsk_net(tw), net)) {
1da177e4
LT
2514 continue;
2515 }
2516 rc = tw;
2517 goto out;
2518 }
9db66bdc 2519 spin_unlock_bh(lock);
1da177e4
LT
2520 st->state = TCP_SEQ_STATE_ESTABLISHED;
2521 }
2522out:
2523 return rc;
2524}
2525
2526static void *established_get_next(struct seq_file *seq, void *cur)
2527{
2528 struct sock *sk = cur;
8feaf0c0 2529 struct inet_timewait_sock *tw;
3ab5aee7 2530 struct hlist_nulls_node *node;
5799de0b 2531 struct tcp_iter_state *st = seq->private;
a4146b1b 2532 struct net *net = seq_file_net(seq);
1da177e4
LT
2533
2534 ++st->num;
a8b690f9 2535 ++st->offset;
1da177e4
LT
2536
2537 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2538 tw = cur;
2539 tw = tw_next(tw);
2540get_tw:
878628fb 2541 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
1da177e4
LT
2542 tw = tw_next(tw);
2543 }
2544 if (tw) {
2545 cur = tw;
2546 goto out;
2547 }
9db66bdc 2548 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2549 st->state = TCP_SEQ_STATE_ESTABLISHED;
2550
6eac5604 2551		 /* Look for the next non-empty bucket */
a8b690f9 2552 st->offset = 0;
f373b53b 2553 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
6eac5604
AK
2554 empty_bucket(st))
2555 ;
f373b53b 2556 if (st->bucket > tcp_hashinfo.ehash_mask)
6eac5604
AK
2557 return NULL;
2558
9db66bdc 2559 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
3ab5aee7 2560 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
1da177e4 2561 } else
3ab5aee7 2562 sk = sk_nulls_next(sk);
1da177e4 2563
3ab5aee7 2564 sk_nulls_for_each_from(sk, node) {
878628fb 2565 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1da177e4
LT
2566 goto found;
2567 }
2568
2569 st->state = TCP_SEQ_STATE_TIME_WAIT;
dbca9b27 2570 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
1da177e4
LT
2571 goto get_tw;
2572found:
2573 cur = sk;
2574out:
2575 return cur;
2576}
2577
2578static void *established_get_idx(struct seq_file *seq, loff_t pos)
2579{
a8b690f9
TH
2580 struct tcp_iter_state *st = seq->private;
2581 void *rc;
2582
2583 st->bucket = 0;
2584 rc = established_get_first(seq);
1da177e4
LT
2585
2586 while (rc && pos) {
2587 rc = established_get_next(seq, rc);
2588 --pos;
7174259e 2589 }
1da177e4
LT
2590 return rc;
2591}
2592
2593static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2594{
2595 void *rc;
5799de0b 2596 struct tcp_iter_state *st = seq->private;
1da177e4 2597
1da177e4
LT
2598 st->state = TCP_SEQ_STATE_LISTENING;
2599 rc = listening_get_idx(seq, &pos);
2600
2601 if (!rc) {
1da177e4
LT
2602 st->state = TCP_SEQ_STATE_ESTABLISHED;
2603 rc = established_get_idx(seq, pos);
2604 }
2605
2606 return rc;
2607}
2608
a8b690f9
TH
2609static void *tcp_seek_last_pos(struct seq_file *seq)
2610{
2611 struct tcp_iter_state *st = seq->private;
2612 int offset = st->offset;
2613 int orig_num = st->num;
2614 void *rc = NULL;
2615
2616 switch (st->state) {
2617 case TCP_SEQ_STATE_OPENREQ:
2618 case TCP_SEQ_STATE_LISTENING:
2619 if (st->bucket >= INET_LHTABLE_SIZE)
2620 break;
2621 st->state = TCP_SEQ_STATE_LISTENING;
2622 rc = listening_get_next(seq, NULL);
2623 while (offset-- && rc)
2624 rc = listening_get_next(seq, rc);
2625 if (rc)
2626 break;
2627 st->bucket = 0;
2628 /* Fallthrough */
2629 case TCP_SEQ_STATE_ESTABLISHED:
2630 case TCP_SEQ_STATE_TIME_WAIT:
2631 st->state = TCP_SEQ_STATE_ESTABLISHED;
2632 if (st->bucket > tcp_hashinfo.ehash_mask)
2633 break;
2634 rc = established_get_first(seq);
2635 while (offset-- && rc)
2636 rc = established_get_next(seq, rc);
2637 }
2638
2639 st->num = orig_num;
2640
2641 return rc;
2642}
2643
1da177e4
LT
2644static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2645{
5799de0b 2646 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2647 void *rc;
2648
2649 if (*pos && *pos == st->last_pos) {
2650 rc = tcp_seek_last_pos(seq);
2651 if (rc)
2652 goto out;
2653 }
2654
1da177e4
LT
2655 st->state = TCP_SEQ_STATE_LISTENING;
2656 st->num = 0;
a8b690f9
TH
2657 st->bucket = 0;
2658 st->offset = 0;
2659 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2660
2661out:
2662 st->last_pos = *pos;
2663 return rc;
1da177e4
LT
2664}
2665
2666static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2667{
a8b690f9 2668 struct tcp_iter_state *st = seq->private;
1da177e4 2669 void *rc = NULL;
1da177e4
LT
2670
2671 if (v == SEQ_START_TOKEN) {
2672 rc = tcp_get_idx(seq, 0);
2673 goto out;
2674 }
1da177e4
LT
2675
2676 switch (st->state) {
2677 case TCP_SEQ_STATE_OPENREQ:
2678 case TCP_SEQ_STATE_LISTENING:
2679 rc = listening_get_next(seq, v);
2680 if (!rc) {
1da177e4 2681 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2682 st->bucket = 0;
2683 st->offset = 0;
1da177e4
LT
2684 rc = established_get_first(seq);
2685 }
2686 break;
2687 case TCP_SEQ_STATE_ESTABLISHED:
2688 case TCP_SEQ_STATE_TIME_WAIT:
2689 rc = established_get_next(seq, v);
2690 break;
2691 }
2692out:
2693 ++*pos;
a8b690f9 2694 st->last_pos = *pos;
1da177e4
LT
2695 return rc;
2696}
2697
2698static void tcp_seq_stop(struct seq_file *seq, void *v)
2699{
5799de0b 2700 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2701
2702 switch (st->state) {
2703 case TCP_SEQ_STATE_OPENREQ:
2704 if (v) {
463c84b9
ACM
2705 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2706 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4
LT
2707 }
2708 case TCP_SEQ_STATE_LISTENING:
2709 if (v != SEQ_START_TOKEN)
5caea4ea 2710 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4
LT
2711 break;
2712 case TCP_SEQ_STATE_TIME_WAIT:
2713 case TCP_SEQ_STATE_ESTABLISHED:
2714 if (v)
9db66bdc 2715 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2716 break;
2717 }
2718}
2719
73cb88ec 2720int tcp_seq_open(struct inode *inode, struct file *file)
1da177e4 2721{
d9dda78b 2722 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
1da177e4 2723 struct tcp_iter_state *s;
52d6f3f1 2724 int err;
1da177e4 2725
52d6f3f1
DL
2726 err = seq_open_net(inode, file, &afinfo->seq_ops,
2727 sizeof(struct tcp_iter_state));
2728 if (err < 0)
2729 return err;
f40c8174 2730
52d6f3f1 2731 s = ((struct seq_file *)file->private_data)->private;
1da177e4 2732 s->family = afinfo->family;
a8b690f9 2733 s->last_pos = 0;
f40c8174
DL
2734 return 0;
2735}
73cb88ec 2736EXPORT_SYMBOL(tcp_seq_open);
f40c8174 2737
6f8b13bc 2738int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4
LT
2739{
2740 int rc = 0;
2741 struct proc_dir_entry *p;
2742
9427c4b3
DL
2743 afinfo->seq_ops.start = tcp_seq_start;
2744 afinfo->seq_ops.next = tcp_seq_next;
2745 afinfo->seq_ops.stop = tcp_seq_stop;
2746
84841c3c 2747 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
73cb88ec 2748 afinfo->seq_fops, afinfo);
84841c3c 2749 if (!p)
1da177e4
LT
2750 rc = -ENOMEM;
2751 return rc;
2752}
4bc2f18b 2753EXPORT_SYMBOL(tcp_proc_register);
1da177e4 2754
6f8b13bc 2755void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4 2756{
ece31ffd 2757 remove_proc_entry(afinfo->name, net->proc_net);
1da177e4 2758}
4bc2f18b 2759EXPORT_SYMBOL(tcp_proc_unregister);
1da177e4 2760
cf533ea5 2761static void get_openreq4(const struct sock *sk, const struct request_sock *req,
a7cb5a49 2762 struct seq_file *f, int i, kuid_t uid, int *len)
1da177e4 2763{
2e6599cb 2764 const struct inet_request_sock *ireq = inet_rsk(req);
a399a805 2765 long delta = req->expires - jiffies;
1da177e4 2766
5e659e4c 2767 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
71338aa7 2768 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
1da177e4 2769 i,
2e6599cb 2770 ireq->loc_addr,
c720c7e8 2771 ntohs(inet_sk(sk)->inet_sport),
2e6599cb
ACM
2772 ireq->rmt_addr,
2773 ntohs(ireq->rmt_port),
1da177e4
LT
2774 TCP_SYN_RECV,
2775 0, 0, /* could print option size, but that is af dependent. */
2776 1, /* timers active (only the expire timer) */
a399a805 2777 jiffies_delta_to_clock_t(delta),
e6c022a4 2778 req->num_timeout,
a7cb5a49 2779 from_kuid_munged(seq_user_ns(f), uid),
1da177e4
LT
2780 0, /* non standard timer */
2781 0, /* open_requests have no inode */
2782 atomic_read(&sk->sk_refcnt),
5e659e4c
PE
2783 req,
2784 len);
1da177e4
LT
2785}
2786
5e659e4c 2787static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
1da177e4
LT
2788{
2789 int timer_active;
2790 unsigned long timer_expires;
cf533ea5 2791 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2792 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2793 const struct inet_sock *inet = inet_sk(sk);
168a8f58 2794 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2795 __be32 dest = inet->inet_daddr;
2796 __be32 src = inet->inet_rcv_saddr;
2797 __u16 destp = ntohs(inet->inet_dport);
2798 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2799 int rx_queue;
1da177e4 2800
6ba8a3b1
ND
2801 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2802 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2803 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2804 timer_active = 1;
463c84b9
ACM
2805 timer_expires = icsk->icsk_timeout;
2806 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2807 timer_active = 4;
463c84b9 2808 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2809 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2810 timer_active = 2;
cf4c6bf8 2811 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2812 } else {
2813 timer_active = 0;
2814 timer_expires = jiffies;
2815 }
2816
49d09007
ED
2817 if (sk->sk_state == TCP_LISTEN)
2818 rx_queue = sk->sk_ack_backlog;
2819 else
2820 /*
2821		 * because we don't lock the socket, we might find a transient negative value
2822 */
2823 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2824
5e659e4c 2825 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
71338aa7 2826 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
cf4c6bf8 2827 i, src, srcp, dest, destp, sk->sk_state,
47da8ee6 2828 tp->write_seq - tp->snd_una,
49d09007 2829 rx_queue,
1da177e4 2830 timer_active,
a399a805 2831 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2832 icsk->icsk_retransmits,
a7cb5a49 2833 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2834 icsk->icsk_probes_out,
cf4c6bf8
IJ
2835 sock_i_ino(sk),
2836 atomic_read(&sk->sk_refcnt), sk,
7be87351
SH
2837 jiffies_to_clock_t(icsk->icsk_rto),
2838 jiffies_to_clock_t(icsk->icsk_ack.ato),
463c84b9 2839 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1da177e4 2840 tp->snd_cwnd,
168a8f58
JC
2841 sk->sk_state == TCP_LISTEN ?
2842 (fastopenq ? fastopenq->max_qlen : 0) :
2843 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
5e659e4c 2844 len);
1da177e4
LT
2845}
2846
cf533ea5 2847static void get_timewait4_sock(const struct inet_timewait_sock *tw,
5e659e4c 2848 struct seq_file *f, int i, int *len)
1da177e4 2849{
23f33c2d 2850 __be32 dest, src;
1da177e4 2851 __u16 destp, srcp;
a399a805 2852 long delta = tw->tw_ttd - jiffies;
1da177e4
LT
2853
2854 dest = tw->tw_daddr;
2855 src = tw->tw_rcv_saddr;
2856 destp = ntohs(tw->tw_dport);
2857 srcp = ntohs(tw->tw_sport);
2858
5e659e4c 2859 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
71338aa7 2860 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
1da177e4 2861 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2862 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
5e659e4c 2863 atomic_read(&tw->tw_refcnt), tw, len);
1da177e4
LT
2864}
2865
2866#define TMPSZ 150
2867
2868static int tcp4_seq_show(struct seq_file *seq, void *v)
2869{
5799de0b 2870 struct tcp_iter_state *st;
5e659e4c 2871 int len;
1da177e4
LT
2872
2873 if (v == SEQ_START_TOKEN) {
2874 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2875 " sl local_address rem_address st tx_queue "
2876 "rx_queue tr tm->when retrnsmt uid timeout "
2877 "inode");
2878 goto out;
2879 }
2880 st = seq->private;
2881
2882 switch (st->state) {
2883 case TCP_SEQ_STATE_LISTENING:
2884 case TCP_SEQ_STATE_ESTABLISHED:
5e659e4c 2885 get_tcp4_sock(v, seq, st->num, &len);
1da177e4
LT
2886 break;
2887 case TCP_SEQ_STATE_OPENREQ:
5e659e4c 2888 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
1da177e4
LT
2889 break;
2890 case TCP_SEQ_STATE_TIME_WAIT:
5e659e4c 2891 get_timewait4_sock(v, seq, st->num, &len);
1da177e4
LT
2892 break;
2893 }
5e659e4c 2894 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
1da177e4
LT
2895out:
2896 return 0;
2897}
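/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * /proc/net/tcp line as get_tcp4_sock() above would print it for a
 * listener bound to 127.0.0.1:22 (addresses and ports in hex):
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 ffff880012345678 100 0 0 10 0
 *
 * i.e. slot, local and remote address, state (0A == TCP_LISTEN), tx/rx
 * queues, timer info, retransmits, uid, timeout, inode, refcount, socket
 * pointer, then rto, ato, quick/pingpong, cwnd and the ssthresh/fastopen
 * column.  All values here are made up for illustration.
 */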
2898
73cb88ec
AV
2899static const struct file_operations tcp_afinfo_seq_fops = {
2900 .owner = THIS_MODULE,
2901 .open = tcp_seq_open,
2902 .read = seq_read,
2903 .llseek = seq_lseek,
2904 .release = seq_release_net
2905};
2906
1da177e4 2907static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4
LT
2908 .name = "tcp",
2909 .family = AF_INET,
73cb88ec 2910 .seq_fops = &tcp_afinfo_seq_fops,
9427c4b3
DL
2911 .seq_ops = {
2912 .show = tcp4_seq_show,
2913 },
1da177e4
LT
2914};
2915
2c8c1e72 2916static int __net_init tcp4_proc_init_net(struct net *net)
757764f6
PE
2917{
2918 return tcp_proc_register(net, &tcp4_seq_afinfo);
2919}
2920
2c8c1e72 2921static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6
PE
2922{
2923 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2924}
2925
2926static struct pernet_operations tcp4_net_ops = {
2927 .init = tcp4_proc_init_net,
2928 .exit = tcp4_proc_exit_net,
2929};
2930
1da177e4
LT
2931int __init tcp4_proc_init(void)
2932{
757764f6 2933 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2934}
2935
2936void tcp4_proc_exit(void)
2937{
757764f6 2938 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2939}
2940#endif /* CONFIG_PROC_FS */
2941
bf296b12
HX
2942struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2943{
b71d1d42 2944 const struct iphdr *iph = skb_gro_network_header(skb);
861b6501
ED
2945 __wsum wsum;
2946 __sum16 sum;
bf296b12
HX
2947
2948 switch (skb->ip_summed) {
2949 case CHECKSUM_COMPLETE:
86911732 2950 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
bf296b12
HX
2951 skb->csum)) {
2952 skb->ip_summed = CHECKSUM_UNNECESSARY;
2953 break;
2954 }
861b6501 2955flush:
bf296b12
HX
2956 NAPI_GRO_CB(skb)->flush = 1;
2957 return NULL;
861b6501
ED
2958
2959 case CHECKSUM_NONE:
2960 wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2961 skb_gro_len(skb), IPPROTO_TCP, 0);
2962 sum = csum_fold(skb_checksum(skb,
2963 skb_gro_offset(skb),
2964 skb_gro_len(skb),
2965 wsum));
2966 if (sum)
2967 goto flush;
2968
2969 skb->ip_summed = CHECKSUM_UNNECESSARY;
2970 break;
bf296b12
HX
2971 }
2972
2973 return tcp_gro_receive(head, skb);
2974}
bf296b12
HX
2975
2976int tcp4_gro_complete(struct sk_buff *skb)
2977{
b71d1d42 2978 const struct iphdr *iph = ip_hdr(skb);
bf296b12
HX
2979 struct tcphdr *th = tcp_hdr(skb);
2980
2981 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2982 iph->saddr, iph->daddr, 0);
2983 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2984
2985 return tcp_gro_complete(skb);
2986}
bf296b12 2987
1da177e4
LT
2988struct proto tcp_prot = {
2989 .name = "TCP",
2990 .owner = THIS_MODULE,
2991 .close = tcp_close,
2992 .connect = tcp_v4_connect,
2993 .disconnect = tcp_disconnect,
463c84b9 2994 .accept = inet_csk_accept,
1da177e4
LT
2995 .ioctl = tcp_ioctl,
2996 .init = tcp_v4_init_sock,
2997 .destroy = tcp_v4_destroy_sock,
2998 .shutdown = tcp_shutdown,
2999 .setsockopt = tcp_setsockopt,
3000 .getsockopt = tcp_getsockopt,
1da177e4 3001 .recvmsg = tcp_recvmsg,
7ba42910
CG
3002 .sendmsg = tcp_sendmsg,
3003 .sendpage = tcp_sendpage,
1da177e4 3004 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 3005 .release_cb = tcp_release_cb,
ab1e0a13
ACM
3006 .hash = inet_hash,
3007 .unhash = inet_unhash,
3008 .get_port = inet_csk_get_port,
1da177e4
LT
3009 .enter_memory_pressure = tcp_enter_memory_pressure,
3010 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 3011 .orphan_count = &tcp_orphan_count,
1da177e4
LT
3012 .memory_allocated = &tcp_memory_allocated,
3013 .memory_pressure = &tcp_memory_pressure,
1da177e4
LT
3014 .sysctl_wmem = sysctl_tcp_wmem,
3015 .sysctl_rmem = sysctl_tcp_rmem,
3016 .max_header = MAX_TCP_HEADER,
3017 .obj_size = sizeof(struct tcp_sock),
3ab5aee7 3018 .slab_flags = SLAB_DESTROY_BY_RCU,
6d6ee43e 3019 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 3020 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 3021 .h.hashinfo = &tcp_hashinfo,
7ba42910 3022 .no_autobind = true,
543d9cfe
ACM
3023#ifdef CONFIG_COMPAT
3024 .compat_setsockopt = compat_tcp_setsockopt,
3025 .compat_getsockopt = compat_tcp_getsockopt,
3026#endif
c255a458 3027#ifdef CONFIG_MEMCG_KMEM
d1a4c0b3
GC
3028 .init_cgroup = tcp_init_cgroup,
3029 .destroy_cgroup = tcp_destroy_cgroup,
3030 .proto_cgroup = tcp_proto_cgroup,
3031#endif
1da177e4 3032};
4bc2f18b 3033EXPORT_SYMBOL(tcp_prot);
1da177e4 3034
6bed3166
ED
3035static void __net_exit tcp_sk_exit(struct net *net)
3036{
3037 int cpu;
3038
3039 for_each_possible_cpu(cpu)
3040 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3041 free_percpu(net->ipv4.tcp_sk);
3042}
3043
046ee902
DL
3044static int __net_init tcp_sk_init(struct net *net)
3045{
6bed3166
ED
3046 int res, cpu;
3047
3048 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3049 if (!net->ipv4.tcp_sk)
3050 return -ENOMEM;
3051
3052 for_each_possible_cpu(cpu) {
3053 struct sock *sk;
3054
3055 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3056 IPPROTO_TCP, net);
3057 if (res)
3058 goto fail;
3059 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3060 }
5d134f1c 3061 net->ipv4.sysctl_tcp_ecn = 2;
be9f4a44 3062 return 0;
046ee902 3063
6bed3166
ED
3064fail:
3065 tcp_sk_exit(net);
3066
3067 return res;
b099ce26
EB
3068}
3069
3070static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3071{
3072 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
046ee902
DL
3073}
3074
3075static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
3076 .init = tcp_sk_init,
3077 .exit = tcp_sk_exit,
3078 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
3079};
3080
9b0f976f 3081void __init tcp_v4_init(void)
1da177e4 3082{
5caea4ea 3083 inet_hashinfo_init(&tcp_hashinfo);
6a1b3054 3084 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 3085 panic("Failed to create the TCP control socket.\n");
1da177e4 3086}