source: G950F DSE4
[GitHub/exynos8895/android_kernel_samsung_universal8895.git] net/ipv4/tcp_ipv4.c
/*
 * INET     An implementation of the TCP/IP protocol suite for the LINUX
 *          operating system.  INET is implemented using the BSD Socket
 *          interface as the means of communication with the user level.
 *
 *          Implementation of the Transmission Control Protocol(TCP).
 *
 *          IPv4 specific functions
 *
 *
 *          code split from:
 *          linux/ipv4/tcp.c
 *          linux/ipv4/tcp_input.c
 *          linux/ipv4/tcp_output.c
 *
 *          See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *      David S. Miller     :   New socket lookup architecture.
 *                              This code is dedicated to John Dyson.
 *      David S. Miller     :   Change semantics of established hash,
 *                              half is devoted to TIME_WAIT sockets
 *                              and the rest go in the other half.
 *      Andi Kleen          :   Add support for syncookies and fixed
 *                              some bugs: ip options weren't passed to
 *                              the TCP layer, missed a check for an
 *                              ACK bit.
 *      Andi Kleen          :   Implemented fast path mtu discovery.
 *                              Fixed many serious bugs in the
 *                              request_sock handling and moved
 *                              most of it into the af independent code.
 *                              Added tail drop and some other bugfixes.
 *                              Added new listen semantics.
 *      Mike McLagan        :   Routing by source
 *      Juan Jose Ciarlante :   ip_dynaddr bits
 *      Andi Kleen          :   various fixes.
 *      Vitaly E. Lavrov    :   Transparent proxy revived after year
 *                              coma.
 *      Andi Kleen          :   Fix new listen.
 *      Andi Kleen          :   Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:  Support IPV6_V6ONLY socket option, which
 *      Alexey Kuznetsov               allow both IPv4 and IPv6 sockets to bind
 *                                     a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#ifdef CONFIG_MPTCP
#include <net/mptcp.h>
#include <net/mptcp_v4.h>
#endif
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
    return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr,
                                      tcp_hdr(skb)->dest,
                                      tcp_hdr(skb)->source);
}
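
/*
 * Note: the initial sequence number is derived from a keyed hash of the
 * connection 4-tuple plus a clock component (see secure_tcp_sequence_number()),
 * so ISNs are hard for an off-path attacker to predict while still advancing
 * monotonically for a reused 4-tuple.
 */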

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
    const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
    struct tcp_sock *tp = tcp_sk(sk);

    /* With PAWS, it is safe from the viewpoint
       of data integrity. Even without PAWS it is safe provided sequence
       spaces do not overlap i.e. at data rates <= 80Mbit/sec.

       Actually, the idea is close to VJ's one, only timestamp cache is
       held not per host, but per port pair and TW bucket is used as state
       holder.

       If TW bucket has been already destroyed we fall back to VJ's scheme
       and use initial timestamp retrieved from peer table.
     */
    if (tcptw->tw_ts_recent_stamp &&
        (!twp || (sysctl_tcp_tw_reuse &&
                  get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
        tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
        if (tp->write_seq == 0)
            tp->write_seq = 1;
        tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
        tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
        sock_hold(sktw);
        return 1;
    }

    return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
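
/*
 * In effect: with sysctl_tcp_tw_reuse enabled, a TIME_WAIT port pair may be
 * reused for a new outgoing connection more than one second after the last
 * timestamp was seen; the new write_seq starts 65535 + 2 past tw_snd_nxt so
 * stray segments from the old incarnation fall outside the new window.
 */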

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
    struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
    struct inet_sock *inet = inet_sk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    __be16 orig_sport, orig_dport;
    __be32 daddr, nexthop;
    struct flowi4 *fl4;
    struct rtable *rt;
    int err;
    struct ip_options_rcu *inet_opt;

    if (addr_len < sizeof(struct sockaddr_in))
        return -EINVAL;

    if (usin->sin_family != AF_INET)
        return -EAFNOSUPPORT;

    nexthop = daddr = usin->sin_addr.s_addr;
    inet_opt = rcu_dereference_protected(inet->inet_opt,
                                         sock_owned_by_user(sk));
    if (inet_opt && inet_opt->opt.srr) {
        if (!daddr)
            return -EINVAL;
        nexthop = inet_opt->opt.faddr;
    }

    orig_sport = inet->inet_sport;
    orig_dport = usin->sin_port;
    fl4 = &inet->cork.fl.u.ip4;
    rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                          RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                          IPPROTO_TCP,
                          orig_sport, orig_dport, sk);
    if (IS_ERR(rt)) {
        err = PTR_ERR(rt);
        if (err == -ENETUNREACH)
            IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
        return err;
    }

    if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
        ip_rt_put(rt);
        return -ENETUNREACH;
    }

    if (!inet_opt || !inet_opt->opt.srr)
        daddr = fl4->daddr;

    if (!inet->inet_saddr)
        inet->inet_saddr = fl4->saddr;
    sk_rcv_saddr_set(sk, inet->inet_saddr);

    if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
        /* Reset inherited state */
        tp->rx_opt.ts_recent       = 0;
        tp->rx_opt.ts_recent_stamp = 0;
        if (likely(!tp->repair))
            tp->write_seq = 0;
    }

    if (tcp_death_row.sysctl_tw_recycle &&
        !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
        tcp_fetch_timewait_stamp(sk, &rt->dst);

    inet->inet_dport = usin->sin_port;
    sk_daddr_set(sk, daddr);

    inet_csk(sk)->icsk_ext_hdr_len = 0;
    if (inet_opt)
        inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

    tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

    /* Socket identity is still unknown (sport may be zero).
     * However we set state to SYN-SENT and not releasing socket
     * lock select source port, enter ourselves into the hash tables and
     * complete initialization after this.
     */
    tcp_set_state(sk, TCP_SYN_SENT);
    err = inet_hash_connect(&tcp_death_row, sk);
    if (err)
        goto failure;

    sk_set_txhash(sk);

    rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                           inet->inet_sport, inet->inet_dport, sk);
    if (IS_ERR(rt)) {
        err = PTR_ERR(rt);
        rt = NULL;
        goto failure;
    }
    /* OK, now commit destination to socket. */
    sk->sk_gso_type = SKB_GSO_TCPV4;
    sk_setup_caps(sk, &rt->dst);

    if (!tp->write_seq && likely(!tp->repair))
        tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                                                   inet->inet_daddr,
                                                   inet->inet_sport,
                                                   usin->sin_port);

    inet->inet_id = tp->write_seq ^ jiffies;

    err = tcp_connect(sk);

    rt = NULL;
    if (err)
        goto failure;

    return 0;

failure:
    /*
     * This unhashes the socket and releases the local port,
     * if necessary.
     */
    tcp_set_state(sk, TCP_CLOSE);
    ip_rt_put(rt);
    sk->sk_route_caps = 0;
    inet->inet_dport = 0;
    return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
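
/*
 * Rough call order above: resolve the route (honouring any source route
 * option), move to SYN_SENT, pick an ephemeral port via inet_hash_connect(),
 * re-derive the route with the final port pair, choose the ISN, then hand
 * off to tcp_connect() to actually send the SYN.
 */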

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
    struct inet_sock *inet = inet_sk(sk);
    struct dst_entry *dst;
    u32 mtu;

    if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
        return;
    mtu = tcp_sk(sk)->mtu_info;
    dst = inet_csk_update_pmtu(sk, mtu);
    if (!dst)
        return;

    /* Something is about to be wrong... Remember soft error
     * for the case, if this connection will not be able to recover.
     */
    if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
        sk->sk_err_soft = EMSGSIZE;

    mtu = dst_mtu(dst);

    if (inet->pmtudisc != IP_PMTUDISC_DONT &&
        ip_sk_accept_pmtu(sk) &&
        inet_csk(sk)->icsk_pmtu_cookie > mtu) {
        tcp_sync_mss(sk, mtu);

        /* Resend the TCP packet because it's
         * clear that the old packet has been
         * dropped. This is the new "fast" path mtu
         * discovery.
         */
        tcp_simple_retransmit(sk);
    } /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
    struct dst_entry *dst = __sk_dst_check(sk, 0);

    if (dst)
        dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
    struct request_sock *req = inet_reqsk(sk);
    struct net *net = sock_net(sk);

    /* ICMPs are not backlogged, hence we cannot get
     * an established socket here.
     */
    if (seq != tcp_rsk(req)->snt_isn) {
        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
    } else if (abort) {
        /*
         * Still in SYN_RECV, just remove it silently.
         * There is no good way to pass the error to the newly
         * created socket, and POSIX does not want network
         * errors returned from accept().
         */
        inet_csk_reqsk_queue_drop(req->rsk_listener, req);
        NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
    }
    reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
    const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
    struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
    struct inet_connection_sock *icsk;
    struct tcp_sock *tp;
    struct inet_sock *inet;
    const int type = icmp_hdr(icmp_skb)->type;
    const int code = icmp_hdr(icmp_skb)->code;
    struct sock *sk;
#ifdef CONFIG_MPTCP
    struct sock *meta_sk;
#endif
    struct sk_buff *skb;
    struct request_sock *fastopen;
    __u32 seq, snd_una;
    __u32 remaining;
    int err;
    struct net *net = dev_net(icmp_skb->dev);

    sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
                                   th->dest, iph->saddr, ntohs(th->source),
                                   inet_iif(icmp_skb));
    if (!sk) {
        ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
        return;
    }
    if (sk->sk_state == TCP_TIME_WAIT) {
        inet_twsk_put(inet_twsk(sk));
        return;
    }
    seq = ntohl(th->seq);
    if (sk->sk_state == TCP_NEW_SYN_RECV)
        return tcp_req_err(sk, seq,
                           type == ICMP_PARAMETERPROB ||
                           type == ICMP_TIME_EXCEEDED ||
                           (type == ICMP_DEST_UNREACH &&
                            (code == ICMP_NET_UNREACH ||
                             code == ICMP_HOST_UNREACH)));

#ifdef CONFIG_MPTCP
    tp = tcp_sk(sk);
    if (mptcp(tp))
        meta_sk = mptcp_meta_sk(sk);
    else
        meta_sk = sk;

    bh_lock_sock(meta_sk);
#else
    bh_lock_sock(sk);
#endif
    /* If too many ICMPs get dropped on busy
     * servers this needs to be solved differently.
     * We do take care of PMTU discovery (RFC1191) special case :
     * we can receive locally generated ICMP messages while socket is held.
     */
#ifdef CONFIG_MPTCP
    if (sock_owned_by_user(meta_sk)) {
#else
    if (sock_owned_by_user(sk)) {
#endif
        if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
            NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
    }
    if (sk->sk_state == TCP_CLOSE)
        goto out;

    if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
        NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
        goto out;
    }

    icsk = inet_csk(sk);
#ifndef CONFIG_MPTCP
    tp = tcp_sk(sk);
#endif
    /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
    fastopen = tp->fastopen_rsk;
    snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
    if (sk->sk_state != TCP_LISTEN &&
        !between(seq, snd_una, tp->snd_nxt)) {
        NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
        goto out;
    }

    switch (type) {
    case ICMP_REDIRECT:
        if (!sock_owned_by_user(sk))
            do_redirect(icmp_skb, sk);
        goto out;
    case ICMP_SOURCE_QUENCH:
        /* Just silently ignore these. */
        goto out;
    case ICMP_PARAMETERPROB:
        err = EPROTO;
        break;
    case ICMP_DEST_UNREACH:
        if (code > NR_ICMP_UNREACH)
            goto out;

        if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
            /* We are not interested in TCP_LISTEN and open_requests
             * (SYN-ACKs sent out by Linux are always <576bytes so
             * they should go through unfragmented).
             */
            if (sk->sk_state == TCP_LISTEN)
                goto out;

            tp->mtu_info = info;
#ifdef CONFIG_MPTCP
            if (!sock_owned_by_user(meta_sk)) {
#else
            if (!sock_owned_by_user(sk)) {
#endif
                tcp_v4_mtu_reduced(sk);
            } else {
                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
                    sock_hold(sk);
#ifdef CONFIG_MPTCP
                if (mptcp(tp))
                    mptcp_tsq_flags(sk);
#endif
            }
            goto out;
        }

        err = icmp_err_convert[code].errno;
        /* check if icmp_skb allows revert of backoff
         * (see draft-zimmermann-tcp-lcd) */
        if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
            break;
        if (seq != tp->snd_una || !icsk->icsk_retransmits ||
            !icsk->icsk_backoff || fastopen)
            break;

#ifdef CONFIG_MPTCP
        if (sock_owned_by_user(meta_sk))
#else
        if (sock_owned_by_user(sk))
#endif
            break;

        icsk->icsk_backoff--;
        icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
                                       TCP_TIMEOUT_INIT;
        icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

        skb = tcp_write_queue_head(sk);
        BUG_ON(!skb);

        remaining = icsk->icsk_rto -
                    min(icsk->icsk_rto,
                        tcp_time_stamp - tcp_skb_timestamp(skb));

        if (remaining) {
            inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                      remaining, TCP_RTO_MAX);
        } else {
            /* RTO revert clocked out retransmission.
             * Will retransmit now */
#ifdef CONFIG_MPTCP
            tcp_sk(sk)->ops->retransmit_timer(sk);
#else
            tcp_retransmit_timer(sk);
#endif
        }

        break;
    case ICMP_TIME_EXCEEDED:
        err = EHOSTUNREACH;
        break;
    default:
        goto out;
    }

    switch (sk->sk_state) {
    case TCP_SYN_SENT:
    case TCP_SYN_RECV:
        /* Only in fast or simultaneous open. If a fast open socket is
         * already accepted it is treated as a connected one below.
         */
        if (fastopen && !fastopen->sk)
            break;

#ifdef CONFIG_MPTCP
        if (!sock_owned_by_user(meta_sk)) {
#else
        if (!sock_owned_by_user(sk)) {
#endif
            sk->sk_err = err;

            sk->sk_error_report(sk);

            tcp_done(sk);
        } else {
            sk->sk_err_soft = err;
        }
        goto out;
    }

    /* If we've already connected we will keep trying
     * until we time out, or the user gives up.
     *
     * rfc1122 4.2.3.9 allows to consider as hard errors
     * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
     * but it is obsoleted by pmtu discovery).
     *
     * Note, that in modern internet, where routing is unreliable
     * and in each dark corner broken firewalls sit, sending random
     * errors ordered by their masters even these two messages finally lose
     * their original sense (even Linux sends invalid PORT_UNREACHs)
     *
     * Now we are in compliance with RFCs.
     * --ANK (980905)
     */

    inet = inet_sk(sk);
#ifdef CONFIG_MPTCP
    if (!sock_owned_by_user(meta_sk) && inet->recverr) {
#else
    if (!sock_owned_by_user(sk) && inet->recverr) {
#endif
        sk->sk_err = err;
        sk->sk_error_report(sk);
    } else { /* Only an error on timeout */
        sk->sk_err_soft = err;
    }

out:
#ifdef CONFIG_MPTCP
    bh_unlock_sock(meta_sk);
#else
    bh_unlock_sock(sk);
#endif
    sock_put(sk);
}
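
/*
 * The backoff revert above implements draft-zimmermann-tcp-lcd: each
 * net/host-unreachable ICMP that matches the earliest outstanding segment
 * undoes one RTO doubling, so a path whose connectivity has been restored
 * recovers faster than pure exponential backoff would allow.
 */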

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
    struct tcphdr *th = tcp_hdr(skb);

    if (skb->ip_summed == CHECKSUM_PARTIAL) {
        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct tcphdr, check);
    } else {
        th->check = tcp_v4_check(skb->len, saddr, daddr,
                                 csum_partial(th,
                                              th->doff << 2,
                                              skb->csum));
    }
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
    const struct inet_sock *inet = inet_sk(sk);

    __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
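
/*
 * With CHECKSUM_PARTIAL the (complemented) pseudo-header sum is stored in
 * th->check and csum_start/csum_offset tell the offloading NIC where to fold
 * in the checksum over the rest of the segment; otherwise the full checksum
 * is computed in software here.
 */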

/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *               for reset.
 * Answer: if a packet caused RST, it is not for a socket
 *         existing in our system, if it is matched to a socket,
 *         it is just duplicate segment or bug in other side's TCP.
 *         So that we build reply only basing on parameters
 *         arrived with segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

#ifndef CONFIG_MPTCP
static
#endif
void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
    const struct tcphdr *th = tcp_hdr(skb);
    struct {
        struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
        __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
    } rep;
    struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
    struct tcp_md5sig_key *key;
    const __u8 *hash_location = NULL;
    unsigned char newhash[16];
    int genhash;
    struct sock *sk1 = NULL;
#endif
    struct net *net;

    /* Never send a reset in response to a reset. */
    if (th->rst)
        return;

    /* If sk not NULL, it means we did a successful lookup and incoming
     * route had to be correct. prequeue might have dropped our dst.
     */
    if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
        return;

    /* Swap the send and the receive. */
    memset(&rep, 0, sizeof(rep));
    rep.th.dest   = th->source;
    rep.th.source = th->dest;
    rep.th.doff   = sizeof(struct tcphdr) / 4;
    rep.th.rst    = 1;

    if (th->ack) {
        rep.th.seq = th->ack_seq;
    } else {
        rep.th.ack = 1;
        rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                               skb->len - (th->doff << 2));
    }

    memset(&arg, 0, sizeof(arg));
    arg.iov[0].iov_base = (unsigned char *)&rep;
    arg.iov[0].iov_len  = sizeof(rep.th);

    net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
    hash_location = tcp_parse_md5sig_option(th);
    if (!sk && hash_location) {
        /*
         * active side is lost. Try to find listening socket through
         * source port, and then find md5 key through listening socket.
         * we do not lose security here:
         * Incoming packet is checked with md5 hash with finding key,
         * no RST generated if md5 hash doesn't match.
         */
        sk1 = __inet_lookup_listener(net,
                                     &tcp_hashinfo, ip_hdr(skb)->saddr,
                                     th->source, ip_hdr(skb)->daddr,
                                     ntohs(th->source), inet_iif(skb));
        /* don't send rst if it can't find key */
        if (!sk1)
            return;
        rcu_read_lock();
        key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
                                     &ip_hdr(skb)->saddr, AF_INET);
        if (!key)
            goto release_sk1;

        genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
        if (genhash || memcmp(hash_location, newhash, 16) != 0)
            goto release_sk1;
    } else {
        key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
                                         &ip_hdr(skb)->saddr,
                                     AF_INET) : NULL;
    }

    if (key) {
        rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                           (TCPOPT_NOP << 16) |
                           (TCPOPT_MD5SIG << 8) |
                           TCPOLEN_MD5SIG);
        /* Update length and the length the header thinks exists */
        arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
        rep.th.doff = arg.iov[0].iov_len / 4;

        tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                            key, ip_hdr(skb)->saddr,
                            ip_hdr(skb)->daddr, &rep.th);
    }
#endif
    arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                  ip_hdr(skb)->saddr, /* XXX */
                                  arg.iov[0].iov_len, IPPROTO_TCP, 0);
    arg.csumoffset = offsetof(struct tcphdr, check) / 2;
    arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
    /* When socket is gone, all binding information is lost.
     * routing might fail in this case. No choice here, if we choose to force
     * input interface, we will misroute in case of asymmetric route.
     */
    if (sk)
        arg.bound_dev_if = sk->sk_bound_dev_if;

    arg.tos = ip_hdr(skb)->tos;
    arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
    ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                          skb, &TCP_SKB_CB(skb)->header.h4.opt,
                          ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                          &arg, arg.iov[0].iov_len);

    TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
    TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
    if (sk1) {
        rcu_read_unlock();
        sock_put(sk1);
    }
#endif
}
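
/*
 * RST sequence number choice, per RFC 793 reset generation rules: if the
 * offending segment carried an ACK, the RST is sent with seq equal to that
 * ack_seq (acceptable to the peer without an ACK of its own); otherwise the
 * RST ACKs exactly the sequence space the segment occupied.
 */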

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */
#ifdef CONFIG_MPTCP
static void tcp_v4_send_ack(const struct sock *sk,
                            struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos, int mptcp)
#else
static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb,
                            u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
                            int reply_flags, u8 tos)
#endif
{
    const struct tcphdr *th = tcp_hdr(skb);
    struct {
        struct tcphdr th;
        __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
#ifdef CONFIG_MPTCP
                   + ((MPTCP_SUB_LEN_DSS >> 2) +
                      (MPTCP_SUB_LEN_ACK >> 2))
#endif
                  ];
    } rep;
    struct ip_reply_arg arg;
    struct net *net = sock_net(sk);

    memset(&rep.th, 0, sizeof(struct tcphdr));
    memset(&arg, 0, sizeof(arg));

    arg.iov[0].iov_base = (unsigned char *)&rep;
    arg.iov[0].iov_len  = sizeof(rep.th);
    if (tsecr) {
        rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                           (TCPOPT_TIMESTAMP << 8) |
                           TCPOLEN_TIMESTAMP);
        rep.opt[1] = htonl(tsval);
        rep.opt[2] = htonl(tsecr);
        arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
    }

    /* Swap the send and the receive. */
    rep.th.dest    = th->source;
    rep.th.source  = th->dest;
    rep.th.doff    = arg.iov[0].iov_len / 4;
    rep.th.seq     = htonl(seq);
    rep.th.ack_seq = htonl(ack);
    rep.th.ack     = 1;
    rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
    if (key) {
        int offset = (tsecr) ? 3 : 0;

        rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                  (TCPOPT_NOP << 16) |
                                  (TCPOPT_MD5SIG << 8) |
                                  TCPOLEN_MD5SIG);
        arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
        rep.th.doff = arg.iov[0].iov_len / 4;

        tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                            key, ip_hdr(skb)->saddr,
                            ip_hdr(skb)->daddr, &rep.th);
    }
#endif
#ifdef CONFIG_MPTCP
    if (mptcp) {
        int offset = (tsecr) ? 3 : 0;

        /* Construction of 32-bit data_ack */
        rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
                                  ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
                                  (0x20 << 8) |
                                  (0x01));
        rep.opt[offset] = htonl(data_ack);

        arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
        rep.th.doff = arg.iov[0].iov_len / 4;
    }
#endif /* CONFIG_MPTCP */
    arg.flags = reply_flags;
    arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                  ip_hdr(skb)->saddr, /* XXX */
                                  arg.iov[0].iov_len, IPPROTO_TCP, 0);
    arg.csumoffset = offsetof(struct tcphdr, check) / 2;
    if (oif)
        arg.bound_dev_if = oif;
    arg.tos = tos;
    arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
    ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
                          skb, &TCP_SKB_CB(skb)->header.h4.opt,
                          ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                          &arg, arg.iov[0].iov_len);

    TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
    struct inet_timewait_sock *tw = inet_twsk(sk);
    struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
#ifdef CONFIG_MPTCP
    u32 data_ack = 0;
    int mptcp = 0;

    if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
        data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
        mptcp = 1;
    }
#endif
    tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
#ifdef CONFIG_MPTCP
                    data_ack,
#endif
                    tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                    tcp_time_stamp + tcptw->tw_ts_offset,
                    tcptw->tw_ts_recent,
                    tw->tw_bound_dev_if,
                    tcp_twsk_md5_key(tcptw),
                    tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                    tw->tw_tos
#ifdef CONFIG_MPTCP
                    , mptcp
#endif
                    );

    inet_twsk_put(tw);
}

#ifndef CONFIG_MPTCP
static
#endif
void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                           struct request_sock *req)
{
    /* sk->sk_state == TCP_LISTEN or meta-sk -> for regular TCP_SYN_RECV
     * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
     */
#ifdef CONFIG_MPTCP
    u32 seq = (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) ?
              tcp_rsk(req)->snt_isn + 1 :
              tcp_sk(sk)->snd_nxt;
#else
    u32 seq = (sk->sk_state == TCP_LISTEN) ?
              tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt;
#endif

    tcp_v4_send_ack(sk, skb, seq,
                    tcp_rsk(req)->rcv_nxt,
#ifdef CONFIG_MPTCP
                    0,
#endif
                    req->rsk_rcv_wnd,
                    tcp_time_stamp,
                    req->ts_recent,
                    0,
                    tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
                                      AF_INET),
                    inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                    ip_hdr(skb)->tos
#ifdef CONFIG_MPTCP
                    , 0
#endif
                    );
}
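
/*
 * These replies are built on the stack and sent via ip_send_unicast_reply()
 * because in SYN-RECV and TIME-WAIT there is no full socket to transmit
 * from; everything needed is taken from the incoming segment and the
 * minisock state.
 */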

/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
#ifndef CONFIG_MPTCP
static
#endif
int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                       struct flowi *fl,
                       struct request_sock *req,
                       struct tcp_fastopen_cookie *foc,
                       bool attach_req)
{
    const struct inet_request_sock *ireq = inet_rsk(req);
    struct flowi4 fl4;
    int err = -1;
    struct sk_buff *skb;

    /* First, grab a route. */
    if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
        return -1;

    skb = tcp_make_synack(sk, dst, req, foc, attach_req);

    if (skb) {
        __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

        err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                    ireq->ir_rmt_addr,
                                    ireq_opt_deref(ireq));
        err = net_xmit_eval(err);
    }

    return err;
}

/*
 * IPv4 request_sock destructor.
 */
#ifndef CONFIG_MPTCP
static
#endif
void tcp_v4_reqsk_destructor(struct request_sock *req)
{
    kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}


#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
                                         const union tcp_md5_addr *addr,
                                         int family)
{
    const struct tcp_sock *tp = tcp_sk(sk);
    struct tcp_md5sig_key *key;
    unsigned int size = sizeof(struct in_addr);
    const struct tcp_md5sig_info *md5sig;

    /* caller either holds rcu_read_lock() or socket lock */
    md5sig = rcu_dereference_check(tp->md5sig_info,
                                   sock_owned_by_user(sk) ||
                                   lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
    if (!md5sig)
        return NULL;
#if IS_ENABLED(CONFIG_IPV6)
    if (family == AF_INET6)
        size = sizeof(struct in6_addr);
#endif
    hlist_for_each_entry_rcu(key, &md5sig->head, node) {
        if (key->family != family)
            continue;
        if (!memcmp(&key->addr, addr, size))
            return key;
    }
    return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk)
{
    const union tcp_md5_addr *addr;

    addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
    return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
    /* Add Key to the list */
    struct tcp_md5sig_key *key;
    struct tcp_sock *tp = tcp_sk(sk);
    struct tcp_md5sig_info *md5sig;

    key = tcp_md5_do_lookup(sk, addr, family);
    if (key) {
        /* Pre-existing entry - just update that one. */
        memcpy(key->key, newkey, newkeylen);
        key->keylen = newkeylen;
        return 0;
    }

    md5sig = rcu_dereference_protected(tp->md5sig_info,
                                       sock_owned_by_user(sk) ||
                                       lockdep_is_held(&sk->sk_lock.slock));
    if (!md5sig) {
        md5sig = kmalloc(sizeof(*md5sig), gfp);
        if (!md5sig)
            return -ENOMEM;

        sk_nocaps_add(sk, NETIF_F_GSO_MASK);
        INIT_HLIST_HEAD(&md5sig->head);
        rcu_assign_pointer(tp->md5sig_info, md5sig);
    }

    key = sock_kmalloc(sk, sizeof(*key), gfp);
    if (!key)
        return -ENOMEM;
    if (!tcp_alloc_md5sig_pool()) {
        sock_kfree_s(sk, key, sizeof(*key));
        return -ENOMEM;
    }

    memcpy(key->key, newkey, newkeylen);
    key->keylen = newkeylen;
    key->family = family;
    memcpy(&key->addr, addr,
           (family == AF_INET6) ? sizeof(struct in6_addr) :
                                  sizeof(struct in_addr));
    hlist_add_head_rcu(&key->node, &md5sig->head);
    return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);
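
/*
 * The key list hangs off tp->md5sig_info and is protected by the socket
 * lock for writers and RCU for readers, so tcp_md5_do_lookup() can run
 * safely from softirq receive context while keys are added or removed.
 */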

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
    struct tcp_md5sig_key *key;

    key = tcp_md5_do_lookup(sk, addr, family);
    if (!key)
        return -ENOENT;
    hlist_del_rcu(&key->node);
    atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
    kfree_rcu(key, rcu);
    return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
    struct tcp_sock *tp = tcp_sk(sk);
    struct tcp_md5sig_key *key;
    struct hlist_node *n;
    struct tcp_md5sig_info *md5sig;

    md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

    hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
        hlist_del_rcu(&key->node);
        atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
        kfree_rcu(key, rcu);
    }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
    struct tcp_md5sig cmd;
    struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

    if (optlen < sizeof(cmd))
        return -EINVAL;

    if (copy_from_user(&cmd, optval, sizeof(cmd)))
        return -EFAULT;

    if (sin->sin_family != AF_INET)
        return -EINVAL;

    if (!cmd.tcpm_keylen)
        return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                              AF_INET);

    if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
        return -EINVAL;

    return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
                          AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
                          GFP_KERNEL);
}
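
/*
 * Usage sketch (illustrative, not part of this file): userspace installs a
 * key that lands in tcp_v4_parse_md5_keys() via setsockopt(TCP_MD5SIG).
 * The peer address and secret below are placeholders:
 *
 *     struct tcp_md5sig md5 = { };
 *     struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *     sin->sin_family = AF_INET;
 *     sin->sin_addr.s_addr = peer_addr;        // hypothetical peer address
 *     md5.tcpm_keylen = strlen(secret);        // caller-provided secret
 *     memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *     setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key, matching the tcp_md5_do_del()
 * branch above.
 */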

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
                                        __be32 daddr, __be32 saddr, int nbytes)
{
    struct tcp4_pseudohdr *bp;
    struct scatterlist sg;

    bp = &hp->md5_blk.ip4;

    /*
     * 1. the TCP pseudo-header (in the order: source IP address,
     * destination IP address, zero-padded protocol number, and
     * segment length)
     */
    bp->saddr = saddr;
    bp->daddr = daddr;
    bp->pad = 0;
    bp->protocol = IPPROTO_TCP;
    bp->len = cpu_to_be16(nbytes);

    sg_init_one(&sg, bp, sizeof(*bp));
    return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
    struct tcp_md5sig_pool *hp;
    struct hash_desc *desc;

    hp = tcp_get_md5sig_pool();
    if (!hp)
        goto clear_hash_noput;
    desc = &hp->md5_desc;

    if (crypto_hash_init(desc))
        goto clear_hash;
    if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
        goto clear_hash;
    if (tcp_md5_hash_header(hp, th))
        goto clear_hash;
    if (tcp_md5_hash_key(hp, key))
        goto clear_hash;
    if (crypto_hash_final(desc, md5_hash))
        goto clear_hash;

    tcp_put_md5sig_pool();
    return 0;

clear_hash:
    tcp_put_md5sig_pool();
clear_hash_noput:
    memset(md5_hash, 0, 16);
    return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk,
                        const struct sk_buff *skb)
{
    struct tcp_md5sig_pool *hp;
    struct hash_desc *desc;
    const struct tcphdr *th = tcp_hdr(skb);
    __be32 saddr, daddr;

    if (sk) { /* valid for establish/request sockets */
        saddr = sk->sk_rcv_saddr;
        daddr = sk->sk_daddr;
    } else {
        const struct iphdr *iph = ip_hdr(skb);

        saddr = iph->saddr;
        daddr = iph->daddr;
    }

    hp = tcp_get_md5sig_pool();
    if (!hp)
        goto clear_hash_noput;
    desc = &hp->md5_desc;

    if (crypto_hash_init(desc))
        goto clear_hash;

    if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
        goto clear_hash;
    if (tcp_md5_hash_header(hp, th))
        goto clear_hash;
    if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
        goto clear_hash;
    if (tcp_md5_hash_key(hp, key))
        goto clear_hash;
    if (crypto_hash_final(desc, md5_hash))
        goto clear_hash;

    tcp_put_md5sig_pool();
    return 0;

clear_hash:
    tcp_put_md5sig_pool();
clear_hash_noput:
    memset(md5_hash, 0, 16);
    return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
                                    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
    /*
     * This gets called for each TCP segment that arrives
     * so we want to be efficient.
     * We have 3 drop cases:
     * o No MD5 hash and one expected.
     * o MD5 hash and we're not expecting one.
     * o MD5 hash and it's wrong.
     */
    const __u8 *hash_location = NULL;
    struct tcp_md5sig_key *hash_expected;
    const struct iphdr *iph = ip_hdr(skb);
    const struct tcphdr *th = tcp_hdr(skb);
    int genhash;
    unsigned char newhash[16];

    hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
                                      AF_INET);
    hash_location = tcp_parse_md5sig_option(th);

    /* We've parsed the options - do we have a hash? */
    if (!hash_expected && !hash_location)
        return false;

    if (hash_expected && !hash_location) {
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
        return true;
    }

    if (!hash_expected && hash_location) {
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
        return true;
    }

    /* Okay, so this is hash_expected and hash_location -
     * so we need to calculate the checksum.
     */
    genhash = tcp_v4_md5_hash_skb(newhash,
                                  hash_expected,
                                  NULL, skb);

    if (genhash || memcmp(hash_location, newhash, 16) != 0) {
        net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
                             &iph->saddr, ntohs(th->source),
                             &iph->daddr, ntohs(th->dest),
                             genhash ? " tcp_v4_calc_md5_hash failed"
                                     : "");
        return true;
    }
    return false;
#endif
    return false;
}

#ifdef CONFIG_MPTCP
static int tcp_v4_init_req(struct request_sock *req, const struct sock *sk_listener,
                           struct sk_buff *skb, bool want_cookie)
#else
static void tcp_v4_init_req(struct request_sock *req,
                            const struct sock *sk_listener,
                            struct sk_buff *skb)
#endif
{
    struct inet_request_sock *ireq = inet_rsk(req);

    sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
    sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
    ireq->no_srccheck = inet_sk(sk_listener)->transparent;
    RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(skb));

#ifdef CONFIG_MPTCP
    return 0;
#endif
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
                                          struct flowi *fl,
                                          const struct request_sock *req,
                                          bool *strict)
{
    struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

    if (strict) {
        if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
            *strict = true;
        else
            *strict = false;
    }

    return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
    .family          = PF_INET,
    .obj_size        = sizeof(struct tcp_request_sock),
    .rtx_syn_ack     = tcp_rtx_synack,
    .send_ack        = tcp_v4_reqsk_send_ack,
    .destructor      = tcp_v4_reqsk_destructor,
    .send_reset      = tcp_v4_send_reset,
    .syn_ack_timeout = tcp_syn_ack_timeout,
};

#ifndef CONFIG_MPTCP
static
#endif
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
    .mss_clamp       = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
    .req_md5_lookup  = tcp_v4_md5_lookup,
    .calc_md5_hash   = tcp_v4_md5_hash_skb,
#endif
    .init_req        = tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
    .cookie_init_seq = cookie_v4_init_sequence,
#endif
    .route_req       = tcp_v4_route_req,
    .init_seq        = tcp_v4_init_sequence,
    .send_synack     = tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
    /* Never answer SYNs sent to broadcast or multicast */
    if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
        goto drop;

    return tcp_conn_request(&tcp_request_sock_ops,
                            &tcp_request_sock_ipv4_ops, sk, skb);

drop:
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
    return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst,
                                  struct request_sock *req_unhash,
                                  bool *own_req)
{
    struct inet_request_sock *ireq;
    struct inet_sock *newinet;
    struct tcp_sock *newtp;
    struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
    struct tcp_md5sig_key *key;
#endif
    struct ip_options_rcu *inet_opt;

    if (sk_acceptq_is_full(sk))
        goto exit_overflow;

    newsk = tcp_create_openreq_child(sk, req, skb);
    if (!newsk)
        goto exit_nonewsk;

    newsk->sk_gso_type = SKB_GSO_TCPV4;
    inet_sk_rx_dst_set(newsk, skb);

    newtp = tcp_sk(newsk);
    newinet = inet_sk(newsk);
    ireq = inet_rsk(req);
    sk_daddr_set(newsk, ireq->ir_rmt_addr);
    sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
    newinet->inet_saddr = ireq->ir_loc_addr;
    inet_opt = rcu_dereference(ireq->ireq_opt);
    RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
    newinet->mc_index = inet_iif(skb);
    newinet->mc_ttl = ip_hdr(skb)->ttl;
    newinet->rcv_tos = ip_hdr(skb)->tos;
    inet_csk(newsk)->icsk_ext_hdr_len = 0;
    if (inet_opt)
        inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
    newinet->inet_id = newtp->write_seq ^ jiffies;

    if (!dst) {
        dst = inet_csk_route_child_sock(sk, newsk, req);
        if (!dst)
            goto put_and_exit;
    } else {
        /* syncookie case : see end of cookie_v4_check() */
    }
    sk_setup_caps(newsk, dst);

    tcp_ca_openreq_child(newsk, dst);

    tcp_sync_mss(newsk, dst_mtu(dst));
    newtp->advmss = dst_metric_advmss(dst);
    if (tcp_sk(sk)->rx_opt.user_mss &&
        tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
        newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

    tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
    /* Copy over the MD5 key from the original socket */
    key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
                            AF_INET);
    if (key) {
        /*
         * We're using one, so create a matching key
         * on the newsk structure. If we fail to get
         * memory, then we end up not copying the key
         * across. Shucks.
         */
        tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
                       AF_INET, key->key, key->keylen, GFP_ATOMIC);
        sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
    }
#endif

    if (__inet_inherit_port(sk, newsk) < 0)
        goto put_and_exit;
    *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
    if (likely(*own_req)) {
        tcp_move_syn(newtp, req);
        ireq->ireq_opt = NULL;
    } else {
        newinet->inet_opt = NULL;
    }
    return newsk;

exit_overflow:
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
    dst_release(dst);
exit:
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
    return NULL;
put_and_exit:
    newinet->inet_opt = NULL;
    inet_csk_prepare_forced_close(newsk);
    tcp_done(newsk);
    goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
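
/*
 * own_req semantics: inet_ehash_nolisten() reports whether this child
 * actually replaced the request socket in the ehash table; if another CPU
 * got there first, the caller drops this duplicate child (hence inet_opt
 * is cleared on the losing path above so ownership is not double-freed).
 */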

#ifndef CONFIG_MPTCP
static
#endif
struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
    const struct tcphdr *th = tcp_hdr(skb);

    if (!th->syn)
        sk = cookie_v4_check(sk, skb);
#endif
    return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;
#ifdef CONFIG_MPTCP
    if (is_meta_sk(sk))
        return mptcp_v4_do_rcv(sk, skb);
#endif

    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
        struct dst_entry *dst = sk->sk_rx_dst;

        sock_rps_save_rxhash(sk, skb);
        sk_mark_napi_id(sk, skb);
        if (dst) {
            if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
                !dst->ops->check(dst, 0)) {
                dst_release(dst);
                sk->sk_rx_dst = NULL;
            }
        }
        tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
        return 0;
    }

    if (tcp_checksum_complete(skb))
        goto csum_err;

    if (sk->sk_state == TCP_LISTEN) {
        struct sock *nsk = tcp_v4_cookie_check(sk, skb);

        if (!nsk)
            goto discard;
        if (nsk != sk) {
            sock_rps_save_rxhash(nsk, skb);
            sk_mark_napi_id(nsk, skb);
            if (tcp_child_process(sk, nsk, skb)) {
                rsk = nsk;
                goto reset;
            }
            return 0;
        }
    } else
        sock_rps_save_rxhash(sk, skb);

    if (tcp_rcv_state_process(sk, skb)) {
        rsk = sk;
        goto reset;
    }
    return 0;

reset:
    tcp_v4_send_reset(rsk, skb);
discard:
    kfree_skb(skb);
    /* Be careful here. If this function gets more complicated and
     * gcc suffers from register pressure on the x86, sk (in %ebx)
     * might be destroyed here. This current version compiles correctly,
     * but you have been warned.
     */
    return 0;

csum_err:
    TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
    TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
    goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
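
/*
 * Fast path note: for ESTABLISHED sockets the cached sk->sk_rx_dst is
 * revalidated against the incoming interface before tcp_rcv_established()
 * runs, so the common case skips a full route lookup per segment.
 */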

void tcp_v4_early_demux(struct sk_buff *skb)
{
    const struct iphdr *iph;
    const struct tcphdr *th;
    struct sock *sk;

    if (skb->pkt_type != PACKET_HOST)
        return;

    if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
        return;

    iph = ip_hdr(skb);
    th = tcp_hdr(skb);

    if (th->doff < sizeof(struct tcphdr) / 4)
        return;

    sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
                                   iph->saddr, th->source,
                                   iph->daddr, ntohs(th->dest),
                                   skb->skb_iif);
    if (sk) {
        skb->sk = sk;
        skb->destructor = sock_edemux;
        if (sk_fullsock(sk)) {
            struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

            if (dst)
                dst = dst_check(dst, 0);
            if (dst &&
                inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
                skb_dst_set_noref(skb, dst);
        }
    }
}
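
/*
 * Early demux runs from the IP receive path before routing: finding the
 * established socket up front lets the stack reuse the socket's cached
 * input dst instead of doing an independent route lookup for every segment.
 */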

/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)  --ANK
 *
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
    struct tcp_sock *tp = tcp_sk(sk);

    if (sysctl_tcp_low_latency || !tp->ucopy.task)
        return false;

    if (skb->len <= tcp_hdrlen(skb) &&
        skb_queue_len(&tp->ucopy.prequeue) == 0)
        return false;

    /* Before escaping RCU protected region, we need to take care of skb
     * dst. Prequeue is only enabled for established sockets.
     * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
     * Instead of doing full sk_rx_dst validity here, let's perform
     * an optimistic check.
     */
    if (likely(sk->sk_rx_dst))
        skb_dst_drop(skb);
    else
        skb_dst_force_safe(skb);

    __skb_queue_tail(&tp->ucopy.prequeue, skb);
    tp->ucopy.memory += skb->truesize;
    if (tp->ucopy.memory > sk->sk_rcvbuf) {
        struct sk_buff *skb1;

        BUG_ON(sock_owned_by_user(sk));

        while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
            sk_backlog_rcv(sk, skb1);
            NET_INC_STATS_BH(sock_net(sk),
                             LINUX_MIB_TCPPREQUEUEDROPPED);
        }

        tp->ucopy.memory = 0;
    } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
        wake_up_interruptible_sync_poll(sk_sleep(sk),
                                        POLLIN | POLLRDNORM | POLLRDBAND);
        if (!inet_csk_ack_scheduled(sk)
#ifdef CONFIG_MPTCP
            && !mptcp(tp)
#endif
           )
            inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                                      (3 * tcp_rto_min(sk)) / 4,
                                      TCP_RTO_MAX);
    }
    return true;
}
EXPORT_SYMBOL(tcp_prequeue);
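
/*
 * When a reader is blocked in recvmsg(), segments are parked on the
 * prequeue and processed in process context; setting the tcp_low_latency
 * sysctl bypasses this and handles every segment directly in softirq.
 */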

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
    struct tcphdr *th = (struct tcphdr *)skb->data;
    unsigned int eaten = skb->len;
    int err;

    err = sk_filter_trim_cap(sk, skb, th->doff * 4);
    if (!err) {
        eaten -= skb->len;
        TCP_SKB_CB(skb)->end_seq -= eaten;
    }
    return err;
}
EXPORT_SYMBOL(tcp_filter);
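
/*
 * If a socket filter trims the skb, end_seq is adjusted by the number of
 * bytes removed so that sequence accounting still matches the payload
 * actually left in the buffer.
 */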
1692
1da177e4
LT
1693/*
1694 * From tcp_input.c
1695 */
1696
1cac41cb 1697#define RC_RETRY_CNT 3
1da177e4
LT
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
#ifdef CONFIG_MPTCP
	struct sock *meta_sk = NULL;
#endif
	int ret;
	struct net *net = dev_net(skb->dev);
	unsigned int retry_cnt = RC_RETRY_CNT;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided the case of th->doff == 0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	/* This is tricky: we move the IPCB to its correct location inside
	 * TCP_SKB_CB(); barrier() makes sure the compiler won't play
	 * aliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
#ifdef CONFIG_MPTCP
	TCP_SKB_CB(skb)->mptcp_flags = 0;
	TCP_SKB_CB(skb)->dss_off = 0;
#endif
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;

lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
#ifndef CONFIG_MPTCP
	if (!sk)
		goto no_tcp_socket;
#endif

process:
#ifdef CONFIG_MPTCP
	if (sk && sk->sk_state == TCP_TIME_WAIT)
#else
	if (sk->sk_state == TCP_TIME_WAIT)
#endif
		goto do_time_wait;

#ifdef CONFIG_MPTCP
	if (!sk)
		goto no_tcp_socket;
#endif
	/*
	 * FIXME: SEC patch for P171206-06874
	 * If the ACK packets of a three-way handshake are received at the
	 * same time on multiple cores, each core tries to access the request
	 * socket and create a new socket to establish the TCP connection.
	 * Because there is no synchronization scheme protecting the request
	 * socket from this race, the second attempt to create a new socket
	 * fails and its ACK packet is discarded.
	 *
	 * Consequently, if that second ACK carried meaningful data, the data
	 * would be dropped unintentionally. So the second core must wait here
	 * until the new socket has been created by the first core.
	 */
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);

		if (atomic_read(&req->rsk_refcnt) > (2 + 1) && retry_cnt > 0) {
			reqsk_put(req);
			if (retry_cnt == RC_RETRY_CNT)
				NET_INC_STATS_BH(net, LINUX_MIB_TCPRACECNDREQSK);
			retry_cnt--;
			udelay(500);

			goto lookup;
		}

		if (!retry_cnt)
			NET_INC_STATS_BH(net, LINUX_MIB_TCPRACECNDREQSKDROP);
	}
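	/* A hedged reading of the (2 + 1) threshold above: the ehash table and
	 * the request-socket timer normally account for two references and our
	 * own lookup holds a third, so a count above three suggests another
	 * core also holds this request socket and is likely still converting
	 * it, which is why we back off and retry the lookup.
	 */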
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN
#ifdef CONFIG_MPTCP
			     && !is_meta_sk(sk)
#endif
			     )) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		sock_hold(sk);

#ifdef CONFIG_MPTCP
		if (is_meta_sk(sk)) {
			bh_lock_sock(sk);

			if (sock_owned_by_user(sk)) {
				skb->sk = sk;
				if (unlikely(sk_add_backlog(sk, skb,
						sk->sk_rcvbuf + sk->sk_sndbuf))) {
					bh_unlock_sock(sk);
					NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);

					reqsk_put(req);
					goto discard_and_relse;
				}

				reqsk_put(req);
				bh_unlock_sock(sk);
				sock_put(sk);

				return 0;
			}
		}
#endif
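		/* tcp_check_req() either fails (nsk == NULL), tells us to keep
		 * processing on the listener itself (nsk == sk), or hands back
		 * a freshly created child socket for this connection.
		 */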
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk) {
			reqsk_put(req);
#ifdef CONFIG_MPTCP
			if (is_meta_sk(sk))
				bh_unlock_sock(sk);
#endif
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
#ifdef CONFIG_MPTCP
			if (is_meta_sk(sk))
				bh_unlock_sock(sk);
#endif
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

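	/* For an MPTCP subflow, serialization happens on the meta socket:
	 * take its lock instead of the subflow's, and stash the subflow in
	 * skb->sk when the meta socket is owned by user context.
	 */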
#ifdef CONFIG_MPTCP
	if (mptcp(tcp_sk(sk))) {
		meta_sk = mptcp_meta_sk(sk);

		bh_lock_sock_nested(meta_sk);
		if (sock_owned_by_user(meta_sk))
			skb->sk = sk;
	} else {
		meta_sk = sk;
#endif
		bh_lock_sock_nested(sk);
#ifdef CONFIG_MPTCP
	}
#endif
	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
	ret = 0;
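	/* Three delivery paths from here: if no user task owns the socket,
	 * try the prequeue and otherwise process the segment immediately;
	 * if the socket is owned, queue onto the backlog, which is bounded
	 * by rcvbuf + sndbuf.
	 */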
#ifdef CONFIG_MPTCP
	if (!sock_owned_by_user(meta_sk)) {
		if (!tcp_prequeue(meta_sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (unlikely(sk_add_backlog(meta_sk, skb,
				meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
		bh_unlock_sock(meta_sk);
#else
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (unlikely(sk_add_backlog(sk, skb,
				sk->sk_rcvbuf + sk->sk_sndbuf))) {
		bh_unlock_sock(sk);
#endif
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
#ifdef CONFIG_MPTCP
	bh_unlock_sock(meta_sk);
#else
	bh_unlock_sock(sk);
#endif

put_and_return:
	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

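	/* With no matching socket, a bare SYN may still be an MP_JOIN for an
	 * existing MPTCP connection; mptcp_lookup_join() resolves it by token
	 * (negative: send a reset, positive: consumed, zero: fall through).
	 */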
#ifdef CONFIG_MPTCP
	if (!sk && th->syn && !th->ack) {
		int ret = mptcp_lookup_join(skb, NULL);

		if (ret < 0) {
			tcp_v4_send_reset(NULL, skb);
			goto discard_it;
		} else if (ret > 0) {
			return 0;
		}
	}
#endif

	if (tcp_checksum_complete(skb)) {
csum_error:
		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

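	/* TIME_WAIT handling: a new SYN may legitimately reuse the tuple when
	 * a listener exists (TCP_TW_SYN); otherwise we ACK, reset or silently
	 * drop, as decided by tcp_timewait_state_process().
	 */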
do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
#ifdef CONFIG_MPTCP
		if (th->syn && !th->ack) {
			int ret = mptcp_lookup_join(skb, inet_twsk(sk));

			if (ret < 0) {
				tcp_v4_send_reset(NULL, skb);
				goto discard_it;
			} else if (ret > 0) {
				return 0;
			}
		}
#endif
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor = tcp_twsk_destructor,
};

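/* Cache the inbound route on the socket so tcp_v4_early_demux() can attach
 * it to later segments without a route lookup; dst_hold_safe() only takes
 * the reference if the dst is still live.
 */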
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

#ifdef CONFIG_MPTCP
	if (sock_flag(sk, SOCK_MPTCP))
		icsk->icsk_af_ops = &mptcp_v4_specific;
	else
#endif
		icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);
#ifdef CONFIG_MPTCP
	if (mptcp(tp))
		mptcp_destroy_sock(sk);
	if (tp->inside_tk_table)
		mptcp_hash_remove(tp);
#endif

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up the prequeue; it should really be empty by now. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If the socket was aborted during the connect operation */
	tcp_free_fastopen_req(tp);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
	sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket after cur. If cur is NULL, get the first
 * socket starting from the bucket given in st->bucket; when st->bucket
 * is zero, the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get the first established socket, starting from the bucket given in
 * st->bucket. If st->bucket is zero, the very first socket in the hash
 * is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family = afinfo->family;
	s->last_pos = 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start	= tcp_seq_start;
	afinfo->seq_ops.next	= tcp_seq_next;
	afinfo->seq_ops.stop	= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);

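/* Formatters for /proc/net/tcp: one fixed-width line per socket in the
 * classic "sl local_address rem_address st ..." column layout emitted by
 * tcp4_seq_show() below.
 */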
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}

static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_MPTCP
static void tcp_v4_clear_sk(struct sock *sk, int size)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* We do not want to clear the tk_table field, because of RCU lookups. */
	sk_prot_clear_nulls(sk, offsetof(struct tcp_sock, tk_table.next));

	memset(&tp->tk_table.pprev, 0, size - offsetof(struct tcp_sock, tk_table.pprev));
}

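/* Copy everything except the sk_dontcopy_begin..sk_dontcopy_end window and
 * the tk_table hash linkage; pprev is cleared afterwards so the new socket
 * does not appear to be linked into the MPTCP token table.
 */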
void tcp_copy_sk(struct sock *nsk, const struct sock *osk)
{
	struct tcp_sock *ntp = tcp_sk(nsk);
	struct tcp_sock *otp = tcp_sk(osk);

	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       offsetof(struct tcp_sock, tk_table.next) - offsetof(struct sock, sk_dontcopy_end));

	memcpy(&ntp->tk_table.pprev, &otp->tk_table.pprev,
	       osk->sk_prot->obj_size - offsetof(struct tcp_sock, tk_table.pprev));

	ntp->tk_table.pprev = NULL;
}
#endif

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
	.diag_destroy		= tcp_abort,
#ifdef CONFIG_MPTCP
	.clear_sk		= tcp_v4_clear_sk,
	.copy_sk		= tcp_copy_sk,
#endif
};
EXPORT_SYMBOL(tcp_prot);

static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

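	/* One kernel-internal control socket per possible CPU; the stack uses
	 * these to transmit segments (e.g. RSTs) not tied to a user socket.
	 */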
	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}