/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/inet_hashtables.h>
#include <net/transp_v6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk, true);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	printk(KERN_INFO "[socket_conn]IPV4 socket[%lu] sport:%u \n",
	       SOCK_INODE(sk->sk_socket)->i_ino, ntohs(inet->inet_sport));
	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
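
/* Handle an incoming ICMP redirect for an established socket: re-validate the
 * cached route and, when it is still usable, let the dst's redirect handler
 * update the next hop.
 */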
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *req;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	req = tp->fastopen_rsk;
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt) &&
	    (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
		/* For a Fast Open socket, allow seq to be snt_isn. */
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		/* XXX (TFO) - revisit the following logic for TFO */

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, sysctl_tcp_rto_max);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	/* XXX (TFO) - if it's a TFO socket and has been accepted, rather
	 * than following the TCP_SYN_RECV case and closing the socket,
	 * we ignore the ICMP error and keep trying like a fully established
	 * socket. Is this the right thing to do?
	 */
	if (req && req->sk == NULL)
		goto out;

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can f.e. if SYNs crossed,
			       or Fast Open.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
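
/* Fill in the TCP checksum of an outgoing segment.  With CHECKSUM_PARTIAL
 * only the pseudo-header sum is computed here and the device finishes the
 * job; otherwise the full checksum is computed in software.
 */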
static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	return 0;
}
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
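
/* ACK a segment that arrived for a TIME-WAIT socket, echoing the window,
 * timestamps and (if configured) MD5 signature remembered in the timewait
 * block.
 */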
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      u16 queue_mapping,
			      bool nocache)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, NULL);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
		if (!tcp_rsk(req)->snt_synack && !err)
			tcp_rsk(req)->snt_synack = tcp_time_stamp;
	}

	return err;
}

static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
{
	int res = tcp_v4_send_synack(sk, NULL, req, 0, false);

	if (!res)
		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	return res;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}
/*
 * Return true if a syncookie should be sent
 */
bool tcp_syn_flood_action(struct sock *sk,
			  const struct sk_buff *skb,
			  const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
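
/* Illustrative sketch only: keys normally enter this table from userspace via
 * the TCP_MD5SIG socket option, which tcp_v4_parse_md5_keys() below turns
 * into tcp_md5_do_add()/tcp_md5_do_del() calls, roughly:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = klen };
 *	memcpy(&md5.tcpm_addr, &peer, sizeof(peer));	// struct sockaddr_in
 *	memcpy(md5.tcpm_key, secret, klen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * ("peer", "secret" and "klen" above are example names, not part of this file.)
 */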
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk) ||
					   lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));
	if (hlist_empty(&md5sig->head))
		tcp_free_md5sig_pool();
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);
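
/* Drop every MD5 key attached to the socket; called from
 * tcp_v4_destroy_sock() when the socket is already dead, hence the
 * unconditional rcu_dereference_protected().
 */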
static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	if (!hlist_empty(&md5sig->head))
		tcp_free_md5sig_pool();
	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and its wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
}
#endif
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif
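
/* Decide how an incoming SYN interacts with TCP Fast Open on this listener:
 * returns true when the SYN's payload may be accepted straight away (either
 * no cookie is required or the supplied cookie validates), and fills in
 * @valid_foc with the cookie to echo in the SYN-ACK otherwise.
 */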
static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
			       struct request_sock *req,
			       struct tcp_fastopen_cookie *foc,
			       struct tcp_fastopen_cookie *valid_foc)
{
	bool skip_cookie = false;
	struct fastopen_queue *fastopenq;

	if (likely(!fastopen_cookie_present(foc))) {
		/* See include/net/tcp.h for the meaning of these knobs */
		if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
		    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
			skip_cookie = true; /* no cookie to validate */
		else
			return false;
	}
	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
	/* A FO option is present; bump the counter. */
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);

	/* Make sure the listener has enabled fastopen, and we don't
	 * exceed the max # of pending TFO requests allowed before trying
	 * to validating the cookie in order to avoid burning CPU cycles
	 * unnecessarily.
	 *
	 * XXX (TFO) - The implication of checking the max_qlen before
	 * processing a cookie request is that clients can't differentiate
	 * between qlen overflow causing Fast Open to be disabled
	 * temporarily vs a server not supporting Fast Open at all.
	 */
	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
	    fastopenq == NULL || fastopenq->max_qlen == 0)
		return false;

	if (fastopenq->qlen >= fastopenq->max_qlen) {
		struct request_sock *req1;
		spin_lock(&fastopenq->lock);
		req1 = fastopenq->rskq_rst_head;
		if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
			spin_unlock(&fastopenq->lock);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
			/* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
			foc->len = -1;
			return false;
		}
		fastopenq->rskq_rst_head = req1->dl_next;
		fastopenq->qlen--;
		spin_unlock(&fastopenq->lock);
		reqsk_free(req1);
	}
	if (skip_cookie) {
		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		return true;
	}

	if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
		if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
			if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
			    memcmp(&foc->val[0], &valid_foc->val[0],
				   TCP_FASTOPEN_COOKIE_SIZE) != 0)
				return false;
			valid_foc->len = -1;
		}
		/* Acknowledge the data received from the peer. */
		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		return true;
	} else if (foc->len == 0) { /* Client requesting a cookie */
		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
		NET_INC_STATS_BH(sock_net(sk),
				 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
	} else {
		/* Client sent a cookie with wrong size. Treat it
		 * the same as invalid and return a valid one.
		 */
		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
	}
	return false;
}
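
/* Fast Open passive side: create the child socket straight from the SYN,
 * transmit the SYN-ACK prepared by the caller, queue any payload the SYN
 * carried, and add the child directly to the accept queue.
 */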
static int tcp_v4_conn_req_fastopen(struct sock *sk,
				    struct sk_buff *skb,
				    struct sk_buff *skb_synack,
				    struct request_sock *req)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct sock *child;
	int err;

	req->num_retrans = 0;
	req->num_timeout = 0;
	req->sk = NULL;

	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL) {
		NET_INC_STATS_BH(sock_net(sk),
				 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
		kfree_skb(skb_synack);
		return -1;
	}
	err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
				    ireq->rmt_addr, ireq->opt);
	err = net_xmit_eval(err);
	if (!err)
		tcp_rsk(req)->snt_synack = tcp_time_stamp;
	/* XXX (TFO) - is it ok to ignore error and continue? */

	spin_lock(&queue->fastopenq->lock);
	queue->fastopenq->qlen++;
	spin_unlock(&queue->fastopenq->lock);

	/* Initialize the child socket. Have to fix some values to take
	 * into account the child is a Fast Open socket and is created
	 * only out of the bits carried in the SYN packet.
	 */
	tp = tcp_sk(child);

	tp->fastopen_rsk = req;
	/* Do a hold on the listner sk so that if the listener is being
	 * closed, the child that has been accepted can live on and still
	 * access listen_lock.
	 */
	sock_hold(sk);
	tcp_rsk(req)->listener = sk;

	/* RFC1323: The window in SYN & SYN/ACK segments is never
	 * scaled. So correct it appropriately.
	 */
	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);

	/* Activate the retrans timer so that SYNACK can be retransmitted.
	 * The request socket is not added to the SYN table of the parent
	 * because it's been added to the accept queue directly.
	 */
	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
				  TCP_TIMEOUT_INIT, sysctl_tcp_rto_max);

	/* Add the child socket directly into the accept queue */
	inet_csk_reqsk_queue_add(sk, req, child);

	/* Now finish processing the fastopen child socket. */
	inet_csk(child)->icsk_af_ops->rebuild_header(child);
	tcp_init_congestion_control(child);
	tcp_mtup_init(child);
	tcp_init_buffer_space(child);
	tcp_init_metrics(child);

	/* Queue the data carried in the SYN packet. We need to first
	 * bump skb's refcnt because the caller will attempt to free it.
	 *
	 * XXX (TFO) - we honor a zero-payload TFO request for now.
	 * (Any reason not to?)
	 */
	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
		/* Don't queue the skb if there is no payload in SYN.
		 * XXX (TFO) - How about SYN+FIN?
		 */
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
	} else {
		skb = skb_get(skb);
		skb_dst_drop(skb);
		__skb_pull(skb, tcp_hdr(skb)->doff * 4);
		skb_set_owner_r(skb, child);
		__skb_queue_tail(&child->sk_receive_queue, skb);
		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
		tp->syn_data_acked = 1;
	}
	sk->sk_data_ready(sk, 0);
	bh_unlock_sock(child);
	sock_put(child);
	WARN_ON(req->sk == NULL);
	return 0;
}
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	bool want_cookie = false;
	struct flowi4 fl4;
	struct tcp_fastopen_cookie foc = { .len = -1 };
	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
	struct sk_buff *skb_synack;
	int do_fastopen;

	/* Never answer to SYNs send to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie)
			goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		goto drop;
	}

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(skb);
	ireq->ir_mark = inet_request_mark(sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, skb, sock_net(sk));

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
		    fl4.daddr == saddr) {
			if (!tcp_peer_is_proven(req, dst, true)) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 !tcp_peer_is_proven(req, dst, false)) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (dst == NULL) {
		dst = inet_csk_route_req(sk, &fl4, req);
		if (dst == NULL)
			goto drop_and_free;
	}
	do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);

	/* We don't call tcp_v4_send_synack() directly because we need
	 * to make sure a child socket can be created successfully before
	 * sending back synack!
	 *
	 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
	 * (or better yet, call tcp_send_synack() in the child context
	 * directly, but will have to fix bunch of other code first)
	 * after syn_recv_sock() except one will need to first fix the
	 * latter to remove its dependency on the current implementation
	 * of tcp_v4_send_synack()->tcp_select_initial_window().
	 */
	skb_synack = tcp_make_synack(sk, dst, req,
	    fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);

	if (skb_synack) {
		__tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
		skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
	} else
		goto drop_and_free;

	if (likely(!do_fastopen)) {
		int err;
		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
		     ireq->rmt_addr, ireq->opt);
		err = net_xmit_eval(err);
		if (err || want_cookie)
			goto drop_and_free;

		tcp_rsk(req)->snt_synack = tcp_time_stamp;
		tcp_rsk(req)->listener = NULL;
		/* Add the request_sock to the SYN table */
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
		if (fastopen_cookie_present(&foc) && foc.len != 0)
			NET_INC_STATS_BH(sock_net(sk),
			    LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
		goto drop_and_free;

	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->rmt_addr;
	newinet->inet_rcv_saddr = ireq->loc_addr;
	newinet->inet_saddr   = ireq->loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);
	tcp_synack_rtt_meas(newsk, req);
	newtp->total_retrans = req->num_retrans;

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev, false);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}
/* The socket must have it's spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    dst->ops->check(dst, 0) == NULL) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk->sk_state != TCP_TIME_WAIT) {
			struct dst_entry *dst = ACCESS_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}
/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	skb_dst_drop(skb);
	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (tp->ucopy.memory > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
			sk_backlog_rcv(sk, skb1);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPPREQUEUEDROPPED);
		}

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  sysctl_tcp_rto_max);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);
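
/* Main IPv4 receive entry point: validate the header and checksum, look the
 * segment up in the established/listening hash tables, and either process it
 * directly, prequeue it, or push it onto the socket backlog depending on who
 * currently owns the socket.
 */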
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto csum_error;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = net_dma_find_channel();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb,
					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
csum_error:
		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2)) {
		inet_twsk_put(inet_twsk(sk));
		goto bad_packet;
	}
	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	dst_hold(dst);
	sk->sk_rx_dst = dst;
	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
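
/* Address-family specific callbacks used by the connection-oriented socket
 * core for plain IPv4 TCP sockets; the IPv6 code installs its own table.
 */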
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);
	icsk->icsk_MMSRB = 0;

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk != NULL);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);

	sk_sockets_allocated_dec(sk);
	sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
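
/* The two helpers below appear to be a vendor/Android addition rather than
 * mainline code: they walk the established hash table and act on every
 * socket owned by a given uid, either shortening its retransmit timer
 * (tcp_v4_handle_retrans_time_by_uid) or reporting the supplied error to it
 * (tcp_v4_reset_connections_by_uid).
 */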
void tcp_v4_handle_retrans_time_by_uid(struct uid_err uid_e)
{
	unsigned int bucket;
	uid_t skuid = (uid_t)(uid_e.appuid);
	struct inet_connection_sock *icsk = NULL;//inet_csk(sk);

	for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) {
		struct hlist_nulls_node *node;
		struct sock *sk;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket);

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) {

			if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT)
				continue;
			if (sock_flag(sk, SOCK_DEAD))
				continue;
			if (!sk->sk_socket)
				continue;
			if(SOCK_INODE(sk->sk_socket)->i_uid != skuid)
				continue;

			printk("[mmspb] tcp_v4_handle_retrans_time_by_uid socket uid(%d) match!",
				SOCK_INODE(sk->sk_socket)->i_uid);

			sock_hold(sk);
			spin_unlock_bh(lock);

			bh_lock_sock(sk);

			// update sk time out value
			icsk = inet_csk(sk);
			printk("[mmspb] tcp_v4_handle_retrans_time_by_uid update timer\n");

			sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + 2);
			icsk->icsk_rto = sysctl_tcp_rto_min * 30;
			icsk->icsk_MMSRB = 1;

			bh_unlock_sock(sk);
			sock_put(sk);
			spin_lock_bh(lock);
		}
		spin_unlock_bh(lock);
	}
}
/*
 * tcp_v4_reset_connections_by_uid - destroy all sockets owned by a special uid
 */
void tcp_v4_reset_connections_by_uid(struct uid_err uid_e)
{
	unsigned int bucket;
	uid_t skuid = (uid_t)(uid_e.appuid);

	for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) {
		struct hlist_nulls_node *node;
		struct sock *sk;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket);

restart:
		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) {

			if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT)
				continue;

			if (sock_flag(sk, SOCK_DEAD))
				continue;

			if (SOCK_INODE(sk->sk_socket)->i_uid != skuid)
				continue;

			printk(KERN_INFO "SIOCKILLSOCK socket uid(%d) match!",
			       SOCK_INODE(sk->sk_socket)->i_uid);

			sock_hold(sk);
			spin_unlock_bh(lock);

			local_bh_disable();
			bh_lock_sock(sk);
			sk->sk_err = uid_e.errNum;
			printk(KERN_INFO "SIOCKILLSOCK set sk err == %d!! \n", sk->sk_err);
			sk->sk_error_report(sk);

			tcp_done(sk);
			bh_unlock_sock(sk);
			local_bh_enable();
			sock_put(sk);

			goto restart;
		}
		spin_unlock_bh(lock);
	}
}
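/*
 * Illustrative only: a caller (for example a private ioctl or an Android
 * netd helper; the surrounding handler is an assumption, not part of this
 * file) would typically drive the two uid hooks above like this:
 *
 *	struct uid_err uid_e = {
 *		.appuid = target_uid,
 *		.errNum = ECONNABORTED,
 *	};
 *
 *	tcp_v4_reset_connections_by_uid(uid_e);	  // abort all of the uid's sockets
 *
 *	uid_e.errNum = 0;
 *	tcp_v4_handle_retrans_time_by_uid(uid_e); // or only force quick retransmits
 *
 * Only struct uid_err, its appuid/errNum members and the two functions
 * themselves come from this file.
 */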
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
{
	return hlist_nulls_empty(head) ? NULL :
		list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return !is_a_nulls(tw->tw_node.next) ?
		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
/*
 * Get the next listener socket following cur.  If cur is NULL, get the first
 * socket starting from bucket given in st->bucket; when st->bucket is zero
 * the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static inline bool empty_bucket(struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		struct inet_timewait_sock *tw;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket].twchain) {
			if (tw->tw_family != st->family ||
			    !net_eq(twsk_net(tw), net)) {
				continue;
			}
			rc = tw;
			goto out;
		}
		spin_unlock_bh(lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* Look for next non empty bucket */
		st->offset = 0;
		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
				empty_bucket(st))
			;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			return NULL;

		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
	} else
		sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
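/*
 * tcp_seek_last_pos() lets sequential reads of /proc/net/tcp resume from the
 * bucket/offset cached in the iterator state instead of rewalking the whole
 * hash table on every read(); tcp_seq_start() falls back to a full walk via
 * tcp_get_idx() whenever that cached position cannot be used.
 */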
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			  sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);
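/*
 * tcp_proc_register()/tcp_proc_unregister() create and remove the per-netns
 * /proc/net/<name> entry described by an afinfo structure; the IPv4 user is
 * tcp4_seq_afinfo below, wired up from the pernet init/exit callbacks.
 */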
int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);
void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct sock *sk, const struct request_sock *req,
			 struct seq_file *f, int i, kuid_t uid, int *len)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->inet_sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f), uid),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req,
		len);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/*
		 * because we don't lock the socket, we might find a transient negative value
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		sk->sk_state == TCP_LISTEN ?
		    (fastopenq ? fastopenq->max_qlen : 0) :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
		len);
}
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i, int *len)
{
	__be32 dest, src;
	__u16 destp, srcp;
	long delta = tw->tw_ttd - jiffies;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw, len);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	int len;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, seq, st->num, &len);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, seq, st->num, &len);
		break;
	}
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
	return 0;
}
static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};
static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
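/*
 * GRO receive: validate (or compute and fold) the TCP checksum against the
 * IPv4 pseudo header before letting the generic tcp_gro_receive() try to
 * coalesce this segment with already-held ones.
 */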
struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	const struct iphdr *iph = skb_gro_network_header(skb);
	__wsum wsum;
	__sum16 sum;

	switch (skb->ip_summed) {
	case CHECKSUM_COMPLETE:
		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
				  skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			break;
		}
flush:
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;

	case CHECKSUM_NONE:
		wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
					  skb_gro_len(skb), IPPROTO_TCP, 0);
		sum = csum_fold(skb_checksum(skb,
					     skb_gro_offset(skb),
					     skb_gro_len(skb),
					     wsum));
		if (sum)
			goto flush;

		skb->ip_summed = CHECKSUM_UNNECESSARY;
		break;
	}

	return tcp_gro_receive(head, skb);
}
int tcp4_gro_complete(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
				  iph->saddr, iph->daddr, 0);
	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;

	return tcp_gro_complete(skb);
}
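/*
 * The AF_INET struct proto instance: the socket layer dispatches TCP socket
 * operations (connect, sendmsg, recvmsg, destroy, ...) through this table to
 * the IPv4-specific handlers in this file and the generic ones in tcp.c.
 */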
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);
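/*
 * Per-netns setup/teardown: tcp_sk_init() gives each namespace one raw
 * control socket per possible CPU in net->ipv4.tcp_sk (used when TCP must
 * transmit without an owning socket, e.g. RSTs and TIME_WAIT ACKs) and sets
 * the namespace ECN default; tcp_sk_exit() and tcp_sk_exit_batch() undo it.
 */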
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}
	net->ipv4.sysctl_tcp_ecn = 2;
	return 0;

fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};
void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}