[GitHub/mt8127/android_kernel_alcatel_ttab.git] net/ipv4/tcp_ipv4.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24 /*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91
92
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95 __be32 daddr, __be32 saddr, const struct tcphdr *th);
96 #endif
97
98 struct inet_hashinfo tcp_hashinfo;
99 EXPORT_SYMBOL(tcp_hashinfo);
100
101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102 {
103 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104 ip_hdr(skb)->saddr,
105 tcp_hdr(skb)->dest,
106 tcp_hdr(skb)->source);
107 }
108
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 struct tcp_sock *tp = tcp_sk(sk);
113
114 /* With PAWS, it is safe from the viewpoint
115 of data integrity. Even without PAWS it is safe provided sequence
116 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
117
118 Actually, the idea is close to VJ's, except the timestamp cache is
119 held not per host but per port pair, and the TW bucket is used as the
120 state holder.
121
122 If the TW bucket has already been destroyed we fall back to VJ's scheme
123 and use the initial timestamp retrieved from the peer table.
124 */
125 if (tcptw->tw_ts_recent_stamp &&
126 (twp == NULL || (sysctl_tcp_tw_reuse &&
127 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129 if (tp->write_seq == 0)
130 tp->write_seq = 1;
131 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
132 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133 sock_hold(sktw);
134 return 1;
135 }
136
137 return 0;
138 }
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140
141 /* This will initiate an outgoing connection. */
142 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143 {
144 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
145 struct inet_sock *inet = inet_sk(sk);
146 struct tcp_sock *tp = tcp_sk(sk);
147 __be16 orig_sport, orig_dport;
148 __be32 daddr, nexthop;
149 struct flowi4 *fl4;
150 struct rtable *rt;
151 int err;
152 struct ip_options_rcu *inet_opt;
153
154 if (addr_len < sizeof(struct sockaddr_in))
155 return -EINVAL;
156
157 if (usin->sin_family != AF_INET)
158 return -EAFNOSUPPORT;
159
160 nexthop = daddr = usin->sin_addr.s_addr;
161 inet_opt = rcu_dereference_protected(inet->inet_opt,
162 sock_owned_by_user(sk));
163 if (inet_opt && inet_opt->opt.srr) {
164 if (!daddr)
165 return -EINVAL;
166 nexthop = inet_opt->opt.faddr;
167 }
168
169 orig_sport = inet->inet_sport;
170 orig_dport = usin->sin_port;
171 fl4 = &inet->cork.fl.u.ip4;
172 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
173 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174 IPPROTO_TCP,
175 orig_sport, orig_dport, sk, true);
176 if (IS_ERR(rt)) {
177 err = PTR_ERR(rt);
178 if (err == -ENETUNREACH)
179 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
180 return err;
181 }
182
183 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184 ip_rt_put(rt);
185 return -ENETUNREACH;
186 }
187
188 if (!inet_opt || !inet_opt->opt.srr)
189 daddr = fl4->daddr;
190
191 if (!inet->inet_saddr)
192 inet->inet_saddr = fl4->saddr;
193 inet->inet_rcv_saddr = inet->inet_saddr;
194
195 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
196 /* Reset inherited state */
197 tp->rx_opt.ts_recent = 0;
198 tp->rx_opt.ts_recent_stamp = 0;
199 if (likely(!tp->repair))
200 tp->write_seq = 0;
201 }
202
203 if (tcp_death_row.sysctl_tw_recycle &&
204 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
205 tcp_fetch_timewait_stamp(sk, &rt->dst);
206
207 inet->inet_dport = usin->sin_port;
208 inet->inet_daddr = daddr;
209
210 inet_csk(sk)->icsk_ext_hdr_len = 0;
211 if (inet_opt)
212 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
213
214 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
215
216 /* Socket identity is still unknown (sport may be zero).
217 * However we set state to SYN-SENT and, without releasing the socket
218 * lock, select a source port, enter ourselves into the hash tables and
219 * complete initialization after this.
220 */
221 tcp_set_state(sk, TCP_SYN_SENT);
222 err = inet_hash_connect(&tcp_death_row, sk);
223 if (err)
224 goto failure;
225
226 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 inet->inet_sport, inet->inet_dport, sk);
228 if (IS_ERR(rt)) {
229 err = PTR_ERR(rt);
230 rt = NULL;
231 goto failure;
232 }
233 /* OK, now commit destination to socket. */
234 sk->sk_gso_type = SKB_GSO_TCPV4;
235 sk_setup_caps(sk, &rt->dst);
236
237 if (!tp->write_seq && likely(!tp->repair))
238 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 inet->inet_daddr,
240 inet->inet_sport,
241 usin->sin_port);
242
243 inet->inet_id = tp->write_seq ^ jiffies;
244
245 err = tcp_connect(sk);
246
247 rt = NULL;
248 if (err)
249 goto failure;
250
251 return 0;
252
253 failure:
254 /*
255 * This unhashes the socket and releases the local port,
256 * if necessary.
257 */
258 tcp_set_state(sk, TCP_CLOSE);
259 ip_rt_put(rt);
260 sk->sk_route_caps = 0;
261 inet->inet_dport = 0;
262 return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265
266 /*
267 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268 * It can be called through tcp_release_cb() if socket was owned by user
269 * at the time tcp_v4_err() was called to handle ICMP message.
270 */
271 static void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 struct dst_entry *dst;
274 struct inet_sock *inet = inet_sk(sk);
275 u32 mtu = tcp_sk(sk)->mtu_info;
276
277 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
278 * sent out by Linux are always < 576 bytes so they should go through
279 * unfragmented).
280 */
281 if (sk->sk_state == TCP_LISTEN)
282 return;
283
284 dst = inet_csk_update_pmtu(sk, mtu);
285 if (!dst)
286 return;
287
288 /* Something is about to go wrong... Remember the soft error
289 * in case this connection is not able to recover.
290 */
291 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
292 sk->sk_err_soft = EMSGSIZE;
293
294 mtu = dst_mtu(dst);
295
296 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
297 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
298 tcp_sync_mss(sk, mtu);
299
300 /* Resend the TCP packet because it's
301 * clear that the old packet has been
302 * dropped. This is the new "fast" path mtu
303 * discovery.
304 */
305 tcp_simple_retransmit(sk);
306 } /* else let the usual retransmit timer handle it */
307 }
308
309 static void do_redirect(struct sk_buff *skb, struct sock *sk)
310 {
311 struct dst_entry *dst = __sk_dst_check(sk, 0);
312
313 if (dst)
314 dst->ops->redirect(dst, sk, skb);
315 }
316
317 /*
318 * This routine is called by the ICMP module when it gets some
319 * sort of error condition. If err < 0 then the socket should
320 * be closed and the error returned to the user. If err > 0
321 * it's just the icmp type << 8 | icmp code. After adjustment
322 * header points to the first 8 bytes of the tcp header. We need
323 * to find the appropriate port.
324 *
325 * The locking strategy used here is very "optimistic". When
326 * someone else accesses the socket the ICMP is just dropped
327 * and for some paths there is no check at all.
328 * A more general error queue to queue errors for later handling
329 * is probably better.
330 *
331 */
332
333 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
334 {
335 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
336 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
337 struct inet_connection_sock *icsk;
338 struct tcp_sock *tp;
339 struct inet_sock *inet;
340 const int type = icmp_hdr(icmp_skb)->type;
341 const int code = icmp_hdr(icmp_skb)->code;
342 struct sock *sk;
343 struct sk_buff *skb;
344 struct request_sock *req;
345 __u32 seq;
346 __u32 remaining;
347 int err;
348 struct net *net = dev_net(icmp_skb->dev);
349
350 if (icmp_skb->len < (iph->ihl << 2) + 8) {
351 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
352 return;
353 }
354
355 sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
356 iph->saddr, th->source, inet_iif(icmp_skb));
357 if (!sk) {
358 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
359 return;
360 }
361 if (sk->sk_state == TCP_TIME_WAIT) {
362 inet_twsk_put(inet_twsk(sk));
363 return;
364 }
365
366 bh_lock_sock(sk);
367 /* If too many ICMPs get dropped on busy
368 * servers this needs to be solved differently.
369 * We do take care of the PMTU discovery (RFC1191) special case:
370 * we can receive locally generated ICMP messages while socket is held.
371 */
372 if (sock_owned_by_user(sk)) {
373 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
374 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
375 }
376 if (sk->sk_state == TCP_CLOSE)
377 goto out;
378
379 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
380 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
381 goto out;
382 }
383
384 icsk = inet_csk(sk);
385 tp = tcp_sk(sk);
386 req = tp->fastopen_rsk;
387 seq = ntohl(th->seq);
388 if (sk->sk_state != TCP_LISTEN &&
389 !between(seq, tp->snd_una, tp->snd_nxt) &&
390 (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
391 /* For a Fast Open socket, allow seq to be snt_isn. */
392 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 goto out;
394 }
395
396 switch (type) {
397 case ICMP_REDIRECT:
398 do_redirect(icmp_skb, sk);
399 goto out;
400 case ICMP_SOURCE_QUENCH:
401 /* Just silently ignore these. */
402 goto out;
403 case ICMP_PARAMETERPROB:
404 err = EPROTO;
405 break;
406 case ICMP_DEST_UNREACH:
407 if (code > NR_ICMP_UNREACH)
408 goto out;
409
410 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
411 tp->mtu_info = info;
412 if (!sock_owned_by_user(sk)) {
413 tcp_v4_mtu_reduced(sk);
414 } else {
415 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
416 sock_hold(sk);
417 }
418 goto out;
419 }
420
421 err = icmp_err_convert[code].errno;
422 /* check if icmp_skb allows revert of backoff
423 * (see draft-zimmermann-tcp-lcd) */
424 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
425 break;
426 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
427 !icsk->icsk_backoff)
428 break;
429
430 /* XXX (TFO) - revisit the following logic for TFO */
431
432 if (sock_owned_by_user(sk))
433 break;
434
435 icsk->icsk_backoff--;
436 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
437 TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
438 tcp_bound_rto(sk);
439
440 skb = tcp_write_queue_head(sk);
441 BUG_ON(!skb);
442
443 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
444 tcp_time_stamp - TCP_SKB_CB(skb)->when);
445
446 if (remaining) {
447 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
448 remaining, TCP_RTO_MAX);
449 } else {
450 /* RTO revert clocked out retransmission.
451 * Will retransmit now */
452 tcp_retransmit_timer(sk);
453 }
454
455 break;
456 case ICMP_TIME_EXCEEDED:
457 err = EHOSTUNREACH;
458 break;
459 default:
460 goto out;
461 }
462
463 /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
464 * than following the TCP_SYN_RECV case and closing the socket,
465 * we ignore the ICMP error and keep trying like a fully established
466 * socket. Is this the right thing to do?
467 */
468 if (req && req->sk == NULL)
469 goto out;
470
471 switch (sk->sk_state) {
472 struct request_sock *req, **prev;
473 case TCP_LISTEN:
474 if (sock_owned_by_user(sk))
475 goto out;
476
477 req = inet_csk_search_req(sk, &prev, th->dest,
478 iph->daddr, iph->saddr);
479 if (!req)
480 goto out;
481
482 /* ICMPs are not backlogged, hence we cannot get
483 an established socket here.
484 */
485 WARN_ON(req->sk);
486
487 if (seq != tcp_rsk(req)->snt_isn) {
488 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
489 goto out;
490 }
491
492 /*
493 * Still in SYN_RECV, just remove it silently.
494 * There is no good way to pass the error to the newly
495 * created socket, and POSIX does not want network
496 * errors returned from accept().
497 */
498 inet_csk_reqsk_queue_drop(sk, req, prev);
499 goto out;
500
501 case TCP_SYN_SENT:
502 case TCP_SYN_RECV: /* Normally cannot happen.
503 It can, e.g., if SYNs crossed,
504 or with Fast Open.
505 */
506 if (!sock_owned_by_user(sk)) {
507 sk->sk_err = err;
508
509 sk->sk_error_report(sk);
510
511 tcp_done(sk);
512 } else {
513 sk->sk_err_soft = err;
514 }
515 goto out;
516 }
517
518 /* If we've already connected we will keep trying
519 * until we time out, or the user gives up.
520 *
521 * rfc1122 4.2.3.9 allows us to consider as hard errors
522 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
523 * but it is obsoleted by pmtu discovery).
524 *
525 * Note that in the modern internet, where routing is unreliable
526 * and broken firewalls sit in every dark corner sending random
527 * errors ordered by their masters, even these two messages finally lose
528 * their original sense (even Linux sends invalid PORT_UNREACHs).
529 *
530 * Now we are in compliance with RFCs.
531 * --ANK (980905)
532 */
533
534 inet = inet_sk(sk);
535 if (!sock_owned_by_user(sk) && inet->recverr) {
536 sk->sk_err = err;
537 sk->sk_error_report(sk);
538 } else { /* Only an error on timeout */
539 sk->sk_err_soft = err;
540 }
541
542 out:
543 bh_unlock_sock(sk);
544 sock_put(sk);
545 }
546
547 static void __tcp_v4_send_check(struct sk_buff *skb,
548 __be32 saddr, __be32 daddr)
549 {
550 struct tcphdr *th = tcp_hdr(skb);
551
552 if (skb->ip_summed == CHECKSUM_PARTIAL) {
553 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
554 skb->csum_start = skb_transport_header(skb) - skb->head;
555 skb->csum_offset = offsetof(struct tcphdr, check);
556 } else {
557 th->check = tcp_v4_check(skb->len, saddr, daddr,
558 csum_partial(th,
559 th->doff << 2,
560 skb->csum));
561 }
562 }
563
564 /* This routine computes an IPv4 TCP checksum. */
565 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
566 {
567 const struct inet_sock *inet = inet_sk(sk);
568
569 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
570 }
571 EXPORT_SYMBOL(tcp_v4_send_check);
572
573 int tcp_v4_gso_send_check(struct sk_buff *skb)
574 {
575 const struct iphdr *iph;
576 struct tcphdr *th;
577
578 if (!pskb_may_pull(skb, sizeof(*th)))
579 return -EINVAL;
580
581 iph = ip_hdr(skb);
582 th = tcp_hdr(skb);
583
584 th->check = 0;
585 skb->ip_summed = CHECKSUM_PARTIAL;
586 __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
587 return 0;
588 }
589
590 /*
591 * This routine will send an RST to the other tcp.
592 *
593 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
594 * for the reset?
595 * Answer: if a packet caused a RST, it is not for a socket
596 * existing in our system; if it is matched to a socket,
597 * it is just a duplicate segment or a bug in the other side's TCP.
598 * So we build the reply based only on the parameters
599 * that arrived with the segment.
600 * Exception: precedence violation. We do not implement it in any case.
601 */
602
603 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
604 {
605 const struct tcphdr *th = tcp_hdr(skb);
606 struct {
607 struct tcphdr th;
608 #ifdef CONFIG_TCP_MD5SIG
609 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
610 #endif
611 } rep;
612 struct ip_reply_arg arg;
613 #ifdef CONFIG_TCP_MD5SIG
614 struct tcp_md5sig_key *key;
615 const __u8 *hash_location = NULL;
616 unsigned char newhash[16];
617 int genhash;
618 struct sock *sk1 = NULL;
619 #endif
620 struct net *net;
621
622 /* Never send a reset in response to a reset. */
623 if (th->rst)
624 return;
625
626 if (skb_rtable(skb)->rt_type != RTN_LOCAL)
627 return;
628
629 /* Swap the send and the receive. */
630 memset(&rep, 0, sizeof(rep));
631 rep.th.dest = th->source;
632 rep.th.source = th->dest;
633 rep.th.doff = sizeof(struct tcphdr) / 4;
634 rep.th.rst = 1;
635
636 if (th->ack) {
637 rep.th.seq = th->ack_seq;
638 } else {
639 rep.th.ack = 1;
640 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
641 skb->len - (th->doff << 2));
642 }
643
644 memset(&arg, 0, sizeof(arg));
645 arg.iov[0].iov_base = (unsigned char *)&rep;
646 arg.iov[0].iov_len = sizeof(rep.th);
647
648 #ifdef CONFIG_TCP_MD5SIG
649 hash_location = tcp_parse_md5sig_option(th);
650 if (!sk && hash_location) {
651 /*
652 * active side is lost. Try to find the listening socket through
653 * the source port, and then find the md5 key through the listening socket.
654 * We do not loosen security here:
655 * the incoming packet is checked with the md5 hash of the found key;
656 * no RST is generated if the md5 hash doesn't match.
657 */
658 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
659 &tcp_hashinfo, ip_hdr(skb)->daddr,
660 ntohs(th->source), inet_iif(skb));
661 /* don't send a RST if we can't find the key */
662 if (!sk1)
663 return;
664 rcu_read_lock();
665 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
666 &ip_hdr(skb)->saddr, AF_INET);
667 if (!key)
668 goto release_sk1;
669
670 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
671 if (genhash || memcmp(hash_location, newhash, 16) != 0)
672 goto release_sk1;
673 } else {
674 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
675 &ip_hdr(skb)->saddr,
676 AF_INET) : NULL;
677 }
678
679 if (key) {
680 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
681 (TCPOPT_NOP << 16) |
682 (TCPOPT_MD5SIG << 8) |
683 TCPOLEN_MD5SIG);
684 /* Update length and the length the header thinks exists */
685 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
686 rep.th.doff = arg.iov[0].iov_len / 4;
687
688 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
689 key, ip_hdr(skb)->saddr,
690 ip_hdr(skb)->daddr, &rep.th);
691 }
692 #endif
693 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
694 ip_hdr(skb)->saddr, /* XXX */
695 arg.iov[0].iov_len, IPPROTO_TCP, 0);
696 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
697 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
698 /* When the socket is gone, all binding information is lost;
699 * routing might fail in this case. No choice here: if we force the
700 * input interface, we will misroute in case of an asymmetric route.
701 */
702 if (sk)
703 arg.bound_dev_if = sk->sk_bound_dev_if;
704
705 net = dev_net(skb_dst(skb)->dev);
706 arg.tos = ip_hdr(skb)->tos;
707 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
708 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
709
710 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
711 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
712
713 #ifdef CONFIG_TCP_MD5SIG
714 release_sk1:
715 if (sk1) {
716 rcu_read_unlock();
717 sock_put(sk1);
718 }
719 #endif
720 }
721
722 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
723 outside socket context, is certainly ugly. What can I do?
724 */
725
726 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
727 u32 win, u32 ts, int oif,
728 struct tcp_md5sig_key *key,
729 int reply_flags, u8 tos)
730 {
731 const struct tcphdr *th = tcp_hdr(skb);
732 struct {
733 struct tcphdr th;
734 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
735 #ifdef CONFIG_TCP_MD5SIG
736 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
737 #endif
738 ];
739 } rep;
740 struct ip_reply_arg arg;
741 struct net *net = dev_net(skb_dst(skb)->dev);
742
743 memset(&rep.th, 0, sizeof(struct tcphdr));
744 memset(&arg, 0, sizeof(arg));
745
746 arg.iov[0].iov_base = (unsigned char *)&rep;
747 arg.iov[0].iov_len = sizeof(rep.th);
748 if (ts) {
749 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
750 (TCPOPT_TIMESTAMP << 8) |
751 TCPOLEN_TIMESTAMP);
752 rep.opt[1] = htonl(tcp_time_stamp);
753 rep.opt[2] = htonl(ts);
754 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
755 }
756
757 /* Swap the send and the receive. */
758 rep.th.dest = th->source;
759 rep.th.source = th->dest;
760 rep.th.doff = arg.iov[0].iov_len / 4;
761 rep.th.seq = htonl(seq);
762 rep.th.ack_seq = htonl(ack);
763 rep.th.ack = 1;
764 rep.th.window = htons(win);
765
766 #ifdef CONFIG_TCP_MD5SIG
767 if (key) {
768 int offset = (ts) ? 3 : 0;
769
770 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
771 (TCPOPT_NOP << 16) |
772 (TCPOPT_MD5SIG << 8) |
773 TCPOLEN_MD5SIG);
774 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
775 rep.th.doff = arg.iov[0].iov_len/4;
776
777 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
778 key, ip_hdr(skb)->saddr,
779 ip_hdr(skb)->daddr, &rep.th);
780 }
781 #endif
782 arg.flags = reply_flags;
783 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
784 ip_hdr(skb)->saddr, /* XXX */
785 arg.iov[0].iov_len, IPPROTO_TCP, 0);
786 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
787 if (oif)
788 arg.bound_dev_if = oif;
789 arg.tos = tos;
790 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
791 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
792
793 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
794 }
795
796 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
797 {
798 struct inet_timewait_sock *tw = inet_twsk(sk);
799 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
800
801 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
802 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
803 tcptw->tw_ts_recent,
804 tw->tw_bound_dev_if,
805 tcp_twsk_md5_key(tcptw),
806 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
807 tw->tw_tos
808 );
809
810 inet_twsk_put(tw);
811 }
812
813 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
814 struct request_sock *req)
815 {
816 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
817 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
818 */
819 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
820 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
821 tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
822 req->ts_recent,
823 0,
824 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
825 AF_INET),
826 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
827 ip_hdr(skb)->tos);
828 }
829
830 /*
831 * Send a SYN-ACK after having received a SYN.
832 * This still operates on a request_sock only, not on a big
833 * socket.
834 */
835 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
836 struct request_sock *req,
837 struct request_values *rvp,
838 u16 queue_mapping,
839 bool nocache)
840 {
841 const struct inet_request_sock *ireq = inet_rsk(req);
842 struct flowi4 fl4;
843 int err = -1;
844 struct sk_buff * skb;
845
846 /* First, grab a route. */
847 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
848 return -1;
849
850 skb = tcp_make_synack(sk, dst, req, rvp, NULL);
851
852 if (skb) {
853 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
854
855 skb_set_queue_mapping(skb, queue_mapping);
856 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
857 ireq->rmt_addr,
858 ireq->opt);
859 err = net_xmit_eval(err);
860 if (!tcp_rsk(req)->snt_synack && !err)
861 tcp_rsk(req)->snt_synack = tcp_time_stamp;
862 }
863
864 return err;
865 }
866
867 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
868 struct request_values *rvp)
869 {
870 int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
871
872 if (!res)
873 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
874 return res;
875 }
876
877 /*
878 * IPv4 request_sock destructor.
879 */
880 static void tcp_v4_reqsk_destructor(struct request_sock *req)
881 {
882 kfree(inet_rsk(req)->opt);
883 }
884
885 /*
886 * Return true if a syncookie should be sent
887 */
888 bool tcp_syn_flood_action(struct sock *sk,
889 const struct sk_buff *skb,
890 const char *proto)
891 {
892 const char *msg = "Dropping request";
893 bool want_cookie = false;
894 struct listen_sock *lopt;
895
896
897
898 #ifdef CONFIG_SYN_COOKIES
899 if (sysctl_tcp_syncookies) {
900 msg = "Sending cookies";
901 want_cookie = true;
902 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
903 } else
904 #endif
905 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
906
907 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
908 if (!lopt->synflood_warned) {
909 lopt->synflood_warned = 1;
910 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
911 proto, ntohs(tcp_hdr(skb)->dest), msg);
912 }
913 return want_cookie;
914 }
915 EXPORT_SYMBOL(tcp_syn_flood_action);
916
917 /*
918 * Save and compile IPv4 options into the request_sock if needed.
919 */
920 static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
921 {
922 const struct ip_options *opt = &(IPCB(skb)->opt);
923 struct ip_options_rcu *dopt = NULL;
924
925 if (opt && opt->optlen) {
926 int opt_size = sizeof(*dopt) + opt->optlen;
927
928 dopt = kmalloc(opt_size, GFP_ATOMIC);
929 if (dopt) {
930 if (ip_options_echo(&dopt->opt, skb)) {
931 kfree(dopt);
932 dopt = NULL;
933 }
934 }
935 }
936 return dopt;
937 }
938
939 #ifdef CONFIG_TCP_MD5SIG
940 /*
941 * RFC2385 MD5 checksumming requires a mapping of
942 * IP address->MD5 Key.
943 * We need to maintain these in the sk structure.
944 */
945
946 /* Find the Key structure for an address. */
947 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
948 const union tcp_md5_addr *addr,
949 int family)
950 {
951 struct tcp_sock *tp = tcp_sk(sk);
952 struct tcp_md5sig_key *key;
953 struct hlist_node *pos;
954 unsigned int size = sizeof(struct in_addr);
955 struct tcp_md5sig_info *md5sig;
956
957 /* caller either holds rcu_read_lock() or socket lock */
958 md5sig = rcu_dereference_check(tp->md5sig_info,
959 sock_owned_by_user(sk) ||
960 lockdep_is_held(&sk->sk_lock.slock));
961 if (!md5sig)
962 return NULL;
963 #if IS_ENABLED(CONFIG_IPV6)
964 if (family == AF_INET6)
965 size = sizeof(struct in6_addr);
966 #endif
967 hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
968 if (key->family != family)
969 continue;
970 if (!memcmp(&key->addr, addr, size))
971 return key;
972 }
973 return NULL;
974 }
975 EXPORT_SYMBOL(tcp_md5_do_lookup);
976
977 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
978 struct sock *addr_sk)
979 {
980 union tcp_md5_addr *addr;
981
982 addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
983 return tcp_md5_do_lookup(sk, addr, AF_INET);
984 }
985 EXPORT_SYMBOL(tcp_v4_md5_lookup);
986
987 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
988 struct request_sock *req)
989 {
990 union tcp_md5_addr *addr;
991
992 addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
993 return tcp_md5_do_lookup(sk, addr, AF_INET);
994 }
995
996 /* This can be called on a newly created socket, from other files */
997 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
998 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
999 {
1000 /* Add Key to the list */
1001 struct tcp_md5sig_key *key;
1002 struct tcp_sock *tp = tcp_sk(sk);
1003 struct tcp_md5sig_info *md5sig;
1004
1005 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1006 if (key) {
1007 /* Pre-existing entry - just update that one. */
1008 memcpy(key->key, newkey, newkeylen);
1009 key->keylen = newkeylen;
1010 return 0;
1011 }
1012
1013 md5sig = rcu_dereference_protected(tp->md5sig_info,
1014 sock_owned_by_user(sk));
1015 if (!md5sig) {
1016 md5sig = kmalloc(sizeof(*md5sig), gfp);
1017 if (!md5sig)
1018 return -ENOMEM;
1019
1020 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1021 INIT_HLIST_HEAD(&md5sig->head);
1022 rcu_assign_pointer(tp->md5sig_info, md5sig);
1023 }
1024
1025 key = sock_kmalloc(sk, sizeof(*key), gfp);
1026 if (!key)
1027 return -ENOMEM;
1028 if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1029 sock_kfree_s(sk, key, sizeof(*key));
1030 return -ENOMEM;
1031 }
1032
1033 memcpy(key->key, newkey, newkeylen);
1034 key->keylen = newkeylen;
1035 key->family = family;
1036 memcpy(&key->addr, addr,
1037 (family == AF_INET6) ? sizeof(struct in6_addr) :
1038 sizeof(struct in_addr));
1039 hlist_add_head_rcu(&key->node, &md5sig->head);
1040 return 0;
1041 }
1042 EXPORT_SYMBOL(tcp_md5_do_add);
1043
1044 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1045 {
1046 struct tcp_sock *tp = tcp_sk(sk);
1047 struct tcp_md5sig_key *key;
1048 struct tcp_md5sig_info *md5sig;
1049
1050 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1051 if (!key)
1052 return -ENOENT;
1053 hlist_del_rcu(&key->node);
1054 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1055 kfree_rcu(key, rcu);
1056 md5sig = rcu_dereference_protected(tp->md5sig_info,
1057 sock_owned_by_user(sk));
1058 if (hlist_empty(&md5sig->head))
1059 tcp_free_md5sig_pool();
1060 return 0;
1061 }
1062 EXPORT_SYMBOL(tcp_md5_do_del);
1063
1064 static void tcp_clear_md5_list(struct sock *sk)
1065 {
1066 struct tcp_sock *tp = tcp_sk(sk);
1067 struct tcp_md5sig_key *key;
1068 struct hlist_node *pos, *n;
1069 struct tcp_md5sig_info *md5sig;
1070
1071 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1072
1073 if (!hlist_empty(&md5sig->head))
1074 tcp_free_md5sig_pool();
1075 hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1076 hlist_del_rcu(&key->node);
1077 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1078 kfree_rcu(key, rcu);
1079 }
1080 }
1081
1082 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1083 int optlen)
1084 {
1085 struct tcp_md5sig cmd;
1086 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1087
1088 if (optlen < sizeof(cmd))
1089 return -EINVAL;
1090
1091 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1092 return -EFAULT;
1093
1094 if (sin->sin_family != AF_INET)
1095 return -EINVAL;
1096
1097 if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1098 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1099 AF_INET);
1100
1101 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1102 return -EINVAL;
1103
1104 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1105 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1106 GFP_KERNEL);
1107 }
1108
1109 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1110 __be32 daddr, __be32 saddr, int nbytes)
1111 {
1112 struct tcp4_pseudohdr *bp;
1113 struct scatterlist sg;
1114
1115 bp = &hp->md5_blk.ip4;
1116
1117 /*
1118 * 1. the TCP pseudo-header (in the order: source IP address,
1119 * destination IP address, zero-padded protocol number, and
1120 * segment length)
1121 */
1122 bp->saddr = saddr;
1123 bp->daddr = daddr;
1124 bp->pad = 0;
1125 bp->protocol = IPPROTO_TCP;
1126 bp->len = cpu_to_be16(nbytes);
1127
1128 sg_init_one(&sg, bp, sizeof(*bp));
1129 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1130 }
1131
1132 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1133 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1134 {
1135 struct tcp_md5sig_pool *hp;
1136 struct hash_desc *desc;
1137
1138 hp = tcp_get_md5sig_pool();
1139 if (!hp)
1140 goto clear_hash_noput;
1141 desc = &hp->md5_desc;
1142
1143 if (crypto_hash_init(desc))
1144 goto clear_hash;
1145 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1146 goto clear_hash;
1147 if (tcp_md5_hash_header(hp, th))
1148 goto clear_hash;
1149 if (tcp_md5_hash_key(hp, key))
1150 goto clear_hash;
1151 if (crypto_hash_final(desc, md5_hash))
1152 goto clear_hash;
1153
1154 tcp_put_md5sig_pool();
1155 return 0;
1156
1157 clear_hash:
1158 tcp_put_md5sig_pool();
1159 clear_hash_noput:
1160 memset(md5_hash, 0, 16);
1161 return 1;
1162 }
1163
1164 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1165 const struct sock *sk, const struct request_sock *req,
1166 const struct sk_buff *skb)
1167 {
1168 struct tcp_md5sig_pool *hp;
1169 struct hash_desc *desc;
1170 const struct tcphdr *th = tcp_hdr(skb);
1171 __be32 saddr, daddr;
1172
1173 if (sk) {
1174 saddr = inet_sk(sk)->inet_saddr;
1175 daddr = inet_sk(sk)->inet_daddr;
1176 } else if (req) {
1177 saddr = inet_rsk(req)->loc_addr;
1178 daddr = inet_rsk(req)->rmt_addr;
1179 } else {
1180 const struct iphdr *iph = ip_hdr(skb);
1181 saddr = iph->saddr;
1182 daddr = iph->daddr;
1183 }
1184
1185 hp = tcp_get_md5sig_pool();
1186 if (!hp)
1187 goto clear_hash_noput;
1188 desc = &hp->md5_desc;
1189
1190 if (crypto_hash_init(desc))
1191 goto clear_hash;
1192
1193 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1194 goto clear_hash;
1195 if (tcp_md5_hash_header(hp, th))
1196 goto clear_hash;
1197 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1198 goto clear_hash;
1199 if (tcp_md5_hash_key(hp, key))
1200 goto clear_hash;
1201 if (crypto_hash_final(desc, md5_hash))
1202 goto clear_hash;
1203
1204 tcp_put_md5sig_pool();
1205 return 0;
1206
1207 clear_hash:
1208 tcp_put_md5sig_pool();
1209 clear_hash_noput:
1210 memset(md5_hash, 0, 16);
1211 return 1;
1212 }
1213 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1214
1215 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1216 {
1217 /*
1218 * This gets called for each TCP segment that arrives
1219 * so we want to be efficient.
1220 * We have 3 drop cases:
1221 * o No MD5 hash and one expected.
1222 * o MD5 hash and we're not expecting one.
1223 * o MD5 hash and it's wrong.
1224 */
1225 const __u8 *hash_location = NULL;
1226 struct tcp_md5sig_key *hash_expected;
1227 const struct iphdr *iph = ip_hdr(skb);
1228 const struct tcphdr *th = tcp_hdr(skb);
1229 int genhash;
1230 unsigned char newhash[16];
1231
1232 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1233 AF_INET);
1234 hash_location = tcp_parse_md5sig_option(th);
1235
1236 /* We've parsed the options - do we have a hash? */
1237 if (!hash_expected && !hash_location)
1238 return false;
1239
1240 if (hash_expected && !hash_location) {
1241 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1242 return true;
1243 }
1244
1245 if (!hash_expected && hash_location) {
1246 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1247 return true;
1248 }
1249
1250 /* Okay, so this is hash_expected and hash_location -
1251 * so we need to calculate the checksum.
1252 */
1253 genhash = tcp_v4_md5_hash_skb(newhash,
1254 hash_expected,
1255 NULL, NULL, skb);
1256
1257 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1258 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1259 &iph->saddr, ntohs(th->source),
1260 &iph->daddr, ntohs(th->dest),
1261 genhash ? " tcp_v4_calc_md5_hash failed"
1262 : "");
1263 return true;
1264 }
1265 return false;
1266 }
1267
1268 #endif
1269
1270 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1271 .family = PF_INET,
1272 .obj_size = sizeof(struct tcp_request_sock),
1273 .rtx_syn_ack = tcp_v4_rtx_synack,
1274 .send_ack = tcp_v4_reqsk_send_ack,
1275 .destructor = tcp_v4_reqsk_destructor,
1276 .send_reset = tcp_v4_send_reset,
1277 .syn_ack_timeout = tcp_syn_ack_timeout,
1278 };
1279
1280 #ifdef CONFIG_TCP_MD5SIG
1281 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1282 .md5_lookup = tcp_v4_reqsk_md5_lookup,
1283 .calc_md5_hash = tcp_v4_md5_hash_skb,
1284 };
1285 #endif
1286
1287 static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1288 struct request_sock *req,
1289 struct tcp_fastopen_cookie *foc,
1290 struct tcp_fastopen_cookie *valid_foc)
1291 {
1292 bool skip_cookie = false;
1293 struct fastopen_queue *fastopenq;
1294
1295 if (likely(!fastopen_cookie_present(foc))) {
1296 /* See include/net/tcp.h for the meaning of these knobs */
1297 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1298 ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1299 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1300 skip_cookie = true; /* no cookie to validate */
1301 else
1302 return false;
1303 }
1304 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1305 /* A FO option is present; bump the counter. */
1306 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1307
1308 /* Make sure the listener has enabled fastopen, and we don't
1309 * exceed the max # of pending TFO requests allowed before trying
1310 * to validate the cookie in order to avoid burning CPU cycles
1311 * unnecessarily.
1312 *
1313 * XXX (TFO) - The implication of checking the max_qlen before
1314 * processing a cookie request is that clients can't differentiate
1315 * between qlen overflow causing Fast Open to be disabled
1316 * temporarily vs a server not supporting Fast Open at all.
1317 */
1318 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1319 fastopenq == NULL || fastopenq->max_qlen == 0)
1320 return false;
1321
1322 if (fastopenq->qlen >= fastopenq->max_qlen) {
1323 struct request_sock *req1;
1324 spin_lock(&fastopenq->lock);
1325 req1 = fastopenq->rskq_rst_head;
1326 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1327 spin_unlock(&fastopenq->lock);
1328 NET_INC_STATS_BH(sock_net(sk),
1329 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1330 /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1331 foc->len = -1;
1332 return false;
1333 }
1334 fastopenq->rskq_rst_head = req1->dl_next;
1335 fastopenq->qlen--;
1336 spin_unlock(&fastopenq->lock);
1337 reqsk_free(req1);
1338 }
1339 if (skip_cookie) {
1340 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1341 return true;
1342 }
1343 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1344 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1345 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1346 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1347 memcmp(&foc->val[0], &valid_foc->val[0],
1348 TCP_FASTOPEN_COOKIE_SIZE) != 0)
1349 return false;
1350 valid_foc->len = -1;
1351 }
1352 /* Acknowledge the data received from the peer. */
1353 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1354 return true;
1355 } else if (foc->len == 0) { /* Client requesting a cookie */
1356 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1357 NET_INC_STATS_BH(sock_net(sk),
1358 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1359 } else {
1360 /* Client sent a cookie with wrong size. Treat it
1361 * the same as invalid and return a valid one.
1362 */
1363 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1364 }
1365 return false;
1366 }
1367
1368 static int tcp_v4_conn_req_fastopen(struct sock *sk,
1369 struct sk_buff *skb,
1370 struct sk_buff *skb_synack,
1371 struct request_sock *req,
1372 struct request_values *rvp)
1373 {
1374 struct tcp_sock *tp = tcp_sk(sk);
1375 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1376 const struct inet_request_sock *ireq = inet_rsk(req);
1377 struct sock *child;
1378 int err;
1379
1380 req->num_retrans = 0;
1381 req->num_timeout = 0;
1382 req->sk = NULL;
1383
1384 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1385 if (child == NULL) {
1386 NET_INC_STATS_BH(sock_net(sk),
1387 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1388 kfree_skb(skb_synack);
1389 return -1;
1390 }
1391 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1392 ireq->rmt_addr, ireq->opt);
1393 err = net_xmit_eval(err);
1394 if (!err)
1395 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1396 /* XXX (TFO) - is it ok to ignore error and continue? */
1397
1398 spin_lock(&queue->fastopenq->lock);
1399 queue->fastopenq->qlen++;
1400 spin_unlock(&queue->fastopenq->lock);
1401
1402 /* Initialize the child socket. Have to fix some values to take
1403 * into account the child is a Fast Open socket and is created
1404 * only out of the bits carried in the SYN packet.
1405 */
1406 tp = tcp_sk(child);
1407
1408 tp->fastopen_rsk = req;
1409 /* Do a hold on the listener sk so that if the listener is being
1410 * closed, the child that has been accepted can live on and still
1411 * access listen_lock.
1412 */
1413 sock_hold(sk);
1414 tcp_rsk(req)->listener = sk;
1415
1416 /* RFC1323: The window in SYN & SYN/ACK segments is never
1417 * scaled. So correct it appropriately.
1418 */
1419 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1420
1421 /* Activate the retrans timer so that SYNACK can be retransmitted.
1422 * The request socket is not added to the SYN table of the parent
1423 * because it's been added to the accept queue directly.
1424 */
1425 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1426 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1427
1428 /* Add the child socket directly into the accept queue */
1429 inet_csk_reqsk_queue_add(sk, req, child);
1430
1431 /* Now finish processing the fastopen child socket. */
1432 inet_csk(child)->icsk_af_ops->rebuild_header(child);
1433 tcp_init_congestion_control(child);
1434 tcp_mtup_init(child);
1435 tcp_init_buffer_space(child);
1436 tcp_init_metrics(child);
1437
1438 /* Queue the data carried in the SYN packet. We need to first
1439 * bump skb's refcnt because the caller will attempt to free it.
1440 *
1441 * XXX (TFO) - we honor a zero-payload TFO request for now.
1442 * (Any reason not to?)
1443 */
1444 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1445 /* Don't queue the skb if there is no payload in SYN.
1446 * XXX (TFO) - How about SYN+FIN?
1447 */
1448 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1449 } else {
1450 skb = skb_get(skb);
1451 skb_dst_drop(skb);
1452 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1453 skb_set_owner_r(skb, child);
1454 __skb_queue_tail(&child->sk_receive_queue, skb);
1455 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1456 tp->syn_data_acked = 1;
1457 }
1458 sk->sk_data_ready(sk, 0);
1459 bh_unlock_sock(child);
1460 sock_put(child);
1461 WARN_ON(req->sk == NULL);
1462 return 0;
1463 }
1464
1465 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1466 {
1467 struct tcp_extend_values tmp_ext;
1468 struct tcp_options_received tmp_opt;
1469 const u8 *hash_location;
1470 struct request_sock *req;
1471 struct inet_request_sock *ireq;
1472 struct tcp_sock *tp = tcp_sk(sk);
1473 struct dst_entry *dst = NULL;
1474 __be32 saddr = ip_hdr(skb)->saddr;
1475 __be32 daddr = ip_hdr(skb)->daddr;
1476 __u32 isn = TCP_SKB_CB(skb)->when;
1477 bool want_cookie = false;
1478 struct flowi4 fl4;
1479 struct tcp_fastopen_cookie foc = { .len = -1 };
1480 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1481 struct sk_buff *skb_synack;
1482 int do_fastopen;
1483
1484 /* Never answer SYNs sent to broadcast or multicast */
1485 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1486 goto drop;
1487
1488 /* TW buckets are converted to open requests without
1489 * limitations: they conserve resources and the peer is
1490 * evidently a real one.
1491 */
1492 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1493 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1494 if (!want_cookie)
1495 goto drop;
1496 }
1497
1498 /* Accept backlog is full. If we have already queued enough
1499 * warm entries in the syn queue, drop the request. It is better than
1500 * clogging the syn queue with openreqs with exponentially increasing
1501 * timeouts.
1502 */
1503 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1504 goto drop;
1505
1506 req = inet_reqsk_alloc(&tcp_request_sock_ops);
1507 if (!req)
1508 goto drop;
1509
1510 #ifdef CONFIG_TCP_MD5SIG
1511 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1512 #endif
1513
1514 tcp_clear_options(&tmp_opt);
1515 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1516 tmp_opt.user_mss = tp->rx_opt.user_mss;
1517 tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1518 want_cookie ? NULL : &foc);
1519
1520 if (tmp_opt.cookie_plus > 0 &&
1521 tmp_opt.saw_tstamp &&
1522 !tp->rx_opt.cookie_out_never &&
1523 (sysctl_tcp_cookie_size > 0 ||
1524 (tp->cookie_values != NULL &&
1525 tp->cookie_values->cookie_desired > 0))) {
1526 u8 *c;
1527 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1528 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1529
1530 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1531 goto drop_and_release;
1532
1533 /* Secret recipe starts with IP addresses */
1534 *mess++ ^= (__force u32)daddr;
1535 *mess++ ^= (__force u32)saddr;
1536
1537 /* plus variable length Initiator Cookie */
1538 c = (u8 *)mess;
1539 while (l-- > 0)
1540 *c++ ^= *hash_location++;
1541
1542 want_cookie = false; /* not our kind of cookie */
1543 tmp_ext.cookie_out_never = 0; /* false */
1544 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1545 } else if (!tp->rx_opt.cookie_in_always) {
1546 /* redundant indications, but ensure initialization. */
1547 tmp_ext.cookie_out_never = 1; /* true */
1548 tmp_ext.cookie_plus = 0;
1549 } else {
1550 goto drop_and_release;
1551 }
1552 tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1553
1554 if (want_cookie && !tmp_opt.saw_tstamp)
1555 tcp_clear_options(&tmp_opt);
1556
1557 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1558 tcp_openreq_init(req, &tmp_opt, skb);
1559
1560 ireq = inet_rsk(req);
1561 ireq->loc_addr = daddr;
1562 ireq->rmt_addr = saddr;
1563 ireq->no_srccheck = inet_sk(sk)->transparent;
1564 ireq->opt = tcp_v4_save_options(skb);
1565
1566 if (security_inet_conn_request(sk, skb, req))
1567 goto drop_and_free;
1568
1569 if (!want_cookie || tmp_opt.tstamp_ok)
1570 TCP_ECN_create_request(req, skb);
1571
1572 if (want_cookie) {
1573 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1574 req->cookie_ts = tmp_opt.tstamp_ok;
1575 } else if (!isn) {
1576 /* VJ's idea. We save the last timestamp seen
1577 * from the destination in the peer table when entering
1578 * state TIME-WAIT, and check against it before
1579 * accepting a new connection request.
1580 *
1581 * If "isn" is not zero, this request hit an alive
1582 * timewait bucket, so all the necessary checks
1583 * are made in the function processing the timewait state.
1584 */
1585 if (tmp_opt.saw_tstamp &&
1586 tcp_death_row.sysctl_tw_recycle &&
1587 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1588 fl4.daddr == saddr) {
1589 if (!tcp_peer_is_proven(req, dst, true)) {
1590 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1591 goto drop_and_release;
1592 }
1593 }
1594 /* Kill the following clause, if you dislike this way. */
1595 else if (!sysctl_tcp_syncookies &&
1596 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1597 (sysctl_max_syn_backlog >> 2)) &&
1598 !tcp_peer_is_proven(req, dst, false)) {
1599 /* Without syncookies the last quarter of the
1600 * backlog is filled with destinations
1601 * proven to be alive.
1602 * It means that we continue to communicate
1603 * with destinations already remembered
1604 * at the moment of the synflood.
1605 */
1606 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1607 &saddr, ntohs(tcp_hdr(skb)->source));
1608 goto drop_and_release;
1609 }
1610
1611 isn = tcp_v4_init_sequence(skb);
1612 }
1613 tcp_rsk(req)->snt_isn = isn;
1614
1615 if (dst == NULL) {
1616 dst = inet_csk_route_req(sk, &fl4, req);
1617 if (dst == NULL)
1618 goto drop_and_free;
1619 }
1620 do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1621
1622 /* We don't call tcp_v4_send_synack() directly because we need
1623 * to make sure a child socket can be created successfully before
1624 * sending back synack!
1625 *
1626 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1627 * (or better yet, call tcp_send_synack() in the child context
1628 * directly, but will have to fix bunch of other code first)
1629 * after syn_recv_sock() except one will need to first fix the
1630 * latter to remove its dependency on the current implementation
1631 * of tcp_v4_send_synack()->tcp_select_initial_window().
1632 */
1633 skb_synack = tcp_make_synack(sk, dst, req,
1634 (struct request_values *)&tmp_ext,
1635 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1636
1637 if (skb_synack) {
1638 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1639 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1640 } else
1641 goto drop_and_free;
1642
1643 if (likely(!do_fastopen)) {
1644 int err;
1645 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1646 ireq->rmt_addr, ireq->opt);
1647 err = net_xmit_eval(err);
1648 if (err || want_cookie)
1649 goto drop_and_free;
1650
1651 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1652 tcp_rsk(req)->listener = NULL;
1653 /* Add the request_sock to the SYN table */
1654 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1655 if (fastopen_cookie_present(&foc) && foc.len != 0)
1656 NET_INC_STATS_BH(sock_net(sk),
1657 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1658 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1659 (struct request_values *)&tmp_ext))
1660 goto drop_and_free;
1661
1662 return 0;
1663
1664 drop_and_release:
1665 dst_release(dst);
1666 drop_and_free:
1667 reqsk_free(req);
1668 drop:
1669 return 0;
1670 }
1671 EXPORT_SYMBOL(tcp_v4_conn_request);
1672
1673
1674 /*
1675 * The three way handshake has completed - we got a valid synack -
1676 * now create the new socket.
1677 */
1678 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1679 struct request_sock *req,
1680 struct dst_entry *dst)
1681 {
1682 struct inet_request_sock *ireq;
1683 struct inet_sock *newinet;
1684 struct tcp_sock *newtp;
1685 struct sock *newsk;
1686 #ifdef CONFIG_TCP_MD5SIG
1687 struct tcp_md5sig_key *key;
1688 #endif
1689 struct ip_options_rcu *inet_opt;
1690
1691 if (sk_acceptq_is_full(sk))
1692 goto exit_overflow;
1693
1694 newsk = tcp_create_openreq_child(sk, req, skb);
1695 if (!newsk)
1696 goto exit_nonewsk;
1697
1698 newsk->sk_gso_type = SKB_GSO_TCPV4;
1699 inet_sk_rx_dst_set(newsk, skb);
1700
1701 newtp = tcp_sk(newsk);
1702 newinet = inet_sk(newsk);
1703 ireq = inet_rsk(req);
1704 newinet->inet_daddr = ireq->rmt_addr;
1705 newinet->inet_rcv_saddr = ireq->loc_addr;
1706 newinet->inet_saddr = ireq->loc_addr;
1707 inet_opt = ireq->opt;
1708 rcu_assign_pointer(newinet->inet_opt, inet_opt);
1709 ireq->opt = NULL;
1710 newinet->mc_index = inet_iif(skb);
1711 newinet->mc_ttl = ip_hdr(skb)->ttl;
1712 newinet->rcv_tos = ip_hdr(skb)->tos;
1713 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1714 if (inet_opt)
1715 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1716 newinet->inet_id = newtp->write_seq ^ jiffies;
1717
1718 if (!dst) {
1719 dst = inet_csk_route_child_sock(sk, newsk, req);
1720 if (!dst)
1721 goto put_and_exit;
1722 } else {
1723 /* syncookie case : see end of cookie_v4_check() */
1724 }
1725 sk_setup_caps(newsk, dst);
1726
1727 tcp_mtup_init(newsk);
1728 tcp_sync_mss(newsk, dst_mtu(dst));
1729 newtp->advmss = dst_metric_advmss(dst);
1730 if (tcp_sk(sk)->rx_opt.user_mss &&
1731 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1732 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1733
1734 tcp_initialize_rcv_mss(newsk);
1735 tcp_synack_rtt_meas(newsk, req);
1736 newtp->total_retrans = req->num_retrans;
1737
1738 #ifdef CONFIG_TCP_MD5SIG
1739 /* Copy over the MD5 key from the original socket */
1740 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1741 AF_INET);
1742 if (key != NULL) {
1743 /*
1744 * We're using one, so create a matching key
1745 * on the newsk structure. If we fail to get
1746 * memory, then we end up not copying the key
1747 * across. Shucks.
1748 */
1749 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1750 AF_INET, key->key, key->keylen, GFP_ATOMIC);
1751 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1752 }
1753 #endif
1754
1755 if (__inet_inherit_port(sk, newsk) < 0)
1756 goto put_and_exit;
1757 __inet_hash_nolisten(newsk, NULL);
1758
1759 return newsk;
1760
1761 exit_overflow:
1762 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1763 exit_nonewsk:
1764 dst_release(dst);
1765 exit:
1766 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1767 return NULL;
1768 put_and_exit:
1769 inet_csk_prepare_forced_close(newsk);
1770 tcp_done(newsk);
1771 goto exit;
1772 }
1773 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1774
1775 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1776 {
1777 struct tcphdr *th = tcp_hdr(skb);
1778 const struct iphdr *iph = ip_hdr(skb);
1779 struct sock *nsk;
1780 struct request_sock **prev;
1781 /* Find possible connection requests. */
1782 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1783 iph->saddr, iph->daddr);
1784 if (req)
1785 return tcp_check_req(sk, skb, req, prev, false);
1786
1787 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1788 th->source, iph->daddr, th->dest, inet_iif(skb));
1789
1790 if (nsk) {
1791 if (nsk->sk_state != TCP_TIME_WAIT) {
1792 bh_lock_sock(nsk);
1793 return nsk;
1794 }
1795 inet_twsk_put(inet_twsk(nsk));
1796 return NULL;
1797 }
1798
1799 #ifdef CONFIG_SYN_COOKIES
1800 if (!th->syn)
1801 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1802 #endif
1803 return sk;
1804 }
1805
1806 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1807 {
1808 const struct iphdr *iph = ip_hdr(skb);
1809
1810 if (skb->ip_summed == CHECKSUM_COMPLETE) {
1811 if (!tcp_v4_check(skb->len, iph->saddr,
1812 iph->daddr, skb->csum)) {
1813 skb->ip_summed = CHECKSUM_UNNECESSARY;
1814 return 0;
1815 }
1816 }
1817
1818 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1819 skb->len, IPPROTO_TCP, 0);
1820
1821 if (skb->len <= 76) {
1822 return __skb_checksum_complete(skb);
1823 }
1824 return 0;
1825 }
1826
1827
1828 /* The socket must have its spinlock held when we get
1829 * here.
1830 *
1831 * We have a potential double-lock case here, so even when
1832 * doing backlog processing we use the BH locking scheme.
1833 * This is because we cannot sleep with the original spinlock
1834 * held.
1835 */
1836 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1837 {
1838 struct sock *rsk;
1839 #ifdef CONFIG_TCP_MD5SIG
1840 /*
1841 * We really want to reject the packet as early as possible
1842 * if:
1843 * o We're expecting an MD5'd packet and there is no MD5 tcp option
1844 * o There is an MD5 option and we're not expecting one
1845 */
1846 if (tcp_v4_inbound_md5_hash(sk, skb))
1847 goto discard;
1848 #endif
1849
1850 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1851 struct dst_entry *dst = sk->sk_rx_dst;
1852
1853 sock_rps_save_rxhash(sk, skb);
1854 if (dst) {
1855 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1856 dst->ops->check(dst, 0) == NULL) {
1857 dst_release(dst);
1858 sk->sk_rx_dst = NULL;
1859 }
1860 }
1861 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1862 rsk = sk;
1863 goto reset;
1864 }
1865 return 0;
1866 }
1867
1868 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1869 goto csum_err;
1870
1871 if (sk->sk_state == TCP_LISTEN) {
1872 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1873 if (!nsk)
1874 goto discard;
1875
1876 if (nsk != sk) {
1877 sock_rps_save_rxhash(nsk, skb);
1878 if (tcp_child_process(sk, nsk, skb)) {
1879 rsk = nsk;
1880 goto reset;
1881 }
1882 return 0;
1883 }
1884 } else
1885 sock_rps_save_rxhash(sk, skb);
1886
1887 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1888 rsk = sk;
1889 goto reset;
1890 }
1891 return 0;
1892
1893 reset:
1894 tcp_v4_send_reset(rsk, skb);
1895 discard:
1896 kfree_skb(skb);
1897 /* Be careful here. If this function gets more complicated and
1898 * gcc suffers from register pressure on the x86, sk (in %ebx)
1899 * might be destroyed here. This current version compiles correctly,
1900 * but you have been warned.
1901 */
1902 return 0;
1903
1904 csum_err:
1905 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1906 goto discard;
1907 }
1908 EXPORT_SYMBOL(tcp_v4_do_rcv);
1909
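/* Early demultiplexing, run from the IP input path before the normal socket
 * lookup: find an already established socket for this 4-tuple and, if one
 * exists, attach it to the skb (with sock_edemux as its destructor).  For
 * non-TIME_WAIT sockets whose cached input route still matches the incoming
 * device, the route is also set on the skb so that tcp_v4_rcv() can skip
 * both the socket lookup and the routing decision.
 */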
1910 void tcp_v4_early_demux(struct sk_buff *skb)
1911 {
1912 const struct iphdr *iph;
1913 const struct tcphdr *th;
1914 struct sock *sk;
1915
1916 if (skb->pkt_type != PACKET_HOST)
1917 return;
1918
1919 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1920 return;
1921
1922 iph = ip_hdr(skb);
1923 th = tcp_hdr(skb);
1924
1925 if (th->doff < sizeof(struct tcphdr) / 4)
1926 return;
1927
1928 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1929 iph->saddr, th->source,
1930 iph->daddr, ntohs(th->dest),
1931 skb->skb_iif);
1932 if (sk) {
1933 skb->sk = sk;
1934 skb->destructor = sock_edemux;
1935 if (sk->sk_state != TCP_TIME_WAIT) {
1936 struct dst_entry *dst = sk->sk_rx_dst;
1937
1938 if (dst)
1939 dst = dst_check(dst, 0);
1940 if (dst &&
1941 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1942 skb_dst_set_noref(skb, dst);
1943 }
1944 }
1945 }
1946
1947 /*
1948 * From tcp_input.c
1949 */
1950
1951 int tcp_v4_rcv(struct sk_buff *skb)
1952 {
1953 const struct iphdr *iph;
1954 const struct tcphdr *th;
1955 struct sock *sk;
1956 int ret;
1957 struct net *net = dev_net(skb->dev);
1958
1959 if (skb->pkt_type != PACKET_HOST)
1960 goto discard_it;
1961
1962 /* Count it even if it's bad */
1963 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1964
1965 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1966 goto discard_it;
1967
1968 th = tcp_hdr(skb);
1969
1970 if (th->doff < sizeof(struct tcphdr) / 4)
1971 goto bad_packet;
1972 if (!pskb_may_pull(skb, th->doff * 4))
1973 goto discard_it;
1974
1975 /* An explanation is required here, I think.
1976 * Packet length and doff are validated by header prediction,
1977 * provided the case of th->doff == 0 is eliminated.
1978 * So, we defer the checks. */
1979 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1980 goto bad_packet;
1981
1982 th = tcp_hdr(skb);
1983 iph = ip_hdr(skb);
1984 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1985 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1986 skb->len - th->doff * 4);
1987 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1988 TCP_SKB_CB(skb)->when = 0;
1989 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1990 TCP_SKB_CB(skb)->sacked = 0;
1991
1992 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1993 if (!sk)
1994 goto no_tcp_socket;
1995
1996 process:
1997 if (sk->sk_state == TCP_TIME_WAIT)
1998 goto do_time_wait;
1999
2000 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2001 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2002 goto discard_and_relse;
2003 }
2004
2005 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2006 goto discard_and_relse;
2007 nf_reset(skb);
2008
2009 if (sk_filter(sk, skb))
2010 goto discard_and_relse;
2011
2012 skb->dev = NULL;
2013
2014 bh_lock_sock_nested(sk);
2015 ret = 0;
2016 if (!sock_owned_by_user(sk)) {
2017 #ifdef CONFIG_NET_DMA
2018 struct tcp_sock *tp = tcp_sk(sk);
2019 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2020 tp->ucopy.dma_chan = net_dma_find_channel();
2021 if (tp->ucopy.dma_chan)
2022 ret = tcp_v4_do_rcv(sk, skb);
2023 else
2024 #endif
2025 {
2026 if (!tcp_prequeue(sk, skb))
2027 ret = tcp_v4_do_rcv(sk, skb);
2028 }
2029 } else if (unlikely(sk_add_backlog(sk, skb,
2030 sk->sk_rcvbuf + sk->sk_sndbuf))) {
2031 bh_unlock_sock(sk);
2032 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2033 goto discard_and_relse;
2034 }
2035 bh_unlock_sock(sk);
2036
2037 sock_put(sk);
2038
2039 return ret;
2040
2041 no_tcp_socket:
2042 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2043 goto discard_it;
2044
2045 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2046 bad_packet:
2047 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2048 } else {
2049 tcp_v4_send_reset(NULL, skb);
2050 }
2051
2052 discard_it:
2053 /* Discard frame. */
2054 kfree_skb(skb);
2055 return 0;
2056
2057 discard_and_relse:
2058 sock_put(sk);
2059 goto discard_it;
2060
2061 do_time_wait:
2062 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2063 inet_twsk_put(inet_twsk(sk));
2064 goto discard_it;
2065 }
2066
2067 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2068 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2069 inet_twsk_put(inet_twsk(sk));
2070 goto discard_it;
2071 }
2072 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2073 case TCP_TW_SYN: {
2074 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2075 &tcp_hashinfo,
2076 iph->daddr, th->dest,
2077 inet_iif(skb));
2078 if (sk2) {
2079 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2080 inet_twsk_put(inet_twsk(sk));
2081 sk = sk2;
2082 goto process;
2083 }
2084 /* Fall through to ACK */
2085 }
2086 case TCP_TW_ACK:
2087 tcp_v4_timewait_ack(sk, skb);
2088 break;
2089 case TCP_TW_RST:
2090 goto no_tcp_socket;
2091 case TCP_TW_SUCCESS:;
2092 }
2093 goto discard_it;
2094 }
2095
2096 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2097 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2098 .twsk_unique = tcp_twsk_unique,
2099 .twsk_destructor= tcp_twsk_destructor,
2100 };
2101
2102 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2103 {
2104 struct dst_entry *dst = skb_dst(skb);
2105
2106 dst_hold(dst);
2107 sk->sk_rx_dst = dst;
2108 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2109 }
2110 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2111
2112 const struct inet_connection_sock_af_ops ipv4_specific = {
2113 .queue_xmit = ip_queue_xmit,
2114 .send_check = tcp_v4_send_check,
2115 .rebuild_header = inet_sk_rebuild_header,
2116 .sk_rx_dst_set = inet_sk_rx_dst_set,
2117 .conn_request = tcp_v4_conn_request,
2118 .syn_recv_sock = tcp_v4_syn_recv_sock,
2119 .net_header_len = sizeof(struct iphdr),
2120 .setsockopt = ip_setsockopt,
2121 .getsockopt = ip_getsockopt,
2122 .addr2sockaddr = inet_csk_addr2sockaddr,
2123 .sockaddr_len = sizeof(struct sockaddr_in),
2124 .bind_conflict = inet_csk_bind_conflict,
2125 #ifdef CONFIG_COMPAT
2126 .compat_setsockopt = compat_ip_setsockopt,
2127 .compat_getsockopt = compat_ip_getsockopt,
2128 #endif
2129 };
2130 EXPORT_SYMBOL(ipv4_specific);
2131
2132 #ifdef CONFIG_TCP_MD5SIG
2133 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2134 .md5_lookup = tcp_v4_md5_lookup,
2135 .calc_md5_hash = tcp_v4_md5_hash_skb,
2136 .md5_parse = tcp_v4_parse_md5_keys,
2137 };
2138 #endif
2139
2140 /* NOTE: A lot of things are set to zero explicitly by the call to
2141 * sk_alloc(), so they need not be done here.
2142 */
2143 static int tcp_v4_init_sock(struct sock *sk)
2144 {
2145 struct inet_connection_sock *icsk = inet_csk(sk);
2146
2147 tcp_init_sock(sk);
2148
2149 icsk->icsk_af_ops = &ipv4_specific;
2150
2151 #ifdef CONFIG_TCP_MD5SIG
2152 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2153 #endif
2154
2155 return 0;
2156 }
2157
2158 void tcp_v4_destroy_sock(struct sock *sk)
2159 {
2160 struct tcp_sock *tp = tcp_sk(sk);
2161
2162 tcp_clear_xmit_timers(sk);
2163
2164 tcp_cleanup_congestion_control(sk);
2165
2166 /* Clean up the write buffer. */
2167 tcp_write_queue_purge(sk);
2168
2169 /* Cleans up our, hopefully empty, out_of_order_queue. */
2170 __skb_queue_purge(&tp->out_of_order_queue);
2171
2172 #ifdef CONFIG_TCP_MD5SIG
2173 /* Clean up the MD5 key list, if any */
2174 if (tp->md5sig_info) {
2175 tcp_clear_md5_list(sk);
2176 kfree_rcu(tp->md5sig_info, rcu);
2177 tp->md5sig_info = NULL;
2178 }
2179 #endif
2180
2181 #ifdef CONFIG_NET_DMA
2182 /* Cleans up our sk_async_wait_queue */
2183 __skb_queue_purge(&sk->sk_async_wait_queue);
2184 #endif
2185
2186 /* Clean up the prequeue; it really should already be empty. */
2187 __skb_queue_purge(&tp->ucopy.prequeue);
2188
2189 /* Clean up a referenced TCP bind bucket. */
2190 if (inet_csk(sk)->icsk_bind_hash)
2191 inet_put_port(sk);
2192
2193 /* TCP Cookie Transactions */
2194 if (tp->cookie_values != NULL) {
2195 kref_put(&tp->cookie_values->kref,
2196 tcp_cookie_values_release);
2197 tp->cookie_values = NULL;
2198 }
2199 BUG_ON(tp->fastopen_rsk != NULL);
2200
2201 /* If the socket was aborted during a connect operation */
2202 tcp_free_fastopen_req(tp);
2203
2204 sk_sockets_allocated_dec(sk);
2205 sock_release_memcg(sk);
2206 }
2207 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2208
2209 #ifdef CONFIG_PROC_FS
2210 /* Proc filesystem TCP sock list dumping. */
2211
2212 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2213 {
2214 return hlist_nulls_empty(head) ? NULL :
2215 list_entry(head->first, struct inet_timewait_sock, tw_node);
2216 }
2217
2218 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2219 {
2220 return !is_a_nulls(tw->tw_node.next) ?
2221 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2222 }
2223
2224 /*
2225 * Get the next listener socket following cur. If cur is NULL, get the first
2226 * socket starting from the bucket given in st->bucket; when st->bucket is zero the
2227 * very first socket in the hash table is returned.
2228 */
2229 static void *listening_get_next(struct seq_file *seq, void *cur)
2230 {
2231 struct inet_connection_sock *icsk;
2232 struct hlist_nulls_node *node;
2233 struct sock *sk = cur;
2234 struct inet_listen_hashbucket *ilb;
2235 struct tcp_iter_state *st = seq->private;
2236 struct net *net = seq_file_net(seq);
2237
2238 if (!sk) {
2239 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2240 spin_lock_bh(&ilb->lock);
2241 sk = sk_nulls_head(&ilb->head);
2242 st->offset = 0;
2243 goto get_sk;
2244 }
2245 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2246 ++st->num;
2247 ++st->offset;
2248
2249 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2250 struct request_sock *req = cur;
2251
2252 icsk = inet_csk(st->syn_wait_sk);
2253 req = req->dl_next;
2254 while (1) {
2255 while (req) {
2256 if (req->rsk_ops->family == st->family) {
2257 cur = req;
2258 goto out;
2259 }
2260 req = req->dl_next;
2261 }
2262 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2263 break;
2264 get_req:
2265 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2266 }
2267 sk = sk_nulls_next(st->syn_wait_sk);
2268 st->state = TCP_SEQ_STATE_LISTENING;
2269 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2270 } else {
2271 icsk = inet_csk(sk);
2272 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2273 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2274 goto start_req;
2275 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2276 sk = sk_nulls_next(sk);
2277 }
2278 get_sk:
2279 sk_nulls_for_each_from(sk, node) {
2280 if (!net_eq(sock_net(sk), net))
2281 continue;
2282 if (sk->sk_family == st->family) {
2283 cur = sk;
2284 goto out;
2285 }
2286 icsk = inet_csk(sk);
2287 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2288 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2289 start_req:
2290 st->uid = sock_i_uid(sk);
2291 st->syn_wait_sk = sk;
2292 st->state = TCP_SEQ_STATE_OPENREQ;
2293 st->sbucket = 0;
2294 goto get_req;
2295 }
2296 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2297 }
2298 spin_unlock_bh(&ilb->lock);
2299 st->offset = 0;
2300 if (++st->bucket < INET_LHTABLE_SIZE) {
2301 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2302 spin_lock_bh(&ilb->lock);
2303 sk = sk_nulls_head(&ilb->head);
2304 goto get_sk;
2305 }
2306 cur = NULL;
2307 out:
2308 return cur;
2309 }
2310
2311 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2312 {
2313 struct tcp_iter_state *st = seq->private;
2314 void *rc;
2315
2316 st->bucket = 0;
2317 st->offset = 0;
2318 rc = listening_get_next(seq, NULL);
2319
2320 while (rc && *pos) {
2321 rc = listening_get_next(seq, rc);
2322 --*pos;
2323 }
2324 return rc;
2325 }
2326
2327 static inline bool empty_bucket(struct tcp_iter_state *st)
2328 {
2329 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2330 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2331 }
2332
2333 /*
2334 * Get the first established socket, starting from the bucket given in st->bucket.
2335 * If st->bucket is zero, the very first socket in the hash is returned.
2336 */
2337 static void *established_get_first(struct seq_file *seq)
2338 {
2339 struct tcp_iter_state *st = seq->private;
2340 struct net *net = seq_file_net(seq);
2341 void *rc = NULL;
2342
2343 st->offset = 0;
2344 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2345 struct sock *sk;
2346 struct hlist_nulls_node *node;
2347 struct inet_timewait_sock *tw;
2348 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2349
2350 /* Lockless fast path for the common case of empty buckets */
2351 if (empty_bucket(st))
2352 continue;
2353
2354 spin_lock_bh(lock);
2355 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2356 if (sk->sk_family != st->family ||
2357 !net_eq(sock_net(sk), net)) {
2358 continue;
2359 }
2360 rc = sk;
2361 goto out;
2362 }
2363 st->state = TCP_SEQ_STATE_TIME_WAIT;
2364 inet_twsk_for_each(tw, node,
2365 &tcp_hashinfo.ehash[st->bucket].twchain) {
2366 if (tw->tw_family != st->family ||
2367 !net_eq(twsk_net(tw), net)) {
2368 continue;
2369 }
2370 rc = tw;
2371 goto out;
2372 }
2373 spin_unlock_bh(lock);
2374 st->state = TCP_SEQ_STATE_ESTABLISHED;
2375 }
2376 out:
2377 return rc;
2378 }
2379
2380 static void *established_get_next(struct seq_file *seq, void *cur)
2381 {
2382 struct sock *sk = cur;
2383 struct inet_timewait_sock *tw;
2384 struct hlist_nulls_node *node;
2385 struct tcp_iter_state *st = seq->private;
2386 struct net *net = seq_file_net(seq);
2387
2388 ++st->num;
2389 ++st->offset;
2390
2391 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2392 tw = cur;
2393 tw = tw_next(tw);
2394 get_tw:
2395 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2396 tw = tw_next(tw);
2397 }
2398 if (tw) {
2399 cur = tw;
2400 goto out;
2401 }
2402 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2403 st->state = TCP_SEQ_STATE_ESTABLISHED;
2404
2405 /* Look for the next non-empty bucket */
2406 st->offset = 0;
2407 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2408 empty_bucket(st))
2409 ;
2410 if (st->bucket > tcp_hashinfo.ehash_mask)
2411 return NULL;
2412
2413 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2414 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2415 } else
2416 sk = sk_nulls_next(sk);
2417
2418 sk_nulls_for_each_from(sk, node) {
2419 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2420 goto found;
2421 }
2422
2423 st->state = TCP_SEQ_STATE_TIME_WAIT;
2424 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2425 goto get_tw;
2426 found:
2427 cur = sk;
2428 out:
2429 return cur;
2430 }
2431
2432 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2433 {
2434 struct tcp_iter_state *st = seq->private;
2435 void *rc;
2436
2437 st->bucket = 0;
2438 rc = established_get_first(seq);
2439
2440 while (rc && pos) {
2441 rc = established_get_next(seq, rc);
2442 --pos;
2443 }
2444 return rc;
2445 }
2446
2447 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2448 {
2449 void *rc;
2450 struct tcp_iter_state *st = seq->private;
2451
2452 st->state = TCP_SEQ_STATE_LISTENING;
2453 rc = listening_get_idx(seq, &pos);
2454
2455 if (!rc) {
2456 st->state = TCP_SEQ_STATE_ESTABLISHED;
2457 rc = established_get_idx(seq, pos);
2458 }
2459
2460 return rc;
2461 }
2462
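/* Resume a seq_file walk at the position recorded by the previous read:
 * re-scan only the bucket saved in st->bucket, skipping st->offset entries,
 * instead of walking the whole table from the start, and leave st->num
 * unchanged so the entry numbering stays consistent across reads.
 */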
2463 static void *tcp_seek_last_pos(struct seq_file *seq)
2464 {
2465 struct tcp_iter_state *st = seq->private;
2466 int offset = st->offset;
2467 int orig_num = st->num;
2468 void *rc = NULL;
2469
2470 switch (st->state) {
2471 case TCP_SEQ_STATE_OPENREQ:
2472 case TCP_SEQ_STATE_LISTENING:
2473 if (st->bucket >= INET_LHTABLE_SIZE)
2474 break;
2475 st->state = TCP_SEQ_STATE_LISTENING;
2476 rc = listening_get_next(seq, NULL);
2477 while (offset-- && rc)
2478 rc = listening_get_next(seq, rc);
2479 if (rc)
2480 break;
2481 st->bucket = 0;
2482 /* Fallthrough */
2483 case TCP_SEQ_STATE_ESTABLISHED:
2484 case TCP_SEQ_STATE_TIME_WAIT:
2485 st->state = TCP_SEQ_STATE_ESTABLISHED;
2486 if (st->bucket > tcp_hashinfo.ehash_mask)
2487 break;
2488 rc = established_get_first(seq);
2489 while (offset-- && rc)
2490 rc = established_get_next(seq, rc);
2491 }
2492
2493 st->num = orig_num;
2494
2495 return rc;
2496 }
2497
2498 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2499 {
2500 struct tcp_iter_state *st = seq->private;
2501 void *rc;
2502
2503 if (*pos && *pos == st->last_pos) {
2504 rc = tcp_seek_last_pos(seq);
2505 if (rc)
2506 goto out;
2507 }
2508
2509 st->state = TCP_SEQ_STATE_LISTENING;
2510 st->num = 0;
2511 st->bucket = 0;
2512 st->offset = 0;
2513 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2514
2515 out:
2516 st->last_pos = *pos;
2517 return rc;
2518 }
2519
2520 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2521 {
2522 struct tcp_iter_state *st = seq->private;
2523 void *rc = NULL;
2524
2525 if (v == SEQ_START_TOKEN) {
2526 rc = tcp_get_idx(seq, 0);
2527 goto out;
2528 }
2529
2530 switch (st->state) {
2531 case TCP_SEQ_STATE_OPENREQ:
2532 case TCP_SEQ_STATE_LISTENING:
2533 rc = listening_get_next(seq, v);
2534 if (!rc) {
2535 st->state = TCP_SEQ_STATE_ESTABLISHED;
2536 st->bucket = 0;
2537 st->offset = 0;
2538 rc = established_get_first(seq);
2539 }
2540 break;
2541 case TCP_SEQ_STATE_ESTABLISHED:
2542 case TCP_SEQ_STATE_TIME_WAIT:
2543 rc = established_get_next(seq, v);
2544 break;
2545 }
2546 out:
2547 ++*pos;
2548 st->last_pos = *pos;
2549 return rc;
2550 }
2551
2552 static void tcp_seq_stop(struct seq_file *seq, void *v)
2553 {
2554 struct tcp_iter_state *st = seq->private;
2555
2556 switch (st->state) {
2557 case TCP_SEQ_STATE_OPENREQ:
2558 if (v) {
2559 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2560 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2561 }
2562 case TCP_SEQ_STATE_LISTENING:
2563 if (v != SEQ_START_TOKEN)
2564 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2565 break;
2566 case TCP_SEQ_STATE_TIME_WAIT:
2567 case TCP_SEQ_STATE_ESTABLISHED:
2568 if (v)
2569 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2570 break;
2571 }
2572 }
2573
2574 int tcp_seq_open(struct inode *inode, struct file *file)
2575 {
2576 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2577 struct tcp_iter_state *s;
2578 int err;
2579
2580 err = seq_open_net(inode, file, &afinfo->seq_ops,
2581 sizeof(struct tcp_iter_state));
2582 if (err < 0)
2583 return err;
2584
2585 s = ((struct seq_file *)file->private_data)->private;
2586 s->family = afinfo->family;
2587 s->last_pos = 0;
2588 return 0;
2589 }
2590 EXPORT_SYMBOL(tcp_seq_open);
2591
2592 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2593 {
2594 int rc = 0;
2595 struct proc_dir_entry *p;
2596
2597 afinfo->seq_ops.start = tcp_seq_start;
2598 afinfo->seq_ops.next = tcp_seq_next;
2599 afinfo->seq_ops.stop = tcp_seq_stop;
2600
2601 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2602 afinfo->seq_fops, afinfo);
2603 if (!p)
2604 rc = -ENOMEM;
2605 return rc;
2606 }
2607 EXPORT_SYMBOL(tcp_proc_register);
2608
2609 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2610 {
2611 proc_net_remove(net, afinfo->name);
2612 }
2613 EXPORT_SYMBOL(tcp_proc_unregister);
2614
2615 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2616 struct seq_file *f, int i, kuid_t uid, int *len)
2617 {
2618 const struct inet_request_sock *ireq = inet_rsk(req);
2619 long delta = req->expires - jiffies;
2620
2621 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2622 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2623 i,
2624 ireq->loc_addr,
2625 ntohs(inet_sk(sk)->inet_sport),
2626 ireq->rmt_addr,
2627 ntohs(ireq->rmt_port),
2628 TCP_SYN_RECV,
2629 0, 0, /* could print option size, but that is af dependent. */
2630 1, /* timers active (only the expire timer) */
2631 jiffies_delta_to_clock_t(delta),
2632 req->num_timeout,
2633 from_kuid_munged(seq_user_ns(f), uid),
2634 0, /* non standard timer */
2635 0, /* open_requests have no inode */
2636 atomic_read(&sk->sk_refcnt),
2637 req,
2638 len);
2639 }
2640
2641 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2642 {
2643 int timer_active;
2644 unsigned long timer_expires;
2645 const struct tcp_sock *tp = tcp_sk(sk);
2646 const struct inet_connection_sock *icsk = inet_csk(sk);
2647 const struct inet_sock *inet = inet_sk(sk);
2648 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2649 __be32 dest = inet->inet_daddr;
2650 __be32 src = inet->inet_rcv_saddr;
2651 __u16 destp = ntohs(inet->inet_dport);
2652 __u16 srcp = ntohs(inet->inet_sport);
2653 int rx_queue;
2654
2655 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2656 timer_active = 1;
2657 timer_expires = icsk->icsk_timeout;
2658 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2659 timer_active = 4;
2660 timer_expires = icsk->icsk_timeout;
2661 } else if (timer_pending(&sk->sk_timer)) {
2662 timer_active = 2;
2663 timer_expires = sk->sk_timer.expires;
2664 } else {
2665 timer_active = 0;
2666 timer_expires = jiffies;
2667 }
2668
2669 if (sk->sk_state == TCP_LISTEN)
2670 rx_queue = sk->sk_ack_backlog;
2671 else
2672 /*
2673 * Because we don't lock the socket, we might find a transient negative value.
2674 */
2675 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2676
2677 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2678 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2679 i, src, srcp, dest, destp, sk->sk_state,
2680 tp->write_seq - tp->snd_una,
2681 rx_queue,
2682 timer_active,
2683 jiffies_delta_to_clock_t(timer_expires - jiffies),
2684 icsk->icsk_retransmits,
2685 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2686 icsk->icsk_probes_out,
2687 sock_i_ino(sk),
2688 atomic_read(&sk->sk_refcnt), sk,
2689 jiffies_to_clock_t(icsk->icsk_rto),
2690 jiffies_to_clock_t(icsk->icsk_ack.ato),
2691 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2692 tp->snd_cwnd,
2693 sk->sk_state == TCP_LISTEN ?
2694 (fastopenq ? fastopenq->max_qlen : 0) :
2695 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2696 len);
2697 }
2698
2699 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2700 struct seq_file *f, int i, int *len)
2701 {
2702 __be32 dest, src;
2703 __u16 destp, srcp;
2704 long delta = tw->tw_ttd - jiffies;
2705
2706 dest = tw->tw_daddr;
2707 src = tw->tw_rcv_saddr;
2708 destp = ntohs(tw->tw_dport);
2709 srcp = ntohs(tw->tw_sport);
2710
2711 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2712 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2713 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2714 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2715 atomic_read(&tw->tw_refcnt), tw, len);
2716 }
2717
2718 #define TMPSZ 150
2719
2720 static int tcp4_seq_show(struct seq_file *seq, void *v)
2721 {
2722 struct tcp_iter_state *st;
2723 int len;
2724
2725 if (v == SEQ_START_TOKEN) {
2726 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2727 " sl local_address rem_address st tx_queue "
2728 "rx_queue tr tm->when retrnsmt uid timeout "
2729 "inode");
2730 goto out;
2731 }
2732 st = seq->private;
2733
2734 switch (st->state) {
2735 case TCP_SEQ_STATE_LISTENING:
2736 case TCP_SEQ_STATE_ESTABLISHED:
2737 get_tcp4_sock(v, seq, st->num, &len);
2738 break;
2739 case TCP_SEQ_STATE_OPENREQ:
2740 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2741 break;
2742 case TCP_SEQ_STATE_TIME_WAIT:
2743 get_timewait4_sock(v, seq, st->num, &len);
2744 break;
2745 }
2746 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2747 out:
2748 return 0;
2749 }
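/* For illustration only (all values below are made up, and the exact column
 * padding is approximate), a LISTEN socket on 127.0.0.1:8080 would be
 * rendered by the format strings above roughly as:
 *
 *   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *    0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 12345 1 ffff880012345678 100 0 0 10 0
 *
 * i.e. hex address:port pairs, the TCP state, queue sizes, timer info, the
 * owning uid, probe count and inode, followed by the extra fields printed by
 * get_tcp4_sock(): refcount, socket pointer, rto, ato, quick/pingpong,
 * snd_cwnd and the slow start threshold (or fastopen queue length for
 * listeners).
 */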
2750
2751 static const struct file_operations tcp_afinfo_seq_fops = {
2752 .owner = THIS_MODULE,
2753 .open = tcp_seq_open,
2754 .read = seq_read,
2755 .llseek = seq_lseek,
2756 .release = seq_release_net
2757 };
2758
2759 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2760 .name = "tcp",
2761 .family = AF_INET,
2762 .seq_fops = &tcp_afinfo_seq_fops,
2763 .seq_ops = {
2764 .show = tcp4_seq_show,
2765 },
2766 };
2767
2768 static int __net_init tcp4_proc_init_net(struct net *net)
2769 {
2770 return tcp_proc_register(net, &tcp4_seq_afinfo);
2771 }
2772
2773 static void __net_exit tcp4_proc_exit_net(struct net *net)
2774 {
2775 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2776 }
2777
2778 static struct pernet_operations tcp4_net_ops = {
2779 .init = tcp4_proc_init_net,
2780 .exit = tcp4_proc_exit_net,
2781 };
2782
2783 int __init tcp4_proc_init(void)
2784 {
2785 return register_pernet_subsys(&tcp4_net_ops);
2786 }
2787
2788 void tcp4_proc_exit(void)
2789 {
2790 unregister_pernet_subsys(&tcp4_net_ops);
2791 }
2792 #endif /* CONFIG_PROC_FS */
2793
2794 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2795 {
2796 const struct iphdr *iph = skb_gro_network_header(skb);
2797 __wsum wsum;
2798 __sum16 sum;
2799
2800 switch (skb->ip_summed) {
2801 case CHECKSUM_COMPLETE:
2802 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2803 skb->csum)) {
2804 skb->ip_summed = CHECKSUM_UNNECESSARY;
2805 break;
2806 }
2807 flush:
2808 NAPI_GRO_CB(skb)->flush = 1;
2809 return NULL;
2810
2811 case CHECKSUM_NONE:
2812 wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2813 skb_gro_len(skb), IPPROTO_TCP, 0);
2814 sum = csum_fold(skb_checksum(skb,
2815 skb_gro_offset(skb),
2816 skb_gro_len(skb),
2817 wsum));
2818 if (sum)
2819 goto flush;
2820
2821 skb->ip_summed = CHECKSUM_UNNECESSARY;
2822 break;
2823 }
2824
2825 return tcp_gro_receive(head, skb);
2826 }
2827
2828 int tcp4_gro_complete(struct sk_buff *skb)
2829 {
2830 const struct iphdr *iph = ip_hdr(skb);
2831 struct tcphdr *th = tcp_hdr(skb);
2832
2833 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2834 iph->saddr, iph->daddr, 0);
2835 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2836
2837 return tcp_gro_complete(skb);
2838 }
2839
2840 struct proto tcp_prot = {
2841 .name = "TCP",
2842 .owner = THIS_MODULE,
2843 .close = tcp_close,
2844 .connect = tcp_v4_connect,
2845 .disconnect = tcp_disconnect,
2846 .accept = inet_csk_accept,
2847 .ioctl = tcp_ioctl,
2848 .init = tcp_v4_init_sock,
2849 .destroy = tcp_v4_destroy_sock,
2850 .shutdown = tcp_shutdown,
2851 .setsockopt = tcp_setsockopt,
2852 .getsockopt = tcp_getsockopt,
2853 .recvmsg = tcp_recvmsg,
2854 .sendmsg = tcp_sendmsg,
2855 .sendpage = tcp_sendpage,
2856 .backlog_rcv = tcp_v4_do_rcv,
2857 .release_cb = tcp_release_cb,
2858 .mtu_reduced = tcp_v4_mtu_reduced,
2859 .hash = inet_hash,
2860 .unhash = inet_unhash,
2861 .get_port = inet_csk_get_port,
2862 .enter_memory_pressure = tcp_enter_memory_pressure,
2863 .sockets_allocated = &tcp_sockets_allocated,
2864 .orphan_count = &tcp_orphan_count,
2865 .memory_allocated = &tcp_memory_allocated,
2866 .memory_pressure = &tcp_memory_pressure,
2867 .sysctl_wmem = sysctl_tcp_wmem,
2868 .sysctl_rmem = sysctl_tcp_rmem,
2869 .max_header = MAX_TCP_HEADER,
2870 .obj_size = sizeof(struct tcp_sock),
2871 .slab_flags = SLAB_DESTROY_BY_RCU,
2872 .twsk_prot = &tcp_timewait_sock_ops,
2873 .rsk_prot = &tcp_request_sock_ops,
2874 .h.hashinfo = &tcp_hashinfo,
2875 .no_autobind = true,
2876 #ifdef CONFIG_COMPAT
2877 .compat_setsockopt = compat_tcp_setsockopt,
2878 .compat_getsockopt = compat_tcp_getsockopt,
2879 #endif
2880 #ifdef CONFIG_MEMCG_KMEM
2881 .init_cgroup = tcp_init_cgroup,
2882 .destroy_cgroup = tcp_destroy_cgroup,
2883 .proto_cgroup = tcp_proto_cgroup,
2884 #endif
2885 };
2886 EXPORT_SYMBOL(tcp_prot);
2887
2888 static int __net_init tcp_sk_init(struct net *net)
2889 {
2890 return 0;
2891 }
2892
2893 static void __net_exit tcp_sk_exit(struct net *net)
2894 {
2895 }
2896
2897 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2898 {
2899 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2900 }
2901
2902 static struct pernet_operations __net_initdata tcp_sk_ops = {
2903 .init = tcp_sk_init,
2904 .exit = tcp_sk_exit,
2905 .exit_batch = tcp_sk_exit_batch,
2906 };
2907
2908 void __init tcp_v4_init(void)
2909 {
2910 inet_hashinfo_init(&tcp_hashinfo);
2911 if (register_pernet_subsys(&tcp_sk_ops))
2912 panic("Failed to create the TCP control socket.\n");
2913 }