net/ipv4/tcp_ipv4.c (android_kernel_alcatel_ttab, Linux v3.10.70)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24 /*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91
92
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95 __be32 daddr, __be32 saddr, const struct tcphdr *th);
96 #endif
97
98 struct inet_hashinfo tcp_hashinfo;
99 EXPORT_SYMBOL(tcp_hashinfo);
100
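/* Derive the initial sequence number for a connection from its 4-tuple.
 * secure_tcp_sequence_number() mixes the addresses and ports with a
 * per-boot secret plus a clock component, so ISNs keep advancing but are
 * hard for an off-path attacker to guess.
 */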
101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102 {
103 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104 ip_hdr(skb)->saddr,
105 tcp_hdr(skb)->dest,
106 tcp_hdr(skb)->source);
107 }
108
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 struct tcp_sock *tp = tcp_sk(sk);
113
114 /* With PAWS, it is safe from the viewpoint
115 of data integrity. Even without PAWS it is safe provided sequence
116 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
117
118 Actually, the idea is close to VJ's, only the timestamp cache is
119 held not per host but per port pair, and the TW bucket is used as the
120 state holder.
121 
122 If the TW bucket has already been destroyed we fall back to VJ's scheme
123 and use the initial timestamp retrieved from the peer table.
124 */
125 if (tcptw->tw_ts_recent_stamp &&
126 (twp == NULL || (sysctl_tcp_tw_reuse &&
127 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129 if (tp->write_seq == 0)
130 tp->write_seq = 1;
131 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
132 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133 sock_hold(sktw);
134 return 1;
135 }
136
137 return 0;
138 }
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140
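/* Note: the reuse above is taken either when the caller passed twp == NULL,
 * or when the net.ipv4.tcp_tw_reuse sysctl (sysctl_tcp_tw_reuse above) is
 * enabled and the last timestamp seen from the peer is more than one second
 * old, so PAWS can still tell the old and new incarnations apart.
 */
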
141 /* This will initiate an outgoing connection. */
142 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143 {
144 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
145 struct inet_sock *inet = inet_sk(sk);
146 struct tcp_sock *tp = tcp_sk(sk);
147 __be16 orig_sport, orig_dport;
148 __be32 daddr, nexthop;
149 struct flowi4 *fl4;
150 struct rtable *rt;
151 int err;
152 struct ip_options_rcu *inet_opt;
153
154 if (addr_len < sizeof(struct sockaddr_in))
155 return -EINVAL;
156
157 if (usin->sin_family != AF_INET)
158 return -EAFNOSUPPORT;
159
160 nexthop = daddr = usin->sin_addr.s_addr;
161 inet_opt = rcu_dereference_protected(inet->inet_opt,
162 sock_owned_by_user(sk));
163 if (inet_opt && inet_opt->opt.srr) {
164 if (!daddr)
165 return -EINVAL;
166 nexthop = inet_opt->opt.faddr;
167 }
168
169 orig_sport = inet->inet_sport;
170 orig_dport = usin->sin_port;
171 fl4 = &inet->cork.fl.u.ip4;
172 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
173 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174 IPPROTO_TCP,
175 orig_sport, orig_dport, sk, true);
176 if (IS_ERR(rt)) {
177 err = PTR_ERR(rt);
178 if (err == -ENETUNREACH)
179 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
180 return err;
181 }
182
183 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184 ip_rt_put(rt);
185 return -ENETUNREACH;
186 }
187
188 if (!inet_opt || !inet_opt->opt.srr)
189 daddr = fl4->daddr;
190
191 if (!inet->inet_saddr)
192 inet->inet_saddr = fl4->saddr;
193 inet->inet_rcv_saddr = inet->inet_saddr;
194
195 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
196 /* Reset inherited state */
197 tp->rx_opt.ts_recent = 0;
198 tp->rx_opt.ts_recent_stamp = 0;
199 if (likely(!tp->repair))
200 tp->write_seq = 0;
201 }
202
203 if (tcp_death_row.sysctl_tw_recycle &&
204 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
205 tcp_fetch_timewait_stamp(sk, &rt->dst);
206
207 inet->inet_dport = usin->sin_port;
208 inet->inet_daddr = daddr;
209
210 inet_csk(sk)->icsk_ext_hdr_len = 0;
211 if (inet_opt)
212 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
213
214 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
215
216 /* Socket identity is still unknown (sport may be zero).
217 * However we set state to SYN-SENT and, without releasing the socket
218 * lock, select a source port, enter ourselves into the hash tables and
219 * complete initialization after this.
220 */
221 tcp_set_state(sk, TCP_SYN_SENT);
222 err = inet_hash_connect(&tcp_death_row, sk);
223 if (err)
224 goto failure;
225
226 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
227 inet->inet_sport, inet->inet_dport, sk);
228 if (IS_ERR(rt)) {
229 err = PTR_ERR(rt);
230 rt = NULL;
231 goto failure;
232 }
233 /* OK, now commit destination to socket. */
234 sk->sk_gso_type = SKB_GSO_TCPV4;
235 sk_setup_caps(sk, &rt->dst);
236 printk(KERN_INFO "[socket_conn]IPV4 socket[%lu] sport:%u \n", SOCK_INODE(sk->sk_socket)->i_ino, ntohs(inet->inet_sport));
237 if (!tp->write_seq && likely(!tp->repair))
238 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 inet->inet_daddr,
240 inet->inet_sport,
241 usin->sin_port);
242
243 inet->inet_id = tp->write_seq ^ jiffies;
244
245 err = tcp_connect(sk);
246
247 rt = NULL;
248 if (err)
249 goto failure;
250
251 return 0;
252
253 failure:
254 /*
255 * This unhashes the socket and releases the local port,
256 * if necessary.
257 */
258 tcp_set_state(sk, TCP_CLOSE);
259 ip_rt_put(rt);
260 sk->sk_route_caps = 0;
261 inet->inet_dport = 0;
262 return err;
263 }
264 EXPORT_SYMBOL(tcp_v4_connect);
265
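/* Userspace view (illustrative sketch only, not part of this file's API):
 * a plain blocking connect() on an AF_INET stream socket ends up here via
 * inet_stream_connect(), which holds the socket lock around the call:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The address 192.0.2.1 and the port are placeholders for the example.
 */
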
266 /*
267 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
268 * It can be called through tcp_release_cb() if socket was owned by user
269 * at the time tcp_v4_err() was called to handle ICMP message.
270 */
271 void tcp_v4_mtu_reduced(struct sock *sk)
272 {
273 struct dst_entry *dst;
274 struct inet_sock *inet = inet_sk(sk);
275 u32 mtu = tcp_sk(sk)->mtu_info;
276
277 dst = inet_csk_update_pmtu(sk, mtu);
278 if (!dst)
279 return;
280
281 /* Something is about to go wrong... Remember the soft error
282 * in case this connection is not able to recover.
283 */
284 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
285 sk->sk_err_soft = EMSGSIZE;
286
287 mtu = dst_mtu(dst);
288
289 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
290 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
291 tcp_sync_mss(sk, mtu);
292
293 /* Resend the TCP packet because it's
294 * clear that the old packet has been
295 * dropped. This is the new "fast" path mtu
296 * discovery.
297 */
298 tcp_simple_retransmit(sk);
299 } /* else let the usual retransmit timer handle it */
300 }
301 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
302
303 static void do_redirect(struct sk_buff *skb, struct sock *sk)
304 {
305 struct dst_entry *dst = __sk_dst_check(sk, 0);
306
307 if (dst)
308 dst->ops->redirect(dst, sk, skb);
309 }
310
311 /*
312 * This routine is called by the ICMP module when it gets some
313 * sort of error condition. If err < 0 then the socket should
314 * be closed and the error returned to the user. If err > 0
315 * it's just the icmp type << 8 | icmp code. After adjustment
316 * header points to the first 8 bytes of the tcp header. We need
317 * to find the appropriate port.
318 *
319 * The locking strategy used here is very "optimistic". When
320 * someone else accesses the socket the ICMP is just dropped
321 * and for some paths there is no check at all.
322 * A more general error queue to queue errors for later handling
323 * is probably better.
324 *
325 */
326
327 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
328 {
329 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
330 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
331 struct inet_connection_sock *icsk;
332 struct tcp_sock *tp;
333 struct inet_sock *inet;
334 const int type = icmp_hdr(icmp_skb)->type;
335 const int code = icmp_hdr(icmp_skb)->code;
336 struct sock *sk;
337 struct sk_buff *skb;
338 struct request_sock *req;
339 __u32 seq;
340 __u32 remaining;
341 int err;
342 struct net *net = dev_net(icmp_skb->dev);
343
344 if (icmp_skb->len < (iph->ihl << 2) + 8) {
345 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
346 return;
347 }
348
349 sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
350 iph->saddr, th->source, inet_iif(icmp_skb));
351 if (!sk) {
352 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
353 return;
354 }
355 if (sk->sk_state == TCP_TIME_WAIT) {
356 inet_twsk_put(inet_twsk(sk));
357 return;
358 }
359
360 bh_lock_sock(sk);
361 /* If too many ICMPs get dropped on busy
362 * servers this needs to be solved differently.
363 * We do take care of PMTU discovery (RFC1191) special case :
364 * we can receive locally generated ICMP messages while socket is held.
365 */
366 if (sock_owned_by_user(sk)) {
367 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
368 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369 }
370 if (sk->sk_state == TCP_CLOSE)
371 goto out;
372
373 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
374 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
375 goto out;
376 }
377
378 icsk = inet_csk(sk);
379 tp = tcp_sk(sk);
380 req = tp->fastopen_rsk;
381 seq = ntohl(th->seq);
382 if (sk->sk_state != TCP_LISTEN &&
383 !between(seq, tp->snd_una, tp->snd_nxt) &&
384 (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
385 /* For a Fast Open socket, allow seq to be snt_isn. */
386 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
387 goto out;
388 }
389
390 switch (type) {
391 case ICMP_REDIRECT:
392 do_redirect(icmp_skb, sk);
393 goto out;
394 case ICMP_SOURCE_QUENCH:
395 /* Just silently ignore these. */
396 goto out;
397 case ICMP_PARAMETERPROB:
398 err = EPROTO;
399 break;
400 case ICMP_DEST_UNREACH:
401 if (code > NR_ICMP_UNREACH)
402 goto out;
403
404 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
405 /* We are not interested in TCP_LISTEN and open_requests
406 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
407 * they should go through unfragmented).
408 */
409 if (sk->sk_state == TCP_LISTEN)
410 goto out;
411
412 tp->mtu_info = info;
413 if (!sock_owned_by_user(sk)) {
414 tcp_v4_mtu_reduced(sk);
415 } else {
416 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
417 sock_hold(sk);
418 }
419 goto out;
420 }
421
422 err = icmp_err_convert[code].errno;
423 /* check if icmp_skb allows revert of backoff
424 * (see draft-zimmermann-tcp-lcd) */
425 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
426 break;
427 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
428 !icsk->icsk_backoff)
429 break;
430
431 /* XXX (TFO) - revisit the following logic for TFO */
432
433 if (sock_owned_by_user(sk))
434 break;
435
436 icsk->icsk_backoff--;
437 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
438 TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
439 tcp_bound_rto(sk);
440
441 skb = tcp_write_queue_head(sk);
442 BUG_ON(!skb);
443
444 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
445 tcp_time_stamp - TCP_SKB_CB(skb)->when);
446
447 if (remaining) {
448 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
449 remaining, sysctl_tcp_rto_max);
450 } else {
451 /* RTO revert clocked out retransmission.
452 * Will retransmit now */
453 tcp_retransmit_timer(sk);
454 }
455
456 break;
457 case ICMP_TIME_EXCEEDED:
458 err = EHOSTUNREACH;
459 break;
460 default:
461 goto out;
462 }
463
464 /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
465 * than following the TCP_SYN_RECV case and closing the socket,
466 * we ignore the ICMP error and keep trying like a fully established
467 * socket. Is this the right thing to do?
468 */
469 if (req && req->sk == NULL)
470 goto out;
471
472 switch (sk->sk_state) {
473 struct request_sock *req, **prev;
474 case TCP_LISTEN:
475 if (sock_owned_by_user(sk))
476 goto out;
477
478 req = inet_csk_search_req(sk, &prev, th->dest,
479 iph->daddr, iph->saddr);
480 if (!req)
481 goto out;
482
483 /* ICMPs are not backlogged, hence we cannot get
484 an established socket here.
485 */
486 WARN_ON(req->sk);
487
488 if (seq != tcp_rsk(req)->snt_isn) {
489 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
490 goto out;
491 }
492
493 /*
494 * Still in SYN_RECV, just remove it silently.
495 * There is no good way to pass the error to the newly
496 * created socket, and POSIX does not want network
497 * errors returned from accept().
498 */
499 inet_csk_reqsk_queue_drop(sk, req, prev);
500 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
501 goto out;
502
503 case TCP_SYN_SENT:
504 case TCP_SYN_RECV: /* Cannot normally happen;
505 it can e.g. if SYNs crossed,
506 or with Fast Open.
507 */
508 if (!sock_owned_by_user(sk)) {
509 sk->sk_err = err;
510
511 sk->sk_error_report(sk);
512
513 tcp_done(sk);
514 } else {
515 sk->sk_err_soft = err;
516 }
517 goto out;
518 }
519
520 /* If we've already connected we will keep trying
521 * until we time out, or the user gives up.
522 *
523 * rfc1122 4.2.3.9 allows us to consider as hard errors
524 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
525 * but it is obsoleted by pmtu discovery).
526 *
527 * Note that in the modern internet, where routing is unreliable
528 * and broken firewalls sit in every dark corner sending random
529 * errors ordered by their masters, even these two messages finally
530 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
531 *
532 * Now we are in compliance with RFCs.
533 * --ANK (980905)
534 */
535
536 inet = inet_sk(sk);
537 if (!sock_owned_by_user(sk) && inet->recverr) {
538 sk->sk_err = err;
539 sk->sk_error_report(sk);
540 } else { /* Only an error on timeout */
541 sk->sk_err_soft = err;
542 }
543
544 out:
545 bh_unlock_sock(sk);
546 sock_put(sk);
547 }
548
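/* Fill in the TCP checksum of an outgoing segment. With CHECKSUM_PARTIAL
 * only the pseudo-header sum is stored in th->check and csum_start /
 * csum_offset tell the NIC (or skb_checksum_help()) where to finish the
 * job; otherwise the full checksum is computed in software right here.
 */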
549 static void __tcp_v4_send_check(struct sk_buff *skb,
550 __be32 saddr, __be32 daddr)
551 {
552 struct tcphdr *th = tcp_hdr(skb);
553
554 if (skb->ip_summed == CHECKSUM_PARTIAL) {
555 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
556 skb->csum_start = skb_transport_header(skb) - skb->head;
557 skb->csum_offset = offsetof(struct tcphdr, check);
558 } else {
559 th->check = tcp_v4_check(skb->len, saddr, daddr,
560 csum_partial(th,
561 th->doff << 2,
562 skb->csum));
563 }
564 }
565
566 /* This routine computes an IPv4 TCP checksum. */
567 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
568 {
569 const struct inet_sock *inet = inet_sk(sk);
570
571 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
572 }
573 EXPORT_SYMBOL(tcp_v4_send_check);
574
575 int tcp_v4_gso_send_check(struct sk_buff *skb)
576 {
577 const struct iphdr *iph;
578 struct tcphdr *th;
579
580 if (!pskb_may_pull(skb, sizeof(*th)))
581 return -EINVAL;
582
583 iph = ip_hdr(skb);
584 th = tcp_hdr(skb);
585
586 th->check = 0;
587 skb->ip_summed = CHECKSUM_PARTIAL;
588 __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
589 return 0;
590 }
591
592 /*
593 * This routine will send an RST to the other tcp.
594 *
595 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
596 * for the reset?
597 * Answer: if a packet caused the RST, it is not for a socket
598 * existing in our system; if it is matched to a socket,
599 * it is just a duplicate segment or a bug in the other side's TCP.
600 * So we build the reply based only on the parameters
601 * that arrived with the segment.
602 * Exception: precedence violation. We do not implement it in any case.
603 */
604
605 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
606 {
607 const struct tcphdr *th = tcp_hdr(skb);
608 struct {
609 struct tcphdr th;
610 #ifdef CONFIG_TCP_MD5SIG
611 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
612 #endif
613 } rep;
614 struct ip_reply_arg arg;
615 #ifdef CONFIG_TCP_MD5SIG
616 struct tcp_md5sig_key *key;
617 const __u8 *hash_location = NULL;
618 unsigned char newhash[16];
619 int genhash;
620 struct sock *sk1 = NULL;
621 #endif
622 struct net *net;
623
624 /* Never send a reset in response to a reset. */
625 if (th->rst)
626 return;
627
628 if (skb_rtable(skb)->rt_type != RTN_LOCAL)
629 return;
630
631 /* Swap the send and the receive. */
632 memset(&rep, 0, sizeof(rep));
633 rep.th.dest = th->source;
634 rep.th.source = th->dest;
635 rep.th.doff = sizeof(struct tcphdr) / 4;
636 rep.th.rst = 1;
637
638 if (th->ack) {
639 rep.th.seq = th->ack_seq;
640 } else {
641 rep.th.ack = 1;
642 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
643 skb->len - (th->doff << 2));
644 }
645
646 memset(&arg, 0, sizeof(arg));
647 arg.iov[0].iov_base = (unsigned char *)&rep;
648 arg.iov[0].iov_len = sizeof(rep.th);
649
650 #ifdef CONFIG_TCP_MD5SIG
651 hash_location = tcp_parse_md5sig_option(th);
652 if (!sk && hash_location) {
653 /*
654 * active side is lost. Try to find listening socket through
655 * source port, and then find md5 key through listening socket.
656 * we do not lose security here:
657 * Incoming packet is checked with md5 hash with finding key,
658 * no RST generated if md5 hash doesn't match.
659 */
660 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
661 &tcp_hashinfo, ip_hdr(skb)->saddr,
662 th->source, ip_hdr(skb)->daddr,
663 ntohs(th->source), inet_iif(skb));
664 /* don't send rst if it can't find key */
665 if (!sk1)
666 return;
667 rcu_read_lock();
668 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
669 &ip_hdr(skb)->saddr, AF_INET);
670 if (!key)
671 goto release_sk1;
672
673 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
674 if (genhash || memcmp(hash_location, newhash, 16) != 0)
675 goto release_sk1;
676 } else {
677 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
678 &ip_hdr(skb)->saddr,
679 AF_INET) : NULL;
680 }
681
682 if (key) {
683 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
684 (TCPOPT_NOP << 16) |
685 (TCPOPT_MD5SIG << 8) |
686 TCPOLEN_MD5SIG);
687 /* Update length and the length the header thinks exists */
688 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
689 rep.th.doff = arg.iov[0].iov_len / 4;
690
691 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
692 key, ip_hdr(skb)->saddr,
693 ip_hdr(skb)->daddr, &rep.th);
694 }
695 #endif
696 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
697 ip_hdr(skb)->saddr, /* XXX */
698 arg.iov[0].iov_len, IPPROTO_TCP, 0);
699 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
700 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
701 /* When socket is gone, all binding information is lost.
702 * routing might fail in this case. No choice here, if we choose to force
703 * input interface, we will misroute in case of asymmetric route.
704 */
705 if (sk)
706 arg.bound_dev_if = sk->sk_bound_dev_if;
707
708 net = dev_net(skb_dst(skb)->dev);
709 arg.tos = ip_hdr(skb)->tos;
710 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
711 skb, ip_hdr(skb)->saddr,
712 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
713
714 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
715 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
716
717 #ifdef CONFIG_TCP_MD5SIG
718 release_sk1:
719 if (sk1) {
720 rcu_read_unlock();
721 sock_put(sk1);
722 }
723 #endif
724 }
725
726 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
727 outside of socket context, is certainly ugly. What can I do?
728 */
729
730 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
731 u32 win, u32 tsval, u32 tsecr, int oif,
732 struct tcp_md5sig_key *key,
733 int reply_flags, u8 tos)
734 {
735 const struct tcphdr *th = tcp_hdr(skb);
736 struct {
737 struct tcphdr th;
738 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
739 #ifdef CONFIG_TCP_MD5SIG
740 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
741 #endif
742 ];
743 } rep;
744 struct ip_reply_arg arg;
745 struct net *net = dev_net(skb_dst(skb)->dev);
746
747 memset(&rep.th, 0, sizeof(struct tcphdr));
748 memset(&arg, 0, sizeof(arg));
749
750 arg.iov[0].iov_base = (unsigned char *)&rep;
751 arg.iov[0].iov_len = sizeof(rep.th);
752 if (tsecr) {
753 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
754 (TCPOPT_TIMESTAMP << 8) |
755 TCPOLEN_TIMESTAMP);
756 rep.opt[1] = htonl(tsval);
757 rep.opt[2] = htonl(tsecr);
758 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
759 }
760
761 /* Swap the send and the receive. */
762 rep.th.dest = th->source;
763 rep.th.source = th->dest;
764 rep.th.doff = arg.iov[0].iov_len / 4;
765 rep.th.seq = htonl(seq);
766 rep.th.ack_seq = htonl(ack);
767 rep.th.ack = 1;
768 rep.th.window = htons(win);
769
770 #ifdef CONFIG_TCP_MD5SIG
771 if (key) {
772 int offset = (tsecr) ? 3 : 0;
773
774 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
775 (TCPOPT_NOP << 16) |
776 (TCPOPT_MD5SIG << 8) |
777 TCPOLEN_MD5SIG);
778 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
779 rep.th.doff = arg.iov[0].iov_len/4;
780
781 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
782 key, ip_hdr(skb)->saddr,
783 ip_hdr(skb)->daddr, &rep.th);
784 }
785 #endif
786 arg.flags = reply_flags;
787 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
788 ip_hdr(skb)->saddr, /* XXX */
789 arg.iov[0].iov_len, IPPROTO_TCP, 0);
790 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
791 if (oif)
792 arg.bound_dev_if = oif;
793 arg.tos = tos;
794 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
795 skb, ip_hdr(skb)->saddr,
796 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
797
798 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
799 }
800
801 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
802 {
803 struct inet_timewait_sock *tw = inet_twsk(sk);
804 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
805
806 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
807 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
808 tcp_time_stamp + tcptw->tw_ts_offset,
809 tcptw->tw_ts_recent,
810 tw->tw_bound_dev_if,
811 tcp_twsk_md5_key(tcptw),
812 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
813 tw->tw_tos
814 );
815
816 inet_twsk_put(tw);
817 }
818
819 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
820 struct request_sock *req)
821 {
822 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
823 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
824 */
825 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
826 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
827 tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
828 tcp_time_stamp,
829 req->ts_recent,
830 0,
831 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
832 AF_INET),
833 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
834 ip_hdr(skb)->tos);
835 }
836
837 /*
838 * Send a SYN-ACK after having received a SYN.
839 * This still operates on a request_sock only, not on a big
840 * socket.
841 */
842 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
843 struct request_sock *req,
844 u16 queue_mapping,
845 bool nocache)
846 {
847 const struct inet_request_sock *ireq = inet_rsk(req);
848 struct flowi4 fl4;
849 int err = -1;
850 struct sk_buff * skb;
851
852 /* First, grab a route. */
853 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
854 return -1;
855
856 skb = tcp_make_synack(sk, dst, req, NULL);
857
858 if (skb) {
859 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
860
861 skb_set_queue_mapping(skb, queue_mapping);
862 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
863 ireq->rmt_addr,
864 ireq->opt);
865 err = net_xmit_eval(err);
866 if (!tcp_rsk(req)->snt_synack && !err)
867 tcp_rsk(req)->snt_synack = tcp_time_stamp;
868 }
869
870 return err;
871 }
872
873 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
874 {
875 int res = tcp_v4_send_synack(sk, NULL, req, 0, false);
876
877 if (!res)
878 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
879 return res;
880 }
881
882 /*
883 * IPv4 request_sock destructor.
884 */
885 static void tcp_v4_reqsk_destructor(struct request_sock *req)
886 {
887 kfree(inet_rsk(req)->opt);
888 }
889
890 /*
891 * Return true if a syncookie should be sent
892 */
893 bool tcp_syn_flood_action(struct sock *sk,
894 const struct sk_buff *skb,
895 const char *proto)
896 {
897 const char *msg = "Dropping request";
898 bool want_cookie = false;
899 struct listen_sock *lopt;
900
901
902
903 #ifdef CONFIG_SYN_COOKIES
904 if (sysctl_tcp_syncookies) {
905 msg = "Sending cookies";
906 want_cookie = true;
907 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
908 } else
909 #endif
910 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
911
912 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
913 if (!lopt->synflood_warned) {
914 lopt->synflood_warned = 1;
915 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
916 proto, ntohs(tcp_hdr(skb)->dest), msg);
917 }
918 return want_cookie;
919 }
920 EXPORT_SYMBOL(tcp_syn_flood_action);
921
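/* Whether cookies are actually sent above is governed by the
 * net.ipv4.tcp_syncookies sysctl (sysctl_tcp_syncookies) together with
 * CONFIG_SYN_COOKIES; without them the excess SYNs are simply dropped and
 * only LINUX_MIB_TCPREQQFULLDROP is bumped.
 */
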
922 /*
923 * Save and compile IPv4 options into the request_sock if needed.
924 */
925 static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
926 {
927 const struct ip_options *opt = &(IPCB(skb)->opt);
928 struct ip_options_rcu *dopt = NULL;
929
930 if (opt && opt->optlen) {
931 int opt_size = sizeof(*dopt) + opt->optlen;
932
933 dopt = kmalloc(opt_size, GFP_ATOMIC);
934 if (dopt) {
935 if (ip_options_echo(&dopt->opt, skb)) {
936 kfree(dopt);
937 dopt = NULL;
938 }
939 }
940 }
941 return dopt;
942 }
943
944 #ifdef CONFIG_TCP_MD5SIG
945 /*
946 * RFC2385 MD5 checksumming requires a mapping of
947 * IP address->MD5 Key.
948 * We need to maintain these in the sk structure.
949 */
950
951 /* Find the Key structure for an address. */
952 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
953 const union tcp_md5_addr *addr,
954 int family)
955 {
956 struct tcp_sock *tp = tcp_sk(sk);
957 struct tcp_md5sig_key *key;
958 unsigned int size = sizeof(struct in_addr);
959 struct tcp_md5sig_info *md5sig;
960
961 /* caller either holds rcu_read_lock() or socket lock */
962 md5sig = rcu_dereference_check(tp->md5sig_info,
963 sock_owned_by_user(sk) ||
964 lockdep_is_held(&sk->sk_lock.slock));
965 if (!md5sig)
966 return NULL;
967 #if IS_ENABLED(CONFIG_IPV6)
968 if (family == AF_INET6)
969 size = sizeof(struct in6_addr);
970 #endif
971 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
972 if (key->family != family)
973 continue;
974 if (!memcmp(&key->addr, addr, size))
975 return key;
976 }
977 return NULL;
978 }
979 EXPORT_SYMBOL(tcp_md5_do_lookup);
980
981 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
982 struct sock *addr_sk)
983 {
984 union tcp_md5_addr *addr;
985
986 addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
987 return tcp_md5_do_lookup(sk, addr, AF_INET);
988 }
989 EXPORT_SYMBOL(tcp_v4_md5_lookup);
990
991 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
992 struct request_sock *req)
993 {
994 union tcp_md5_addr *addr;
995
996 addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
997 return tcp_md5_do_lookup(sk, addr, AF_INET);
998 }
999
1000 /* This can be called on a newly created socket, from other files */
1001 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1002 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
1003 {
1004 /* Add Key to the list */
1005 struct tcp_md5sig_key *key;
1006 struct tcp_sock *tp = tcp_sk(sk);
1007 struct tcp_md5sig_info *md5sig;
1008
1009 key = tcp_md5_do_lookup(sk, addr, family);
1010 if (key) {
1011 /* Pre-existing entry - just update that one. */
1012 memcpy(key->key, newkey, newkeylen);
1013 key->keylen = newkeylen;
1014 return 0;
1015 }
1016
1017 md5sig = rcu_dereference_protected(tp->md5sig_info,
1018 sock_owned_by_user(sk));
1019 if (!md5sig) {
1020 md5sig = kmalloc(sizeof(*md5sig), gfp);
1021 if (!md5sig)
1022 return -ENOMEM;
1023
1024 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1025 INIT_HLIST_HEAD(&md5sig->head);
1026 rcu_assign_pointer(tp->md5sig_info, md5sig);
1027 }
1028
1029 key = sock_kmalloc(sk, sizeof(*key), gfp);
1030 if (!key)
1031 return -ENOMEM;
1032 if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1033 sock_kfree_s(sk, key, sizeof(*key));
1034 return -ENOMEM;
1035 }
1036
1037 memcpy(key->key, newkey, newkeylen);
1038 key->keylen = newkeylen;
1039 key->family = family;
1040 memcpy(&key->addr, addr,
1041 (family == AF_INET6) ? sizeof(struct in6_addr) :
1042 sizeof(struct in_addr));
1043 hlist_add_head_rcu(&key->node, &md5sig->head);
1044 return 0;
1045 }
1046 EXPORT_SYMBOL(tcp_md5_do_add);
1047
1048 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1049 {
1050 struct tcp_sock *tp = tcp_sk(sk);
1051 struct tcp_md5sig_key *key;
1052 struct tcp_md5sig_info *md5sig;
1053
1054 key = tcp_md5_do_lookup(sk, addr, family);
1055 if (!key)
1056 return -ENOENT;
1057 hlist_del_rcu(&key->node);
1058 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1059 kfree_rcu(key, rcu);
1060 md5sig = rcu_dereference_protected(tp->md5sig_info,
1061 sock_owned_by_user(sk));
1062 if (hlist_empty(&md5sig->head))
1063 tcp_free_md5sig_pool();
1064 return 0;
1065 }
1066 EXPORT_SYMBOL(tcp_md5_do_del);
1067
1068 static void tcp_clear_md5_list(struct sock *sk)
1069 {
1070 struct tcp_sock *tp = tcp_sk(sk);
1071 struct tcp_md5sig_key *key;
1072 struct hlist_node *n;
1073 struct tcp_md5sig_info *md5sig;
1074
1075 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1076
1077 if (!hlist_empty(&md5sig->head))
1078 tcp_free_md5sig_pool();
1079 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1080 hlist_del_rcu(&key->node);
1081 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1082 kfree_rcu(key, rcu);
1083 }
1084 }
1085
1086 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1087 int optlen)
1088 {
1089 struct tcp_md5sig cmd;
1090 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1091
1092 if (optlen < sizeof(cmd))
1093 return -EINVAL;
1094
1095 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1096 return -EFAULT;
1097
1098 if (sin->sin_family != AF_INET)
1099 return -EINVAL;
1100
1101 if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1102 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1103 AF_INET);
1104
1105 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1106 return -EINVAL;
1107
1108 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1109 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1110 GFP_KERNEL);
1111 }
1112
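/* Userspace sketch (illustrative only, assuming the usual <netinet/tcp.h>
 * definitions of TCP_MD5SIG and struct tcp_md5sig): installing an RFC 2385
 * key for peer 10.0.0.2 on a socket boils down to
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "10.0.0.2", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * which lands in tcp_v4_parse_md5_keys() above; a zero tcpm_keylen deletes
 * the key for that address instead.
 */
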
1113 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1114 __be32 daddr, __be32 saddr, int nbytes)
1115 {
1116 struct tcp4_pseudohdr *bp;
1117 struct scatterlist sg;
1118
1119 bp = &hp->md5_blk.ip4;
1120
1121 /*
1122 * 1. the TCP pseudo-header (in the order: source IP address,
1123 * destination IP address, zero-padded protocol number, and
1124 * segment length)
1125 */
1126 bp->saddr = saddr;
1127 bp->daddr = daddr;
1128 bp->pad = 0;
1129 bp->protocol = IPPROTO_TCP;
1130 bp->len = cpu_to_be16(nbytes);
1131
1132 sg_init_one(&sg, bp, sizeof(*bp));
1133 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1134 }
1135
1136 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1137 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1138 {
1139 struct tcp_md5sig_pool *hp;
1140 struct hash_desc *desc;
1141
1142 hp = tcp_get_md5sig_pool();
1143 if (!hp)
1144 goto clear_hash_noput;
1145 desc = &hp->md5_desc;
1146
1147 if (crypto_hash_init(desc))
1148 goto clear_hash;
1149 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1150 goto clear_hash;
1151 if (tcp_md5_hash_header(hp, th))
1152 goto clear_hash;
1153 if (tcp_md5_hash_key(hp, key))
1154 goto clear_hash;
1155 if (crypto_hash_final(desc, md5_hash))
1156 goto clear_hash;
1157
1158 tcp_put_md5sig_pool();
1159 return 0;
1160
1161 clear_hash:
1162 tcp_put_md5sig_pool();
1163 clear_hash_noput:
1164 memset(md5_hash, 0, 16);
1165 return 1;
1166 }
1167
1168 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1169 const struct sock *sk, const struct request_sock *req,
1170 const struct sk_buff *skb)
1171 {
1172 struct tcp_md5sig_pool *hp;
1173 struct hash_desc *desc;
1174 const struct tcphdr *th = tcp_hdr(skb);
1175 __be32 saddr, daddr;
1176
1177 if (sk) {
1178 saddr = inet_sk(sk)->inet_saddr;
1179 daddr = inet_sk(sk)->inet_daddr;
1180 } else if (req) {
1181 saddr = inet_rsk(req)->loc_addr;
1182 daddr = inet_rsk(req)->rmt_addr;
1183 } else {
1184 const struct iphdr *iph = ip_hdr(skb);
1185 saddr = iph->saddr;
1186 daddr = iph->daddr;
1187 }
1188
1189 hp = tcp_get_md5sig_pool();
1190 if (!hp)
1191 goto clear_hash_noput;
1192 desc = &hp->md5_desc;
1193
1194 if (crypto_hash_init(desc))
1195 goto clear_hash;
1196
1197 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1198 goto clear_hash;
1199 if (tcp_md5_hash_header(hp, th))
1200 goto clear_hash;
1201 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1202 goto clear_hash;
1203 if (tcp_md5_hash_key(hp, key))
1204 goto clear_hash;
1205 if (crypto_hash_final(desc, md5_hash))
1206 goto clear_hash;
1207
1208 tcp_put_md5sig_pool();
1209 return 0;
1210
1211 clear_hash:
1212 tcp_put_md5sig_pool();
1213 clear_hash_noput:
1214 memset(md5_hash, 0, 16);
1215 return 1;
1216 }
1217 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1218
1219 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1220 {
1221 /*
1222 * This gets called for each TCP segment that arrives
1223 * so we want to be efficient.
1224 * We have 3 drop cases:
1225 * o No MD5 hash and one expected.
1226 * o MD5 hash and we're not expecting one.
1227 * o MD5 hash and it's wrong.
1228 */
1229 const __u8 *hash_location = NULL;
1230 struct tcp_md5sig_key *hash_expected;
1231 const struct iphdr *iph = ip_hdr(skb);
1232 const struct tcphdr *th = tcp_hdr(skb);
1233 int genhash;
1234 unsigned char newhash[16];
1235
1236 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1237 AF_INET);
1238 hash_location = tcp_parse_md5sig_option(th);
1239
1240 /* We've parsed the options - do we have a hash? */
1241 if (!hash_expected && !hash_location)
1242 return false;
1243
1244 if (hash_expected && !hash_location) {
1245 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1246 return true;
1247 }
1248
1249 if (!hash_expected && hash_location) {
1250 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1251 return true;
1252 }
1253
1254 /* Okay, so this is hash_expected and hash_location -
1255 * so we need to calculate the checksum.
1256 */
1257 genhash = tcp_v4_md5_hash_skb(newhash,
1258 hash_expected,
1259 NULL, NULL, skb);
1260
1261 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1262 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1263 &iph->saddr, ntohs(th->source),
1264 &iph->daddr, ntohs(th->dest),
1265 genhash ? " tcp_v4_calc_md5_hash failed"
1266 : "");
1267 return true;
1268 }
1269 return false;
1270 }
1271
1272 #endif
1273
1274 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1275 .family = PF_INET,
1276 .obj_size = sizeof(struct tcp_request_sock),
1277 .rtx_syn_ack = tcp_v4_rtx_synack,
1278 .send_ack = tcp_v4_reqsk_send_ack,
1279 .destructor = tcp_v4_reqsk_destructor,
1280 .send_reset = tcp_v4_send_reset,
1281 .syn_ack_timeout = tcp_syn_ack_timeout,
1282 };
1283
1284 #ifdef CONFIG_TCP_MD5SIG
1285 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1286 .md5_lookup = tcp_v4_reqsk_md5_lookup,
1287 .calc_md5_hash = tcp_v4_md5_hash_skb,
1288 };
1289 #endif
1290
1291 static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1292 struct request_sock *req,
1293 struct tcp_fastopen_cookie *foc,
1294 struct tcp_fastopen_cookie *valid_foc)
1295 {
1296 bool skip_cookie = false;
1297 struct fastopen_queue *fastopenq;
1298
1299 if (likely(!fastopen_cookie_present(foc))) {
1300 /* See include/net/tcp.h for the meaning of these knobs */
1301 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1302 ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1303 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1304 skip_cookie = true; /* no cookie to validate */
1305 else
1306 return false;
1307 }
1308 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1309 /* A FO option is present; bump the counter. */
1310 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1311
1312 /* Make sure the listener has enabled fastopen, and we don't
1313 * exceed the max # of pending TFO requests allowed before trying
1314 * to validate the cookie, in order to avoid burning CPU cycles
1315 * unnecessarily.
1316 *
1317 * XXX (TFO) - The implication of checking the max_qlen before
1318 * processing a cookie request is that clients can't differentiate
1319 * between qlen overflow causing Fast Open to be disabled
1320 * temporarily vs a server not supporting Fast Open at all.
1321 */
1322 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1323 fastopenq == NULL || fastopenq->max_qlen == 0)
1324 return false;
1325
1326 if (fastopenq->qlen >= fastopenq->max_qlen) {
1327 struct request_sock *req1;
1328 spin_lock(&fastopenq->lock);
1329 req1 = fastopenq->rskq_rst_head;
1330 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1331 spin_unlock(&fastopenq->lock);
1332 NET_INC_STATS_BH(sock_net(sk),
1333 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1334 /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1335 foc->len = -1;
1336 return false;
1337 }
1338 fastopenq->rskq_rst_head = req1->dl_next;
1339 fastopenq->qlen--;
1340 spin_unlock(&fastopenq->lock);
1341 reqsk_free(req1);
1342 }
1343 if (skip_cookie) {
1344 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1345 return true;
1346 }
1347 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1348 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1349 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1350 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1351 memcmp(&foc->val[0], &valid_foc->val[0],
1352 TCP_FASTOPEN_COOKIE_SIZE) != 0)
1353 return false;
1354 valid_foc->len = -1;
1355 }
1356 /* Acknowledge the data received from the peer. */
1357 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1358 return true;
1359 } else if (foc->len == 0) { /* Client requesting a cookie */
1360 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1361 NET_INC_STATS_BH(sock_net(sk),
1362 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1363 } else {
1364 /* Client sent a cookie with wrong size. Treat it
1365 * the same as invalid and return a valid one.
1366 */
1367 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1368 }
1369 return false;
1370 }
1371
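/* Userspace sketch (illustrative only): a listener opts in to Fast Open by
 * sizing the TFO queue whose max_qlen is checked above, assuming
 * <netinet/tcp.h> provides TCP_FASTOPEN:
 *
 *	int qlen = 16;
 *
 *	setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
 *
 * The TFO_SERVER_* bits tested here come from the system-wide
 * net.ipv4.tcp_fastopen sysctl (sysctl_tcp_fastopen).
 */
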
1372 static int tcp_v4_conn_req_fastopen(struct sock *sk,
1373 struct sk_buff *skb,
1374 struct sk_buff *skb_synack,
1375 struct request_sock *req)
1376 {
1377 struct tcp_sock *tp = tcp_sk(sk);
1378 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1379 const struct inet_request_sock *ireq = inet_rsk(req);
1380 struct sock *child;
1381 int err;
1382
1383 req->num_retrans = 0;
1384 req->num_timeout = 0;
1385 req->sk = NULL;
1386
1387 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1388 if (child == NULL) {
1389 NET_INC_STATS_BH(sock_net(sk),
1390 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1391 kfree_skb(skb_synack);
1392 return -1;
1393 }
1394 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1395 ireq->rmt_addr, ireq->opt);
1396 err = net_xmit_eval(err);
1397 if (!err)
1398 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1399 /* XXX (TFO) - is it ok to ignore error and continue? */
1400
1401 spin_lock(&queue->fastopenq->lock);
1402 queue->fastopenq->qlen++;
1403 spin_unlock(&queue->fastopenq->lock);
1404
1405 /* Initialize the child socket. Have to fix some values to take
1406 * into account the child is a Fast Open socket and is created
1407 * only out of the bits carried in the SYN packet.
1408 */
1409 tp = tcp_sk(child);
1410
1411 tp->fastopen_rsk = req;
1412 /* Do a hold on the listener sk so that if the listener is being
1413 * closed, the child that has been accepted can live on and still
1414 * access listen_lock.
1415 */
1416 sock_hold(sk);
1417 tcp_rsk(req)->listener = sk;
1418
1419 /* RFC1323: The window in SYN & SYN/ACK segments is never
1420 * scaled. So correct it appropriately.
1421 */
1422 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1423
1424 /* Activate the retrans timer so that SYNACK can be retransmitted.
1425 * The request socket is not added to the SYN table of the parent
1426 * because it's been added to the accept queue directly.
1427 */
1428 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1429 TCP_TIMEOUT_INIT, sysctl_tcp_rto_max);
1430
1431 /* Add the child socket directly into the accept queue */
1432 inet_csk_reqsk_queue_add(sk, req, child);
1433
1434 /* Now finish processing the fastopen child socket. */
1435 inet_csk(child)->icsk_af_ops->rebuild_header(child);
1436 tcp_init_congestion_control(child);
1437 tcp_mtup_init(child);
1438 tcp_init_buffer_space(child);
1439 tcp_init_metrics(child);
1440
1441 /* Queue the data carried in the SYN packet. We need to first
1442 * bump skb's refcnt because the caller will attempt to free it.
1443 *
1444 * XXX (TFO) - we honor a zero-payload TFO request for now.
1445 * (Any reason not to?)
1446 */
1447 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1448 /* Don't queue the skb if there is no payload in SYN.
1449 * XXX (TFO) - How about SYN+FIN?
1450 */
1451 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1452 } else {
1453 skb = skb_get(skb);
1454 skb_dst_drop(skb);
1455 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1456 skb_set_owner_r(skb, child);
1457 __skb_queue_tail(&child->sk_receive_queue, skb);
1458 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1459 tp->syn_data_acked = 1;
1460 }
1461 sk->sk_data_ready(sk, 0);
1462 bh_unlock_sock(child);
1463 sock_put(child);
1464 WARN_ON(req->sk == NULL);
1465 return 0;
1466 }
1467
1468 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1469 {
1470 struct tcp_options_received tmp_opt;
1471 struct request_sock *req;
1472 struct inet_request_sock *ireq;
1473 struct tcp_sock *tp = tcp_sk(sk);
1474 struct dst_entry *dst = NULL;
1475 __be32 saddr = ip_hdr(skb)->saddr;
1476 __be32 daddr = ip_hdr(skb)->daddr;
1477 __u32 isn = TCP_SKB_CB(skb)->when;
1478 bool want_cookie = false;
1479 struct flowi4 fl4;
1480 struct tcp_fastopen_cookie foc = { .len = -1 };
1481 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1482 struct sk_buff *skb_synack;
1483 int do_fastopen;
1484
1485 /* Never answer SYNs sent to broadcast or multicast */
1486 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1487 goto drop;
1488
1489 /* TW buckets are converted to open requests without
1490 * limitations: they conserve resources and the peer is
1491 * evidently a real one.
1492 */
1493 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1494 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1495 if (!want_cookie)
1496 goto drop;
1497 }
1498
1499 /* Accept backlog is full. If we have already queued enough
1500 * of warm entries in syn queue, drop request. It is better than
1501 * clogging syn queue with openreqs with exponentially increasing
1502 * timeout.
1503 */
1504 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1505 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1506 goto drop;
1507 }
1508
1509 req = inet_reqsk_alloc(&tcp_request_sock_ops);
1510 if (!req)
1511 goto drop;
1512
1513 #ifdef CONFIG_TCP_MD5SIG
1514 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1515 #endif
1516
1517 tcp_clear_options(&tmp_opt);
1518 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1519 tmp_opt.user_mss = tp->rx_opt.user_mss;
1520 tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1521
1522 if (want_cookie && !tmp_opt.saw_tstamp)
1523 tcp_clear_options(&tmp_opt);
1524
1525 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1526 tcp_openreq_init(req, &tmp_opt, skb);
1527
1528 ireq = inet_rsk(req);
1529 ireq->loc_addr = daddr;
1530 ireq->rmt_addr = saddr;
1531 ireq->no_srccheck = inet_sk(sk)->transparent;
1532 ireq->opt = tcp_v4_save_options(skb);
1533 ireq->ir_mark = inet_request_mark(sk, skb);
1534
1535 if (security_inet_conn_request(sk, skb, req))
1536 goto drop_and_free;
1537
1538 if (!want_cookie || tmp_opt.tstamp_ok)
1539 TCP_ECN_create_request(req, skb, sock_net(sk));
1540
1541 if (want_cookie) {
1542 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1543 req->cookie_ts = tmp_opt.tstamp_ok;
1544 } else if (!isn) {
1545 /* VJ's idea. We save last timestamp seen
1546 * from the destination in peer table, when entering
1547 * state TIME-WAIT, and check against it before
1548 * accepting new connection request.
1549 *
1550 * If "isn" is not zero, this request hit alive
1551 * timewait bucket, so that all the necessary checks
1552 * are made in the function processing timewait state.
1553 */
1554 if (tmp_opt.saw_tstamp &&
1555 tcp_death_row.sysctl_tw_recycle &&
1556 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1557 fl4.daddr == saddr) {
1558 if (!tcp_peer_is_proven(req, dst, true)) {
1559 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1560 goto drop_and_release;
1561 }
1562 }
1563 /* Kill the following clause, if you dislike this way. */
1564 else if (!sysctl_tcp_syncookies &&
1565 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1566 (sysctl_max_syn_backlog >> 2)) &&
1567 !tcp_peer_is_proven(req, dst, false)) {
1568 /* Without syncookies last quarter of
1569 * backlog is filled with destinations,
1570 * proven to be alive.
1571 * It means that we continue to communicate
1572 * to destinations, already remembered
1573 * to the moment of synflood.
1574 */
1575 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1576 &saddr, ntohs(tcp_hdr(skb)->source));
1577 goto drop_and_release;
1578 }
1579
1580 isn = tcp_v4_init_sequence(skb);
1581 }
1582 tcp_rsk(req)->snt_isn = isn;
1583
1584 if (dst == NULL) {
1585 dst = inet_csk_route_req(sk, &fl4, req);
1586 if (dst == NULL)
1587 goto drop_and_free;
1588 }
1589 do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1590
1591 /* We don't call tcp_v4_send_synack() directly because we need
1592 * to make sure a child socket can be created successfully before
1593 * sending back synack!
1594 *
1595 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1596 * (or better yet, call tcp_send_synack() in the child context
1597 * directly, but will have to fix bunch of other code first)
1598 * after syn_recv_sock() except one will need to first fix the
1599 * latter to remove its dependency on the current implementation
1600 * of tcp_v4_send_synack()->tcp_select_initial_window().
1601 */
1602 skb_synack = tcp_make_synack(sk, dst, req,
1603 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1604
1605 if (skb_synack) {
1606 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1607 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1608 } else
1609 goto drop_and_free;
1610
1611 if (likely(!do_fastopen)) {
1612 int err;
1613 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1614 ireq->rmt_addr, ireq->opt);
1615 err = net_xmit_eval(err);
1616 if (err || want_cookie)
1617 goto drop_and_free;
1618
1619 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1620 tcp_rsk(req)->listener = NULL;
1621 /* Add the request_sock to the SYN table */
1622 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1623 if (fastopen_cookie_present(&foc) && foc.len != 0)
1624 NET_INC_STATS_BH(sock_net(sk),
1625 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1626 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
1627 goto drop_and_free;
1628
1629 return 0;
1630
1631 drop_and_release:
1632 dst_release(dst);
1633 drop_and_free:
1634 reqsk_free(req);
1635 drop:
1636 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1637 return 0;
1638 }
1639 EXPORT_SYMBOL(tcp_v4_conn_request);
1640
1641
1642 /*
1643 * The three way handshake has completed - we got a valid synack -
1644 * now create the new socket.
1645 */
1646 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1647 struct request_sock *req,
1648 struct dst_entry *dst)
1649 {
1650 struct inet_request_sock *ireq;
1651 struct inet_sock *newinet;
1652 struct tcp_sock *newtp;
1653 struct sock *newsk;
1654 #ifdef CONFIG_TCP_MD5SIG
1655 struct tcp_md5sig_key *key;
1656 #endif
1657 struct ip_options_rcu *inet_opt;
1658
1659 if (sk_acceptq_is_full(sk))
1660 goto exit_overflow;
1661
1662 newsk = tcp_create_openreq_child(sk, req, skb);
1663 if (!newsk)
1664 goto exit_nonewsk;
1665
1666 newsk->sk_gso_type = SKB_GSO_TCPV4;
1667 inet_sk_rx_dst_set(newsk, skb);
1668
1669 newtp = tcp_sk(newsk);
1670 newinet = inet_sk(newsk);
1671 ireq = inet_rsk(req);
1672 newinet->inet_daddr = ireq->rmt_addr;
1673 newinet->inet_rcv_saddr = ireq->loc_addr;
1674 newinet->inet_saddr = ireq->loc_addr;
1675 inet_opt = ireq->opt;
1676 rcu_assign_pointer(newinet->inet_opt, inet_opt);
1677 ireq->opt = NULL;
1678 newinet->mc_index = inet_iif(skb);
1679 newinet->mc_ttl = ip_hdr(skb)->ttl;
1680 newinet->rcv_tos = ip_hdr(skb)->tos;
1681 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1682 if (inet_opt)
1683 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1684 newinet->inet_id = newtp->write_seq ^ jiffies;
1685
1686 if (!dst) {
1687 dst = inet_csk_route_child_sock(sk, newsk, req);
1688 if (!dst)
1689 goto put_and_exit;
1690 } else {
1691 /* syncookie case : see end of cookie_v4_check() */
1692 }
1693 sk_setup_caps(newsk, dst);
1694
1695 tcp_mtup_init(newsk);
1696 tcp_sync_mss(newsk, dst_mtu(dst));
1697 newtp->advmss = dst_metric_advmss(dst);
1698 if (tcp_sk(sk)->rx_opt.user_mss &&
1699 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1700 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1701
1702 tcp_initialize_rcv_mss(newsk);
1703 tcp_synack_rtt_meas(newsk, req);
1704 newtp->total_retrans = req->num_retrans;
1705
1706 #ifdef CONFIG_TCP_MD5SIG
1707 /* Copy over the MD5 key from the original socket */
1708 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1709 AF_INET);
1710 if (key != NULL) {
1711 /*
1712 * We're using one, so create a matching key
1713 * on the newsk structure. If we fail to get
1714 * memory, then we end up not copying the key
1715 * across. Shucks.
1716 */
1717 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1718 AF_INET, key->key, key->keylen, GFP_ATOMIC);
1719 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1720 }
1721 #endif
1722
1723 if (__inet_inherit_port(sk, newsk) < 0)
1724 goto put_and_exit;
1725 __inet_hash_nolisten(newsk, NULL);
1726
1727 return newsk;
1728
1729 exit_overflow:
1730 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1731 exit_nonewsk:
1732 dst_release(dst);
1733 exit:
1734 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1735 return NULL;
1736 put_and_exit:
1737 inet_csk_prepare_forced_close(newsk);
1738 tcp_done(newsk);
1739 goto exit;
1740 }
1741 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1742
1743 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1744 {
1745 struct tcphdr *th = tcp_hdr(skb);
1746 const struct iphdr *iph = ip_hdr(skb);
1747 struct sock *nsk;
1748 struct request_sock **prev;
1749 /* Find possible connection requests. */
1750 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1751 iph->saddr, iph->daddr);
1752 if (req)
1753 return tcp_check_req(sk, skb, req, prev, false);
1754
1755 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1756 th->source, iph->daddr, th->dest, inet_iif(skb));
1757
1758 if (nsk) {
1759 if (nsk->sk_state != TCP_TIME_WAIT) {
1760 bh_lock_sock(nsk);
1761 return nsk;
1762 }
1763 inet_twsk_put(inet_twsk(nsk));
1764 return NULL;
1765 }
1766
1767 #ifdef CONFIG_SYN_COOKIES
1768 if (!th->syn)
1769 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1770 #endif
1771 return sk;
1772 }
1773
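/* Validate the TCP checksum of an incoming segment. CHECKSUM_COMPLETE
 * packets are verified against the sum the hardware already computed;
 * short packets (<= 76 bytes) are checked in full right away, while larger
 * ones only get the pseudo-header sum stored in skb->csum here and are
 * completed later, if and when the payload is actually touched.
 */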
1774 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1775 {
1776 const struct iphdr *iph = ip_hdr(skb);
1777
1778 if (skb->ip_summed == CHECKSUM_COMPLETE) {
1779 if (!tcp_v4_check(skb->len, iph->saddr,
1780 iph->daddr, skb->csum)) {
1781 skb->ip_summed = CHECKSUM_UNNECESSARY;
1782 return 0;
1783 }
1784 }
1785
1786 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1787 skb->len, IPPROTO_TCP, 0);
1788
1789 if (skb->len <= 76) {
1790 return __skb_checksum_complete(skb);
1791 }
1792 return 0;
1793 }
1794
1795
1796 /* The socket must have its spinlock held when we get
1797 * here.
1798 *
1799 * We have a potential double-lock case here, so even when
1800 * doing backlog processing we use the BH locking scheme.
1801 * This is because we cannot sleep with the original spinlock
1802 * held.
1803 */
1804 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1805 {
1806 struct sock *rsk;
1807 #ifdef CONFIG_TCP_MD5SIG
1808 /*
1809 * We really want to reject the packet as early as possible
1810 * if:
1811 * o We're expecting an MD5'd packet and there is no MD5 tcp option
1812 * o There is an MD5 option and we're not expecting one
1813 */
1814 if (tcp_v4_inbound_md5_hash(sk, skb))
1815 goto discard;
1816 #endif
1817
1818 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1819 struct dst_entry *dst = sk->sk_rx_dst;
1820
1821 sock_rps_save_rxhash(sk, skb);
1822 if (dst) {
1823 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1824 dst->ops->check(dst, 0) == NULL) {
1825 dst_release(dst);
1826 sk->sk_rx_dst = NULL;
1827 }
1828 }
1829 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1830 rsk = sk;
1831 goto reset;
1832 }
1833 return 0;
1834 }
1835
1836 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1837 goto csum_err;
1838
1839 if (sk->sk_state == TCP_LISTEN) {
1840 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1841 if (!nsk)
1842 goto discard;
1843
1844 if (nsk != sk) {
1845 sock_rps_save_rxhash(nsk, skb);
1846 if (tcp_child_process(sk, nsk, skb)) {
1847 rsk = nsk;
1848 goto reset;
1849 }
1850 return 0;
1851 }
1852 } else
1853 sock_rps_save_rxhash(sk, skb);
1854
1855 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1856 rsk = sk;
1857 goto reset;
1858 }
1859 return 0;
1860
1861 reset:
1862 tcp_v4_send_reset(rsk, skb);
1863 discard:
1864 kfree_skb(skb);
1865 /* Be careful here. If this function gets more complicated and
1866 * gcc suffers from register pressure on the x86, sk (in %ebx)
1867 * might be destroyed here. This current version compiles correctly,
1868 * but you have been warned.
1869 */
1870 return 0;
1871
1872 csum_err:
1873 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1874 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1875 goto discard;
1876 }
1877 EXPORT_SYMBOL(tcp_v4_do_rcv);
1878
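/*
 * Early demux: called from the IP receive path before the routing
 * decision, this looks the packet up in the established hash and, on a
 * hit, attaches the socket and (if the incoming interface still
 * matches) its cached input route to the skb, so the normal receive
 * path can skip both lookups.
 */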
1879 void tcp_v4_early_demux(struct sk_buff *skb)
1880 {
1881 const struct iphdr *iph;
1882 const struct tcphdr *th;
1883 struct sock *sk;
1884
1885 if (skb->pkt_type != PACKET_HOST)
1886 return;
1887
1888 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1889 return;
1890
1891 iph = ip_hdr(skb);
1892 th = tcp_hdr(skb);
1893
1894 if (th->doff < sizeof(struct tcphdr) / 4)
1895 return;
1896
1897 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1898 iph->saddr, th->source,
1899 iph->daddr, ntohs(th->dest),
1900 skb->skb_iif);
1901 if (sk) {
1902 skb->sk = sk;
1903 skb->destructor = sock_edemux;
1904 if (sk->sk_state != TCP_TIME_WAIT) {
1905 struct dst_entry *dst = sk->sk_rx_dst;
1906
1907 if (dst)
1908 dst = dst_check(dst, 0);
1909 if (dst &&
1910 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1911 skb_dst_set_noref(skb, dst);
1912 }
1913 }
1914 }
1915
1916 /* Packet is added to VJ-style prequeue for processing in process
1917 * context, if a reader task is waiting. Apparently, this exciting
1918 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1919 * failed somewhere. Latency? Burstiness? Well, at least now we will
1920 * see why it failed. 8)8) --ANK
1921 *
1922 */
1923 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1924 {
1925 struct tcp_sock *tp = tcp_sk(sk);
1926
1927 if (sysctl_tcp_low_latency || !tp->ucopy.task)
1928 return false;
1929
1930 if (skb->len <= tcp_hdrlen(skb) &&
1931 skb_queue_len(&tp->ucopy.prequeue) == 0)
1932 return false;
1933
1934 skb_dst_force(skb);
1935 __skb_queue_tail(&tp->ucopy.prequeue, skb);
1936 tp->ucopy.memory += skb->truesize;
1937 if (tp->ucopy.memory > sk->sk_rcvbuf) {
1938 struct sk_buff *skb1;
1939
1940 BUG_ON(sock_owned_by_user(sk));
1941
1942 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1943 sk_backlog_rcv(sk, skb1);
1944 NET_INC_STATS_BH(sock_net(sk),
1945 LINUX_MIB_TCPPREQUEUEDROPPED);
1946 }
1947
1948 tp->ucopy.memory = 0;
1949 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1950 wake_up_interruptible_sync_poll(sk_sleep(sk),
1951 POLLIN | POLLRDNORM | POLLRDBAND);
1952 if (!inet_csk_ack_scheduled(sk))
1953 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1954 (3 * tcp_rto_min(sk)) / 4,
1955 sysctl_tcp_rto_max);
1956 }
1957 return true;
1958 }
1959 EXPORT_SYMBOL(tcp_prequeue);
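/*
 * Prequeue behaviour above: if queueing the skb pushes ucopy.memory past
 * sk_rcvbuf, the whole prequeue is drained synchronously through
 * sk_backlog_rcv() and counted as TCPPrequeueDropped.  When the first
 * skb lands on an empty prequeue the reader is woken and, unless an ACK
 * is already scheduled, a delayed-ACK timer is armed at 3/4 of the
 * minimum RTO so the peer is not left waiting if the reader is slow.
 */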
1960
1961 /*
1962 * From tcp_input.c
1963 */
1964
1965 int tcp_v4_rcv(struct sk_buff *skb)
1966 {
1967 const struct iphdr *iph;
1968 const struct tcphdr *th;
1969 struct sock *sk;
1970 int ret;
1971 struct net *net = dev_net(skb->dev);
1972
1973 if (skb->pkt_type != PACKET_HOST)
1974 goto discard_it;
1975
1976 /* Count it even if it's bad */
1977 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1978
1979 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1980 goto discard_it;
1981
1982 th = tcp_hdr(skb);
1983
1984 if (th->doff < sizeof(struct tcphdr) / 4)
1985 goto bad_packet;
1986 if (!pskb_may_pull(skb, th->doff * 4))
1987 goto discard_it;
1988
1989 /* An explanation is required here, I think.
1990 * Packet length and doff are validated by header prediction,
1991 * provided the case of th->doff == 0 is eliminated.
1992 * So, we defer the checks. */
1993 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1994 goto csum_error;
1995
1996 th = tcp_hdr(skb);
1997 iph = ip_hdr(skb);
1998 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1999 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2000 skb->len - th->doff * 4);
2001 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2002 TCP_SKB_CB(skb)->when = 0;
2003 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2004 TCP_SKB_CB(skb)->sacked = 0;
2005
2006 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
2007 if (!sk)
2008 goto no_tcp_socket;
2009
2010 process:
2011 if (sk->sk_state == TCP_TIME_WAIT)
2012 goto do_time_wait;
2013
2014 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2015 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2016 goto discard_and_relse;
2017 }
2018
2019 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2020 goto discard_and_relse;
2021 nf_reset(skb);
2022
2023 if (sk_filter(sk, skb))
2024 goto discard_and_relse;
2025
2026 skb->dev = NULL;
2027
2028 bh_lock_sock_nested(sk);
2029 ret = 0;
2030 if (!sock_owned_by_user(sk)) {
2031 #ifdef CONFIG_NET_DMA
2032 struct tcp_sock *tp = tcp_sk(sk);
2033 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2034 tp->ucopy.dma_chan = net_dma_find_channel();
2035 if (tp->ucopy.dma_chan)
2036 ret = tcp_v4_do_rcv(sk, skb);
2037 else
2038 #endif
2039 {
2040 if (!tcp_prequeue(sk, skb))
2041 ret = tcp_v4_do_rcv(sk, skb);
2042 }
2043 } else if (unlikely(sk_add_backlog(sk, skb,
2044 sk->sk_rcvbuf + sk->sk_sndbuf))) {
2045 bh_unlock_sock(sk);
2046 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2047 goto discard_and_relse;
2048 }
2049 bh_unlock_sock(sk);
2050
2051 sock_put(sk);
2052
2053 return ret;
2054
2055 no_tcp_socket:
2056 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2057 goto discard_it;
2058
2059 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2060 csum_error:
2061 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
2062 bad_packet:
2063 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2064 } else {
2065 tcp_v4_send_reset(NULL, skb);
2066 }
2067
2068 discard_it:
2069 /* Discard frame. */
2070 kfree_skb(skb);
2071 return 0;
2072
2073 discard_and_relse:
2074 sock_put(sk);
2075 goto discard_it;
2076
2077 do_time_wait:
2078 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2079 inet_twsk_put(inet_twsk(sk));
2080 goto discard_it;
2081 }
2082
2083 if (skb->len < (th->doff << 2)) {
2084 inet_twsk_put(inet_twsk(sk));
2085 goto bad_packet;
2086 }
2087 if (tcp_checksum_complete(skb)) {
2088 inet_twsk_put(inet_twsk(sk));
2089 goto csum_error;
2090 }
2091 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2092 case TCP_TW_SYN: {
2093 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2094 &tcp_hashinfo,
2095 iph->saddr, th->source,
2096 iph->daddr, th->dest,
2097 inet_iif(skb));
2098 if (sk2) {
2099 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2100 inet_twsk_put(inet_twsk(sk));
2101 sk = sk2;
2102 goto process;
2103 }
2104 /* Fall through to ACK */
2105 }
2106 case TCP_TW_ACK:
2107 tcp_v4_timewait_ack(sk, skb);
2108 break;
2109 case TCP_TW_RST:
2110 goto no_tcp_socket;
2111 case TCP_TW_SUCCESS:;
2112 }
2113 goto discard_it;
2114 }
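/*
 * tcp_v4_rcv() above is the protocol entry point from IP.  It validates
 * the header and checksum, fills in the TCP control block of the skb,
 * looks up the owning socket and then dispatches: sockets not owned by
 * user context are processed directly (via tcp_v4_do_rcv() or the
 * prequeue), otherwise the skb is appended to the socket backlog.
 * TIME_WAIT sockets are handled separately and may be resurrected into
 * a new connection when a valid SYN reaches a listener (TCP_TW_SYN).
 */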
2115
2116 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2117 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2118 .twsk_unique = tcp_twsk_unique,
2119 .twsk_destructor= tcp_twsk_destructor,
2120 };
2121
2122 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2123 {
2124 struct dst_entry *dst = skb_dst(skb);
2125
2126 dst_hold(dst);
2127 sk->sk_rx_dst = dst;
2128 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2129 }
2130 EXPORT_SYMBOL(inet_sk_rx_dst_set);
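/*
 * inet_sk_rx_dst_set() caches the input route and incoming interface on
 * the socket; tcp_v4_do_rcv() and tcp_v4_early_demux() above only reuse
 * the cached dst while the interface still matches and
 * dst->ops->check() says the route is still valid.
 */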
2131
2132 const struct inet_connection_sock_af_ops ipv4_specific = {
2133 .queue_xmit = ip_queue_xmit,
2134 .send_check = tcp_v4_send_check,
2135 .rebuild_header = inet_sk_rebuild_header,
2136 .sk_rx_dst_set = inet_sk_rx_dst_set,
2137 .conn_request = tcp_v4_conn_request,
2138 .syn_recv_sock = tcp_v4_syn_recv_sock,
2139 .net_header_len = sizeof(struct iphdr),
2140 .setsockopt = ip_setsockopt,
2141 .getsockopt = ip_getsockopt,
2142 .addr2sockaddr = inet_csk_addr2sockaddr,
2143 .sockaddr_len = sizeof(struct sockaddr_in),
2144 .bind_conflict = inet_csk_bind_conflict,
2145 #ifdef CONFIG_COMPAT
2146 .compat_setsockopt = compat_ip_setsockopt,
2147 .compat_getsockopt = compat_ip_getsockopt,
2148 #endif
2149 .mtu_reduced = tcp_v4_mtu_reduced,
2150 };
2151 EXPORT_SYMBOL(ipv4_specific);
2152
2153 #ifdef CONFIG_TCP_MD5SIG
2154 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2155 .md5_lookup = tcp_v4_md5_lookup,
2156 .calc_md5_hash = tcp_v4_md5_hash_skb,
2157 .md5_parse = tcp_v4_parse_md5_keys,
2158 };
2159 #endif
2160
2161 /* NOTE: A lot of things are set to zero explicitly by the call to
2162 * sk_alloc(), so they need not be done here.
2163 */
2164 static int tcp_v4_init_sock(struct sock *sk)
2165 {
2166 struct inet_connection_sock *icsk = inet_csk(sk);
2167
2168 tcp_init_sock(sk);
2169 icsk->icsk_MMSRB = 0;
2170
2171 icsk->icsk_af_ops = &ipv4_specific;
2172
2173 #ifdef CONFIG_TCP_MD5SIG
2174 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2175 #endif
2176
2177 return 0;
2178 }
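/*
 * Note: icsk_MMSRB is not an upstream field; it appears to be a vendor
 * (MediaTek) extension.  It is cleared here and only set by
 * tcp_v4_handle_retrans_time_by_uid() below to mark sockets whose
 * retransmit timer has been forced by uid.
 */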
2179
2180 void tcp_v4_destroy_sock(struct sock *sk)
2181 {
2182 struct tcp_sock *tp = tcp_sk(sk);
2183
2184 tcp_clear_xmit_timers(sk);
2185
2186 tcp_cleanup_congestion_control(sk);
2187
2188 /* Clean up the write buffer. */
2189 tcp_write_queue_purge(sk);
2190
2191 /* Cleans up our, hopefully empty, out_of_order_queue. */
2192 __skb_queue_purge(&tp->out_of_order_queue);
2193
2194 #ifdef CONFIG_TCP_MD5SIG
2195 /* Clean up the MD5 key list, if any */
2196 if (tp->md5sig_info) {
2197 tcp_clear_md5_list(sk);
2198 kfree_rcu(tp->md5sig_info, rcu);
2199 tp->md5sig_info = NULL;
2200 }
2201 #endif
2202
2203 #ifdef CONFIG_NET_DMA
2204 /* Cleans up our sk_async_wait_queue */
2205 __skb_queue_purge(&sk->sk_async_wait_queue);
2206 #endif
2207
2208 /* Clean up the prequeue; it should already be empty. */
2209 __skb_queue_purge(&tp->ucopy.prequeue);
2210
2211 /* Clean up a referenced TCP bind bucket. */
2212 if (inet_csk(sk)->icsk_bind_hash)
2213 inet_put_port(sk);
2214
2215 BUG_ON(tp->fastopen_rsk != NULL);
2216
2217 /* If the socket was aborted during the connect operation */
2218 tcp_free_fastopen_req(tp);
2219
2220 sk_sockets_allocated_dec(sk);
2221 sock_release_memcg(sk);
2222 }
2223 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2224
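/*
 * The two helpers below are not part of mainline TCP; they look like
 * vendor additions, apparently driven from an ioctl path (note the
 * SIOCKILLSOCK messages).  The first walks the established hash and,
 * for every socket owned by the given uid, fires the retransmit timer
 * almost immediately and inflates icsk_rto; the second aborts every
 * connection owned by that uid with the supplied error.  Both are best
 * treated as platform-specific behaviour.
 */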
2225 void tcp_v4_handle_retrans_time_by_uid(struct uid_err uid_e)
2226 {
2227 unsigned int bucket;
2228 uid_t skuid = (uid_t)(uid_e.appuid);
2229 struct inet_connection_sock *icsk = NULL;
2230
2231
2232 for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) {
2233 struct hlist_nulls_node *node;
2234 struct sock *sk;
2235 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket);
2236
2237 spin_lock_bh(lock);
2238 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) {
2239
2240 if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT)
2241 continue;
2242 if (sock_flag(sk, SOCK_DEAD))
2243 continue;
2244
2245 if (sk->sk_socket) {
2246 if (SOCK_INODE(sk->sk_socket)->i_uid != skuid)
2247 continue;
2248 else
2249 printk(KERN_INFO "[mmspb] tcp_v4_handle_retrans_time_by_uid socket uid(%d) match!\n",
2250 SOCK_INODE(sk->sk_socket)->i_uid);
2251 } else {
2252 continue;
2253 }
2254
2255 sock_hold(sk);
2256 spin_unlock_bh(lock);
2257
2258 local_bh_disable();
2259 bh_lock_sock(sk);
2260
2261 /* update the socket's timeout value */
2262 icsk = inet_csk(sk);
2263 printk(KERN_INFO "[mmspb] tcp_v4_handle_retrans_time_by_uid update timer\n");
2264
2265 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + 2);
2266 icsk->icsk_rto = sysctl_tcp_rto_min * 30;
2267 icsk->icsk_MMSRB = 1;
2268
2269 bh_unlock_sock(sk);
2270 local_bh_enable();
2271 spin_lock_bh(lock);
2272 sock_put(sk);
2273
2274 }
2275 spin_unlock_bh(lock);
2276 }
2277
2278 }
2279
2280
2281 /*
2282 * tcp_v4_reset_connections_by_uid - destroy all sockets owned by the given uid
2283 */
2284 void tcp_v4_reset_connections_by_uid(struct uid_err uid_e)
2285 {
2286 unsigned int bucket;
2287 uid_t skuid = (uid_t)(uid_e.appuid);
2288
2289 for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) {
2290 struct hlist_nulls_node *node;
2291 struct sock *sk;
2292 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket);
2293
2294 restart:
2295 spin_lock_bh(lock);
2296 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) {
2297
2298 if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT)
2299 continue;
2300 if (sock_flag(sk, SOCK_DEAD))
2301 continue;
2302
2303 if (sk->sk_socket) {
2304 if (SOCK_INODE(sk->sk_socket)->i_uid != skuid)
2305 continue;
2306 else
2307 printk(KERN_INFO "SIOCKILLSOCK socket uid(%d) match!\n",
2308 SOCK_INODE(sk->sk_socket)->i_uid);
2309 } else {
2310 continue;
2311 }
2312
2313 sock_hold(sk);
2314 spin_unlock_bh(lock);
2315
2316 local_bh_disable();
2317 bh_lock_sock(sk);
2318 sk->sk_err = uid_e.errNum;
2319 printk(KERN_INFO "SIOCKILLSOCK set sk_err = %d\n", sk->sk_err);
2320 sk->sk_error_report(sk);
2321
2322 tcp_done(sk);
2323 bh_unlock_sock(sk);
2324 local_bh_enable();
2325 sock_put(sk);
2326
2327 goto restart;
2328 }
2329 spin_unlock_bh(lock);
2330 }
2331 }
2332
2333
2334 #ifdef CONFIG_PROC_FS
2335 /* Proc filesystem TCP sock list dumping. */
2336
2337 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2338 {
2339 return hlist_nulls_empty(head) ? NULL :
2340 list_entry(head->first, struct inet_timewait_sock, tw_node);
2341 }
2342
2343 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2344 {
2345 return !is_a_nulls(tw->tw_node.next) ?
2346 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2347 }
2348
2349 /*
2350 * Get the next listener socket following cur. If cur is NULL, get the first socket
2351 * starting from bucket given in st->bucket; when st->bucket is zero the
2352 * very first socket in the hash table is returned.
2353 */
2354 static void *listening_get_next(struct seq_file *seq, void *cur)
2355 {
2356 struct inet_connection_sock *icsk;
2357 struct hlist_nulls_node *node;
2358 struct sock *sk = cur;
2359 struct inet_listen_hashbucket *ilb;
2360 struct tcp_iter_state *st = seq->private;
2361 struct net *net = seq_file_net(seq);
2362
2363 if (!sk) {
2364 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2365 spin_lock_bh(&ilb->lock);
2366 sk = sk_nulls_head(&ilb->head);
2367 st->offset = 0;
2368 goto get_sk;
2369 }
2370 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2371 ++st->num;
2372 ++st->offset;
2373
2374 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2375 struct request_sock *req = cur;
2376
2377 icsk = inet_csk(st->syn_wait_sk);
2378 req = req->dl_next;
2379 while (1) {
2380 while (req) {
2381 if (req->rsk_ops->family == st->family) {
2382 cur = req;
2383 goto out;
2384 }
2385 req = req->dl_next;
2386 }
2387 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2388 break;
2389 get_req:
2390 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2391 }
2392 sk = sk_nulls_next(st->syn_wait_sk);
2393 st->state = TCP_SEQ_STATE_LISTENING;
2394 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2395 } else {
2396 icsk = inet_csk(sk);
2397 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2398 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2399 goto start_req;
2400 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2401 sk = sk_nulls_next(sk);
2402 }
2403 get_sk:
2404 sk_nulls_for_each_from(sk, node) {
2405 if (!net_eq(sock_net(sk), net))
2406 continue;
2407 if (sk->sk_family == st->family) {
2408 cur = sk;
2409 goto out;
2410 }
2411 icsk = inet_csk(sk);
2412 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2413 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2414 start_req:
2415 st->uid = sock_i_uid(sk);
2416 st->syn_wait_sk = sk;
2417 st->state = TCP_SEQ_STATE_OPENREQ;
2418 st->sbucket = 0;
2419 goto get_req;
2420 }
2421 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2422 }
2423 spin_unlock_bh(&ilb->lock);
2424 st->offset = 0;
2425 if (++st->bucket < INET_LHTABLE_SIZE) {
2426 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2427 spin_lock_bh(&ilb->lock);
2428 sk = sk_nulls_head(&ilb->head);
2429 goto get_sk;
2430 }
2431 cur = NULL;
2432 out:
2433 return cur;
2434 }
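/*
 * The seq_file walkers above and below iterate in three phases tracked
 * by st->state: listening sockets (per listening-hash bucket, including
 * their pending open requests in TCP_SEQ_STATE_OPENREQ), then
 * established sockets and finally TIME_WAIT sockets, the latter two
 * sharing the ehash buckets.  st->bucket and st->offset remember the
 * position so a later read can resume via tcp_seek_last_pos().
 */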
2435
2436 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2437 {
2438 struct tcp_iter_state *st = seq->private;
2439 void *rc;
2440
2441 st->bucket = 0;
2442 st->offset = 0;
2443 rc = listening_get_next(seq, NULL);
2444
2445 while (rc && *pos) {
2446 rc = listening_get_next(seq, rc);
2447 --*pos;
2448 }
2449 return rc;
2450 }
2451
2452 static inline bool empty_bucket(struct tcp_iter_state *st)
2453 {
2454 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2455 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2456 }
2457
2458 /*
2459 * Get first established socket starting from bucket given in st->bucket.
2460 * If st->bucket is zero, the very first socket in the hash is returned.
2461 */
2462 static void *established_get_first(struct seq_file *seq)
2463 {
2464 struct tcp_iter_state *st = seq->private;
2465 struct net *net = seq_file_net(seq);
2466 void *rc = NULL;
2467
2468 st->offset = 0;
2469 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2470 struct sock *sk;
2471 struct hlist_nulls_node *node;
2472 struct inet_timewait_sock *tw;
2473 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2474
2475 /* Lockless fast path for the common case of empty buckets */
2476 if (empty_bucket(st))
2477 continue;
2478
2479 spin_lock_bh(lock);
2480 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2481 if (sk->sk_family != st->family ||
2482 !net_eq(sock_net(sk), net)) {
2483 continue;
2484 }
2485 rc = sk;
2486 goto out;
2487 }
2488 st->state = TCP_SEQ_STATE_TIME_WAIT;
2489 inet_twsk_for_each(tw, node,
2490 &tcp_hashinfo.ehash[st->bucket].twchain) {
2491 if (tw->tw_family != st->family ||
2492 !net_eq(twsk_net(tw), net)) {
2493 continue;
2494 }
2495 rc = tw;
2496 goto out;
2497 }
2498 spin_unlock_bh(lock);
2499 st->state = TCP_SEQ_STATE_ESTABLISHED;
2500 }
2501 out:
2502 return rc;
2503 }
2504
2505 static void *established_get_next(struct seq_file *seq, void *cur)
2506 {
2507 struct sock *sk = cur;
2508 struct inet_timewait_sock *tw;
2509 struct hlist_nulls_node *node;
2510 struct tcp_iter_state *st = seq->private;
2511 struct net *net = seq_file_net(seq);
2512
2513 ++st->num;
2514 ++st->offset;
2515
2516 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2517 tw = cur;
2518 tw = tw_next(tw);
2519 get_tw:
2520 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2521 tw = tw_next(tw);
2522 }
2523 if (tw) {
2524 cur = tw;
2525 goto out;
2526 }
2527 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2528 st->state = TCP_SEQ_STATE_ESTABLISHED;
2529
2530 /* Look for the next non-empty bucket */
2531 st->offset = 0;
2532 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2533 empty_bucket(st))
2534 ;
2535 if (st->bucket > tcp_hashinfo.ehash_mask)
2536 return NULL;
2537
2538 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2539 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2540 } else
2541 sk = sk_nulls_next(sk);
2542
2543 sk_nulls_for_each_from(sk, node) {
2544 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2545 goto found;
2546 }
2547
2548 st->state = TCP_SEQ_STATE_TIME_WAIT;
2549 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2550 goto get_tw;
2551 found:
2552 cur = sk;
2553 out:
2554 return cur;
2555 }
2556
2557 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2558 {
2559 struct tcp_iter_state *st = seq->private;
2560 void *rc;
2561
2562 st->bucket = 0;
2563 rc = established_get_first(seq);
2564
2565 while (rc && pos) {
2566 rc = established_get_next(seq, rc);
2567 --pos;
2568 }
2569 return rc;
2570 }
2571
2572 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2573 {
2574 void *rc;
2575 struct tcp_iter_state *st = seq->private;
2576
2577 st->state = TCP_SEQ_STATE_LISTENING;
2578 rc = listening_get_idx(seq, &pos);
2579
2580 if (!rc) {
2581 st->state = TCP_SEQ_STATE_ESTABLISHED;
2582 rc = established_get_idx(seq, pos);
2583 }
2584
2585 return rc;
2586 }
2587
2588 static void *tcp_seek_last_pos(struct seq_file *seq)
2589 {
2590 struct tcp_iter_state *st = seq->private;
2591 int offset = st->offset;
2592 int orig_num = st->num;
2593 void *rc = NULL;
2594
2595 switch (st->state) {
2596 case TCP_SEQ_STATE_OPENREQ:
2597 case TCP_SEQ_STATE_LISTENING:
2598 if (st->bucket >= INET_LHTABLE_SIZE)
2599 break;
2600 st->state = TCP_SEQ_STATE_LISTENING;
2601 rc = listening_get_next(seq, NULL);
2602 while (offset-- && rc)
2603 rc = listening_get_next(seq, rc);
2604 if (rc)
2605 break;
2606 st->bucket = 0;
2607 /* Fallthrough */
2608 case TCP_SEQ_STATE_ESTABLISHED:
2609 case TCP_SEQ_STATE_TIME_WAIT:
2610 st->state = TCP_SEQ_STATE_ESTABLISHED;
2611 if (st->bucket > tcp_hashinfo.ehash_mask)
2612 break;
2613 rc = established_get_first(seq);
2614 while (offset-- && rc)
2615 rc = established_get_next(seq, rc);
2616 }
2617
2618 st->num = orig_num;
2619
2620 return rc;
2621 }
2622
2623 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2624 {
2625 struct tcp_iter_state *st = seq->private;
2626 void *rc;
2627
2628 if (*pos && *pos == st->last_pos) {
2629 rc = tcp_seek_last_pos(seq);
2630 if (rc)
2631 goto out;
2632 }
2633
2634 st->state = TCP_SEQ_STATE_LISTENING;
2635 st->num = 0;
2636 st->bucket = 0;
2637 st->offset = 0;
2638 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2639
2640 out:
2641 st->last_pos = *pos;
2642 return rc;
2643 }
2644
2645 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2646 {
2647 struct tcp_iter_state *st = seq->private;
2648 void *rc = NULL;
2649
2650 if (v == SEQ_START_TOKEN) {
2651 rc = tcp_get_idx(seq, 0);
2652 goto out;
2653 }
2654
2655 switch (st->state) {
2656 case TCP_SEQ_STATE_OPENREQ:
2657 case TCP_SEQ_STATE_LISTENING:
2658 rc = listening_get_next(seq, v);
2659 if (!rc) {
2660 st->state = TCP_SEQ_STATE_ESTABLISHED;
2661 st->bucket = 0;
2662 st->offset = 0;
2663 rc = established_get_first(seq);
2664 }
2665 break;
2666 case TCP_SEQ_STATE_ESTABLISHED:
2667 case TCP_SEQ_STATE_TIME_WAIT:
2668 rc = established_get_next(seq, v);
2669 break;
2670 }
2671 out:
2672 ++*pos;
2673 st->last_pos = *pos;
2674 return rc;
2675 }
2676
2677 static void tcp_seq_stop(struct seq_file *seq, void *v)
2678 {
2679 struct tcp_iter_state *st = seq->private;
2680
2681 switch (st->state) {
2682 case TCP_SEQ_STATE_OPENREQ:
2683 if (v) {
2684 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2685 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2686 }
2687 case TCP_SEQ_STATE_LISTENING:
2688 if (v != SEQ_START_TOKEN)
2689 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2690 break;
2691 case TCP_SEQ_STATE_TIME_WAIT:
2692 case TCP_SEQ_STATE_ESTABLISHED:
2693 if (v)
2694 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2695 break;
2696 }
2697 }
2698
2699 int tcp_seq_open(struct inode *inode, struct file *file)
2700 {
2701 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2702 struct tcp_iter_state *s;
2703 int err;
2704
2705 err = seq_open_net(inode, file, &afinfo->seq_ops,
2706 sizeof(struct tcp_iter_state));
2707 if (err < 0)
2708 return err;
2709
2710 s = ((struct seq_file *)file->private_data)->private;
2711 s->family = afinfo->family;
2712 s->last_pos = 0;
2713 return 0;
2714 }
2715 EXPORT_SYMBOL(tcp_seq_open);
2716
2717 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2718 {
2719 int rc = 0;
2720 struct proc_dir_entry *p;
2721
2722 afinfo->seq_ops.start = tcp_seq_start;
2723 afinfo->seq_ops.next = tcp_seq_next;
2724 afinfo->seq_ops.stop = tcp_seq_stop;
2725
2726 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2727 afinfo->seq_fops, afinfo);
2728 if (!p)
2729 rc = -ENOMEM;
2730 return rc;
2731 }
2732 EXPORT_SYMBOL(tcp_proc_register);
2733
2734 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2735 {
2736 remove_proc_entry(afinfo->name, net->proc_net);
2737 }
2738 EXPORT_SYMBOL(tcp_proc_unregister);
2739
2740 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2741 struct seq_file *f, int i, kuid_t uid, int *len)
2742 {
2743 const struct inet_request_sock *ireq = inet_rsk(req);
2744 long delta = req->expires - jiffies;
2745
2746 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2747 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2748 i,
2749 ireq->loc_addr,
2750 ntohs(inet_sk(sk)->inet_sport),
2751 ireq->rmt_addr,
2752 ntohs(ireq->rmt_port),
2753 TCP_SYN_RECV,
2754 0, 0, /* could print option size, but that is af dependent. */
2755 1, /* timers active (only the expire timer) */
2756 jiffies_delta_to_clock_t(delta),
2757 req->num_timeout,
2758 from_kuid_munged(seq_user_ns(f), uid),
2759 0, /* non standard timer */
2760 0, /* open_requests have no inode */
2761 atomic_read(&sk->sk_refcnt),
2762 req,
2763 len);
2764 }
2765
2766 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2767 {
2768 int timer_active;
2769 unsigned long timer_expires;
2770 const struct tcp_sock *tp = tcp_sk(sk);
2771 const struct inet_connection_sock *icsk = inet_csk(sk);
2772 const struct inet_sock *inet = inet_sk(sk);
2773 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2774 __be32 dest = inet->inet_daddr;
2775 __be32 src = inet->inet_rcv_saddr;
2776 __u16 destp = ntohs(inet->inet_dport);
2777 __u16 srcp = ntohs(inet->inet_sport);
2778 int rx_queue;
2779
2780 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2781 icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2782 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2783 timer_active = 1;
2784 timer_expires = icsk->icsk_timeout;
2785 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2786 timer_active = 4;
2787 timer_expires = icsk->icsk_timeout;
2788 } else if (timer_pending(&sk->sk_timer)) {
2789 timer_active = 2;
2790 timer_expires = sk->sk_timer.expires;
2791 } else {
2792 timer_active = 0;
2793 timer_expires = jiffies;
2794 }
2795
2796 if (sk->sk_state == TCP_LISTEN)
2797 rx_queue = sk->sk_ack_backlog;
2798 else
2799 /*
2800 * Because we don't lock the socket, we might find a transient negative value.
2801 */
2802 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2803
2804 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2805 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2806 i, src, srcp, dest, destp, sk->sk_state,
2807 tp->write_seq - tp->snd_una,
2808 rx_queue,
2809 timer_active,
2810 jiffies_delta_to_clock_t(timer_expires - jiffies),
2811 icsk->icsk_retransmits,
2812 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2813 icsk->icsk_probes_out,
2814 sock_i_ino(sk),
2815 atomic_read(&sk->sk_refcnt), sk,
2816 jiffies_to_clock_t(icsk->icsk_rto),
2817 jiffies_to_clock_t(icsk->icsk_ack.ato),
2818 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2819 tp->snd_cwnd,
2820 sk->sk_state == TCP_LISTEN ?
2821 (fastopenq ? fastopenq->max_qlen : 0) :
2822 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2823 len);
2824 }
2825
2826 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2827 struct seq_file *f, int i, int *len)
2828 {
2829 __be32 dest, src;
2830 __u16 destp, srcp;
2831 long delta = tw->tw_ttd - jiffies;
2832
2833 dest = tw->tw_daddr;
2834 src = tw->tw_rcv_saddr;
2835 destp = ntohs(tw->tw_dport);
2836 srcp = ntohs(tw->tw_sport);
2837
2838 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2839 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2840 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2841 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2842 atomic_read(&tw->tw_refcnt), tw, len);
2843 }
2844
2845 #define TMPSZ 150
2846
2847 static int tcp4_seq_show(struct seq_file *seq, void *v)
2848 {
2849 struct tcp_iter_state *st;
2850 int len;
2851
2852 if (v == SEQ_START_TOKEN) {
2853 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2854 " sl local_address rem_address st tx_queue "
2855 "rx_queue tr tm->when retrnsmt uid timeout "
2856 "inode");
2857 goto out;
2858 }
2859 st = seq->private;
2860
2861 switch (st->state) {
2862 case TCP_SEQ_STATE_LISTENING:
2863 case TCP_SEQ_STATE_ESTABLISHED:
2864 get_tcp4_sock(v, seq, st->num, &len);
2865 break;
2866 case TCP_SEQ_STATE_OPENREQ:
2867 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2868 break;
2869 case TCP_SEQ_STATE_TIME_WAIT:
2870 get_timewait4_sock(v, seq, st->num, &len);
2871 break;
2872 }
2873 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2874 out:
2875 return 0;
2876 }
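/*
 * Each line emitted above is padded to TMPSZ - 1 characters.  An
 * illustrative /proc/net/tcp entry (the values here are made up) looks
 * roughly like:
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 ffff880012345678 100 0 0 10 -1
 *
 * i.e. hex local and remote address:port, state, tx/rx queue sizes,
 * timer info, retransmit count, uid, probe count, inode, refcount, the
 * socket pointer and a few more TCP details from get_tcp4_sock().
 */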
2877
2878 static const struct file_operations tcp_afinfo_seq_fops = {
2879 .owner = THIS_MODULE,
2880 .open = tcp_seq_open,
2881 .read = seq_read,
2882 .llseek = seq_lseek,
2883 .release = seq_release_net
2884 };
2885
2886 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2887 .name = "tcp",
2888 .family = AF_INET,
2889 .seq_fops = &tcp_afinfo_seq_fops,
2890 .seq_ops = {
2891 .show = tcp4_seq_show,
2892 },
2893 };
2894
2895 static int __net_init tcp4_proc_init_net(struct net *net)
2896 {
2897 return tcp_proc_register(net, &tcp4_seq_afinfo);
2898 }
2899
2900 static void __net_exit tcp4_proc_exit_net(struct net *net)
2901 {
2902 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2903 }
2904
2905 static struct pernet_operations tcp4_net_ops = {
2906 .init = tcp4_proc_init_net,
2907 .exit = tcp4_proc_exit_net,
2908 };
2909
2910 int __init tcp4_proc_init(void)
2911 {
2912 return register_pernet_subsys(&tcp4_net_ops);
2913 }
2914
2915 void tcp4_proc_exit(void)
2916 {
2917 unregister_pernet_subsys(&tcp4_net_ops);
2918 }
2919 #endif /* CONFIG_PROC_FS */
2920
2921 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2922 {
2923 const struct iphdr *iph = skb_gro_network_header(skb);
2924 __wsum wsum;
2925 __sum16 sum;
2926
2927 switch (skb->ip_summed) {
2928 case CHECKSUM_COMPLETE:
2929 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2930 skb->csum)) {
2931 skb->ip_summed = CHECKSUM_UNNECESSARY;
2932 break;
2933 }
2934 flush:
2935 NAPI_GRO_CB(skb)->flush = 1;
2936 return NULL;
2937
2938 case CHECKSUM_NONE:
2939 wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2940 skb_gro_len(skb), IPPROTO_TCP, 0);
2941 sum = csum_fold(skb_checksum(skb,
2942 skb_gro_offset(skb),
2943 skb_gro_len(skb),
2944 wsum));
2945 if (sum)
2946 goto flush;
2947
2948 skb->ip_summed = CHECKSUM_UNNECESSARY;
2949 break;
2950 }
2951
2952 return tcp_gro_receive(head, skb);
2953 }
2954
2955 int tcp4_gro_complete(struct sk_buff *skb)
2956 {
2957 const struct iphdr *iph = ip_hdr(skb);
2958 struct tcphdr *th = tcp_hdr(skb);
2959
2960 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2961 iph->saddr, iph->daddr, 0);
2962 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2963
2964 return tcp_gro_complete(skb);
2965 }
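/*
 * GRO glue: tcp4_gro_receive() only hands segments to the generic TCP
 * GRO engine once the checksum is known good - either validated from a
 * CHECKSUM_COMPLETE hardware sum or computed in software for
 * CHECKSUM_NONE - and flushes the flow otherwise.  tcp4_gro_complete()
 * then restores a pseudo-header checksum on the merged super-packet and
 * marks it SKB_GSO_TCPV4 so later code can resegment it if needed.
 */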
2966
2967 struct proto tcp_prot = {
2968 .name = "TCP",
2969 .owner = THIS_MODULE,
2970 .close = tcp_close,
2971 .connect = tcp_v4_connect,
2972 .disconnect = tcp_disconnect,
2973 .accept = inet_csk_accept,
2974 .ioctl = tcp_ioctl,
2975 .init = tcp_v4_init_sock,
2976 .destroy = tcp_v4_destroy_sock,
2977 .shutdown = tcp_shutdown,
2978 .setsockopt = tcp_setsockopt,
2979 .getsockopt = tcp_getsockopt,
2980 .recvmsg = tcp_recvmsg,
2981 .sendmsg = tcp_sendmsg,
2982 .sendpage = tcp_sendpage,
2983 .backlog_rcv = tcp_v4_do_rcv,
2984 .release_cb = tcp_release_cb,
2985 .hash = inet_hash,
2986 .unhash = inet_unhash,
2987 .get_port = inet_csk_get_port,
2988 .enter_memory_pressure = tcp_enter_memory_pressure,
2989 .sockets_allocated = &tcp_sockets_allocated,
2990 .orphan_count = &tcp_orphan_count,
2991 .memory_allocated = &tcp_memory_allocated,
2992 .memory_pressure = &tcp_memory_pressure,
2993 .sysctl_wmem = sysctl_tcp_wmem,
2994 .sysctl_rmem = sysctl_tcp_rmem,
2995 .max_header = MAX_TCP_HEADER,
2996 .obj_size = sizeof(struct tcp_sock),
2997 .slab_flags = SLAB_DESTROY_BY_RCU,
2998 .twsk_prot = &tcp_timewait_sock_ops,
2999 .rsk_prot = &tcp_request_sock_ops,
3000 .h.hashinfo = &tcp_hashinfo,
3001 .no_autobind = true,
3002 #ifdef CONFIG_COMPAT
3003 .compat_setsockopt = compat_tcp_setsockopt,
3004 .compat_getsockopt = compat_tcp_getsockopt,
3005 #endif
3006 #ifdef CONFIG_MEMCG_KMEM
3007 .init_cgroup = tcp_init_cgroup,
3008 .destroy_cgroup = tcp_destroy_cgroup,
3009 .proto_cgroup = tcp_proto_cgroup,
3010 #endif
3011 };
3012 EXPORT_SYMBOL(tcp_prot);
3013
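/*
 * Per-namespace setup below creates one kernel control socket per
 * possible CPU (net->ipv4.tcp_sk); these are presumably the sockets the
 * RST/ACK transmit paths earlier in this file send through, keeping
 * that path per-CPU and lock-free.
 */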
3014 static void __net_exit tcp_sk_exit(struct net *net)
3015 {
3016 int cpu;
3017
3018 for_each_possible_cpu(cpu)
3019 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3020 free_percpu(net->ipv4.tcp_sk);
3021 }
3022
3023 static int __net_init tcp_sk_init(struct net *net)
3024 {
3025 int res, cpu;
3026
3027 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3028 if (!net->ipv4.tcp_sk)
3029 return -ENOMEM;
3030
3031 for_each_possible_cpu(cpu) {
3032 struct sock *sk;
3033
3034 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3035 IPPROTO_TCP, net);
3036 if (res)
3037 goto fail;
3038 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3039 }
3040 net->ipv4.sysctl_tcp_ecn = 2;
3041 return 0;
3042
3043 fail:
3044 tcp_sk_exit(net);
3045
3046 return res;
3047 }
3048
3049 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3050 {
3051 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
3052 }
3053
3054 static struct pernet_operations __net_initdata tcp_sk_ops = {
3055 .init = tcp_sk_init,
3056 .exit = tcp_sk_exit,
3057 .exit_batch = tcp_sk_exit_batch,
3058 };
3059
3060 void __init tcp_v4_init(void)
3061 {
3062 inet_hashinfo_init(&tcp_hashinfo);
3063 if (register_pernet_subsys(&tcp_sk_ops))
3064 panic("Failed to create the TCP control socket.\n");
3065 }