net/ipv4/tcp.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed were wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It
  33  *                                      wakes people on errors. poll
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_send_reset() fixed to work for
  37  *                                      everything not just packets for
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames.
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst
  46  *                                      receive otherwise odd bits of prattle
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug.
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries.
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks,
  69  *                                      so the kernel can layer network
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties removed from the
 100  *                                      TCP code (Be very nice to this man if
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics.
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle poll() after URG properly in
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in
 110  *                                      tcp_readable(), poll() after URG
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in polling before an
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on
 137  *                                      the RFC's for other useful protocol
 138  *                                      references see: Comer, KA9Q NOS, and
 139  *                                      for a reference on the difference
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC
 147  *                                      and using multiple timers for sanity.
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       poll()->select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if state is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but it's a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
 183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in
 190  *                                      tcp_do_retransmit()
 191  *              Eric Schenk     :       Changed receiver side silly window
 192  *                                      avoidance algorithm to BSD style
 193  *                                      algorithm. This doubles throughput
 194  *                                      against machines running Solaris,
 195  *                                      and seems to result in general
 196  *                                      improvement.
 197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198  *      Willy Konynenberg       :       Transparent proxying support.
 199  *      Mike McLagan            :       Routing by source
 200  *              Keith Owens     :       Do proper merging with partial SKB's in
 201  *                                      tcp_do_sendmsg to avoid burstiness.
 202  *              Eric Schenk     :       Fix fast close down bug with
 203  *                                      shutdown() followed by close().
 204  *              Andi Kleen      :       Make poll agree with SIGIO
 205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
 206  *                                      lingertime == 0 (RFC 793 ABORT Call)
 207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
 208  *                                      csum_and_copy_from_user() if possible.
 209  *
 210  *              This program is free software; you can redistribute it and/or
 211  *              modify it under the terms of the GNU General Public License
 212  *              as published by the Free Software Foundation; either version
 213  *              2 of the License, or(at your option) any later version.
 214  *
 215  * Description of States:
 216  *
 217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 218  *
 219  *      TCP_SYN_RECV            received a connection request, sent ack,
 220  *                              waiting for final ack in three-way handshake.
 221  *
 222  *      TCP_ESTABLISHED         connection established
 223  *
 224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 225  *                              transmission of remaining buffered data
 226  *
 227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 228  *                              to shutdown
 229  *
 230  *      TCP_CLOSING             both sides have shutdown but we still have
 231  *                              data we have to finish sending
 232  *
 233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 234  *                              closed, can only be entered from FIN_WAIT2
 235  *                              or CLOSING.  Required because the other end
 236  *                              may not have gotten our last ACK causing it
 237  *                              to retransmit the data packet (which we ignore)
 238  *
 239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 240  *                              us to finish writing our data and to shutdown
 241  *                              (we have to close() to move on to LAST_ACK)
 242  *
 243  *      TCP_LAST_ACK            our side has shutdown after remote has
 244  *                              shutdown.  There may still be data in our
 245  *                              buffer that we have to finish sending
 246  *
 247  *      TCP_CLOSE               socket is finished
 248  */
 249
 250 #include <linux/config.h>
 251 #include <linux/module.h>
 252 #include <linux/types.h>
 253 #include <linux/fcntl.h>
 254 #include <linux/poll.h>
 255 #include <linux/init.h>
 256 #include <linux/smp_lock.h>
 257 #include <linux/fs.h>
 258 #include <linux/random.h>
 259 #include <linux/bootmem.h>
 260
 261 #include <net/icmp.h>
 262 #include <net/tcp.h>
 263 #include <net/xfrm.h>
 264 #include <net/ip.h>
 265
 266
 267 #include <asm/uaccess.h>
 268 #include <asm/ioctls.h>
 269
     /*
      * Starts out as TCP_FIN_TIMEOUT.
      * NOTE(review): by its name this is a sysctl-adjustable FIN timeout;
      * the code that reads it is not in this part of the file - confirm.
      */
 270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 271
     /* TCP MIB statistics block (struct tcp_mib), declared via DEFINE_SNMP_STAT. */
 272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
 273
     /*
      * Atomic counter, initialised to zero and exported GPL-only.
      * NOTE(review): presumably the number of orphaned sockets - confirm
      * at the sites that update it.
      */
 274 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 275
 276 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 277
     /*
      * Three-element tuning vectors, all exported.  sysctl_tcp_mem has no
      * initialiser here, so as a file-scope object it starts out zeroed;
      * the write and receive vectors default to { 4K, 16K, 128K } and
      * { 4K, 87380, 2 * 87380 } respectively.
      * NOTE(review): the slots presumably mean { min, default, max } and
      * the wmem/rmem values presumably are bytes - confirm against the
      * sysctl documentation.
      */
 278 int sysctl_tcp_mem[3];
 279 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
 280 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
 281
 282 EXPORT_SYMBOL(sysctl_tcp_mem);
 283 EXPORT_SYMBOL(sysctl_tcp_rmem);
 284 EXPORT_SYMBOL(sysctl_tcp_wmem);
 285
     /* Global accounting counters; no explicit initialiser, so both start at zero. */
 286 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
 287 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
 288
 289 EXPORT_SYMBOL(tcp_memory_allocated);
 290 EXPORT_SYMBOL(tcp_sockets_allocated);
 291
 292 /*
 293  * Pressure flag: try to collapse.
 294  * Technical note: it is used by multiple contexts non atomically.
 295  * All the sk_stream_mem_schedule() is of this nature: accounting
 296  * is strict, actions are advisory and have some latency.
 297  *
 298  * The flag is a plain int; tcp_enter_memory_pressure() sets it to 1.
 299  */
 298 int tcp_memory_pressure;
 299
 300 EXPORT_SYMBOL(tcp_memory_pressure);
 301
 302 void tcp_enter_memory_pressure(void)
 303 {
 304         if (!tcp_memory_pressure) {
 305                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
 306                 tcp_memory_pressure = 1;
 307         }
 308 }
 309
 310 EXPORT_SYMBOL(tcp_enter_memory_pressure);
 311
 312 /*
 313  *      Wait for a TCP event.
 314  *
 315  *      Note that we don't need to lock the socket, as the upper poll layers
 316  *      take care of normal races (between the test and the event) and we don't
 317  *      go look at any of the socket buffers directly.
 318  */
 319 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 320 {
 321         unsigned int mask;
 322         struct sock *sk = sock->sk;
 323         struct tcp_sock *tp = tcp_sk(sk);
 324
 325         poll_wait(file, sk->sk_sleep, wait);
 326         if (sk->sk_state == TCP_LISTEN)
 327                 return inet_csk_listen_poll(sk);
 328
 329         /* Socket is not locked. We are protected from async events
 330            by poll logic and correct handling of state changes
 331            made by another threads is impossible in any case.
 332          */
 333
 334         mask = 0;
 335         if (sk->sk_err)
 336                 mask = POLLERR;
 337
 338         /*
 339          * POLLHUP is certainly not done right. But poll() doesn't
 340          * have a notion of HUP in just one direction, and for a
 341          * socket the read side is more interesting.
 342          *
 343          * Some poll() documentation says that POLLHUP is incompatible
 344          * with the POLLOUT/POLLWR flags, so somebody should check this
 345          * all. But careful, it tends to be safer to return too many
 346          * bits than too few, and you can easily break real applications
 347          * if you don't tell them that something has hung up!
 348          *
 349          * Check-me.
 350          *
 351          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
 352          * our fs/select.c). It means that after we received EOF,
 353          * poll always returns immediately, making impossible poll() on write()
 354          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
 355          * if and only if shutdown has been made in both directions.
 356          * Actually, it is interesting to look how Solaris and DUX
 357          * solve this dilemma. I would prefer, if PULLHUP were maskable,
 358          * then we could set it on SND_SHUTDOWN. BTW examples given
 359          * in Stevens' books assume exactly this behaviour, it explains
 360          * why PULLHUP is incompatible with POLLOUT.    --ANK
 361          *
 362          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 363          * blocking on fresh not-connected or disconnected socket. --ANK
 364          */
 365         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
 366                 mask |= POLLHUP;
 367         if (sk->sk_shutdown & RCV_SHUTDOWN)
 368                 mask |= POLLIN | POLLRDNORM;
 369
 370         /* Connected? */
 371         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 372                 /* Potential race condition. If read of tp below will
 373                  * escape above sk->sk_state, we can be illegally awaken
 374                  * in SYN_* states. */
 375                 if ((tp->rcv_nxt != tp->copied_seq) &&
 376                     (tp->urg_seq != tp->copied_seq ||
 377                      tp->rcv_nxt != tp->copied_seq + 1 ||
 378                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
 379                         mask |= POLLIN | POLLRDNORM;
 380
 381                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
 382                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
 383                                 mask |= POLLOUT | POLLWRNORM;
 384                         } else {  /* send SIGIO later */
 385                                 set_bit(SOCK_ASYNC_NOSPACE,
 386                                         &sk->sk_socket->flags);
 387                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 388
 389                                 /* Race breaker. If space is freed after
 390                                  * wspace test but before the flags are set,
 391                                  * IO signal will be lost.
 392                                  */
 393                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
 394                                         mask |= POLLOUT | POLLWRNORM;
 395                         }
 396                 }
 397
 398                 if (tp->urg_data & TCP_URG_VALID)
 399                         mask |= POLLPRI;
 400         }
 401         return mask;
 402 }
 403
 404 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 405 {
 406         struct tcp_sock *tp = tcp_sk(sk);
 407         int answ;
 408
 409         switch (cmd) {
 410         case SIOCINQ:
 411                 if (sk->sk_state == TCP_LISTEN)
 412                         return -EINVAL;
 413
 414                 lock_sock(sk);
 415                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 416                         answ = 0;
 417                 else if (sock_flag(sk, SOCK_URGINLINE) ||
 418                          !tp->urg_data ||
 419                          before(tp->urg_seq, tp->copied_seq) ||
 420                          !before(tp->urg_seq, tp->rcv_nxt)) {
 421                         answ = tp->rcv_nxt - tp->copied_seq;
 422
 423                         /* Subtract 1, if FIN is in queue. */
 424                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
 425                                 answ -=
 426                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
 427                 } else
 428                         answ = tp->urg_seq - tp->copied_seq;
 429                 release_sock(sk);
 430                 break;
 431         case SIOCATMARK:
 432                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 433                 break;
 434         case SIOCOUTQ:
 435                 if (sk->sk_state == TCP_LISTEN)
 436                         return -EINVAL;
 437
 438                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 439                         answ = 0;
 440                 else
 441                         answ = tp->write_seq - tp->snd_una;
 442                 break;
 443         default:
 444                 return -ENOIOCTLCMD;
 445         };
 446
 447         return put_user(answ, (int __user *)arg);
 448 }
 449
 450 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 451 {
 452         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 453         tp->pushed_seq = tp->write_seq;
 454 }
 455
 456 static inline int forced_push(struct tcp_sock *tp)
 457 {
 458         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
 459 }
 460
 461 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
 462                               struct sk_buff *skb)
 463 {
 464         skb->csum = 0;
 465         TCP_SKB_CB(skb)->seq = tp->write_seq;
 466         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
 467         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
 468         TCP_SKB_CB(skb)->sacked = 0;
 469         skb_header_release(skb);
 470         __skb_queue_tail(&sk->sk_write_queue, skb);
 471         sk_charge_skb(sk, skb);
 472         if (!sk->sk_send_head)
 473                 sk->sk_send_head = skb;
 474         if (tp->nonagle & TCP_NAGLE_PUSH)
 475                 tp->nonagle &= ~TCP_NAGLE_PUSH;
 476 }
 477
 478 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
 479                                 struct sk_buff *skb)
 480 {
 481         if (flags & MSG_OOB) {
 482                 tp->urg_mode = 1;
 483                 tp->snd_up = tp->write_seq;
 484                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
 485         }
 486 }
 487
 488 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
 489                             int mss_now, int nonagle)
 490 {
 491         if (sk->sk_send_head) {
 492                 struct sk_buff *skb = sk->sk_write_queue.prev;
 493                 if (!(flags & MSG_MORE) || forced_push(tp))
 494                         tcp_mark_push(tp, skb);
 495                 tcp_mark_urg(tp, flags, skb);
 496                 __tcp_push_pending_frames(sk, tp, mss_now,
 497                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
 498         }
 499 }
 500
     /*
      * Queue 'psize' bytes, found 'poffset' bytes into the pages[] array,
      * on the socket's write queue by attaching page references to skbs
      * (no data is copied here).
      *
      * The only caller visible in this file, tcp_sendpage(), invokes this
      * between lock_sock() and release_sock().
      *
      * Returns the number of bytes queued.  If an error occurs after some
      * bytes were queued, that partial count is returned; if nothing was
      * queued, the result of sk_stream_error(sk, flags, err) is returned.
      */
 501 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
 502                          size_t psize, int flags)
 503 {
 504         struct tcp_sock *tp = tcp_sk(sk);
 505         int mss_now, size_goal;
 506         int err;
 507         ssize_t copied;
 508         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 509
 510         /* Wait for a connection to finish. */
             /* Skipped when the state is already ESTABLISHED or CLOSE_WAIT. */
 511         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 512                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 513                         goto out_err;
 514
 515         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 516
 517         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 518         size_goal = tp->xmit_size_goal;
 519         copied = 0;
 520
             /* A pending socket error or a shut-down send side fails with -EPIPE. */
 521         err = -EPIPE;
 522         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 523                 goto do_error;
 524
 525         while (psize > 0) {
                     /*
                      * skb starts as the tail of the write queue.  page/offset
                      * locate the next byte to send; size is what remains of
                      * that page, capped by psize.
                      */
 526                 struct sk_buff *skb = sk->sk_write_queue.prev;
 527                 struct page *page = pages[poffset / PAGE_SIZE];
 528                 int copy, i, can_coalesce;
 529                 int offset = poffset % PAGE_SIZE;
 530                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
 531
                     /*
                      * A new skb is needed when nothing is waiting to be sent
                      * or when the tail skb has no room left below size_goal.
                      * Otherwise copy holds the room left in the tail skb.
                      */
 532                 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
                     /* Also reached by goto from below when the skb's fragment slots are full. */
 533 new_segment:
 534                         if (!sk_stream_memory_free(sk))
 535                                 goto wait_for_sndbuf;
 536
 537                         skb = sk_stream_alloc_pskb(sk, 0, 0,
 538                                                    sk->sk_allocation);
 539                         if (!skb)
 540                                 goto wait_for_memory;
 541
 542                         skb_entail(sk, tp, skb);
 543                         copy = size_goal;
 544                 }
 545
                     /* Never take more than is left in the current page. */
 546                 if (copy > size)
 547                         copy = size;
 548
 549                 i = skb_shinfo(skb)->nr_frags;
 550                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
                     /*
                      * All MAX_SKB_FRAGS slots are in use and the data cannot
                      * be merged: mark this skb pushed and start another one.
                      */
 551                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
 552                         tcp_mark_push(tp, skb);
 553                         goto new_segment;
 554                 }
 555                 if (!sk_stream_wmem_schedule(sk, copy))
 556                         goto wait_for_memory;
 557
                     /*
                      * Either grow the last fragment in place, or take a
                      * reference on the page and describe it in slot i.
                      */
 558                 if (can_coalesce) {
 559                         skb_shinfo(skb)->frags[i - 1].size += copy;
 560                 } else {
 561                         get_page(page);
 562                         skb_fill_page_desc(skb, i, page, offset, copy);
 563                 }
 564
                     /* Account the added bytes on the skb, the socket and the sequence space. */
 565                 skb->len += copy;
 566                 skb->data_len += copy;
 567                 skb->truesize += copy;
 568                 sk->sk_wmem_queued += copy;
 569                 sk->sk_forward_alloc -= copy;
 570                 skb->ip_summed = CHECKSUM_HW;
 571                 tp->write_seq += copy;
 572                 TCP_SKB_CB(skb)->end_seq += copy;
 573                 skb_shinfo(skb)->tso_segs = 0;
 574
                     /* First chunk queued by this call: clear PSH on the skb it went into. */
 575                 if (!copied)
 576                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
 577
 578                 copied += copy;
 579                 poffset += copy;
                     /* Everything has been queued: leave through the final push. */
 580                 if (!(psize -= copy))
 581                         goto out;
 582
                     /* Keep filling while the skb is below one MSS, and always for MSG_OOB. */
 583                 if (skb->len < mss_now || (flags & MSG_OOB))
 584                         continue;
 585
 586                 if (forced_push(tp)) {
 587                         tcp_mark_push(tp, skb);
 588                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
 589                 } else if (skb == sk->sk_send_head)
 590                         tcp_push_one(sk, mss_now);
 591                 continue;
 592
                     /* The two labels below are reached only by goto; the normal path continues above. */
 593 wait_for_sndbuf:
 594                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 595 wait_for_memory:
                     /* Push out what has been queued so far, with MSG_MORE masked off, before sleeping. */
 596                 if (copied)
 597                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 598
 599                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 600                         goto do_error;
 601
                     /* Re-read the MSS and the size goal after the wait. */
 602                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 603                 size_goal = tp->xmit_size_goal;
 604         }
 605
 606 out:
 607         if (copied)
 608                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
 609         return copied;
 610
 611 do_error:
             /* A partial transfer is reported as a short count, not as an error. */
 612         if (copied)
 613                 goto out;
 614 out_err:
 615         return sk_stream_error(sk, flags, err);
 616 }
 617
 618 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 619                      size_t size, int flags)
 620 {
 621         ssize_t res;
 622         struct sock *sk = sock->sk;
 623
 624 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
 625
 626         if (!(sk->sk_route_caps & NETIF_F_SG) ||
 627             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
 628                 return sock_no_sendpage(sock, page, offset, size, flags);
 629
 630 #undef TCP_ZC_CSUM_FLAGS
 631
 632         lock_sock(sk);
 633         TCP_CHECK_TIMER(sk);
 634         res = do_tcp_sendpages(sk, &page, offset, size, flags);
 635         TCP_CHECK_TIMER(sk);
 636         release_sock(sk);
 637         return res;
 638 }
 639
 /* Shorthand for the socket's sk_sndmsg_page / sk_sndmsg_off fields. */
 640 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
 641 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
 642
 643 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
 644 {
 645         int tmp = tp->mss_cache;
 646
 647         if (sk->sk_route_caps & NETIF_F_SG) {
 648                 if (sk->sk_route_caps & NETIF_F_TSO)
 649                         tmp = 0;
 650                 else {
 651                         int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
 652
 653                         if (tmp >= pgbreak &&
 654                             tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
 655                                 tmp = pgbreak;
 656                 }
 657         }
 658
 659         return tmp;
 660 }
 661
 /*
  * tcp_sendmsg - queue the data described by msg's iovec for transmission.
  *
  * The socket lock is held for the whole call.  Unless the socket is in the
  * ESTABLISHED or CLOSE_WAIT state, the routine first waits for the
  * connection to complete.  User data is then copied, iovec segment by
  * iovec segment, into skbs on sk->sk_write_queue: into the linear tail
  * room of the skb when there is some, otherwise into page fragments that
  * use the socket's cached page (TCP_PAGE/TCP_OFF).
  *
  * Returns the number of bytes copied when any data was queued; otherwise
  * the value produced by sk_stream_error() for the failure.  The size
  * argument is not referenced in this function; lengths are taken from
  * msg->msg_iov.
  */
 662 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 663                 size_t size)
 664 {
 665         struct iovec *iov;
 666         struct tcp_sock *tp = tcp_sk(sk);
 667         struct sk_buff *skb;
 668         int iovlen, flags;
 669         int mss_now, size_goal;
 670         int err, copied;
 671         long timeo;
 672
 673         lock_sock(sk);
 674         TCP_CHECK_TIMER(sk);
 675
 676         flags = msg->msg_flags;
 677         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 678
 679         /* Wait for a connection to finish. */
 680         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 681                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 682                         goto out_err;
 683
 684         /* This should be in poll */
 685         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 686
 687         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 688         size_goal = tp->xmit_size_goal;
 689
 690         /* Ok commence sending. */
 691         iovlen = msg->msg_iovlen;
 692         iov = msg->msg_iov;
 693         copied = 0;
 694
             /* Nothing is queued on a socket that has a pending error or
              * whose send side has been shut down. */
 695         err = -EPIPE;
 696         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 697                 goto do_error;
 698
             /* Walk the iovec array one segment at a time. */
 699         while (--iovlen >= 0) {
 700                 int seglen = iov->iov_len;
 701                 unsigned char __user *from = iov->iov_base;
 702
 703                 iov++;
 704
 705                 while (seglen > 0) {
 706                         int copy;
 707
                             /* Candidate for appending: the skb at the
                              * tail of the write queue. */
 708                         skb = sk->sk_write_queue.prev;
 709
                             /* Start a new segment when there is no send
                              * head or the tail skb has already reached
                              * size_goal. */
 710                         if (!sk->sk_send_head ||
 711                             (copy = size_goal - skb->len) <= 0) {
 712
 713 new_segment:
 714                                 /* Allocate new segment. If the interface is SG,
 715                                  * allocate skb fitting to single page.
 716                                  */
 717                                 if (!sk_stream_memory_free(sk))
 718                                         goto wait_for_sndbuf;
 719
 720                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
 721                                                            0, sk->sk_allocation);
 722                                 if (!skb)
 723                                         goto wait_for_memory;
 724
 725                                 /*
 726                                  * Check whether we can use HW checksum.
 727                                  */
 728                                 if (sk->sk_route_caps &
 729                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
 730                                      NETIF_F_HW_CSUM))
 731                                         skb->ip_summed = CHECKSUM_HW;
 732
 733                                 skb_entail(sk, tp, skb);
 734                                 copy = size_goal;
 735                         }
 736
 737                         /* Try to append data to the end of skb. */
 738                         if (copy > seglen)
 739                                 copy = seglen;
 740
 741                         /* Where to copy to? */
 742                         if (skb_tailroom(skb) > 0) {
 743                                 /* We have some space in skb head. Superb! */
 744                                 if (copy > skb_tailroom(skb))
 745                                         copy = skb_tailroom(skb);
 746                                 if ((err = skb_add_data(skb, from, copy)) != 0)
 747                                         goto do_fault;
 748                         } else {
 749                                 int merge = 0;
 750                                 int i = skb_shinfo(skb)->nr_frags;
 751                                 struct page *page = TCP_PAGE(sk);
 752                                 int off = TCP_OFF(sk);
 753
 754                                 if (skb_can_coalesce(skb, i, page, off) &&
 755                                     off != PAGE_SIZE) {
 756                                         /* We can extend the last page
 757                                          * fragment. */
 758                                         merge = 1;
 759                                 } else if (i == MAX_SKB_FRAGS ||
 760                                            (!i &&
 761                                            !(sk->sk_route_caps & NETIF_F_SG))) {
 762                                         /* Need to add new fragment and cannot
 763                                          * do this because interface is non-SG,
 764                                          * or because all the page slots are
 765                                          * busy. */
 766                                         tcp_mark_push(tp, skb);
 767                                         goto new_segment;
 768                                 } else if (page) {
                                             /* The cached page is full: drop
                                              * the socket's reference and
                                              * start over with a new page. */
 769                                         if (off == PAGE_SIZE) {
 770                                                 put_page(page);
 771                                                 TCP_PAGE(sk) = page = NULL;
 772                                                 off = 0;
 773                                         }
 774                                 } else
 775                                         off = 0;
 776
                                     /* Never copy past the end of the page. */
 777                                 if (copy > PAGE_SIZE - off)
 778                                         copy = PAGE_SIZE - off;
 779
 780                                 if (!sk_stream_wmem_schedule(sk, copy))
 781                                         goto wait_for_memory;
 782
 783                                 if (!page) {
 784                                         /* Allocate new cache page. */
 785                                         if (!(page = sk_stream_alloc_page(sk)))
 786                                                 goto wait_for_memory;
 787                                 }
 788
 789                                 /* Time to copy data. We are close to
 790                                  * the end! */
 791                                 err = skb_copy_to_page(sk, from, skb, page,
 792                                                        off, copy);
 793                                 if (err) {
 794                                         /* If this page was new, give it to the
 795                                          * socket so it does not get leaked.
 796                                          */
 797                                         if (!TCP_PAGE(sk)) {
 798                                                 TCP_PAGE(sk) = page;
 799                                                 TCP_OFF(sk) = 0;
 800                                         }
 801                                         goto do_error;
 802                                 }
 803
 804                                 /* Update the skb. */
 805                                 if (merge) {
 806                                         skb_shinfo(skb)->frags[i - 1].size +=
 807                                                                         copy;
 808                                 } else {
 809                                         skb_fill_page_desc(skb, i, page, off, copy);
                                             /* Take an extra page reference
                                              * when the socket already caches
                                              * this page, or when a new page
                                              * still has unused room and is
                                              * cached here for later use. */
 810                                         if (TCP_PAGE(sk)) {
 811                                                 get_page(page);
 812                                         } else if (off + copy < PAGE_SIZE) {
 813                                                 get_page(page);
 814                                                 TCP_PAGE(sk) = page;
 815                                         }
 816                                 }
 817
 818                                 TCP_OFF(sk) = off + copy;
 819                         }
 820
 821                         if (!copied)
 822                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
 823
 824                         tp->write_seq += copy;
 825                         TCP_SKB_CB(skb)->end_seq += copy;
 826                         skb_shinfo(skb)->tso_segs = 0;
 827
 828                         from += copy;
 829                         copied += copy;
                             /* Finished once the last byte of the last
                              * iovec segment has been copied. */
 830                         if ((seglen -= copy) == 0 && iovlen == 0)
 831                                 goto out;
 832
                             /* Keep filling this skb while it holds less
                              * than one MSS, or when sending OOB data. */
 833                         if (skb->len < mss_now || (flags & MSG_OOB))
 834                                 continue;
 835
 836                         if (forced_push(tp)) {
 837                                 tcp_mark_push(tp, skb);
 838                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
 839                         } else if (skb == sk->sk_send_head)
 840                                 tcp_push_one(sk, mss_now);
 841                         continue;
 842
 843 wait_for_sndbuf:
 844                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 845 wait_for_memory:
                             /* Push out what has been queued so far before
                              * waiting for memory. */
 846                         if (copied)
 847                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 848
 849                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 850                                 goto do_error;
 851
                             /* Re-read MSS and size goal after the wait. */
 852                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 853                         size_goal = tp->xmit_size_goal;
 854                 }
 855         }
 856
 857 out:
 858         if (copied)
 859                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
 860         TCP_CHECK_TIMER(sk);
 861         release_sock(sk);
 862         return copied;
 863
     /* Reached only when skb_add_data() failed: if the skb carries no data,
      * take it off the write queue and free it. */
 864 do_fault:
 865         if (!skb->len) {
 866                 if (sk->sk_send_head == skb)
 867                         sk->sk_send_head = NULL;
 868                 __skb_unlink(skb, &sk->sk_write_queue);
 869                 sk_stream_free_skb(sk, skb);
 870         }
 871
     /* A partial write is reported as success with the byte count. */
 872 do_error:
 873         if (copied)
 874                 goto out;
 875 out_err:
 876         err = sk_stream_error(sk, flags, err);
 877         TCP_CHECK_TIMER(sk);
 878         release_sock(sk);
 879         return err;
 880 }
 881
 882 /*
 883  *      Handle reading urgent data. BSD has very simple semantics for
 884  *      this, no blocking and very strange errors 8)
 885  */
 886
 887 static int tcp_recv_urg(struct sock *sk, long timeo,
 888                         struct msghdr *msg, int len, int flags,
 889                         int *addr_len)
 890 {
 891         struct tcp_sock *tp = tcp_sk(sk);
 892
 893         /* No URG data to read. */
 894         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
 895             tp->urg_data == TCP_URG_READ)
 896                 return -EINVAL; /* Yes this is right ! */
 897
 898         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
 899                 return -ENOTCONN;
 900
 901         if (tp->urg_data & TCP_URG_VALID) {
 902                 int err = 0;
 903                 char c = tp->urg_data;
 904
 905                 if (!(flags & MSG_PEEK))
 906                         tp->urg_data = TCP_URG_READ;
 907
 908                 /* Read urgent data. */
 909                 msg->msg_flags |= MSG_OOB;
 910
 911                 if (len > 0) {
 912                         if (!(flags & MSG_TRUNC))
 913                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
 914                         len = 1;
 915                 } else
 916                         msg->msg_flags |= MSG_TRUNC;
 917
 918                 return err ? -EFAULT : len;
 919         }
 920
 921         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
 922                 return 0;
 923
 924         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
 925          * the available implementations agree in this case:
 926          * this call should never block, independent of the
 927          * blocking state of the socket.
 928          * Mike <pall@rz.uni-karlsruhe.de>
 929          */
 930         return -EAGAIN;
 931 }
 932
 933 /* Clean up the receive buffer for full frames taken by the user,
 934  * then send an ACK if necessary.  COPIED is the number of bytes
 935  * tcp_recvmsg has given to the user so far, it speeds up the
 936  * calculation of whether or not we must ACK for the sake of
 937  * a window update.
 938  */
 939 static void cleanup_rbuf(struct sock *sk, int copied)
 940 {
 941         struct tcp_sock *tp = tcp_sk(sk);
 942         int time_to_ack = 0;
 943
 944 #if TCP_DEBUG
 945         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
 946
 947         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
 948 #endif
 949
 950         if (inet_csk_ack_scheduled(sk)) {
 951                 const struct inet_connection_sock *icsk = inet_csk(sk);
 952                    /* Delayed ACKs frequently hit locked sockets during bulk
 953                     * receive. */
 954                 if (icsk->icsk_ack.blocked ||
 955                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
 956                     tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
 957                     /*
 958                      * If this read emptied read buffer, we send ACK, if
 959                      * connection is not bidirectional, user drained
 960                      * receive buffer and there was a small segment
 961                      * in queue.
 962                      */
 963                     (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
 964                      !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
 965                         time_to_ack = 1;
 966         }
 967
 968         /* We send an ACK if we can now advertise a non-zero window
 969          * which has been raised "significantly".
 970          *
 971          * Even if window raised up to infinity, do not send window open ACK
 972          * in states, where we will not receive more. It is useless.
 973          */
 974         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
 975                 __u32 rcv_window_now = tcp_receive_window(tp);
 976
 977                 /* Optimize, __tcp_select_window() is not cheap. */
 978                 if (2*rcv_window_now <= tp->window_clamp) {
 979                         __u32 new_window = __tcp_select_window(sk);
 980
 981                         /* Send ACK now, if this read freed lots of space
 982                          * in our buffer. Certainly, new_window is new window.
 983                          * We can advertise it now, if it is not less than current one.
 984                          * "Lots" means "at least twice" here.
 985                          */
 986                         if (new_window && new_window >= 2 * rcv_window_now)
 987                                 time_to_ack = 1;
 988                 }
 989         }
 990         if (time_to_ack)
 991                 tcp_send_ack(sk);
 992 }
 993
 994 static void tcp_prequeue_process(struct sock *sk)
 995 {
 996         struct sk_buff *skb;
 997         struct tcp_sock *tp = tcp_sk(sk);
 998
 999         NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1000
1001         /* RX process wants to run with disabled BHs, though it is not
1002          * necessary */
1003         local_bh_disable();
1004         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1005                 sk->sk_backlog_rcv(sk, skb);
1006         local_bh_enable();
1007
1008         /* Clear memory counter. */
1009         tp->ucopy.memory = 0;
1010 }
1011
1012 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1013 {
1014         struct sk_buff *skb;
1015         u32 offset;
1016
1017         skb_queue_walk(&sk->sk_receive_queue, skb) {
1018                 offset = seq - TCP_SKB_CB(skb)->seq;
1019                 if (skb->h.th->syn)
1020                         offset--;
1021                 if (offset < skb->len || skb->h.th->fin) {
1022                         *off = offset;
1023                         return skb;
1024                 }
1025         }
1026         return NULL;
1027 }
1028
1029 /*
1030  * This routine provides an alternative to tcp_recvmsg() for routines
1031  * that would like to handle copying from skbuffs directly in 'sendfile'
1032  * fashion.
1033  * Note:
1034  *      - It is assumed that the socket was locked by the caller.
1035  *      - The routine does not block.
1036  *      - At present, there is no support for reading OOB data
1037  *        or for 'peeking' the socket using this routine
1038  *        (although both would be easy to implement).
      *
      * Starting at tp->copied_seq, each queued skb is offered to recv_actor
      * together with the offset and length of its unread part.  Reading
      * stops at pending urgent data, when recv_actor does not consume a
      * whole skb, after a FIN has been consumed, when desc->count drops to
      * zero, or when no further skb is found.
      *
      * Returns -ENOTCONN for a listening socket, otherwise the number of
      * bytes accepted by recv_actor (possibly 0).
1039  */
1040 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1041                   sk_read_actor_t recv_actor)
1042 {
1043         struct sk_buff *skb;
1044         struct tcp_sock *tp = tcp_sk(sk);
1045         u32 seq = tp->copied_seq;
1046         u32 offset;
1047         int copied = 0;
1048
1049         if (sk->sk_state == TCP_LISTEN)
1050                 return -ENOTCONN;
1051         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
                     /* offset >= skb->len can only mean a FIN-carrying skb
                      * with no unread payload (see tcp_recv_skb()). */
1052                 if (offset < skb->len) {
1053                         size_t used, len;
1054
1055                         len = skb->len - offset;
1056                         /* Stop reading if we hit a patch of urgent data */
1057                         if (tp->urg_data) {
1058                                 u32 urg_offset = tp->urg_seq - seq;
1059                                 if (urg_offset < len)
1060                                         len = urg_offset;
1061                                 if (!len)
1062                                         break;
1063                         }
1064                         used = recv_actor(desc, skb, offset, len);
                             /* Only a result within the offered length is
                              * accounted for.
                              * NOTE(review): used is a size_t, so presumably
                              * a negative return from recv_actor shows up
                              * here as a huge value and is ignored; confirm
                              * against the sk_read_actor_t definition. */
1065                         if (used <= len) {
1066                                 seq += used;
1067                                 copied += used;
1068                                 offset += used;
1069                         }
                             /* The skb was not fully consumed: leave it
                              * queued and stop. */
1070                         if (offset != skb->len)
1071                                 break;
1072                 }
                     /* A FIN occupies one sequence number; consume it and
                      * stop reading. */
1073                 if (skb->h.th->fin) {
1074                         sk_eat_skb(sk, skb);
1075                         ++seq;
1076                         break;
1077                 }
1078                 sk_eat_skb(sk, skb);
1079                 if (!desc->count)
1080                         break;
1081         }
1082         tp->copied_seq = seq;
1083
1084         tcp_rcv_space_adjust(sk);
1085
1086         /* Clean up data we have read: This will do ACK frames. */
1087         if (copied)
1088                 cleanup_rbuf(sk, copied);
1089         return copied;
1090 }
1091
1092 /*
1093  *      This routine copies from a sock struct into the user buffer.
1094  *
1095  *      Technical note: in 2.3 we work on _locked_ socket, so that
1096  *      tricks with *seq access order and skb->users are not required.
1097  *      Probably, code can be easily improved even more.
1098  */
1099
1100 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1101                 size_t len, int nonblock, int flags, int *addr_len)
1102 {
1103         struct tcp_sock *tp = tcp_sk(sk);
1104         int copied = 0;
1105         u32 peek_seq;
1106         u32 *seq;
1107         unsigned long used;
1108         int err;
1109         int target;             /* Read at least this many bytes */
1110         long timeo;
1111         struct task_struct *user_recv = NULL;
1112
1113         lock_sock(sk);
1114
1115         TCP_CHECK_TIMER(sk);
1116
1117         err = -ENOTCONN;
1118         if (sk->sk_state == TCP_LISTEN)
1119                 goto out;
1120
1121         timeo = sock_rcvtimeo(sk, nonblock);
1122
1123         /* Urgent data needs to be handled specially. */
1124         if (flags & MSG_OOB)
1125                 goto recv_urg;
1126
1127         seq = &tp->copied_seq;
1128         if (flags & MSG_PEEK) {
1129                 peek_seq = tp->copied_seq;
1130                 seq = &peek_seq;
1131         }
1132
1133         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1134
1135         do {
1136                 struct sk_buff *skb;
1137                 u32 offset;
1138
1139                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1140                 if (tp->urg_data && tp->urg_seq == *seq) {
1141                         if (copied)
1142                                 break;
1143                         if (signal_pending(current)) {
1144                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1145                                 break;
1146                         }
1147                 }
1148
1149                 /* Next get a buffer. */
1150
1151                 skb = skb_peek(&sk->sk_receive_queue);
1152                 do {
1153                         if (!skb)
1154                                 break;
1155
1156                         /* Now that we have two receive queues this
1157                          * shouldn't happen.
1158                          */
1159                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1160                                 printk(KERN_INFO "recvmsg bug: copied %X "
1161                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1162                                 break;
1163                         }
1164                         offset = *seq - TCP_SKB_CB(skb)->seq;
1165                         if (skb->h.th->syn)
1166                                 offset--;
1167                         if (offset < skb->len)
1168                                 goto found_ok_skb;
1169                         if (skb->h.th->fin)
1170                                 goto found_fin_ok;
1171                         BUG_TRAP(flags & MSG_PEEK);
1172                         skb = skb->next;
1173                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1174
1175                 /* Well, if we have backlog, try to process it now yet. */
1176
1177                 if (copied >= target && !sk->sk_backlog.tail)
1178                         break;
1179
1180                 if (copied) {
1181                         if (sk->sk_err ||
1182                             sk->sk_state == TCP_CLOSE ||
1183                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1184                             !timeo ||
1185                             signal_pending(current) ||
1186                             (flags & MSG_PEEK))
1187                                 break;
1188                 } else {
1189                         if (sock_flag(sk, SOCK_DONE))
1190                                 break;
1191
1192                         if (sk->sk_err) {
1193                                 copied = sock_error(sk);
1194                                 break;
1195                         }
1196
1197                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1198                                 break;
1199
1200                         if (sk->sk_state == TCP_CLOSE) {
1201                                 if (!sock_flag(sk, SOCK_DONE)) {
1202                                         /* This occurs when user tries to read
1203                                          * from never connected socket.
1204                                          */
1205                                         copied = -ENOTCONN;
1206                                         break;
1207                                 }
1208                                 break;
1209                         }
1210
1211                         if (!timeo) {
1212                                 copied = -EAGAIN;
1213                                 break;
1214                         }
1215
1216                         if (signal_pending(current)) {
1217                                 copied = sock_intr_errno(timeo);
1218                                 break;
1219                         }
1220                 }
1221
1222                 cleanup_rbuf(sk, copied);
1223
1224                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1225                         /* Install new reader */
1226                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1227                                 user_recv = current;
1228                                 tp->ucopy.task = user_recv;
1229                                 tp->ucopy.iov = msg->msg_iov;
1230                         }
1231
1232                         tp->ucopy.len = len;
1233
1234                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1235                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1236
1237                         /* Ugly... If prequeue is not empty, we have to
1238                          * process it before releasing socket, otherwise
1239                          * order will be broken at second iteration.
1240                          * More elegant solution is required!!!
1241                          *
1242                          * Look: we have the following (pseudo)queues:
1243                          *
1244                          * 1. packets in flight
1245                          * 2. backlog
1246                          * 3. prequeue
1247                          * 4. receive_queue
1248                          *
1249                          * Each queue can be processed only if the next ones
1250                          * are empty. At this point we have empty receive_queue.
1251                          * But prequeue _can_ be not empty after 2nd iteration,
1252                          * when we jumped to start of loop because backlog
1253                          * processing added something to receive_queue.
1254                          * We cannot release_sock(), because backlog contains
1255                          * packets arrived _after_ prequeued ones.
1256                          *
1257                          * Shortly, algorithm is clear --- to process all
1258                          * the queues in order. We could make it more directly,
1259                          * requeueing packets from backlog to prequeue, if
1260                          * is not empty. It is more elegant, but eats cycles,
1261                          * unfortunately.
1262                          */
1263                         if (!skb_queue_empty(&tp->ucopy.prequeue))
1264                                 goto do_prequeue;
1265
1266                         /* __ Set realtime policy in scheduler __ */
1267                 }
1268
1269                 if (copied >= target) {
1270                         /* Do not sleep, just process backlog. */
1271                         release_sock(sk);
1272                         lock_sock(sk);
1273                 } else
1274                         sk_wait_data(sk, &timeo);
1275
1276                 if (user_recv) {
1277                         int chunk;
1278
1279                         /* __ Restore normal policy in scheduler __ */
1280
1281                         if ((chunk = len - tp->ucopy.len) != 0) {
1282                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1283                                 len -= chunk;
1284                                 copied += chunk;
1285                         }
1286
1287                         if (tp->rcv_nxt == tp->copied_seq &&
1288                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1289 do_prequeue:
1290                                 tcp_prequeue_process(sk);
1291
1292                                 if ((chunk = len - tp->ucopy.len) != 0) {
1293                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1294                                         len -= chunk;
1295                                         copied += chunk;
1296                                 }
1297                         }
1298                 }
1299                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1300                         if (net_ratelimit())
1301                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1302                                        current->comm, current->pid);
1303                         peek_seq = tp->copied_seq;
1304                 }
1305                 continue;
1306
1307         found_ok_skb:
1308                 /* Ok so how much can we use? */
1309                 used = skb->len - offset;
1310                 if (len < used)
1311                         used = len;
1312
1313                 /* Do we have urgent data here? */
1314                 if (tp->urg_data) {
1315                         u32 urg_offset = tp->urg_seq - *seq;
1316                         if (urg_offset < used) {
1317                                 if (!urg_offset) {
1318                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1319                                                 ++*seq;
1320                                                 offset++;
1321                                                 used--;
1322                                                 if (!used)
1323                                                         goto skip_copy;
1324                                         }
1325                                 } else
1326                                         used = urg_offset;
1327                         }
1328                 }
1329
1330                 if (!(flags & MSG_TRUNC)) {
1331                         err = skb_copy_datagram_iovec(skb, offset,
1332                                                       msg->msg_iov, used);
1333                         if (err) {
1334                                 /* Exception. Bailout! */
1335                                 if (!copied)
1336                                         copied = -EFAULT;
1337                                 break;
1338                         }
1339                 }
1340
1341                 *seq += used;
1342                 copied += used;
1343                 len -= used;
1344
1345                 tcp_rcv_space_adjust(sk);
1346
1347 skip_copy:
1348                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1349                         tp->urg_data = 0;
1350                         tcp_fast_path_check(sk, tp);
1351                 }
1352                 if (used + offset < skb->len)
1353                         continue;
1354
1355                 if (skb->h.th->fin)
1356                         goto found_fin_ok;
1357                 if (!(flags & MSG_PEEK))
1358                         sk_eat_skb(sk, skb);
1359                 continue;
1360
1361         found_fin_ok:
1362                 /* Process the FIN. */
1363                 ++*seq;
1364                 if (!(flags & MSG_PEEK))
1365                         sk_eat_skb(sk, skb);
1366                 break;
1367         } while (len > 0);
1368
1369         if (user_recv) {
1370                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1371                         int chunk;
1372
1373                         tp->ucopy.len = copied > 0 ? len : 0;
1374
1375                         tcp_prequeue_process(sk);
1376
1377                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1378                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1379                                 len -= chunk;
1380                                 copied += chunk;
1381                         }
1382                 }
1383
1384                 tp->ucopy.task = NULL;
1385                 tp->ucopy.len = 0;
1386         }
1387
1388         /* According to UNIX98, msg_name/msg_namelen are ignored
1389          * on connected socket. I was just happy when found this 8) --ANK
1390          */
1391
1392         /* Clean up data we have read: This will do ACK frames. */
1393         cleanup_rbuf(sk, copied);
1394
1395         TCP_CHECK_TIMER(sk);
1396         release_sock(sk);
1397         return copied;
1398
1399 out:
1400         TCP_CHECK_TIMER(sk);
1401         release_sock(sk);
1402         return err;
1403
1404 recv_urg:
1405         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1406         goto out;
1407 }
1408
1409 /*
1410  *      State processing on a close. This implements the state shift for
1411  *      sending our FIN frame. Note that we only send a FIN for some
1412  *      states. A shutdown() may have already sent the FIN, or we may be
1413  *      closed.
1414  */
1415
1416 static const unsigned char new_state[16] = {
1417   /* current state:        new state:      action:      */
1418   /* (Invalid)          */ TCP_CLOSE,
1419   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1420   /* TCP_SYN_SENT       */ TCP_CLOSE,
1421   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1422   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1423   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1424   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1425   /* TCP_CLOSE          */ TCP_CLOSE,
1426   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1427   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1428   /* TCP_LISTEN         */ TCP_CLOSE,
1429   /* TCP_CLOSING        */ TCP_CLOSING,
1430 };
1431
1432 static int tcp_close_state(struct sock *sk)
1433 {
1434         int next = (int)new_state[sk->sk_state];
1435         int ns = next & TCP_STATE_MASK;
1436
1437         tcp_set_state(sk, ns);
1438
1439         return next & TCP_ACTION_FIN;
1440 }
1441
1442 /*
1443  *      Shutdown the sending side of a connection. Much like close except
1444  *      that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1445  */
1446
1447 void tcp_shutdown(struct sock *sk, int how)
1448 {
1449         /*      We need to grab some memory, and put together a FIN,
1450          *      and then put it into the queue to be sent.
1451          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1452          */
1453         if (!(how & SEND_SHUTDOWN))
1454                 return;
1455
1456         /* If we've already sent a FIN, or it's a closed state, skip this. */
1457         if ((1 << sk->sk_state) &
1458             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1459              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1460                 /* Clear out any half completed packets.  FIN if needed. */
1461                 if (tcp_close_state(sk))
1462                         tcp_send_fin(sk);
1463         }
1464 }
1465
1466 void tcp_close(struct sock *sk, long timeout)
1467 {
1468         struct sk_buff *skb;
1469         int data_was_unread = 0;
1470
1471         lock_sock(sk);
1472         sk->sk_shutdown = SHUTDOWN_MASK;
1473
1474         if (sk->sk_state == TCP_LISTEN) {
1475                 tcp_set_state(sk, TCP_CLOSE);
1476
1477                 /* Special case. */
1478                 inet_csk_listen_stop(sk);
1479
1480                 goto adjudge_to_death;
1481         }
1482
1483         /*  We need to flush the recv. buffs.  We do this only on the
1484          *  descriptor close, not protocol-sourced closes, because the
1485          *  reader process may not have drained the data yet!
1486          */
1487         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1488                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1489                           skb->h.th->fin;
1490                 data_was_unread += len;
1491                 __kfree_skb(skb);
1492         }
1493
1494         sk_stream_mem_reclaim(sk);
1495
1496         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1497          * 3.10, we send a RST here because data was lost.  To
1498          * witness the awful effects of the old behavior of always
1499          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1500          * a bulk GET in an FTP client, suspend the process, wait
1501          * for the client to advertise a zero window, then kill -9
1502          * the FTP client, wheee...  Note: timeout is always zero
1503          * in such a case.
1504          */
1505         if (data_was_unread) {
1506                 /* Unread data was tossed, zap the connection. */
1507                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1508                 tcp_set_state(sk, TCP_CLOSE);
1509                 tcp_send_active_reset(sk, GFP_KERNEL);
1510         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1511                 /* Check zero linger _after_ checking for unread data. */
1512                 sk->sk_prot->disconnect(sk, 0);
1513                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1514         } else if (tcp_close_state(sk)) {
1515                 /* We FIN if the application ate all the data before
1516                  * zapping the connection.
1517                  */
1518
1519                 /* RED-PEN. Formally speaking, we have broken TCP state
1520                  * machine. State transitions:
1521                  *
1522                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1523                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1524                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1525                  *
1526                  * are legal only when FIN has been sent (i.e. in window),
1527                  * rather than queued out of window. Purists blame.
1528                  *
1529                  * F.e. "RFC state" is ESTABLISHED,
1530                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1531                  *
1532                  * The visible declinations are that sometimes
1533                  * we enter time-wait state, when it is not required really
1534                  * (harmless), do not send active resets, when they are
1535                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1536                  * they look as CLOSING or LAST_ACK for Linux)
1537                  * Probably, I missed some more holelets.
1538                  *                                              --ANK
1539                  */
1540                 tcp_send_fin(sk);
1541         }
1542
1543         sk_stream_wait_close(sk, timeout);
1544
1545 adjudge_to_death:
1546         /* It is the last release_sock in its life. It will remove backlog. */
1547         release_sock(sk);
1548
1549
1550         /* Now socket is owned by kernel and we acquire BH lock
1551            to finish close. No need to check for user refs.
1552          */
1553         local_bh_disable();
1554         bh_lock_sock(sk);
1555         BUG_TRAP(!sock_owned_by_user(sk));
1556
1557         sock_hold(sk);
1558         sock_orphan(sk);
1559
1560         /*      This is a (useful) BSD violating of the RFC. There is a
1561          *      problem with TCP as specified in that the other end could
1562          *      keep a socket open forever with no application left this end.
1563          *      We use a 3 minute timeout (about the same as BSD) then kill
1564          *      our end. If they send after that then tough - BUT: long enough
1565          *      that we won't make the old 4*rto = almost no time - whoops
1566          *      reset mistake.
1567          *
1568          *      Nope, it was not mistake. It is really desired behaviour
1569          *      f.e. on http servers, when such sockets are useless, but
1570          *      consume significant resources. Let's do it with special
1571          *      linger2 option.                                 --ANK
1572          */
1573
1574         if (sk->sk_state == TCP_FIN_WAIT2) {
1575                 struct tcp_sock *tp = tcp_sk(sk);
1576                 if (tp->linger2 < 0) {
1577                         tcp_set_state(sk, TCP_CLOSE);
1578                         tcp_send_active_reset(sk, GFP_ATOMIC);
1579                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1580                 } else {
1581                         const int tmo = tcp_fin_time(sk);
1582
1583                         if (tmo > TCP_TIMEWAIT_LEN) {
1584                                 inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
1585                         } else {
1586                                 atomic_inc(sk->sk_prot->orphan_count);
1587                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1588                                 goto out;
1589                         }
1590                 }
1591         }
1592         if (sk->sk_state != TCP_CLOSE) {
1593                 sk_stream_mem_reclaim(sk);
1594                 if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
1595                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1596                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1597                         if (net_ratelimit())
1598                                 printk(KERN_INFO "TCP: too many of orphaned "
1599                                        "sockets\n");
1600                         tcp_set_state(sk, TCP_CLOSE);
1601                         tcp_send_active_reset(sk, GFP_ATOMIC);
1602                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1603                 }
1604         }
1605         atomic_inc(sk->sk_prot->orphan_count);
1606
1607         if (sk->sk_state == TCP_CLOSE)
1608                 inet_csk_destroy_sock(sk);
1609         /* Otherwise, socket is reprieved until protocol close. */
1610
1611 out:
1612         bh_unlock_sock(sk);
1613         local_bh_enable();
1614         sock_put(sk);
1615 }
1616
1617 /* These states need RST on ABORT according to RFC793 */
1618
1619 static inline int tcp_need_reset(int state)
1620 {
1621         return (1 << state) &
1622                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1623                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1624 }
1625
1626 int tcp_disconnect(struct sock *sk, int flags)
1627 {
1628         struct inet_sock *inet = inet_sk(sk);
1629         struct inet_connection_sock *icsk = inet_csk(sk);
1630         struct tcp_sock *tp = tcp_sk(sk);
1631         int err = 0;
1632         int old_state = sk->sk_state;
1633
1634         if (old_state != TCP_CLOSE)
1635                 tcp_set_state(sk, TCP_CLOSE);
1636
1637         /* ABORT function of RFC793 */
1638         if (old_state == TCP_LISTEN) {
1639                 inet_csk_listen_stop(sk);
1640         } else if (tcp_need_reset(old_state) ||
1641                    (tp->snd_nxt != tp->write_seq &&
1642                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1643                 /* The last check adjusts for discrepancy of Linux wrt. RFC
1644                  * states
1645                  */
1646                 tcp_send_active_reset(sk, gfp_any());
1647                 sk->sk_err = ECONNRESET;
1648         } else if (old_state == TCP_SYN_SENT)
1649                 sk->sk_err = ECONNRESET;
1650
1651         tcp_clear_xmit_timers(sk);
1652         __skb_queue_purge(&sk->sk_receive_queue);
1653         sk_stream_writequeue_purge(sk);
1654         __skb_queue_purge(&tp->out_of_order_queue);
1655
1656         inet->dport = 0;
1657
1658         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1659                 inet_reset_saddr(sk);
1660
1661         sk->sk_shutdown = 0;
1662         sock_reset_flag(sk, SOCK_DONE);
1663         tp->srtt = 0;
1664         if ((tp->write_seq += tp->max_window + 2) == 0)
1665                 tp->write_seq = 1;
1666         icsk->icsk_backoff = 0;
1667         tp->snd_cwnd = 2;
1668         icsk->icsk_probes_out = 0;
1669         tp->packets_out = 0;
1670         tp->snd_ssthresh = 0x7fffffff;
1671         tp->snd_cwnd_cnt = 0;
1672         tp->bytes_acked = 0;
1673         tcp_set_ca_state(sk, TCP_CA_Open);
1674         tcp_clear_retrans(tp);
1675         inet_csk_delack_init(sk);
1676         sk->sk_send_head = NULL;
1677         tp->rx_opt.saw_tstamp = 0;
1678         tcp_sack_reset(&tp->rx_opt);
1679         __sk_dst_reset(sk);
1680
1681         BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
1682
1683         sk->sk_error_report(sk);
1684         return err;
1685 }
1686
1687 /*
1688  *      Socket option code for TCP.
1689  */
1690 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1691                    int optlen)
1692 {
1693         struct tcp_sock *tp = tcp_sk(sk);
1694         struct inet_connection_sock *icsk = inet_csk(sk);
1695         int val;
1696         int err = 0;
1697
1698         if (level != SOL_TCP)
1699                 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
1700                                                      optval, optlen);
1701
1702         /* This is a string value all the others are int's */
1703         if (optname == TCP_CONGESTION) {
1704                 char name[TCP_CA_NAME_MAX];
1705
1706                 if (optlen < 1)
1707                         return -EINVAL;
1708
1709                 val = strncpy_from_user(name, optval,
1710                                         min(TCP_CA_NAME_MAX-1, optlen));
1711                 if (val < 0)
1712                         return -EFAULT;
1713                 name[val] = 0;
1714
1715                 lock_sock(sk);
1716                 err = tcp_set_congestion_control(sk, name);
1717                 release_sock(sk);
1718                 return err;
1719         }
1720
1721         if (optlen < sizeof(int))
1722                 return -EINVAL;
1723
1724         if (get_user(val, (int __user *)optval))
1725                 return -EFAULT;
1726
1727         lock_sock(sk);
1728
1729         switch (optname) {
1730         case TCP_MAXSEG:
1731                 /* Values greater than interface MTU won't take effect. However
1732                  * at the point when this call is done we typically don't yet
1733                  * know which interface is going to be used */
1734                 if (val < 8 || val > MAX_TCP_WINDOW) {
1735                         err = -EINVAL;
1736                         break;
1737                 }
1738                 tp->rx_opt.user_mss = val;
1739                 break;
1740
1741         case TCP_NODELAY:
1742                 if (val) {
1743                         /* TCP_NODELAY is weaker than TCP_CORK, so that
1744                          * this option on corked socket is remembered, but
1745                          * it is not activated until cork is cleared.
1746                          *
1747                          * However, when TCP_NODELAY is set we make
1748                          * an explicit push, which overrides even TCP_CORK
1749                          * for currently queued segments.
1750                          */
1751                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1752                         tcp_push_pending_frames(sk, tp);
1753                 } else {
1754                         tp->nonagle &= ~TCP_NAGLE_OFF;
1755                 }
1756                 break;
1757
1758         case TCP_CORK:
1759                 /* When set indicates to always queue non-full frames.
1760                  * Later the user clears this option and we transmit
1761                  * any pending partial frames in the queue.  This is
1762                  * meant to be used alongside sendfile() to get properly
1763                  * filled frames when the user (for example) must write
1764                  * out headers with a write() call first and then use
1765                  * sendfile to send out the data parts.
1766                  *
1767                  * TCP_CORK can be set together with TCP_NODELAY and it is
1768                  * stronger than TCP_NODELAY.
1769                  */
1770                 if (val) {
1771                         tp->nonagle |= TCP_NAGLE_CORK;
1772                 } else {
1773                         tp->nonagle &= ~TCP_NAGLE_CORK;
1774                         if (tp->nonagle&TCP_NAGLE_OFF)
1775                                 tp->nonagle |= TCP_NAGLE_PUSH;
1776                         tcp_push_pending_frames(sk, tp);
1777                 }
1778                 break;
1779
1780         case TCP_KEEPIDLE:
1781                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
1782                         err = -EINVAL;
1783                 else {
1784                         tp->keepalive_time = val * HZ;
1785                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
1786                             !((1 << sk->sk_state) &
1787                               (TCPF_CLOSE | TCPF_LISTEN))) {
1788                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1789                                 if (tp->keepalive_time > elapsed)
1790                                         elapsed = tp->keepalive_time - elapsed;
1791                                 else
1792                                         elapsed = 0;
1793                                 inet_csk_reset_keepalive_timer(sk, elapsed);
1794                         }
1795                 }
1796                 break;
1797         case TCP_KEEPINTVL:
1798                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
1799                         err = -EINVAL;
1800                 else
1801                         tp->keepalive_intvl = val * HZ;
1802                 break;
1803         case TCP_KEEPCNT:
1804                 if (val < 1 || val > MAX_TCP_KEEPCNT)
1805                         err = -EINVAL;
1806                 else
1807                         tp->keepalive_probes = val;
1808                 break;
1809         case TCP_SYNCNT:
1810                 if (val < 1 || val > MAX_TCP_SYNCNT)
1811                         err = -EINVAL;
1812                 else
1813                         icsk->icsk_syn_retries = val;
1814                 break;
1815
1816         case TCP_LINGER2:
1817                 if (val < 0)
1818                         tp->linger2 = -1;
1819                 else if (val > sysctl_tcp_fin_timeout / HZ)
1820                         tp->linger2 = 0;
1821                 else
1822                         tp->linger2 = val * HZ;
1823                 break;
1824
1825         case TCP_DEFER_ACCEPT:
1826                 icsk->icsk_accept_queue.rskq_defer_accept = 0;
1827                 if (val > 0) {
1828                         /* Translate value in seconds to number of
1829                          * retransmits */
1830                         while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
1831                                val > ((TCP_TIMEOUT_INIT / HZ) <<
1832                                        icsk->icsk_accept_queue.rskq_defer_accept))
1833                                 icsk->icsk_accept_queue.rskq_defer_accept++;
1834                         icsk->icsk_accept_queue.rskq_defer_accept++;
1835                 }
1836                 break;
1837
1838         case TCP_WINDOW_CLAMP:
1839                 if (!val) {
1840                         if (sk->sk_state != TCP_CLOSE) {
1841                                 err = -EINVAL;
1842                                 break;
1843                         }
1844                         tp->window_clamp = 0;
1845                 } else
1846                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
1847                                                 SOCK_MIN_RCVBUF / 2 : val;
1848                 break;
1849
1850         case TCP_QUICKACK:
1851                 if (!val) {
1852                         icsk->icsk_ack.pingpong = 1;
1853                 } else {
1854                         icsk->icsk_ack.pingpong = 0;
1855                         if ((1 << sk->sk_state) &
1856                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1857                             inet_csk_ack_scheduled(sk)) {
1858                                 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1859                                 cleanup_rbuf(sk, 1);
1860                                 if (!(val & 1))
1861                                         icsk->icsk_ack.pingpong = 1;
1862                         }
1863                 }
1864                 break;
1865
1866         default:
1867                 err = -ENOPROTOOPT;
1868                 break;
1869         };
1870         release_sock(sk);
1871         return err;
1872 }
1873
1874 /* Return information about state of tcp endpoint in API format. */
1875 void tcp_get_info(struct sock *sk, struct tcp_info *info)
1876 {
1877         struct tcp_sock *tp = tcp_sk(sk);
1878         const struct inet_connection_sock *icsk = inet_csk(sk);
1879         u32 now = tcp_time_stamp;
1880
1881         memset(info, 0, sizeof(*info));
1882
1883         info->tcpi_state = sk->sk_state;
1884         info->tcpi_ca_state = icsk->icsk_ca_state;
1885         info->tcpi_retransmits = icsk->icsk_retransmits;
1886         info->tcpi_probes = icsk->icsk_probes_out;
1887         info->tcpi_backoff = icsk->icsk_backoff;
1888
1889         if (tp->rx_opt.tstamp_ok)
1890                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1891         if (tp->rx_opt.sack_ok)
1892                 info->tcpi_options |= TCPI_OPT_SACK;
1893         if (tp->rx_opt.wscale_ok) {
1894                 info->tcpi_options |= TCPI_OPT_WSCALE;
1895                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
1896                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
1897         }
1898
1899         if (tp->ecn_flags&TCP_ECN_OK)
1900                 info->tcpi_options |= TCPI_OPT_ECN;
1901
1902         info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
1903         info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
1904         info->tcpi_snd_mss = tp->mss_cache;
1905         info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
1906
1907         info->tcpi_unacked = tp->packets_out;
1908         info->tcpi_sacked = tp->sacked_out;
1909         info->tcpi_lost = tp->lost_out;
1910         info->tcpi_retrans = tp->retrans_out;
1911         info->tcpi_fackets = tp->fackets_out;
1912
1913         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
1914         info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
1915         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
1916
1917         info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
1918         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
1919         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
1920         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
1921         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
1922         info->tcpi_snd_cwnd = tp->snd_cwnd;
1923         info->tcpi_advmss = tp->advmss;
1924         info->tcpi_reordering = tp->reordering;
1925
1926         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
1927         info->tcpi_rcv_space = tp->rcvq_space.space;
1928
1929         info->tcpi_total_retrans = tp->total_retrans;
1930 }
1931
1932 EXPORT_SYMBOL_GPL(tcp_get_info);
1933
1934 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
1935                    int __user *optlen)
1936 {
1937         struct inet_connection_sock *icsk = inet_csk(sk);
1938         struct tcp_sock *tp = tcp_sk(sk);
1939         int val, len;
1940
1941         if (level != SOL_TCP)
1942                 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
1943                                                      optval, optlen);
1944
1945         if (get_user(len, optlen))
1946                 return -EFAULT;
1947
1948         len = min_t(unsigned int, len, sizeof(int));
1949
1950         if (len < 0)
1951                 return -EINVAL;
1952
1953         switch (optname) {
1954         case TCP_MAXSEG:
1955                 val = tp->mss_cache;
1956                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
1957                         val = tp->rx_opt.user_mss;
1958                 break;
1959         case TCP_NODELAY:
1960                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
1961                 break;
1962         case TCP_CORK:
1963                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
1964                 break;
1965         case TCP_KEEPIDLE:
1966                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
1967                 break;
1968         case TCP_KEEPINTVL:
1969                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
1970                 break;
1971         case TCP_KEEPCNT:
1972                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
1973                 break;
1974         case TCP_SYNCNT:
1975                 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
1976                 break;
1977         case TCP_LINGER2:
1978                 val = tp->linger2;
1979                 if (val >= 0)
1980                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
1981                 break;
1982         case TCP_DEFER_ACCEPT:
1983                 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
1984                         ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
1985                 break;
1986         case TCP_WINDOW_CLAMP:
1987                 val = tp->window_clamp;
1988                 break;
1989         case TCP_INFO: {
1990                 struct tcp_info info;
1991
1992                 if (get_user(len, optlen))
1993                         return -EFAULT;
1994
1995                 tcp_get_info(sk, &info);
1996
1997                 len = min_t(unsigned int, len, sizeof(info));
1998                 if (put_user(len, optlen))
1999                         return -EFAULT;
2000                 if (copy_to_user(optval, &info, len))
2001                         return -EFAULT;
2002                 return 0;
2003         }
2004         case TCP_QUICKACK:
2005                 val = !icsk->icsk_ack.pingpong;
2006                 break;
2007
2008         case TCP_CONGESTION:
2009                 if (get_user(len, optlen))
2010                         return -EFAULT;
2011                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2012                 if (put_user(len, optlen))
2013                         return -EFAULT;
2014                 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2015                         return -EFAULT;
2016                 return 0;
2017         default:
2018                 return -ENOPROTOOPT;
2019         };
2020
2021         if (put_user(len, optlen))
2022                 return -EFAULT;
2023         if (copy_to_user(optval, &val, len))
2024                 return -EFAULT;
2025         return 0;
2026 }
2027
2028
2029 extern void __skb_cb_too_small_for_tcp(int, int);
2030 extern struct tcp_congestion_ops tcp_reno;
2031
2032 static __initdata unsigned long thash_entries;
2033 static int __init set_thash_entries(char *str)
2034 {
2035         if (!str)
2036                 return 0;
2037         thash_entries = simple_strtoul(str, &str, 0);
2038         return 1;
2039 }
2040 __setup("thash_entries=", set_thash_entries);
2041
2042 void __init tcp_init(void)
2043 {
2044         struct sk_buff *skb = NULL;
2045         int order, i;
2046
2047         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2048                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2049                                            sizeof(skb->cb));
2050
2051         tcp_hashinfo.bind_bucket_cachep =
2052                 kmem_cache_create("tcp_bind_bucket",
2053                                   sizeof(struct inet_bind_bucket), 0,
2054                                   SLAB_HWCACHE_ALIGN, NULL, NULL);
2055         if (!tcp_hashinfo.bind_bucket_cachep)
2056                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2057
2058         /* Size and allocate the main established and bind bucket
2059          * hash tables.
2060          *
2061          * The methodology is similar to that of the buffer cache.
2062          */
2063         tcp_hashinfo.ehash =
2064                 alloc_large_system_hash("TCP established",
2065                                         sizeof(struct inet_ehash_bucket),
2066                                         thash_entries,
2067                                         (num_physpages >= 128 * 1024) ?
2068                                         13 : 15,
2069                                         HASH_HIGHMEM,
2070                                         &tcp_hashinfo.ehash_size,
2071                                         NULL,
2072                                         0);
2073         tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
2074         for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
2075                 rwlock_init(&tcp_hashinfo.ehash[i].lock);
2076                 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2077         }
2078
2079         tcp_hashinfo.bhash =
2080                 alloc_large_system_hash("TCP bind",
2081                                         sizeof(struct inet_bind_hashbucket),
2082                                         tcp_hashinfo.ehash_size,
2083                                         (num_physpages >= 128 * 1024) ?
2084                                         13 : 15,
2085                                         HASH_HIGHMEM,
2086                                         &tcp_hashinfo.bhash_size,
2087                                         NULL,
2088                                         64 * 1024);
2089         tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2090         for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2091                 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2092                 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2093         }
2094
2095         /* Try to be a bit smarter and adjust defaults depending
2096          * on available memory.
2097          */
2098         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2099                         (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2100                         order++)
2101                 ;
2102         if (order >= 4) {
2103                 sysctl_local_port_range[0] = 32768;
2104                 sysctl_local_port_range[1] = 61000;
2105                 tcp_death_row.sysctl_max_tw_buckets = 180000;
2106                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2107                 sysctl_max_syn_backlog = 1024;
2108         } else if (order < 3) {
2109                 sysctl_local_port_range[0] = 1024 * (3 - order);
2110                 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2111                 sysctl_tcp_max_orphans >>= (3 - order);
2112                 sysctl_max_syn_backlog = 128;
2113         }
2114
2115         sysctl_tcp_mem[0] =  768 << order;
2116         sysctl_tcp_mem[1] = 1024 << order;
2117         sysctl_tcp_mem[2] = 1536 << order;
2118
2119         if (order < 3) {
2120                 sysctl_tcp_wmem[2] = 64 * 1024;
2121                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2122                 sysctl_tcp_rmem[1] = 43689;
2123                 sysctl_tcp_rmem[2] = 2 * 43689;
2124         }
2125
2126         printk(KERN_INFO "TCP: Hash tables configured "
2127                "(established %d bind %d)\n",
2128                tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
2129
2130         tcp_register_congestion_control(&tcp_reno);
2131 }
2132
/* TCP symbols exported with EXPORT_SYMBOL, kept in alphabetical order. */
2133 EXPORT_SYMBOL(tcp_close);
2134 EXPORT_SYMBOL(tcp_disconnect);
2135 EXPORT_SYMBOL(tcp_getsockopt);
2136 EXPORT_SYMBOL(tcp_ioctl);
2137 EXPORT_SYMBOL(tcp_poll);
2138 EXPORT_SYMBOL(tcp_read_sock);
2139 EXPORT_SYMBOL(tcp_recvmsg);
2140 EXPORT_SYMBOL(tcp_sendmsg);
2141 EXPORT_SYMBOL(tcp_sendpage);
2142 EXPORT_SYMBOL(tcp_setsockopt);
2143 EXPORT_SYMBOL(tcp_shutdown);
2144 EXPORT_SYMBOL(tcp_statistics);