tcp: Revert "tcp: remove header prediction"
authorFlorian Westphal <fw@strlen.de>
Wed, 30 Aug 2017 17:24:58 +0000 (19:24 +0200)
committerDavid S. Miller <davem@davemloft.net>
Wed, 30 Aug 2017 18:20:09 +0000 (11:20 -0700)
This reverts commit 45f119bf936b1f9f546a0b139c5b56f9bb2bdc78.

Eric Dumazet says:
  We found at Google a significant regression caused by
  45f119bf936b1f9f546a0b139c5b56f9bb2bdc78 tcp: remove header prediction

  In typical RPC  (TCP_RR), when a TCP socket receives data, we now call
  tcp_ack() while we used to not call it.

  This touches enough cache lines to cause a slowdown.

so problem does not seem to be HP removal itself but the tcp_ack()
call.  Therefore, it might be possible to remove HP after all, provided
one finds a way to elide tcp_ack for most cases.

Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/linux/tcp.h
include/net/tcp.h
include/uapi/linux/snmp.h
net/ipv4/proc.c
net/ipv4/tcp.c
net/ipv4/tcp_input.c
net/ipv4/tcp_minisocks.c
net/ipv4/tcp_output.c

index 267164a1d5591450087c2479be4306d3c0d55f37..4aa40ef02d32cf2719513ca77c38f45a5e646b82 100644 (file)
@@ -147,6 +147,12 @@ struct tcp_sock {
        u16     tcp_header_len; /* Bytes of tcp header to send          */
        u16     gso_segs;       /* Max number of segs per GSO packet    */
 
+/*
+ *     Header prediction flags
+ *     0x5?10 << 16 + snd_wnd in net byte order
+ */
+       __be32  pred_flags;
+
 /*
  *     RFC793 variables by their proper names. This means you can
  *     read the code and the spec side by side (and laugh ...)
index c546d13ffbca857c80443403551fe92b5f1cf23c..9c3db054e47f1a27fe3ff4a4a081674f424a7a93 100644 (file)
@@ -634,6 +634,29 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
        return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
 }
 
+static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
+{
+       tp->pred_flags = htonl((tp->tcp_header_len << 26) |
+                              ntohl(TCP_FLAG_ACK) |
+                              snd_wnd);
+}
+
+static inline void tcp_fast_path_on(struct tcp_sock *tp)
+{
+       __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
+}
+
+static inline void tcp_fast_path_check(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
+           tp->rcv_wnd &&
+           atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
+           !tp->urg_data)
+               tcp_fast_path_on(tp);
+}
+
 /* Compute the actual rto_min value */
 static inline u32 tcp_rto_min(struct sock *sk)
 {
index b3f346fb9fe3a0540f01450f4d1165b8e8880fb0..758f12b585410b2de6936459c8e296902f7cd1a1 100644 (file)
@@ -184,7 +184,9 @@ enum
        LINUX_MIB_DELAYEDACKLOST,               /* DelayedACKLost */
        LINUX_MIB_LISTENOVERFLOWS,              /* ListenOverflows */
        LINUX_MIB_LISTENDROPS,                  /* ListenDrops */
+       LINUX_MIB_TCPHPHITS,                    /* TCPHPHits */
        LINUX_MIB_TCPPUREACKS,                  /* TCPPureAcks */
+       LINUX_MIB_TCPHPACKS,                    /* TCPHPAcks */
        LINUX_MIB_TCPRENORECOVERY,              /* TCPRenoRecovery */
        LINUX_MIB_TCPSACKRECOVERY,              /* TCPSackRecovery */
        LINUX_MIB_TCPSACKRENEGING,              /* TCPSACKReneging */
index b6d3fe03feb39baa52da988a4b923cad0cf64f83..127153f1ed8a7894e0b2d2059af08bdcc6a00236 100644 (file)
@@ -206,7 +206,9 @@ static const struct snmp_mib snmp4_net_list[] = {
        SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
        SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
        SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
+       SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
        SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
+       SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
        SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
        SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
        SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
index 566083ee2654c25410d80ff56ce5adb49bb28ae7..21ca2df274c5130a13d31a391a1408d779af34af 100644 (file)
@@ -1963,8 +1963,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
                tcp_rcv_space_adjust(sk);
 
 skip_copy:
-               if (tp->urg_data && after(tp->copied_seq, tp->urg_seq))
+               if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
                        tp->urg_data = 0;
+                       tcp_fast_path_check(sk);
+               }
                if (used + offset < skb->len)
                        continue;
 
index a0e436366d31707922e21c495bec7d8312d643d2..c5d7656beeee29b3c92e1c8824dbf00d3fa32d28 100644 (file)
@@ -103,6 +103,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define FLAG_DATA_SACKED       0x20 /* New SACK.                               */
 #define FLAG_ECE               0x40 /* ECE in this ACK                         */
 #define FLAG_LOST_RETRANS      0x80 /* This ACK marks some retransmission lost */
+#define FLAG_SLOWPATH          0x100 /* Do not skip RFC checks for window update.*/
 #define FLAG_ORIG_SACK_ACKED   0x200 /* Never retransmitted data are (s)acked  */
 #define FLAG_SND_UNA_ADVANCED  0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
 #define FLAG_DSACKING_ACK      0x800 /* SACK blocks contained D-SACK info */
@@ -3371,6 +3372,12 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
                if (tp->snd_wnd != nwin) {
                        tp->snd_wnd = nwin;
 
+                       /* Note, it is the only place, where
+                        * fast path is recovered for sending TCP.
+                        */
+                       tp->pred_flags = 0;
+                       tcp_fast_path_check(sk);
+
                        if (tcp_send_head(sk))
                                tcp_slow_start_after_idle_check(sk);
 
@@ -3592,7 +3599,19 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
        if (flag & FLAG_UPDATE_TS_RECENT)
                tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
 
-       {
+       if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+               /* Window is constant, pure forward advance.
+                * No more checks are required.
+                * Note, we use the fact that SND.UNA>=SND.WL2.
+                */
+               tcp_update_wl(tp, ack_seq);
+               tcp_snd_una_update(tp, ack);
+               flag |= FLAG_WIN_UPDATE;
+
+               tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
+
+               NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
+       } else {
                u32 ack_ev_flags = CA_ACK_SLOWPATH;
 
                if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -4407,6 +4426,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
        if (TCP_SKB_CB(skb)->has_rxtstamp)
                TCP_SKB_CB(skb)->swtstamp = skb->tstamp;
 
+       /* Disable header prediction. */
+       tp->pred_flags = 0;
        inet_csk_schedule_ack(sk);
 
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
@@ -4647,6 +4668,8 @@ queue_and_out:
                if (tp->rx_opt.num_sacks)
                        tcp_sack_remove(tp);
 
+               tcp_fast_path_check(sk);
+
                if (eaten > 0)
                        kfree_skb_partial(skb, fragstolen);
                if (!sock_flag(sk, SOCK_DEAD))
@@ -4972,6 +4995,7 @@ static int tcp_prune_queue(struct sock *sk)
        NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
 
        /* Massive buffer overcommit. */
+       tp->pred_flags = 0;
        return -1;
 }
 
@@ -5143,6 +5167,9 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
 
        tp->urg_data = TCP_URG_NOTYET;
        tp->urg_seq = ptr;
+
+       /* Disable header prediction. */
+       tp->pred_flags = 0;
 }
 
 /* This is the 'fast' part of urgent handling. */
@@ -5301,6 +5328,26 @@ discard:
 
 /*
  *     TCP receive function for the ESTABLISHED state.
+ *
+ *     It is split into a fast path and a slow path. The fast path is
+ *     disabled when:
+ *     - A zero window was announced from us - zero window probing
+ *        is only handled properly in the slow path.
+ *     - Out of order segments arrived.
+ *     - Urgent data is expected.
+ *     - There is no buffer space left
+ *     - Unexpected TCP flags/window values/header lengths are received
+ *       (detected by checking the TCP header against pred_flags)
+ *     - Data is sent in both directions. Fast path only supports pure senders
+ *       or pure receivers (this means either the sequence number or the ack
+ *       value must stay constant)
+ *     - Unexpected TCP option.
+ *
+ *     When these conditions are not satisfied it drops into a standard
+ *     receive procedure patterned after RFC793 to handle all cases.
+ *     The first three cases are guaranteed by proper pred_flags setting,
+ *     the rest is checked inline. Fast processing is turned on in
+ *     tcp_data_queue when everything is OK.
  */
 void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                         const struct tcphdr *th)
@@ -5311,19 +5358,144 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
        tcp_mstamp_refresh(tp);
        if (unlikely(!sk->sk_rx_dst))
                inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
+       /*
+        *      Header prediction.
+        *      The code loosely follows the one in the famous
+        *      "30 instruction TCP receive" Van Jacobson mail.
+        *
+        *      Van's trick is to deposit buffers into socket queue
+        *      on a device interrupt, to call tcp_recv function
+        *      on the receive process context and checksum and copy
+        *      the buffer to user space. smart...
+        *
+        *      Our current scheme is not silly either but we take the
+        *      extra cost of the net_bh soft interrupt processing...
+        *      We do checksum and copy also but from device to kernel.
+        */
 
        tp->rx_opt.saw_tstamp = 0;
 
+       /*      pred_flags is 0xS?10 << 16 + snd_wnd
+        *      if header_prediction is to be made
+        *      'S' will always be tp->tcp_header_len >> 2
+        *      '?' will be 0 for the fast path, otherwise pred_flags is 0 to
+        *  turn it off (when there are holes in the receive
+        *       space for instance)
+        *      PSH flag is ignored.
+        */
+
+       if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
+           TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
+           !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+               int tcp_header_len = tp->tcp_header_len;
+
+               /* Timestamp header prediction: tcp_header_len
+                * is automatically equal to th->doff*4 due to pred_flags
+                * match.
+                */
+
+               /* Check timestamp */
+               if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
+                       /* No? Slow path! */
+                       if (!tcp_parse_aligned_timestamp(tp, th))
+                               goto slow_path;
+
+                       /* If PAWS failed, check it more carefully in slow path */
+                       if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+                               goto slow_path;
+
+                       /* DO NOT update ts_recent here, if checksum fails
+                        * and timestamp was corrupted part, it will result
+                        * in a hung connection since we will drop all
+                        * future packets due to the PAWS test.
+                        */
+               }
+
+               if (len <= tcp_header_len) {
+                       /* Bulk data transfer: sender */
+                       if (len == tcp_header_len) {
+                               /* Predicted packet is in window by definition.
+                                * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+                                * Hence, check seq<=rcv_wup reduces to:
+                                */
+                               if (tcp_header_len ==
+                                   (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+                                   tp->rcv_nxt == tp->rcv_wup)
+                                       tcp_store_ts_recent(tp);
+
+                               /* We know that such packets are checksummed
+                                * on entry.
+                                */
+                               tcp_ack(sk, skb, 0);
+                               __kfree_skb(skb);
+                               tcp_data_snd_check(sk);
+                               return;
+                       } else { /* Header too small */
+                               TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+                               goto discard;
+                       }
+               } else {
+                       int eaten = 0;
+                       bool fragstolen = false;
+
+                       if (tcp_checksum_complete(skb))
+                               goto csum_error;
+
+                       if ((int)skb->truesize > sk->sk_forward_alloc)
+                               goto step5;
+
+                       /* Predicted packet is in window by definition.
+                        * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+                        * Hence, check seq<=rcv_wup reduces to:
+                        */
+                       if (tcp_header_len ==
+                           (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+                           tp->rcv_nxt == tp->rcv_wup)
+                               tcp_store_ts_recent(tp);
+
+                       tcp_rcv_rtt_measure_ts(sk, skb);
+
+                       NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
+
+                       /* Bulk data transfer: receiver */
+                       eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
+                                             &fragstolen);
+
+                       tcp_event_data_recv(sk, skb);
+
+                       if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
+                               /* Well, only one small jumplet in fast path... */
+                               tcp_ack(sk, skb, FLAG_DATA);
+                               tcp_data_snd_check(sk);
+                               if (!inet_csk_ack_scheduled(sk))
+                                       goto no_ack;
+                       }
+
+                       __tcp_ack_snd_check(sk, 0);
+no_ack:
+                       if (eaten)
+                               kfree_skb_partial(skb, fragstolen);
+                       sk->sk_data_ready(sk);
+                       return;
+               }
+       }
+
+slow_path:
        if (len < (th->doff << 2) || tcp_checksum_complete(skb))
                goto csum_error;
 
        if (!th->ack && !th->rst && !th->syn)
                goto discard;
 
+       /*
+        *      Standard slow path.
+        */
+
        if (!tcp_validate_incoming(sk, skb, th, 1))
                return;
 
-       if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0)
+step5:
+       if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
                goto discard;
 
        tcp_rcv_rtt_measure_ts(sk, skb);
@@ -5376,6 +5548,11 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 
        if (sock_flag(sk, SOCK_KEEPOPEN))
                inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+       if (!tp->rx_opt.snd_wscale)
+               __tcp_fast_path_on(tp, tp->snd_wnd);
+       else
+               tp->pred_flags = 0;
 }
 
 static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
@@ -5504,7 +5681,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                tcp_ecn_rcv_synack(tp, th);
 
                tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
-               tcp_ack(sk, skb, 0);
+               tcp_ack(sk, skb, FLAG_SLOWPATH);
 
                /* Ok.. it's good. Set up sequence numbers and
                 * move to established.
@@ -5740,8 +5917,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                return 0;
 
        /* step 5: check the ACK field */
-
-       acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT |
+       acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+                                     FLAG_UPDATE_TS_RECENT |
                                      FLAG_NO_CHALLENGE_ACK) > 0;
 
        if (!acceptable) {
@@ -5809,6 +5986,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                tp->lsndtime = tcp_jiffies32;
 
                tcp_initialize_rcv_mss(sk);
+               tcp_fast_path_on(tp);
                break;
 
        case TCP_FIN_WAIT1: {
index 1537b87c657fad2e80d86935ae3b730027395a89..188a6f31356db0d0b7825aa476888467f7c0dfd8 100644 (file)
@@ -436,6 +436,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
                struct tcp_sock *newtp = tcp_sk(newsk);
 
                /* Now setup tcp_sock */
+               newtp->pred_flags = 0;
+
                newtp->rcv_wup = newtp->copied_seq =
                newtp->rcv_nxt = treq->rcv_isn + 1;
                newtp->segs_in = 1;
index 3e0d19631534f3eec800330d939448b50e095b1f..5b6690d05abb98884adfa1693f97d896dd202893 100644 (file)
@@ -295,7 +295,9 @@ static u16 tcp_select_window(struct sock *sk)
        /* RFC1323 scaling applied */
        new_win >>= tp->rx_opt.rcv_wscale;
 
+       /* If we advertise zero window, disable fast path. */
        if (new_win == 0) {
+               tp->pred_flags = 0;
                if (old_win)
                        NET_INC_STATS(sock_net(sk),
                                      LINUX_MIB_TCPTOZEROWINDOWADV);