extern int sysctl_tcp_thin_linear_timeouts;
extern int sysctl_tcp_thin_dupack;
extern int sysctl_tcp_early_retrans;
+extern int sysctl_tcp_recovery;
+#define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */
+
extern int sysctl_tcp_limit_output_bytes;
extern int sysctl_tcp_challenge_ack_limit;
extern int sysctl_tcp_min_tso_segs;
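Note on the new knob: sysctl_tcp_recovery is a bitmap, with bit 0 (TCP_RACK_LOSS_DETECTION) turning RACK loss detection on. A minimal userspace-style sketch of how such a bitmap flag is typically consumed follows; the variable and helper below are illustrative only, not kernel code.

/* Illustrative sketch (hypothetical names): consuming a bitmap sysctl such
 * as net.ipv4.tcp_recovery by testing individual feature bits.
 */
#include <stdbool.h>

#define TCP_RACK_LOSS_DETECTION 0x1     /* bit 0: use RACK to detect losses */

static int sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;  /* default: on */

static bool rack_loss_detection_enabled(void)
{
        /* Test the single feature bit, leaving room for future flags. */
        return sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
}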
tp->do_early_retrans = sysctl_tcp_early_retrans &&
sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
+ !(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) &&
net->ipv4.sysctl_tcp_reordering == 3;
}
void tcp_init(void);
/* tcp_recovery.c */
-
-/* Flags to enable various loss recovery features. See below */
-extern int sysctl_tcp_recovery;
-
-/* Use TCP RACK to detect (some) tail and retransmit losses */
-#define TCP_RACK_LOST_RETRANS 0x1
-
extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
const struct skb_mstamp *xmit_time,
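The two exported helpers above form the RACK API: tcp_rack_advance() is fed every newly (s)acked packet, and tcp_rack_mark_lost() later walks the queue against the recorded state. A sketch of the "advance" side is below; the struct and function names are hypothetical stand-ins, not the kernel's, and real code uses wraparound-safe sequence comparison.

/* Sketch (hypothetical names): the core of "RACK advance".  Whenever a
 * packet is newly (S)ACKed, remember the transmit time and end sequence of
 * the most recently *sent* delivered packet; loss detection later compares
 * the remaining packets against this record.
 */
#include <stdbool.h>
#include <stdint.h>

struct rack_state {
        uint64_t mstamp_us;     /* xmit time of the latest-sent delivered pkt */
        uint32_t end_seq;       /* its end sequence, breaks exact-time ties   */
        bool     advanced;      /* set when the record moved; consumed by the *
                                 * loss-detection pass to avoid rescanning    */
};

static void rack_advance(struct rack_state *rack,
                         uint64_t xmit_time_us, uint32_t end_seq)
{
        /* Real code must compare sequences in a wrap-safe way. */
        if (xmit_time_us > rack->mstamp_us ||
            (xmit_time_us == rack->mstamp_us && end_seq > rack->end_seq)) {
                rack->mstamp_us = xmit_time_us;
                rack->end_seq = end_seq;
                rack->advanced = true;
        }
}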
* F.e. after RTO, when all the queue is considered as lost,
* lost_out = packets_out and in_flight = retrans_out.
*
- * Essentially, we have now two algorithms counting
+ * Essentially, we now have a few algorithms detecting
* lost packets.
*
- * FACK: It is the simplest heuristics. As soon as we decided
+ * If the receiver supports SACK:
+ *
+ * RFC6675/3517: It is the conventional algorithm. A packet is
+ * considered lost if the number of higher sequence packets
+ * SACKed is greater than or equal to the DUPACK threshold
+ * (reordering). This is implemented in tcp_mark_head_lost and
+ * tcp_update_scoreboard.
+ *
+ * RACK (draft-ietf-tcpm-rack-01): It is a newer algorithm
+ * (2017-) that checks timing instead of counting DUPACKs.
+ * Essentially a packet is considered lost if it's not S/ACKed
+ * after RTT + reordering_window, where both metrics are
+ * dynamically measured and adjusted. This is implemented in
+ * tcp_rack_mark_lost.
+ *
+ * FACK: It is the simplest heuristic. As soon as we decided
* that something is lost, we decide that _all_ not SACKed
* packets until the most forward SACK are lost. I.e.
* lost_out = fackets_out - sacked_out and left_out = fackets_out.
* takes place. We use FACK by default until reordering
* is suspected on the path to this destination.
*
- * NewReno: when Recovery is entered, we assume that one segment
+ * If the receiver does not support SACK:
+ *
+ * NewReno (RFC6582): in Recovery we assume that one segment
* is lost (classic Reno). While we are in Recovery and
* a partial ACK arrives, we assume that one more packet
 * is lost (NewReno). These heuristics are the same in NewReno
* and SACK.
*
- * Imagine, that's all! Forget about all this shamanism about CWND inflation
- * deflation etc. CWND is real congestion window, never inflated, changes
- * only according to classic VJ rules.
- *
* Really tricky (and requiring careful tuning) part of algorithm
* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and,
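To make the contrast drawn in the comment above concrete, here is a small sketch (not the kernel code) of the two SACK-era loss rules over a simplified per-packet record. All names, structures, and parameters below are invented for illustration.

/* Illustrative only: RFC 6675-style dupthresh counting vs. RACK-style
 * time-based detection, over a hypothetical per-packet record.
 */
#include <stdbool.h>
#include <stdint.h>

struct pkt {
        uint32_t end_seq;
        uint64_t xmit_time_us;
        bool     sacked;
};

/* RFC 6675/3517 style: lost if at least "dupthresh" (i.e. reordering)
 * higher-sequence packets have already been SACKed.
 */
static bool lost_by_dupthresh(const struct pkt *p,
                              unsigned int higher_sacked, unsigned int dupthresh)
{
        return !p->sacked && higher_sacked >= dupthresh;
}

/* RACK style: lost if a packet sent *later* has already been delivered and
 * this one is still unacked after RTT + reordering window since it was sent.
 */
static bool lost_by_rack(const struct pkt *p, uint64_t latest_delivered_xmit_us,
                         uint64_t now_us, uint64_t rtt_us, uint64_t reo_wnd_us)
{
        return !p->sacked &&
               latest_delivered_xmit_us > p->xmit_time_us &&
               now_us - p->xmit_time_us > rtt_us + reo_wnd_us;
}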
struct tcp_sock *tp = tcp_sk(sk);
/* Use RACK to detect loss */
- if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
+ if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
u32 prior_retrans = tp->retrans_out;
tcp_rack_mark_lost(sk, ack_time);
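The hunk is cut off here, but the prior_retrans snapshot presumably lets the caller tell whether tcp_rack_mark_lost() newly declared any retransmissions lost, by comparing retrans_out before and after the call. A generic sketch of that before/after pattern follows; the types and names are hypothetical and the actual follow-up in the kernel code is not shown above.

/* Sketch (hypothetical names): snapshot a counter around a call that may
 * re-mark packets, then react only if the counter changed.
 */
#include <stdbool.h>
#include <stdint.h>

struct conn {
        uint32_t retrans_out;   /* retransmitted-and-still-in-flight count */
};

static void rack_mark_lost(struct conn *c)
{
        (void)c;    /* placeholder: may decrement retrans_out for lost rexmits */
}

static bool detect_lost_retransmissions(struct conn *c)
{
        uint32_t prior_retrans = c->retrans_out;

        rack_mark_lost(c);
        /* If the counter dropped, at least one retransmission was declared
         * lost and will need to be sent again.
         */
        return c->retrans_out < prior_retrans;
}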
#include <linux/tcp.h>
#include <net/tcp.h>
-int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
(t1->v64 == t2->v64 && after(seq1, seq2));
}
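The expression above is the tail of a "sent after" predicate: transmit timestamps are compared first, and equal timestamps fall back to sequence order. A standalone sketch of such a predicate is below, with hypothetical names; the kernel's after() macro is the wrap-safe sequence comparison the real code relies on.

/* Sketch of a "sent after" predicate: order by transmit timestamp and use
 * the end sequence to break exact-timestamp ties.
 */
#include <stdbool.h>
#include <stdint.h>

/* Wrap-safe "seq1 comes after seq2" on the 32-bit sequence space. */
static bool seq_after(uint32_t seq1, uint32_t seq2)
{
        return (int32_t)(seq1 - seq2) > 0;
}

static bool sent_after(uint64_t t1_us, uint64_t t2_us,
                       uint32_t seq1, uint32_t seq2)
{
        return t1_us > t2_us || (t1_us == t2_us && seq_after(seq1, seq2));
}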
-/* Marks a packet lost, if some packet sent later has been (s)acked.
+/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
+ *
+ * Marks a packet lost, if some packet sent later has been (s)acked.
* The underlying idea is similar to the traditional dupthresh and FACK
* but they look at different metrics:
*
* is being more resilient to reordering by simply allowing some
* "settling delay", instead of tweaking the dupthresh.
*
- * The current version is only used after recovery starts but can be
- * easily extended to detect the first loss.
+ * When tcp_rack_detect_loss() detects some packets are lost and we
+ * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
+ * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
+ * make us enter the CA_Recovery state.
*/
static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
u32 *reo_timeout)
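The comment above describes the per-packet decision that tcp_rack_detect_loss() makes: packets sent before the most recently delivered one are marked lost once RTT + reordering window has elapsed since their own transmission; otherwise the earliest remaining deadline is reported so a timer can re-check. A simplified sketch of that walk follows; the structures and names are hypothetical, not the kernel function.

/* Sketch (hypothetical names) of the per-packet RACK decision. */
#include <stdbool.h>
#include <stdint.h>

struct sent_pkt {
        uint64_t xmit_time_us;
        bool     delivered;     /* (S)ACKed */
        bool     lost;
};

static uint64_t rack_detect_loss(struct sent_pkt *pkts, int n,
                                 uint64_t latest_delivered_xmit_us,
                                 uint64_t now_us, uint64_t rtt_us,
                                 uint64_t reo_wnd_us)
{
        uint64_t timeout_us = 0;    /* 0: no pending re-check needed */

        for (int i = 0; i < n; i++) {
                struct sent_pkt *p = &pkts[i];
                uint64_t deadline;

                if (p->delivered || p->lost)
                        continue;
                /* Only packets sent before the latest delivered one qualify. */
                if (p->xmit_time_us >= latest_delivered_xmit_us)
                        continue;

                deadline = p->xmit_time_us + rtt_us + reo_wnd_us;
                if (now_us >= deadline)
                        p->lost = true;                 /* settled: mark lost */
                else if (!timeout_us || deadline - now_us < timeout_us)
                        timeout_us = deadline - now_us; /* earliest re-check  */
        }
        return timeout_us;
}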
* to queuing or delayed ACKs.
*/
reo_wnd = 1000;
- if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+ if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
tcp_for_write_queue(skb, sk) {
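The hunk above chooses the reordering window: a 1000-microsecond floor to absorb jitter from queuing and delayed ACKs, widened to a quarter of the minimum RTT once reordering has been observed or (with this patch) while no packet is currently marked lost. A sketch of that computation, with hypothetical names and assuming microsecond units throughout:

/* Sketch of the reordering-window choice (hypothetical names, usec units). */
#include <stdbool.h>
#include <stdint.h>

#define RACK_REO_WND_FLOOR_US 1000u     /* 1 ms floor against ACK jitter */

static uint32_t rack_reo_wnd_us(uint32_t min_rtt_us, bool reord_seen,
                                uint32_t lost_out)
{
        uint32_t reo_wnd = RACK_REO_WND_FLOOR_US;

        /* Widen to min_RTT/4 when reordering was seen, or when nothing is
         * marked lost yet, provided a min RTT sample exists.
         */
        if ((reord_seen || !lost_out) && min_rtt_us != UINT32_MAX) {
                uint32_t quarter_rtt = min_rtt_us >> 2;

                if (quarter_rtt > reo_wnd)
                        reo_wnd = quarter_rtt;
        }
        return reo_wnd;
}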
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout;
- if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+ if (!tp->rack.advanced)
return;
/* Reset the advanced flag to avoid unnecessary queue scanning */
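The entry check plus the "reset the advanced flag" comment describe a simple dirty-flag pattern: the per-ACK hook only walks the write queue when the RACK record actually moved since the last scan, and it clears the flag before scanning. A tiny sketch of that consume-the-flag pattern, with hypothetical names:

/* Sketch (hypothetical names) of the dirty-flag gate used above. */
#include <stdbool.h>

struct rack {
        bool advanced;          /* set by the advance step when the record moved */
};

static void rack_detect_loss(struct rack *r)
{
        (void)r;                /* placeholder for the queue walk */
}

static void rack_mark_lost(struct rack *r)
{
        if (!r->advanced)
                return;         /* nothing newly delivered: skip the queue walk */

        /* Consume the flag before scanning so the next ACK that does not
         * advance the record will not rescan the queue.
         */
        r->advanced = false;
        rack_detect_loss(r);
}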