[TCP]: MTU probing

author John Heffner <jheffner@psc.edu>

Tue, 21 Mar 2006 01:53:41 +0000 (17:53 -0800)

committer David S. Miller <davem@davemloft.net>

Tue, 21 Mar 2006 01:53:41 +0000 (17:53 -0800)
author John Heffner <jheffner@psc.edu>
Tue, 21 Mar 2006 01:53:41 +0000 (17:53 -0800)
committer David S. Miller <davem@davemloft.net>
Tue, 21 Mar 2006 01:53:41 +0000 (17:53 -0800)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h

index 8ad4beab288806e7bf69a919dd15568774082719..6e8880ea49e71a595e6496369c4854820ca82fe2 100644 (file)
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -397,6 +397,8 @@ enum
         NET_TCP_CONG_CONTROL=110,
         NET_TCP_ABC=111,
         NET_IPV4_IPFRAG_MAX_DIST=112,
+       NET_TCP_MTU_PROBING=113,
+       NET_TCP_BASE_MSS=114,
  };
  
  enum {
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h

index fa587c94e9d0483fc774356b933235cddafcb44c..b3abe33f4e5f122ec7e68bc43b94fec50ad4f041 100644 (file)
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -72,6 +72,7 @@ struct inet_connection_sock_af_ops {
   * @icsk_probes_out:      unanswered 0 window probes
   * @icsk_ext_hdr_len:     Network protocol overhead (IP/IPv6 options)
   * @icsk_ack:             Delayed ACK control data
+ * @icsk_mtup:            MTU probing control data
   */
  struct inet_connection_sock {
         /* inet_sock has to be the first member! */
@@ -104,6 +105,18 @@ struct inet_connection_sock {
                 __u16             last_seg_size; /* Size of last incoming segment          */
                 __u16             rcv_mss;       /* MSS used for delayed ACK decisions     */ 
         } icsk_ack;
+       struct {
+               int               enabled;
+
+               /* Range of MTUs to search */
+               int               search_high;
+               int               search_low;
+
+               /* Information on the current probe. */
+               int               probe_size;
+               __u32             probe_seq_start;
+               __u32             probe_seq_end;
+       } icsk_mtup;
         u32                       icsk_ca_priv[16];
  #define ICSK_CA_PRIV_SIZE      (16 * sizeof(u32))
  };
diff --git a/include/net/tcp.h b/include/net/tcp.h

index 77f21c65bbca2e1f2a460d167bc390d76eb6eb50..16879fa560de05fb5ce479a7440c8abb98883fc0 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -60,6 +60,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
  /* Minimal RCV_MSS. */
  #define TCP_MIN_RCVMSS         536U
  
+/* The least MTU to use for probing */
+#define TCP_BASE_MSS           512
+
  /* After receiving this amount of duplicate ACKs fast retransmit starts. */
  #define TCP_FASTRETRANS_THRESH 3
  
@@ -219,6 +222,8 @@ extern int sysctl_tcp_nometrics_save;
  extern int sysctl_tcp_moderate_rcvbuf;
  extern int sysctl_tcp_tso_win_divisor;
  extern int sysctl_tcp_abc;
+extern int sysctl_tcp_mtu_probing;
+extern int sysctl_tcp_base_mss;
  
  extern atomic_t tcp_memory_allocated;
  extern atomic_t tcp_sockets_allocated;
@@ -447,6 +452,10 @@ extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
  
  extern void tcp_initialize_rcv_mss(struct sock *sk);
  
+extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
+extern int tcp_mss_to_mtu(struct sock *sk, int mss);
+extern void tcp_mtup_init(struct sock *sk);
+
  static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
  {
         tp->pred_flags = htonl((tp->tcp_header_len << 26) |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c

index 16984d4a8a065b26a48d51a5bd08c80781d2f21a..ebf2e0b363c4c0a5bd16507549cf1078aaae5464 100644 (file)
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -664,6 +664,22 @@ ctl_table ipv4_table[] = {
                 .mode           = 0644,
                 .proc_handler   = &proc_dointvec,
         },
+       {
+               .ctl_name       = NET_TCP_MTU_PROBING,
+               .procname       = "tcp_mtu_probing",
+               .data           = &sysctl_tcp_mtu_probing,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       {
+               .ctl_name       = NET_TCP_BASE_MSS,
+               .procname       = "tcp_base_mss",
+               .data           = &sysctl_tcp_base_mss,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
  
         { .ctl_name = 0 }
  };
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c

index e9a54ae7d6903845598db14a8e1cba54026faf1b..0ac388e3d01dfc451f4141dfaed4da777c634819 100644 (file)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1891,6 +1891,34 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
         }
  }
  
+static void tcp_mtup_probe_failed(struct sock *sk)
+{
+       struct inet_connection_sock *icsk = inet_csk(sk);
+
+       icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; /* search only below the failed size */
+       icsk->icsk_mtup.probe_size = 0; /* no probe outstanding */
+}
+
+static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+
+       /* Rescale cwnd (packets) by old MTU / probe MTU.  FIXME: breaks with very large cwnd */
+       tp->prior_ssthresh = tcp_current_ssthresh(sk);
+       tp->snd_cwnd = tp->snd_cwnd *
+                      tcp_mss_to_mtu(sk, tp->mss_cache) /
+                      icsk->icsk_mtup.probe_size;
+       tp->snd_cwnd_cnt = 0;
+       tp->snd_cwnd_stamp = tcp_time_stamp;
+       tp->snd_ssthresh = tcp_current_ssthresh(sk); /* sender-side threshold, not rcv_ssthresh */
+
+       icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; /* probe size is the new floor */
+       icsk->icsk_mtup.probe_size = 0; /* probe finished */
+       tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); /* recompute mss_cache from the raised floor */
+}
+
+
  /* Process an event, which can update packets-in-flight not trivially.
   * Main goal of this function is to calculate new estimate for left_out,
   * taking into account both packets sitting in receiver's buffer and
@@ -2023,6 +2051,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
                         return;
                 }
  
+               /* MTU probe failure: don't reduce cwnd */
+               if (icsk->icsk_ca_state < TCP_CA_CWR &&
+                   icsk->icsk_mtup.probe_size &&
+                   tp->snd_una == icsk->icsk_mtup.probe_seq_start) {
+                       tcp_mtup_probe_failed(sk);
+                       /* Restores the reduction we did in tcp_mtup_probe() */
+                       tp->snd_cwnd++;
+                       tcp_simple_retransmit(sk);
+                       return;
+               }
+
                 /* Otherwise enter Recovery state */
  
                 if (IsReno(tp))
@@ -2243,6 +2282,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
                         tp->retrans_stamp = 0;
                 }
  
+               /* MTU probing checks */
+               if (icsk->icsk_mtup.probe_size) {
+                       if (!after(icsk->icsk_mtup.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
+                               tcp_mtup_probe_success(sk, skb);
+                       }
+               }
+
                 if (sacked) {
                         if (sacked & TCPCB_RETRANS) {
                                 if(sacked & TCPCB_SACKED_RETRANS)
@@ -4101,6 +4147,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                 if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
                         tp->rx_opt.sack_ok |= 2;
  
+               tcp_mtup_init(sk);
                 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
                 tcp_initialize_rcv_mss(sk);
  
@@ -4211,6 +4258,7 @@ discard:
                 if (tp->ecn_flags&TCP_ECN_OK)
                         sock_set_flag(sk, SOCK_NO_LARGESEND);
  
+               tcp_mtup_init(sk);
                 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
                 tcp_initialize_rcv_mss(sk);
  
@@ -4399,6 +4447,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                                  */
                                 tp->lsndtime = tcp_time_stamp;
  
+                               tcp_mtup_init(sk);
                                 tcp_initialize_rcv_mss(sk);
                                 tcp_init_buffer_space(sk);
                                 tcp_fast_path_on(tp);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c

index 233bdf2599658513c8ede17ebcc77c38bc2cac4a..57e7a26e82137bcf668ff06dc7b6d758497928d4 100644 (file)
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -900,6 +900,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
         newinet->id = newtp->write_seq ^ jiffies;
  
+       tcp_mtup_init(newsk);
         tcp_sync_mss(newsk, dst_mtu(dst));
         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
         tcp_initialize_rcv_mss(newsk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index 9f498a6c8895b63412c60e302fc9ea3245695d60..8197b5e12f1fdc37a1ba9a871012deca60d9f939 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -51,6 +51,12 @@ int sysctl_tcp_retrans_collapse = 1;
   */
  int sysctl_tcp_tso_win_divisor = 3;
  
+int sysctl_tcp_mtu_probing = 0;
+int sysctl_tcp_base_mss = 512;
+
+EXPORT_SYMBOL(sysctl_tcp_mtu_probing);
+EXPORT_SYMBOL(sysctl_tcp_base_mss);
+
  static void update_send_head(struct sock *sk, struct tcp_sock *tp,
                              struct sk_buff *skb)
  {
@@ -681,6 +687,62 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
         return 0;
  }
  
+/* Convert a path MTU to the MSS it allows.  Not accounting for SACKs here. */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       int mss_now;
+
+       /* Calculate base mss without TCP options:
+          It is MMS_S - sizeof(tcphdr) of rfc1122
+        */
+       mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+
+       /* Clamp it (mss_clamp does not include tcp options) */
+       if (mss_now > tp->rx_opt.mss_clamp)
+               mss_now = tp->rx_opt.mss_clamp;
+
+       /* Now subtract optional transport overhead */
+       mss_now -= icsk->icsk_ext_hdr_len;
+
+       /* Then reserve room for full set of TCP options and 8 bytes of data */
+       if (mss_now < 48)
+               mss_now = 48;
+
+       /* Now subtract TCP options size, not including SACKs */
+       mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+       return mss_now;
+}
+
+/* Inverse of tcp_mtu_to_mss(), without its clamping: MSS plus TCP, extension and network headers. */
+int tcp_mss_to_mtu(struct sock *sk, int mss)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       int mtu;
+
+       mtu = mss +
+             tp->tcp_header_len +
+             icsk->icsk_ext_hdr_len +
+             icsk->icsk_af_ops->net_header_len;
+
+       return mtu;
+}
+
+void tcp_mtup_init(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+
+       icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; /* 1 only enables it later, from tcp_write_timeout() */
+       icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
+                              icsk->icsk_af_ops->net_header_len; /* MTU implied by mss_clamp */
+       icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); /* MTU implied by tcp_base_mss */
+       icsk->icsk_mtup.probe_size = 0; /* no probe outstanding */
+}
+
  /* This function synchronize snd mss to current pmtu/exthdr set.
  
     tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
@@ -708,25 +770,12 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
  {
         struct tcp_sock *tp = tcp_sk(sk);
         struct inet_connection_sock *icsk = inet_csk(sk);
-       /* Calculate base mss without TCP options:
-          It is MMS_S - sizeof(tcphdr) of rfc1122
-        */
-       int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
-                      sizeof(struct tcphdr));
+       int mss_now;
  
-       /* Clamp it (mss_clamp does not include tcp options) */
-       if (mss_now > tp->rx_opt.mss_clamp)
-               mss_now = tp->rx_opt.mss_clamp;
+       if (icsk->icsk_mtup.search_high > pmtu)
+               icsk->icsk_mtup.search_high = pmtu;
  
-       /* Now subtract optional transport overhead */
-       mss_now -= icsk->icsk_ext_hdr_len;
-
-       /* Then reserve room for full set of TCP options and 8 bytes of data */
-       if (mss_now < 48)
-               mss_now = 48;
-
-       /* Now subtract TCP options size, not including SACKs */
-       mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+       mss_now = tcp_mtu_to_mss(sk, pmtu);
  
         /* Bound mss with half of window */
         if (tp->max_window && mss_now > (tp->max_window>>1))
@@ -734,6 +783,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
  
         /* And store cached results */
         icsk->icsk_pmtu_cookie = pmtu;
+       if (icsk->icsk_mtup.enabled)
+               mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
         tp->mss_cache = mss_now;
  
         return mss_now;
@@ -1063,6 +1114,140 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
         return 1;
  }
  
+/* Create a new MTU probe if we are ready.
+ * Returns 0 if we should wait to probe (no cwnd or receive window available),
+ *         1 if a probe was sent,
+ *         -1 otherwise */
+static int tcp_mtu_probe(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct sk_buff *skb, *nskb, *next;
+       int len;
+       int probe_size;
+       unsigned int pif;
+       int copy;
+       int mss_now;
+
+       /* Not currently probing/verifying,
+        * not in recovery,
+        * have enough cwnd, and
+        * not SACKing (the variable headers throw things off) */
+       if (!icsk->icsk_mtup.enabled ||
+           icsk->icsk_mtup.probe_size ||
+           inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+           tp->snd_cwnd < 11 ||
+           tp->rx_opt.eff_sacks)
+               return -1;
+
+       /* Very simple search strategy: just double the MSS. */
+       mss_now = tcp_current_mss(sk, 0);
+       probe_size = 2*tp->mss_cache;
+       if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
+               /* TODO: set timer for probe_converge_event */
+               return -1;
+       }
+
+       /* Have enough data in the send queue to probe? */
+       len = 0;
+       if ((skb = sk->sk_send_head) == NULL)
+               return -1;
+       while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
+               skb = skb->next;
+       if (len < probe_size)
+               return -1;
+
+       /* Receive window check.  The probe starts at the send head, not at the last skb walked above. */
+       if (after(TCP_SKB_CB(sk->sk_send_head)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
+               if (tp->snd_wnd < probe_size)
+                       return -1;
+               else
+                       return 0;
+       }
+
+       /* Do we need to wait to drain cwnd? */
+       pif = tcp_packets_in_flight(tp);
+       if (pif + 2 > tp->snd_cwnd) {
+               /* With no packets in flight, don't stall. */
+               if (pif == 0)
+                       return -1;
+               else
+                       return 0;
+       }
+
+       /* We're allowed to probe.  Build it now. */
+       if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+               return -1;
+       sk_charge_skb(sk, nskb);
+
+       skb = sk->sk_send_head;
+       __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
+       sk->sk_send_head = nskb;
+
+       TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+       TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+       TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
+       TCP_SKB_CB(nskb)->sacked = 0;
+       nskb->csum = 0;
+       if (skb->ip_summed == CHECKSUM_HW)
+               nskb->ip_summed = CHECKSUM_HW;
+
+       len = 0;
+       while (len < probe_size) {
+               next = skb->next;
+
+               copy = min_t(int, skb->len, probe_size - len);
+               if (nskb->ip_summed)
+                       skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
+               else
+                       nskb->csum = skb_copy_and_csum_bits(skb, 0,
+                                        skb_put(nskb, copy), copy, nskb->csum);
+
+               if (skb->len <= copy) {
+                       /* We've eaten all the data from this skb.
+                        * Throw it away. */
+                       TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
+                       __skb_unlink(skb, &sk->sk_write_queue);
+                       sk_stream_free_skb(sk, skb);
+               } else {
+                       TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
+                                                  ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+                       if (!skb_shinfo(skb)->nr_frags) {
+                               skb_pull(skb, copy);
+                               if (skb->ip_summed != CHECKSUM_HW)
+                                       skb->csum = csum_partial(skb->data, skb->len, 0);
+                       } else {
+                               __pskb_trim_head(skb, copy);
+                               tcp_set_skb_tso_segs(sk, skb, mss_now);
+                       }
+                       TCP_SKB_CB(skb)->seq += copy;
+               }
+
+               len += copy;
+               skb = next;
+       }
+       tcp_init_tso_segs(sk, nskb, nskb->len);
+
+       /* We're ready to send.  If this fails, the probe will
+        * be resegmented into mss-sized pieces by tcp_write_xmit(). */
+       TCP_SKB_CB(nskb)->when = tcp_time_stamp;
+       if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
+               /* Decrement cwnd here because we are sending
+               * effectively two packets. */
+               tp->snd_cwnd--;
+               update_send_head(sk, tp, nskb);
+
+               icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
+               icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+               icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+
+               return 1;
+       }
+
+       return -1;
+}
+
+
  /* This routine writes packets to the network.  It advances the
   * send_head.  This happens as incoming acks open up the remote
   * window for us.
@@ -1076,6 +1261,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
         struct sk_buff *skb;
         unsigned int tso_segs, sent_pkts;
         int cwnd_quota;
+       int result;
  
         /* If we are closed, the bytes will have to remain here.
          * In time closedown will finish, we empty the write queue and all
@@ -1085,6 +1271,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
                 return 0;
  
         sent_pkts = 0;
+
+       /* Do MTU probing. */
+       if ((result = tcp_mtu_probe(sk)) == 0) {
+               return 0;
+       } else if (result > 0) {
+               sent_pkts = 1;
+       }
+
         while ((skb = sk->sk_send_head)) {
                 unsigned int limit;
  
@@ -1455,9 +1649,15 @@ void tcp_simple_retransmit(struct sock *sk)
  int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
  {
         struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
         unsigned int cur_mss = tcp_current_mss(sk, 0);
         int err;
  
+       /* Inconclusive MTU probe */
+       if (icsk->icsk_mtup.probe_size) {
+               icsk->icsk_mtup.probe_size = 0;
+       }
+
         /* Do not sent more than we queued. 1/4 is reserved for possible
          * copying overhead: fragmentation, tunneling, mangling etc.
          */
@@ -1883,6 +2083,7 @@ static void tcp_connect_init(struct sock *sk)
         if (tp->rx_opt.user_mss)
                 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
         tp->max_window = 0;
+       tcp_mtup_init(sk);
         tcp_sync_mss(sk, dst_mtu(dst));
  
         if (!tp->window_clamp)
@@ -2180,3 +2381,4 @@ EXPORT_SYMBOL(tcp_make_synack);
  EXPORT_SYMBOL(tcp_simple_retransmit);
  EXPORT_SYMBOL(tcp_sync_mss);
  EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
+EXPORT_SYMBOL(tcp_mtup_init);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c

index e1880959614a49d8855daecf1285e9fdaf48af05..7c1bde3cd6cb725b06fbea58a35743a62faa0552 100644 (file)
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -119,8 +119,10 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
  /* A write timeout has occurred. Process the after effects. */
  static int tcp_write_timeout(struct sock *sk)
  {
-       const struct inet_connection_sock *icsk = inet_csk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
         int retry_until;
+       int mss;
  
         if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                 if (icsk->icsk_retransmits)
@@ -128,25 +130,19 @@ static int tcp_write_timeout(struct sock *sk)
                 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
         } else {
                 if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
-                       /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
-                          hole detection. :-(
-
-                          It is place to make it. It is not made. I do not want
-                          to make it. It is disgusting. It does not work in any
-                          case. Let me to cite the same draft, which requires for
-                          us to implement this:
-
-   "The one security concern raised by this memo is that ICMP black holes
-   are often caused by over-zealous security administrators who block
-   all ICMP messages.  It is vitally important that those who design and
-   deploy security systems understand the impact of strict filtering on
-   upper-layer protocols.  The safest web site in the world is worthless
-   if most TCP implementations cannot transfer data from it.  It would
-   be far nicer to have all of the black holes fixed rather than fixing
-   all of the TCP implementations."
-
-                           Golden words :-).
-                  */
+                       /* Black hole detection */
+                       if (sysctl_tcp_mtu_probing) {
+                               if (!icsk->icsk_mtup.enabled) {
+                                       icsk->icsk_mtup.enabled = 1;
+                                       tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+                               } else {
+                                       mss = min(sysctl_tcp_base_mss,
+                                                 tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2);
+                                       mss = max(mss, 68 - tp->tcp_header_len);
+                                       icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+                                       tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+                               }
+                       }
  
                         dst_negative_advice(&sk->sk_dst_cache);
                 }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c

index ca9cf6853755fc3081693334c6f8f3e70265dcac..14de50380f4eacdd8db3697d0f0295a8a0bcc314 100644 (file)
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -987,6 +987,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
                 inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
                                                      newnp->opt->opt_flen);
  
+       tcp_mtup_init(newsk);
         tcp_sync_mss(newsk, dst_mtu(dst));
         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
         tcp_initialize_rcv_mss(newsk);
author	John Heffner <jheffner@psc.edu>
	Tue, 21 Mar 2006 01:53:41 +0000 (17:53 -0800)
committer	David S. Miller <davem@davemloft.net>
	Tue, 21 Mar 2006 01:53:41 +0000 (17:53 -0800)
include/linux/sysctl.h		patch \| blob \| blame \| history
include/net/inet_connection_sock.h		patch \| blob \| blame \| history
include/net/tcp.h		patch \| blob \| blame \| history
net/ipv4/sysctl_net_ipv4.c		patch \| blob \| blame \| history
net/ipv4/tcp_input.c		patch \| blob \| blame \| history
net/ipv4/tcp_ipv4.c		patch \| blob \| blame \| history
net/ipv4/tcp_output.c		patch \| blob \| blame \| history
net/ipv4/tcp_timer.c		patch \| blob \| blame \| history
net/ipv6/tcp_ipv6.c		patch \| blob \| blame \| history