#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
-/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
- *
- * If we receive a SYN packet with these bits set, it means a network is
- * playing bad games with TOS bits. In order to avoid possible false congestion
- * notifications, we disable TCP ECN negociation.
- */
-static inline void
-TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
- struct net *net)
-{
- const struct tcphdr *th = tcp_hdr(skb);
-
- if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
- INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
- inet_rsk(req)->ecn_ok = 1;
-}
-
/* Due to TSO, an SKB can be composed of multiple actual
* packets. To keep these tracked properly, we use this.
*/
#define TCP_CA_MAX 128
#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
+/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
#define TCP_CONG_NON_RESTRICTED 0x1
+/* Requires ECN/ECT set on all packets */
+#define TCP_CONG_NEEDS_ECN 0x2
struct tcp_congestion_ops {
struct list_head list;
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;
+static inline bool tcp_ca_needs_ecn(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
+}
+
static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
{
struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_ca_ops->cwnd_event(sk, event);
}
+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negociation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control; it requires setting ECT on all packets,
+ * including SYN. We inverse the test in this case: If our
+ * local socket wants ECN, but peer only set ece/cwr (but not
+ * ECT in IP header) its probably a non-DCTCP aware sender.
+ */
+static inline void
+TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
+ const struct sock *listen_sk)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct net *net = sock_net(listen_sk);
+ bool th_ecn = th->ece && th->cwr;
+ bool ect, need_ecn;
+
+ if (!th_ecn)
+ return;
+
+ ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+ need_ecn = tcp_ca_needs_ecn(listen_sk);
+
+ if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
+ inet_rsk(req)->ecn_ok = 1;
+ else if (ect && need_ecn)
+ inet_rsk(req)->ecn_ok = 1;
+}
+
/* These functions determine how the current flow behaves in respect of SACK
* handling. SACK is negotiated with the peer, and therefore it can vary
* between different flows.
}
/* Packet ECN state for a SYN-ACK */
-static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
+static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
if (!(tp->ecn_flags & TCP_ECN_OK))
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
+ else if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
}
/* Packet ECN state for a SYN. */
struct tcp_sock *tp = tcp_sk(sk);
tp->ecn_flags = 0;
- if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
+ if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+ tcp_ca_needs_ecn(sk)) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
}
}
static __inline__ void
-TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
+TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th,
+ struct sock *sk)
{
- if (inet_rsk(req)->ecn_ok)
+ if (inet_rsk(req)->ecn_ok) {
th->ece = 1;
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
+ }
}
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
tcp_hdr(skb)->cwr = 1;
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
}
- } else {
+ } else if (!tcp_ca_needs_ecn(sk)) {
/* ACK or retransmitted segment: clear ECT|CE */
INET_ECN_dontxmit(sk);
}
}
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
- TCP_ECN_send_synack(tcp_sk(sk), skb);
+ TCP_ECN_send_synack(sk, skb);
}
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- TCP_ECN_make_synack(req, th);
+ TCP_ECN_make_synack(req, th, sk);
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
/* Setting of flags are superfluous here for callers (and ECE is