/* Set window scaling on max possible window
* See RFC1323 for an explanation of the limit to 14
*/
- space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
+ space = max_t(u32, space, sysctl_tcp_rmem[2]);
+ space = max_t(u32, space, sysctl_rmem_max);
space = min_t(u32, space, *window_clamp);
while (space > 65535 && (*rcv_wscale) < 14) {
space >>= 1;
}
/* Set initial window to a value enough for senders starting with
- * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place
+ * initial congestion window of sysctl_tcp_default_init_rwnd. Place
* a limit on the initial window when mss is larger than 1460.
*/
if (mss > (1 << *rcv_wscale)) {
- int init_cwnd = TCP_DEFAULT_INIT_RCVWND;
+ int init_cwnd = sysctl_tcp_default_init_rwnd;
if (mss > 1460)
- init_cwnd =
- max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
+ init_cwnd = max_t(u32, (1460 * init_cwnd) / mss, 2);
/* when initializing use the value from init_rcv_wnd
* rather than the default from above
*/
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
- tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
+ tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+ 0, GFP_ATOMIC);
}
/*
* One tasklest per cpu tries to send more skbs.
if (flags & (1UL << TCP_TSQ_DEFERRED))
tcp_tsq_handler(sk);
+ /* Here begins the tricky part :
+ * We are called from release_sock() with :
+ * 1) BH disabled
+ * 2) sk_lock.slock spinlock held
+ * 3) socket owned by us (sk->sk_lock.owned == 1)
+ *
+ * But following code is meant to be called from BH handlers,
+ * so we should keep BH disabled, but early release socket ownership
+ */
+ sock_release_ownership(sk);
+
if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
tcp_write_timer_handler(sk);
__sock_put(sk);
__sock_put(sk);
}
if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
- sk->sk_prot->mtu_reduced(sk);
+ inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
}
&md5);
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
- if (tcp_packets_in_flight(tp) == 0) {
+ if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
- skb->ooo_okay = 1;
- } else
- skb->ooo_okay = 0;
+
+ /* if no packet is in qdisc/device queue, then allow XPS to select
+ * another queue.
+ */
+ skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
skb_orphan(skb);
skb->sk = sk;
- skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
- tcp_wfree : sock_wfree;
+ skb->destructor = tcp_wfree;
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
/* Build TCP header and checksum it. */
static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
unsigned int mss_now)
{
+ /* Make sure we own this skb before messing gso_size/gso_segs */
+ WARN_ON_ONCE(skb_cloned(skb));
+
if (skb->len <= mss_now || !sk_can_gso(sk) ||
skb->ip_summed == CHECKSUM_NONE) {
/* Avoid the costly divide in the normal
if (nsize < 0)
nsize = 0;
- if (skb_cloned(skb) &&
- skb_is_nonlinear(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
return -ENOMEM;
/* Get a new skb... force flag on. */
/* If a full-sized TSO skb can be sent, do it. */
if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
- sk->sk_gso_max_segs * tp->mss_cache))
+ tp->xmit_size_goal_segs * tp->mss_cache))
goto send_now;
/* Middle in queue won't get any more data, full sendable already? */
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
-
tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
BUG_ON(!tso_segs);
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
break;
- if (tso_segs == 1) {
+ if (tso_segs == 1 || !sk->sk_gso_max_segs) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
nonagle : TCP_NAGLE_PUSH))))
break;
}
- /* TSQ : sk_wmem_alloc accounts skb truesize,
- * including skb overhead. But thats OK.
+ /* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * This allows for :
+ * - better RTT estimation and ACK scheduling
+ * - faster recovery
+ * - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate.
+ * One example is wifi aggregation (802.11 AMPDU)
*/
- if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+ limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
+ sk->sk_pacing_rate >> 10);
+
+ if (atomic_read(&sk->sk_wmem_alloc) > limit) {
set_bit(TSQ_THROTTLED, &tp->tsq_flags);
- break;
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED, so we must
+ * test again the condition.
+ * We abuse smp_mb__after_clear_bit() because
+ * there is no smp_mb__after_set_bit() yet
+ */
+ smp_mb__after_clear_bit();
+ if (atomic_read(&sk->sk_wmem_alloc) > limit)
+ break;
}
+
limit = mss_now;
- if (tso_segs > 1 && !tcp_urg_mode(tp))
+ if (tso_segs > 1 && sk->sk_gso_max_segs && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
min_t(unsigned int,
cwnd_quota,
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
- TCP_RTO_MAX);
+ sysctl_tcp_rto_max);
return true;
}
if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
goto rearm_timer;
- /* Probe with zero data doesn't trigger fast recovery. */
- if (skb->len > 0)
- err = __tcp_retransmit_skb(sk, skb);
+ err = __tcp_retransmit_skb(sk, skb);
/* Record snd_nxt for loss detection. */
if (likely(!err))
rearm_timer:
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto,
- TCP_RTO_MAX);
+ sysctl_tcp_rto_max);
if (likely(!err))
NET_INC_STATS_BH(sock_net(sk),
int oldpcount = tcp_skb_pcount(skb);
if (unlikely(oldpcount > 1)) {
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return -ENOMEM;
tcp_init_tso_segs(sk, skb, cur_mss);
tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
}
if (!tp->retrans_stamp)
tp->retrans_stamp = TCP_SKB_CB(skb)->when;
- tp->undo_retrans += tcp_skb_pcount(skb);
-
/* snd_nxt is stored to detect loss of retransmitted segment,
* see tcp_input.c tcp_sacktag_write_queue().
*/
TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
}
+
+ if (tp->undo_retrans < 0)
+ tp->undo_retrans = 0;
+ tp->undo_retrans += tcp_skb_pcount(skb);
return err;
}
if (skb == tcp_write_queue_head(sk))
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto,
- TCP_RTO_MAX);
+ sysctl_tcp_rto_max);
}
}
-/* Send a fin. The caller locks the socket for us. This cannot be
- * allowed to fail queueing a FIN frame under any circumstances.
+/* We allow to exceed memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
+ */
+static void sk_forced_wmem_schedule(struct sock *sk, int size)
+{
+ int amt, status;
+
+ if (size <= sk->sk_forward_alloc)
+ return;
+ amt = sk_mem_pages(size);
+ sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+ sk_memory_allocated_add(sk, amt, &status);
+}
+
+/* Send a FIN. The caller locks the socket for us.
+ * We should try to send a FIN packet really hard, but eventually give up.
*/
void tcp_send_fin(struct sock *sk)
{
+ struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = tcp_write_queue_tail(sk);
- int mss_now;
- /* Optimization, tack on the FIN if we have a queue of
- * unsent frames. But be careful about outgoing SACKS
- * and IP options.
+ /* Optimization, tack on the FIN if we have one skb in write queue and
+ * this skb was not yet sent, or we are under memory pressure.
+ * Note: in the latter case, FIN packet will be sent after a timeout,
+ * as TCP stack thinks it has already been transmitted.
*/
- mss_now = tcp_current_mss(sk);
-
- if (tcp_send_head(sk) != NULL) {
- TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
- TCP_SKB_CB(skb)->end_seq++;
+ if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+coalesce:
+ TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
+ TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
+ if (!tcp_send_head(sk)) {
+ /* This means tskb was already sent.
+ * Pretend we included the FIN on previous transmit.
+ * We need to set tp->snd_nxt to the value it would have
+ * if FIN had been sent. This is because retransmit path
+ * does not change tp->snd_nxt.
+ */
+ tp->snd_nxt++;
+ return;
+ }
} else {
- /* Socket is locked, keep trying until memory is available. */
- for (;;) {
- skb = alloc_skb_fclone(MAX_TCP_HEADER,
- sk->sk_allocation);
- if (skb)
- break;
- yield();
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+ if (unlikely(!skb)) {
+ if (tskb)
+ goto coalesce;
+ return;
}
-
- /* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
+ sk_forced_wmem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN);
tcp_queue_skb(sk, skb);
}
- __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
/* We get here when a process closes a file descriptor (either due to
int tcp_header_size;
int mss;
- skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC));
+ skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
if (unlikely(!skb)) {
dst_release(dst);
return NULL;
}
#endif
+ /* Do not fool tcpdump (if any), clean our debris */
+ skb->tstamp.tv64 = 0;
return skb;
}
EXPORT_SYMBOL(tcp_make_synack);
if (likely(!tp->repair))
tp->rcv_nxt = 0;
+ else
+ tp->rcv_tstamp = tcp_time_stamp;
tp->rcv_wup = tp->rcv_nxt;
tp->copied_seq = tp->rcv_nxt;
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
- struct sk_buff *syn_data = NULL, *data;
+ int syn_loss = 0, space, err = 0;
unsigned long last_syn_loss = 0;
+ struct sk_buff *syn_data;
tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
MAX_TCP_OPTION_SPACE;
- syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
- sk->sk_allocation);
- if (syn_data == NULL)
- goto fallback;
-
- for (i = 0; i < iovlen && syn_data->len < space; ++i) {
- struct iovec *iov = &fo->data->msg_iov[i];
- unsigned char __user *from = iov->iov_base;
- int len = iov->iov_len;
+ space = min_t(size_t, space, fo->size);
- if (syn_data->len + len > space)
- len = space - syn_data->len;
- else if (i + 1 == iovlen)
- /* No more data pending in inet_wait_for_connect() */
- fo->data = NULL;
+ /* limit to order-0 allocations */
+ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
- if (skb_add_data(syn_data, from, len))
- goto fallback;
+ syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
+ if (!syn_data)
+ goto fallback;
+ syn_data->ip_summed = CHECKSUM_PARTIAL;
+ memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
+ skb_shinfo(syn_data)->gso_segs = 1;
+ if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space),
+ fo->data->msg_iov, 0, space))) {
+ kfree_skb(syn_data);
+ goto fallback;
}
- /* Queue a data-only packet after the regular SYN for retransmission */
- data = pskb_copy(syn_data, sk->sk_allocation);
- if (data == NULL)
- goto fallback;
- TCP_SKB_CB(data)->seq++;
- TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
- TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
- tcp_connect_queue_skb(sk, data);
- fo->copied = data->len;
+ /* No more data pending in inet_wait_for_connect() */
+ if (space == fo->size)
+ fo->data = NULL;
+ fo->copied = space;
+
+ tcp_connect_queue_skb(sk, syn_data);
+
+ err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
- if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+ /* Now full SYN+DATA was cloned and sent (or not),
+ * remove the SYN from the original skb (syn_data)
+ * we keep in write queue in case of a retransmit, as we
+ * also have the SYN packet (with no data) in the same queue.
+ */
+ TCP_SKB_CB(syn_data)->seq++;
+ TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
+ if (!err) {
tp->syn_data = (fo->copied > 0);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
goto done;
}
- syn_data = NULL;
fallback:
/* Send a regular SYN with Fast Open cookie request option */
err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
if (err)
tp->syn_fastopen = 0;
- kfree_skb(syn_data);
done:
fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
return err;
return 0;
}
- buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
- if (unlikely(buff == NULL))
+ buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ if (unlikely(!buff))
return -ENOBUFS;
- /* Reserve space for headers. */
- skb_reserve(buff, MAX_TCP_HEADER);
-
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
tcp_connect_queue_skb(sk, buff);
/* Timer for repeating the SYN until an answer. */
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ inet_csk(sk)->icsk_rto, sysctl_tcp_rto_max);
return 0;
}
EXPORT_SYMBOL(tcp_connect);
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- TCP_DELACK_MAX, TCP_RTO_MAX);
+ TCP_DELACK_MAX, sysctl_tcp_rto_max);
return;
}
{
if (sk->sk_state == TCP_ESTABLISHED) {
tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
- tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
tcp_xmit_probe_skb(sk, 0);
}
}
icsk->icsk_backoff++;
icsk->icsk_probes_out++;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
- TCP_RTO_MAX);
+ min_t(unsigned int, icsk->icsk_rto << icsk->icsk_backoff, sysctl_tcp_rto_max),
+ sysctl_tcp_rto_max);
} else {
/* If packet was not sent due to local congestion,
* do not backoff and do not remember icsk_probes_out.
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
min(icsk->icsk_rto << icsk->icsk_backoff,
TCP_RESOURCE_PROBE_INTERVAL),
- TCP_RTO_MAX);
+ sysctl_tcp_rto_max);
}
}