#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
+#ifdef CONFIG_MPTCP
+ #include <net/mptcp.h>
+ #include <net/mptcp_v4.h>
+#endif
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
const int type = icmp_hdr(icmp_skb)->type;
const int code = icmp_hdr(icmp_skb)->code;
struct sock *sk;
+#ifdef CONFIG_MPTCP
+ struct sock *meta_sk;
+#endif
struct sk_buff *skb;
struct request_sock *fastopen;
__u32 seq, snd_una;
(code == ICMP_NET_UNREACH ||
code == ICMP_HOST_UNREACH)));
+#ifdef CONFIG_MPTCP
+ tp = tcp_sk(sk);
+ if (mptcp(tp))
+ meta_sk = mptcp_meta_sk(sk);
+ else
+ meta_sk = sk;
+
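+ /* For an MPTCP subflow, ICMP errors are handled under the meta-socket
+ * lock, so the sock_owned_by_user() checks below refer to the owner of
+ * the whole connection rather than of a single subflow.
+ */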
+ bh_lock_sock(meta_sk);
+#else
bh_lock_sock(sk);
+#endif
/* If too many ICMPs get dropped on busy
* servers this needs to be solved differently.
* We do take care of PMTU discovery (RFC1191) special case :
* we can receive locally generated ICMP messages while socket is held.
*/
+#ifdef CONFIG_MPTCP
+ if (sock_owned_by_user(meta_sk)) {
+#else
if (sock_owned_by_user(sk)) {
+#endif
if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
}
}
icsk = inet_csk(sk);
+#ifndef CONFIG_MPTCP
tp = tcp_sk(sk);
+#endif
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
fastopen = tp->fastopen_rsk;
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
goto out;
tp->mtu_info = info;
+#ifdef CONFIG_MPTCP
+ if (!sock_owned_by_user(meta_sk)) {
+#else
if (!sock_owned_by_user(sk)) {
+#endif
tcp_v4_mtu_reduced(sk);
} else {
if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
sock_hold(sk);
+#ifdef CONFIG_MPTCP
+ if (mptcp(tp))
+ mptcp_tsq_flags(sk);
+#endif
}
goto out;
}
!icsk->icsk_backoff || fastopen)
break;
+#ifdef CONFIG_MPTCP
+ if (sock_owned_by_user(meta_sk))
+#else
if (sock_owned_by_user(sk))
+#endif
break;
icsk->icsk_backoff--;
} else {
/* RTO revert clocked out retransmission.
* Will retransmit now */
- tcp_retransmit_timer(sk);
+#ifdef CONFIG_MPTCP
+ tcp_sk(sk)->ops->retransmit_timer(sk);
+#else
+ tcp_retransmit_timer(sk);
+#endif
}
break;
if (fastopen && !fastopen->sk)
break;
+#ifdef CONFIG_MPTCP
+ if (!sock_owned_by_user(meta_sk)) {
+#else
if (!sock_owned_by_user(sk)) {
+#endif
sk->sk_err = err;
sk->sk_error_report(sk);
*/
inet = inet_sk(sk);
+#ifdef CONFIG_MPTCP
+ if (!sock_owned_by_user(meta_sk) && inet->recverr) {
+#else
if (!sock_owned_by_user(sk) && inet->recverr) {
+#endif
sk->sk_err = err;
sk->sk_error_report(sk);
} else { /* Only an error on timeout */
}
out:
+#ifdef CONFIG_MPTCP
+ bh_unlock_sock(meta_sk);
+#else
bh_unlock_sock(sk);
+#endif
sock_put(sk);
}
* Exception: precedence violation. We do not implement it in any case.
*/
-static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
+#ifndef CONFIG_MPTCP
+static
+#endif
+void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
outside socket context is ugly, certainly. What can I do?
*/
-
+#ifdef CONFIG_MPTCP
+static void tcp_v4_send_ack(const struct sock *sk,
+ struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
+ u32 win, u32 tsval, u32 tsecr, int oif,
+ struct tcp_md5sig_key *key,
+ int reply_flags, u8 tos, int mptcp)
+#else
static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb,
u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
int reply_flags, u8 tos)
+#endif
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
+ (TCPOLEN_MD5SIG_ALIGNED >> 2)
+#endif
+#ifdef CONFIG_MPTCP
+ + ((MPTCP_SUB_LEN_DSS >> 2) +
+ (MPTCP_SUB_LEN_ACK >> 2))
#endif
];
} rep;
ip_hdr(skb)->daddr, &rep.th);
}
#endif
+#ifdef CONFIG_MPTCP
+ if (mptcp) {
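+ /* Append an MPTCP DSS option carrying a 32-bit DATA_ACK. When a timestamp
+ * option was written above it occupies the first three 32-bit words of
+ * rep.opt, so the DSS option starts at word 3.
+ */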
+ int offset = (tsecr) ? 3 : 0;
+ /* Construction of 32-bit data_ack */
+ rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
+ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
+ (0x20 << 8) |
+ (0x01));
+ rep.opt[offset] = htonl(data_ack);
+
+ arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
+ rep.th.doff = arg.iov[0].iov_len / 4;
+ }
+#endif /* CONFIG_MPTCP */
arg.flags = reply_flags;
arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr, /* XXX */
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
-
+#ifdef CONFIG_MPTCP
+ u32 data_ack = 0;
+ int mptcp = 0;
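+ /* A TIME-WAIT socket that belonged to an MPTCP connection acknowledges at
+ * the data level as well: echo the meta-level rcv_nxt as a 32-bit DATA_ACK.
+ */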
+ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
+ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
+ mptcp = 1;
+ }
+#endif
tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+#ifdef CONFIG_MPTCP
+ data_ack,
+#endif
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp + tcptw->tw_ts_offset,
tcptw->tw_ts_recent,
tcp_twsk_md5_key(tcptw),
tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
tw->tw_tos
+#ifdef CONFIG_MPTCP
+ , mptcp
+#endif
);
inet_twsk_put(tw);
}
-static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
+#ifndef CONFIG_MPTCP
+static
+#endif
+void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
- /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+ /* sk->sk_state == TCP_LISTEN or meta-sk -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
- tcp_v4_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
- tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
- tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
+#ifdef CONFIG_MPTCP
+ u32 seq = (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) ?
+ tcp_rsk(req)->snt_isn + 1 :
+ tcp_sk(sk)->snd_nxt;
+#else
+ u32 seq = (sk->sk_state == TCP_LISTEN) ?
+ tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt;
+#endif
+
+ tcp_v4_send_ack(sk, skb, seq,
+ tcp_rsk(req)->rcv_nxt,
+#ifdef CONFIG_MPTCP
+ 0,
+#endif
+ req->rsk_rcv_wnd,
tcp_time_stamp,
req->ts_recent,
0,
tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
AF_INET),
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
- ip_hdr(skb)->tos);
+ ip_hdr(skb)->tos
+#ifdef CONFIG_MPTCP
+ , 0
+#endif
+ );
}
/*
* This still operates on a request_sock only, not on a big
* socket.
*/
-static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
+#ifndef CONFIG_MPTCP
+static
+#endif
+int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
/*
* IPv4 request_sock destructor.
*/
-static void tcp_v4_reqsk_destructor(struct request_sock *req)
+#ifndef CONFIG_MPTCP
+static
+#endif
+void tcp_v4_reqsk_destructor(struct request_sock *req)
{
kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}
return false;
}
+#ifdef CONFIG_MPTCP
+static int tcp_v4_init_req(struct request_sock *req, const struct sock *sk_listener,
+ struct sk_buff *skb, bool want_cookie)
+#else
static void tcp_v4_init_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb)
+#endif
{
struct inet_request_sock *ireq = inet_rsk(req);
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
ireq->no_srccheck = inet_sk(sk_listener)->transparent;
RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(skb));
+
+#ifdef CONFIG_MPTCP
+ return 0;
+#endif
}
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
.syn_ack_timeout = tcp_syn_ack_timeout,
};
-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+#ifndef CONFIG_MPTCP
+static
+#endif
+const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.mss_clamp = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
.req_md5_lookup = tcp_v4_md5_lookup,
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
-static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
+#ifndef CONFIG_MPTCP
+static
+#endif
+struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
const struct tcphdr *th = tcp_hdr(skb);
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
+#ifdef CONFIG_MPTCP
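+ /* Segments handed to the MPTCP meta-socket (for example from its backlog)
+ * take the MPTCP receive path instead of plain TCP.
+ */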
+ if (is_meta_sk(sk))
+ return mptcp_v4_do_rcv(sk, skb);
+#endif
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
struct dst_entry *dst = sk->sk_rx_dst;
} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
wake_up_interruptible_sync_poll(sk_sleep(sk),
POLLIN | POLLRDNORM | POLLRDBAND);
- if (!inet_csk_ack_scheduled(sk))
+ if (!inet_csk_ack_scheduled(sk)
+#ifdef CONFIG_MPTCP
+ && !mptcp(tp)
+#endif
+ )
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
(3 * tcp_rto_min(sk)) / 4,
TCP_RTO_MAX);
* From tcp_input.c
*/
+#define RC_RETRY_CNT 3
int tcp_v4_rcv(struct sk_buff *skb)
{
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
+#ifdef CONFIG_MPTCP
+ struct sock *meta_sk = NULL;
+#endif
int ret;
struct net *net = dev_net(skb->dev);
+ unsigned int retry_cnt = RC_RETRY_CNT;
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
skb->len - th->doff * 4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+#ifdef CONFIG_MPTCP
+ TCP_SKB_CB(skb)->mptcp_flags = 0;
+ TCP_SKB_CB(skb)->dss_off = 0;
+#endif
TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
TCP_SKB_CB(skb)->tcp_tw_isn = 0;
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
+#ifndef CONFIG_MPTCP
if (!sk)
goto no_tcp_socket;
+#endif
process:
+#ifdef CONFIG_MPTCP
+ if (sk && sk->sk_state == TCP_TIME_WAIT)
+#else
if (sk->sk_state == TCP_TIME_WAIT)
+#endif
goto do_time_wait;
+#ifdef CONFIG_MPTCP
+ if (!sk)
+ goto no_tcp_socket;
+#endif
+ /*
+ * FIXME: SEC patch for P171206-06874
+ * If the ACKs completing a three-way handshake arrive on several cores at
+ * the same time, each core tries to take the request socket and create the
+ * new socket that establishes the TCP connection. There is no
+ * synchronization scheme for the request socket, so the losing core fails
+ * to create the socket and its ACK is discarded.
+ *
+ * If that second ACK carries data, the drop is user-visible. The losing
+ * core therefore waits here until the winning core has created the new
+ * socket.
+ */
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+ if (atomic_read(&req->rsk_refcnt) > (2+1) && retry_cnt > 0) {
+ reqsk_put(req);
+ if (retry_cnt == RC_RETRY_CNT)
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPRACECNDREQSK);
+ retry_cnt--;
+ udelay(500);
+
+ goto lookup;
+ }
+
+ if (!retry_cnt)
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPRACECNDREQSKDROP);
+ }
+
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
struct sock *nsk;
reqsk_put(req);
goto discard_it;
}
- if (unlikely(sk->sk_state != TCP_LISTEN)) {
+ if (unlikely(sk->sk_state != TCP_LISTEN
+#ifdef CONFIG_MPTCP
+ && !is_meta_sk(sk)
+#endif
+ )) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
sock_hold(sk);
+
+#ifdef CONFIG_MPTCP
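+ /* For MPTCP, the request socket may be attached to the meta-socket (a new
+ * subflow joining an existing connection). If the meta-socket is owned by
+ * the user, defer the segment to its backlog instead of completing the
+ * handshake here.
+ */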
+ if (is_meta_sk(sk)) {
+ bh_lock_sock(sk);
+
+ if (sock_owned_by_user(sk)) {
+ skb->sk = sk;
+ if (unlikely(sk_add_backlog(sk, skb,
+ sk->sk_rcvbuf + sk->sk_sndbuf))) {
+ bh_unlock_sock(sk);
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+
+ reqsk_put(req);
+ goto discard_and_relse;
+ }
+
+ reqsk_put(req);
+ bh_unlock_sock(sk);
+ sock_put(sk);
+
+ return 0;
+ }
+ }
+#endif
nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
+#ifdef CONFIG_MPTCP
+ if (is_meta_sk(sk))
+ bh_unlock_sock(sk);
+#endif
goto discard_and_relse;
}
if (nsk == sk) {
reqsk_put(req);
+#ifdef CONFIG_MPTCP
+ if (is_meta_sk(sk))
+ bh_unlock_sock(sk);
+#endif
} else if (tcp_child_process(sk, nsk, skb)) {
tcp_v4_send_reset(nsk, skb);
goto discard_and_relse;
sk_incoming_cpu_update(sk);
- bh_lock_sock_nested(sk);
+#ifdef CONFIG_MPTCP
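+ /* Established MPTCP subflows are serialized on the meta-socket lock; when
+ * the meta-socket is owned by the user, remember the subflow in skb->sk
+ * before the segment is backlogged.
+ */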
+ if (mptcp(tcp_sk(sk))) {
+ meta_sk = mptcp_meta_sk(sk);
+
+ bh_lock_sock_nested(meta_sk);
+ if (sock_owned_by_user(meta_sk))
+ skb->sk = sk;
+ } else {
+ meta_sk = sk;
+#endif
+ bh_lock_sock_nested(sk);
+#ifdef CONFIG_MPTCP
+ }
+#endif
tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
ret = 0;
+#ifdef CONFIG_MPTCP
+ if (!sock_owned_by_user(meta_sk)) {
+ if (!tcp_prequeue(meta_sk, skb))
+ ret = tcp_v4_do_rcv(sk, skb);
+ } else if (unlikely(sk_add_backlog(meta_sk, skb,
+ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
+ bh_unlock_sock(meta_sk);
+#else
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb);
} else if (unlikely(sk_add_backlog(sk, skb,
sk->sk_rcvbuf + sk->sk_sndbuf))) {
bh_unlock_sock(sk);
+#endif
NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
goto discard_and_relse;
}
+#ifdef CONFIG_MPTCP
+ bh_unlock_sock(meta_sk);
+#else
bh_unlock_sock(sk);
+#endif
put_and_return:
sock_put(sk);
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
+#ifdef CONFIG_MPTCP
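+ /* A SYN that matches no socket may still be an MP_JOIN for an existing
+ * MPTCP connection. mptcp_lookup_join() returns > 0 when it consumed the
+ * skb and < 0 when the join has to be answered with a reset.
+ */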
+ if (!sk && th->syn && !th->ack) {
+ int ret = mptcp_lookup_join(skb, NULL);
+
+ if (ret < 0) {
+ tcp_v4_send_reset(NULL, skb);
+ goto discard_it;
+ } else if (ret > 0) {
+ return 0;
+ }
+ }
+#endif
+
if (tcp_checksum_complete(skb)) {
csum_error:
TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
sk = sk2;
goto process;
}
+#ifdef CONFIG_MPTCP
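+ /* Likewise, a SYN arriving on a tuple in TIME-WAIT may be an MP_JOIN;
+ * try the token lookup before falling through to the ACK handling below.
+ */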
+ if (th->syn && !th->ack) {
+ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
+
+ if (ret < 0) {
+ tcp_v4_send_reset(NULL, skb);
+ goto discard_it;
+ } else if (ret > 0) {
+ return 0;
+ }
+ }
+#endif
/* Fall through to ACK */
}
case TCP_TW_ACK:
tcp_init_sock(sk);
+#ifdef CONFIG_MPTCP
+ if (sock_flag(sk, SOCK_MPTCP))
+ icsk->icsk_af_ops = &mptcp_v4_specific;
+ else
+#endif
icsk->icsk_af_ops = &ipv4_specific;
#ifdef CONFIG_TCP_MD5SIG
tcp_clear_xmit_timers(sk);
tcp_cleanup_congestion_control(sk);
+#ifdef CONFIG_MPTCP
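+ /* Release MPTCP state attached to this socket and remove it from the
+ * token hash table if it is still linked there.
+ */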
+ if (mptcp(tp))
+ mptcp_destroy_sock(sk);
+ if (tp->inside_tk_table)
+ mptcp_hash_remove(tp);
+#endif
/* Cleanup up the write buffer. */
tcp_write_queue_purge(sk);
}
#endif /* CONFIG_PROC_FS */
+#ifdef CONFIG_MPTCP
+static void tcp_v4_clear_sk(struct sock *sk, int size)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* we do not want to clear tk_table field, because of RCU lookups */
+ sk_prot_clear_nulls(sk, offsetof(struct tcp_sock, tk_table.next));
+
+ memset(&tp->tk_table.pprev, 0, size - offsetof(struct tcp_sock, tk_table.pprev));
+}
+
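+/* Clone-time counterpart of tcp_v4_clear_sk(): copy the socket without
+ * touching tk_table.next, then reset tk_table.pprev, so concurrent RCU
+ * lookups of the MPTCP token table never see the clone as hashed.
+ */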
+void tcp_copy_sk(struct sock *nsk, const struct sock *osk)
+{
+ struct tcp_sock *ntp = tcp_sk(nsk);
+ struct tcp_sock *otp = tcp_sk(osk);
+
+ memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
+
+ memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
+ offsetof(struct tcp_sock, tk_table.next) - offsetof(struct sock, sk_dontcopy_end));
+
+ memcpy(&ntp->tk_table.pprev, &otp->tk_table.pprev,
+ osk->sk_prot->obj_size - offsetof(struct tcp_sock, tk_table.pprev));
+
+ ntp->tk_table.pprev = NULL;
+}
+#endif
+
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.proto_cgroup = tcp_proto_cgroup,
#endif
.diag_destroy = tcp_abort,
+#ifdef CONFIG_MPTCP
+ .clear_sk = tcp_v4_clear_sk,
+ .copy_sk = tcp_copy_sk,
+#endif
};
EXPORT_SYMBOL(tcp_prot);