net: accept UFO datagrams from tuntap and packet
author Willem de Bruijn <willemb@google.com>
Tue, 21 Nov 2017 15:22:25 +0000 (10:22 -0500)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 17 Dec 2017 14:07:58 +0000 (15:07 +0100)
[ Upstream commit 0c19f846d582af919db66a5914a0189f9f92c936 ]

Tuntap and similar devices can inject GSO packets. Accept type
VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively.

Processes are expected to use feature negotiation such as TUNSETOFFLOAD
to detect supported offload types and refrain from injecting other
packets. This process breaks down with live migration: guest kernels
do not renegotiate flags, so destination hosts need to expose all
features that the source host does.

Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677.
This patch introduces nearly(*) no new code to simplify verification.
It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP
insertion and software UFO segmentation.

It does not reinstate protocol stack support, hardware offload
(NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception
of VIRTIO_NET_HDR_GSO_UDP packets in tuntap.

To support SKB_GSO_UDP reappearing in the stack, also reinstate
logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD
by squashing in commit 939912216fa8 ("net: skb_needs_check() removes
CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1
("net: avoid skb_warn_bad_offload false positives on UFO").

(*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id,
ipv6_proxy_select_ident is changed to return a __be32 and this is
assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted
at the end of the enum to minimize code churn.

Tested
  Booted a v4.13 guest kernel with QEMU. On a host kernel before this
  patch `ethtool -k eth0` shows UFO disabled. After the patch, it is
  enabled, same as on a v4.13 host kernel.

  A UFO packet sent from the guest appears on the tap device:
    host:
      nc -l -u -p 8000 &
      tcpdump -n -i tap0

    guest:
      dd if=/dev/zero of=payload.txt bs=1 count=2000
      nc -u 192.16.1.1 8000 < payload.txt

  Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds,
  packets arriving fragmented:

    ./with_tap_pair.sh ./tap_send_ufo tap0 tap1
    (from https://github.com/wdebruij/kerneltools/tree/master/tests)

Changes
  v1 -> v2
    - simplified set_offload change (review comment)
    - documented test procedure

Link: http://lkml.kernel.org/r/<CAF=yD-LuUeDuL9YWPJD9ykOZ0QCjNeznPDr6whqZ9NGMNF12Mw@mail.gmail.com>
Fixes: fb652fdfe837 ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.")
Reported-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
15 files changed:
drivers/net/tap.c
drivers/net/tun.c
include/linux/netdev_features.h
include/linux/netdevice.h
include/linux/skbuff.h
include/linux/virtio_net.h
include/net/ipv6.h
net/core/dev.c
net/ipv4/af_inet.c
net/ipv4/udp_offload.c
net/ipv6/output_core.c
net/ipv6/udp_offload.c
net/openvswitch/datapath.c
net/openvswitch/flow.c
net/sched/act_csum.c

index e4c6c78fab3b34efc16a262c6a588f5021cb5a45..bfd4ded0a53fb015226d0b03ceb9e5dda9f904e5 100644 (file)
@@ -1080,7 +1080,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
        case TUNSETOFFLOAD:
                /* let the user check for future flags */
                if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
-                           TUN_F_TSO_ECN))
+                           TUN_F_TSO_ECN | TUN_F_UFO))
                        return -EINVAL;
 
                rtnl_lock();
index d06f88312e1e8ff1c3b701aa34f6e6d3ac7219f8..c91b110f21699b97a81963567f66be4847a222c0 100644 (file)
@@ -2157,6 +2157,8 @@ static int set_offload(struct tun_struct *tun, unsigned long arg)
                                features |= NETIF_F_TSO6;
                        arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
                }
+
+               arg &= ~TUN_F_UFO;
        }
 
        /* This gives the user a way to test for new features in future by
index dc8b4896b77b090e8329bdee9766033a6a3b95fb..b1b0ca7ccb2bacac5d997f97f86848e928bc9da7 100644 (file)
@@ -54,8 +54,9 @@ enum {
        NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
        NETIF_F_GSO_SCTP_BIT,           /* ... SCTP fragmentation */
        NETIF_F_GSO_ESP_BIT,            /* ... ESP with TSO */
+       NETIF_F_GSO_UDP_BIT,            /* ... UFO, deprecated except tuntap */
        /**/NETIF_F_GSO_LAST =          /* last bit, see GSO_MASK */
-               NETIF_F_GSO_ESP_BIT,
+               NETIF_F_GSO_UDP_BIT,
 
        NETIF_F_FCOE_CRC_BIT,           /* FCoE CRC32 */
        NETIF_F_SCTP_CRC_BIT,           /* SCTP checksum offload */
@@ -132,6 +133,7 @@ enum {
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_GSO_SCTP       __NETIF_F(GSO_SCTP)
 #define NETIF_F_GSO_ESP                __NETIF_F(GSO_ESP)
+#define NETIF_F_GSO_UDP                __NETIF_F(GSO_UDP)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX        __NETIF_F(HW_VLAN_STAG_RX)
 #define NETIF_F_HW_VLAN_STAG_TX        __NETIF_F(HW_VLAN_STAG_TX)
index 2eaac7d75af4f1bbdaf876acc55b4bd0d37a7f36..46bf7cc7d5d589ebaa1209312ba5fc671a6eb802 100644 (file)
@@ -4101,6 +4101,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
        BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_SCTP    != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
+       BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));
 
        return (features & feature) == feature;
 }
index d448a4804aeabbbb44179d5e47e9f806733d3e66..051e0939ec190ab2d3701508917fa4119e8b17a8 100644 (file)
@@ -569,6 +569,8 @@ enum {
        SKB_GSO_SCTP = 1 << 14,
 
        SKB_GSO_ESP = 1 << 15,
+
+       SKB_GSO_UDP = 1 << 16,
 };
 
 #if BITS_PER_LONG > 32
index 210034c896e31e725c6de6bfb33b0406c3b1927a..f144216febc642fd70512df9dddefe1a7f119478 100644 (file)
@@ -9,7 +9,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
                                        const struct virtio_net_hdr *hdr,
                                        bool little_endian)
 {
-       unsigned short gso_type = 0;
+       unsigned int gso_type = 0;
 
        if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
@@ -19,6 +19,9 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
                case VIRTIO_NET_HDR_GSO_TCPV6:
                        gso_type = SKB_GSO_TCPV6;
                        break;
+               case VIRTIO_NET_HDR_GSO_UDP:
+                       gso_type = SKB_GSO_UDP;
+                       break;
                default:
                        return -EINVAL;
                }
index 6eac5cf8f1e6e00510355a0f4eaa9566a5a0086f..35e9dd2d18baa4b5a33c8653b4b402090eab3dc6 100644 (file)
@@ -727,7 +727,7 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
 __be32 ipv6_select_ident(struct net *net,
                         const struct in6_addr *daddr,
                         const struct in6_addr *saddr);
-void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);
+__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);
 
 int ip6_dst_hoplimit(struct dst_entry *dst);
 
index 11596a302a265212cb5dfe40f51b5b01fb20d0ce..27357fc1730b93ebf302dbaf0d7a365dd56e4dea 100644 (file)
@@ -2735,7 +2735,8 @@ EXPORT_SYMBOL(skb_mac_gso_segment);
 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 {
        if (tx_path)
-               return skb->ip_summed != CHECKSUM_PARTIAL;
+               return skb->ip_summed != CHECKSUM_PARTIAL &&
+                      skb->ip_summed != CHECKSUM_UNNECESSARY;
 
        return skb->ip_summed == CHECKSUM_NONE;
 }
index e31108e5ef79c574751bb23773b41c2daf6e0975..b9d9a2b8792c7a9aa6744f80b55f7b5a727a5cef 100644 (file)
@@ -1221,9 +1221,10 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                                 netdev_features_t features)
 {
-       bool fixedid = false, gso_partial, encap;
+       bool udpfrag = false, fixedid = false, gso_partial, encap;
        struct sk_buff *segs = ERR_PTR(-EINVAL);
        const struct net_offload *ops;
+       unsigned int offset = 0;
        struct iphdr *iph;
        int proto, tot_len;
        int nhoff;
@@ -1258,6 +1259,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
        segs = ERR_PTR(-EPROTONOSUPPORT);
 
        if (!skb->encapsulation || encap) {
+               udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
                fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
 
                /* fixed ID is invalid if DF bit is not set */
@@ -1277,7 +1279,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
        skb = segs;
        do {
                iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
-               if (skb_is_gso(skb)) {
+               if (udpfrag) {
+                       iph->frag_off = htons(offset >> 3);
+                       if (skb->next)
+                               iph->frag_off |= htons(IP_MF);
+                       offset += skb->len - nhoff - ihl;
+                       tot_len = skb->len - nhoff;
+               } else if (skb_is_gso(skb)) {
                        if (!fixedid) {
                                iph->id = htons(id);
                                id += skb_shinfo(skb)->gso_segs;
index e360d55be5554d1bee56d3f493752ba9ae2c8015..01801b77bd0da45764fd0e9a80f22b0e46633934 100644 (file)
@@ -187,16 +187,57 @@ out_unlock:
 }
 EXPORT_SYMBOL(skb_udp_tunnel_segment);
 
-static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb,
-                                          netdev_features_t features)
+static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
+                                        netdev_features_t features)
 {
        struct sk_buff *segs = ERR_PTR(-EINVAL);
+       unsigned int mss;
+       __wsum csum;
+       struct udphdr *uh;
+       struct iphdr *iph;
 
        if (skb->encapsulation &&
            (skb_shinfo(skb)->gso_type &
-            (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)))
+            (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
                segs = skb_udp_tunnel_segment(skb, features, false);
+               goto out;
+       }
+
+       if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+               goto out;
+
+       mss = skb_shinfo(skb)->gso_size;
+       if (unlikely(skb->len <= mss))
+               goto out;
+
+       /* Do software UFO. Complete and fill in the UDP checksum as
+        * HW cannot do checksum of UDP packets sent as multiple
+        * IP fragments.
+        */
 
+       uh = udp_hdr(skb);
+       iph = ip_hdr(skb);
+
+       uh->check = 0;
+       csum = skb_checksum(skb, 0, skb->len, 0);
+       uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum);
+       if (uh->check == 0)
+               uh->check = CSUM_MANGLED_0;
+
+       skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+       /* If there is no outer header we can fake a checksum offload
+        * due to the fact that we have already done the checksum in
+        * software prior to segmenting the frame.
+        */
+       if (!skb->encap_hdr_csum)
+               features |= NETIF_F_HW_CSUM;
+
+       /* Fragment the skb. IP headers of the fragments are updated in
+        * inet_gso_segment()
+        */
+       segs = skb_segment(skb, features);
+out:
        return segs;
 }
 
@@ -330,7 +371,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 
 static const struct net_offload udpv4_offload = {
        .callbacks = {
-               .gso_segment = udp4_tunnel_segment,
+               .gso_segment = udp4_ufo_fragment,
                .gro_receive  = udp4_gro_receive,
                .gro_complete = udp4_gro_complete,
        },
index a338bbc33cf3cd895fa77e137d6f6389e9a6519c..4fe7c90962ddae3356200376aa911bab6d75bb48 100644 (file)
@@ -39,7 +39,7 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
  *
  * The network header must be set before calling this.
  */
-void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
+__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
 {
        static u32 ip6_proxy_idents_hashrnd __read_mostly;
        struct in6_addr buf[2];
@@ -51,14 +51,14 @@ void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
                                   offsetof(struct ipv6hdr, saddr),
                                   sizeof(buf), buf);
        if (!addrs)
-               return;
+               return 0;
 
        net_get_random_once(&ip6_proxy_idents_hashrnd,
                            sizeof(ip6_proxy_idents_hashrnd));
 
        id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd,
                                 &addrs[1], &addrs[0]);
-       skb_shinfo(skb)->ip6_frag_id = htonl(id);
+       return htonl(id);
 }
 EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
 
index 455fd4e39333233289e9a844de512f200119ff1a..a0f89ad76f9d2233b9e048418069aacd92ac6a25 100644 (file)
 #include <net/ip6_checksum.h>
 #include "ip6_offload.h"
 
-static struct sk_buff *udp6_tunnel_segment(struct sk_buff *skb,
-                                          netdev_features_t features)
+static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
+                                        netdev_features_t features)
 {
        struct sk_buff *segs = ERR_PTR(-EINVAL);
+       unsigned int mss;
+       unsigned int unfrag_ip6hlen, unfrag_len;
+       struct frag_hdr *fptr;
+       u8 *packet_start, *prevhdr;
+       u8 nexthdr;
+       u8 frag_hdr_sz = sizeof(struct frag_hdr);
+       __wsum csum;
+       int tnl_hlen;
+       int err;
+
+       mss = skb_shinfo(skb)->gso_size;
+       if (unlikely(skb->len <= mss))
+               goto out;
 
        if (skb->encapsulation && skb_shinfo(skb)->gso_type &
            (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))
                segs = skb_udp_tunnel_segment(skb, features, true);
+       else {
+               const struct ipv6hdr *ipv6h;
+               struct udphdr *uh;
+
+               if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+                       goto out;
+
+               /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
+                * do checksum of UDP packets sent as multiple IP fragments.
+                */
+
+               uh = udp_hdr(skb);
+               ipv6h = ipv6_hdr(skb);
+
+               uh->check = 0;
+               csum = skb_checksum(skb, 0, skb->len, 0);
+               uh->check = udp_v6_check(skb->len, &ipv6h->saddr,
+                                         &ipv6h->daddr, csum);
+               if (uh->check == 0)
+                       uh->check = CSUM_MANGLED_0;
+
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+               /* If there is no outer header we can fake a checksum offload
+                * due to the fact that we have already done the checksum in
+                * software prior to segmenting the frame.
+                */
+               if (!skb->encap_hdr_csum)
+                       features |= NETIF_F_HW_CSUM;
+
+               /* Check if there is enough headroom to insert fragment header. */
+               tnl_hlen = skb_tnl_header_len(skb);
+               if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) {
+                       if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz))
+                               goto out;
+               }
+
+               /* Find the unfragmentable header and shift it left by frag_hdr_sz
+                * bytes to insert fragment header.
+                */
+               err = ip6_find_1stfragopt(skb, &prevhdr);
+               if (err < 0)
+                       return ERR_PTR(err);
+               unfrag_ip6hlen = err;
+               nexthdr = *prevhdr;
+               *prevhdr = NEXTHDR_FRAGMENT;
+               unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
+                            unfrag_ip6hlen + tnl_hlen;
+               packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset;
+               memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len);
+
+               SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz;
+               skb->mac_header -= frag_hdr_sz;
+               skb->network_header -= frag_hdr_sz;
+
+               fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
+               fptr->nexthdr = nexthdr;
+               fptr->reserved = 0;
+               fptr->identification = ipv6_proxy_select_ident(dev_net(skb->dev), skb);
+
+               /* Fragment the skb. ipv6 header and the remaining fields of the
+                * fragment header are updated in ipv6_gso_segment()
+                */
+               segs = skb_segment(skb, features);
+       }
 
+out:
        return segs;
 }
 
@@ -75,7 +154,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 
 static const struct net_offload udpv6_offload = {
        .callbacks = {
-               .gso_segment    =       udp6_tunnel_segment,
+               .gso_segment    =       udp6_ufo_fragment,
                .gro_receive    =       udp6_gro_receive,
                .gro_complete   =       udp6_gro_complete,
        },
index c3aec6227c91b776dd41c7c8f6a9b1d1e313aee4..294444bb075c8b88dd3807a0a659be17a43863c4 100644 (file)
@@ -335,6 +335,8 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
                             const struct dp_upcall_info *upcall_info,
                                 uint32_t cutlen)
 {
+       unsigned short gso_type = skb_shinfo(skb)->gso_type;
+       struct sw_flow_key later_key;
        struct sk_buff *segs, *nskb;
        int err;
 
@@ -345,9 +347,21 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
        if (segs == NULL)
                return -EINVAL;
 
+       if (gso_type & SKB_GSO_UDP) {
+               /* The initial flow key extracted by ovs_flow_key_extract()
+                * in this case is for a first fragment, so we need to
+                * properly mark later fragments.
+                */
+               later_key = *key;
+               later_key.ip.frag = OVS_FRAG_TYPE_LATER;
+       }
+
        /* Queue all of the segments. */
        skb = segs;
        do {
+               if (gso_type & SKB_GSO_UDP && skb != segs)
+                       key = &later_key;
+
                err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen);
                if (err)
                        break;
index 8c94cef25a72b685456b2ab3e7155b17f944abcf..cfb652a4e007732bcb1e5b0b1109d111a1e16a7c 100644 (file)
@@ -584,7 +584,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
                        key->ip.frag = OVS_FRAG_TYPE_LATER;
                        return 0;
                }
-               if (nh->frag_off & htons(IP_MF))
+               if (nh->frag_off & htons(IP_MF) ||
+                       skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
                        key->ip.frag = OVS_FRAG_TYPE_FIRST;
                else
                        key->ip.frag = OVS_FRAG_TYPE_NONE;
@@ -700,6 +701,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 
                if (key->ip.frag == OVS_FRAG_TYPE_LATER)
                        return 0;
+               if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+                       key->ip.frag = OVS_FRAG_TYPE_FIRST;
+
                /* Transport layer. */
                if (key->ip.proto == NEXTHDR_TCP) {
                        if (tcphdr_ok(skb)) {
index 1c40caadcff959ba0c6cec6b8e32f7b459c42cfa..d836f998117b2417548b22a73940300405ce65b8 100644 (file)
@@ -229,6 +229,9 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl,
        const struct iphdr *iph;
        u16 ul;
 
+       if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+               return 1;
+
        /*
         * Support both UDP and UDPLITE checksum algorithms, Don't use
         * udph->len to get the real length without any protocol check,
@@ -282,6 +285,9 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl,
        const struct ipv6hdr *ip6h;
        u16 ul;
 
+       if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+               return 1;
+
        /*
         * Support both UDP and UDPLITE checksum algorithms, Don't use
         * udph->len to get the real length without any protocol check,