ip6_tunnel: add collect_md mode to IPv6 tunnels
author Alexei Starovoitov <ast@fb.com>
Thu, 15 Sep 2016 20:00:30 +0000 (13:00 -0700)
committer David S. Miller <davem@davemloft.net>
Sat, 17 Sep 2016 14:13:07 +0000 (10:13 -0400)
Similar to gre, vxlan and geneve tunnels, allow IPIP6 and IP6IP6 tunnels
to operate in 'collect metadata' mode.
Unlike the ipv4 code, here it's possible to reuse the ip6_tnl_xmit() function
for both collect_md and traditional tunnels.
bpf_skb_[gs]et_tunnel_key() helpers and ovs (in the future) are the users.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/ip6_tunnel.h
net/ipv6/ip6_tunnel.c

index 43a5a0e4524cbbd9daa6f9bd0b50a92b229deabd..20ed9699fcd40be5362083fdbbf58d2da9420b44 100644 (file)
@@ -23,6 +23,7 @@ struct __ip6_tnl_parm {
        __u8 proto;             /* tunnel protocol */
        __u8 encap_limit;       /* encapsulation limit for tunnel */
        __u8 hop_limit;         /* hop limit for tunnel */
+       bool collect_md;
        __be32 flowinfo;        /* traffic class and flowlabel for tunnel */
        __u32 flags;            /* tunnel flags */
        struct in6_addr laddr;  /* local tunnel end-point address */
index 5c5779720ef1979495ed80782ef63964161f9dc4..6a66adba0c229a0fd11c5806163a4cb87eb9483c 100644 (file)
@@ -57,6 +57,7 @@
 #include <net/inet_ecn.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/dst_metadata.h>
 
 MODULE_AUTHOR("Ville Nuorvala");
 MODULE_DESCRIPTION("IPv6 tunneling device");
@@ -90,6 +91,7 @@ struct ip6_tnl_net {
        struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE];
        struct ip6_tnl __rcu *tnls_wc[1];
        struct ip6_tnl __rcu **tnls[2];
+       struct ip6_tnl __rcu *collect_md_tun;
 };
 
 static struct net_device_stats *ip6_get_stats(struct net_device *dev)
@@ -166,6 +168,10 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
                        return t;
        }
 
+       t = rcu_dereference(ip6n->collect_md_tun);
+       if (t)
+               return t;
+
        t = rcu_dereference(ip6n->tnls_wc[0]);
        if (t && (t->dev->flags & IFF_UP))
                return t;
@@ -209,6 +215,8 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
 {
        struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
 
+       if (t->parms.collect_md)
+               rcu_assign_pointer(ip6n->collect_md_tun, t);
        rcu_assign_pointer(t->next , rtnl_dereference(*tp));
        rcu_assign_pointer(*tp, t);
 }
@@ -224,6 +232,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
        struct ip6_tnl __rcu **tp;
        struct ip6_tnl *iter;
 
+       if (t->parms.collect_md)
+               rcu_assign_pointer(ip6n->collect_md_tun, NULL);
+
        for (tp = ip6_tnl_bucket(ip6n, &t->parms);
             (iter = rtnl_dereference(*tp)) != NULL;
             tp = &iter->next) {
@@ -829,6 +840,9 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
 
        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
 
+       if (tun_dst)
+               skb_dst_set(skb, (struct dst_entry *)tun_dst);
+
        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;
 
@@ -865,6 +879,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
 {
        struct ip6_tnl *t;
        const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+       struct metadata_dst *tun_dst = NULL;
        int ret = -1;
 
        rcu_read_lock();
@@ -881,7 +896,12 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
                        goto drop;
                if (iptunnel_pull_header(skb, 0, tpi->proto, false))
                        goto drop;
-               ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate,
+               if (t->parms.collect_md) {
+                       tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
+                       if (!tun_dst)
+                               return 0;
+               }
+               ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
                                    log_ecn_error);
        }
 
@@ -1012,8 +1032,16 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
        int mtu;
        unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
        unsigned int max_headroom = psh_hlen;
+       u8 hop_limit;
        int err = -1;
 
+       if (t->parms.collect_md) {
+               hop_limit = skb_tunnel_info(skb)->key.ttl;
+               goto route_lookup;
+       } else {
+               hop_limit = t->parms.hop_limit;
+       }
+
        /* NBMA tunnel */
        if (ipv6_addr_any(&t->parms.raddr)) {
                struct in6_addr *addr6;
@@ -1043,6 +1071,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
                goto tx_err_link_failure;
 
        if (!dst) {
+route_lookup:
                dst = ip6_route_output(net, NULL, fl6);
 
                if (dst->error)
@@ -1053,6 +1082,10 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
                        dst = NULL;
                        goto tx_err_link_failure;
                }
+               if (t->parms.collect_md &&
+                   ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+                                      &fl6->daddr, 0, &fl6->saddr))
+                       goto tx_err_link_failure;
                ndst = dst;
        }
 
@@ -1071,7 +1104,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
        }
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;
-       if (skb_dst(skb))
+       if (skb_dst(skb) && !t->parms.collect_md)
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
        if (skb->len > mtu && !skb_is_gso(skb)) {
                *pmtu = mtu;
@@ -1111,8 +1144,13 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
                skb = new_skb;
        }
 
-       if (!fl6->flowi6_mark && ndst)
-               dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
+       if (t->parms.collect_md) {
+               if (t->encap.type != TUNNEL_ENCAP_NONE)
+                       goto tx_err_dst_release;
+       } else {
+               if (!fl6->flowi6_mark && ndst)
+                       dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
+       }
        skb_dst_set(skb, dst);
 
        if (encap_limit >= 0) {
@@ -1137,7 +1175,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
        ipv6h = ipv6_hdr(skb);
        ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
                     ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
-       ipv6h->hop_limit = t->parms.hop_limit;
+       ipv6h->hop_limit = hop_limit;
        ipv6h->nexthdr = proto;
        ipv6h->saddr = fl6->saddr;
        ipv6h->daddr = fl6->daddr;
@@ -1170,19 +1208,34 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
        if (tproto != IPPROTO_IPIP && tproto != 0)
                return -1;
 
-       if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
-               encap_limit = t->parms.encap_limit;
+       dsfield = ipv4_get_dsfield(iph);
 
-       memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-       fl6.flowi6_proto = IPPROTO_IPIP;
+       if (t->parms.collect_md) {
+               struct ip_tunnel_info *tun_info;
+               const struct ip_tunnel_key *key;
 
-       dsfield = ipv4_get_dsfield(iph);
+               tun_info = skb_tunnel_info(skb);
+               if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+                            ip_tunnel_info_af(tun_info) != AF_INET6))
+                       return -1;
+               key = &tun_info->key;
+               memset(&fl6, 0, sizeof(fl6));
+               fl6.flowi6_proto = IPPROTO_IPIP;
+               fl6.daddr = key->u.ipv6.dst;
+               fl6.flowlabel = key->label;
+       } else {
+               if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+                       encap_limit = t->parms.encap_limit;
 
-       if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
-               fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
-                                         & IPV6_TCLASS_MASK;
-       if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
-               fl6.flowi6_mark = skb->mark;
+               memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+               fl6.flowi6_proto = IPPROTO_IPIP;
+
+               if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+                       fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
+                                        & IPV6_TCLASS_MASK;
+               if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+                       fl6.flowi6_mark = skb->mark;
+       }
 
        if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
                return -1;
@@ -1220,29 +1273,47 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
            ip6_tnl_addr_conflict(t, ipv6h))
                return -1;
 
-       offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
-       if (offset > 0) {
-               struct ipv6_tlv_tnl_enc_lim *tel;
-               tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
-               if (tel->encap_limit == 0) {
-                       icmpv6_send(skb, ICMPV6_PARAMPROB,
-                                   ICMPV6_HDR_FIELD, offset + 2);
+       dsfield = ipv6_get_dsfield(ipv6h);
+
+       if (t->parms.collect_md) {
+               struct ip_tunnel_info *tun_info;
+               const struct ip_tunnel_key *key;
+
+               tun_info = skb_tunnel_info(skb);
+               if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+                            ip_tunnel_info_af(tun_info) != AF_INET6))
                        return -1;
+               key = &tun_info->key;
+               memset(&fl6, 0, sizeof(fl6));
+               fl6.flowi6_proto = IPPROTO_IPV6;
+               fl6.daddr = key->u.ipv6.dst;
+               fl6.flowlabel = key->label;
+       } else {
+               offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
+               if (offset > 0) {
+                       struct ipv6_tlv_tnl_enc_lim *tel;
+
+                       tel = (void *)&skb_network_header(skb)[offset];
+                       if (tel->encap_limit == 0) {
+                               icmpv6_send(skb, ICMPV6_PARAMPROB,
+                                           ICMPV6_HDR_FIELD, offset + 2);
+                               return -1;
+                       }
+                       encap_limit = tel->encap_limit - 1;
+               } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
+                       encap_limit = t->parms.encap_limit;
                }
-               encap_limit = tel->encap_limit - 1;
-       } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
-               encap_limit = t->parms.encap_limit;
 
-       memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-       fl6.flowi6_proto = IPPROTO_IPV6;
+               memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+               fl6.flowi6_proto = IPPROTO_IPV6;
 
-       dsfield = ipv6_get_dsfield(ipv6h);
-       if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
-               fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
-       if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
-               fl6.flowlabel |= ip6_flowlabel(ipv6h);
-       if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
-               fl6.flowi6_mark = skb->mark;
+               if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+                       fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK);
+               if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+                       fl6.flowlabel |= ip6_flowlabel(ipv6h);
+               if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+                       fl6.flowi6_mark = skb->mark;
+       }
 
        if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
                return -1;
@@ -1741,6 +1812,10 @@ static int ip6_tnl_dev_init(struct net_device *dev)
        if (err)
                return err;
        ip6_tnl_link_config(t);
+       if (t->parms.collect_md) {
+               dev->features |= NETIF_F_NETNS_LOCAL;
+               netif_keep_dst(dev);
+       }
        return 0;
 }
 
@@ -1811,6 +1886,9 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
 
        if (data[IFLA_IPTUN_PROTO])
                parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+
+       if (data[IFLA_IPTUN_COLLECT_METADATA])
+               parms->collect_md = true;
 }
 
 static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[],
@@ -1850,6 +1928,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
                           struct nlattr *tb[], struct nlattr *data[])
 {
        struct net *net = dev_net(dev);
+       struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
        struct ip6_tnl *nt, *t;
        struct ip_tunnel_encap ipencap;
 
@@ -1864,9 +1943,14 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
 
        ip6_tnl_netlink_parms(data, &nt->parms);
 
-       t = ip6_tnl_locate(net, &nt->parms, 0);
-       if (!IS_ERR(t))
-               return -EEXIST;
+       if (nt->parms.collect_md) {
+               if (rtnl_dereference(ip6n->collect_md_tun))
+                       return -EEXIST;
+       } else {
+               t = ip6_tnl_locate(net, &nt->parms, 0);
+               if (!IS_ERR(t))
+                       return -EEXIST;
+       }
 
        return ip6_tnl_create2(dev);
 }
@@ -1890,6 +1974,8 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
                        return err;
        }
        ip6_tnl_netlink_parms(data, &p);
+       if (p.collect_md)
+               return -EINVAL;
 
        t = ip6_tnl_locate(net, &p, 0);
        if (!IS_ERR(t)) {
@@ -1937,6 +2023,8 @@ static size_t ip6_tnl_get_size(const struct net_device *dev)
                nla_total_size(2) +
                /* IFLA_IPTUN_ENCAP_DPORT */
                nla_total_size(2) +
+               /* IFLA_IPTUN_COLLECT_METADATA */
+               nla_total_size(0) +
                0;
 }
 
@@ -1955,16 +2043,15 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
            nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto))
                goto nla_put_failure;
 
-       if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
-                       tunnel->encap.type) ||
-       nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
-                    tunnel->encap.sport) ||
-       nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
-                    tunnel->encap.dport) ||
-       nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
-                   tunnel->encap.flags))
+       if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) ||
+           nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) ||
+           nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) ||
+           nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags))
                goto nla_put_failure;
 
+       if (parm->collect_md)
+               if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
+                       goto nla_put_failure;
        return 0;
 
 nla_put_failure:
@@ -1992,6 +2079,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
        [IFLA_IPTUN_ENCAP_FLAGS]        = { .type = NLA_U16 },
        [IFLA_IPTUN_ENCAP_SPORT]        = { .type = NLA_U16 },
        [IFLA_IPTUN_ENCAP_DPORT]        = { .type = NLA_U16 },
+       [IFLA_IPTUN_COLLECT_METADATA]   = { .type = NLA_FLAG },
 };
 
 static struct rtnl_link_ops ip6_link_ops __read_mostly = {