add DOVE extensions for VXLAN
author David Stevens <dlstevens@us.ibm.com>
Tue, 20 Nov 2012 02:50:14 +0000 (02:50 +0000)
committer David S. Miller <davem@davemloft.net>
Tue, 20 Nov 2012 18:41:28 +0000 (13:41 -0500)
This patch provides extensions to VXLAN for supporting Distributed
Overlay Virtual Ethernet (DOVE) networks. The patch includes:

+ a dove flag per VXLAN device to enable DOVE extensions
+ ARP reduction, whereby a bridge-connected VXLAN tunnel endpoint
answers ARP requests from the local bridge on behalf of
remote DOVE clients
+ route short-circuiting (aka L3 switching). Known destination IP
addresses use the corresponding destination MAC address for
switching rather than going to a (possibly remote) router first.
+ netlink notification messages for forwarding table and L3 switching
misses

Changes since v2
- combined bools into "u32 flags"
- replaced loop with !is_zero_ether_addr()

Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/vxlan.c
include/uapi/linux/if_link.h

index a14df1ce99ffb7defc3504392a0254d833bf5670..ce77b8b693ae40809660bfa85313b99530eb068b 100644 (file)
@@ -29,6 +29,8 @@
 #include <linux/etherdevice.h>
 #include <linux/if_ether.h>
 #include <linux/hash.h>
+#include <net/arp.h>
+#include <net/ndisc.h>
 #include <net/ip.h>
 #include <net/icmp.h>
 #include <net/udp.h>
@@ -110,7 +112,7 @@ struct vxlan_dev {
        __u16             port_max;
        __u8              tos;          /* TOS override */
        __u8              ttl;
-       bool              learn;
+       u32               flags;        /* VXLAN_F_* below */
 
        unsigned long     age_interval;
        struct timer_list age_timer;
@@ -121,6 +123,12 @@ struct vxlan_dev {
        struct hlist_head fdb_head[FDB_HASH_SIZE];
 };
 
+#define VXLAN_F_LEARN  0x01    /* rx path calls vxlan_snoop() on inner source */
+#define VXLAN_F_PROXY  0x02    /* tx path hands ARP frames to arp_reduce() */
+#define VXLAN_F_RSC    0x04    /* tx path tries route_shortcircuit() on IPv4 */
+#define VXLAN_F_L2MISS 0x08    /* vxlan_fdb_miss() on unknown unicast dest MAC */
+#define VXLAN_F_L3MISS 0x10    /* vxlan_ip_miss() on failed neighbour lookup */
+
 /* salt for hash table */
 static u32 vxlan_salt __read_mostly;
 
@@ -154,6 +162,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
        struct nda_cacheinfo ci;
        struct nlmsghdr *nlh;
        struct ndmsg *ndm;
+       bool send_ip, send_eth;
 
        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
        if (nlh == NULL)
@@ -161,16 +170,24 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 
        ndm = nlmsg_data(nlh);
        memset(ndm, 0, sizeof(*ndm));
-       ndm->ndm_family = AF_BRIDGE;
+
+       send_eth = send_ip = true;
+
+       if (type == RTM_GETNEIGH) {
+               ndm->ndm_family = AF_INET;
+               send_ip = fdb->remote_ip != 0;
+               send_eth = !is_zero_ether_addr(fdb->eth_addr);
+       } else
+               ndm->ndm_family = AF_BRIDGE;
        ndm->ndm_state = fdb->state;
        ndm->ndm_ifindex = vxlan->dev->ifindex;
        ndm->ndm_flags = NTF_SELF;
        ndm->ndm_type = NDA_DST;
 
-       if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
+       if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
                goto nla_put_failure;
 
-       if (nla_put_be32(skb, NDA_DST, fdb->remote_ip))
+       if (send_ip && nla_put_be32(skb, NDA_DST, fdb->remote_ip))
                goto nla_put_failure;
 
        ci.ndm_used      = jiffies_to_clock_t(now - fdb->used);
@@ -222,6 +239,29 @@ errout:
                rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
 }
 
+/* Report an L3 miss for @ipa on @dev via an RTM_GETNEIGH notification.
+ *
+ * A scratch fdb entry is built with only the IP address and state set.
+ * For RTM_GETNEIGH, vxlan_fdb_info() omits NDA_LLADDR when the MAC is
+ * all-zero, which is the case for this zeroed entry.
+ */
+static void vxlan_ip_miss(struct net_device *dev, __be32 ipa)
+{
+       struct vxlan_fdb miss;
+
+       memset(&miss, 0, sizeof(miss));
+       miss.remote_ip = ipa; /* goes to NDA_DST */
+       miss.state = NUD_STALE;
+
+       vxlan_fdb_notify(netdev_priv(dev), &miss, RTM_GETNEIGH);
+}
+
+/* Report an L2 (forwarding table) miss for @eth_addr via RTM_GETNEIGH.
+ *
+ * A scratch fdb entry is built with only the MAC address and state set.
+ * For RTM_GETNEIGH, vxlan_fdb_info() omits NDA_DST when remote_ip is 0,
+ * which is the case for this zeroed entry.
+ */
+static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
+{
+       struct vxlan_fdb miss;
+
+       memset(&miss, 0, sizeof(miss));
+       memcpy(miss.eth_addr, eth_addr, ETH_ALEN);
+       miss.state = NUD_STALE;
+
+       vxlan_fdb_notify(vxlan, &miss, RTM_GETNEIGH);
+}
+
 /* Hash Ethernet address */
 static u32 eth_hash(const unsigned char *addr)
 {
@@ -551,6 +591,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
                goto drop;
        }
 
+       skb_reset_mac_header(skb);
+
        /* Re-examine inner Ethernet packet */
        oip = ip_hdr(skb);
        skb->protocol = eth_type_trans(skb, vxlan->dev);
@@ -560,7 +602,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
                               vxlan->dev->dev_addr) == 0)
                goto drop;
 
-       if (vxlan->learn)
+       if (vxlan->flags & VXLAN_F_LEARN)
                vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);
 
        __skb_tunnel_rx(skb, vxlan->dev);
@@ -599,6 +641,117 @@ drop:
        return 0;
 }
 
+/* Answer an ARP request on behalf of a known neighbour (ARP reduction).
+ *
+ * Called from vxlan_xmit() for ARP frames when VXLAN_F_PROXY is set.
+ * If the target IP resolves in the ARP table to a connected neighbour
+ * whose fdb entry is not bridge-local, an ARP reply is built and fed
+ * back through netif_rx_ni().  If the target IP is unknown and
+ * VXLAN_F_L3MISS is set, an L3 miss notification is sent instead.
+ *
+ * The request @skb is always consumed; the return value is always
+ * NETDEV_TX_OK.
+ */
+static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
+{
+       struct vxlan_dev *vxlan = netdev_priv(dev);
+       struct arphdr *parp;
+       u8 *arpptr, *sha;
+       __be32 sip, tip;
+       struct neighbour *n;
+
+       if (dev->flags & IFF_NOARP)
+               goto out;
+
+       if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
+               dev->stats.tx_dropped++;
+               goto out;
+       }
+       parp = arp_hdr(skb);
+
+       /* only IPv4-over-Ethernet/802 ARP requests are handled here */
+       if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
+            parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
+           parp->ar_pro != htons(ETH_P_IP) ||
+           parp->ar_op != htons(ARPOP_REQUEST) ||
+           parp->ar_hln != dev->addr_len ||
+           parp->ar_pln != 4)
+               goto out;
+
+       /* walk the ARP payload: sha, sip, tha, tip */
+       arpptr = (u8 *)parp + sizeof(struct arphdr);
+       sha = arpptr;
+       arpptr += dev->addr_len;        /* sha */
+       memcpy(&sip, arpptr, sizeof(sip));
+       arpptr += sizeof(sip);
+       arpptr += dev->addr_len;        /* tha */
+       memcpy(&tip, arpptr, sizeof(tip));
+
+       if (ipv4_is_loopback(tip) ||
+           ipv4_is_multicast(tip))
+               goto out;
+
+       n = neigh_lookup(&arp_tbl, &tip, dev);
+
+       if (n) {
+               struct vxlan_fdb *f;
+               struct sk_buff  *reply;
+
+               if (!(n->nud_state & NUD_CONNECTED)) {
+                       neigh_release(n);
+                       goto out;
+               }
+
+               f = vxlan_find_mac(vxlan, n->ha);
+               if (f && f->remote_ip == 0) {
+                       /* bridge-local neighbor */
+                       neigh_release(n);
+                       goto out;
+               }
+
+               reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
+                               n->ha, sha);
+
+               neigh_release(n);
+
+               /* arp_create() returns NULL if the reply cannot be allocated */
+               if (reply == NULL)
+                       goto out;
+
+               skb_reset_mac_header(reply);
+               __skb_pull(reply, skb_network_offset(reply));
+               reply->ip_summed = CHECKSUM_UNNECESSARY;
+               reply->pkt_type = PACKET_HOST;
+
+               if (netif_rx_ni(reply) == NET_RX_DROP)
+                       dev->stats.rx_dropped++;
+       } else if (vxlan->flags & VXLAN_F_L3MISS)
+               vxlan_ip_miss(dev, tip);
+out:
+       consume_skb(skb);
+       return NETDEV_TX_OK;
+}
+
+/* Route short-circuiting (L3 switching) for a unicast IPv4 frame.
+ *
+ * Looks up the IPv4 destination in the ARP table.  If a neighbour is found
+ * and its hardware address differs from the frame's destination MAC, the
+ * old destination MAC becomes the source MAC and the neighbour's address
+ * becomes the new destination.  If no neighbour is found and
+ * VXLAN_F_L3MISS is set, an L3 miss notification is sent.
+ *
+ * Returns true only when the Ethernet header was rewritten.
+ */
+static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
+{
+       struct vxlan_dev *vxlan = netdev_priv(dev);
+       struct neighbour *neigh;
+       struct ethhdr *eth;
+       struct iphdr *iph;
+       bool rewritten;
+
+       if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
+               return false;
+
+       /* IPv4 is the only protocol handled */
+       if (ntohs(eth_hdr(skb)->h_proto) != ETH_P_IP)
+               return false;
+
+       if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+               return false;
+
+       iph = ip_hdr(skb);
+       neigh = neigh_lookup(&arp_tbl, &iph->daddr, dev);
+       if (!neigh) {
+               if (vxlan->flags & VXLAN_F_L3MISS)
+                       vxlan_ip_miss(dev, iph->daddr);
+               return false;
+       }
+
+       /* header pointer is taken only after the pull above */
+       eth = eth_hdr(skb);
+       rewritten = compare_ether_addr(eth->h_dest, neigh->ha) != 0;
+       if (rewritten) {
+               memcpy(eth->h_source, eth->h_dest, dev->addr_len);
+               memcpy(eth->h_dest, neigh->ha, dev->addr_len);
+       }
+       neigh_release(neigh);
+
+       return rewritten;
+}
+
 /* Extract dsfield from inner protocol */
 static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
                                   const struct sk_buff *skb)
@@ -621,22 +774,6 @@ static inline u8 vxlan_ecn_encap(u8 tos,
        return INET_ECN_encapsulate(tos, inner);
 }
 
-static __be32 vxlan_find_dst(struct vxlan_dev *vxlan, struct sk_buff *skb)
-{
-       const struct ethhdr *eth = (struct ethhdr *) skb->data;
-       const struct vxlan_fdb *f;
-
-       if (is_multicast_ether_addr(eth->h_dest))
-               return vxlan->gaddr;
-
-       f = vxlan_find_mac(vxlan, eth->h_dest);
-       if (f)
-               return f->remote_ip;
-       else
-               return vxlan->gaddr;
-
-}
-
 static void vxlan_sock_free(struct sk_buff *skb)
 {
        sock_put(skb->sk);
@@ -683,6 +820,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct rtable *rt;
        const struct iphdr *old_iph;
+       struct ethhdr *eth;
        struct iphdr *iph;
        struct vxlanhdr *vxh;
        struct udphdr *uh;
@@ -693,10 +831,50 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
        __be16 df = 0;
        __u8 tos, ttl;
        int err;
+       bool did_rsc = false;
+       const struct vxlan_fdb *f;
+
+       skb_reset_mac_header(skb);
+       eth = eth_hdr(skb);
+
+       if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP)
+               return arp_reduce(dev, skb);
+       else if ((vxlan->flags&VXLAN_F_RSC) && ntohs(eth->h_proto) == ETH_P_IP)
+               did_rsc = route_shortcircuit(dev, skb);
 
-       dst = vxlan_find_dst(vxlan, skb);
-       if (!dst)
+       f = vxlan_find_mac(vxlan, eth->h_dest);
+       if (f == NULL) {
+               did_rsc = false;
+               dst = vxlan->gaddr;
+               if (!dst && (vxlan->flags & VXLAN_F_L2MISS) &&
+                   !is_multicast_ether_addr(eth->h_dest))
+                       vxlan_fdb_miss(vxlan, eth->h_dest);
+       } else
+               dst = f->remote_ip;
+
+       if (!dst) {
+               if (did_rsc) {
+                       __skb_pull(skb, skb_network_offset(skb));
+                       skb->ip_summed = CHECKSUM_NONE;
+                       skb->pkt_type = PACKET_HOST;
+
+                       /* short-circuited back to local bridge */
+                       if (netif_rx(skb) == NET_RX_SUCCESS) {
+                               struct vxlan_stats *stats =
+                                               this_cpu_ptr(vxlan->stats);
+
+                               u64_stats_update_begin(&stats->syncp);
+                               stats->tx_packets++;
+                               stats->tx_bytes += pkt_len;
+                               u64_stats_update_end(&stats->syncp);
+                       } else {
+                               dev->stats.tx_errors++;
+                               dev->stats.tx_aborted_errors++;
+                       }
+                       return NETDEV_TX_OK;
+               }
                goto drop;
+       }
 
        /* Need space for new headers (invalidates iph ptr) */
        if (skb_cow_head(skb, VXLAN_HEADROOM))
@@ -1019,6 +1197,10 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
        [IFLA_VXLAN_AGEING]     = { .type = NLA_U32 },
        [IFLA_VXLAN_LIMIT]      = { .type = NLA_U32 },
        [IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
+       [IFLA_VXLAN_PROXY]      = { .type = NLA_U8 },
+       [IFLA_VXLAN_RSC]        = { .type = NLA_U8 },
+       [IFLA_VXLAN_L2MISS]     = { .type = NLA_U8 },
+       [IFLA_VXLAN_L3MISS]     = { .type = NLA_U8 },
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -1114,13 +1296,25 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
                vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
 
        if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
-               vxlan->learn = true;
+               vxlan->flags |= VXLAN_F_LEARN;
 
        if (data[IFLA_VXLAN_AGEING])
                vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
        else
                vxlan->age_interval = FDB_AGE_DEFAULT;
 
+       if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
+               vxlan->flags |= VXLAN_F_PROXY;
+
+       if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
+               vxlan->flags |= VXLAN_F_RSC;
+
+       if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
+               vxlan->flags |= VXLAN_F_L2MISS;
+
+       if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
+               vxlan->flags |= VXLAN_F_L3MISS;
+
        if (data[IFLA_VXLAN_LIMIT])
                vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
 
@@ -1157,6 +1351,10 @@ static size_t vxlan_get_size(const struct net_device *dev)
                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL */
                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
                nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
+               nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
+               nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
+               nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L2MISS */
+               nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L3MISS */
                nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
                nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
                nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
@@ -1185,7 +1383,15 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 
        if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
            nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
-           nla_put_u8(skb, IFLA_VXLAN_LEARNING, vxlan->learn) ||
+           nla_put_u8(skb, IFLA_VXLAN_LEARNING,
+                       !!(vxlan->flags & VXLAN_F_LEARN)) ||
+           nla_put_u8(skb, IFLA_VXLAN_PROXY,
+                       !!(vxlan->flags & VXLAN_F_PROXY)) ||
+           nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
+           nla_put_u8(skb, IFLA_VXLAN_L2MISS,
+                       !!(vxlan->flags & VXLAN_F_L2MISS)) ||
+           nla_put_u8(skb, IFLA_VXLAN_L3MISS,
+                       !!(vxlan->flags & VXLAN_F_L3MISS)) ||
            nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
            nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax))
                goto nla_put_failure;
index 7aae0179ae446e5bd8aa25cfbae6e23bb955735f..bb58aeb7f34dedbcab1dd16ff8751cabe870b03e 100644 (file)
@@ -302,6 +302,10 @@ enum {
        IFLA_VXLAN_AGEING,
        IFLA_VXLAN_LIMIT,
        IFLA_VXLAN_PORT_RANGE,
+       IFLA_VXLAN_PROXY,
+       IFLA_VXLAN_RSC,
+       IFLA_VXLAN_L2MISS,
+       IFLA_VXLAN_L3MISS,
        __IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)