mpls: flow-based multipath selection
authorRobert Shearman <rshearma@brocade.com>
Fri, 23 Oct 2015 13:03:28 +0000 (06:03 -0700)
committerDavid S. Miller <davem@davemloft.net>
Fri, 23 Oct 2015 13:26:45 +0000 (06:26 -0700)
Change the selection of a multipath route to use a flow-based
hash. This more suitable for traffic sensitive to reordering within a
flow (e.g. TCP, L2VPN) and whilst still allowing a good distribution
of traffic given enough flows.

Selection of the path for a multipath route is done using a hash of:
1. Label stack up to MAX_MP_SELECT_LABELS labels or up to and
   including entropy label, whichever is first.
2. 3-tuple of (L3 src, L3 dst, proto) from IPv4/IPv6 header in MPLS
   payload, if present.

Naturally, a 5-tuple hash using L4 information in addition would be
possible and be better in some scenarios, but there is a tradeoff
between looking deeper into the packet to achieve good distribution,
and packet forwarding performance, and I have erred on the side of the
latter as the default.

Signed-off-by: Robert Shearman <rshearma@brocade.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
net/mpls/af_mpls.c

index ee3097a971852dcfe2a65ac47feeb1144dc3c95b..cc972e30355b600769051dc3841ba7fa582b42a5 100644 (file)
 #include <net/nexthop.h>
 #include "internal.h"
 
+/* Maximum number of labels to look ahead at when selecting a path of
+ * a multipath route
+ */
+#define MAX_MP_SELECT_LABELS 4
+
 static int zero = 0;
 static int label_limit = (1 << 20) - 1;
 
@@ -77,10 +82,78 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 }
 EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
 
-static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt)
+static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
+                                            struct sk_buff *skb, bool bos)
 {
-       /* assume single nexthop for now */
-       return &rt->rt_nh[0];
+       struct mpls_entry_decoded dec;
+       struct mpls_shim_hdr *hdr;
+       bool eli_seen = false;
+       int label_index;
+       int nh_index = 0;
+       u32 hash = 0;
+
+       /* No need to look further into packet if there's only
+        * one path
+        */
+       if (rt->rt_nhn == 1)
+               goto out;
+
+       for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos;
+            label_index++) {
+               if (!pskb_may_pull(skb, sizeof(*hdr) * label_index))
+                       break;
+
+               /* Read and decode the current label */
+               hdr = mpls_hdr(skb) + label_index;
+               dec = mpls_entry_decode(hdr);
+
+               /* RFC6790 - reserved labels MUST NOT be used as keys
+                * for the load-balancing function
+                */
+               if (likely(dec.label >= MPLS_LABEL_FIRST_UNRESERVED)) {
+                       hash = jhash_1word(dec.label, hash);
+
+                       /* The entropy label follows the entropy label
+                        * indicator, so this means that the entropy
+                        * label was just added to the hash - no need to
+                        * go any deeper either in the label stack or in the
+                        * payload
+                        */
+                       if (eli_seen)
+                               break;
+               } else if (dec.label == MPLS_LABEL_ENTROPY) {
+                       eli_seen = true;
+               }
+
+               bos = dec.bos;
+               if (bos && pskb_may_pull(skb, sizeof(*hdr) * label_index +
+                                        sizeof(struct iphdr))) {
+                       const struct iphdr *v4hdr;
+
+                       v4hdr = (const struct iphdr *)(mpls_hdr(skb) +
+                                                      label_index);
+                       if (v4hdr->version == 4) {
+                               hash = jhash_3words(ntohl(v4hdr->saddr),
+                                                   ntohl(v4hdr->daddr),
+                                                   v4hdr->protocol, hash);
+                       } else if (v4hdr->version == 6 &&
+                               pskb_may_pull(skb, sizeof(*hdr) * label_index +
+                                             sizeof(struct ipv6hdr))) {
+                               const struct ipv6hdr *v6hdr;
+
+                               v6hdr = (const struct ipv6hdr *)(mpls_hdr(skb) +
+                                                               label_index);
+
+                               hash = __ipv6_addr_jhash(&v6hdr->saddr, hash);
+                               hash = __ipv6_addr_jhash(&v6hdr->daddr, hash);
+                               hash = jhash_1word(v6hdr->nexthdr, hash);
+                       }
+               }
+       }
+
+       nh_index = hash % rt->rt_nhn;
+out:
+       return &rt->rt_nh[nh_index];
 }
 
 static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
@@ -175,7 +248,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
        if (!rt)
                goto drop;
 
-       nh = mpls_select_multipath(rt);
+       nh = mpls_select_multipath(rt, skb, dec.bos);
        if (!nh)
                goto drop;
 
@@ -541,6 +614,12 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
                if (!rtnh_ok(rtnh, remaining))
                        goto errout;
 
+               /* neither weighted multipath nor any flags
+                * are supported
+                */
+               if (rtnh->rtnh_hops || rtnh->rtnh_flags)
+                       goto errout;
+
                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *attrs = rtnh_attrs(rtnh);