ipv4: Add FIB nexthop exceptions.
authorDavid S. Miller <davem@davemloft.net>
Tue, 17 Jul 2012 11:19:00 +0000 (04:19 -0700)
committerDavid S. Miller <davem@davemloft.net>
Tue, 17 Jul 2012 15:48:50 +0000 (08:48 -0700)
In a regime where we have subnetted route entries, we need a way to
keep persistent state about destination-specific learned values
such as redirects and PMTU values.

This is implemented here via nexthop exceptions.

The initial implementation is a 2048 entry hash table with reclaiming
starting at chain length 5.  A more sophisticated scheme can be
devised if that proves necessary.

Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/ip_fib.h
net/ipv4/fib_semantics.c
net/ipv4/route.c

index 5697acefeba38e5351c7c962d6816e639d59cd49..e9ee1ca0708729529e25bafd63a51c6184e786b7 100644 (file)
@@ -18,6 +18,7 @@
 
 #include <net/flow.h>
 #include <linux/seq_file.h>
+#include <linux/rcupdate.h>
 #include <net/fib_rules.h>
 #include <net/inetpeer.h>
 
@@ -46,6 +47,22 @@ struct fib_config {
 
 struct fib_info;
 
+/*
+ * Per-destination exception learned for a nexthop: a redirect gateway
+ * and/or a path MTU.  Entries are chained off a struct fnhe_hash_bucket.
+ */
+struct fib_nh_exception {
+       struct fib_nh_exception __rcu   *fnhe_next;     /* next entry in the same bucket chain */
+       __be32                          fnhe_daddr;     /* destination address this entry is keyed on */
+       u32                             fnhe_pmtu;      /* learned path MTU; 0 means none recorded */
+       __be32                          fnhe_gw;        /* redirect gateway, network byte order; 0 means none */
+       unsigned long                   fnhe_expires;   /* jiffies value at which fnhe_pmtu stops being valid */
+       unsigned long                   fnhe_stamp;     /* jiffies of last lookup/update, used to pick a reclaim victim */
+};
+
+struct fnhe_hash_bucket {      /* one slot of a nexthop's exception hash table */
+       struct fib_nh_exception __rcu   *chain; /* head of this slot's singly linked chain of exceptions */
+};
+
+#define FNHE_HASH_SIZE         2048
+#define FNHE_RECLAIM_DEPTH     5
+
 struct fib_nh {
        struct net_device       *nh_dev;
        struct hlist_node       nh_hash;
@@ -63,6 +80,7 @@ struct fib_nh {
        __be32                  nh_gw;
        __be32                  nh_saddr;
        int                     nh_saddr_genid;
+       struct fnhe_hash_bucket *nh_exceptions;
 };
 
 /*
index d71bfbdc0bf4374da21c7fcd3d7817adf90a0b5f..1e09852df512471333d34552b0c32e0ed28251d3 100644 (file)
@@ -140,6 +140,27 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
        },
 };
 
+/*
+ * Free every exception entry hanging off @nh, then the hash table itself.
+ *
+ * This runs from free_fib_info_rcu() with no rcu_read_lock() held, so the
+ * chains are walked with rcu_dereference_protected() instead of
+ * rcu_dereference().  The caller must have checked that
+ * nh->nh_exceptions is non-NULL.
+ */
+static void free_nh_exceptions(struct fib_nh *nh)
+{
+       struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+       int i;
+
+       for (i = 0; i < FNHE_HASH_SIZE; i++) {
+               struct fib_nh_exception *fnhe;
+
+               fnhe = rcu_dereference_protected(hash[i].chain, 1);
+               while (fnhe) {
+                       struct fib_nh_exception *next;
+
+                       /* Fetch the successor before the entry is freed. */
+                       next = rcu_dereference_protected(fnhe->fnhe_next, 1);
+                       kfree(fnhe);
+
+                       fnhe = next;
+               }
+       }
+       kfree(hash);
+}
+
 /* Release a nexthop info record */
 static void free_fib_info_rcu(struct rcu_head *head)
 {
@@ -148,6 +169,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
        change_nexthops(fi) {
                if (nexthop_nh->nh_dev)
                        dev_put(nexthop_nh->nh_dev);
+               if (nexthop_nh->nh_exceptions)
+                       free_nh_exceptions(nexthop_nh);
        } endfor_nexthops(fi);
 
        release_net(fi->fib_net);
index b35d3bfc66cd66e59466ff743d10d1150194186f..a5bd0b4acc614d020a9bff3a2e53dbc846c37ff5 100644 (file)
@@ -1275,14 +1275,130 @@ static void rt_del(unsigned int hash, struct rtable *rt)
        spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
-static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
+/*
+ * Initialise @fl4 as an output flow key for the addresses in @iph.
+ *
+ * When @sk is non-NULL the socket's bound device, mark, connection TOS
+ * flags and protocol replace the @oif, @mark, @tos and @prot arguments;
+ * @flow_flags is always taken from the caller.
+ */
+static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
+                            const struct iphdr *iph,
+                            int oif, u8 tos,
+                            u8 prot, u32 mark, int flow_flags)
+{
+       if (sk != NULL) {
+               oif = sk->sk_bound_dev_if;
+               mark = sk->sk_mark;
+               tos = RT_CONN_FLAGS(sk);
+               /* Header-included sockets are keyed as IPPROTO_RAW. */
+               prot = inet_sk(sk)->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
+       }
+
+       flowi4_init_output(fl4, oif, mark, tos, RT_SCOPE_UNIVERSE, prot,
+                          flow_flags, iph->daddr, iph->saddr, 0, 0);
+}
+
+/*
+ * Fill @fl4 from the IP header and metadata of @skb.  @sk may be NULL;
+ * when it is not, __build_flow_key() takes oif, mark, tos and protocol
+ * from the socket instead of from the packet.
+ */
+static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk)
+{
+       const struct iphdr *iph = ip_hdr(skb);
+
+       __build_flow_key(fl4, sk, iph, skb->dev->ifindex, RT_TOS(iph->tos),
+                        iph->protocol, skb->mark, 0);
+}
+
+/*
+ * Fill @fl4 purely from socket state.  If the socket carries a source
+ * route option, the option's first-hop address is used as the flow
+ * destination instead of the socket's destination address.
+ */
+static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
+{
+       const struct inet_sock *inet = inet_sk(sk);
+       const struct ip_options_rcu *opts;
+       __be32 dst;
+
+       /* inet_opt is RCU-managed: keep the read section open while in use. */
+       rcu_read_lock();
+       opts = rcu_dereference(inet->inet_opt);
+       dst = (opts && opts->opt.srr) ? opts->opt.faddr : inet->inet_daddr;
+       flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+                          RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+                          inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+                          inet_sk_flowi_flags(sk),
+                          dst, inet->inet_saddr, 0, 0);
+       rcu_read_unlock();
+}
+
+/*
+ * Build a flow key from @skb when one is supplied, otherwise from @sk
+ * alone.
+ */
+static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
+                                struct sk_buff *skb)
+{
+       if (!skb) {
+               build_sk_flow_key(fl4, sk);
+               return;
+       }
+       build_skb_flow_key(fl4, skb, sk);
+}
+
+static DEFINE_SPINLOCK(fnhe_lock);
+
+/*
+ * Return the entry with the earliest fnhe_stamp in the chain of bucket
+ * @hash.  The chain must not be empty: its head is dereferenced
+ * unconditionally.  @daddr is not used.
+ */
+static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
+{
+       struct fib_nh_exception *victim = rcu_dereference(hash->chain);
+       struct fib_nh_exception *cur = rcu_dereference(victim->fnhe_next);
+
+       while (cur) {
+               if (time_before(cur->fnhe_stamp, victim->fnhe_stamp))
+                       victim = cur;
+               cur = rcu_dereference(cur->fnhe_next);
+       }
+       return victim;
+}
+
+/*
+ * Look up the exception entry for @daddr on nexthop @nh, creating one if
+ * none exists.  The per-nexthop hash table is allocated on first use.
+ *
+ * When the bucket chain holds more than FNHE_RECLAIM_DEPTH entries, the
+ * least recently stamped entry of that chain is recycled for @daddr
+ * instead of allocating a new one.
+ *
+ * Returns the entry with fnhe_stamp refreshed, or NULL if a GFP_ATOMIC
+ * allocation fails.  The callers in this file hold fnhe_lock around the
+ * call.
+ */
+static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
+{
+       struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+       struct fib_nh_exception *fnhe;
+       int depth;
+       u32 hval;
+
+       if (!hash) {
+               hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
+                                                  GFP_ATOMIC);
+               if (!hash)
+                       return NULL;
+       }
+
+       hval = (__force u32) daddr;
+       hval ^= (hval >> 11) ^ (hval >> 22);
+       /* Fold into the table; FNHE_HASH_SIZE is a power of two. */
+       hval &= FNHE_HASH_SIZE - 1;
+       hash += hval;
+
+       depth = 0;
+       for (fnhe = rcu_dereference(hash->chain); fnhe;
+            fnhe = rcu_dereference(fnhe->fnhe_next)) {
+               if (fnhe->fnhe_daddr == daddr)
+                       goto out;
+               depth++;
+       }
+
+       if (depth > FNHE_RECLAIM_DEPTH) {
+               /* "hash" already points at the bucket; do not offset it again. */
+               fnhe = fnhe_oldest(hash, daddr);
+               /* Drop the values learned for the entry's previous destination. */
+               fnhe->fnhe_pmtu = 0;
+               fnhe->fnhe_gw = 0;
+               fnhe->fnhe_expires = 0;
+               goto out_daddr;
+       }
+       fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+       if (!fnhe)
+               return NULL;
+
+       /* Link the new entry fully before publishing it at the chain head. */
+       fnhe->fnhe_next = hash->chain;
+       rcu_assign_pointer(hash->chain, fnhe);
+
+out_daddr:
+       fnhe->fnhe_daddr = daddr;
+out:
+       fnhe->fnhe_stamp = jiffies;
+       return fnhe;
+}
+
+static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
 {
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
+       struct fib_result res;
        struct neighbour *n;
-       struct rtable *rt;
        struct net *net;
 
        switch (icmp_hdr(skb)->code & 7) {
@@ -1296,7 +1412,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
                return;
        }
 
-       rt = (struct rtable *) dst;
        if (rt->rt_gateway != old_gw)
                return;
 
@@ -1320,11 +1435,21 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
                        goto reject_redirect;
        }
 
-       n = ipv4_neigh_lookup(dst, NULL, &new_gw);
+       n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
+                       if (fib_lookup(net, fl4, &res) == 0) {
+                               struct fib_nh *nh = &FIB_RES_NH(res);
+                               struct fib_nh_exception *fnhe;
+
+                               spin_lock_bh(&fnhe_lock);
+                               fnhe = find_or_create_fnhe(nh, fl4->daddr);
+                               if (fnhe)
+                                       fnhe->fnhe_gw = new_gw;
+                               spin_unlock_bh(&fnhe_lock);
+                       }
                        rt->rt_gateway = new_gw;
                        rt->rt_flags |= RTCF_REDIRECTED;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
@@ -1349,6 +1474,17 @@ reject_redirect:
        ;
 }
 
+/*
+ * Redirect handler taking a dst_entry: build the flow key from @skb (or
+ * from @sk when @skb is NULL) and hand it to __ip_do_redirect().
+ */
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
+{
+       struct rtable *rt = (struct rtable *) dst;
+       struct flowi4 key;
+
+       ip_rt_build_flow_key(&key, sk, skb);
+       __ip_do_redirect(rt, skb, &key);
+}
+
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 {
        struct rtable *rt = (struct rtable *)dst;
@@ -1508,33 +1644,51 @@ out:    kfree_skb(skb);
        return 0;
 }
 
-static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
-                             struct sk_buff *skb, u32 mtu)
+static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
-       struct rtable *rt = (struct rtable *) dst;
-
-       dst_confirm(dst);
+       struct fib_result res;
 
        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;
 
+       if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
+               struct fib_nh *nh = &FIB_RES_NH(res);
+               struct fib_nh_exception *fnhe;
+
+               spin_lock_bh(&fnhe_lock);
+               fnhe = find_or_create_fnhe(nh, fl4->daddr);
+               if (fnhe) {
+                       fnhe->fnhe_pmtu = mtu;
+                       fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
+               }
+               spin_unlock_bh(&fnhe_lock);
+       }
        rt->rt_pmtu = mtu;
        dst_set_expires(&rt->dst, ip_rt_mtu_expires);
 }
 
+/*
+ * PMTU update handler taking a dst_entry: build the flow key from @skb
+ * (or from @sk when @skb is NULL) and hand it to __ip_rt_update_pmtu().
+ */
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+                             struct sk_buff *skb, u32 mtu)
+{
+       struct flowi4 key;
+
+       ip_rt_build_flow_key(&key, sk, skb);
+       __ip_rt_update_pmtu((struct rtable *) dst, &key, mtu);
+}
+
 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
 {
-       const struct iphdr *iph = (const struct iphdr *)skb->data;
+       const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
 
-       flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
-                          protocol, flow_flags,
-                          iph->daddr, iph->saddr, 0, 0);
+       __build_flow_key(&fl4, NULL, iph, oif,
+                        RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
-               ip_rt_update_pmtu(&rt->dst, NULL, skb, mtu);
+               __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
 }
@@ -1542,27 +1696,31 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
 
 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 {
-       const struct inet_sock *inet = inet_sk(sk);
+       const struct iphdr *iph = (const struct iphdr *) skb->data;
+       struct flowi4 fl4;
+       struct rtable *rt;
 
-       return ipv4_update_pmtu(skb, sock_net(sk), mtu,
-                               sk->sk_bound_dev_if, sk->sk_mark,
-                               inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
-                               inet_sk_flowi_flags(sk));
+       __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+       rt = __ip_route_output_key(sock_net(sk), &fl4);
+       if (!IS_ERR(rt)) {
+               __ip_rt_update_pmtu(rt, &fl4, mtu);
+               ip_rt_put(rt);
+       }
 }
 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 
 void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
 {
-       const struct iphdr *iph = (const struct iphdr *)skb->data;
+       const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
 
-       flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
-                          protocol, flow_flags, iph->daddr, iph->saddr, 0, 0);
+       __build_flow_key(&fl4, NULL, iph, oif,
+                        RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
-               ip_do_redirect(&rt->dst, NULL, skb);
+               __ip_do_redirect(rt, skb, &fl4);
                ip_rt_put(rt);
        }
 }
@@ -1570,12 +1728,16 @@ EXPORT_SYMBOL_GPL(ipv4_redirect);
 
 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
 {
-       const struct inet_sock *inet = inet_sk(sk);
+       const struct iphdr *iph = (const struct iphdr *) skb->data;
+       struct flowi4 fl4;
+       struct rtable *rt;
 
-       return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
-                            sk->sk_mark,
-                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
-                            inet_sk_flowi_flags(sk));
+       __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+       rt = __ip_route_output_key(sock_net(sk), &fl4);
+       if (!IS_ERR(rt)) {
+               __ip_do_redirect(rt, skb, &fl4);
+               ip_rt_put(rt);
+       }
 }
 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
 
@@ -1722,14 +1884,46 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
        dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 }
 
+/*
+ * Apply any exception recorded on nexthop @nh for @daddr to route @rt.
+ *
+ * A still-valid learned PMTU is copied into rt->rt_pmtu and the route's
+ * expiry is set to the exception's remaining lifetime; a learned gateway
+ * replaces rt->rt_gateway.  A matching entry has its fnhe_stamp refreshed.
+ * The caller must have checked that nh->nh_exceptions is non-NULL.
+ */
+static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
+{
+       struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+       struct fib_nh_exception *fnhe;
+       u32 hval;
+
+       hval = (__force u32) daddr;
+       hval ^= (hval >> 11) ^ (hval >> 22);
+       /* Fold into the table; FNHE_HASH_SIZE is a power of two. */
+       hval &= FNHE_HASH_SIZE - 1;
+
+       for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
+            fnhe = rcu_dereference(fnhe->fnhe_next)) {
+               if (fnhe->fnhe_daddr == daddr) {
+                       if (fnhe->fnhe_pmtu) {
+                               unsigned long expires = fnhe->fnhe_expires;
+                               unsigned long now = jiffies;
+
+                               if (time_before(now, expires)) {
+                                       rt->rt_pmtu = fnhe->fnhe_pmtu;
+                                       /* dst_set_expires() takes a timeout relative to now. */
+                                       dst_set_expires(&rt->dst, expires - now);
+                               }
+                       }
+                       if (fnhe->fnhe_gw)
+                               rt->rt_gateway = fnhe->fnhe_gw;
+                       fnhe->fnhe_stamp = jiffies;
+                       break;
+               }
+       }
+}
+
 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
                           const struct fib_result *res,
                           struct fib_info *fi, u16 type, u32 itag)
 {
        if (fi) {
-               if (FIB_RES_GW(*res) &&
-                   FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
-                       rt->rt_gateway = FIB_RES_GW(*res);
+               struct fib_nh *nh = &FIB_RES_NH(*res);
+
+               if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
+                       rt->rt_gateway = nh->nh_gw;
+               if (unlikely(nh->nh_exceptions))
+                       rt_bind_exception(rt, nh, fl4->daddr);
                rt_init_metrics(rt, fl4, fi);
 #ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;