ipv4: fix fnhe usage by non-cached routes
authorJulian Anastasov <ja@ssi.bg>
Wed, 2 May 2018 06:41:19 +0000 (09:41 +0300)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 19 May 2018 08:20:23 +0000 (10:20 +0200)
[ Upstream commit 94720e3aee6884d8c8beb678001629da60ec6366 ]

Allow some non-cached routes to use non-expired fnhe:

1. ip_del_fnhe: moved above and now called by find_exception.
The 4.5+ commit deed49df7390 expires fnhe only when caching
routes. Change that to:

1.1. use fnhe for non-cached local output routes, with the help
from (2)

1.2. allow __mkroute_input to detect expired fnhe (outdated
fnhe_gw, for example) when do_cache is false, eg. when itag!=0
for unicast destinations.

2. __mkroute_output: keep fi to allow local routes with orig_oif != 0
to use fnhe info even when the new route will not be cached into fnhe.
After commit 839da4d98960 ("net: ipv4: set orig_oif based on fib
result for local traffic") it means all local routes will be affected
because they are not cached. This change is used to solve a PMTU
problem with IPVS (and probably Netfilter DNAT) setups that redirect
local clients from target local IP (local route to Virtual IP)
to new remote IP target, eg. IPVS TUN real server. Loopback has
64K MTU and we need to create fnhe on the local route that will
keep the reduced PMTU for the Virtual IP. Without this change
fnhe_pmtu is updated from ICMP but never exposed to non-cached
local routes. This includes routes with flowi4_oif!=0 for 4.6+ and
with flowi4_oif=any for 4.14+).

3. update_or_create_fnhe: make sure fnhe_expires is not 0 for
new entries

Fixes: 839da4d98960 ("net: ipv4: set orig_oif based on fib result for local traffic")
Fixes: d6d5e999e5df ("route: do not cache fib route info on local routes with oif")
Fixes: deed49df7390 ("route: check and remove route cache when we get route")
Cc: David Ahern <dsahern@gmail.com>
Cc: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
net/ipv4/route.c

index 5ea559f8c456464b13628a899c4c3023173671fc..28bc3a98adc719558cdad71969df1bcd21e87e87 100644 (file)
@@ -711,7 +711,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
-               fnhe->fnhe_expires = expires;
+               fnhe->fnhe_expires = max(1UL, expires);
 
                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
@@ -1286,6 +1286,36 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
 }
 
+static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
+{
+       struct fnhe_hash_bucket *hash;
+       struct fib_nh_exception *fnhe, __rcu **fnhe_p;
+       u32 hval = fnhe_hashfun(daddr);
+
+       spin_lock_bh(&fnhe_lock);
+
+       hash = rcu_dereference_protected(nh->nh_exceptions,
+                                        lockdep_is_held(&fnhe_lock));
+       hash += hval;
+
+       fnhe_p = &hash->chain;
+       fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
+       while (fnhe) {
+               if (fnhe->fnhe_daddr == daddr) {
+                       rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
+                               fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
+                       fnhe_flush_routes(fnhe);
+                       kfree_rcu(fnhe, rcu);
+                       break;
+               }
+               fnhe_p = &fnhe->fnhe_next;
+               fnhe = rcu_dereference_protected(fnhe->fnhe_next,
+                                                lockdep_is_held(&fnhe_lock));
+       }
+
+       spin_unlock_bh(&fnhe_lock);
+}
+
 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 {
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
@@ -1299,8 +1329,14 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 
        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
-               if (fnhe->fnhe_daddr == daddr)
+               if (fnhe->fnhe_daddr == daddr) {
+                       if (fnhe->fnhe_expires &&
+                           time_after(jiffies, fnhe->fnhe_expires)) {
+                               ip_del_fnhe(nh, daddr);
+                               break;
+                       }
                        return fnhe;
+               }
        }
        return NULL;
 }
@@ -1620,36 +1656,6 @@ static void ip_handle_martian_source(struct net_device *dev,
 #endif
 }
 
-static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
-{
-       struct fnhe_hash_bucket *hash;
-       struct fib_nh_exception *fnhe, __rcu **fnhe_p;
-       u32 hval = fnhe_hashfun(daddr);
-
-       spin_lock_bh(&fnhe_lock);
-
-       hash = rcu_dereference_protected(nh->nh_exceptions,
-                                        lockdep_is_held(&fnhe_lock));
-       hash += hval;
-
-       fnhe_p = &hash->chain;
-       fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
-       while (fnhe) {
-               if (fnhe->fnhe_daddr == daddr) {
-                       rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
-                               fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
-                       fnhe_flush_routes(fnhe);
-                       kfree_rcu(fnhe, rcu);
-                       break;
-               }
-               fnhe_p = &fnhe->fnhe_next;
-               fnhe = rcu_dereference_protected(fnhe->fnhe_next,
-                                                lockdep_is_held(&fnhe_lock));
-       }
-
-       spin_unlock_bh(&fnhe_lock);
-}
-
 static void set_lwt_redirect(struct rtable *rth)
 {
        if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
@@ -1716,20 +1722,10 @@ static int __mkroute_input(struct sk_buff *skb,
 
        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
        if (do_cache) {
-               if (fnhe) {
+               if (fnhe)
                        rth = rcu_dereference(fnhe->fnhe_rth_input);
-                       if (rth && rth->dst.expires &&
-                           time_after(jiffies, rth->dst.expires)) {
-                               ip_del_fnhe(&FIB_RES_NH(*res), daddr);
-                               fnhe = NULL;
-                       } else {
-                               goto rt_cache;
-                       }
-               }
-
-               rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
-
-rt_cache:
+               else
+                       rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        goto out;
@@ -2206,39 +2202,31 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
                 * be set to the loopback interface as well.
                 */
-               fi = NULL;
+               do_cache = false;
        }
 
        fnhe = NULL;
        do_cache &= fi != NULL;
-       if (do_cache) {
+       if (fi) {
                struct rtable __rcu **prth;
                struct fib_nh *nh = &FIB_RES_NH(*res);
 
                fnhe = find_exception(nh, fl4->daddr);
+               if (!do_cache)
+                       goto add;
                if (fnhe) {
                        prth = &fnhe->fnhe_rth_output;
-                       rth = rcu_dereference(*prth);
-                       if (rth && rth->dst.expires &&
-                           time_after(jiffies, rth->dst.expires)) {
-                               ip_del_fnhe(nh, fl4->daddr);
-                               fnhe = NULL;
-                       } else {
-                               goto rt_cache;
+               } else {
+                       if (unlikely(fl4->flowi4_flags &
+                                    FLOWI_FLAG_KNOWN_NH &&
+                                    !(nh->nh_gw &&
+                                      nh->nh_scope == RT_SCOPE_LINK))) {
+                               do_cache = false;
+                               goto add;
                        }
+                       prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
                }
-
-               if (unlikely(fl4->flowi4_flags &
-                            FLOWI_FLAG_KNOWN_NH &&
-                            !(nh->nh_gw &&
-                              nh->nh_scope == RT_SCOPE_LINK))) {
-                       do_cache = false;
-                       goto add;
-               }
-               prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
                rth = rcu_dereference(*prth);
-
-rt_cache:
                if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
                        return rth;
        }