ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmtu
author Sabrina Dubroca <sd@queasysnail.net>
Wed, 14 Mar 2018 09:21:14 +0000 (10:21 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 30 May 2018 05:50:36 +0000 (07:50 +0200)
[ Upstream commit d52e5a7e7ca49457dd31fc8b42fb7c0d58a31221 ]

Prior to the rework of PMTU information storage in commit
2c8cec5c10bc ("ipv4: Cache learned PMTU information in inetpeer."),
when a PMTU event advertising a PMTU smaller than
net.ipv4.route.min_pmtu was received, we would disable setting the DF
flag on packets by locking the MTU metric, and set the PMTU to
net.ipv4.route.min_pmtu.

Since then, we don't disable DF, and set PMTU to
net.ipv4.route.min_pmtu, so the intermediate router that has this link
with a small MTU will have to drop the packets.

This patch reestablishes pre-2.6.39 behavior by splitting
rtable->rt_pmtu into a bitfield with rt_mtu_locked and rt_pmtu.
rt_mtu_locked indicates that we shouldn't set the DF bit on that path,
and is checked in ip_dont_fragment().

One possible workaround is to set net.ipv4.route.min_pmtu to a value low
enough to accommodate the lowest MTU encountered.

Fixes: 2c8cec5c10bc ("ipv4: Cache learned PMTU information in inetpeer.")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
include/net/ip.h
include/net/ip_fib.h
include/net/route.h
net/ipv4/route.c
net/ipv4/xfrm4_policy.c

index 0e3dcd5a134d8a7fcffaa465d92777d57cc56736..bc9b4deeb60e5e3af0cdc8d13a64052336710f60 100644 (file)
@@ -304,6 +304,13 @@ int ip_decrease_ttl(struct iphdr *iph)
        return --iph->ttl;
 }
 
+static inline int ip_mtu_locked(const struct dst_entry *dst)
+{
+       const struct rtable *rt = (const struct rtable *)dst;
+
+       return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU);
+}
+
 static inline
 int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
 {
@@ -311,7 +318,7 @@ int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
 
        return  pmtudisc == IP_PMTUDISC_DO ||
                (pmtudisc == IP_PMTUDISC_WANT &&
-                !(dst_metric_locked(dst, RTAX_MTU)));
+                !ip_mtu_locked(dst));
 }
 
 static inline bool ip_sk_accept_pmtu(const struct sock *sk)
@@ -337,7 +344,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
        struct net *net = dev_net(dst->dev);
 
        if (net->ipv4.sysctl_ip_fwd_use_pmtu ||
-           dst_metric_locked(dst, RTAX_MTU) ||
+           ip_mtu_locked(dst) ||
            !forwarding)
                return dst_mtu(dst);
 
index aa758280d8a8e61e35b00a421e1647ad83ae9908..978387d6c3e62affd4c340b057c1a913a6509d94 100644 (file)
@@ -57,6 +57,7 @@ struct fib_nh_exception {
        int                             fnhe_genid;
        __be32                          fnhe_daddr;
        u32                             fnhe_pmtu;
+       bool                            fnhe_mtu_locked;
        __be32                          fnhe_gw;
        unsigned long                   fnhe_expires;
        struct rtable __rcu             *fnhe_rth_input;
index 0429d47cad25c27eebe1a665e250d055f90d4e84..b8488efef920aba3742bb3d488fdfa0bd88b20dc 100644 (file)
@@ -63,7 +63,8 @@ struct rtable {
        __be32                  rt_gateway;
 
        /* Miscellaneous cached information */
-       u32                     rt_pmtu;
+       u32                     rt_mtu_locked:1,
+                               rt_pmtu:31;
 
        u32                     rt_table_id;
 
index 4c9fbf4f5905b57a7d6758438f8c116788b62b18..890141d32ab97b95cb3450ceae64f2a386afaa93 100644 (file)
@@ -618,6 +618,7 @@ static inline u32 fnhe_hashfun(__be32 daddr)
 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 {
        rt->rt_pmtu = fnhe->fnhe_pmtu;
+       rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;
 
        if (fnhe->fnhe_gw) {
@@ -628,7 +629,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh
 }
 
 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
-                                 u32 pmtu, unsigned long expires)
+                                 u32 pmtu, bool lock, unsigned long expires)
 {
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
@@ -665,8 +666,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
-               if (pmtu)
+               if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
+                       fnhe->fnhe_mtu_locked = lock;
+               }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
@@ -690,6 +693,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
+               fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = expires;
 
                /* Exception created; mark the cached routes for the nexthop
@@ -771,7 +775,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
                                struct fib_nh *nh = &FIB_RES_NH(res);
 
                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
-                                               0, jiffies + ip_rt_gc_timeout);
+                                               0, false,
+                                               jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
@@ -983,15 +988,18 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;
+       bool lock = false;
 
-       if (dst_metric_locked(dst, RTAX_MTU))
+       if (ip_mtu_locked(dst))
                return;
 
        if (ipv4_mtu(dst) < mtu)
                return;
 
-       if (mtu < ip_rt_min_pmtu)
+       if (mtu < ip_rt_min_pmtu) {
+               lock = true;
                mtu = ip_rt_min_pmtu;
+       }
 
        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
@@ -1001,7 +1009,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);
 
-               update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
+               update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
@@ -1256,7 +1264,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 
        mtu = READ_ONCE(dst->dev->mtu);
 
-       if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
+       if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }
@@ -1481,6 +1489,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
                rt->rt_is_input = 0;
                rt->rt_iif = 0;
                rt->rt_pmtu = 0;
+               rt->rt_mtu_locked = 0;
                rt->rt_gateway = 0;
                rt->rt_uses_gateway = 0;
                rt->rt_table_id = 0;
@@ -2403,6 +2412,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
                rt->rt_is_input = ort->rt_is_input;
                rt->rt_iif = ort->rt_iif;
                rt->rt_pmtu = ort->rt_pmtu;
+               rt->rt_mtu_locked = ort->rt_mtu_locked;
 
                rt->rt_genid = rt_genid_ipv4(net);
                rt->rt_flags = ort->rt_flags;
@@ -2505,6 +2515,8 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
        if (rt->rt_pmtu && expires)
                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
+       if (rt->rt_mtu_locked && expires)
+               metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;
 
index 6a7ff69575353f5242aa800023999bcc12823fee..622e158a6fc400bfcf0b444647ca30385eed168b 100644 (file)
@@ -97,6 +97,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
        xdst->u.rt.rt_gateway = rt->rt_gateway;
        xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
        xdst->u.rt.rt_pmtu = rt->rt_pmtu;
+       xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
        xdst->u.rt.rt_table_id = rt->rt_table_id;
        INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);