net: Implement read-only protection and COW'ing of metrics.
author David S. Miller <davem@davemloft.net>
Thu, 27 Jan 2011 04:51:05 +0000 (20:51 -0800)
committer David S. Miller <davem@davemloft.net>
Thu, 27 Jan 2011 04:51:05 +0000 (20:51 -0800)
Routing metrics are now copy-on-write.

Initially a route entry points its metrics at a read-only location.
If a routing table entry exists, it will point there.  Else it will
point at the all zero metric place-holder called 'dst_default_metrics'.

The writeability state of the metrics is stored in the low bits of the
metrics pointer; we have two bits left to spare if we want to store
more states.

For the initial implementation, COW is implemented simply via kmalloc.
However future enhancements will change this to place the writable
metrics somewhere else, in order to increase sharing.  Very likely
this "somewhere else" will be the inetpeer cache.

Note also that this means that metrics updates may transiently fail
if we cannot COW the metrics successfully.

But even by itself, this patch should decrease memory usage and
increase cache locality especially for routing workloads.  In those
cases the read-only metric copies stay in place and never get written
to.

TCP workloads where metrics get updated, and those rare cases where
PMTU triggers occur, will take a very slight performance hit.  But
that hit will be alleviated when the long-term writable metrics
move to a more sharable location.

Since the metrics storage went from a u32 array of RTAX_MAX entries to
what is essentially a pointer, some retooling of the dst_entry layout
was necessary.

Most importantly, we need to preserve the alignment of the reference
count so that it doesn't share cache lines with the read-mostly state,
as per Eric Dumazet's alignment assertion checks.

The only non-trivial bit here is the move of the 'flags' member into
the writeable cacheline.  This is OK since we always access the
flags around the same moment that we modify the reference count.

Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/dst.h
include/net/dst_ops.h
include/net/route.h
net/core/dst.c
net/decnet/dn_route.c
net/ipv4/route.c
net/ipv4/xfrm4_policy.c
net/ipv6/route.c
net/ipv6/xfrm6_policy.c

index be5a0d4c491d98eb1c2123364a137c5c954a4152..94a8c234ea2a24488443d6f1f4d664da839e9ad4 100644 (file)
@@ -40,24 +40,10 @@ struct dst_entry {
        struct rcu_head         rcu_head;
        struct dst_entry        *child;
        struct net_device       *dev;
-       short                   error;
-       short                   obsolete;
-       int                     flags;
-#define DST_HOST               0x0001
-#define DST_NOXFRM             0x0002
-#define DST_NOPOLICY           0x0004
-#define DST_NOHASH             0x0008
-#define DST_NOCACHE            0x0010
+       struct  dst_ops         *ops;
+       unsigned long           _metrics;
        unsigned long           expires;
-
-       unsigned short          header_len;     /* more space at head required */
-       unsigned short          trailer_len;    /* space to reserve at tail */
-
-       unsigned int            rate_tokens;
-       unsigned long           rate_last;      /* rate limiting for ICMP */
-
        struct dst_entry        *path;
-
        struct neighbour        *neighbour;
        struct hh_cache         *hh;
 #ifdef CONFIG_XFRM
@@ -68,17 +54,16 @@ struct dst_entry {
        int                     (*input)(struct sk_buff*);
        int                     (*output)(struct sk_buff*);
 
-       struct  dst_ops         *ops;
-
-       u32                     _metrics[RTAX_MAX];
-
+       short                   error;
+       short                   obsolete;
+       unsigned short          header_len;     /* more space at head required */
+       unsigned short          trailer_len;    /* space to reserve at tail */
 #ifdef CONFIG_IP_ROUTE_CLASSID
        __u32                   tclassid;
 #else
        __u32                   __pad2;
 #endif
 
-
        /*
         * Align __refcnt to a 64 bytes alignment
         * (L1_CACHE_SIZE would be too much)
@@ -93,6 +78,14 @@ struct dst_entry {
        atomic_t                __refcnt;       /* client references    */
        int                     __use;
        unsigned long           lastuse;
+       unsigned long           rate_last;      /* rate limiting for ICMP */
+       unsigned int            rate_tokens;
+       int                     flags;
+#define DST_HOST               0x0001
+#define DST_NOXFRM             0x0002
+#define DST_NOPOLICY           0x0004
+#define DST_NOHASH             0x0008
+#define DST_NOCACHE            0x0010
        union {
                struct dst_entry        *next;
                struct rtable __rcu     *rt_next;
@@ -103,10 +96,69 @@ struct dst_entry {
 
 #ifdef __KERNEL__
 
+extern u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old);
+
+#define DST_METRICS_READ_ONLY  0x1UL
+#define __DST_METRICS_PTR(Y)   \
+       ((u32 *)((Y) & ~DST_METRICS_READ_ONLY))
+#define DST_METRICS_PTR(X)     __DST_METRICS_PTR((X)->_metrics)
+
+static inline bool dst_metrics_read_only(const struct dst_entry *dst)
+{
+       return dst->_metrics & DST_METRICS_READ_ONLY;
+}
+
+extern void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old);
+
+static inline void dst_destroy_metrics_generic(struct dst_entry *dst)
+{
+       unsigned long val = dst->_metrics;
+       if (!(val & DST_METRICS_READ_ONLY))
+               __dst_destroy_metrics_generic(dst, val);
+}
+
+static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst)
+{
+       unsigned long p = dst->_metrics;
+
+       if (p & DST_METRICS_READ_ONLY)
+               return dst->ops->cow_metrics(dst, p);
+       return __DST_METRICS_PTR(p);
+}
+
+/* This may only be invoked before the entry has reached global
+ * visibility.
+ */
+static inline void dst_init_metrics(struct dst_entry *dst,
+                                   const u32 *src_metrics,
+                                   bool read_only)
+{
+       dst->_metrics = ((unsigned long) src_metrics) |
+               (read_only ? DST_METRICS_READ_ONLY : 0);
+}
+
+static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src)
+{
+       u32 *dst_metrics = dst_metrics_write_ptr(dest);
+
+       if (dst_metrics) {
+               u32 *src_metrics = DST_METRICS_PTR(src);
+
+               memcpy(dst_metrics, src_metrics, RTAX_MAX * sizeof(u32));
+       }
+}
+
+static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
+{
+       return DST_METRICS_PTR(dst);
+}
+
 static inline u32
 dst_metric_raw(const struct dst_entry *dst, const int metric)
 {
-       return dst->_metrics[metric-1];
+       u32 *p = DST_METRICS_PTR(dst);
+
+       return p[metric-1];
 }
 
 static inline u32
@@ -131,22 +183,10 @@ dst_metric_advmss(const struct dst_entry *dst)
 
 static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
 {
-       dst->_metrics[metric-1] = val;
-}
-
-static inline void dst_import_metrics(struct dst_entry *dst, const u32 *src_metrics)
-{
-       memcpy(dst->_metrics, src_metrics, RTAX_MAX * sizeof(u32));
-}
+       u32 *p = dst_metrics_write_ptr(dst);
 
-static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src)
-{
-       dst_import_metrics(dest, src->_metrics);
-}
-
-static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
-{
-       return dst->_metrics;
+       if (p)
+               p[metric-1] = val;
 }
 
 static inline u32
index 21a320b8708eb1b987c5d981b4b14ff684123715..dc0746328947f3b259c093093f4068d98a934d5a 100644 (file)
@@ -18,6 +18,7 @@ struct dst_ops {
        struct dst_entry *      (*check)(struct dst_entry *, __u32 cookie);
        unsigned int            (*default_advmss)(const struct dst_entry *);
        unsigned int            (*default_mtu)(const struct dst_entry *);
+       u32 *                   (*cow_metrics)(struct dst_entry *, unsigned long);
        void                    (*destroy)(struct dst_entry *);
        void                    (*ifdown)(struct dst_entry *,
                                          struct net_device *dev, int how);
index 93e10c453f6b8a08cc12dd3a70cb86d6473a4b19..5677cbf0c6e659f58cf3c7b9befed68be2e15f0e 100644 (file)
@@ -49,6 +49,7 @@
 
 struct fib_nh;
 struct inet_peer;
+struct fib_info;
 struct rtable {
        struct dst_entry        dst;
 
@@ -69,6 +70,7 @@ struct rtable {
        /* Miscellaneous cached information */
        __be32                  rt_spec_dst; /* RFC1122 specific destination */
        struct inet_peer        *peer; /* long-living peer info */
+       struct fib_info         *fi; /* for client ref to shared metrics */
 };
 
 static inline bool rt_is_input_route(struct rtable *rt)
index b99c7c7ffce2b72b0089ba8c53d4de6e5b131059..578893505702fd13aec07c5ea5c0c1b335e2a834 100644 (file)
@@ -164,6 +164,8 @@ int dst_discard(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(dst_discard);
 
+static const u32 dst_default_metrics[RTAX_MAX];
+
 void *dst_alloc(struct dst_ops *ops)
 {
        struct dst_entry *dst;
@@ -180,6 +182,7 @@ void *dst_alloc(struct dst_ops *ops)
        dst->lastuse = jiffies;
        dst->path = dst;
        dst->input = dst->output = dst_discard;
+       dst_init_metrics(dst, dst_default_metrics, true);
 #if RT_CACHE_DEBUG >= 2
        atomic_inc(&dst_total);
 #endif
@@ -282,6 +285,42 @@ void dst_release(struct dst_entry *dst)
 }
 EXPORT_SYMBOL(dst_release);
 
+u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+       u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
+
+       if (p) {
+               u32 *old_p = __DST_METRICS_PTR(old);
+               unsigned long prev, new;
+
+               memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+               new = (unsigned long) p;
+               prev = cmpxchg(&dst->_metrics, old, new);
+
+               if (prev != old) {
+                       kfree(p);
+                       p = __DST_METRICS_PTR(prev);
+                       if (prev & DST_METRICS_READ_ONLY)
+                               p = NULL;
+               }
+       }
+       return p;
+}
+EXPORT_SYMBOL(dst_cow_metrics_generic);
+
+/* Caller asserts that dst_metrics_read_only(dst) is false.  */
+void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+       unsigned long prev, new;
+
+       new = (unsigned long) dst_default_metrics;
+       prev = cmpxchg(&dst->_metrics, old, new);
+       if (prev == old)
+               kfree(__DST_METRICS_PTR(old));
+}
+EXPORT_SYMBOL(__dst_destroy_metrics_generic);
+
 /**
  * skb_dst_set_noref - sets skb dst, without a reference
  * @skb: buffer
index 5e636365d33cb2e99b1077b59ce9e14c4b5c7c02..42c9c62d3417a1c988b67160f5577d53c138bce6 100644 (file)
@@ -112,6 +112,7 @@ static int dn_dst_gc(struct dst_ops *ops);
 static struct dst_entry *dn_dst_check(struct dst_entry *, __u32);
 static unsigned int dn_dst_default_advmss(const struct dst_entry *dst);
 static unsigned int dn_dst_default_mtu(const struct dst_entry *dst);
+static void dn_dst_destroy(struct dst_entry *);
 static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
 static void dn_dst_link_failure(struct sk_buff *);
 static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu);
@@ -133,11 +134,18 @@ static struct dst_ops dn_dst_ops = {
        .check =                dn_dst_check,
        .default_advmss =       dn_dst_default_advmss,
        .default_mtu =          dn_dst_default_mtu,
+       .cow_metrics =          dst_cow_metrics_generic,
+       .destroy =              dn_dst_destroy,
        .negative_advice =      dn_dst_negative_advice,
        .link_failure =         dn_dst_link_failure,
        .update_pmtu =          dn_dst_update_pmtu,
 };
 
+static void dn_dst_destroy(struct dst_entry *dst)
+{
+       dst_destroy_metrics_generic(dst);
+}
+
 static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
 {
        __u16 tmp = (__u16 __force)(src ^ dst);
@@ -814,14 +822,14 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
 {
        struct dn_fib_info *fi = res->fi;
        struct net_device *dev = rt->dst.dev;
+       unsigned int mss_metric;
        struct neighbour *n;
-       unsigned int metric;
 
        if (fi) {
                if (DN_FIB_RES_GW(*res) &&
                    DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
                        rt->rt_gateway = DN_FIB_RES_GW(*res);
-               dst_import_metrics(&rt->dst, fi->fib_metrics);
+               dst_init_metrics(&rt->dst, fi->fib_metrics, true);
        }
        rt->rt_type = res->type;
 
@@ -834,10 +842,10 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
 
        if (dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu)
                dst_metric_set(&rt->dst, RTAX_MTU, rt->dst.dev->mtu);
-       metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS);
-       if (metric) {
+       mss_metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS);
+       if (mss_metric) {
                unsigned int mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst));
-               if (metric > mss)
+               if (mss_metric > mss)
                        dst_metric_set(&rt->dst, RTAX_ADVMSS, mss);
        }
        return 0;
index 3e5b7cc2db4fefbbfd682856588c8eb2fcc4ddc6..980030d4e4ae9161b3a1a9edd971e57e341e0338 100644 (file)
@@ -152,6 +152,36 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 {
 }
 
+static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+       u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
+
+       if (p) {
+               u32 *old_p = __DST_METRICS_PTR(old);
+               unsigned long prev, new;
+
+               memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+               new = (unsigned long) p;
+               prev = cmpxchg(&dst->_metrics, old, new);
+
+               if (prev != old) {
+                       kfree(p);
+                       p = __DST_METRICS_PTR(prev);
+                       if (prev & DST_METRICS_READ_ONLY)
+                               p = NULL;
+               } else {
+                       struct rtable *rt = (struct rtable *) dst;
+
+                       if (rt->fi) {
+                               fib_info_put(rt->fi);
+                               rt->fi = NULL;
+                       }
+               }
+       }
+       return p;
+}
+
 static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
@@ -159,6 +189,7 @@ static struct dst_ops ipv4_dst_ops = {
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .default_mtu =          ipv4_default_mtu,
+       .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
@@ -1441,6 +1472,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 
                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);
+                               if (rt->fi)
+                                       atomic_inc(&rt->fi->fib_clntref);
 
                                if (arp_bind_neighbour(&rt->dst) ||
                                    !(rt->dst.neighbour->nud_state &
@@ -1720,6 +1753,11 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
        struct rtable *rt = (struct rtable *) dst;
        struct inet_peer *peer = rt->peer;
 
+       dst_destroy_metrics_generic(dst);
+       if (rt->fi) {
+               fib_info_put(rt->fi);
+               rt->fi = NULL;
+       }
        if (peer) {
                rt->peer = NULL;
                inet_putpeer(peer);
@@ -1824,7 +1862,9 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
                if (FIB_RES_GW(*res) &&
                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
                        rt->rt_gateway = FIB_RES_GW(*res);
-               dst_import_metrics(dst, fi->fib_metrics);
+               rt->fi = fi;
+               atomic_inc(&fi->fib_clntref);
+               dst_init_metrics(dst, fi->fib_metrics, true);
 #ifdef CONFIG_IP_ROUTE_CLASSID
                dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
 #endif
@@ -2752,6 +2792,9 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
                rt->peer = ort->peer;
                if (rt->peer)
                        atomic_inc(&rt->peer->refcnt);
+               rt->fi = ort->fi;
+               if (rt->fi)
+                       atomic_inc(&rt->fi->fib_clntref);
 
                dst_free(new);
        }
index b057d40addec3ef2ffab44413421e502894fe60e..19fbdec6baaa1d36b0501335271e263e1f84ad30 100644 (file)
@@ -196,8 +196,11 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
 {
        struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 
+       dst_destroy_metrics_generic(dst);
+
        if (likely(xdst->u.rt.peer))
                inet_putpeer(xdst->u.rt.peer);
+
        xfrm_dst_destroy(xdst);
 }
 
@@ -215,6 +218,7 @@ static struct dst_ops xfrm4_dst_ops = {
        .protocol =             cpu_to_be16(ETH_P_IP),
        .gc =                   xfrm4_garbage_collect,
        .update_pmtu =          xfrm4_update_pmtu,
+       .cow_metrics =          dst_cow_metrics_generic,
        .destroy =              xfrm4_dst_destroy,
        .ifdown =               xfrm4_dst_ifdown,
        .local_out =            __ip_local_out,
index 1534508f6c68a3c4f010657e94051e06a7d727c4..45fafa018f12f410850a5a5335f27e590c8fa65d 100644 (file)
@@ -105,6 +105,7 @@ static struct dst_ops ip6_dst_ops_template = {
        .check                  =       ip6_dst_check,
        .default_advmss         =       ip6_default_advmss,
        .default_mtu            =       ip6_default_mtu,
+       .cow_metrics            =       dst_cow_metrics_generic,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
@@ -125,6 +126,10 @@ static struct dst_ops ip6_dst_blackhole_ops = {
        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
 };
 
+static const u32 ip6_template_metrics[RTAX_MAX] = {
+       [RTAX_HOPLIMIT - 1] = 255,
+};
+
 static struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
@@ -193,6 +198,7 @@ static void ip6_dst_destroy(struct dst_entry *dst)
                rt->rt6i_idev = NULL;
                in6_dev_put(idev);
        }
+       dst_destroy_metrics_generic(dst);
        if (peer) {
                BUG_ON(!(rt->rt6i_flags & RTF_CACHE));
                rt->rt6i_peer = NULL;
@@ -2681,7 +2687,8 @@ static int __net_init ip6_route_net_init(struct net *net)
        net->ipv6.ip6_null_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_null_entry;
        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
-       dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
+       dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
+                        ip6_template_metrics, true);
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
@@ -2692,7 +2699,8 @@ static int __net_init ip6_route_net_init(struct net *net)
        net->ipv6.ip6_prohibit_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
-       dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
+       dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
+                        ip6_template_metrics, true);
 
        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
@@ -2702,7 +2710,8 @@ static int __net_init ip6_route_net_init(struct net *net)
        net->ipv6.ip6_blk_hole_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
-       dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
+       dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
+                        ip6_template_metrics, true);
 #endif
 
        net->ipv6.sysctl.flush_delay = 0;
index da87428681cccc9d955fdc80ee6f8bd4998356dd..834dc02f1d4f91307d84750cd254625fc25ebec2 100644 (file)
@@ -220,6 +220,7 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
 
        if (likely(xdst->u.rt6.rt6i_idev))
                in6_dev_put(xdst->u.rt6.rt6i_idev);
+       dst_destroy_metrics_generic(dst);
        if (likely(xdst->u.rt6.rt6i_peer))
                inet_putpeer(xdst->u.rt6.rt6i_peer);
        xfrm_dst_destroy(xdst);
@@ -257,6 +258,7 @@ static struct dst_ops xfrm6_dst_ops = {
        .protocol =             cpu_to_be16(ETH_P_IPV6),
        .gc =                   xfrm6_garbage_collect,
        .update_pmtu =          xfrm6_update_pmtu,
+       .cow_metrics =          dst_cow_metrics_generic,
        .destroy =              xfrm6_dst_destroy,
        .ifdown =               xfrm6_dst_ifdown,
        .local_out =            __ip6_local_out,