net: avoid a pair of dst_hold()/dst_release() in ip_append_data()
author Eric Dumazet <dada1@cosmosbay.com>
Mon, 24 Nov 2008 23:52:46 +0000 (15:52 -0800)
committer David S. Miller <davem@davemloft.net>
Mon, 24 Nov 2008 23:52:46 +0000 (15:52 -0800)
We can reduce the pressure on the dst entry refcount that slows down the
UDP transmit path on SMP machines. This pressure is visible on RTP servers
when delivering content to media gateways, especially big ones handling
thousands of streams. Several CPUs send UDP frames to the same
destination, and hence use the same dst entry.

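This patch lets ip_append_data() steal, where possible, the reference its
callers had to take on the dst entry instead of taking an extra one with
dst_hold(). When the reference is stolen, the caller's rtable pointer is
set to NULL so that the caller does not release it.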

This doesn't avoid all refcounting, but it still gives speedups on SMP
on the UDP/RAW transmit path.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/ip.h
net/ipv4/icmp.c
net/ipv4/ip_output.c
net/ipv4/raw.c
net/ipv4/udp.c

index bc026ecb513f7c57f1d836017f4a02b6d8e72a57..ddef10c22e3a0411922349dae31b08192da72081 100644 (file)
@@ -110,7 +110,7 @@ extern int          ip_append_data(struct sock *sk,
                                                   int odd, struct sk_buff *skb),
                                void *from, int len, int protolen,
                                struct ipcm_cookie *ipc,
-                               struct rtable *rt,
+                               struct rtable **rt,
                                unsigned int flags);
 extern int             ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb);
 extern ssize_t         ip_append_page(struct sock *sk, struct page *page,
index 21e497efbd7ff7bdde5bc8b401a62cb3ce1c68e1..7b88be9803b13fd6c2c53576b14672cba49f91b5 100644 (file)
@@ -321,12 +321,12 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
 }
 
 static void icmp_push_reply(struct icmp_bxm *icmp_param,
-                           struct ipcm_cookie *ipc, struct rtable *rt)
+                           struct ipcm_cookie *ipc, struct rtable **rt)
 {
        struct sock *sk;
        struct sk_buff *skb;
 
-       sk = icmp_sk(dev_net(rt->u.dst.dev));
+       sk = icmp_sk(dev_net((*rt)->u.dst.dev));
        if (ip_append_data(sk, icmp_glue_bits, icmp_param,
                           icmp_param->data_len+icmp_param->head_len,
                           icmp_param->head_len,
@@ -392,7 +392,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
        }
        if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
                               icmp_param->data.icmph.code))
-               icmp_push_reply(icmp_param, &ipc, rt);
+               icmp_push_reply(icmp_param, &ipc, &rt);
        ip_rt_put(rt);
 out_unlock:
        icmp_xmit_unlock(sk);
@@ -635,7 +635,7 @@ route_done:
                icmp_param.data_len = room;
        icmp_param.head_len = sizeof(struct icmphdr);
 
-       icmp_push_reply(&icmp_param, &ipc, rt);
+       icmp_push_reply(&icmp_param, &ipc, &rt);
 ende:
        ip_rt_put(rt);
 out_unlock:
index 46d7be233eac9132e6e023c492b36b535e9bb1df..5516825a0751e7e9b6e0ede5ff3b19f809a8ff2e 100644 (file)
@@ -778,7 +778,7 @@ int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
-                  struct ipcm_cookie *ipc, struct rtable *rt,
+                  struct ipcm_cookie *ipc, struct rtable **rtp,
                   unsigned int flags)
 {
        struct inet_sock *inet = inet_sk(sk);
@@ -793,6 +793,7 @@ int ip_append_data(struct sock *sk,
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;
+       struct rtable *rt;
 
        if (flags&MSG_PROBE)
                return 0;
@@ -812,7 +813,11 @@ int ip_append_data(struct sock *sk,
                        inet->cork.flags |= IPCORK_OPT;
                        inet->cork.addr = ipc->addr;
                }
-               dst_hold(&rt->u.dst);
+               rt = *rtp;
+               /*
+                * We steal reference to this route, caller should not release it
+                */
+               *rtp = NULL;
                inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
                                            rt->u.dst.dev->mtu :
                                            dst_mtu(rt->u.dst.path);
@@ -1391,7 +1396,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
        sk->sk_protocol = ip_hdr(skb)->protocol;
        sk->sk_bound_dev_if = arg->bound_dev_if;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
-                      &ipc, rt, MSG_DONTWAIT);
+                      &ipc, &rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((__sum16 *)skb_transport_header(skb) +
index 998fcffc9e15eb7de9ac8c1a516b6035853ce6fb..dff8bc4e0facd1cc804397bfc7e9c581962ae7c9 100644 (file)
@@ -572,7 +572,7 @@ back_from_confirm:
                        ipc.addr = rt->rt_dst;
                lock_sock(sk);
                err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
-                                       &ipc, rt, msg->msg_flags);
+                                       &ipc, &rt, msg->msg_flags);
                if (err)
                        ip_flush_pending_frames(sk);
                else if (!(msg->msg_flags & MSG_MORE))
index da869ce041d9c6a9a5b149d13eddafda55db993a..549114472db3f1d468e0bc6b85a3c2c4f7fc46bf 100644 (file)
@@ -719,7 +719,7 @@ do_append_data:
        up->len += ulen;
        getfrag  =  is_udplite ?  udplite_getfrag : ip_generic_getfrag;
        err = ip_append_data(sk, getfrag, msg->msg_iov, ulen,
-                       sizeof(struct udphdr), &ipc, rt,
+                       sizeof(struct udphdr), &ipc, &rt,
                        corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
        if (err)
                udp_flush_pending_frames(sk);