tun: switch to use skb array for tx
author Jason Wang <jasowang@redhat.com>
Thu, 30 Jun 2016 06:45:36 +0000 (14:45 +0800)
committer David S. Miller <davem@davemloft.net>
Fri, 1 Jul 2016 09:32:17 +0000 (05:32 -0400)
We used to queue tx packets in sk_receive_queue. This is less
efficient, since it requires spinlocks to synchronize between producer
and consumer.

This patch tries to address this by:

- switching from sk_receive_queue to a skb_array, and resizing it when
  tx_queue_len is changed.
- introducing a new proto_ops method, peek_len, which is used for
  peeking at the skb length.
- implementing a tun version of peek_len for vhost_net to use, and
  converting vhost_net to use peek_len where it is available.

Pktgen test shows about 15.3% improvement on guest receiving pps for small
buffers:

Before: ~1300000pps
After : ~1500000pps

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/tun.c
drivers/vhost/net.c
include/linux/net.h

index 4884802e0af19a7567ba7fa74ca517ce70c42c2b..74752159ec344b38851c8eac1bce3435699fd90c 100644 (file)
@@ -71,6 +71,7 @@
 #include <net/sock.h>
 #include <linux/seq_file.h>
 #include <linux/uio.h>
+#include <linux/skb_array.h>
 
 #include <asm/uaccess.h>
 
@@ -167,6 +168,7 @@ struct tun_file {
        };
        struct list_head next;
        struct tun_struct *detached;
+       struct skb_array tx_array;
 };
 
 struct tun_flow_entry {
@@ -515,7 +517,11 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
 
 static void tun_queue_purge(struct tun_file *tfile)
 {
-       skb_queue_purge(&tfile->sk.sk_receive_queue);
+       struct sk_buff *skb; /* entry currently being dropped from the tx ring */
+
+       while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
+               kfree_skb(skb); /* tx packets now live in tx_array, so drain and free them one by one */
+
        skb_queue_purge(&tfile->sk.sk_error_queue);
 }
 
@@ -560,6 +566,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
                            tun->dev->reg_state == NETREG_REGISTERED)
                                unregister_netdevice(tun->dev);
                }
+               if (tun)
+                       skb_array_cleanup(&tfile->tx_array);
                sock_put(&tfile->sk);
        }
 }
@@ -613,6 +621,7 @@ static void tun_detach_all(struct net_device *dev)
 static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
 {
        struct tun_file *tfile = file->private_data;
+       struct net_device *dev = tun->dev;
        int err;
 
        err = security_tun_dev_attach(tfile->socket.sk, tun->security);
@@ -642,6 +651,13 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
                if (!err)
                        goto out;
        }
+
+       if (!tfile->detached &&
+           skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
+               err = -ENOMEM;
+               goto out;
+       }
+
        tfile->queue_index = tun->numqueues;
        tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
        rcu_assign_pointer(tfile->tun, tun);
@@ -891,8 +907,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 
        nf_reset(skb);
 
-       /* Enqueue packet */
-       skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
+       if (skb_array_produce(&tfile->tx_array, skb))
+               goto drop;
 
        /* Notify and wake up reader process */
        if (tfile->flags & TUN_FASYNC)
@@ -1107,7 +1123,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 
        poll_wait(file, sk_sleep(sk), wait);
 
-       if (!skb_queue_empty(&sk->sk_receive_queue))
+       if (!skb_array_empty(&tfile->tx_array))
                mask |= POLLIN | POLLRDNORM;
 
        if (sock_writeable(sk) ||
@@ -1426,22 +1442,61 @@ done:
        return total;
 }
 
+/*
+ * Take the next skb off @tfile's tx ring, optionally sleeping until one
+ * is produced.
+ *
+ * @tfile:   queue whose tx_array is consumed
+ * @noblock: non-zero to fail at once instead of sleeping on an empty ring
+ * @err:     written only when NULL is returned: -EAGAIN (ring empty and
+ *           @noblock set), -ERESTARTSYS (signal pending) or -EFAULT
+ *           (RCV_SHUTDOWN set on the socket)
+ *
+ * Returns the consumed skb, or NULL with *@err set.
+ */
+static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
+                                    int *err)
+{
+       DECLARE_WAITQUEUE(wait, current);
+       struct sk_buff *skb;
+
+       /* Fast path: something is already queued */
+       skb = skb_array_consume(&tfile->tx_array);
+       if (skb)
+               goto out;
+       if (noblock) {
+               *err = -EAGAIN;
+               goto out;
+       }
+
+       add_wait_queue(&tfile->wq.wait, &wait);
+
+       while (1) {
+               /*
+                * Re-arm the sleeping state on every pass and before the
+                * ring is examined.  A wakeup leaves the task in
+                * TASK_RUNNING; without re-arming, a wakeup that finds the
+                * ring empty would make schedule() return immediately and
+                * the loop would spin.  set_current_state() also orders the
+                * state store against the ring check below, so a wakeup
+                * racing with that check is not lost.
+                */
+               set_current_state(TASK_INTERRUPTIBLE);
+               skb = skb_array_consume(&tfile->tx_array);
+               if (skb)
+                       break;
+               if (signal_pending(current)) {
+                       *err = -ERESTARTSYS;
+                       break;
+               }
+               if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
+                       *err = -EFAULT;
+                       break;
+               }
+
+               schedule();
+       }
+
+       __set_current_state(TASK_RUNNING);
+       remove_wait_queue(&tfile->wq.wait, &wait);
+
+out:
+       return skb;
+}
+
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
                           struct iov_iter *to,
                           int noblock)
 {
        struct sk_buff *skb;
        ssize_t ret;
-       int peeked, err, off = 0;
+       int err;
 
        tun_debug(KERN_INFO, tun, "tun_do_read\n");
 
        if (!iov_iter_count(to))
                return 0;
 
-       /* Read frames from queue */
-       skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
-                                 &peeked, &off, &err);
+       /* Read frames from ring */
+       skb = tun_ring_recv(tfile, noblock, &err);
        if (!skb)
                return err;
 
@@ -1574,8 +1629,25 @@ out:
        return ret;
 }
 
+/*
+ * peek_len hook for tun sockets: report what skb_array_peek_len() returns
+ * for this queue's tx ring.  Returns 0 when __tun_get() yields no device
+ * for the file.
+ */
+static int tun_peek_len(struct socket *sock)
+{
+       struct tun_file *tfile = container_of(sock, struct tun_file, socket);
+       struct tun_struct *tun = __tun_get(tfile);
+       int len;
+
+       if (!tun)
+               return 0;
+
+       /* The reference taken by __tun_get() is held across the peek */
+       len = skb_array_peek_len(&tfile->tx_array);
+       tun_put(tun);
+
+       return len;
+}
+
 /* Ops structure to mimic raw sockets with tun */
 static const struct proto_ops tun_socket_ops = {
+       .peek_len = tun_peek_len, /* lets vhost_net query the pending skb length via the tx ring */
        .sendmsg = tun_sendmsg,
        .recvmsg = tun_recvmsg,
 };
@@ -2397,6 +2469,53 @@ static const struct ethtool_ops tun_ethtool_ops = {
        .get_ts_info    = ethtool_op_get_ts_info,
 };
 
+/*
+ * Resize the tx ring of every queue of @tun, enabled and disabled alike,
+ * to the device's current tx_queue_len.
+ *
+ * Returns -ENOMEM if the temporary pointer table cannot be allocated,
+ * otherwise whatever skb_array_resize_multiple() returns.
+ */
+static int tun_queue_resize(struct tun_struct *tun)
+{
+       struct net_device *dev = tun->dev;
+       struct tun_file *tfile;
+       struct skb_array **arrays;
+       int n = tun->numqueues + tun->numdisabled;
+       int ret, i;
+
+       /* kmalloc_array() checks the n * size multiplication for overflow */
+       arrays = kmalloc_array(n, sizeof(*arrays), GFP_KERNEL);
+       if (!arrays)
+               return -ENOMEM;
+
+       /* Enabled queues first ... */
+       for (i = 0; i < tun->numqueues; i++) {
+               tfile = rtnl_dereference(tun->tfiles[i]);
+               arrays[i] = &tfile->tx_array;
+       }
+       /* ... then the disabled ones, continuing at index numqueues */
+       list_for_each_entry(tfile, &tun->disabled, next)
+               arrays[i++] = &tfile->tx_array;
+
+       ret = skb_array_resize_multiple(arrays, n,
+                                       dev->tx_queue_len, GFP_KERNEL);
+
+       kfree(arrays);
+       return ret;
+}
+
+/*
+ * Netdevice notifier callback.
+ *
+ * On NETDEV_CHANGE_TX_QUEUE_LEN the tx rings of the affected tun device
+ * are resized; NOTIFY_BAD is returned if that fails.  Every other event,
+ * and every event for a device that is not a tun device, yields
+ * NOTIFY_DONE.
+ */
+static int tun_device_event(struct notifier_block *unused,
+                           unsigned long event, void *ptr)
+{
+       struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+       struct tun_struct *tun = netdev_priv(dev);
+
+       /*
+        * The notifier is registered globally, so it fires for every
+        * net_device.  Only devices bound to tun_link_ops carry a
+        * struct tun_struct as private data; for anything else 'tun'
+        * points at foreign memory and must not be dereferenced.
+        */
+       if (dev->rtnl_link_ops != &tun_link_ops)
+               return NOTIFY_DONE;
+
+       switch (event) {
+       case NETDEV_CHANGE_TX_QUEUE_LEN:
+               if (tun_queue_resize(tun))
+                       return NOTIFY_BAD;
+               break;
+       default:
+               break;
+       }
+
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block tun_notifier_block __read_mostly = {
+       .notifier_call  = tun_device_event, /* registered in tun_init(), unregistered in tun_cleanup() */
+};
 
 static int __init tun_init(void)
 {
@@ -2416,6 +2535,8 @@ static int __init tun_init(void)
                pr_err("Can't register misc device %d\n", TUN_MINOR);
                goto err_misc;
        }
+
+       register_netdevice_notifier(&tun_notifier_block);
        return  0;
 err_misc:
        rtnl_link_unregister(&tun_link_ops);
@@ -2427,6 +2548,7 @@ static void tun_cleanup(void)
 {
        misc_deregister(&tun_miscdev);
        rtnl_link_unregister(&tun_link_ops);
+       unregister_netdevice_notifier(&tun_notifier_block);
 }
 
 /* Get an underlying socket object from tun file.  Returns error unless file is
index 1d3e45f84549beb85c9bfbe1fc87a283ec164611..e032ca397371099e97e97f7ddff0ae8bd1a41491 100644 (file)
@@ -481,10 +481,14 @@ out:
 
 static int peek_head_len(struct sock *sk)
 {
+       struct socket *sock = sk->sk_socket;
        struct sk_buff *head;
        int len = 0;
        unsigned long flags;
 
+       if (sock->ops->peek_len)
+               return sock->ops->peek_len(sock);
+
        spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
        head = skb_peek(&sk->sk_receive_queue);
        if (likely(head)) {
@@ -497,6 +501,16 @@ static int peek_head_len(struct sock *sk)
        return len;
 }
 
+/*
+ * Report whether @sk has receive data pending.
+ *
+ * Sockets that implement proto_ops->peek_len return that hook's result.
+ * All others fall back to sk_receive_queue, returning non-zero when the
+ * queue is NOT empty, so both paths agree that non-zero means "data
+ * available".
+ */
+static int sk_has_rx_data(struct sock *sk)
+{
+       struct socket *sock = sk->sk_socket;
+
+       if (sock->ops->peek_len)
+               return sock->ops->peek_len(sock);
+
+       /* "has data" is the negation of "queue is empty" */
+       return !skb_queue_empty(&sk->sk_receive_queue);
+}
+
 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 {
        struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -513,7 +527,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
                endtime = busy_clock() + vq->busyloop_timeout;
 
                while (vhost_can_busy_poll(&net->dev, endtime) &&
-                      skb_queue_empty(&sk->sk_receive_queue) &&
+                      !sk_has_rx_data(sk) &&
                       vhost_vq_avail_empty(&net->dev, vq))
                        cpu_relax_lowlatency();
 
index 25aa03b51c4e1e2f6fa1a9d257493dc62098f982..b9f0ff4d489c22cc851170c610f6f56dec7f292f 100644 (file)
@@ -185,6 +185,7 @@ struct proto_ops {
        ssize_t         (*splice_read)(struct socket *sock,  loff_t *ppos,
                                       struct pipe_inode_info *pipe, size_t len, unsigned int flags);
        int             (*set_peek_off)(struct sock *sk, int val);
+       int             (*peek_len)(struct socket *sock);
 };
 
 #define DECLARE_SOCKADDR(type, dst, src)       \