tuntap: multiqueue support
authorJason Wang <jasowang@redhat.com>
Wed, 31 Oct 2012 19:46:00 +0000 (19:46 +0000)
committerDavid S. Miller <davem@davemloft.net>
Thu, 1 Nov 2012 15:14:08 +0000 (11:14 -0400)
This patch converts tun/tap to a multiqueue device and exposes the
queues as multiple file descriptors to userspace. Internally, each tun_file is
abstracted as a queue, and an array of pointers to tun_file structures is
stored in the device's tun_struct, so multiple tun_files can be
attached to the device as multiple queues.

When choosing the txq, we first try to identify the flow through its rxhash; if
the packet has no rxhash, we fall back to the recorded rxq, if any, and use it
to choose the transmit queue. This policy may be changed in the future.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/tun.c

index bdbb526eca7b201fbd9927f14883b179ee03143a..2762c55aeb6692d5adb746b92beb0c160c19f37c 100644 (file)
@@ -109,6 +109,12 @@ struct tap_filter {
        unsigned char   addr[FLT_EXACT_COUNT][ETH_ALEN];
 };
 
+/* 1024 is probably a high enough limit: modern hypervisors seem to support on
+ * the order of 100-200 CPUs so this leaves us some breathing space if we want
+ * to match a queue per guest CPU.
+ */
+#define MAX_TAP_QUEUES 1024
+
 /* A tun_file connects an open character device to a tuntap netdevice. It
  * also contains all socket related strctures (except sock_fprog and tap_filter)
  * to serve as one transmit queue for tuntap device. The sock_fprog and
@@ -129,6 +135,7 @@ struct tun_file {
        struct fasync_struct *fasync;
        /* only used for fasnyc */
        unsigned int flags;
+       u16 queue_index;
 };
 
 /* Since the socket were moved to tun_file, to preserve the behavior of persist
@@ -136,7 +143,8 @@ struct tun_file {
  * file were attached to a persist device.
  */
 struct tun_struct {
-       struct tun_file __rcu   *tfile;
+       struct tun_file __rcu   *tfiles[MAX_TAP_QUEUES];
+       unsigned int            numqueues;
        unsigned int            flags;
        kuid_t                  owner;
        kgid_t                  group;
@@ -157,56 +165,157 @@ struct tun_struct {
 #endif
 };
 
+/* We try to identify a flow through its rxhash first. The reason that
+ * we do not check the rxq no. first is because some cards (e.g. 82599)
+ * choose the rxq based on the txq where the last packet of the flow came
+ * from. As the userspace application moves between processors, we may get
+ * a different rxq no. here. If we cannot get an rxhash, then we
+ * hope the rxq no. may help instead.
+ */
+static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+       struct tun_struct *tun = netdev_priv(dev);
+       u32 txq = 0;
+       u32 numqueues = 0;
+
+       rcu_read_lock();
+       numqueues = tun->numqueues;
+
+       txq = skb_get_rxhash(skb);
+       if (txq) {
+               /* use multiply and shift instead of expensive divide */
+               txq = ((u64)txq * numqueues) >> 32;
+       } else if (likely(skb_rx_queue_recorded(skb))) {
+               txq = skb_get_rx_queue(skb);
+               while (unlikely(txq >= numqueues))
+                       txq -= numqueues;
+       }
+
+       rcu_read_unlock();
+       return txq;
+}
+
+static void tun_set_real_num_queues(struct tun_struct *tun)
+{
+       netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
+       netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
+}
+
+static void __tun_detach(struct tun_file *tfile, bool clean)
+{
+       struct tun_file *ntfile;
+       struct tun_struct *tun;
+       struct net_device *dev;
+
+       tun = rcu_dereference_protected(tfile->tun,
+                                       lockdep_rtnl_is_held());
+       if (tun) {
+               u16 index = tfile->queue_index;
+               BUG_ON(index >= tun->numqueues);
+               dev = tun->dev;
+
+               rcu_assign_pointer(tun->tfiles[index],
+                                  tun->tfiles[tun->numqueues - 1]);
+               rcu_assign_pointer(tfile->tun, NULL);
+               ntfile = rcu_dereference_protected(tun->tfiles[index],
+                                                  lockdep_rtnl_is_held());
+               ntfile->queue_index = index;
+
+               --tun->numqueues;
+               sock_put(&tfile->sk);
+
+               synchronize_net();
+               /* Drop read queue */
+               skb_queue_purge(&tfile->sk.sk_receive_queue);
+               tun_set_real_num_queues(tun);
+
+               if (tun->numqueues == 0 && !(tun->flags & TUN_PERSIST))
+                       if (dev->reg_state == NETREG_REGISTERED)
+                               unregister_netdevice(dev);
+       }
+
+       if (clean) {
+               BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED,
+                                &tfile->socket.flags));
+               sk_release_kernel(&tfile->sk);
+       }
+}
+
+static void tun_detach(struct tun_file *tfile, bool clean)
+{
+       rtnl_lock();
+       __tun_detach(tfile, clean);
+       rtnl_unlock();
+}
+
+static void tun_detach_all(struct net_device *dev)
+{
+       struct tun_struct *tun = netdev_priv(dev);
+       struct tun_file *tfile;
+       int i, n = tun->numqueues;
+
+       for (i = 0; i < n; i++) {
+               tfile = rcu_dereference_protected(tun->tfiles[i],
+                                                 lockdep_rtnl_is_held());
+               BUG_ON(!tfile);
+               wake_up_all(&tfile->wq.wait);
+               rcu_assign_pointer(tfile->tun, NULL);
+               --tun->numqueues;
+       }
+       BUG_ON(tun->numqueues != 0);
+
+       synchronize_net();
+       for (i = 0; i < n; i++) {
+               tfile = rcu_dereference_protected(tun->tfiles[i],
+                                                 lockdep_rtnl_is_held());
+               /* Drop read queue */
+               skb_queue_purge(&tfile->sk.sk_receive_queue);
+               sock_put(&tfile->sk);
+       }
+}
+
 static int tun_attach(struct tun_struct *tun, struct file *file)
 {
        struct tun_file *tfile = file->private_data;
        int err;
 
-       ASSERT_RTNL();
-
-       netif_tx_lock_bh(tun->dev);
-
        err = -EINVAL;
-       if (tfile->tun)
+       if (rcu_dereference_protected(tfile->tun, lockdep_rtnl_is_held()))
                goto out;
 
        err = -EBUSY;
-       if (tun->tfile)
+       if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1)
+               goto out;
+
+       err = -E2BIG;
+       if (tun->numqueues == MAX_TAP_QUEUES)
                goto out;
 
        err = 0;
 
-       /* Re-attach filter when attaching to a persist device */
+       /* Re-attach the filter to persist device */
        if (tun->filter_attached == true) {
                err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
                if (!err)
                        goto out;
        }
+       tfile->queue_index = tun->numqueues;
        rcu_assign_pointer(tfile->tun, tun);
-       tfile->socket.sk->sk_sndbuf = tun->sndbuf;
-       rcu_assign_pointer(tun->tfile, tfile);
-       netif_carrier_on(tun->dev);
+       rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
        sock_hold(&tfile->sk);
+       tun->numqueues++;
 
-out:
-       netif_tx_unlock_bh(tun->dev);
-       return err;
-}
+       tun_set_real_num_queues(tun);
 
-static void __tun_detach(struct tun_struct *tun)
-{
-       struct tun_file *tfile = rcu_dereference_protected(tun->tfile,
-                                                       lockdep_rtnl_is_held());
-       /* Detach from net device */
-       netif_carrier_off(tun->dev);
-       rcu_assign_pointer(tun->tfile, NULL);
-       if (tfile) {
-               rcu_assign_pointer(tfile->tun, NULL);
+       if (tun->numqueues == 1)
+               netif_carrier_on(tun->dev);
 
-               synchronize_net();
-               /* Drop read queue */
-               skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
-       }
+       /* device is allowed to go away first, so no need to hold extra
+        * refcnt.
+        */
+
+out:
+       return err;
 }
 
 static struct tun_struct *__tun_get(struct tun_file *tfile)
@@ -349,30 +458,20 @@ static const struct ethtool_ops tun_ethtool_ops;
 /* Net device detach from fd. */
 static void tun_net_uninit(struct net_device *dev)
 {
-       struct tun_struct *tun = netdev_priv(dev);
-       struct tun_file *tfile = rcu_dereference_protected(tun->tfile,
-                                                       lockdep_rtnl_is_held());
-
-       /* Inform the methods they need to stop using the dev.
-        */
-       if (tfile) {
-               wake_up_all(&tfile->wq.wait);
-               __tun_detach(tun);
-               synchronize_net();
-       }
+       tun_detach_all(dev);
 }
 
 /* Net device open. */
 static int tun_net_open(struct net_device *dev)
 {
-       netif_start_queue(dev);
+       netif_tx_start_all_queues(dev);
        return 0;
 }
 
 /* Net device close. */
 static int tun_net_close(struct net_device *dev)
 {
-       netif_stop_queue(dev);
+       netif_tx_stop_all_queues(dev);
        return 0;
 }
 
@@ -380,16 +479,20 @@ static int tun_net_close(struct net_device *dev)
 static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 {
        struct tun_struct *tun = netdev_priv(dev);
+       int txq = skb->queue_mapping;
        struct tun_file *tfile;
 
        rcu_read_lock();
-       tfile = rcu_dereference(tun->tfile);
+       tfile = rcu_dereference(tun->tfiles[txq]);
+
        /* Drop packet if interface is not attached */
-       if (!tfile)
+       if (txq >= tun->numqueues)
                goto drop;
 
        tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
 
+       BUG_ON(!tfile);
+
        /* Drop if the filter does not like it.
         * This is a noop if the filter is disabled.
         * Filter can be enabled only for the TAP devices. */
@@ -400,12 +503,15 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
            sk_filter(tfile->socket.sk, skb))
                goto drop;
 
+       /* Limit the number of packets queued by dividing the txq length by
+        * the number of queues.
+        */
        if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
-           >= dev->tx_queue_len) {
+                         >= dev->tx_queue_len / tun->numqueues){
                if (!(tun->flags & TUN_ONE_QUEUE)) {
                        /* Normal queueing mode. */
                        /* Packet scheduler handles dropping of further packets. */
-                       netif_stop_queue(dev);
+                       netif_stop_subqueue(dev, txq);
 
                        /* We won't see all dropped packets individually, so overrun
                         * error is more appropriate. */
@@ -494,6 +600,7 @@ static const struct net_device_ops tun_netdev_ops = {
        .ndo_start_xmit         = tun_net_xmit,
        .ndo_change_mtu         = tun_net_change_mtu,
        .ndo_fix_features       = tun_net_fix_features,
+       .ndo_select_queue       = tun_select_queue,
 #ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller    = tun_poll_controller,
 #endif
@@ -509,6 +616,7 @@ static const struct net_device_ops tap_netdev_ops = {
        .ndo_set_rx_mode        = tun_net_mclist,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
+       .ndo_select_queue       = tun_select_queue,
 #ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller    = tun_poll_controller,
 #endif
@@ -550,7 +658,7 @@ static void tun_net_init(struct net_device *dev)
 /* Character device part */
 
 /* Poll */
-static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
+static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 {
        struct tun_file *tfile = file->private_data;
        struct tun_struct *tun = __tun_get(tfile);
@@ -995,7 +1103,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
                        schedule();
                        continue;
                }
-               netif_wake_queue(tun->dev);
+               netif_wake_subqueue(tun->dev, tfile->queue_index);
 
                ret = tun_put_user(tun, tfile, skb, iv, len);
                kfree_skb(skb);
@@ -1156,6 +1264,9 @@ static int tun_flags(struct tun_struct *tun)
        if (tun->flags & TUN_VNET_HDR)
                flags |= IFF_VNET_HDR;
 
+       if (tun->flags & TUN_TAP_MQ)
+               flags |= IFF_MULTI_QUEUE;
+
        return flags;
 }
 
@@ -1247,8 +1358,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
                if (*ifr->ifr_name)
                        name = ifr->ifr_name;
 
-               dev = alloc_netdev(sizeof(struct tun_struct), name,
-                                  tun_setup);
+               dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
+                                      tun_setup,
+                                      MAX_TAP_QUEUES, MAX_TAP_QUEUES);
                if (!dev)
                        return -ENOMEM;
 
@@ -1283,7 +1395,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
                err = tun_attach(tun, file);
                if (err < 0)
-                       goto failed;
+                       goto err_free_dev;
        }
 
        tun_debug(KERN_INFO, tun, "tun_set_iff\n");
@@ -1303,18 +1415,22 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
        else
                tun->flags &= ~TUN_VNET_HDR;
 
+       if (ifr->ifr_flags & IFF_MULTI_QUEUE)
+               tun->flags |= TUN_TAP_MQ;
+       else
+               tun->flags &= ~TUN_TAP_MQ;
+
        /* Make sure persistent devices do not get stuck in
         * xoff state.
         */
        if (netif_running(tun->dev))
-               netif_wake_queue(tun->dev);
+               netif_tx_wake_all_queues(tun->dev);
 
        strcpy(ifr->ifr_name, tun->dev->name);
        return 0;
 
  err_free_dev:
        free_netdev(dev);
- failed:
        return err;
 }
 
@@ -1369,6 +1485,51 @@ static int set_offload(struct tun_struct *tun, unsigned long arg)
        return 0;
 }
 
+static void tun_detach_filter(struct tun_struct *tun, int n)
+{
+       int i;
+       struct tun_file *tfile;
+
+       for (i = 0; i < n; i++) {
+               tfile = rcu_dereference_protected(tun->tfiles[i],
+                                                 lockdep_rtnl_is_held());
+               sk_detach_filter(tfile->socket.sk);
+       }
+
+       tun->filter_attached = false;
+}
+
+static int tun_attach_filter(struct tun_struct *tun)
+{
+       int i, ret = 0;
+       struct tun_file *tfile;
+
+       for (i = 0; i < tun->numqueues; i++) {
+               tfile = rcu_dereference_protected(tun->tfiles[i],
+                                                 lockdep_rtnl_is_held());
+               ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+               if (ret) {
+                       tun_detach_filter(tun, i);
+                       return ret;
+               }
+       }
+
+       tun->filter_attached = true;
+       return ret;
+}
+
+static void tun_set_sndbuf(struct tun_struct *tun)
+{
+       struct tun_file *tfile;
+       int i;
+
+       for (i = 0; i < tun->numqueues; i++) {
+               tfile = rcu_dereference_protected(tun->tfiles[i],
+                                               lockdep_rtnl_is_held());
+               tfile->socket.sk->sk_sndbuf = tun->sndbuf;
+       }
+}
+
 static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg, int ifreq_len)
 {
@@ -1397,6 +1558,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                                (unsigned int __user*)argp);
        }
 
+       ret = 0;
        rtnl_lock();
 
        tun = __tun_get(tfile);
@@ -1537,7 +1699,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                        break;
                }
 
-               tun->sndbuf = tfile->socket.sk->sk_sndbuf = sndbuf;
+               tun->sndbuf = sndbuf;
+               tun_set_sndbuf(tun);
                break;
 
        case TUNGETVNETHDRSZ:
@@ -1568,9 +1731,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
                        break;
 
-               ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
-               if (!ret)
-                       tun->filter_attached = true;
+               ret = tun_attach_filter(tun);
                break;
 
        case TUNDETACHFILTER:
@@ -1578,9 +1739,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                ret = -EINVAL;
                if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
                        break;
-               ret = sk_detach_filter(tfile->socket.sk);
-               if (!ret)
-                       tun->filter_attached = false;
+               ret = 0;
+               tun_detach_filter(tun, tun->numqueues);
                break;
 
        default:
@@ -1685,37 +1845,9 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 static int tun_chr_close(struct inode *inode, struct file *file)
 {
        struct tun_file *tfile = file->private_data;
-       struct tun_struct *tun;
        struct net *net = tfile->net;
 
-       rtnl_lock();
-
-       tun = rcu_dereference_protected(tfile->tun, lockdep_rtnl_is_held());
-       if (tun) {
-               struct net_device *dev = tun->dev;
-
-               tun_debug(KERN_INFO, tun, "tun_chr_close\n");
-
-               __tun_detach(tun);
-
-               synchronize_net();
-
-               /* If desirable, unregister the netdevice. */
-               if (!(tun->flags & TUN_PERSIST)) {
-                       if (dev->reg_state == NETREG_REGISTERED)
-                               unregister_netdevice(dev);
-               }
-
-               /* drop the reference that netdevice holds */
-               sock_put(&tfile->sk);
-       }
-
-       rtnl_unlock();
-
-       /* drop the reference that file holds */
-       BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED,
-                        &tfile->socket.flags));
-       sk_release_kernel(&tfile->sk);
+       tun_detach(tfile, true);
        put_net(net);
 
        return 0;