packet: add classic BPF fanout mode
author Willem de Bruijn <willemb@google.com>
Sat, 15 Aug 2015 02:31:34 +0000 (22:31 -0400)
committer David S. Miller <davem@davemloft.net>
Mon, 17 Aug 2015 21:22:47 +0000 (14:22 -0700)
Add fanout mode PACKET_FANOUT_CBPF that accepts a classic BPF program
to select a socket.

This avoids having to keep adding special case fanout modes. One
example use case is application layer load balancing. The QUIC
protocol, for instance, encodes a connection ID in UDP payload.

Also add socket option SOL_PACKET/PACKET_FANOUT_DATA that updates data
associated with the socket group. Fanout mode PACKET_FANOUT_CBPF is the
only user so far.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/uapi/linux/if_packet.h
net/packet/af_packet.c
net/packet/internal.h

index d3d715f8c88f6d57c4318dc5b001e8efad2d074f..a4bb16fa822ef011aea2014dae4addb87c2d5634 100644 (file)
@@ -55,6 +55,7 @@ struct sockaddr_ll {
 #define PACKET_TX_HAS_OFF              19
 #define PACKET_QDISC_BYPASS            20
 #define PACKET_ROLLOVER_STATS          21
+#define PACKET_FANOUT_DATA             22
 
 #define PACKET_FANOUT_HASH             0
 #define PACKET_FANOUT_LB               1
@@ -62,6 +63,7 @@ struct sockaddr_ll {
 #define PACKET_FANOUT_ROLLOVER         3
 #define PACKET_FANOUT_RND              4
 #define PACKET_FANOUT_QM               5
+#define PACKET_FANOUT_CBPF             6
 #define PACKET_FANOUT_FLAG_ROLLOVER    0x1000
 #define PACKET_FANOUT_FLAG_DEFRAG      0x8000
 
index b5afe538bb88e9b97eec2faa4e93acbb16be4fa0..8869d07013e6b9eeb97d66cd8df8228d6c17e0fc 100644 (file)
@@ -92,6 +92,7 @@
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
 #endif
+#include <linux/bpf.h>
 
 #include "internal.h"
 
@@ -1410,6 +1411,22 @@ static unsigned int fanout_demux_qm(struct packet_fanout *f,
        return skb_get_queue_mapping(skb) % num;
 }
 
+/* Run the group's BPF program on @skb and reduce its return value modulo
+ * @num, which must be non-zero. Returns 0 when no program is attached.
+ */
+static unsigned int fanout_demux_bpf(struct packet_fanout *f,
+                                    struct sk_buff *skb, unsigned int num)
+{
+       struct bpf_prog *filter;
+       unsigned int idx;
+
+       rcu_read_lock();
+       filter = rcu_dereference(f->bpf_prog);
+       idx = filter ? BPF_PROG_RUN(filter, skb) % num : 0;
+       rcu_read_unlock();
+       return idx;
+}
+
 static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
 {
        return f->flags & (flag >> 8);
@@ -1454,6 +1471,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
        case PACKET_FANOUT_ROLLOVER:
                idx = fanout_demux_rollover(f, skb, 0, false, num);
                break;
+       case PACKET_FANOUT_CBPF:
+               idx = fanout_demux_bpf(f, skb, num);
+               break;
        }
 
        if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
@@ -1502,6 +1522,74 @@ static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
        return false;
 }
 
+/* Set up the per-mode member of the union in struct packet_fanout for a new
+ * group: zero the rr_cur counter for PACKET_FANOUT_LB, clear the program
+ * pointer for PACKET_FANOUT_CBPF. Groups of any other mode are left untouched.
+ */
+static void fanout_init_data(struct packet_fanout *f)
+{
+       if (f->type == PACKET_FANOUT_LB)
+               atomic_set(&f->rr_cur, 0);
+       else if (f->type == PACKET_FANOUT_CBPF)
+               RCU_INIT_POINTER(f->bpf_prog, NULL);
+}
+
+/* Install @new (NULL detaches) under f->lock; destroy the old program. */
+static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
+{
+       struct bpf_prog *old;
+
+       spin_lock(&f->lock);
+       old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
+       rcu_assign_pointer(f->bpf_prog, new);
+       spin_unlock(&f->lock);
+       if (!old)
+               return;
+       synchronize_net();      /* old is destroyed only after this returns */
+       bpf_prog_destroy(old);
+}
+
+/* Attach the classic BPF program in the user-supplied struct sock_fprog to
+ * po's fanout group: -EPERM if SOCK_FILTER_LOCKED, -EINVAL on bad length.
+ */
+static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
+                               unsigned int len)
+{
+       struct sock_fprog fprog;
+       struct bpf_prog *prog;
+       int err;
+
+       if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
+               return -EPERM;
+       if (len != sizeof(fprog))
+               return -EINVAL;
+       if (copy_from_user(&fprog, data, sizeof(fprog)))
+               return -EFAULT;
+
+       err = bpf_prog_create_from_user(&prog, &fprog, NULL);
+       if (!err)
+               __fanout_set_data_bpf(po->fanout, prog);
+       return err;
+}
+
+/* PACKET_FANOUT_DATA handler; po->fanout must be non-NULL. Only groups in
+ * PACKET_FANOUT_CBPF mode accept data, every other mode yields -EINVAL.
+ */
+static int fanout_set_data(struct packet_sock *po, char __user *data,
+                          unsigned int len)
+{
+       if (po->fanout->type != PACKET_FANOUT_CBPF)
+               return -EINVAL;
+       return fanout_set_data_cbpf(po, data, len);
+}
+
+/* Release per-mode data; fanout_release() calls this once sk_ref hits 0. */
+static void fanout_release_data(struct packet_fanout *f)
+{
+       /* Only CBPF groups own a program; NULL detaches and destroys it. */
+       if (f->type == PACKET_FANOUT_CBPF)
+               __fanout_set_data_bpf(f, NULL);
+}
+
 static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 {
        struct packet_sock *po = pkt_sk(sk);
@@ -1519,6 +1607,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
        case PACKET_FANOUT_CPU:
        case PACKET_FANOUT_RND:
        case PACKET_FANOUT_QM:
+       case PACKET_FANOUT_CBPF:
                break;
        default:
                return -EINVAL;
@@ -1561,10 +1650,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
                match->id = id;
                match->type = type;
                match->flags = flags;
-               atomic_set(&match->rr_cur, 0);
                INIT_LIST_HEAD(&match->list);
                spin_lock_init(&match->lock);
                atomic_set(&match->sk_ref, 0);
+               fanout_init_data(match);
                match->prot_hook.type = po->prot_hook.type;
                match->prot_hook.dev = po->prot_hook.dev;
                match->prot_hook.func = packet_rcv_fanout;
@@ -1610,6 +1699,7 @@ static void fanout_release(struct sock *sk)
        if (atomic_dec_and_test(&f->sk_ref)) {
                list_del(&f->list);
                dev_remove_pack(&f->prot_hook);
+               fanout_release_data(f);
                kfree(f);
        }
        mutex_unlock(&fanout_mutex);
@@ -3529,6 +3619,13 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 
                return fanout_add(sk, val & 0xffff, val >> 16);
        }
+       case PACKET_FANOUT_DATA:
+       {
+               if (!po->fanout)
+                       return -EINVAL;
+
+               return fanout_set_data(po, optval, optlen);
+       }
        case PACKET_TX_HAS_OFF:
        {
                unsigned int val;
index e20b3e8829b8acac25b74d7bbc23636b089d1ff4..9ee46314b7d76df47d683c252a92ce97398d592b 100644 (file)
@@ -79,7 +79,10 @@ struct packet_fanout {
        u16                     id;
        u8                      type;
        u8                      flags;
-       atomic_t                rr_cur;
+       union {
+               atomic_t                rr_cur;
+               struct bpf_prog __rcu   *bpf_prog;
+       };
        struct list_head        list;
        struct sock             *arr[PACKET_FANOUT_MAX];
        spinlock_t              lock;