bpf: enable non-root eBPF programs
authorAlexei Starovoitov <ast@plumgrid.com>
Thu, 8 Oct 2015 05:23:21 +0000 (22:23 -0700)
committerDavid S. Miller <davem@davemloft.net>
Tue, 13 Oct 2015 02:13:35 +0000 (19:13 -0700)
In order to let unprivileged users load and execute eBPF programs
teach verifier to prevent pointer leaks.
Verifier will prevent
- any arithmetic on pointers
  (except R10+Imm which is used to compute stack addresses)
- comparison of pointers
  (except if (map_value_ptr == 0) ... )
- passing pointers to helper functions
- indirectly passing pointers in stack to helper functions
- returning pointer from bpf program
- storing pointers into ctx or maps

Spill/fill of pointers into stack is allowed, but mangling
of pointers stored in the stack or reading them byte by byte is not.

Within bpf programs the pointers do exist, since programs need to
be able to access maps, pass skb pointer to LD_ABS insns, etc
but programs cannot pass such pointer values to the outside
or obfuscate them.

Only allow BPF_PROG_TYPE_SOCKET_FILTER unprivileged programs,
so that socket filters (tcpdump), af_packet (quic acceleration)
and future kcm can use it.
tracing and tc cls/act program types still require root permissions,
since tracing actually needs to be able to see all kernel pointers
and tc is for root only.

For example, the following unprivileged socket filter program is allowed:
int bpf_prog1(struct __sk_buff *skb)
{
  u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
  u64 *value = bpf_map_lookup_elem(&my_map, &index);

  if (value)
*value += skb->len;
  return 0;
}

but the following program is not:
int bpf_prog1(struct __sk_buff *skb)
{
  u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
  u64 *value = bpf_map_lookup_elem(&my_map, &index);

  if (value)
*value += (u64) skb;
  return 0;
}
since it would leak the kernel address into the map.

Unprivileged socket filter bpf programs have access to the
following helper functions:
- map lookup/update/delete (but they cannot store kernel pointers into them)
- get_random (it's already exposed to unprivileged user space)
- get_smp_processor_id
- tail_call into another socket filter program
- ktime_get_ns

The feature is controlled by sysctl kernel.unprivileged_bpf_disabled.
This toggle defaults to off (0), but can be set true (1).  Once true,
bpf programs and maps cannot be accessed from unprivileged process,
and the toggle cannot be set back to false.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/linux/bpf.h
kernel/bpf/syscall.c
kernel/bpf/verifier.c
kernel/sysctl.c
net/core/filter.c

index b4fdee6cb68612a4eb7039b9f815f219b8ee5eca..02fa3db3c1ec597b0b2d5ac8fc3907f06bcb6a4d 100644 (file)
@@ -167,6 +167,8 @@ void bpf_prog_put_rcu(struct bpf_prog *prog);
 struct bpf_map *bpf_map_get(struct fd f);
 void bpf_map_put(struct bpf_map *map);
 
+extern int sysctl_unprivileged_bpf_disabled;
+
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
 #else
index c868cafbc00c659939a6ec8ac9b808db5f8cd1f6..83697bc8e574aadafb3ab4db613a03c4f988f83e 100644 (file)
@@ -18,6 +18,8 @@
 #include <linux/filter.h>
 #include <linux/version.h>
 
+int sysctl_unprivileged_bpf_disabled __read_mostly;
+
 static LIST_HEAD(bpf_map_types);
 
 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -544,6 +546,9 @@ static int bpf_prog_load(union bpf_attr *attr)
            attr->kern_version != LINUX_VERSION_CODE)
                return -EINVAL;
 
+       if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
        /* plain bpf_prog allocation */
        prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
        if (!prog)
@@ -599,11 +604,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
        union bpf_attr attr = {};
        int err;
 
-       /* the syscall is limited to root temporarily. This restriction will be
-        * lifted when security audit is clean. Note that eBPF+tracing must have
-        * this restriction, since it may pass kernel data to user space
-        */
-       if (!capable(CAP_SYS_ADMIN))
+       if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
                return -EPERM;
 
        if (!access_ok(VERIFY_READ, uattr, 1))
index f8da034c225822a59db5529c4675dacba26146c5..1d6b97be79e1dd6000e626a74bf001e6cab5835d 100644 (file)
@@ -199,6 +199,7 @@ struct verifier_env {
        struct verifier_state_list **explored_states; /* search pruning optimization */
        struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
        u32 used_map_cnt;               /* number of used maps */
+       bool allow_ptr_leaks;
 };
 
 /* verbose verifier prints what it's seeing
@@ -538,6 +539,21 @@ static int bpf_size_to_bytes(int bpf_size)
                return -EINVAL;
 }
 
+static bool is_spillable_regtype(enum bpf_reg_type type)
+{
+       switch (type) {
+       case PTR_TO_MAP_VALUE:
+       case PTR_TO_MAP_VALUE_OR_NULL:
+       case PTR_TO_STACK:
+       case PTR_TO_CTX:
+       case FRAME_PTR:
+       case CONST_PTR_TO_MAP:
+               return true;
+       default:
+               return false;
+       }
+}
+
 /* check_stack_read/write functions track spill/fill of registers,
  * stack boundary and alignment are checked in check_mem_access()
  */
@@ -550,9 +566,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
         */
 
        if (value_regno >= 0 &&
-           (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
-            state->regs[value_regno].type == PTR_TO_STACK ||
-            state->regs[value_regno].type == PTR_TO_CTX)) {
+           is_spillable_regtype(state->regs[value_regno].type)) {
 
                /* register containing pointer is being spilled into stack */
                if (size != BPF_REG_SIZE) {
@@ -643,6 +657,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
        return -EACCES;
 }
 
+static bool is_pointer_value(struct verifier_env *env, int regno)
+{
+       if (env->allow_ptr_leaks)
+               return false;
+
+       switch (env->cur_state.regs[regno].type) {
+       case UNKNOWN_VALUE:
+       case CONST_IMM:
+               return false;
+       default:
+               return true;
+       }
+}
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
@@ -669,11 +697,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
        }
 
        if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+               if (t == BPF_WRITE && value_regno >= 0 &&
+                   is_pointer_value(env, value_regno)) {
+                       verbose("R%d leaks addr into map\n", value_regno);
+                       return -EACCES;
+               }
                err = check_map_access(env, regno, off, size);
                if (!err && t == BPF_READ && value_regno >= 0)
                        mark_reg_unknown_value(state->regs, value_regno);
 
        } else if (state->regs[regno].type == PTR_TO_CTX) {
+               if (t == BPF_WRITE && value_regno >= 0 &&
+                   is_pointer_value(env, value_regno)) {
+                       verbose("R%d leaks addr into ctx\n", value_regno);
+                       return -EACCES;
+               }
                err = check_ctx_access(env, off, size, t);
                if (!err && t == BPF_READ && value_regno >= 0)
                        mark_reg_unknown_value(state->regs, value_regno);
@@ -684,10 +722,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
                        verbose("invalid stack off=%d size=%d\n", off, size);
                        return -EACCES;
                }
-               if (t == BPF_WRITE)
+               if (t == BPF_WRITE) {
+                       if (!env->allow_ptr_leaks &&
+                           state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
+                           size != BPF_REG_SIZE) {
+                               verbose("attempt to corrupt spilled pointer on stack\n");
+                               return -EACCES;
+                       }
                        err = check_stack_write(state, off, size, value_regno);
-               else
+               } else {
                        err = check_stack_read(state, off, size, value_regno);
+               }
        } else {
                verbose("R%d invalid mem access '%s'\n",
                        regno, reg_type_str[state->regs[regno].type]);
@@ -775,8 +820,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
                return -EACCES;
        }
 
-       if (arg_type == ARG_ANYTHING)
+       if (arg_type == ARG_ANYTHING) {
+               if (is_pointer_value(env, regno)) {
+                       verbose("R%d leaks addr into helper function\n", regno);
+                       return -EACCES;
+               }
                return 0;
+       }
 
        if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
            arg_type == ARG_PTR_TO_MAP_VALUE) {
@@ -950,8 +1000,9 @@ static int check_call(struct verifier_env *env, int func_id)
 }
 
 /* check validity of 32-bit and 64-bit arithmetic operations */
-static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 {
+       struct reg_state *regs = env->cur_state.regs;
        u8 opcode = BPF_OP(insn->code);
        int err;
 
@@ -976,6 +1027,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
                if (err)
                        return err;
 
+               if (is_pointer_value(env, insn->dst_reg)) {
+                       verbose("R%d pointer arithmetic prohibited\n",
+                               insn->dst_reg);
+                       return -EACCES;
+               }
+
                /* check dest operand */
                err = check_reg_arg(regs, insn->dst_reg, DST_OP);
                if (err)
@@ -1012,6 +1069,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
                                 */
                                regs[insn->dst_reg] = regs[insn->src_reg];
                        } else {
+                               if (is_pointer_value(env, insn->src_reg)) {
+                                       verbose("R%d partial copy of pointer\n",
+                                               insn->src_reg);
+                                       return -EACCES;
+                               }
                                regs[insn->dst_reg].type = UNKNOWN_VALUE;
                                regs[insn->dst_reg].map_ptr = NULL;
                        }
@@ -1061,8 +1123,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
                /* pattern match 'bpf_add Rx, imm' instruction */
                if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
                    regs[insn->dst_reg].type == FRAME_PTR &&
-                   BPF_SRC(insn->code) == BPF_K)
+                   BPF_SRC(insn->code) == BPF_K) {
                        stack_relative = true;
+               } else if (is_pointer_value(env, insn->dst_reg)) {
+                       verbose("R%d pointer arithmetic prohibited\n",
+                               insn->dst_reg);
+                       return -EACCES;
+               } else if (BPF_SRC(insn->code) == BPF_X &&
+                          is_pointer_value(env, insn->src_reg)) {
+                       verbose("R%d pointer arithmetic prohibited\n",
+                               insn->src_reg);
+                       return -EACCES;
+               }
 
                /* check dest operand */
                err = check_reg_arg(regs, insn->dst_reg, DST_OP);
@@ -1101,6 +1173,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
                err = check_reg_arg(regs, insn->src_reg, SRC_OP);
                if (err)
                        return err;
+
+               if (is_pointer_value(env, insn->src_reg)) {
+                       verbose("R%d pointer comparison prohibited\n",
+                               insn->src_reg);
+                       return -EACCES;
+               }
        } else {
                if (insn->src_reg != BPF_REG_0) {
                        verbose("BPF_JMP uses reserved fields\n");
@@ -1155,6 +1233,9 @@ static int check_cond_jmp_op(struct verifier_env *env,
                        regs[insn->dst_reg].type = CONST_IMM;
                        regs[insn->dst_reg].imm = 0;
                }
+       } else if (is_pointer_value(env, insn->dst_reg)) {
+               verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+               return -EACCES;
        } else if (BPF_SRC(insn->code) == BPF_K &&
                   (opcode == BPF_JEQ || opcode == BPF_JNE)) {
 
@@ -1658,7 +1739,7 @@ static int do_check(struct verifier_env *env)
                }
 
                if (class == BPF_ALU || class == BPF_ALU64) {
-                       err = check_alu_op(regs, insn);
+                       err = check_alu_op(env, insn);
                        if (err)
                                return err;
 
@@ -1816,6 +1897,11 @@ static int do_check(struct verifier_env *env)
                                if (err)
                                        return err;
 
+                               if (is_pointer_value(env, BPF_REG_0)) {
+                                       verbose("R0 leaks addr as return value\n");
+                                       return -EACCES;
+                               }
+
 process_bpf_exit:
                                insn_idx = pop_stack(env, &prev_insn_idx);
                                if (insn_idx < 0) {
@@ -2144,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
        if (ret < 0)
                goto skip_full_check;
 
+       env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+
        ret = do_check(env);
 
 skip_full_check:
index e69201d8094eb8bed747329afc17528c4315a6b7..96c856b040819e30f5e9d4dad4ac396569f0eba0 100644 (file)
@@ -64,6 +64,7 @@
 #include <linux/binfmts.h>
 #include <linux/sched/sysctl.h>
 #include <linux/kexec.h>
+#include <linux/bpf.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -1138,6 +1139,18 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = timer_migration_handler,
        },
+#endif
+#ifdef CONFIG_BPF_SYSCALL
+       {
+               .procname       = "unprivileged_bpf_disabled",
+               .data           = &sysctl_unprivileged_bpf_disabled,
+               .maxlen         = sizeof(sysctl_unprivileged_bpf_disabled),
+               .mode           = 0644,
+               /* only handle a transition from default "0" to "1" */
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &one,
+               .extra2         = &one,
+       },
 #endif
        { }
 };
index 5f4cf1cffed366c400104e61b032d95de1591edf..0b00094932ab46f20b9c1ae6948322c5dd130afc 100644 (file)
@@ -1640,7 +1640,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
        case BPF_FUNC_ktime_get_ns:
                return &bpf_ktime_get_ns_proto;
        case BPF_FUNC_trace_printk:
-               return bpf_get_trace_printk_proto();
+               if (capable(CAP_SYS_ADMIN))
+                       return bpf_get_trace_printk_proto();
        default:
                return NULL;
        }