From 14933dc8d9964e46f1d5bd2a4dfe3d3be8e420e0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 24 Apr 2017 19:42:34 -0700 Subject: [PATCH] sparc64: Improve 64-bit constant loading in eBPF JIT. Doing a full 64-bit decomposition is really stupid especially for simple values like 0 and -1. But if we are going to optimize this, go all the way and try for all 2 and 3 instruction sequences not requiring a temporary register as well. First we do the easy cases where it's a zero or sign extended 32-bit number (sethi+or, sethi+xor, respectively). Then we try to find a range of set bits we can load simply then shift up into place, in various ways. Then we try negating the constant and see if we can do a simple sequence using that with a xor at the end. (f.e. the range of set bits can't be loaded simply, but for the negated value it can) The final optimized strategy involves 4 instructions sequences not needing a temporary register. Otherwise we sadly fully decompose using a temp.. Example, from ALU64_XOR_K: 0x0000ffffffff0000 ^ 0x0 = 0x0000ffffffff0000: 0000000000000000 : 0: 9d e3 bf 50 save %sp, -176, %sp 4: 01 00 00 00 nop 8: 90 10 00 18 mov %i0, %o0 c: 13 3f ff ff sethi %hi(0xfffffc00), %o1 10: 92 12 63 ff or %o1, 0x3ff, %o1 ! ffffffff 14: 93 2a 70 10 sllx %o1, 0x10, %o1 18: 15 3f ff ff sethi %hi(0xfffffc00), %o2 1c: 94 12 a3 ff or %o2, 0x3ff, %o2 ! ffffffff 20: 95 2a b0 10 sllx %o2, 0x10, %o2 24: 92 1a 60 00 xor %o1, 0, %o1 28: 12 e2 40 8a cxbe %o1, %o2, 38 2c: 9a 10 20 02 mov 2, %o5 30: 10 60 00 03 b,pn %xcc, 3c 34: 01 00 00 00 nop 38: 9a 10 20 01 mov 1, %o5 ! 1 3c: 81 c7 e0 08 ret 40: 91 eb 40 00 restore %o5, %g0, %o0 Signed-off-by: David S. Miller --- arch/sparc/net/bpf_jit_comp_64.c | 249 ++++++++++++++++++++++++++++++- 1 file changed, 245 insertions(+), 4 deletions(-) diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 2b2f3c3335ce..ec7d10da94f0 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -28,6 +28,11 @@ static inline bool is_simm5(unsigned int value) return value + 0x10 < 0x20; } +static inline bool is_sethi(unsigned int value) +{ + return (value & ~0x3fffff) == 0; +} + static void bpf_flush_icache(void *start_, void *end_) { /* Cheetah's I-cache is fully coherent. */ @@ -367,16 +372,252 @@ static void emit_loadimm_sext(s32 K, unsigned int dest, struct jit_ctx *ctx) } } +static void analyze_64bit_constant(u32 high_bits, u32 low_bits, + int *hbsp, int *lbsp, int *abbasp) +{ + int lowest_bit_set, highest_bit_set, all_bits_between_are_set; + int i; + + lowest_bit_set = highest_bit_set = -1; + i = 0; + do { + if ((lowest_bit_set == -1) && ((low_bits >> i) & 1)) + lowest_bit_set = i; + if ((highest_bit_set == -1) && ((high_bits >> (32 - i - 1)) & 1)) + highest_bit_set = (64 - i - 1); + } while (++i < 32 && (highest_bit_set == -1 || + lowest_bit_set == -1)); + if (i == 32) { + i = 0; + do { + if (lowest_bit_set == -1 && ((high_bits >> i) & 1)) + lowest_bit_set = i + 32; + if (highest_bit_set == -1 && + ((low_bits >> (32 - i - 1)) & 1)) + highest_bit_set = 32 - i - 1; + } while (++i < 32 && (highest_bit_set == -1 || + lowest_bit_set == -1)); + } + + all_bits_between_are_set = 1; + for (i = lowest_bit_set; i <= highest_bit_set; i++) { + if (i < 32) { + if ((low_bits & (1 << i)) != 0) + continue; + } else { + if ((high_bits & (1 << (i - 32))) != 0) + continue; + } + all_bits_between_are_set = 0; + break; + } + *hbsp = highest_bit_set; + *lbsp = lowest_bit_set; + *abbasp = all_bits_between_are_set; +} + +static unsigned long create_simple_focus_bits(unsigned long high_bits, + unsigned long low_bits, + int lowest_bit_set, int shift) +{ + long hi, lo; + + if (lowest_bit_set < 32) { + lo = (low_bits >> lowest_bit_set) << shift; + hi = ((high_bits << (32 - lowest_bit_set)) << shift); + } else { + lo = 0; + hi = ((high_bits >> (lowest_bit_set - 32)) << shift); + } + return hi | lo; +} + +static bool const64_is_2insns(unsigned long high_bits, + unsigned long low_bits) +{ + int highest_bit_set, lowest_bit_set, all_bits_between_are_set; + + if (high_bits == 0 || high_bits == 0xffffffff) + return true; + + analyze_64bit_constant(high_bits, low_bits, + &highest_bit_set, &lowest_bit_set, + &all_bits_between_are_set); + + if ((highest_bit_set == 63 || lowest_bit_set == 0) && + all_bits_between_are_set != 0) + return true; + + if (highest_bit_set - lowest_bit_set < 21) + return true; + + return false; +} + +static void sparc_emit_set_const64_quick2(unsigned long high_bits, + unsigned long low_imm, + unsigned int dest, + int shift_count, struct jit_ctx *ctx) +{ + emit_loadimm32(high_bits, dest, ctx); + + /* Now shift it up into place. */ + emit_alu_K(SLLX, dest, shift_count, ctx); + + /* If there is a low immediate part piece, finish up by + * putting that in as well. + */ + if (low_imm != 0) + emit(OR | IMMED | RS1(dest) | S13(low_imm) | RD(dest), ctx); +} + static void emit_loadimm64(u64 K, unsigned int dest, struct jit_ctx *ctx) { + int all_bits_between_are_set, lowest_bit_set, highest_bit_set; unsigned int tmp = bpf2sparc[TMP_REG_1]; - u32 high_part = (K >> 32); - u32 low_part = (K & 0xffffffff); + u32 low_bits = (K & 0xffffffff); + u32 high_bits = (K >> 32); + + /* These two tests also take care of all of the one + * instruction cases. + */ + if (high_bits == 0xffffffff && (low_bits & 0x80000000)) + return emit_loadimm_sext(K, dest, ctx); + if (high_bits == 0x00000000) + return emit_loadimm32(K, dest, ctx); + + analyze_64bit_constant(high_bits, low_bits, &highest_bit_set, + &lowest_bit_set, &all_bits_between_are_set); + + /* 1) mov -1, %reg + * sllx %reg, shift, %reg + * 2) mov -1, %reg + * srlx %reg, shift, %reg + * 3) mov some_small_const, %reg + * sllx %reg, shift, %reg + */ + if (((highest_bit_set == 63 || lowest_bit_set == 0) && + all_bits_between_are_set != 0) || + ((highest_bit_set - lowest_bit_set) < 12)) { + int shift = lowest_bit_set; + long the_const = -1; + + if ((highest_bit_set != 63 && lowest_bit_set != 0) || + all_bits_between_are_set == 0) { + the_const = + create_simple_focus_bits(high_bits, low_bits, + lowest_bit_set, 0); + } else if (lowest_bit_set == 0) + shift = -(63 - highest_bit_set); + + emit(OR | IMMED | RS1(G0) | S13(the_const) | RD(dest), ctx); + if (shift > 0) + emit_alu_K(SLLX, dest, shift, ctx); + else if (shift < 0) + emit_alu_K(SRLX, dest, -shift, ctx); + + return; + } + + /* Now a range of 22 or less bits set somewhere. + * 1) sethi %hi(focus_bits), %reg + * sllx %reg, shift, %reg + * 2) sethi %hi(focus_bits), %reg + * srlx %reg, shift, %reg + */ + if ((highest_bit_set - lowest_bit_set) < 21) { + unsigned long focus_bits = + create_simple_focus_bits(high_bits, low_bits, + lowest_bit_set, 10); + + emit(SETHI(focus_bits, dest), ctx); + + /* If lowest_bit_set == 10 then a sethi alone could + * have done it. + */ + if (lowest_bit_set < 10) + emit_alu_K(SRLX, dest, 10 - lowest_bit_set, ctx); + else if (lowest_bit_set > 10) + emit_alu_K(SLLX, dest, lowest_bit_set - 10, ctx); + return; + } + + /* Ok, now 3 instruction sequences. */ + if (low_bits == 0) { + emit_loadimm32(high_bits, dest, ctx); + emit_alu_K(SLLX, dest, 32, ctx); + return; + } + + /* We may be able to do something quick + * when the constant is negated, so try that. + */ + if (const64_is_2insns((~high_bits) & 0xffffffff, + (~low_bits) & 0xfffffc00)) { + /* NOTE: The trailing bits get XOR'd so we need the + * non-negated bits, not the negated ones. + */ + unsigned long trailing_bits = low_bits & 0x3ff; + + if ((((~high_bits) & 0xffffffff) == 0 && + ((~low_bits) & 0x80000000) == 0) || + (((~high_bits) & 0xffffffff) == 0xffffffff && + ((~low_bits) & 0x80000000) != 0)) { + unsigned long fast_int = (~low_bits & 0xffffffff); + + if ((is_sethi(fast_int) && + (~high_bits & 0xffffffff) == 0)) { + emit(SETHI(fast_int, dest), ctx); + } else if (is_simm13(fast_int)) { + emit(OR | IMMED | RS1(G0) | S13(fast_int) | RD(dest), ctx); + } else { + emit_loadimm64(fast_int, dest, ctx); + } + } else { + u64 n = ((~low_bits) & 0xfffffc00) | + (((unsigned long)((~high_bits) & 0xffffffff))<<32); + emit_loadimm64(n, dest, ctx); + } + + low_bits = -0x400 | trailing_bits; + + emit(XOR | IMMED | RS1(dest) | S13(low_bits) | RD(dest), ctx); + return; + } + + /* 1) sethi %hi(xxx), %reg + * or %reg, %lo(xxx), %reg + * sllx %reg, yyy, %reg + */ + if ((highest_bit_set - lowest_bit_set) < 32) { + unsigned long focus_bits = + create_simple_focus_bits(high_bits, low_bits, + lowest_bit_set, 0); + + /* So what we know is that the set bits straddle the + * middle of the 64-bit word. + */ + sparc_emit_set_const64_quick2(focus_bits, 0, dest, + lowest_bit_set, ctx); + return; + } + + /* 1) sethi %hi(high_bits), %reg + * or %reg, %lo(high_bits), %reg + * sllx %reg, 32, %reg + * or %reg, low_bits, %reg + */ + if (is_simm13(low_bits) && ((int)low_bits > 0)) { + sparc_emit_set_const64_quick2(high_bits, low_bits, + dest, 32, ctx); + return; + } + /* Oh well, we tried... Do a full 64-bit decomposition. */ ctx->tmp_1_used = true; - emit_set_const(high_part, tmp, ctx); - emit_set_const(low_part, dest, ctx); + emit_loadimm32(high_bits, tmp, ctx); + emit_loadimm32(low_bits, dest, ctx); emit_alu_K(SLLX, tmp, 32, ctx); emit(OR | RS1(dest) | RS2(tmp) | RD(dest), ctx); } -- 2.20.1