From: Ard Biesheuvel Date: Thu, 13 Jul 2017 17:16:01 +0000 (+0100) Subject: md/raid6: implement recovery using ARM NEON intrinsics X-Git-Url: https://git.stricted.de/?a=commitdiff_plain;h=6ec4e2514decd6fb4782a9364fa71d6244d05af4;p=GitHub%2FLineageOS%2Fandroid_kernel_motorola_exynos9610.git md/raid6: implement recovery using ARM NEON intrinsics Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 30f945329818..583cdd3d49ca 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -121,6 +121,7 @@ extern const struct raid6_recov_calls raid6_recov_ssse3; extern const struct raid6_recov_calls raid6_recov_avx2; extern const struct raid6_recov_calls raid6_recov_avx512; extern const struct raid6_recov_calls raid6_recov_s390xc; +extern const struct raid6_recov_calls raid6_recov_neon; extern const struct raid6_calls raid6_neonx1; extern const struct raid6_calls raid6_neonx2; diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 3057011f5599..a93adf6dcfb2 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -5,7 +5,7 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o -raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o +raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o raid6_pq-$(CONFIG_TILEGX) += tilegx8.o raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o @@ -26,7 +26,9 @@ NEON_FLAGS := -ffreestanding ifeq ($(ARCH),arm) NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon endif +CFLAGS_recov_neon_inner.o += $(NEON_FLAGS) ifeq ($(ARCH),arm64) +CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 7857049fd7d3..476994723258 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -112,6 +112,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = { #endif #ifdef CONFIG_S390 &raid6_recov_s390xc, +#endif +#if defined(CONFIG_KERNEL_MODE_NEON) + &raid6_recov_neon, #endif &raid6_recov_intx1, NULL diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c new file mode 100644 index 000000000000..eeb5c4065b92 --- /dev/null +++ b/lib/raid6/recov_neon.c @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Copyright (C) 2017 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include + +#ifdef __KERNEL__ +#include +#else +#define kernel_neon_begin() +#define kernel_neon_end() +#define cpu_has_neon() (1) +#endif + +static int raid6_has_neon(void) +{ + return cpu_has_neon(); +} + +void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, + uint8_t *dq, const uint8_t *pbmul, + const uint8_t *qmul); + +void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, + const uint8_t *qmul); + +static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_neon_begin(); + __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul); + kernel_neon_end(); +} + +static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_neon_begin(); + __raid6_datap_recov_neon(bytes, p, q, dq, qmul); + kernel_neon_end(); +} + +const struct raid6_recov_calls raid6_recov_neon = { + .data2 = raid6_2data_recov_neon, + .datap = raid6_datap_recov_neon, + .valid = raid6_has_neon, + .name = "neon", + .priority = 10, +}; diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c new file mode 100644 index 000000000000..8cd20c9f834a --- /dev/null +++ b/lib/raid6/recov_neon_inner.c @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Copyright (C) 2017 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include + +static const uint8x16_t x0f = { + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, +}; + +#ifdef CONFIG_ARM +/* + * AArch32 does not provide this intrinsic natively because it does not + * implement the underlying instruction. AArch32 only provides a 64-bit + * wide vtbl.8 instruction, so use that instead. + */ +static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b) +{ + union { + uint8x16_t val; + uint8x8x2_t pair; + } __a = { a }; + + return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)), + vtbl2_u8(__a.pair, vget_high_u8(b))); +} +#endif + +void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, + uint8_t *dq, const uint8_t *pbmul, + const uint8_t *qmul) +{ + uint8x16_t pm0 = vld1q_u8(pbmul); + uint8x16_t pm1 = vld1q_u8(pbmul + 16); + uint8x16_t qm0 = vld1q_u8(qmul); + uint8x16_t qm1 = vld1q_u8(qmul + 16); + + /* + * while ( bytes-- ) { + * uint8_t px, qx, db; + * + * px = *p ^ *dp; + * qx = qmul[*q ^ *dq]; + * *dq++ = db = pbmul[px] ^ qx; + * *dp++ = db ^ px; + * p++; q++; + * } + */ + + while (bytes) { + uint8x16_t vx, vy, px, qx, db; + + px = veorq_u8(vld1q_u8(p), vld1q_u8(dp)); + vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq)); + + vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4); + vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f)); + vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f)); + qx = veorq_u8(vx, vy); + + vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4); + vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f)); + vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f)); + vx = veorq_u8(vx, vy); + db = veorq_u8(vx, qx); + + vst1q_u8(dq, db); + vst1q_u8(dp, veorq_u8(db, px)); + + bytes -= 16; + p += 16; + q += 16; + dp += 16; + dq += 16; + } +} + +void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, + const uint8_t *qmul) +{ + uint8x16_t qm0 = vld1q_u8(qmul); + uint8x16_t qm1 = vld1q_u8(qmul + 16); + + /* + * while (bytes--) { + * *p++ ^= *dq = qmul[*q ^ *dq]; + * q++; dq++; + * } + */ + + while (bytes) { + uint8x16_t vx, vy; + + vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq)); + + vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4); + vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f)); + vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f)); + vx = veorq_u8(vx, vy); + vy = veorq_u8(vx, vld1q_u8(p)); + + vst1q_u8(dq, vx); + vst1q_u8(p, vy); + + bytes -= 16; + p += 16; + q += 16; + dq += 16; + } +}