crypto: arm64/crc32 - accelerated support based on x86 SSE implementation

author Ard Biesheuvel <ard.biesheuvel@linaro.org>

Mon, 5 Dec 2016 18:42:27 +0000 (18:42 +0000)

committer Herbert Xu <herbert@gondor.apana.org.au>

Wed, 7 Dec 2016 12:01:22 +0000 (20:01 +0800)
author Ard Biesheuvel <ard.biesheuvel@linaro.org>
Mon, 5 Dec 2016 18:42:27 +0000 (18:42 +0000)
committer Herbert Xu <herbert@gondor.apana.org.au>
Wed, 7 Dec 2016 12:01:22 +0000 (20:01 +0800)
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig

index f1e6dd0fc174ab7445d185c7a6bedd7b2f5f81c5..450a85df041a668ebd6a0bf87dbbe68c7ca5d145 100644 (file)
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -36,6 +36,11 @@ config CRYPTO_CRCT10DIF_ARM64_CE
         depends on KERNEL_MODE_NEON && CRC_T10DIF
         select CRYPTO_HASH
  
+config CRYPTO_CRC32_ARM64_CE
+       tristate "CRC32 and CRC32C digest algorithms using PMULL instructions"
+       depends on KERNEL_MODE_NEON && CRC32
+       select CRYPTO_HASH
+
  config CRYPTO_AES_ARM64_CE
         tristate "AES core cipher using ARMv8 Crypto Extensions"
         depends on ARM64 && KERNEL_MODE_NEON
@@ -66,4 +71,5 @@ config CRYPTO_CRC32_ARM64
         tristate "CRC32 and CRC32C using optional ARMv8 instructions"
         depends on ARM64
         select CRYPTO_HASH
+
  endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile

index d3f1ba6d4771ac5b96a937c040c9deccac83055c..aa8888d7b744d29e4403ddc92b933891ddb157ed 100644 (file)
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -20,6 +20,9 @@ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
  obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
  crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
  
+obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
+crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
+
  obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
  CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
  
diff --git a/arch/arm64/crypto/crc32-ce-core.S b/arch/arm64/crypto/crc32-ce-core.S

new file mode 100644 (file)

index 0000000..18f5a84
--- /dev/null
+++ b/arch/arm64/crypto/crc32-ce-core.S
@@ -0,0 +1,266 @@
+/*
+ * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ *           Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+       .text
+       .align          6
+       .cpu            generic+crypto+crc
+
+.Lcrc32_constants:
+       /*
+        * [(x4*128+32 mod P(x) << 32)]' << 1   = 0x154442bd4
+        * #define CONSTANT_R1  0x154442bd4LL
+        *
+        * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
+        * #define CONSTANT_R2  0x1c6e41596LL
+        */
+       .octa           0x00000001c6e415960000000154442bd4      // offset 0: R2 (high 64 bits), R1 (low 64 bits)
+
+       /*
+        * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
+        * #define CONSTANT_R3  0x1751997d0LL
+        *
+        * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
+        * #define CONSTANT_R4  0x0ccaa009eLL
+        */
+       .octa           0x00000000ccaa009e00000001751997d0      // offset 16: R4 (high 64 bits), R3 (low 64 bits)
+
+       /*
+        * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
+        * #define CONSTANT_R5  0x163cd6124LL
+        */
+       .quad           0x0000000163cd6124      // offset 32: R5
+       .quad           0x00000000FFFFFFFF      // offset 40: mask keeping the low 32 bits
+
+       /*
+        * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+        *
+        * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
+        *                                                      = 0x1F7011641LL
+        * #define CONSTANT_RU  0x1F7011641LL
+        */
+       .octa           0x00000001F701164100000001DB710641      // offset 48: RU (high 64 bits), full polynomial (low 64 bits)
+
+.Lcrc32c_constants:                                            // CRC32C table, same layout and offsets as .Lcrc32_constants
+       .octa           0x000000009e4addf800000000740eef02      // offset 0: loaded before loop_64
+       .octa           0x000000014cd00bd600000000f20c0dfe      // offset 16: loaded at less_64, also used by loop_16 and fold_64
+       .quad           0x00000000dd45aab8                      // offset 32: loaded for the final 32-bit fold
+       .quad           0x00000000FFFFFFFF                      // offset 40: mask keeping the low 32 bits
+       .octa           0x00000000dea713f10000000105ec76f0      // offset 48: loaded for the Barrett reduction step
+
+       vCONSTANT       .req    v0      // v0/d0/q0: first the initial CRC, then the current folding constants
+       dCONSTANT       .req    d0
+       qCONSTANT       .req    q0
+
+       BUF             .req    x0      // first argument: input buffer pointer, post-incremented by the loads
+       LEN             .req    x1      // second argument: byte count
+       CRC             .req    x2      // third argument: initial CRC value
+
+       vzr             .req    v9      // zeroed with movi on entry and used as an all-zero operand
+
+       /**
+        * Calculate crc32
+        * BUF - buffer
+        * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
+        * CRC - initial crc32
+        * return crc32 in w0
+        * uint crc32_pmull_le(unsigned char const *buffer,
+        *                     size_t len, uint crc32)
+        */
+ENTRY(crc32_pmull_le)
+       adr             x3, .Lcrc32_constants           // x3 = base of the CRC32 constant table
+       b               0f                              // continue in the body shared with crc32c_pmull_le
+
+ENTRY(crc32c_pmull_le)
+       adr             x3, .Lcrc32c_constants          // x3 = base of the CRC32C constant table
+
+0:     bic             LEN, LEN, #15                   // clear the low 4 bits: LEN &= ~15
+       ld1             {v1.16b-v4.16b}, [BUF], #0x40   // v1-v4 = first 64 bytes, BUF += 64
+       movi            vzr.16b, #0                     // vzr = all zeroes
+       fmov            dCONSTANT, CRC                  // NOTE(review): copies all 64 bits of x2 although the C prototype passes a u32; confirm the upper half is always zero
+       eor             v1.16b, v1.16b, vCONSTANT.16b   // xor the initial CRC into the first 16 bytes
+       sub             LEN, LEN, #0x40                 // account for the 64 bytes loaded above
+       cmp             LEN, #0x40
+       b.lt            less_64                         // fewer than 64 bytes left: skip loop_64
+
+       ldr             qCONSTANT, [x3]                 // constants at table offset 0
+
+loop_64:               /* 64 bytes Full cache line folding */
+       sub             LEN, LEN, #0x40
+
+       pmull2          v5.1q, v1.2d, vCONSTANT.2d      // high 64-bit halves of v1-v4 times high constant
+       pmull2          v6.1q, v2.2d, vCONSTANT.2d
+       pmull2          v7.1q, v3.2d, vCONSTANT.2d
+       pmull2          v8.1q, v4.2d, vCONSTANT.2d
+
+       pmull           v1.1q, v1.1d, vCONSTANT.1d      // low 64-bit halves of v1-v4 times low constant
+       pmull           v2.1q, v2.1d, vCONSTANT.1d
+       pmull           v3.1q, v3.1d, vCONSTANT.1d
+       pmull           v4.1q, v4.1d, vCONSTANT.1d
+
+       eor             v1.16b, v1.16b, v5.16b          // combine both products, then reuse v5-v8
+       ld1             {v5.16b}, [BUF], #0x10          // for the next 64 bytes of input
+       eor             v2.16b, v2.16b, v6.16b
+       ld1             {v6.16b}, [BUF], #0x10
+       eor             v3.16b, v3.16b, v7.16b
+       ld1             {v7.16b}, [BUF], #0x10
+       eor             v4.16b, v4.16b, v8.16b
+       ld1             {v8.16b}, [BUF], #0x10
+
+       eor             v1.16b, v1.16b, v5.16b          // xor the new input into the folded state
+       eor             v2.16b, v2.16b, v6.16b
+       eor             v3.16b, v3.16b, v7.16b
+       eor             v4.16b, v4.16b, v8.16b
+
+       cmp             LEN, #0x40
+       b.ge            loop_64                         // repeat while at least 64 bytes remain
+
+less_64:               /* Folding cache line into 128bit */
+       ldr             qCONSTANT, [x3, #16]            // constants at table offset 16
+
+       pmull2          v5.1q, v1.2d, vCONSTANT.2d      // fold v1 into v2
+       pmull           v1.1q, v1.1d, vCONSTANT.1d
+       eor             v1.16b, v1.16b, v5.16b
+       eor             v1.16b, v1.16b, v2.16b
+
+       pmull2          v5.1q, v1.2d, vCONSTANT.2d      // fold the result into v3
+       pmull           v1.1q, v1.1d, vCONSTANT.1d
+       eor             v1.16b, v1.16b, v5.16b
+       eor             v1.16b, v1.16b, v3.16b
+
+       pmull2          v5.1q, v1.2d, vCONSTANT.2d      // fold the result into v4
+       pmull           v1.1q, v1.1d, vCONSTANT.1d
+       eor             v1.16b, v1.16b, v5.16b
+       eor             v1.16b, v1.16b, v4.16b
+
+       cbz             LEN, fold_64                    // no 16-byte blocks left over
+
+loop_16:               /* Folding rest buffer into 128bit */
+       subs            LEN, LEN, #0x10                 // sets the flags tested by b.ne below
+
+       ld1             {v2.16b}, [BUF], #0x10          // next 16 bytes of input
+       pmull2          v5.1q, v1.2d, vCONSTANT.2d
+       pmull           v1.1q, v1.1d, vCONSTANT.1d
+       eor             v1.16b, v1.16b, v5.16b
+       eor             v1.16b, v1.16b, v2.16b
+
+       b.ne            loop_16                         // loop until LEN reaches zero
+
+fold_64:
+       /* perform the last 64 bit fold, also adds 32 zeroes
+        * to the input stream */
+       ext             v2.16b, v1.16b, v1.16b, #8
+       pmull2          v2.1q, v2.2d, vCONSTANT.2d
+       ext             v1.16b, v1.16b, vzr.16b, #8
+       eor             v1.16b, v1.16b, v2.16b
+
+       /* final 32-bit fold */
+       ldr             dCONSTANT, [x3, #32]            // 8-byte constant at table offset 32
+       ldr             d3, [x3, #40]                   // 8-byte mask at table offset 40 (0x00000000FFFFFFFF)
+
+       ext             v2.16b, v1.16b, vzr.16b, #4
+       and             v1.16b, v1.16b, v3.16b
+       pmull           v1.1q, v1.1d, vCONSTANT.1d
+       eor             v1.16b, v1.16b, v2.16b
+
+       /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+       ldr             qCONSTANT, [x3, #48]            // 16-byte constant pair at table offset 48
+
+       and             v2.16b, v1.16b, v3.16b
+       ext             v2.16b, vzr.16b, v2.16b, #8
+       pmull2          v2.1q, v2.2d, vCONSTANT.2d
+       and             v2.16b, v2.16b, v3.16b
+       pmull           v2.1q, v2.1d, vCONSTANT.1d
+       eor             v1.16b, v1.16b, v2.16b
+       mov             w0, v1.s[1]                     // return value = 32-bit lane 1 of v1
+
+       ret
+ENDPROC(crc32_pmull_le)
+ENDPROC(crc32c_pmull_le)
+
+       .macro          __crc32, c                      // \c is pasted into the mnemonic: crc32x or crc32cx etc.
+0:     subs            x2, x2, #16                     // x2 = remaining length; is a full 16-byte chunk left?
+       b.mi            8f                              // no: go handle the tail of fewer than 16 bytes
+       ldp             x3, x4, [x1], #16               // load 16 bytes from x1 and advance it
+CPU_BE(        rev             x3, x3          )
+CPU_BE(        rev             x4, x4          )
+       crc32\c\()x     w0, w0, x3                      // w0 = running CRC, updated 8 bytes at a time
+       crc32\c\()x     w0, w0, x4
+       b.ne            0b                              // still the flags from subs: loop while x2 != 0
+       ret                                             // length was an exact multiple of 16
+
+8:     tbz             x2, #3, 4f                      // x2 is negative here; bits 3..0 still equal the tail length
+       ldr             x3, [x1], #8                    // bit 3 set: consume 8 bytes
+CPU_BE(        rev             x3, x3          )
+       crc32\c\()x     w0, w0, x3
+4:     tbz             x2, #2, 2f
+       ldr             w3, [x1], #4                    // bit 2 set: consume 4 bytes
+CPU_BE(        rev             w3, w3          )
+       crc32\c\()w     w0, w0, w3
+2:     tbz             x2, #1, 1f
+       ldrh            w3, [x1], #2                    // bit 1 set: consume 2 bytes
+CPU_BE(        rev16           w3, w3          )
+       crc32\c\()h     w0, w0, w3
+1:     tbz             x2, #0, 0f
+       ldrb            w3, [x1]                        // bit 0 set: consume the last byte
+       crc32\c\()b     w0, w0, w3
+0:     ret                                             // result is returned in w0
+       .endm
+
+       .align          5
+ENTRY(crc32_armv8_le)                  // C prototype: u32 crc32_armv8_le(u32 init_crc, const u8 buf[], size_t len)
+       __crc32                         // empty macro argument selects the crc32x/w/h/b instructions
+ENDPROC(crc32_armv8_le)
+
+       .align          5
+ENTRY(crc32c_armv8_le)                 // C prototype: u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], size_t len)
+       __crc32         c               // argument c selects the crc32cx/w/h/b instructions
+ENDPROC(crc32c_armv8_le)
diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c

new file mode 100644 (file)

index 0000000..8594127
--- /dev/null
+++ b/arch/arm64/crypto/crc32-ce-glue.c
@@ -0,0 +1,212 @@
+/*
+ * Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/crc32.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/unaligned.h>
+
+#define PMULL_MIN_LEN          64L     /* minimum size of buffer for
+                                        * crc32_pmull_le/crc32c_pmull_le */
+#define SCALE_F                        16L     /* size of NEON register */
+
+asmlinkage u32 crc32_pmull_le(const u8 buf[], u64 len, u32 init_crc);
+asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], size_t len);
+
+asmlinkage u32 crc32c_pmull_le(const u8 buf[], u64 len, u32 init_crc);
+asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], size_t len);
+
+static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], size_t len);
+static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], size_t len);
+
+/*
+ * Transform constructor for "crc32": the per-transform context is a single
+ * u32 holding the default seed, which starts out as zero.
+ */
+static int crc32_pmull_cra_init(struct crypto_tfm *tfm)
+{
+       u32 *seed = crypto_tfm_ctx(tfm);
+
+       seed[0] = 0;
+
+       return 0;
+}
+
+/*
+ * Transform constructor for "crc32c": the per-transform context is a single
+ * u32 holding the default seed, which starts out with all bits set.
+ */
+static int crc32c_pmull_cra_init(struct crypto_tfm *tfm)
+{
+       u32 *seed = crypto_tfm_ctx(tfm);
+
+       seed[0] = ~0;
+
+       return 0;
+}
+
+/*
+ * Install a caller-supplied seed. The key must be exactly four bytes and is
+ * interpreted as a little-endian u32; any other length is rejected with
+ * -EINVAL after flagging CRYPTO_TFM_RES_BAD_KEY_LEN on the transform.
+ */
+static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key,
+                             unsigned int keylen)
+{
+       u32 *mctx = crypto_shash_ctx(hash);
+
+       if (keylen != sizeof(u32)) {
+               crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+               return -EINVAL;
+       }
+
+       /*
+        * The key is a plain byte pointer with no alignment guarantee, so
+        * read it with the unaligned accessor rather than casting away
+        * const and dereferencing it as a __le32.
+        */
+       *mctx = get_unaligned_le32(key);
+       return 0;
+}
+
+/*
+ * Start a new digest: copy the transform's seed into the per-request
+ * descriptor state. Shared by the crc32 and crc32c algorithms.
+ */
+static int crc32_pmull_init(struct shash_desc *desc)
+{
+       const u32 *seed = crypto_shash_ctx(desc->tfm);
+       u32 *state = shash_desc_ctx(desc);
+
+       state[0] = seed[0];
+
+       return 0;
+}
+
+/*
+ * Feed length bytes at data into the running CRC32 held in the descriptor.
+ *
+ * The input is processed in up to three pieces: a head that brings data up
+ * to a 16-byte boundary, a middle part of whole 16-byte blocks handled by
+ * the PMULL routine (only when at least PMULL_MIN_LEN bytes are available),
+ * and a tail. Head and tail go through the fallback routine chosen at
+ * module init.
+ */
+static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
+                        unsigned int length)
+{
+       u32 *state = shash_desc_ctx(desc);
+       unsigned int misalign = (u64)data % SCALE_F;
+       unsigned int chunk;
+
+       /* Head: bytes in front of the next 16-byte boundary. */
+       if (misalign != 0) {
+               chunk = min_t(u32, length, SCALE_F - misalign);
+               *state = fallback_crc32(*state, data, chunk);
+               data += chunk;
+               length -= chunk;
+       }
+
+       /* Middle: whole 16-byte blocks, NEON enabled around the call. */
+       if (length >= PMULL_MIN_LEN) {
+               chunk = round_down(length, SCALE_F);
+
+               kernel_neon_begin_partial(10);
+               *state = crc32_pmull_le(data, chunk, *state);
+               kernel_neon_end();
+
+               data += chunk;
+               length -= chunk;
+       }
+
+       /* Tail: whatever the steps above did not consume. */
+       if (length != 0)
+               *state = fallback_crc32(*state, data, length);
+
+       return 0;
+}
+
+/*
+ * Feed length bytes at data into the running CRC32C held in the descriptor.
+ *
+ * The input is processed in up to three pieces: a head that brings data up
+ * to a 16-byte boundary, a middle part of whole 16-byte blocks handled by
+ * the PMULL routine (only when at least PMULL_MIN_LEN bytes are available),
+ * and a tail. Head and tail go through the fallback routine chosen at
+ * module init.
+ */
+static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
+                        unsigned int length)
+{
+       u32 *state = shash_desc_ctx(desc);
+       unsigned int misalign = (u64)data % SCALE_F;
+       unsigned int chunk;
+
+       /* Head: bytes in front of the next 16-byte boundary. */
+       if (misalign != 0) {
+               chunk = min_t(u32, length, SCALE_F - misalign);
+               *state = fallback_crc32c(*state, data, chunk);
+               data += chunk;
+               length -= chunk;
+       }
+
+       /* Middle: whole 16-byte blocks, NEON enabled around the call. */
+       if (length >= PMULL_MIN_LEN) {
+               chunk = round_down(length, SCALE_F);
+
+               kernel_neon_begin_partial(10);
+               *state = crc32c_pmull_le(data, chunk, *state);
+               kernel_neon_end();
+
+               data += chunk;
+               length -= chunk;
+       }
+
+       /* Tail: whatever the steps above did not consume. */
+       if (length != 0)
+               *state = fallback_crc32c(*state, data, length);
+
+       return 0;
+}
+
+/*
+ * Emit the CRC32 digest: the descriptor state is written to out as a
+ * little-endian u32, with no alignment requirement on out.
+ */
+static int crc32_pmull_final(struct shash_desc *desc, u8 *out)
+{
+       const u32 *state = shash_desc_ctx(desc);
+       u32 digest = state[0];
+
+       put_unaligned_le32(digest, out);
+
+       return 0;
+}
+
+/*
+ * Emit the CRC32C digest: the bitwise complement of the descriptor state is
+ * written to out as a little-endian u32, with no alignment requirement on
+ * out.
+ */
+static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
+{
+       const u32 *state = shash_desc_ctx(desc);
+       u32 digest = ~state[0];
+
+       put_unaligned_le32(digest, out);
+
+       return 0;
+}
+
+static struct shash_alg crc32_pmull_algs[] = { {       /* entry 0: "crc32" */
+       .setkey                 = crc32_pmull_setkey,
+       .init                   = crc32_pmull_init,
+       .update                 = crc32_pmull_update,
+       .final                  = crc32_pmull_final,
+       .descsize               = sizeof(u32),          /* per-request state: the running CRC */
+       .digestsize             = sizeof(u32),
+
+       .base.cra_ctxsize       = sizeof(u32),          /* per-transform state: the seed */
+       .base.cra_init          = crc32_pmull_cra_init, /* seed defaults to 0 */
+       .base.cra_name          = "crc32",
+       .base.cra_driver_name   = "crc32-arm64-ce",
+       .base.cra_priority      = 200,
+       .base.cra_blocksize     = 1,
+       .base.cra_module        = THIS_MODULE,
+}, {                                                   /* entry 1: "crc32c" */
+       .setkey                 = crc32_pmull_setkey,   /* shared with crc32 */
+       .init                   = crc32_pmull_init,     /* shared with crc32 */
+       .update                 = crc32c_pmull_update,
+       .final                  = crc32c_pmull_final,   /* writes the inverted state */
+       .descsize               = sizeof(u32),
+       .digestsize             = sizeof(u32),
+
+       .base.cra_ctxsize       = sizeof(u32),
+       .base.cra_init          = crc32c_pmull_cra_init, /* seed defaults to ~0 */
+       .base.cra_name          = "crc32c",
+       .base.cra_driver_name   = "crc32c-arm64-ce",
+       .base.cra_priority      = 200,
+       .base.cra_blocksize     = 1,
+       .base.cra_module        = THIS_MODULE,
+} };
+
+/*
+ * Module init: choose the routines used for the unaligned head and the
+ * short tail of each update, then register both shash algorithms.
+ */
+static int __init crc32_pmull_mod_init(void)
+{
+       /* Default to the generic library implementations ... */
+       fallback_crc32 = crc32_le;
+       fallback_crc32c = __crc32c_le;
+
+       /* ... and switch to the CRC32 instructions when the CPU has them. */
+       if (elf_hwcap & HWCAP_CRC32) {
+               fallback_crc32 = crc32_armv8_le;
+               fallback_crc32c = crc32c_armv8_le;
+       }
+
+       return crypto_register_shashes(crc32_pmull_algs,
+                                      ARRAY_SIZE(crc32_pmull_algs));
+}
+
+static void __exit crc32_pmull_mod_exit(void)  /* module exit: undo the registration done at init */
+{
+       crypto_unregister_shashes(crc32_pmull_algs,
+                                 ARRAY_SIZE(crc32_pmull_algs));
+}
+
+module_cpu_feature_match(PMULL, crc32_pmull_mod_init);
+module_exit(crc32_pmull_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
author	Ard Biesheuvel <ard.biesheuvel@linaro.org>
	Mon, 5 Dec 2016 18:42:27 +0000 (18:42 +0000)
committer	Herbert Xu <herbert@gondor.apana.org.au>
	Wed, 7 Dec 2016 12:01:22 +0000 (20:01 +0800)
arch/arm64/crypto/Kconfig		patch \| blob \| blame \| history
arch/arm64/crypto/Makefile		patch \| blob \| blame \| history
arch/arm64/crypto/crc32-ce-core.S	[new file with mode: 0644]	patch \| blob
arch/arm64/crypto/crc32-ce-glue.c	[new file with mode: 0644]	patch \| blob