s390/vx: add support functions for in-kernel FPU use
authorHendrik Brueckner <brueckner@linux.vnet.ibm.com>
Wed, 18 Feb 2015 13:46:00 +0000 (14:46 +0100)
committerMartin Schwidefsky <schwidefsky@de.ibm.com>
Tue, 14 Jun 2016 14:54:11 +0000 (16:54 +0200)
Introduce the kernel_fpu_begin() and kernel_fpu_end() functions
to enclose any in-kernel use of FPU instructions and registers.
In enclosed sections, you can perform floating-point or vector
(SIMD) computations.  The functions take care of saving and
restoring FPU register contents and controls.

For usage details, see the guidelines in arch/s390/include/asm/fpu/api.h

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
arch/s390/include/asm/fpu/api.h
arch/s390/include/asm/fpu/types.h
arch/s390/kernel/Makefile
arch/s390/kernel/fpu.c [new file with mode: 0644]

index 5e04f3cbd320d78e963c7ae18639c94d4b83952c..78ba3ddb9e183f9890bdb87293aea23a4f481759 100644 (file)
@@ -1,6 +1,41 @@
 /*
  * In-kernel FPU support functions
  *
+ *
+ * Consider these guidelines before using in-kernel FPU functions:
+ *
+ *  1. Use kernel_fpu_begin() and kernel_fpu_end() to enclose all in-kernel
+ *     use of floating-point or vector registers and instructions.
+ *
+ *  2. For kernel_fpu_begin(), specify the vector register range you want to
+ *     use with the KERNEL_VXR_* constants. Consider these usage guidelines:
+ *
+ *     a) If your function typically runs in process-context, use the lower
+ *       half of the vector registers, for example, specify KERNEL_VXR_LOW.
+ *     b) If your function typically runs in soft-irq or hard-irq context,
+ *       prefer using the upper half of the vector registers, for example,
+ *       specify KERNEL_VXR_HIGH.
+ *
+ *     If you adhere to these guidelines, the vector registers of an
+ *     interrupted process context need not be saved and restored because
+ *     the register ranges are disjoint.
+ *
+ *     Also note that the __kernel_fpu_begin()/__kernel_fpu_end() functions
+ *     include logic to save and restore up to 16 vector registers at once.
+ *
+ *  3. You can nest kernel_fpu_begin()/kernel_fpu_end() by using different
+ *     struct kernel_fpu states.  Vector registers that are in use by outer
+ *     levels are saved and restored.  You can minimize the save and restore
+ *     effort by choosing disjoint vector register ranges.
+ *
+ *  4. To use vector floating-point instructions, specify the KERNEL_FPC
+ *     flag to save and restore floating-point controls in addition to any
+ *     vector register range.
+ *
+ *  5. To use floating-point registers and instructions only, specify the
+ *     KERNEL_FPR flag.  This flag triggers a save and restore of vector
+ *     registers V0 to V15 and floating-point controls.
+ *
  * Copyright IBM Corp. 2015
  * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
  */
@@ -8,6 +43,8 @@
 #ifndef _ASM_S390_FPU_API_H
 #define _ASM_S390_FPU_API_H
 
+#include <linux/preempt.h>
+
 void save_fpu_regs(void);
 
 static inline int test_fp_ctl(u32 fpc)
@@ -27,4 +64,42 @@ static inline int test_fp_ctl(u32 fpc)
        return rc;
 }
 
+/*
+ * Flags for kernel_fpu_begin().  The KERNEL_VXR_* values select groups of
+ * eight vector registers; the tmll/nill immediates in
+ * arch/s390/kernel/fpu.c hard-code these bit values.
+ */
+#define KERNEL_VXR_V0V7        0x001   /* V0..V7 */
+#define KERNEL_VXR_V8V15       0x002   /* V8..V15 */
+#define KERNEL_VXR_V16V23      0x004   /* V16..V23 */
+#define KERNEL_VXR_V24V31      0x008   /* V24..V31 */
+#define KERNEL_FPR             0x010   /* floating-point registers only */
+#define KERNEL_FPC             0x100   /* floating-point controls */
+
+/* Overlapping 16-register windows built from the groups above */
+#define KERNEL_VXR_LOW         (KERNEL_VXR_V0V7 | KERNEL_VXR_V8V15)
+#define KERNEL_VXR_MID         (KERNEL_VXR_V8V15 | KERNEL_VXR_V16V23)
+#define KERNEL_VXR_HIGH        (KERNEL_VXR_V16V23 | KERNEL_VXR_V24V31)
+
+/* Every register-selecting flag (KERNEL_FPC is not part of this mask) */
+#define KERNEL_FPU_MASK        (KERNEL_VXR_LOW | KERNEL_VXR_HIGH | KERNEL_FPR)
+
+struct kernel_fpu;
+
+/*
+ * Low-level entry points; both take a caller-provided save area.
+ *
+ * __kernel_fpu_begin(state, flags): flags is a combination of the
+ * KERNEL_VXR_*, KERNEL_FPR and KERNEL_FPC constants defined above.
+ * __kernel_fpu_end(state): must be passed the same state as the matching
+ * __kernel_fpu_begin() call.
+ *
+ * Note the functions below must be called with preemption disabled.
+ * Do not enable preemption before calling __kernel_fpu_end() to prevent
+ * corruption of an existing kernel FPU state.
+ *
+ * Prefer using the kernel_fpu_begin()/kernel_fpu_end() pair of functions.
+ */
+void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags);
+void __kernel_fpu_end(struct kernel_fpu *state);
+
+
+static inline void kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
+{
+       preempt_disable();      /* __kernel_fpu_begin() must run with preemption disabled */
+       __kernel_fpu_begin(state, flags);       /* preemption stays off until kernel_fpu_end() */
+}
+
+static inline void kernel_fpu_end(struct kernel_fpu *state)
+{
+       __kernel_fpu_end(state);        /* must complete before preemption is re-enabled */
+       preempt_enable();       /* pairs with preempt_disable() in kernel_fpu_begin() */
+}
+
 #endif /* _ASM_S390_FPU_API_H */
index fe937c9b64713c81ff9d4d8c14d6c14042868663..bce255ead72b400a63103362feff567d934ebc6e 100644 (file)
@@ -24,4 +24,14 @@ struct fpu {
 /* VX array structure for address operand constraints in inline assemblies */
 struct vx_array { __vector128 _[__NUM_VXRS]; };
 
+/*
+ * In-kernel FPU state structure
+ *
+ * Save area that a caller passes to kernel_fpu_begin()/kernel_fpu_end().
+ * The register save areas share storage: a given begin/end pair uses
+ * either fprs or vxrs, never both.
+ */
+struct kernel_fpu {
+       u32         mask;       /* per-CPU in-use flags found by __kernel_fpu_begin() */
+       u32         fpc;        /* floating-point control, saved with stfpc */
+       union {
+               freg_t fprs[__NUM_FPRS];        /* used on the KERNEL_FPR path (std/ld) */
+               __vector128 vxrs[__NUM_VXRS];   /* used on the vector path (vstm/vlm) */
+       };
+};
+
 #endif /* _ASM_S390_FPU_TYPES_H */
index 2f5586ab8a6acf69724cb5096c60d109a2b2b054..8d1419120bb73ba7bfdb7786130225d2eb1d6974 100644 (file)
@@ -45,7 +45,7 @@ obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
 obj-y  += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
 obj-y  += debug.o irq.o ipl.o dis.o diag.o sclp.o vdso.o
 obj-y  += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o
-obj-y  += runtime_instr.o cache.o dumpstack.o
+obj-y  += runtime_instr.o cache.o fpu.o dumpstack.o
 obj-y  += entry.o reipl.o relocate_kernel.o
 
 extra-y                                += head.o head64.o vmlinux.lds
diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c
new file mode 100644 (file)
index 0000000..81d1d18
--- /dev/null
@@ -0,0 +1,249 @@
+/*
+ * In-kernel vector facility support functions
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <asm/fpu/types.h>
+#include <asm/fpu/api.h>
+
+/*
+ * Per-CPU variable to maintain FPU register ranges that are in use
+ * by the kernel.  It holds an OR of KERNEL_* flags: __kernel_fpu_begin()
+ * ORs in the caller's flags and __kernel_fpu_end() writes back the value
+ * that __kernel_fpu_begin() recorded in state->mask.
+ */
+static DEFINE_PER_CPU(u32, kernel_fpu_state);
+
+#define KERNEL_FPU_STATE_MASK  (KERNEL_FPU_MASK|KERNEL_FPC) /* all flags tracked in kernel_fpu_state */
+
+
+/*
+ * __kernel_fpu_begin() - open a section of in-kernel FPU/vector register use
+ * @state: caller-provided save area; on return, state->mask holds the per-CPU
+ *         in-use flags found on entry and, for a nested call, the save area
+ *         holds the register contents those flags cover
+ * @flags: KERNEL_VXR_*, KERNEL_FPR and KERNEL_FPC flags describing what the
+ *         caller is going to use
+ *
+ * Must be called with preemption disabled (see asm/fpu/api.h).
+ */
+void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
+{
+       u32 mask;
+
+       if (!__this_cpu_read(kernel_fpu_state)) {
+               /*
+                * Save user space FPU state and register contents.  Multiple
+                * calls because of interruptions do not matter and return
+                * immediately.  This also sets CIF_FPU to lazy restore FP/VX
+                * register contents when returning to user space.
+                */
+               save_fpu_regs();
+       }
+
+       /*
+        * Update flags to use the vector facility for KERNEL_FPR.  Test the
+        * caller's flags here: state->mask is not valid until it is loaded
+        * from the per-CPU state below.
+        */
+       if (MACHINE_HAS_VX && (flags & KERNEL_FPR)) {
+               flags |= KERNEL_VXR_LOW | KERNEL_FPC;
+               flags &= ~KERNEL_FPR;
+       }
+
+       /* Save and update current kernel VX state */
+       state->mask = __this_cpu_read(kernel_fpu_state);
+       __this_cpu_or(kernel_fpu_state, flags & KERNEL_FPU_STATE_MASK);
+
+       /*
+        * If this is the first call to __kernel_fpu_begin(), no additional
+        * work is required.
+        */
+       if (!(state->mask & KERNEL_FPU_STATE_MASK))
+               return;
+
+       /*
+        * If KERNEL_FPR is still set, the vector facility is not available
+        * and, thus, save floating-point control and registers only.
+        */
+       if (state->mask & KERNEL_FPR) {
+               asm volatile("stfpc %0" : "=Q" (state->fpc));
+               asm volatile("std 0,%0" : "=Q" (state->fprs[0]));
+               asm volatile("std 1,%0" : "=Q" (state->fprs[1]));
+               asm volatile("std 2,%0" : "=Q" (state->fprs[2]));
+               asm volatile("std 3,%0" : "=Q" (state->fprs[3]));
+               asm volatile("std 4,%0" : "=Q" (state->fprs[4]));
+               asm volatile("std 5,%0" : "=Q" (state->fprs[5]));
+               asm volatile("std 6,%0" : "=Q" (state->fprs[6]));
+               asm volatile("std 7,%0" : "=Q" (state->fprs[7]));
+               asm volatile("std 8,%0" : "=Q" (state->fprs[8]));
+               asm volatile("std 9,%0" : "=Q" (state->fprs[9]));
+               asm volatile("std 10,%0" : "=Q" (state->fprs[10]));
+               asm volatile("std 11,%0" : "=Q" (state->fprs[11]));
+               asm volatile("std 12,%0" : "=Q" (state->fprs[12]));
+               asm volatile("std 13,%0" : "=Q" (state->fprs[13]));
+               asm volatile("std 14,%0" : "=Q" (state->fprs[14]));
+               asm volatile("std 15,%0" : "=Q" (state->fprs[15]));
+               return;
+       }
+
+       /*
+        * If this is a nested call to __kernel_fpu_begin(), check the saved
+        * state mask to save and later restore the vector registers that
+        * are already in use.  Let's start with checking floating-point
+        * controls.  stfpc has no index register, hence the "Q" constraint
+        * (same as in the KERNEL_FPR path above).
+        */
+       if (state->mask & KERNEL_FPC)
+               asm volatile("stfpc %0" : "=Q" (state->fpc));
+
+       /*
+        * Hand the asm below a scratch copy of the mask: the code at label
+        * 17 clears bits in it with nill, and state->mask itself must stay
+        * intact for __kernel_fpu_end().
+        */
+       mask = state->mask;
+
+       /* Test and save vector registers */
+       asm volatile (
+               /*
+                * Test if any vector register must be saved and, if so,
+                * test if all registers can be saved at once.
+                */
+               "       tmll    %[m],15\n"      /* KERNEL_VXR_MASK */
+               "       jz      20f\n"          /* no work -> done */
+               "       la      1,%[vxrs]\n"    /* load save area */
+               "       jo      18f\n"          /* -> save V0..V31 */
+
+               /*
+                * Test if V8..V23 can be saved at once... this speeds up
+                * for KERNEL_VXR_MID only. Otherwise continue to split the
+                * range of vector registers into two halves and test them
+                * separately.
+                */
+               "       tmll    %[m],6\n"       /* KERNEL_VXR_MID */
+               "       jo      17f\n"          /* -> save V8..V23 */
+
+               /* Test and save the first half of 16 vector registers */
+               "1:     tmll    %[m],3\n"       /* KERNEL_VXR_LOW */
+               "       jz      10f\n"          /* -> KERNEL_VXR_HIGH */
+               "       jo      2f\n"           /* 11 -> save V0..V15 */
+               "       brc     4,3f\n"         /* 01 -> save V0..V7  */
+               "       brc     2,4f\n"         /* 10 -> save V8..V15 */
+
+               /* Test and save the second half of 16 vector registers */
+               "10:    tmll    %[m],12\n"      /* KERNEL_VXR_HIGH */
+               "       jo      19f\n"          /* 11 -> save V16..V31 */
+               "       brc     4,11f\n"        /* 01 -> save V16..V23  */
+               "       brc     2,12f\n"        /* 10 -> save V24..V31 */
+               "       j       20f\n"          /* 00 -> done */
+
+               /*
+                * Below are the vstm combinations to save multiple vector
+                * registers at once.
+                */
+               "2:     .word   0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
+               "       j       10b\n"                  /* -> VXR_HIGH */
+               "3:     .word   0xe707,0x1000,0x003e\n" /* vstm 0,7,0(1) */
+               "       j       10b\n"                  /* -> VXR_HIGH */
+               "4:     .word   0xe78f,0x1080,0x003e\n" /* vstm 8,15,128(1) */
+               "       j       10b\n"                  /* -> VXR_HIGH */
+               "\n"
+               "11:    .word   0xe707,0x1100,0x0c3e\n" /* vstm 16,23,256(1) */
+               "       j       20f\n"                  /* -> done */
+               "12:    .word   0xe78f,0x1180,0x0c3e\n" /* vstm 24,31,384(1) */
+               "       j       20f\n"                  /* -> done */
+               "\n"
+               "17:    .word   0xe787,0x1080,0x043e\n" /* vstm 8,23,128(1) */
+               "       nill    %[m],249\n"             /* m &= ~VXR_MID    */
+               "       j       1b\n"                   /* -> VXR_LOW */
+               "\n"
+               "18:    .word   0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
+               "19:    .word   0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */
+               "20:"
+               : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs),
+                 [m] "+d" (mask)       /* read-write: nill modifies it */
+               :
+               : "1", "cc");
+}
+EXPORT_SYMBOL(__kernel_fpu_begin);
+
+/*
+ * __kernel_fpu_end() - close a section of in-kernel FPU/vector register use
+ * @state: save area that was passed to the matching __kernel_fpu_begin()
+ *
+ * Restores the registers and controls selected by state->mask and writes
+ * state->mask back to the per-CPU in-use state.  Must be called with
+ * preemption still disabled (see asm/fpu/api.h).
+ */
+void __kernel_fpu_end(struct kernel_fpu *state)
+{
+       u32 mask;
+
+       /* Just update the per-CPU state if there is nothing to restore */
+       if (!(state->mask & KERNEL_FPU_STATE_MASK))
+               goto update_fpu_state;
+
+       /*
+        * If KERNEL_FPR is specified, the vector facility is not available
+        * and, thus, restore floating-point control and registers only.
+        */
+       if (state->mask & KERNEL_FPR) {
+               asm volatile("lfpc %0" : : "Q" (state->fpc));
+               asm volatile("ld 0,%0" : : "Q" (state->fprs[0]));
+               asm volatile("ld 1,%0" : : "Q" (state->fprs[1]));
+               asm volatile("ld 2,%0" : : "Q" (state->fprs[2]));
+               asm volatile("ld 3,%0" : : "Q" (state->fprs[3]));
+               asm volatile("ld 4,%0" : : "Q" (state->fprs[4]));
+               asm volatile("ld 5,%0" : : "Q" (state->fprs[5]));
+               asm volatile("ld 6,%0" : : "Q" (state->fprs[6]));
+               asm volatile("ld 7,%0" : : "Q" (state->fprs[7]));
+               asm volatile("ld 8,%0" : : "Q" (state->fprs[8]));
+               asm volatile("ld 9,%0" : : "Q" (state->fprs[9]));
+               asm volatile("ld 10,%0" : : "Q" (state->fprs[10]));
+               asm volatile("ld 11,%0" : : "Q" (state->fprs[11]));
+               asm volatile("ld 12,%0" : : "Q" (state->fprs[12]));
+               asm volatile("ld 13,%0" : : "Q" (state->fprs[13]));
+               asm volatile("ld 14,%0" : : "Q" (state->fprs[14]));
+               asm volatile("ld 15,%0" : : "Q" (state->fprs[15]));
+               goto update_fpu_state;
+       }
+
+       /* Test and restore floating-point controls */
+       if (state->mask & KERNEL_FPC)
+               asm volatile("lfpc %0" : : "Q" (state->fpc));
+
+       /*
+        * Hand the asm below a scratch copy of the mask: the code at label
+        * 17 clears bits in it with nill, while the unmodified state->mask
+        * is still needed for the per-CPU update at the end.
+        */
+       mask = state->mask;
+
+       /* Test and restore (load) vector registers */
+       asm volatile (
+               /*
+                * Test if any vector registers must be loaded and, if so,
+                * test if all registers can be loaded at once.
+                */
+               "       tmll    %[m],15\n"      /* KERNEL_VXR_MASK */
+               "       jz      20f\n"          /* no work -> done */
+               "       la      1,%[vxrs]\n"    /* load save area address */
+               "       jo      18f\n"          /* -> load V0..V31 */
+
+               /*
+                * Test if V8..V23 can be restored at once... this speeds up
+                * for KERNEL_VXR_MID only. Otherwise continue to split the
+                * range of vector registers into two halves and test them
+                * separately.
+                */
+               "       tmll    %[m],6\n"       /* KERNEL_VXR_MID */
+               "       jo      17f\n"          /* -> load V8..V23 */
+
+               /* Test and load the first half of 16 vector registers */
+               "1:     tmll    %[m],3\n"       /* KERNEL_VXR_LOW */
+               "       jz      10f\n"          /* -> KERNEL_VXR_HIGH */
+               "       jo      2f\n"           /* 11 -> load V0..V15 */
+               "       brc     4,3f\n"         /* 01 -> load V0..V7  */
+               "       brc     2,4f\n"         /* 10 -> load V8..V15 */
+
+               /* Test and load the second half of 16 vector registers */
+               "10:    tmll    %[m],12\n"      /* KERNEL_VXR_HIGH */
+               "       jo      19f\n"          /* 11 -> load V16..V31 */
+               "       brc     4,11f\n"        /* 01 -> load V16..V23  */
+               "       brc     2,12f\n"        /* 10 -> load V24..V31 */
+               "       j       20f\n"          /* 00 -> done */
+
+               /*
+                * Below are the vlm combinations to load multiple vector
+                * registers at once.
+                */
+               "2:     .word   0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
+               "       j       10b\n"                  /* -> VXR_HIGH */
+               "3:     .word   0xe707,0x1000,0x0036\n" /* vlm 0,7,0(1) */
+               "       j       10b\n"                  /* -> VXR_HIGH */
+               "4:     .word   0xe78f,0x1080,0x0036\n" /* vlm 8,15,128(1) */
+               "       j       10b\n"                  /* -> VXR_HIGH */
+               "\n"
+               "11:    .word   0xe707,0x1100,0x0c36\n" /* vlm 16,23,256(1) */
+               "       j       20f\n"                  /* -> done */
+               "12:    .word   0xe78f,0x1180,0x0c36\n" /* vlm 24,31,384(1) */
+               "       j       20f\n"                  /* -> done */
+               "\n"
+               "17:    .word   0xe787,0x1080,0x0436\n" /* vlm 8,23,128(1) */
+               "       nill    %[m],249\n"             /* m &= ~VXR_MID    */
+               "       j       1b\n"                   /* -> VXR_LOW */
+               "\n"
+               "18:    .word   0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
+               "19:    .word   0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
+               "20:"
+               : [m] "+d" (mask)       /* read-write: nill modifies it */
+               : [vxrs] "Q" (*(struct vx_array *) &state->vxrs)
+               : "1", "cc");
+
+update_fpu_state:
+       /* Update current kernel VX state */
+       __this_cpu_write(kernel_fpu_state, state->mask);
+}
+EXPORT_SYMBOL(__kernel_fpu_end);