s390/kernel: lazy restore fpu registers
author Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Wed, 10 Jun 2015 10:53:42 +0000 (12:53 +0200)
committer Martin Schwidefsky <schwidefsky@de.ibm.com>
Wed, 22 Jul 2015 07:58:01 +0000 (09:58 +0200)
Improve the save and restore behavior of FPU register contents to use the
vector extension within the kernel.

The kernel does not use floating-point or vector registers and, therefore,
saving and restoring the FPU register contents is performed only when
handling signals or switching processes.  To prepare for using vector
instructions and vector registers within the kernel, enhance the save
behavior and implement a lazy restore at return to user space from a
system call or interrupt.

To implement the lazy restore, save_fpu_regs() sets a CPU information
flag, CIF_FPU, to indicate that the FPU registers must be restored.
Saving the registers and setting CIF_FPU are performed in an atomic
fashion to be interrupt-safe.  When the kernel wants to use the vector
extension or wants to change the FPU register state for a task during
signal handling, save_fpu_regs() must be called first.  The CIF_FPU flag
is also set at process switch.  At return to user space, the FPU state is
restored.  In particular, the FPU state includes the floating-point or
vector register contents, as well as the vector-enablement and
floating-point controls.  Restoring the FPU state and clearing CIF_FPU
are also performed in an atomic fashion.

For KVM, the restore of the FPU register state is performed when restoring
the general-purpose guest registers before the SIE instruction is started.
Because the path towards the SIE instruction is interruptible, the CIF_FPU
flag must be checked again right before going into SIE.  If set, the guest
registers must be reloaded again by re-entering the outer SIE loop.  This
is the same behavior as if the SIE critical section were interrupted.

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
15 files changed:
arch/s390/include/asm/ctl_reg.h
arch/s390/include/asm/fpu-internal.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/processor.h
arch/s390/include/asm/switch_to.h
arch/s390/kernel/asm-offsets.c
arch/s390/kernel/compat_signal.c
arch/s390/kernel/entry.S
arch/s390/kernel/nmi.c
arch/s390/kernel/process.c
arch/s390/kernel/ptrace.c
arch/s390/kernel/s390_ksyms.c
arch/s390/kernel/signal.c
arch/s390/kernel/traps.c
arch/s390/kvm/kvm-s390.c

index d7697ab802f6c94813a27394baa255fa26a93ddc..17a3735768681d98290ab51527efe8eed9fcd57e 100644 (file)
@@ -46,6 +46,8 @@ static inline void __ctl_clear_bit(unsigned int cr, unsigned int bit)
        __ctl_load(reg, cr, cr);
 }
 
+void __ctl_set_vx(void);
+
 void smp_ctl_set_bit(int cr, int bit);
 void smp_ctl_clear_bit(int cr, int bit);
 
index cc44c75fc4f741ebf7d97ae2ccd554f0298aba0f..237f8fcbe46b410dc66bb71a5ae86c0d8ff896af 100644 (file)
@@ -28,9 +28,14 @@ struct fpu {
        };
 };
 
+void save_fpu_regs(struct fpu *fpu);
+
 #define is_vx_fpu(fpu) (!!((fpu)->flags & FPU_USE_VX))
 #define is_vx_task(tsk) (!!((tsk)->thread.fpu.flags & FPU_USE_VX))
 
+/* VX array structure for address operand constraints in inline assemblies */
+struct vx_array { __vector128 _[__NUM_VXRS]; };
+
 static inline int test_fp_ctl(u32 fpc)
 {
        u32 orig_fpc;
@@ -48,76 +53,6 @@ static inline int test_fp_ctl(u32 fpc)
        return rc;
 }
 
-static inline void save_fp_ctl(u32 *fpc)
-{
-       asm volatile(
-               "       stfpc   %0\n"
-               : "+Q" (*fpc));
-}
-
-static inline int restore_fp_ctl(u32 *fpc)
-{
-       int rc;
-
-       asm volatile(
-               "       lfpc    %1\n"
-               "0:     la      %0,0\n"
-               "1:\n"
-               : "=d" (rc) : "Q" (*fpc), "0" (-EINVAL));
-       return rc;
-}
-
-static inline void save_fp_regs(freg_t *fprs)
-{
-       asm volatile("std 0,%0" : "=Q" (fprs[0]));
-       asm volatile("std 2,%0" : "=Q" (fprs[2]));
-       asm volatile("std 4,%0" : "=Q" (fprs[4]));
-       asm volatile("std 6,%0" : "=Q" (fprs[6]));
-       asm volatile("std 1,%0" : "=Q" (fprs[1]));
-       asm volatile("std 3,%0" : "=Q" (fprs[3]));
-       asm volatile("std 5,%0" : "=Q" (fprs[5]));
-       asm volatile("std 7,%0" : "=Q" (fprs[7]));
-       asm volatile("std 8,%0" : "=Q" (fprs[8]));
-       asm volatile("std 9,%0" : "=Q" (fprs[9]));
-       asm volatile("std 10,%0" : "=Q" (fprs[10]));
-       asm volatile("std 11,%0" : "=Q" (fprs[11]));
-       asm volatile("std 12,%0" : "=Q" (fprs[12]));
-       asm volatile("std 13,%0" : "=Q" (fprs[13]));
-       asm volatile("std 14,%0" : "=Q" (fprs[14]));
-       asm volatile("std 15,%0" : "=Q" (fprs[15]));
-}
-
-static inline void restore_fp_regs(freg_t *fprs)
-{
-       asm volatile("ld 0,%0" : : "Q" (fprs[0]));
-       asm volatile("ld 2,%0" : : "Q" (fprs[2]));
-       asm volatile("ld 4,%0" : : "Q" (fprs[4]));
-       asm volatile("ld 6,%0" : : "Q" (fprs[6]));
-       asm volatile("ld 1,%0" : : "Q" (fprs[1]));
-       asm volatile("ld 3,%0" : : "Q" (fprs[3]));
-       asm volatile("ld 5,%0" : : "Q" (fprs[5]));
-       asm volatile("ld 7,%0" : : "Q" (fprs[7]));
-       asm volatile("ld 8,%0" : : "Q" (fprs[8]));
-       asm volatile("ld 9,%0" : : "Q" (fprs[9]));
-       asm volatile("ld 10,%0" : : "Q" (fprs[10]));
-       asm volatile("ld 11,%0" : : "Q" (fprs[11]));
-       asm volatile("ld 12,%0" : : "Q" (fprs[12]));
-       asm volatile("ld 13,%0" : : "Q" (fprs[13]));
-       asm volatile("ld 14,%0" : : "Q" (fprs[14]));
-       asm volatile("ld 15,%0" : : "Q" (fprs[15]));
-}
-
-static inline void save_vx_regs(__vector128 *vxrs)
-{
-       typedef struct { __vector128 _[__NUM_VXRS]; } addrtype;
-
-       asm volatile(
-               "       la      1,%0\n"
-               "       .word   0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
-               "       .word   0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */
-               : "=Q" (*(addrtype *) vxrs) : : "1");
-}
-
 static inline void save_vx_regs_safe(__vector128 *vxrs)
 {
        unsigned long cr0, flags;
@@ -126,20 +61,13 @@ static inline void save_vx_regs_safe(__vector128 *vxrs)
        __ctl_store(cr0, 0, 0);
        __ctl_set_bit(0, 17);
        __ctl_set_bit(0, 18);
-       save_vx_regs(vxrs);
-       __ctl_load(cr0, 0, 0);
-       arch_local_irq_restore(flags);
-}
-
-static inline void restore_vx_regs(__vector128 *vxrs)
-{
-       typedef struct { __vector128 _[__NUM_VXRS]; } addrtype;
-
        asm volatile(
                "       la      1,%0\n"
-               "       .word   0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
-               "       .word   0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
-               : : "Q" (*(addrtype *) vxrs) : "1");
+               "       .word   0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
+               "       .word   0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */
+               : "=Q" (*(struct vx_array *) vxrs) : : "1");
+       __ctl_load(cr0, 0, 0);
+       arch_local_irq_restore(flags);
 }
 
 static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs)
@@ -177,24 +105,6 @@ static inline void fpregs_load(_s390_fp_regs *fpregs, struct fpu *fpu)
                       sizeof(fpregs->fprs));
 }
 
-static inline void save_fpu_regs(struct fpu *fpu)
-{
-       save_fp_ctl(&fpu->fpc);
-       if (is_vx_fpu(fpu))
-               save_vx_regs(fpu->vxrs);
-       else
-               save_fp_regs(fpu->fprs);
-}
-
-static inline void restore_fpu_regs(struct fpu *fpu)
-{
-       restore_fp_ctl(&fpu->fpc);
-       if (is_vx_fpu(fpu))
-               restore_vx_regs(fpu->vxrs);
-       else
-               restore_fp_regs(fpu->fprs);
-}
-
 #endif
 
 #endif /* _ASM_S390_FPU_INTERNAL_H */
index 3024acbe1f9d63c935b8c74780227f07e429037c..c4f4c52aaa23533497c8891924e685bee7e20e4b 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/kvm.h>
 #include <asm/debug.h>
 #include <asm/cpu.h>
+#include <asm/fpu-internal.h>
 #include <asm/isc.h>
 
 #define KVM_MAX_VCPUS 64
@@ -498,10 +499,9 @@ struct kvm_guestdbg_info_arch {
 
 struct kvm_vcpu_arch {
        struct kvm_s390_sie_block *sie_block;
-       s390_fp_regs      host_fpregs;
        unsigned int      host_acrs[NUM_ACRS];
-       s390_fp_regs      guest_fpregs;
-       struct kvm_s390_vregs   *host_vregs;
+       struct fpu        host_fpregs;
+       struct fpu        guest_fpregs;
        struct kvm_s390_local_interrupt local_int;
        struct hrtimer    ckc_timer;
        struct kvm_s390_pgm_info pgm;
index 19f51db7c5e6753f7678b5c8e2d987c462186986..c417015c5304f9c30e78a3ba4e047ffe7f93e994 100644 (file)
 #define CIF_MCCK_PENDING       0       /* machine check handling is pending */
 #define CIF_ASCE               1       /* user asce needs fixup / uaccess */
 #define CIF_NOHZ_DELAY         2       /* delay HZ disable for a tick */
+#define CIF_FPU                        3       /* restore vector registers */
 
 #define _CIF_MCCK_PENDING      (1<<CIF_MCCK_PENDING)
 #define _CIF_ASCE              (1<<CIF_ASCE)
 #define _CIF_NOHZ_DELAY                (1<<CIF_NOHZ_DELAY)
+#define _CIF_FPU               (1<<CIF_FPU)
 
 #ifndef __ASSEMBLY__
 
index caf4f23462b04283508d464270be1174f9ae2650..0a4a3150b7d76401d977eccbc56545a2067028d4 100644 (file)
@@ -36,7 +36,7 @@ static inline void restore_access_regs(unsigned int *acrs)
        }                                                               \
        if (next->mm) {                                                 \
                update_cr_regs(next);                                   \
-               restore_fpu_regs(&next->thread.fpu);                    \
+               set_cpu_flag(CIF_FPU);                                  \
                restore_access_regs(&next->thread.acrs[0]);             \
                restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb);  \
        }                                                               \
index a2da259d932741614c4f6b78642c491b3829b223..6bc42c08be09d13ea188402df4227907ec822ed1 100644 (file)
@@ -28,11 +28,16 @@ int main(void)
        DEFINE(__TASK_pid, offsetof(struct task_struct, pid));
        BLANK();
        DEFINE(__THREAD_ksp, offsetof(struct thread_struct, ksp));
+       DEFINE(__THREAD_fpu, offsetof(struct task_struct, thread.fpu));
        DEFINE(__THREAD_per_cause, offsetof(struct thread_struct, per_event.cause));
        DEFINE(__THREAD_per_address, offsetof(struct thread_struct, per_event.address));
        DEFINE(__THREAD_per_paid, offsetof(struct thread_struct, per_event.paid));
        DEFINE(__THREAD_trap_tdb, offsetof(struct thread_struct, trap_tdb));
        BLANK();
+       DEFINE(__FPU_fpc, offsetof(struct fpu, fpc));
+       DEFINE(__FPU_flags, offsetof(struct fpu, flags));
+       DEFINE(__FPU_regs, offsetof(struct fpu, regs));
+       BLANK();
        DEFINE(__TI_task, offsetof(struct thread_info, task));
        DEFINE(__TI_flags, offsetof(struct thread_info, flags));
        DEFINE(__TI_sysc_table, offsetof(struct thread_info, sys_call_table));
index 452995137a699bed60ad8bf48d7ccc0c9bbc11a7..0b46fd4aa31e4ae9e152be6f9a12c842808aeae6 100644 (file)
@@ -161,7 +161,6 @@ static void store_sigregs(void)
 static void load_sigregs(void)
 {
        restore_access_regs(current->thread.acrs);
-       restore_fpu_regs(&current->thread.fpu);
 }
 
 static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs)
@@ -287,6 +286,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn)
        if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE32))
                goto badframe;
        set_current_blocked(&set);
+       save_fpu_regs(&current->thread.fpu);
        if (restore_sigregs32(regs, &frame->sregs))
                goto badframe;
        if (restore_sigregs_ext32(regs, &frame->sregs_ext))
@@ -309,6 +309,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
        set_current_blocked(&set);
        if (compat_restore_altstack(&frame->uc.uc_stack))
                goto badframe;
+       save_fpu_regs(&current->thread.fpu);
        if (restore_sigregs32(regs, &frame->uc.uc_mcontext))
                goto badframe;
        if (restore_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext))
index 84062e7a77dad75c50fa60822255ee2b20b34181..05ea485156ee364b65211df6d712a4d34131c092 100644 (file)
@@ -20,6 +20,8 @@
 #include <asm/page.h>
 #include <asm/sigp.h>
 #include <asm/irq.h>
+#include <asm/fpu-internal.h>
+#include <asm/vx-insn.h>
 
 __PT_R0      = __PT_GPRS
 __PT_R1      = __PT_GPRS + 8
@@ -46,10 +48,10 @@ _TIF_WORK   = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
                   _TIF_UPROBE)
 _TIF_TRACE     = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
                   _TIF_SYSCALL_TRACEPOINT)
-_CIF_WORK      = (_CIF_MCCK_PENDING | _CIF_ASCE)
+_CIF_WORK      = (_CIF_MCCK_PENDING | _CIF_ASCE | _CIF_FPU)
 _PIF_WORK      = (_PIF_PER_TRAP)
 
-#define BASED(name) name-system_call(%r13)
+#define BASED(name) name-cleanup_critical(%r13)
 
        .macro  TRACE_IRQS_ON
 #ifdef CONFIG_TRACE_IRQFLAGS
@@ -280,6 +282,8 @@ ENTRY(system_call)
        jo      .Lsysc_sigpending
        tm      __TI_flags+7(%r12),_TIF_NOTIFY_RESUME
        jo      .Lsysc_notify_resume
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       jo      .Lsysc_vxrs
        tm      __LC_CPU_FLAGS+7,_CIF_ASCE
        jo      .Lsysc_uaccess
        j       .Lsysc_return           # beware of critical section cleanup
@@ -306,6 +310,13 @@ ENTRY(system_call)
        lctlg   %c1,%c1,__LC_USER_ASCE          # load primary asce
        j       .Lsysc_return
 
+#
+# CIF_FPU is set, restore floating-point controls and floating-point registers.
+#
+.Lsysc_vxrs:
+       larl    %r14,.Lsysc_return
+       jg      load_fpu_regs
+
 #
 # _TIF_SIGPENDING is set, call do_signal
 #
@@ -405,7 +416,7 @@ ENTRY(pgm_check_handler)
        stmg    %r8,%r15,__LC_SAVE_AREA_SYNC
        lg      %r10,__LC_LAST_BREAK
        lg      %r12,__LC_THREAD_INFO
-       larl    %r13,system_call
+       larl    %r13,cleanup_critical
        lmg     %r8,%r9,__LC_PGM_OLD_PSW
        HANDLE_SIE_INTERCEPT %r14,1
        tmhh    %r8,0x0001              # test problem state bit
@@ -483,7 +494,7 @@ ENTRY(io_int_handler)
        stmg    %r8,%r15,__LC_SAVE_AREA_ASYNC
        lg      %r10,__LC_LAST_BREAK
        lg      %r12,__LC_THREAD_INFO
-       larl    %r13,system_call
+       larl    %r13,cleanup_critical
        lmg     %r8,%r9,__LC_IO_OLD_PSW
        HANDLE_SIE_INTERCEPT %r14,2
        SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT
@@ -587,6 +598,8 @@ ENTRY(io_int_handler)
        jo      .Lio_sigpending
        tm      __TI_flags+7(%r12),_TIF_NOTIFY_RESUME
        jo      .Lio_notify_resume
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       jo      .Lio_vxrs
        tm      __LC_CPU_FLAGS+7,_CIF_ASCE
        jo      .Lio_uaccess
        j       .Lio_return             # beware of critical section cleanup
@@ -608,6 +621,13 @@ ENTRY(io_int_handler)
        lctlg   %c1,%c1,__LC_USER_ASCE          # load primary asce
        j       .Lio_return
 
+#
+# CIF_FPU is set, restore floating-point controls and floating-point registers.
+#
+.Lio_vxrs:
+       larl    %r14,.Lio_return
+       jg      load_fpu_regs
+
 #
 # _TIF_NEED_RESCHED is set, call schedule
 #
@@ -652,7 +672,7 @@ ENTRY(ext_int_handler)
        stmg    %r8,%r15,__LC_SAVE_AREA_ASYNC
        lg      %r10,__LC_LAST_BREAK
        lg      %r12,__LC_THREAD_INFO
-       larl    %r13,system_call
+       larl    %r13,cleanup_critical
        lmg     %r8,%r9,__LC_EXT_OLD_PSW
        HANDLE_SIE_INTERCEPT %r14,3
        SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT
@@ -690,6 +710,121 @@ ENTRY(psw_idle)
        br      %r14
 .Lpsw_idle_end:
 
+/* Store floating-point controls and either the floating-point or the vector
+ * extension registers.  A critical section cleanup assures that the registers
+ * are stored even if interrupted for some other work. The register %r2
+ * designates a struct fpu to store register contents. If the specified
+ * structure does not contain a register save area, the register store is
+ * omitted (see also comments in arch_dup_task_struct()).
+ *
+ * The CIF_FPU flag is set in any case.  The CIF_FPU triggers a lazy restore
+ * of the register contents at system call or io return.
+ */
+ENTRY(save_fpu_regs)
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       bor     %r14
+       stfpc   __FPU_fpc(%r2)
+.Lsave_fpu_regs_fpc_end:
+       lg      %r3,__FPU_regs(%r2)
+       ltgr    %r3,%r3
+       jz      .Lsave_fpu_regs_done      # no save area -> set CIF_FPU
+       tm      __FPU_flags+3(%r2),FPU_USE_VX
+       jz      .Lsave_fpu_regs_fp        # no -> store FP regs
+.Lsave_fpu_regs_vx_low:
+       VSTM    %v0,%v15,0,%r3            # vstm 0,15,0(3)
+.Lsave_fpu_regs_vx_high:
+       VSTM    %v16,%v31,256,%r3         # vstm 16,31,256(3)
+       j       .Lsave_fpu_regs_done      # -> set CIF_FPU flag
+.Lsave_fpu_regs_fp:
+       std     0,0(%r3)
+       std     1,8(%r3)
+       std     2,16(%r3)
+       std     3,24(%r3)
+       std     4,32(%r3)
+       std     5,40(%r3)
+       std     6,48(%r3)
+       std     7,56(%r3)
+       std     8,64(%r3)
+       std     9,72(%r3)
+       std     10,80(%r3)
+       std     11,88(%r3)
+       std     12,96(%r3)
+       std     13,104(%r3)
+       std     14,112(%r3)
+       std     15,120(%r3)
+.Lsave_fpu_regs_done:
+       oi      __LC_CPU_FLAGS+7,_CIF_FPU
+       br      %r14
+.Lsave_fpu_regs_end:
+
+/* Load floating-point controls and floating-point or vector extension
+ * registers.  A critical section cleanup assures that the register contents
+ * are loaded even if interrupted for some other work. Depending on the saved
+ * FP/VX state, the vector-enablement control, CR0.46, is either set or cleared.
+ *
+ * There are special calling conventions to fit into sysc and io return work:
+ *     %r12:   __LC_THREAD_INFO
+ *     %r15:   <kernel stack>
+ * The function requires:
+ *     %r4 and __SF_EMPTY+32(%r15)
+ */
+load_fpu_regs:
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       bnor    %r14
+       lg      %r4,__TI_task(%r12)
+       la      %r4,__THREAD_fpu(%r4)
+       lfpc    __FPU_fpc(%r4)
+       stctg   %c0,%c0,__SF_EMPTY+32(%r15)     # store CR0
+       tm      __FPU_flags+3(%r4),FPU_USE_VX   # VX-enabled task ?
+       lg      %r4,__FPU_regs(%r4)             # %r4 <- reg save area
+       jz      .Lload_fpu_regs_fp_ctl          # -> no VX, load FP regs
+.Lload_fpu_regs_vx_ctl:
+       tm      __SF_EMPTY+32+5(%r15),2         # test VX control
+       jo      .Lload_fpu_regs_vx
+       oi      __SF_EMPTY+32+5(%r15),2         # set VX control
+       lctlg   %c0,%c0,__SF_EMPTY+32(%r15)
+.Lload_fpu_regs_vx:
+       VLM     %v0,%v15,0,%r4
+.Lload_fpu_regs_vx_high:
+       VLM     %v16,%v31,256,%r4
+       j       .Lload_fpu_regs_done
+.Lload_fpu_regs_fp_ctl:
+       tm      __SF_EMPTY+32+5(%r15),2         # test VX control
+       jz      .Lload_fpu_regs_fp
+       ni      __SF_EMPTY+32+5(%r15),253       # clear VX control
+       lctlg   %c0,%c0,__SF_EMPTY+32(%r15)
+.Lload_fpu_regs_fp:
+       ld      0,0(%r4)
+       ld      1,8(%r4)
+       ld      2,16(%r4)
+       ld      3,24(%r4)
+       ld      4,32(%r4)
+       ld      5,40(%r4)
+       ld      6,48(%r4)
+       ld      7,56(%r4)
+       ld      8,64(%r4)
+       ld      9,72(%r4)
+       ld      10,80(%r4)
+       ld      11,88(%r4)
+       ld      12,96(%r4)
+       ld      13,104(%r4)
+       ld      14,112(%r4)
+       ld      15,120(%r4)
+.Lload_fpu_regs_done:
+       ni      __LC_CPU_FLAGS+7,255-_CIF_FPU
+       br      %r14
+.Lload_fpu_regs_end:
+
+/* Test and set the vector enablement control in CR0.46 */
+ENTRY(__ctl_set_vx)
+       stctg   %c0,%c0,__SF_EMPTY(%r15)
+       tm      __SF_EMPTY+5(%r15),2
+       bor     %r14
+       oi      __SF_EMPTY+5(%r15),2
+       lctlg   %c0,%c0,__SF_EMPTY(%r15)
+       br      %r14
+.L__ctl_set_vx_end:
+
 .L__critical_end:
 
 /*
@@ -702,7 +837,7 @@ ENTRY(mcck_int_handler)
        lmg     %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs
        lg      %r10,__LC_LAST_BREAK
        lg      %r12,__LC_THREAD_INFO
-       larl    %r13,system_call
+       larl    %r13,cleanup_critical
        lmg     %r8,%r9,__LC_MCK_OLD_PSW
        HANDLE_SIE_INTERCEPT %r14,4
        tm      __LC_MCCK_CODE,0x80     # system damage?
@@ -831,6 +966,12 @@ stack_overflow:
        .quad   .Lio_done
        .quad   psw_idle
        .quad   .Lpsw_idle_end
+       .quad   save_fpu_regs
+       .quad   .Lsave_fpu_regs_end
+       .quad   load_fpu_regs
+       .quad   .Lload_fpu_regs_end
+       .quad   __ctl_set_vx
+       .quad   .L__ctl_set_vx_end
 
 cleanup_critical:
        clg     %r9,BASED(.Lcleanup_table)      # system_call
@@ -853,6 +994,18 @@ cleanup_critical:
        jl      0f
        clg     %r9,BASED(.Lcleanup_table+72)   # .Lpsw_idle_end
        jl      .Lcleanup_idle
+       clg     %r9,BASED(.Lcleanup_table+80)   # save_fpu_regs
+       jl      0f
+       clg     %r9,BASED(.Lcleanup_table+88)   # .Lsave_fpu_regs_end
+       jl      .Lcleanup_save_fpu_regs
+       clg     %r9,BASED(.Lcleanup_table+96)   # load_fpu_regs
+       jl      0f
+       clg     %r9,BASED(.Lcleanup_table+104)  # .Lload_fpu_regs_end
+       jl      .Lcleanup_load_fpu_regs
+       clg     %r9,BASED(.Lcleanup_table+112)  # __ctl_set_vx
+       jl      0f
+       clg     %r9,BASED(.Lcleanup_table+120)  # .L__ctl_set_vx_end
+       jl      .Lcleanup___ctl_set_vx
 0:     br      %r14
 
 
@@ -981,6 +1134,145 @@ cleanup_critical:
 .Lcleanup_idle_insn:
        .quad   .Lpsw_idle_lpsw
 
+.Lcleanup_save_fpu_regs:
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       bor     %r14
+       clg     %r9,BASED(.Lcleanup_save_fpu_regs_done)
+       jhe     5f
+       clg     %r9,BASED(.Lcleanup_save_fpu_regs_fp)
+       jhe     4f
+       clg     %r9,BASED(.Lcleanup_save_fpu_regs_vx_high)
+       jhe     3f
+       clg     %r9,BASED(.Lcleanup_save_fpu_regs_vx_low)
+       jhe     2f
+       clg     %r9,BASED(.Lcleanup_save_fpu_fpc_end)
+       jhe     1f
+0:     # Store floating-point controls
+       stfpc   __FPU_fpc(%r2)
+1:     # Load register save area and check if VX is active
+       lg      %r3,__FPU_regs(%r2)
+       ltgr    %r3,%r3
+       jz      5f                        # no save area -> set CIF_FPU
+       tm      __FPU_flags+3(%r2),FPU_USE_VX
+       jz      4f                        # no VX -> store FP regs
+2:     # Store vector registers (V0-V15)
+       VSTM    %v0,%v15,0,%r3            # vstm 0,15,0(3)
+3:     # Store vector registers (V16-V31)
+       VSTM    %v16,%v31,256,%r3         # vstm 16,31,256(3)
+       j       5f                        # -> done, set CIF_FPU flag
+4:     # Store floating-point registers
+       std     0,0(%r3)
+       std     1,8(%r3)
+       std     2,16(%r3)
+       std     3,24(%r3)
+       std     4,32(%r3)
+       std     5,40(%r3)
+       std     6,48(%r3)
+       std     7,56(%r3)
+       std     8,64(%r3)
+       std     9,72(%r3)
+       std     10,80(%r3)
+       std     11,88(%r3)
+       std     12,96(%r3)
+       std     13,104(%r3)
+       std     14,112(%r3)
+       std     15,120(%r3)
+5:     # Set CIF_FPU flag
+       oi      __LC_CPU_FLAGS+7,_CIF_FPU
+       lg      %r9,48(%r11)            # return from save_fpu_regs
+       br      %r14
+.Lcleanup_save_fpu_fpc_end:
+       .quad   .Lsave_fpu_regs_fpc_end
+.Lcleanup_save_fpu_regs_vx_low:
+       .quad   .Lsave_fpu_regs_vx_low
+.Lcleanup_save_fpu_regs_vx_high:
+       .quad   .Lsave_fpu_regs_vx_high
+.Lcleanup_save_fpu_regs_fp:
+       .quad   .Lsave_fpu_regs_fp
+.Lcleanup_save_fpu_regs_done:
+       .quad   .Lsave_fpu_regs_done
+
+.Lcleanup_load_fpu_regs:
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       bnor    %r14
+       clg     %r9,BASED(.Lcleanup_load_fpu_regs_done)
+       jhe     1f
+       clg     %r9,BASED(.Lcleanup_load_fpu_regs_fp)
+       jhe     2f
+       clg     %r9,BASED(.Lcleanup_load_fpu_regs_fp_ctl)
+       jhe     3f
+       clg     %r9,BASED(.Lcleanup_load_fpu_regs_vx_high)
+       jhe     4f
+       clg     %r9,BASED(.Lcleanup_load_fpu_regs_vx)
+       jhe     5f
+       clg     %r9,BASED(.Lcleanup_load_fpu_regs_vx_ctl)
+       jhe     6f
+       lg      %r4,__TI_task(%r12)
+       la      %r4,__THREAD_fpu(%r4)
+       lfpc    __FPU_fpc(%r4)
+       tm      __FPU_flags+3(%r4),FPU_USE_VX   # VX-enabled task ?
+       lg      %r4,__FPU_regs(%r4)             # %r4 <- reg save area
+       jz      3f                              # -> no VX, load FP regs
+6:     # Set VX-enablement control
+       stctg   %c0,%c0,__SF_EMPTY+32(%r15)     # store CR0
+       tm      __SF_EMPTY+32+5(%r15),2         # test VX control
+       jo      5f
+       oi      __SF_EMPTY+32+5(%r15),2         # set VX control
+       lctlg   %c0,%c0,__SF_EMPTY+32(%r15)
+5:     # Load V0 ..V15 registers
+       VLM     %v0,%v15,0,%r4
+4:     # Load V16..V31 registers
+       VLM     %v16,%v31,256,%r4
+       j       1f
+3:     # Clear VX-enablement control for FP
+       stctg   %c0,%c0,__SF_EMPTY+32(%r15)     # store CR0
+       tm      __SF_EMPTY+32+5(%r15),2         # test VX control
+       jz      2f
+       ni      __SF_EMPTY+32+5(%r15),253       # clear VX control
+       lctlg   %c0,%c0,__SF_EMPTY+32(%r15)
+2:     # Load floating-point registers
+       ld      0,0(%r4)
+       ld      1,8(%r4)
+       ld      2,16(%r4)
+       ld      3,24(%r4)
+       ld      4,32(%r4)
+       ld      5,40(%r4)
+       ld      6,48(%r4)
+       ld      7,56(%r4)
+       ld      8,64(%r4)
+       ld      9,72(%r4)
+       ld      10,80(%r4)
+       ld      11,88(%r4)
+       ld      12,96(%r4)
+       ld      13,104(%r4)
+       ld      14,112(%r4)
+       ld      15,120(%r4)
+1:     # Clear CIF_FPU bit
+       ni      __LC_CPU_FLAGS+7,255-_CIF_FPU
+       lg      %r9,48(%r11)            # return from load_fpu_regs
+       br      %r14
+.Lcleanup_load_fpu_regs_vx_ctl:
+       .quad   .Lload_fpu_regs_vx_ctl
+.Lcleanup_load_fpu_regs_vx:
+       .quad   .Lload_fpu_regs_vx
+.Lcleanup_load_fpu_regs_vx_high:
+       .quad   .Lload_fpu_regs_vx_high
+.Lcleanup_load_fpu_regs_fp_ctl:
+       .quad   .Lload_fpu_regs_fp_ctl
+.Lcleanup_load_fpu_regs_fp:
+       .quad   .Lload_fpu_regs_fp
+.Lcleanup_load_fpu_regs_done:
+       .quad   .Lload_fpu_regs_done
+
+.Lcleanup___ctl_set_vx:
+       stctg   %c0,%c0,__SF_EMPTY(%r15)
+       tm      __SF_EMPTY+5(%r15),2
+       bor     %r14
+       oi      __SF_EMPTY+5(%r15),2
+       lctlg   %c0,%c0,__SF_EMPTY(%r15)
+       lg      %r9,48(%r11)            # return from __ctl_set_vx
+       br      %r14
+
 /*
  * Integer constants
  */
@@ -1002,6 +1294,11 @@ ENTRY(sie64a)
        stg     %r2,__SF_EMPTY(%r15)            # save control block pointer
        stg     %r3,__SF_EMPTY+8(%r15)          # save guest register save area
        xc      __SF_EMPTY+16(16,%r15),__SF_EMPTY+16(%r15) # host id & reason
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU       # load guest fp/vx registers ?
+       jno     .Lsie_load_guest_gprs
+       lg      %r12,__LC_THREAD_INFO           # load fp/vx regs save area
+       brasl   %r14,load_fpu_regs              # load guest fp/vx regs
+.Lsie_load_guest_gprs:
        lmg     %r0,%r13,0(%r3)                 # load guest gprs 0-13
        lg      %r14,__LC_GMAP                  # get gmap pointer
        ltgr    %r14,%r14
@@ -1012,6 +1309,8 @@ ENTRY(sie64a)
        oi      __SIE_PROG0C+3(%r14),1          # we are going into SIE now
        tm      __SIE_PROG20+3(%r14),3          # last exit...
        jnz     .Lsie_done
+       tm      __LC_CPU_FLAGS+7,_CIF_FPU
+       jo      .Lsie_done                      # exit if fp/vx regs changed
        LPP     __SF_EMPTY(%r15)                # set guest id
        sie     0(%r14)
 .Lsie_done:
index e66141c6696aa617c8d55ee2d1dce598859b934f..cbdd94c8ba189378ff3c8f2dcdc308cd093ef342 100644 (file)
@@ -165,8 +165,12 @@ static int notrace s390_revalidate_registers(struct mci *mci)
                cr0.val = S390_lowcore.cregs_save_area[0];
                cr0.afp = cr0.vx = 1;
                __ctl_load(cr0.val, 0, 0);
-               restore_vx_regs((__vector128 *)
-                               &S390_lowcore.vector_save_area);
+               asm volatile(
+                       "       la      1,%0\n"
+                       "       .word   0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
+                       "       .word   0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
+                       : : "Q" (*(struct vx_array *)
+                                &S390_lowcore.vector_save_area) : "1");
                __ctl_load(S390_lowcore.cregs_save_area[0], 0, 0);
        }
        /* Revalidate access registers */
index 56949c9cda97859643246833ef47d2d0b9d4b7b8..9cf0063f920e68a8ed6974e0e1487bc630876349 100644 (file)
@@ -90,16 +90,28 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
        *dst = *src;
 
        /* Set up a new floating-point register save area */
+       dst->thread.fpu.fpc = 0;
+       dst->thread.fpu.flags = 0;      /* Always start with VX disabled */
        dst->thread.fpu.fprs = kzalloc(sizeof(freg_t) * __NUM_FPRS,
                                       GFP_KERNEL|__GFP_REPEAT);
        if (!dst->thread.fpu.fprs)
                return -ENOMEM;
 
-       /* Save the fpu registers to new thread structure. */
-       save_fp_ctl(&dst->thread.fpu.fpc);
-       save_fp_regs(dst->thread.fpu.fprs);
-       dst->thread.fpu.flags = 0;     /* Always start with VX disabled */
-
+       /*
+        * Save the floating-point or vector register state of the current
+        * task.  The state is not saved for early kernel threads, for example,
+        * the init_task, which do not have an allocated save area.
+        * The CIF_FPU flag is set in any case to lazy clear or restore a saved
+        * state when switching to a different task or returning to user space.
+        */
+       save_fpu_regs(&current->thread.fpu);
+       dst->thread.fpu.fpc = current->thread.fpu.fpc;
+       if (is_vx_task(current))
+               convert_vx_to_fp(dst->thread.fpu.fprs,
+                                current->thread.fpu.vxrs);
+       else
+               memcpy(dst->thread.fpu.fprs, current->thread.fpu.fprs,
+                      sizeof(freg_t) * __NUM_FPRS);
        return 0;
 }
 
@@ -184,8 +196,15 @@ asmlinkage void execve_tail(void)
  */
 int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs)
 {
-       save_fp_ctl(&fpregs->fpc);
-       save_fp_regs(fpregs->fprs);
+       save_fpu_regs(&current->thread.fpu);
+       fpregs->fpc = current->thread.fpu.fpc;
+       fpregs->pad = 0;
+       if (is_vx_task(current))
+               convert_vx_to_fp((freg_t *)&fpregs->fprs,
+                                current->thread.fpu.vxrs);
+       else
+               memcpy(&fpregs->fprs, current->thread.fpu.fprs,
+                      sizeof(fpregs->fprs));
        return 1;
 }
 EXPORT_SYMBOL(dump_fpu);
index 52e2e1dd919d29f1ba13c6222a02022127057f4f..8c525880a3ff7ddb1e406bd7b257503e81ff8f81 100644 (file)
@@ -45,39 +45,27 @@ void update_cr_regs(struct task_struct *task)
        struct per_regs old, new;
 
        /* Take care of the enable/disable of transactional execution. */
-       if (MACHINE_HAS_TE || MACHINE_HAS_VX) {
+       if (MACHINE_HAS_TE) {
                unsigned long cr, cr_new;
 
                __ctl_store(cr, 0, 0);
-               cr_new = cr;
-               if (MACHINE_HAS_TE) {
-                       /* Set or clear transaction execution TXC bit 8. */
-                       cr_new |= (1UL << 55);
-                       if (task->thread.per_flags & PER_FLAG_NO_TE)
-                               cr_new &= ~(1UL << 55);
-               }
-               if (MACHINE_HAS_VX) {
-                       /* Enable/disable of vector extension */
-                       cr_new &= ~(1UL << 17);
-                       if (task->thread.fpu.vxrs)
-                               cr_new |= (1UL << 17);
-               }
+               /* Set or clear transaction execution TXC bit 8. */
+               cr_new = cr | (1UL << 55);
+               if (task->thread.per_flags & PER_FLAG_NO_TE)
+                       cr_new &= ~(1UL << 55);
                if (cr_new != cr)
                        __ctl_load(cr_new, 0, 0);
-               if (MACHINE_HAS_TE) {
-                       /* Set/clear transaction execution TDC bits 62/63. */
-                       __ctl_store(cr, 2, 2);
-                       cr_new = cr & ~3UL;
-                       if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) {
-                               if (task->thread.per_flags &
-                                   PER_FLAG_TE_ABORT_RAND_TEND)
-                                       cr_new |= 1UL;
-                               else
-                                       cr_new |= 2UL;
-                       }
-                       if (cr_new != cr)
-                               __ctl_load(cr_new, 2, 2);
+               /* Set or clear transaction execution TDC bits 62 and 63. */
+               __ctl_store(cr, 2, 2);
+               cr_new = cr & ~3UL;
+               if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) {
+                       if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND_TEND)
+                               cr_new |= 1UL;
+                       else
+                               cr_new |= 2UL;
                }
+               if (cr_new != cr)
+                       __ctl_load(cr_new, 2, 2);
        }
        /* Copy user specified PER registers */
        new.control = thread->per_user.control;
@@ -998,9 +986,6 @@ static int s390_fpregs_set(struct task_struct *target,
        else
                memcpy(target->thread.fpu.fprs, &fprs, sizeof(fprs));
 
-       if (target == current)
-               restore_fpu_regs(&target->thread.fpu);
-
        return rc;
 }
 
@@ -1090,12 +1075,9 @@ static int s390_vxrs_low_set(struct task_struct *target,
                save_fpu_regs(&target->thread.fpu);
 
        rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
-       if (rc == 0) {
+       if (rc == 0)
                for (i = 0; i < __NUM_VXRS_LOW; i++)
                        *((__u64 *)(target->thread.fpu.vxrs + i) + 1) = vxrs[i];
-               if (target == current)
-                       restore_fpu_regs(&target->thread.fpu);
-       }
 
        return rc;
 }
@@ -1137,9 +1119,6 @@ static int s390_vxrs_high_set(struct task_struct *target,
 
        rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
                                target->thread.fpu.vxrs + __NUM_VXRS_LOW, 0, -1);
-       if (rc == 0 && target == current)
-               restore_vx_regs(target->thread.fpu.vxrs);
-
        return rc;
 }
 
index 9f60467938d177ca6e6ffc0a45fa563b518b0693..5090d3dad10b56cef69b754e6ada95fc4e067a53 100644 (file)
@@ -1,5 +1,6 @@
 #include <linux/module.h>
 #include <linux/kvm_host.h>
+#include <asm/fpu-internal.h>
 #include <asm/ftrace.h>
 
 #ifdef CONFIG_FUNCTION_TRACER
@@ -8,6 +9,8 @@ EXPORT_SYMBOL(_mcount);
 #if IS_ENABLED(CONFIG_KVM)
 EXPORT_SYMBOL(sie64a);
 EXPORT_SYMBOL(sie_exit);
+EXPORT_SYMBOL(save_fpu_regs);
+EXPORT_SYMBOL(__ctl_set_vx);
 #endif
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memset);
index 49c259cd5a33b9d8ac07555e2b6741ee9c731dcd..2f4c7e2638c916bef2ab0e20d016133ded183da1 100644 (file)
@@ -112,7 +112,6 @@ static void store_sigregs(void)
 static void load_sigregs(void)
 {
        restore_access_regs(current->thread.acrs);
-       restore_fpu_regs(&current->thread.fpu);
 }
 
 /* Returns non-zero on fault. */
@@ -223,6 +222,7 @@ SYSCALL_DEFINE0(sigreturn)
        if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE))
                goto badframe;
        set_current_blocked(&set);
+       save_fpu_regs(&current->thread.fpu);
        if (restore_sigregs(regs, &frame->sregs))
                goto badframe;
        if (restore_sigregs_ext(regs, &frame->sregs_ext))
@@ -246,6 +246,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
        set_current_blocked(&set);
        if (restore_altstack(&frame->uc.uc_stack))
                goto badframe;
+       save_fpu_regs(&current->thread.fpu);
        if (restore_sigregs(regs, &frame->uc.uc_mcontext))
                goto badframe;
        if (restore_sigregs_ext(regs, &frame->uc.uc_mcontext_ext))
index 7b09224c05a3ac5c062eb6be9cf2d036b7ab4a9f..76f76932ccb9f6d35901ff69380af3a4f12b7668 100644 (file)
@@ -151,7 +151,7 @@ DO_ERROR_INFO(special_op_exception, SIGILL, ILL_ILLOPN,
 DO_ERROR_INFO(transaction_exception, SIGILL, ILL_ILLOPN,
              "transaction constraint exception")
 
-static inline void do_fp_trap(struct pt_regs *regs, int fpc)
+static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc)
 {
        int si_code = 0;
        /* FPC[2] is Data Exception Code */
@@ -236,17 +236,13 @@ int alloc_vector_registers(struct task_struct *tsk)
                return -ENOMEM;
        preempt_disable();
        if (tsk == current)
-               save_fp_regs(tsk->thread.fpu.fprs);
+               save_fpu_regs(&tsk->thread.fpu);
        /* Copy the 16 floating point registers */
        convert_fp_to_vx(vxrs, tsk->thread.fpu.fprs);
        fprs = tsk->thread.fpu.fprs;
        tsk->thread.fpu.vxrs = vxrs;
        tsk->thread.fpu.flags |= FPU_USE_VX;
        kfree(fprs);
-       if (tsk == current) {
-               __ctl_set_bit(0, 17);
-               restore_vx_regs(vxrs);
-       }
        preempt_enable();
        return 0;
 }
@@ -261,7 +257,7 @@ void vector_exception(struct pt_regs *regs)
        }
 
        /* get vector interrupt code from fpc */
-       asm volatile("stfpc %0" : "=Q" (current->thread.fpu.fpc));
+       save_fpu_regs(&current->thread.fpu);
        vic = (current->thread.fpu.fpc & 0xf00) >> 8;
        switch (vic) {
        case 1: /* invalid vector operation */
@@ -299,7 +295,7 @@ void data_exception(struct pt_regs *regs)
 
        location = get_trap_ip(regs);
 
-       asm volatile("stfpc %0" : "=Q" (current->thread.fpu.fpc));
+       save_fpu_regs(&current->thread.fpu);
        /* Check for vector register enablement */
        if (MACHINE_HAS_VX && !is_vx_task(current) &&
            (current->thread.fpu.fpc & FPC_DXC_MASK) == 0xfe00) {
index fc7bc7118b23cb1824860973eeb68b5475f216c5..c0cceaf4a92ead14ccf5f69c9e34d777895b68b3 100644 (file)
@@ -1198,27 +1198,54 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+/*
+ * Record the current task's FPU descriptor (fpc, flags and regs) in
+ * @dst so that it can be reinstated later.  Only these three fields
+ * are copied; no FP/VX registers are accessed here.
+ */
+static inline void save_fpu_to(struct fpu *dst)
+{
+       dst->regs = current->thread.fpu.regs;
+       dst->flags = current->thread.fpu.flags;
+       dst->fpc = current->thread.fpu.fpc;
+}
+
+/*
+ * Install @from (fpc, flags, regs) as the current task's FPU state, i.e.
+ * the state that a later lazy register restore will load.
+ */
+static inline void load_fpu_from(struct fpu *from)
+{
+       current->thread.fpu.regs = from->regs;
+       current->thread.fpu.flags = from->flags;
+       current->thread.fpu.fpc = from->fpc;
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-       __u32 fpc;
+       /* Save host register state */
+       save_fpu_regs(&current->thread.fpu);
+       save_fpu_to(&vcpu->arch.host_fpregs);
 
-       save_fp_ctl(&vcpu->arch.host_fpregs.fpc);
-       if (test_kvm_facility(vcpu->kvm, 129))
-               save_vx_regs((__vector128 *)&vcpu->arch.host_vregs->vrs);
-       else
-               save_fp_regs(vcpu->arch.host_fpregs.fprs);
-       save_access_regs(vcpu->arch.host_acrs);
        if (test_kvm_facility(vcpu->kvm, 129)) {
-               fpc = vcpu->run->s.regs.fpc;
-               restore_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs);
-       } else {
-               fpc = vcpu->arch.guest_fpregs.fpc;
-               restore_fp_regs(vcpu->arch.guest_fpregs.fprs);
-       }
-       if (test_fp_ctl(fpc))
+               current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
+               current->thread.fpu.flags = FPU_USE_VX;
+               /*
+                * Use the register save area in the SIE-control block
+                * for register restore and save in kvm_arch_vcpu_put()
+                */
+               current->thread.fpu.vxrs =
+                       (__vector128 *)&vcpu->run->s.regs.vrs;
+               /* Always enable the vector extension for KVM */
+               __ctl_set_vx();
+       } else
+               load_fpu_from(&vcpu->arch.guest_fpregs);
+
+       if (test_fp_ctl(current->thread.fpu.fpc))
                /* User space provided an invalid FPC, let's clear it */
-               fpc = 0;
-       restore_fp_ctl(&fpc);
+               current->thread.fpu.fpc = 0;
+
+       save_access_regs(vcpu->arch.host_acrs);
        restore_access_regs(vcpu->run->s.regs.acrs);
        gmap_enable(vcpu->arch.gmap);
        atomic_set_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
@@ -1228,19 +1255,22 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
        atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
        gmap_disable(vcpu->arch.gmap);
-       if (test_kvm_facility(vcpu->kvm, 129)) {
-               save_fp_ctl(&vcpu->run->s.regs.fpc);
-               save_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs);
-       } else {
-               save_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
-               save_fp_regs(vcpu->arch.guest_fpregs.fprs);
-       }
-       save_access_regs(vcpu->run->s.regs.acrs);
-       restore_fp_ctl(&vcpu->arch.host_fpregs.fpc);
+
+       save_fpu_regs(&current->thread.fpu);
+
        if (test_kvm_facility(vcpu->kvm, 129))
-               restore_vx_regs((__vector128 *)&vcpu->arch.host_vregs->vrs);
+               /*
+                * kvm_arch_vcpu_load() pointed the register save area at
+                * &vcpu->run->s.regs.vrs and, thus, the vector registers
+                * are already saved.  Only the floating-point control must be
+                * copied.
+                */
+               vcpu->run->s.regs.fpc = current->thread.fpu.fpc;
        else
-               restore_fp_regs(vcpu->arch.host_fpregs.fprs);
+               save_fpu_to(&vcpu->arch.guest_fpregs);
+       load_fpu_from(&vcpu->arch.host_fpregs);
+
+       save_access_regs(vcpu->run->s.regs.acrs);
        restore_access_regs(vcpu->arch.host_acrs);
 }
 
@@ -1383,7 +1413,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 
        vcpu->arch.sie_block = &sie_page->sie_block;
        vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
-       vcpu->arch.host_vregs = &sie_page->vregs;
 
        vcpu->arch.sie_block->icpua = id;
        if (!kvm_is_ucontrol(kvm)) {
@@ -1405,6 +1434,19 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
        vcpu->arch.local_int.wq = &vcpu->wq;
        vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
 
+       /*
+        * Allocate a save area for floating-point registers.  If the vector
+        * extension is available, register contents are saved in the SIE
+        * control block.  The allocated save area is still required in
+        * particular places, for example, in kvm_s390_vcpu_store_status().
+        */
+       vcpu->arch.guest_fpregs.fprs = kzalloc(sizeof(freg_t) * __NUM_FPRS,
+                                              GFP_KERNEL);
+       if (!vcpu->arch.guest_fpregs.fprs) {
+               rc = -ENOMEM;
+               goto out_free_sie_block;
+       }
+
        rc = kvm_vcpu_init(vcpu, kvm, id);
        if (rc)
                goto out_free_sie_block;
@@ -1627,16 +1669,16 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
        if (test_fp_ctl(fpu->fpc))
                return -EINVAL;
-       memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
+       memcpy(vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
        vcpu->arch.guest_fpregs.fpc = fpu->fpc;
-       restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
-       restore_fp_regs(vcpu->arch.guest_fpregs.fprs);
+       save_fpu_regs(&current->thread.fpu);
+       load_fpu_from(&vcpu->arch.guest_fpregs);
        return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       memcpy(&fpu->fprs, &vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs));
+       memcpy(&fpu->fprs, vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs));
        fpu->fpc = vcpu->arch.guest_fpregs.fpc;
        return 0;
 }
@@ -2199,8 +2241,21 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
         * copying in vcpu load/put. Lets update our copies before we save
         * it into the save area
         */
-       save_fp_ctl(&vcpu->arch.guest_fpregs.fpc);
-       save_fp_regs(vcpu->arch.guest_fpregs.fprs);
+       save_fpu_regs(&current->thread.fpu);
+       if (test_kvm_facility(vcpu->kvm, 129)) {
+               /*
+                * If the vector extension is available, the vector registers,
+                * which overlap with the floating-point registers, are saved
+                * in the SIE-control block.  Hence, extract the floating-point
+                * registers and the FPC value and store them in the
+                * guest_fpregs structure.
+                */
+               WARN_ON(!is_vx_task(current));    /* XXX remove later */
+               vcpu->arch.guest_fpregs.fpc = current->thread.fpu.fpc;
+               convert_vx_to_fp(vcpu->arch.guest_fpregs.fprs,
+                                current->thread.fpu.vxrs);
+       } else
+               save_fpu_to(&vcpu->arch.guest_fpregs);
        save_access_regs(vcpu->run->s.regs.acrs);
 
        return kvm_s390_store_status_unloaded(vcpu, addr);
@@ -2227,10 +2282,13 @@ int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr)
 
        /*
         * The guest VXRS are in the host VXRs due to the lazy
-        * copying in vcpu load/put. Let's update our copies before we save
-        * it into the save area.
+        * copying in vcpu load/put. We can simply call save_fpu_regs()
+        * to save the current register state because we are in the
+        * middle of a load/put cycle.
+        *
+        * Let's update our copies before we save them into the save area.
         */
-       save_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs);
+       save_fpu_regs(&current->thread.fpu);
 
        return kvm_s390_store_adtl_status_unloaded(vcpu, addr);
 }