KVM: nVMX: fix guest CR4 loading when emulating L2 to L1 exit
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 260a9193955538b4fea743045b2f964b2736b24e..be138952728409420a0a8dc853bb7a7eb3714cbe 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -366,6 +366,7 @@ struct nested_vmx {
        struct list_head vmcs02_pool;
        int vmcs02_num;
        u64 vmcs01_tsc_offset;
+       bool change_vmcs01_virtual_x2apic_mode;
        /* L2 must run next, and mustn't decide to exit to L1. */
        bool nested_run_pending;
        /*
@@ -438,6 +439,7 @@ struct vcpu_vmx {
 #endif
                int           gs_ldt_reload_needed;
                int           fs_reload_needed;
+               unsigned long vmcs_host_cr4;    /* May not match real cr4 */
        } host_state;
        struct {
                int vm86_active;
@@ -1045,10 +1047,10 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
        return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
-static inline bool is_exception(u32 intr_info)
+static inline bool is_nmi(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
-               == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+               == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
 }
 
 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
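Both the old is_exception() and the new is_nmi() decode the VM-exit interruption-information field. For reference, a minimal sketch of that encoding; the constant values are given as commonly defined in arch/x86/include/asm/vmx.h and should be read as an assumption about this tree, not a quotation of it:

    /* VM-exit interruption information:
     *   bits  7:0  vector
     *   bits 10:8  type (2 = NMI, 3 = hardware exception)
     *   bit  31    valid
     */
    #define INTR_INFO_INTR_TYPE_MASK   0x700
    #define INTR_INFO_VALID_MASK       0x80000000
    #define INTR_TYPE_NMI_INTR         (2 << 8)
    #define INTR_TYPE_HARD_EXCEPTION   (3 << 8)

    /* is_nmi(): valid bit set and the type field equal to NMI. */

So the rename is not cosmetic: the predicate now names exactly what it tests.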
@@ -1486,6 +1488,13 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                        return;
                }
                break;
+       case MSR_IA32_PEBS_ENABLE:
+               /* PEBS needs a quiescent period after being disabled (to write
+                * a record).  Disabling PEBS through VMX MSR swapping doesn't
+                * provide that period, so a CPU could write host's record into
+                * guest's memory.
+                */
+               wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
        }
 
        for (i = 0; i < m->nr; ++i)
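The "MSR swapping" the comment refers to is the VM-entry/VM-exit MSR-load mechanism that add_atomic_switch_msr() populates; the CPU applies those entries inside the entry/exit microcode, leaving PEBS no quiescent window in which to flush a pending record. A sketch of the 16-byte entry format such lists use, per the SDM layout (the struct name here is illustrative, not necessarily this tree's):

    struct autoload_msr_entry {             /* one slot in the MSR-load area */
            u32 index;                      /* MSR number, e.g. MSR_IA32_PEBS_ENABLE */
            u32 reserved;                   /* must be zero */
            u64 value;                      /* value the CPU loads at the switch point */
    };

Clearing MSR_IA32_PEBS_ENABLE with a plain wrmsrl() before the switch entry takes effect moves the quiescent period into host context, so any in-flight PEBS record lands in host memory rather than in the guest's.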
@@ -2493,12 +2502,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        break;
                msr = find_msr_entry(vmx, msr_index);
                if (msr) {
+                       u64 old_msr_data = msr->data;
                        msr->data = data;
                        if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
                                preempt_disable();
-                               kvm_set_shared_msr(msr->index, msr->data,
-                                                  msr->mask);
+                               ret = kvm_set_shared_msr(msr->index, msr->data,
+                                                        msr->mask);
                                preempt_enable();
+                               if (ret)
+                                       msr->data = old_msr_data;
                        }
                        break;
                }
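The saved old_msr_data only matters because kvm_set_shared_msr() can now fail instead of letting a bad guest-supplied value fault on the host. A hedged sketch of the failure path being guarded against (checked_wrmsrl() is a hypothetical stand-in; wrmsrl_safe() is the fault-tolerant write primitive being relied on):

    /* Assumes kernel context (<asm/msr.h>); illustrative only. */
    static int checked_wrmsrl(u32 msr, u64 value)
    {
            int err = wrmsrl_safe(msr, value);      /* non-zero if the write faults */

            if (err)
                    pr_debug("MSR 0x%x rejected value 0x%llx\n", msr, value);
            return err;
    }

Rolling msr->data back on error keeps the cached guest_msrs entry consistent with what the hardware actually accepted.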
@@ -3062,7 +3074,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save)
        }
 
        vmcs_write16(sf->selector, var.selector);
-       vmcs_write32(sf->base, var.base);
+       vmcs_writel(sf->base, var.base);
        vmcs_write32(sf->limit, var.limit);
        vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
 }
@@ -3399,15 +3411,22 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
        var->limit = vmx_read_guest_seg_limit(vmx, seg);
        var->selector = vmx_read_guest_seg_selector(vmx, seg);
        ar = vmx_read_guest_seg_ar(vmx, seg);
+       var->unusable = (ar >> 16) & 1;
        var->type = ar & 15;
        var->s = (ar >> 4) & 1;
        var->dpl = (ar >> 5) & 3;
-       var->present = (ar >> 7) & 1;
+       /*
+        * Some userspaces do not preserve the unusable property. Since a
+        * usable segment has to be present according to the VMX spec, we can
+        * use the present property to work around that userspace bug by
+        * making an unusable segment always nonpresent.
+        * vmx_segment_access_rights() already marks a nonpresent segment as
+        * unusable.
+        */
+       var->present = !var->unusable;
        var->avl = (ar >> 12) & 1;
        var->l = (ar >> 13) & 1;
        var->db = (ar >> 14) & 1;
        var->g = (ar >> 15) & 1;
-       var->unusable = (ar >> 16) & 1;
 }
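The comment leans on the opposite direction, vmx_segment_access_rights(), which packs these fields back into the VMCS access-rights word. A paraphrased sketch of the property it provides (bit positions mirror the decode above; this is not the verbatim kernel function):

    static u32 segment_access_rights_sketch(const struct kvm_segment *var)
    {
            if (var->unusable || !var->present)
                    return 1 << 16;                 /* only the "unusable" bit */

            return  (var->type & 15)           |    /* bits 3:0   type */
                    ((var->s       & 1) << 4)  |    /* bit  4     S    */
                    ((var->dpl     & 3) << 5)  |    /* bits 6:5   DPL  */
                    ((var->present & 1) << 7)  |    /* bit  7     P    */
                    ((var->avl     & 1) << 12) |    /* bit  12    AVL  */
                    ((var->l       & 1) << 13) |    /* bit  13    L    */
                    ((var->db      & 1) << 14) |    /* bit  14    D/B  */
                    ((var->g       & 1) << 15);     /* bit  15    G    */
    }

Because an unusable or nonpresent segment is written out with only bit 16 set, forcing var->present = !var->unusable on the read side cannot create a state the write side would encode differently.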
 
 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
@@ -4066,11 +4085,16 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
        u32 low32, high32;
        unsigned long tmpl;
        struct desc_ptr dt;
+       unsigned long cr4;
 
        vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS);  /* 22.2.3 */
-       vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
        vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
 
+       /* Save the most likely value for this task's CR4 in the VMCS. */
+       cr4 = read_cr4();
+       vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
+       vmx->host_state.vmcs_host_cr4 = cr4;
+
        vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
 #ifdef CONFIG_X86_64
        /*
@@ -4692,7 +4716,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
        if (is_machine_check(intr_info))
                return handle_machine_check(vcpu);
 
-       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
+       if (is_nmi(intr_info))
                return 1;  /* already handled by vmx_vcpu_run() */
 
        if (is_no_device(intr_info)) {
@@ -5055,7 +5079,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
        msr.data = data;
        msr.index = ecx;
        msr.host_initiated = false;
-       if (vmx_set_msr(vcpu, &msr) != 0) {
+       if (kvm_set_msr(vcpu, &msr) != 0) {
                trace_kvm_msr_write_ex(ecx, data);
                kvm_inject_gp(vcpu, 0);
                return 1;
@@ -6232,6 +6256,18 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+       kvm_queue_exception(vcpu, UD_VECTOR);
+       return 1;
+}
+
+static int handle_invvpid(struct kvm_vcpu *vcpu)
+{
+       kvm_queue_exception(vcpu, UD_VECTOR);
+       return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -6276,6 +6312,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
        [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
        [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+       [EXIT_REASON_INVEPT]                  = handle_invept,
+       [EXIT_REASON_INVVPID]                 = handle_invvpid,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -6469,7 +6507,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 
        switch (exit_reason) {
        case EXIT_REASON_EXCEPTION_NMI:
-               if (!is_exception(intr_info))
+               if (is_nmi(intr_info))
                        return 0;
                else if (is_page_fault(intr_info))
                        return enable_ept;
@@ -6502,6 +6540,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
        case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
        case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+       case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
                /*
                 * VMX instructions trap unconditionally. This allows L1 to
                 * emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -6644,10 +6683,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
            && kvm_vmx_exit_handlers[exit_reason])
                return kvm_vmx_exit_handlers[exit_reason](vcpu);
        else {
-               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
-               vcpu->run->hw.hardware_exit_reason = exit_reason;
+               WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
        }
-       return 0;
 }
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -6664,6 +6703,12 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 {
        u32 sec_exec_control;
 
+       /* Postpone execution until vmcs01 is the current VMCS. */
+       if (is_guest_mode(vcpu)) {
+               to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
+               return;
+       }
+
        /*
         * There is no point in enabling virtualized x2apic without enabling
         * apicv
@@ -6758,8 +6803,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
                kvm_machine_check();
 
        /* We need to handle NMIs before interrupts are enabled */
-       if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
-           (exit_intr_info & INTR_INFO_VALID_MASK)) {
+       if (is_nmi(exit_intr_info)) {
                kvm_before_handle_nmi(&vmx->vcpu);
                asm("int $2");
                kvm_after_handle_nmi(&vmx->vcpu);
@@ -6946,7 +6990,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       unsigned long debugctlmsr;
+       unsigned long debugctlmsr, cr4;
 
        /* Record the guest's net vcpu time for enforced NMI injections. */
        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
@@ -6967,6 +7011,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 
+       cr4 = read_cr4();
+       if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
+               vmcs_writel(HOST_CR4, cr4);
+               vmx->host_state.vmcs_host_cr4 = cr4;
+       }
+
        /* When single-stepping over STI and MOV SS, we must clear the
         * corresponding interruptibility bits in the guest state. Otherwise
         * vmentry fails as it then expects bit 14 (BS) in pending debug
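The unlikely() comparison exists because host CR4 is not constant between entries; per-task bits such as CR4.TSD (and CR4.PCE under perf) can change while vmcs_host_cr4 still holds the value captured in vmx_set_constant_host_state(), and the stale HOST_CR4 the CPU loads at VM exit could then silently undo such a change. One concrete way to trigger that drift from userspace on a Linux host (illustrative; the prctl() really does make the kernel run the task with CR4.TSD set):

    /* Userspace sketch: after this call the kernel sets CR4.TSD whenever
     * this task runs, so the host CR4 at the next VM entry no longer
     * matches the cached vmcs_host_cr4.
     */
    #include <sys/prctl.h>

    int main(void)
    {
            return prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
    }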
@@ -7126,8 +7176,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        free_vpid(vmx);
-       free_nested(vmx);
        free_loaded_vmcs(vmx->loaded_vmcs);
+       free_nested(vmx);
        kfree(vmx->guest_msrs);
        kvm_vcpu_uninit(vcpu);
        kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -7942,7 +7992,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 
        kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
        kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
-       vmx_set_rflags(vcpu, X86_EFLAGS_BIT1);
+       vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
        /*
         * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
         * actually changed, because it depends on the current state of
@@ -7964,7 +8014,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
         * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
         */
        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
-       kvm_set_cr4(vcpu, vmcs12->host_cr4);
+       vmx_set_cr4(vcpu, vmcs12->host_cr4);
 
        /* shadow page tables on either EPT or shadow page tables */
        kvm_set_cr3(vcpu, vmcs12->host_cr3);
@@ -8041,6 +8091,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
        /* Update TSC_OFFSET if TSC was changed while L2 ran */
        vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
 
+       if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
+               vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
+               vmx_set_virtual_x2apic_mode(vcpu,
+                               vcpu->arch.apic_base & X2APIC_ENABLE);
+       }
+
        /* This is needed for same reason as it was needed in prepare_vmcs02 */
        vmx->host_rsp = 0;
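A detail worth noting in the replay above: the "set" argument is recomputed from vcpu->arch.apic_base rather than recorded next to the flag, because the x2APIC-enable bit of IA32_APIC_BASE always reflects the mode most recently programmed by L1, even if it changed again while L2 was running. A sketch of the bit being tested (value per the SDM's IA32_APIC_BASE layout; stated here as an assumption about this tree's X2APIC_ENABLE):

    #define X2APIC_ENABLE   (1UL << 10)     /* IA32_APIC_BASE bit 10: x2APIC mode */

    static bool x2apic_mode_requested(u64 apic_base)
    {
            return (apic_base & X2APIC_ENABLE) != 0;
    }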