KVM: nVMX: fix guest CR4 loading when emulating L2 to L1 exit

[GitHub/mt8127/android_kernel_alcatel_ttab.git] / arch / x86 / kvm / vmx.c
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c

index 260a9193955538b4fea743045b2f964b2736b24e..be138952728409420a0a8dc853bb7a7eb3714cbe 100644 (file)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -366,6 +366,7 @@ struct nested_vmx {
         struct list_head vmcs02_pool;
         int vmcs02_num;
         u64 vmcs01_tsc_offset;
+       bool change_vmcs01_virtual_x2apic_mode;
         /* L2 must run next, and mustn't decide to exit to L1. */
         bool nested_run_pending;
         /*
@@ -438,6 +439,7 @@ struct vcpu_vmx {
  #endif
                 int           gs_ldt_reload_needed;
                 int           fs_reload_needed;
+               unsigned long vmcs_host_cr4;    /* May not match real cr4 */
         } host_state;
         struct {
                 int vm86_active;
@@ -1045,10 +1047,10 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
         return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
  }
  
-static inline bool is_exception(u32 intr_info)
+static inline bool is_nmi(u32 intr_info)
  {
         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
-               == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+               == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
  }
  
  static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
@@ -1486,6 +1488,13 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                         return;
                 }
                 break;
+       case MSR_IA32_PEBS_ENABLE:
+               /* PEBS needs a quiescent period after being disabled (to write
+                * a record).  Disabling PEBS through VMX MSR swapping doesn't
+                * provide that period, so a CPU could write host's record into
+                * guest's memory.
+                */
+               wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
         }
  
         for (i = 0; i < m->nr; ++i)
@@ -2493,12 +2502,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                         break;
                 msr = find_msr_entry(vmx, msr_index);
                 if (msr) {
+                       u64 old_msr_data = msr->data;
                         msr->data = data;
                         if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
                                 preempt_disable();
-                               kvm_set_shared_msr(msr->index, msr->data,
-                                                  msr->mask);
+                               ret = kvm_set_shared_msr(msr->index, msr->data,
+                                                        msr->mask);
                                 preempt_enable();
+                               if (ret)
+                                       msr->data = old_msr_data;
                         }
                         break;
                 }
@@ -3062,7 +3074,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save)
         }
  
         vmcs_write16(sf->selector, var.selector);
-       vmcs_write32(sf->base, var.base);
+       vmcs_writel(sf->base, var.base);
         vmcs_write32(sf->limit, var.limit);
         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
  }
@@ -3399,15 +3411,22 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
         var->limit = vmx_read_guest_seg_limit(vmx, seg);
         var->selector = vmx_read_guest_seg_selector(vmx, seg);
         ar = vmx_read_guest_seg_ar(vmx, seg);
+       var->unusable = (ar >> 16) & 1;
         var->type = ar & 15;
         var->s = (ar >> 4) & 1;
         var->dpl = (ar >> 5) & 3;
-       var->present = (ar >> 7) & 1;
+       /*
+        * Some userspaces do not preserve the unusable property. Since a
+        * usable segment must be present according to the VMX spec, use the
+        * present property to work around that userspace bug by reporting an
+        * unusable segment as always nonpresent. vmx_segment_access_rights()
+        * already marks a nonpresent segment as unusable.
+        */
+       var->present = !var->unusable;
         var->avl = (ar >> 12) & 1;
         var->l = (ar >> 13) & 1;
         var->db = (ar >> 14) & 1;
         var->g = (ar >> 15) & 1;
-       var->unusable = (ar >> 16) & 1;
  }
  
  static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
@@ -4066,11 +4085,16 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
         u32 low32, high32;
         unsigned long tmpl;
         struct desc_ptr dt;
+       unsigned long cr4;
  
         vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS);  /* 22.2.3 */
-       vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
         vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
  
+       /* Save the most likely value for this task's CR4 in the VMCS. */
+       cr4 = read_cr4();
+       vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
+       vmx->host_state.vmcs_host_cr4 = cr4;
+
         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
  #ifdef CONFIG_X86_64
         /*
@@ -4692,7 +4716,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
         if (is_machine_check(intr_info))
                 return handle_machine_check(vcpu);
  
-       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
+       if (is_nmi(intr_info))
                 return 1;  /* already handled by vmx_vcpu_run() */
  
         if (is_no_device(intr_info)) {
@@ -5055,7 +5079,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
         msr.data = data;
         msr.index = ecx;
         msr.host_initiated = false;
-       if (vmx_set_msr(vcpu, &msr) != 0) {
+       if (kvm_set_msr(vcpu, &msr) != 0) {
                 trace_kvm_msr_write_ex(ecx, data);
                 kvm_inject_gp(vcpu, 0);
                 return 1;
@@ -6232,6 +6256,18 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
         return 1;
  }
  
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+       kvm_queue_exception(vcpu, UD_VECTOR);
+       return 1;
+}
+
+static int handle_invvpid(struct kvm_vcpu *vcpu)
+{
+       kvm_queue_exception(vcpu, UD_VECTOR);
+       return 1;
+}
+
  /*
   * The exit handlers return 1 if the exit was handled fully and guest execution
   * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -6276,6 +6312,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
         [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
         [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
         [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+       [EXIT_REASON_INVEPT]                  = handle_invept,
+       [EXIT_REASON_INVVPID]                 = handle_invvpid,
  };
  
  static const int kvm_vmx_max_exit_handlers =
@@ -6469,7 +6507,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
  
         switch (exit_reason) {
         case EXIT_REASON_EXCEPTION_NMI:
-               if (!is_exception(intr_info))
+               if (is_nmi(intr_info))
                         return 0;
                 else if (is_page_fault(intr_info))
                         return enable_ept;
@@ -6502,6 +6540,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
         case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
         case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
         case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+       case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
                 /*
                  * VMX instructions trap unconditionally. This allows L1 to
                  * emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -6644,10 +6683,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
             && kvm_vmx_exit_handlers[exit_reason])
                 return kvm_vmx_exit_handlers[exit_reason](vcpu);
         else {
-               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
-               vcpu->run->hw.hardware_exit_reason = exit_reason;
+               WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
         }
-       return 0;
  }
  
  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -6664,6 +6703,12 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
  {
         u32 sec_exec_control;
  
+       /* Postpone execution until vmcs01 is the current VMCS. */
+       if (is_guest_mode(vcpu)) {
+               to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
+               return;
+       }
+
         /*
          * There is not point to enable virtualize x2apic without enable
          * apicv
@@ -6758,8 +6803,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
                 kvm_machine_check();
  
         /* We need to handle NMIs before interrupts are enabled */
-       if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
-           (exit_intr_info & INTR_INFO_VALID_MASK)) {
+       if (is_nmi(exit_intr_info)) {
                 kvm_before_handle_nmi(&vmx->vcpu);
                 asm("int $2");
                 kvm_after_handle_nmi(&vmx->vcpu);
@@ -6946,7 +6990,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
  {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
-       unsigned long debugctlmsr;
+       unsigned long debugctlmsr, cr4;
  
         /* Record the guest's net vcpu time for enforced NMI injections. */
         if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
@@ -6967,6 +7011,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
         if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
                 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
  
+       cr4 = read_cr4();
+       if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
+               vmcs_writel(HOST_CR4, cr4);
+               vmx->host_state.vmcs_host_cr4 = cr4;
+       }
+
         /* When single-stepping over STI and MOV SS, we must clear the
          * corresponding interruptibility bits in the guest state. Otherwise
          * vmentry fails as it then expects bit 14 (BS) in pending debug
@@ -7126,8 +7176,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
         struct vcpu_vmx *vmx = to_vmx(vcpu);
  
         free_vpid(vmx);
-       free_nested(vmx);
         free_loaded_vmcs(vmx->loaded_vmcs);
+       free_nested(vmx);
         kfree(vmx->guest_msrs);
         kvm_vcpu_uninit(vcpu);
         kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -7942,7 +7992,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
  
         kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
         kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
-       vmx_set_rflags(vcpu, X86_EFLAGS_BIT1);
+       vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
         /*
          * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
          * actually changed, because it depends on the current state of
@@ -7964,7 +8014,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
          * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
          */
         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
-       kvm_set_cr4(vcpu, vmcs12->host_cr4);
+       vmx_set_cr4(vcpu, vmcs12->host_cr4);
  
         /* shadow page tables on either EPT or shadow page tables */
         kvm_set_cr3(vcpu, vmcs12->host_cr3);
@@ -8041,6 +8091,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
         /* Update TSC_OFFSET if TSC was changed while L2 ran */
         vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
  
+       if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
+               vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
+               vmx_set_virtual_x2apic_mode(vcpu,
+                               vcpu->arch.apic_base & X2APIC_ENABLE);
+       }
+
         /* This is needed for same reason as it was needed in prepare_vmcs02 */
         vmx->host_rsp = 0;