nEPT: Nested INVEPT
authorNadav Har'El <nyh@il.ibm.com>
Mon, 5 Aug 2013 08:07:17 +0000 (11:07 +0300)
committerPaolo Bonzini <pbonzini@redhat.com>
Wed, 7 Aug 2013 13:57:42 +0000 (15:57 +0200)
If we let L1 use EPT, we should probably also support the INVEPT instruction.

In our current nested EPT implementation, when L1 changes its EPT table
for L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in
the course of this modification already calls INVEPT. But if last level
of shadow page is unsync not all L1's changes to EPT12 are intercepted,
which means roots need to be synced when L1 calls INVEPT. Global INVEPT
should not be different since roots are synced by kvm_mmu_load() each
time EPTP02 changes.

Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
arch/x86/include/asm/vmx.h
arch/x86/include/uapi/asm/vmx.h
arch/x86/kvm/mmu.c
arch/x86/kvm/vmx.c

index f3e01a2cbaa1b965f0adf89aa1c00b7a3619df51..966502d4682eeaf0c61b34b1b186e48cfdedf4fe 100644 (file)
@@ -387,6 +387,7 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR         0
 #define VMX_EPT_EXTENT_CONTEXT                 1
 #define VMX_EPT_EXTENT_GLOBAL                  2
+#define VMX_EPT_EXTENT_SHIFT                   24
 
 #define VMX_EPT_EXECUTE_ONLY_BIT               (1ull)
 #define VMX_EPT_PAGE_WALK_4_BIT                        (1ull << 6)
@@ -394,6 +395,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT                                (1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT                   (1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT                   (1ull << 17)
+#define VMX_EPT_INVEPT_BIT                     (1ull << 20)
 #define VMX_EPT_AD_BIT                             (1ull << 21)
 #define VMX_EPT_EXTENT_CONTEXT_BIT             (1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT              (1ull << 26)
index d651082c7cf720a805f653b162343f6b9a984329..7a34e8fe54bd1d8d2f9c98cbde0edd6b38f13ccf 100644 (file)
@@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED         45
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
+#define EXIT_REASON_INVEPT              50
 #define EXIT_REASON_PREEMPTION_TIMER    52
 #define EXIT_REASON_WBINVD              54
 #define EXIT_REASON_XSETBV              55
index 992fde984e25c63e562a0ae39096e204bcc3de62..9651c993758883ded46d1b8028e44316dca93ab3 100644 (file)
@@ -3182,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
        mmu_sync_roots(vcpu);
        spin_unlock(&vcpu->kvm->mmu_lock);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
                                  u32 access, struct x86_exception *exception)
@@ -3451,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
        ++vcpu->stat.tlb_flush;
        kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
index 2ae0aa4461e828d18ff425a0a632cd3def28555f..5129ba3766c42a48c1fdf32d420082948d06fa1b 100644 (file)
@@ -712,6 +712,7 @@ static void nested_release_page_clean(struct page *page)
        kvm_release_page_clean(page);
 }
 
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
@@ -2161,6 +2162,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
 static u32 nested_vmx_misc_low, nested_vmx_misc_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
        /*
@@ -6279,6 +6281,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+       u32 vmx_instruction_info, types;
+       unsigned long type;
+       gva_t gva;
+       struct x86_exception e;
+       struct {
+               u64 eptp, gpa;
+       } operand;
+       u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
+
+       if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+           !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       if (!nested_vmx_check_permission(vcpu))
+               return 1;
+
+       if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+       type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+       types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+
+       if (!(types & (1UL << type))) {
+               nested_vmx_failValid(vcpu,
+                               VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+               return 1;
+       }
+
+       /* According to the Intel VMX instruction reference, the memory
+        * operand is read even if it isn't needed (e.g., for type==global)
+        */
+       if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+                       vmx_instruction_info, &gva))
+               return 1;
+       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+                               sizeof(operand), &e)) {
+               kvm_inject_page_fault(vcpu, &e);
+               return 1;
+       }
+
+       switch (type) {
+       case VMX_EPT_EXTENT_CONTEXT:
+               if ((operand.eptp & eptp_mask) !=
+                               (nested_ept_get_cr3(vcpu) & eptp_mask))
+                       break;
+       case VMX_EPT_EXTENT_GLOBAL:
+               kvm_mmu_sync_roots(vcpu);
+               kvm_mmu_flush_tlb(vcpu);
+               nested_vmx_succeed(vcpu);
+               break;
+       default:
+               BUG_ON(1);
+               break;
+       }
+
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -6323,6 +6393,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
        [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
        [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+       [EXIT_REASON_INVEPT]                  = handle_invept,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -6549,6 +6620,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
        case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
        case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+       case EXIT_REASON_INVEPT:
                /*
                 * VMX instructions trap unconditionally. This allows L1 to
                 * emulate them for its L2 guest, i.e., allows 3-level nesting!