struct list_head vmcs02_pool;
int vmcs02_num;
u64 vmcs01_tsc_offset;
+ bool change_vmcs01_virtual_x2apic_mode;
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
/*
#endif
int gs_ldt_reload_needed;
int fs_reload_needed;
+ unsigned long vmcs_host_cr4; /* May not match real cr4 */
} host_state;
struct {
int vm86_active;
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
}
-static inline bool is_exception(u32 intr_info)
+static inline bool is_nmi(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+ == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
}
static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
return;
}
break;
+ case MSR_IA32_PEBS_ENABLE:
+ /* PEBS needs a quiescent period after being disabled (to write
+ * a record). Disabling PEBS through VMX MSR swapping doesn't
+ * provide that period, so a CPU could write host's record into
+ * guest's memory.
+ */
+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
for (i = 0; i < m->nr; ++i)
break;
msr = find_msr_entry(vmx, msr_index);
if (msr) {
+ u64 old_msr_data = msr->data;
msr->data = data;
if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
preempt_disable();
- kvm_set_shared_msr(msr->index, msr->data,
- msr->mask);
+ ret = kvm_set_shared_msr(msr->index, msr->data,
+ msr->mask);
preempt_enable();
+ if (ret)
+ msr->data = old_msr_data;
}
break;
}
}
vmcs_write16(sf->selector, var.selector);
- vmcs_write32(sf->base, var.base);
+ vmcs_writel(sf->base, var.base);
vmcs_write32(sf->limit, var.limit);
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
}
var->limit = vmx_read_guest_seg_limit(vmx, seg);
var->selector = vmx_read_guest_seg_selector(vmx, seg);
ar = vmx_read_guest_seg_ar(vmx, seg);
+ var->unusable = (ar >> 16) & 1;
var->type = ar & 15;
var->s = (ar >> 4) & 1;
var->dpl = (ar >> 5) & 3;
- var->present = (ar >> 7) & 1;
+ /*
+ * Some userspaces do not preserve unusable property. Since usable
+ * segment has to be present according to VMX spec we can use present
+ * property to amend userspace bug by making unusable segment always
+ * nonpresent. vmx_segment_access_rights() already marks nonpresent
+ * segment as unusable.
+ */
+ var->present = !var->unusable;
var->avl = (ar >> 12) & 1;
var->l = (ar >> 13) & 1;
var->db = (ar >> 14) & 1;
var->g = (ar >> 15) & 1;
- var->unusable = (ar >> 16) & 1;
}
static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
u32 low32, high32;
unsigned long tmpl;
struct desc_ptr dt;
+ unsigned long cr4;
vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */
- vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
+ /* Save the most likely value for this task's CR4 in the VMCS. */
+ cr4 = read_cr4();
+ vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
+ vmx->host_state.vmcs_host_cr4 = cr4;
+
vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
#ifdef CONFIG_X86_64
/*
if (is_machine_check(intr_info))
return handle_machine_check(vcpu);
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
+ if (is_nmi(intr_info))
return 1; /* already handled by vmx_vcpu_run() */
if (is_no_device(intr_info)) {
msr.data = data;
msr.index = ecx;
msr.host_initiated = false;
- if (vmx_set_msr(vcpu, &msr) != 0) {
+ if (kvm_set_msr(vcpu, &msr) != 0) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
return 1;
}
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int handle_invvpid(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
[EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
[EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
+ [EXIT_REASON_INVEPT] = handle_invept,
+ [EXIT_REASON_INVVPID] = handle_invvpid,
};
static const int kvm_vmx_max_exit_handlers =
switch (exit_reason) {
case EXIT_REASON_EXCEPTION_NMI:
- if (!is_exception(intr_info))
+ if (is_nmi(intr_info))
return 0;
else if (is_page_fault(intr_info))
return enable_ept;
case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+ case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
/*
* VMX instructions trap unconditionally. This allows L1 to
* emulate them for its L2 guest, i.e., allows 3-level nesting!
&& kvm_vmx_exit_handlers[exit_reason])
return kvm_vmx_exit_handlers[exit_reason](vcpu);
else {
- vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- vcpu->run->hw.hardware_exit_reason = exit_reason;
+ WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
}
- return 0;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
u32 sec_exec_control;
+ /* Postpone execution until vmcs01 is the current VMCS. */
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
+ return;
+ }
+
/*
* There is not point to enable virtualize x2apic without enable
* apicv
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
- if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (exit_intr_info & INTR_INFO_VALID_MASK)) {
+ if (is_nmi(exit_intr_info)) {
kvm_before_handle_nmi(&vmx->vcpu);
asm("int $2");
kvm_after_handle_nmi(&vmx->vcpu);
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long debugctlmsr;
+ unsigned long debugctlmsr, cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ cr4 = read_cr4();
+ if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
+ vmcs_writel(HOST_CR4, cr4);
+ vmx->host_state.vmcs_host_cr4 = cr4;
+ }
+
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
struct vcpu_vmx *vmx = to_vmx(vcpu);
free_vpid(vmx);
- free_nested(vmx);
free_loaded_vmcs(vmx->loaded_vmcs);
+ free_nested(vmx);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, vmx);
kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
- vmx_set_rflags(vcpu, X86_EFLAGS_BIT1);
+ vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
/*
* Note that calling vmx_set_cr0 is important, even if cr0 hasn't
* actually changed, because it depends on the current state of
* (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
*/
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
- kvm_set_cr4(vcpu, vmcs12->host_cr4);
+ vmx_set_cr4(vcpu, vmcs12->host_cr4);
/* shadow page tables on either EPT or shadow page tables */
kvm_set_cr3(vcpu, vmcs12->host_cr3);
/* Update TSC_OFFSET if TSC was changed while L2 ran */
vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+ if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
+ vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
+ vmx_set_virtual_x2apic_mode(vcpu,
+ vcpu->arch.apic_base & X2APIC_ENABLE);
+ }
+
/* This is needed for same reason as it was needed in prepare_vmcs02 */
vmx->host_rsp = 0;