KVM: Use eoi to track RTC interrupt delivery status
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6667042714cc65e78208c517ccc0802a40dd4ccf..71755573b7ca58e6b047ae2529b1e9c6dee104b6 100644
@@ -298,7 +298,8 @@ struct __packed vmcs12 {
        u32 guest_activity_state;
        u32 guest_sysenter_cs;
        u32 host_ia32_sysenter_cs;
-       u32 padding32[8]; /* room for future expansion */
+       u32 vmx_preemption_timer_value;
+       u32 padding32[7]; /* room for future expansion */
        u16 virtual_processor_id;
        u16 guest_es_selector;
        u16 guest_cs_selector;
@@ -537,6 +538,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
        FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
        FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
+       FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
        FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
        FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
        FIELD(CR0_READ_SHADOW, cr0_read_shadow),
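(For orientation: each FIELD() entry maps a VMCS field encoding to a byte offset inside struct vmcs12, which is how nested VMREAD/VMWRITE are emulated as plain memory accesses. A minimal sketch of that machinery, reconstructed from the surrounding code rather than taken from this diff:)

    #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
    #define FIELD(number, name) [number] = VMCS12_OFFSET(name)

    static inline short vmcs_field_to_offset(unsigned long field)
    {
            /* -1 tells the caller the field encoding is unknown */
            if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
                    return -1;
            return vmcs_field_to_offset_table[field];
    }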
@@ -2022,6 +2024,7 @@ static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static u32 nested_vmx_misc_low, nested_vmx_misc_high;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
        /*
@@ -2040,30 +2043,40 @@ static __init void nested_vmx_setup_ctls_msrs(void)
         */
 
        /* pin-based controls */
+       rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
+             nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
        /*
         * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
         * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
         */
-       nested_vmx_pinbased_ctls_low = 0x16 ;
-       nested_vmx_pinbased_ctls_high = 0x16 |
-               PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
-               PIN_BASED_VIRTUAL_NMIS;
+       nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+       nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
+               PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS |
+               PIN_BASED_VMX_PREEMPTION_TIMER;
+       nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 
-       /* exit controls */
-       nested_vmx_exit_ctls_low = 0;
+       /*
+        * Exit controls
+        * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
+        * 17 must be 1.
+        */
+       nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
        /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
 #ifdef CONFIG_X86_64
        nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #else
        nested_vmx_exit_ctls_high = 0;
 #endif
+       nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 
        /* entry controls */
        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
                nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
-       nested_vmx_entry_ctls_low = 0;
+       /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
+       nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
        nested_vmx_entry_ctls_high &=
                VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
+       nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 
        /* cpu-based controls */
        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2080,6 +2093,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
                CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
                CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
                CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
+               CPU_BASED_PAUSE_EXITING |
                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
        /*
         * We can allow some features even when not supported by the
@@ -2094,7 +2108,14 @@ static __init void nested_vmx_setup_ctls_msrs(void)
                nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
        nested_vmx_secondary_ctls_low = 0;
        nested_vmx_secondary_ctls_high &=
-               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+               SECONDARY_EXEC_WBINVD_EXITING;
+
+       /* miscellaneous data */
+       rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
+       nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
+               VMX_MISC_SAVE_EFER_LMA;
+       nested_vmx_misc_high = 0;
 }
 
 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
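(Each VMX capability MSR advertises allowed-0 settings in its low 32 bits and allowed-1 settings in its high 32 bits; the &= / |= dance above keeps the values exposed to L1 within what the hardware and KVM support. The two helpers this hunk feeds are roughly the following; bodies reconstructed from the surrounding code, not part of this diff:)

    /* a control value is valid iff it sets every allowed-0 (low) bit
     * and nothing outside the allowed-1 (high) mask */
    static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
    {
            return ((control & low) == low) && ((control & high) == control);
    }

    /* compose the 64-bit capability MSR value returned to L1 */
    static inline u64 vmx_control_msr(u32 low, u32 high)
    {
            return low | ((u64)high << 32);
    }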
@@ -2165,7 +2186,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                                        nested_vmx_entry_ctls_high);
                break;
        case MSR_IA32_VMX_MISC:
-               *pdata = 0;
+               *pdata = vmx_control_msr(nested_vmx_misc_low,
+                                        nested_vmx_misc_high);
                break;
        /*
         * These MSRs specify bits which the guest must keep fixed (on or off)
@@ -2876,22 +2898,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
        vmx->cpl = 0;
 }
 
-static gva_t rmode_tss_base(struct kvm *kvm)
-{
-       if (!kvm->arch.tss_addr) {
-               struct kvm_memslots *slots;
-               struct kvm_memory_slot *slot;
-               gfn_t base_gfn;
-
-               slots = kvm_memslots(kvm);
-               slot = id_to_memslot(slots, 0);
-               base_gfn = slot->base_gfn + slot->npages - 3;
-
-               return base_gfn << PAGE_SHIFT;
-       }
-       return kvm->arch.tss_addr;
-}
-
 static void fix_rmode_seg(int seg, struct kvm_segment *save)
 {
        const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -2942,19 +2948,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 
        /*
         * Very old userspace does not call KVM_SET_TSS_ADDR before entering
-        * vcpu. Call it here with phys address pointing 16M below 4G.
+        * vcpu. Warn the user that an update is overdue.
         */
-       if (!vcpu->kvm->arch.tss_addr) {
+       if (!vcpu->kvm->arch.tss_addr)
                printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
                             "called before entering vcpu\n");
-               srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-               vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
-               vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-       }
 
        vmx_segment_cache_clear(vmx);
 
-       vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
+       vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
        vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
 
@@ -3214,7 +3216,9 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                 */
                if (!nested_vmx_allowed(vcpu))
                        return 1;
-       } else if (to_vmx(vcpu)->nested.vmxon)
+       }
+       if (to_vmx(vcpu)->nested.vmxon &&
+           ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON))
                return 1;
 
        vcpu->arch.cr4 = cr4;
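(The rewritten check enforces the CR4 bits that must remain set for as long as the vCPU stays in VMX operation, independently of whether the write touches VMXE. A toy illustration, assuming VMXON_CR4_ALWAYSON is essentially X86_CR4_VMXE as defined earlier in this file:)

    unsigned long cr4 = X86_CR4_PAE;   /* guest tries to clear VMXE */
    /* rejected while vmxon is active: an always-on bit would go to 0 */
    bool reject = (cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON;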
@@ -3599,7 +3603,7 @@ static int init_rmode_tss(struct kvm *kvm)
        int r, idx, ret = 0;
 
        idx = srcu_read_lock(&kvm->srcu);
-       fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
+       fn = kvm->arch.tss_addr >> PAGE_SHIFT;
        r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
        if (r < 0)
                goto out;
@@ -3692,7 +3696,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
        kvm_userspace_mem.flags = 0;
        kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
        kvm_userspace_mem.memory_size = PAGE_SIZE;
-       r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
+       r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
        if (r)
                goto out;
 
@@ -3722,7 +3726,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
        kvm_userspace_mem.guest_phys_addr =
                kvm->arch.ept_identity_map_addr;
        kvm_userspace_mem.memory_size = PAGE_SIZE;
-       r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
+       r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
        if (r)
                goto out;
 
@@ -4089,11 +4093,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        return 0;
 }
 
-static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u64 msr;
-       int ret;
 
        vmx->rmode.vm86_active = 0;
 
@@ -4109,12 +4112,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        vmx_segment_cache_clear(vmx);
 
        seg_setup(VCPU_SREG_CS);
-       if (kvm_vcpu_is_bsp(&vmx->vcpu))
-               vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
-       else {
-               vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
-               vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
-       }
+       vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+       vmcs_write32(GUEST_CS_BASE, 0xffff0000);
 
        seg_setup(VCPU_SREG_DS);
        seg_setup(VCPU_SREG_ES);
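(With CS.selector = 0xf000 and CS.base = 0xffff0000, every vCPU now comes out of reset at the architectural reset vector; the BSP/AP split is gone because, as of this series, the SIPI vector is presumably applied by common code after reset rather than here. The arithmetic:)

    /* worked example: first instruction fetch after reset */
    u64 cs_base = 0xffff0000ULL;      /* GUEST_CS_BASE written above */
    u64 rip     = 0xfff0;             /* kvm_rip_write() below */
    u64 fetch   = cs_base + rip;      /* 0xfffffff0, the reset vector */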
@@ -4137,10 +4136,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        vmcs_writel(GUEST_SYSENTER_EIP, 0);
 
        vmcs_writel(GUEST_RFLAGS, 0x02);
-       if (kvm_vcpu_is_bsp(&vmx->vcpu))
-               kvm_rip_write(vcpu, 0xfff0);
-       else
-               kvm_rip_write(vcpu, 0);
+       kvm_rip_write(vcpu, 0xfff0);
 
        vmcs_writel(GUEST_GDTR_BASE, 0);
        vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4175,19 +4171,13 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
        vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
        vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
-       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
        vmx_set_cr4(&vmx->vcpu, 0);
        vmx_set_efer(&vmx->vcpu, 0);
        vmx_fpu_activate(&vmx->vcpu);
        update_exception_bitmap(&vmx->vcpu);
 
        vpid_sync_context(vmx);
-
-       ret = 0;
-
-       return ret;
 }
 
 /*
@@ -4335,16 +4325,20 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+       if (is_guest_mode(vcpu)) {
                struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-               if (to_vmx(vcpu)->nested.nested_run_pending ||
-                   (vmcs12->idt_vectoring_info_field &
-                    VECTORING_INFO_VALID_MASK))
+
+               if (to_vmx(vcpu)->nested.nested_run_pending)
                        return 0;
-               nested_vmx_vmexit(vcpu);
-               vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
-               vmcs12->vm_exit_intr_info = 0;
-               /* fall through to normal code, but now in L1, not L2 */
+               if (nested_exit_on_intr(vcpu)) {
+                       nested_vmx_vmexit(vcpu);
+                       vmcs12->vm_exit_reason =
+                               EXIT_REASON_EXTERNAL_INTERRUPT;
+                       vmcs12->vm_exit_intr_info = 0;
+                       /*
+                        * fall through to normal code, but now in L1, not L2
+                        */
+               }
        }
 
        return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -4362,7 +4356,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
                .flags = 0,
        };
 
-       ret = kvm_set_memory_region(kvm, &tss_mem, false);
+       ret = kvm_set_memory_region(kvm, &tss_mem);
        if (ret)
                return ret;
        kvm->arch.tss_addr = addr;
@@ -4603,34 +4597,50 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
 {
-       if (to_vmx(vcpu)->nested.vmxon &&
-           ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
-               return 1;
-
        if (is_guest_mode(vcpu)) {
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+               unsigned long orig_val = val;
+
                /*
                 * We get here when L2 changed cr0 in a way that did not change
                 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
-                * but did change L0 shadowed bits. This can currently happen
-                * with the TS bit: L0 may want to leave TS on (for lazy fpu
-                * loading) while pretending to allow the guest to change it.
+                * but did change L0 shadowed bits. So we first calculate the
+                * effective cr0 value that L1 would like to write into the
+                * hardware. It consists of the L2-owned bits from the new
+                * value combined with the L1-owned bits from L1's guest_cr0.
                 */
-               if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
-                        (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
+               val = (val & ~vmcs12->cr0_guest_host_mask) |
+                       (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
+
+               /* TODO: will have to take unrestricted guest mode into
+                * account */
+               if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)
+                       return 1;
+
+               if (kvm_set_cr0(vcpu, val))
                        return 1;
-               vmcs_writel(CR0_READ_SHADOW, val);
+               vmcs_writel(CR0_READ_SHADOW, orig_val);
                return 0;
-       } else
+       } else {
+               if (to_vmx(vcpu)->nested.vmxon &&
+                   ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
+                       return 1;
                return kvm_set_cr0(vcpu, val);
+       }
 }
 
 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
 {
        if (is_guest_mode(vcpu)) {
-               if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
-                        (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+               unsigned long orig_val = val;
+
+               /* analogously to handle_set_cr0 */
+               val = (val & ~vmcs12->cr4_guest_host_mask) |
+                       (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
+               if (kvm_set_cr4(vcpu, val))
                        return 1;
-               vmcs_writel(CR4_READ_SHADOW, val);
+               vmcs_writel(CR4_READ_SHADOW, orig_val);
                return 0;
        } else
                return kvm_set_cr4(vcpu, val);
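(Both handlers rely on the same bit-merge: bits covered by cr0/cr4_guest_host_mask are owned by L1 and must keep L1's value from vmcs12, while the remaining bits take L2's newly written value. A worked example with a hypothetical mask where L1 owns only CR0.TS:)

    unsigned long mask      = X86_CR0_TS;                /* cr0_guest_host_mask */
    unsigned long guest_cr0 = X86_CR0_PE | X86_CR0_TS;   /* L1's guest_cr0 */
    unsigned long val       = X86_CR0_PE | X86_CR0_MP;   /* L2's new value */

    val = (val & ~mask) | (guest_cr0 & mask);
    /* result: PE and MP come from L2, TS is forced back on for L1 */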
@@ -5183,7 +5193,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
                        return 1;
 
-               err = emulate_instruction(vcpu, 0);
+               err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
 
                if (err == EMULATE_DO_MMIO) {
                        ret = 0;
@@ -5259,8 +5269,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
        }
 
        /* Create a new VMCS */
-       item = (struct vmcs02_list *)
-               kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+       item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
        if (!item)
                return NULL;
        item->vmcs02.vmcs = alloc_vmcs();
@@ -5908,6 +5917,52 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 static const int kvm_vmx_max_exit_handlers =
        ARRAY_SIZE(kvm_vmx_exit_handlers);
 
+static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
+                                      struct vmcs12 *vmcs12)
+{
+       unsigned long exit_qualification;
+       gpa_t bitmap, last_bitmap;
+       unsigned int port;
+       int size;
+       u8 b;
+
+       if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING))
+               return 1;
+
+       if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+               return 0;
+
+       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+       port = exit_qualification >> 16;
+       size = (exit_qualification & 7) + 1;
+
+       last_bitmap = (gpa_t)-1;
+       b = -1;
+
+       while (size > 0) {
+               if (port < 0x8000)
+                       bitmap = vmcs12->io_bitmap_a;
+               else if (port < 0x10000)
+                       bitmap = vmcs12->io_bitmap_b;
+               else
+                       return 1;
+               bitmap += (port & 0x7fff) / 8;
+
+               if (last_bitmap != bitmap)
+                       if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
+                               return 1;
+               if (b & (1 << (port & 7)))
+                       return 1;
+
+               port++;
+               size--;
+               last_bitmap = bitmap;
+       }
+
+       return 0;
+}
+
 /*
 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
@@ -5939,7 +5994,8 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
        /* Then read the msr_index'th bit from this bitmap: */
        if (msr_index < 1024*8) {
                unsigned char b;
-               kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
+               if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
+                       return 1;
                return 1 & (b >> (msr_index & 7));
        } else
                return 1; /* let L1 handle the wrong parameter */
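(Both bitmap walkers index the same way: one bit per port or MSR, eight per byte, read from L1's guest memory. A worked example for the I/O case, port 0x92:)

    unsigned int port = 0x92;               /* < 0x8000, so io_bitmap_a applies */
    gpa_t byte_off = (port & 0x7fff) / 8;   /* byte 18 of the bitmap */
    u8  bit        = port & 7;              /* bit 2 within that byte */
    /* the exit is reflected to L1 iff that bit is set in L1's bitmap */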
@@ -6033,10 +6089,10 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
  */
 static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 {
-       u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
        u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       u32 exit_reason = vmx->exit_reason;
 
        if (vmx->nested.nested_run_pending)
                return 0;
@@ -6060,14 +6116,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_TRIPLE_FAULT:
                return 1;
        case EXIT_REASON_PENDING_INTERRUPT:
+               return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
        case EXIT_REASON_NMI_WINDOW:
-               /*
-                * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
-                * (aka Interrupt Window Exiting) only when L1 turned it on,
-                * so if we got a PENDING_INTERRUPT exit, this must be for L1.
-                * Same for NMI Window Exiting.
-                */
-               return 1;
+               return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
        case EXIT_REASON_TASK_SWITCH:
                return 1;
        case EXIT_REASON_CPUID:
@@ -6097,8 +6148,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_DR_ACCESS:
                return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
        case EXIT_REASON_IO_INSTRUCTION:
-               /* TODO: support IO bitmaps */
-               return 1;
+               return nested_vmx_exit_handled_io(vcpu, vmcs12);
        case EXIT_REASON_MSR_READ:
        case EXIT_REASON_MSR_WRITE:
                return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
@@ -6122,6 +6172,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_EPT_VIOLATION:
        case EXIT_REASON_EPT_MISCONFIG:
                return 0;
+       case EXIT_REASON_PREEMPTION_TIMER:
+               return vmcs12->pin_based_vm_exec_control &
+                       PIN_BASED_VMX_PREEMPTION_TIMER;
        case EXIT_REASON_WBINVD:
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
        case EXIT_REASON_XSETBV:
@@ -6388,7 +6441,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
                        ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
 }
 
-static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
+static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
                                      u32 idt_vectoring_info,
                                      int instr_len_field,
                                      int error_code_field)
@@ -6399,46 +6452,43 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
 
        idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
-       vmx->vcpu.arch.nmi_injected = false;
-       kvm_clear_exception_queue(&vmx->vcpu);
-       kvm_clear_interrupt_queue(&vmx->vcpu);
+       vcpu->arch.nmi_injected = false;
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 
        if (!idtv_info_valid)
                return;
 
-       kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
 
        vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
        type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
 
        switch (type) {
        case INTR_TYPE_NMI_INTR:
-               vmx->vcpu.arch.nmi_injected = true;
+               vcpu->arch.nmi_injected = true;
                /*
                 * SDM 3: 27.7.1.2 (September 2008)
                 * Clear bit "block by NMI" before VM entry if a NMI
                 * delivery faulted.
                 */
-               vmx_set_nmi_mask(&vmx->vcpu, false);
+               vmx_set_nmi_mask(vcpu, false);
                break;
        case INTR_TYPE_SOFT_EXCEPTION:
-               vmx->vcpu.arch.event_exit_inst_len =
-                       vmcs_read32(instr_len_field);
+               vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
                /* fall through */
        case INTR_TYPE_HARD_EXCEPTION:
                if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
                        u32 err = vmcs_read32(error_code_field);
-                       kvm_queue_exception_e(&vmx->vcpu, vector, err);
+                       kvm_queue_exception_e(vcpu, vector, err);
                } else
-                       kvm_queue_exception(&vmx->vcpu, vector);
+                       kvm_queue_exception(vcpu, vector);
                break;
        case INTR_TYPE_SOFT_INTR:
-               vmx->vcpu.arch.event_exit_inst_len =
-                       vmcs_read32(instr_len_field);
+               vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
                /* fall through */
        case INTR_TYPE_EXT_INTR:
-               kvm_queue_interrupt(&vmx->vcpu, vector,
-                       type == INTR_TYPE_SOFT_INTR);
+               kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
                break;
        default:
                break;
@@ -6447,18 +6497,14 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
 
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
-       if (is_guest_mode(&vmx->vcpu))
-               return;
-       __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
+       __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
                                  VM_EXIT_INSTRUCTION_LEN,
                                  IDT_VECTORING_ERROR_CODE);
 }
 
 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu))
-               return;
-       __vmx_complete_interrupts(to_vmx(vcpu),
+       __vmx_complete_interrupts(vcpu,
                                  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
                                  VM_ENTRY_INSTRUCTION_LEN,
                                  VM_ENTRY_EXCEPTION_ERROR_CODE);
@@ -6489,21 +6535,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long debugctlmsr;
 
-       if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
-               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-               if (vmcs12->idt_vectoring_info_field &
-                               VECTORING_INFO_VALID_MASK) {
-                       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                               vmcs12->idt_vectoring_info_field);
-                       vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-                               vmcs12->vm_exit_instruction_len);
-                       if (vmcs12->idt_vectoring_info_field &
-                                       VECTORING_INFO_DELIVER_CODE_MASK)
-                               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-                                       vmcs12->idt_vectoring_error_code);
-               }
-       }
-
        /* Record the guest's net vcpu time for enforced NMI injections. */
        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
                vmx->entry_time = ktime_get();
@@ -6662,17 +6693,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
-       if (is_guest_mode(vcpu)) {
-               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-               vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
-               if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
-                       vmcs12->idt_vectoring_error_code =
-                               vmcs_read32(IDT_VECTORING_ERROR_CODE);
-                       vmcs12->vm_exit_instruction_len =
-                               vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-               }
-       }
-
        vmx->loaded_vmcs->launched = 1;
 
        vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
@@ -6734,10 +6754,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        put_cpu();
        if (err)
                goto free_vmcs;
-       if (vm_need_virtualize_apic_accesses(kvm))
+       if (vm_need_virtualize_apic_accesses(kvm)) {
                err = alloc_apic_access_page(kvm);
                if (err)
                        goto free_vmcs;
+       }
 
        if (enable_ept) {
                if (!kvm->arch.ept_identity_map_addr)
@@ -6933,7 +6954,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                vmcs12->guest_interruptibility_info);
        vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
-       vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
+       kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
        vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
        vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
                vmcs12->guest_pending_dbg_exceptions);
@@ -6946,6 +6967,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                (vmcs_config.pin_based_exec_ctrl |
                 vmcs12->pin_based_vm_exec_control));
 
+       if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
+               vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
+                            vmcs12->vmx_preemption_timer_value);
+
        /*
         * Whether page-faults are trapped is determined by a combination of
         * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
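(The preemption timer counts down while the guest runs, at the TSC frequency divided by 2^X, where X is the rate field preserved in nested_vmx_misc_low earlier in this patch. A sketch of the conversion, assuming VMX_MISC_PREEMPTION_TIMER_RATE_MASK covers bits 4:0 as in the SDM:)

    u32 rate  = nested_vmx_misc_low & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
    u64 ticks = tsc_cycles >> rate;   /* one timer tick every 2^rate TSC cycles */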
@@ -7223,6 +7248,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        vcpu->cpu = cpu;
        put_cpu();
 
+       vmx_segment_cache_clear(vmx);
+
        vmcs12->launch_state = 1;
 
        prepare_vmcs02(vcpu, vmcs12);
@@ -7273,6 +7300,48 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                        vcpu->arch.cr4_guest_owned_bits));
 }
 
+static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
+                                      struct vmcs12 *vmcs12)
+{
+       u32 idt_vectoring;
+       unsigned int nr;
+
+       if (vcpu->arch.exception.pending) {
+               nr = vcpu->arch.exception.nr;
+               idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+               if (kvm_exception_is_soft(nr)) {
+                       vmcs12->vm_exit_instruction_len =
+                               vcpu->arch.event_exit_inst_len;
+                       idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
+               } else
+                       idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
+
+               if (vcpu->arch.exception.has_error_code) {
+                       idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
+                       vmcs12->idt_vectoring_error_code =
+                               vcpu->arch.exception.error_code;
+               }
+
+               vmcs12->idt_vectoring_info_field = idt_vectoring;
+       } else if (vcpu->arch.nmi_pending) {
+               vmcs12->idt_vectoring_info_field =
+                       INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
+       } else if (vcpu->arch.interrupt.pending) {
+               nr = vcpu->arch.interrupt.nr;
+               idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+               if (vcpu->arch.interrupt.soft) {
+                       idt_vectoring |= INTR_TYPE_SOFT_INTR;
+                       vmcs12->vm_entry_instruction_len =
+                               vcpu->arch.event_exit_inst_len;
+               } else
+                       idt_vectoring |= INTR_TYPE_EXT_INTR;
+
+               vmcs12->idt_vectoring_info_field = idt_vectoring;
+       }
+}
+
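(The function above rebuilds the IDT-vectoring info word from KVM's queued-event state instead of reading it back from hardware. For reference, the encoding it assembles, using the mask names from asm/vmx.h:)

    u32 info     = vmcs12->idt_vectoring_info_field;
    u32 vector   = info & VECTORING_INFO_VECTOR_MASK;        /* bits 7:0  */
    u32 type     = info & VECTORING_INFO_TYPE_MASK;          /* bits 10:8 */
    bool has_err = info & VECTORING_INFO_DELIVER_CODE_MASK;  /* bit 11    */
    bool valid   = info & VECTORING_INFO_VALID_MASK;         /* bit 31    */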
 /*
  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
@@ -7284,7 +7353,7 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
  * which already writes to vmcs12 directly.
  */
-void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
        /* update guest state fields: */
        vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
@@ -7338,10 +7407,14 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs12->guest_pending_dbg_exceptions =
                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+       vmcs12->vm_entry_controls =
+               (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
+               (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
+
        /* TODO: These cannot have changed unless we have MSR bitmaps and
         * the relevant bit asks not to trap the change */
        vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
-       if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT)
+       if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
                vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
@@ -7349,21 +7422,38 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        /* update exit information fields: */
 
-       vmcs12->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+       vmcs12->vm_exit_reason  = to_vmx(vcpu)->exit_reason;
        vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
        vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-       vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
-       vmcs12->idt_vectoring_info_field =
-               vmcs_read32(IDT_VECTORING_INFO_FIELD);
-       vmcs12->idt_vectoring_error_code =
-               vmcs_read32(IDT_VECTORING_ERROR_CODE);
+       if ((vmcs12->vm_exit_intr_info &
+            (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+           (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
+               vmcs12->vm_exit_intr_error_code =
+                       vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+       vmcs12->idt_vectoring_info_field = 0;
        vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
        vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 
-       /* clear vm-entry fields which are to be cleared on exit */
-       if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+       if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+               /* vm_entry_intr_info_field is cleared on exit. Emulate this
+                * instead of reading the real value. */
                vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+
+               /*
+                * Transfer the event that L0 or L1 may have wanted to inject into
+                * L2 to IDT_VECTORING_INFO_FIELD.
+                */
+               vmcs12_save_pending_event(vcpu, vmcs12);
+       }
+
+       /*
+        * Drop what we picked up for L2 via vmx_complete_interrupts. It is
+        * preserved above and would only end up incorrectly in L1.
+        */
+       vcpu->arch.nmi_injected = false;
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 }
 
 /*
@@ -7375,7 +7465,8 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  * Failures During or After Loading Guest State").
  * This function should be called when the active VMCS is L1's (vmcs01).
  */
-void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+                                  struct vmcs12 *vmcs12)
 {
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
                vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -7387,6 +7478,7 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
        kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
+       vmx_set_rflags(vcpu, X86_EFLAGS_BIT1);
        /*
         * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
         * actually changed, because it depends on the current state of
@@ -7445,6 +7537,9 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
                vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
                        vmcs12->host_ia32_perf_global_ctrl);
+
+       kvm_set_dr(vcpu, 7, 0x400);
+       vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 }
 
 /*
@@ -7458,6 +7553,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
        int cpu;
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
+       /* trying to cancel vmlaunch/vmresume is a bug */
+       WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
        leave_guest_mode(vcpu);
        prepare_vmcs12(vcpu, vmcs12);
 
@@ -7468,6 +7566,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
        vcpu->cpu = cpu;
        put_cpu();
 
+       vmx_segment_cache_clear(vmx);
+
        /* if no vmcs02 cache requested, remove the one we used */
        if (VMCS02_POOL_SIZE == 0)
                nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
@@ -7674,7 +7774,7 @@ static int __init vmx_init(void)
        r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
                     __alignof__(struct vcpu_vmx), THIS_MODULE);
        if (r)
-               goto out3;
+               goto out5;
 
 #ifdef CONFIG_KEXEC
        rcu_assign_pointer(crash_vmclear_loaded_vmcss,
@@ -7722,6 +7822,8 @@ static int __init vmx_init(void)
 
        return 0;
 
+out5:
+       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 out4:
        free_page((unsigned long)vmx_msr_bitmap_longmode);
 out3: