KVM: race-free exit from KVM_RUN without POSIX signals
authorPaolo Bonzini <pbonzini@redhat.com>
Wed, 8 Feb 2017 10:50:15 +0000 (11:50 +0100)
committerPaolo Bonzini <pbonzini@redhat.com>
Fri, 17 Feb 2017 11:27:37 +0000 (12:27 +0100)
The purpose of the KVM_SET_SIGNAL_MASK API is to let userspace "kick"
a VCPU out of KVM_RUN through a POSIX signal.  A signal is attached
to a dummy signal handler; by blocking the signal outside KVM_RUN and
unblocking it inside, this possible race is closed:

          VCPU thread                     service thread
   --------------------------------------------------------------
        check flag
                                          set flag
                                          raise signal
        (signal handler does nothing)
        KVM_RUN

However, one issue with KVM_SET_SIGNAL_MASK is that it has to take
tsk->sighand->siglock on every KVM_RUN.  This lock is often on a
remote NUMA node, because it is on the node of a thread's creator.
Taking this lock can be very expensive if there are many userspace
exits (as is the case for SMP Windows VMs without Hyper-V reference
time counter).

As an alternative, we can put the flag directly in kvm_run so that
KVM can see it:

          VCPU thread                     service thread
   --------------------------------------------------------------
                                          raise signal
        signal handler
          set run->immediate_exit
        KVM_RUN
          check run->immediate_exit

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Documentation/virtual/kvm/api.txt
arch/arm/kvm/arm.c
arch/mips/kvm/mips.c
arch/powerpc/kvm/powerpc.c
arch/s390/kvm/kvm-s390.c
arch/x86/kvm/x86.c
include/uapi/linux/kvm.h

index e4f2cdcf78eb33d360b6d43a20af906988c015a3..069450938b795df4e6f5e16f39b864e8011fb844 100644 (file)
@@ -3389,7 +3389,18 @@ struct kvm_run {
 Request that KVM_RUN return when it becomes possible to inject external
 interrupts into the guest.  Useful in conjunction with KVM_INTERRUPT.
 
-       __u8 padding1[7];
+       __u8 immediate_exit;
+
+This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN
+exits immediately, returning -EINTR.  In the common scenario where a
+signal is used to "kick" a VCPU out of KVM_RUN, this field can be used
+to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability.
+Rather than blocking the signal outside KVM_RUN, userspace can set up
+a signal handler that sets run->immediate_exit to a non-zero value.
+
+This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available.
+
+       __u8 padding1[6];
 
        /* out */
        __u32 exit_reason;
index 21c493a9e5c9edd321f3d78c9430a742daf21457..c9a2103faeb9acf82f0c26164085506f14015822 100644 (file)
@@ -206,6 +206,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_ARM_PSCI_0_2:
        case KVM_CAP_READONLY_MEM:
        case KVM_CAP_MP_STATE:
+       case KVM_CAP_IMMEDIATE_EXIT:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@ -604,6 +605,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                        return ret;
        }
 
+       if (run->immediate_exit)
+               return -EINTR;
+
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
index 31ee5ee0010befcb1071421ef616a05d27a123b5..ed81e5ac14267f95ed45a316751f120e743c6dcf 100644 (file)
@@ -397,7 +397,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
-       int r = 0;
+       int r = -EINTR;
        sigset_t sigsaved;
 
        if (vcpu->sigset_active)
@@ -409,6 +409,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                vcpu->mmio_needed = 0;
        }
 
+       if (run->immediate_exit)
+               goto out;
+
        lose_fpu(1);
 
        local_irq_disable();
@@ -429,6 +432,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
        guest_exit_irqoff();
        local_irq_enable();
 
+out:
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
@@ -1021,6 +1025,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_ENABLE_CAP:
        case KVM_CAP_READONLY_MEM:
        case KVM_CAP_SYNC_MMU:
+       case KVM_CAP_IMMEDIATE_EXIT:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
index 2b3e4e620078252c087f2f1dc88d7490580a248e..1fe1391ba2c2ff737bfc8b107be3ca4405754cd9 100644 (file)
@@ -511,6 +511,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_ONE_REG:
        case KVM_CAP_IOEVENTFD:
        case KVM_CAP_DEVICE_CTRL:
+       case KVM_CAP_IMMEDIATE_EXIT:
                r = 1;
                break;
        case KVM_CAP_PPC_PAIRED_SINGLES:
@@ -1117,7 +1118,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 #endif
        }
 
-       r = kvmppc_vcpu_run(run, vcpu);
+       if (run->immediate_exit)
+               r = -EINTR;
+       else
+               r = kvmppc_vcpu_run(run, vcpu);
 
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
index 502de74ea984b4d56ccb935f1f042cb593066692..99e35fe0dea88208f17ce2027030cb038ab51ca2 100644 (file)
@@ -370,6 +370,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_S390_IRQCHIP:
        case KVM_CAP_VM_ATTRIBUTES:
        case KVM_CAP_MP_STATE:
+       case KVM_CAP_IMMEDIATE_EXIT:
        case KVM_CAP_S390_INJECT_IRQ:
        case KVM_CAP_S390_USER_SIGP:
        case KVM_CAP_S390_USER_STSI:
@@ -2798,6 +2799,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        int rc;
        sigset_t sigsaved;
 
+       if (kvm_run->immediate_exit)
+               return -EINTR;
+
        if (guestdbg_exit_pending(vcpu)) {
                kvm_s390_prepare_debug_exit(vcpu);
                return 0;
index 0aa8db229e0a71397cf616e773600405426b27b5..8d3047c8cce7991684f51f25e18e0bb6b8550465 100644 (file)
@@ -2672,6 +2672,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_DISABLE_QUIRKS:
        case KVM_CAP_SET_BOOT_CPU_ID:
        case KVM_CAP_SPLIT_IRQCHIP:
+       case KVM_CAP_IMMEDIATE_EXIT:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
        case KVM_CAP_ASSIGN_DEV_IRQ:
        case KVM_CAP_PCI_2_3:
@@ -7202,7 +7203,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        } else
                WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
-       r = vcpu_run(vcpu);
+       if (kvm_run->immediate_exit)
+               r = -EINTR;
+       else
+               r = vcpu_run(vcpu);
 
 out:
        post_kvm_run_save(vcpu);
index 7964b970b9ad1a1f2e14aba3d63e3e3c91666e01..f51d5082a377ec1b6f2a062e54fe8d4503702e3b 100644 (file)
@@ -218,7 +218,8 @@ struct kvm_hyperv_exit {
 struct kvm_run {
        /* in */
        __u8 request_interrupt_window;
-       __u8 padding1[7];
+       __u8 immediate_exit;
+       __u8 padding1[6];
 
        /* out */
        __u32 exit_reason;
@@ -881,6 +882,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SPAPR_RESIZE_HPT 133
 #define KVM_CAP_PPC_MMU_RADIX 134
 #define KVM_CAP_PPC_MMU_HASH_V3 135
+#define KVM_CAP_IMMEDIATE_EXIT 136
 
 #ifdef KVM_CAP_IRQ_ROUTING