KVM: s390: CMMA tracking, ESSA emulation, migration mode
authorClaudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Thu, 4 Aug 2016 15:54:42 +0000 (17:54 +0200)
committerChristian Borntraeger <borntraeger@de.ibm.com>
Thu, 22 Jun 2017 10:41:05 +0000 (12:41 +0200)
* Add a migration state bitmap to keep track of which pages have dirty
  CMMA information.
* Disable CMMA by default, so we can track if it's used or not. Enable
  it on first use like we do for storage keys (unless we are doing a
  migration).
* Create a VM attribute to enter and leave migration mode.
* In migration mode, CMMA is disabled in the SIE block, so ESSA is
  always interpreted and emulated in software.
* Free the migration state on VM destroy.

Signed-off-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Documentation/virtual/kvm/devices/vm.txt
arch/s390/include/asm/kvm_host.h
arch/s390/include/uapi/asm/kvm.h
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/priv.c

index 575ccb022aacd98ce87a28fc1cbb9eb11c27d6b2..903fc926860b522bc92122a65c750d4f28be2d71 100644 (file)
@@ -222,3 +222,36 @@ Allows user space to disable dea key wrapping, clearing the wrapping key.
 
 Parameters: none
 Returns:    0
+
+5. GROUP: KVM_S390_VM_MIGRATION
+Architectures: s390
+
+5.1. ATTRIBUTE: KVM_S390_VM_MIGRATION_STOP (w/o)
+
+Allows userspace to stop migration mode, needed for PGSTE migration.
+Setting this attribute when migration mode is not active has no
+effect.
+
+Parameters: none
+Returns:    0
+
+5.2. ATTRIBUTE: KVM_S390_VM_MIGRATION_START (w/o)
+
+Allows userspace to start migration mode, needed for PGSTE migration.
+Setting this attribute when migration mode is already active has
+no effect.
+
+Parameters: none
+Returns:    -ENOMEM if there is not enough free memory to start migration mode
+           -EINVAL if the state of the VM is invalid (e.g. no memory defined)
+           0 in case of success.
+
+5.3. ATTRIBUTE: KVM_S390_VM_MIGRATION_STATUS (r/o)
+
+Allows userspace to query the status of migration mode.
+
+Parameters: address of a buffer in user space to store the data (u64) to;
+           the data itself is either 0 if migration mode is disabled or 1
+           if it is enabled
+Returns:    -EFAULT if the given address is not accessible from kernel space
+           0 in case of success.
index 426614a882a9b12a71c96f06607f30e6b345cd0d..a8cafed79eb44528a125b4c0230708871146e4e7 100644 (file)
@@ -45,6 +45,8 @@
 #define KVM_REQ_ENABLE_IBS         8
 #define KVM_REQ_DISABLE_IBS        9
 #define KVM_REQ_ICPT_OPEREXC       10
+#define KVM_REQ_START_MIGRATION   11
+#define KVM_REQ_STOP_MIGRATION    12
 
 #define SIGP_CTRL_C            0x80
 #define SIGP_CTRL_SCN_MASK     0x3f
@@ -691,6 +693,12 @@ struct kvm_s390_vsie {
        struct page *pages[KVM_MAX_VCPUS];
 };
 
+/*
+ * State kept while the VM is in migration mode. kvm_arch.migration_state
+ * points to an instance of this while migration mode is active and is NULL
+ * otherwise.
+ */
+struct kvm_s390_migration_state {
+       unsigned long bitmap_size;      /* in bits (number of guest pages) */
+       atomic64_t dirty_pages;         /* number of dirty pages */
+       /* indexed by guest frame number; a set bit marks the page as dirty */
+       unsigned long *pgste_bitmap;
+};
+
 struct kvm_arch{
        void *sca;
        int use_esca;
@@ -718,6 +726,7 @@ struct kvm_arch{
        struct kvm_s390_crypto crypto;
        struct kvm_s390_vsie vsie;
        u64 epoch;
+       struct kvm_s390_migration_state *migration_state;
        /* subset of available cpu features enabled by user space */
        DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
 };
index 3dd2a1d308dd0b92c36c3c5ca5276fd19838f252..d6879a916de525e4cd6183fcd45d3e7703687498 100644 (file)
@@ -70,6 +70,7 @@ struct kvm_s390_io_adapter_req {
 #define KVM_S390_VM_TOD                        1
 #define KVM_S390_VM_CRYPTO             2
 #define KVM_S390_VM_CPU_MODEL          3
+#define KVM_S390_VM_MIGRATION          4
 
 /* kvm attributes for mem_ctrl */
 #define KVM_S390_VM_MEM_ENABLE_CMMA    0
@@ -151,6 +152,11 @@ struct kvm_s390_vm_cpu_subfunc {
 #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW      2
 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW      3
 
+/* kvm attributes for migration mode (group KVM_S390_VM_MIGRATION) */
+#define KVM_S390_VM_MIGRATION_STOP     0       /* w/o: leave migration mode */
+#define KVM_S390_VM_MIGRATION_START    1       /* w/o: enter migration mode */
+#define KVM_S390_VM_MIGRATION_STATUS   2       /* r/o: u64, 1 if active, else 0 */
+
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
        /* general purpose regs for s390 */
index 689ac48361c697318ba6192962c7790d027a3199..c2b3914993748070e6c7e0ad087d8c0bf581c1c6 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/bitmap.h>
 #include <linux/sched/signal.h>
 
+#include <linux/string.h>
 #include <asm/asm-offsets.h>
 #include <asm/lowcore.h>
 #include <asm/stp.h>
@@ -750,6 +751,129 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
        return 0;
 }
 
+/* Issue the synchronous request @req to each VCPU of @kvm in turn. */
+static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req)
+{
+       struct kvm_vcpu *target;
+       int i;
+
+       kvm_for_each_vcpu(i, target, kvm) {
+               kvm_s390_sync_request(req, target);
+       }
+}
+
+/*
+ * Enter migration mode.
+ *
+ * Allocates the migration state and, if CMMA is in use, a bitmap with one
+ * bit per guest page in which every page belonging to an active memslot is
+ * marked dirty. All VCPUs are then sent KVM_REQ_START_MIGRATION.
+ *
+ * Must be called with kvm->srcu held to avoid races on memslots, and with
+ * kvm->lock to avoid races with ourselves and kvm_s390_vm_stop_migration.
+ *
+ * Returns 0 on success (also when migration mode is already active),
+ * -EINVAL if no memslot is defined and -ENOMEM if an allocation fails.
+ */
+static int kvm_s390_vm_start_migration(struct kvm *kvm)
+{
+       struct kvm_s390_migration_state *mgs;
+       struct kvm_memory_slot *ms;
+       /* should be the only one */
+       struct kvm_memslots *slots;
+       unsigned long ram_pages = 0, slot_pages = 0, slot_end;
+       int slotnr;
+
+       /* migration mode already enabled */
+       if (kvm->arch.migration_state)
+               return 0;
+
+       slots = kvm_memslots(kvm);
+       if (!slots || !slots->used_slots)
+               return -EINVAL;
+
+       mgs = kzalloc(sizeof(*mgs), GFP_KERNEL);
+       if (!mgs)
+               return -ENOMEM;
+
+       if (kvm->arch.use_cmma) {
+               /*
+                * Find the end of the guest address space by looking at
+                * every used slot, so the result does not depend on the
+                * order in which the memslot array is sorted.
+                */
+               for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
+                       ms = slots->memslots + slotnr;
+                       slot_end = ms->base_gfn + ms->npages;
+                       if (slot_end > ram_pages)
+                               ram_pages = slot_end;
+                       slot_pages += ms->npages;
+               }
+               /* round up so we only use full longs */
+               ram_pages = roundup(ram_pages, BITS_PER_LONG);
+               /*
+                * Allocate enough bytes to store all the bits. The bitmap
+                * is zeroed so that holes between slots and the round-up
+                * padding start out clean instead of holding stale data.
+                */
+               mgs->pgste_bitmap = vzalloc(ram_pages / 8);
+               if (!mgs->pgste_bitmap) {
+                       kfree(mgs);
+                       return -ENOMEM;
+               }
+
+               mgs->bitmap_size = ram_pages;
+               /* mark all the pages in active slots as dirty */
+               for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
+                       ms = slots->memslots + slotnr;
+                       bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages);
+               }
+               /*
+                * Count only the pages marked above (memslots do not
+                * overlap), so the counter matches the bits in the bitmap.
+                */
+               atomic64_set(&mgs->dirty_pages, slot_pages);
+       }
+
+       /*
+        * Publish the state only once it is fully set up, but before the
+        * VCPUs are asked to intercept ESSA, so the ESSA handler never sees
+        * a migration state without its bitmap.
+        */
+       kvm->arch.migration_state = mgs;
+       if (kvm->arch.use_cmma)
+               kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
+       return 0;
+}
+
+/*
+ * Leave migration mode and release the migration state. Always returns 0;
+ * it does nothing when migration mode is not active.
+ *
+ * The caller must hold kvm->lock so that this cannot race with itself or
+ * with kvm_s390_vm_start_migration.
+ */
+static int kvm_s390_vm_stop_migration(struct kvm *kvm)
+{
+       struct kvm_s390_migration_state *old_state = kvm->arch.migration_state;
+
+       /* migration mode already disabled */
+       if (old_state == NULL)
+               return 0;
+
+       /* unhook the state from the VM before tearing it down */
+       kvm->arch.migration_state = NULL;
+
+       if (kvm->arch.use_cmma) {
+               kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION);
+               vfree(old_state->pgste_bitmap);
+       }
+       kfree(old_state);
+       return 0;
+}
+
+/*
+ * Handle a write to the KVM_S390_VM_MIGRATION attribute group: start or
+ * stop migration mode under kvm->lock. Starting additionally holds the
+ * kvm->srcu read lock. Unknown attributes yield -ENXIO.
+ */
+static int kvm_s390_vm_set_migration(struct kvm *kvm,
+                                    struct kvm_device_attr *attr)
+{
+       int srcu_idx, rc;
+
+       mutex_lock(&kvm->lock);
+       if (attr->attr == KVM_S390_VM_MIGRATION_START) {
+               srcu_idx = srcu_read_lock(&kvm->srcu);
+               rc = kvm_s390_vm_start_migration(kvm);
+               srcu_read_unlock(&kvm->srcu, srcu_idx);
+       } else if (attr->attr == KVM_S390_VM_MIGRATION_STOP) {
+               rc = kvm_s390_vm_stop_migration(kvm);
+       } else {
+               rc = -ENXIO;
+       }
+       mutex_unlock(&kvm->lock);
+
+       return rc;
+}
+
+/*
+ * Handle a read of the KVM_S390_VM_MIGRATION attribute group. Only
+ * KVM_S390_VM_MIGRATION_STATUS is supported: a u64 that is 1 while
+ * migration mode is active and 0 otherwise is copied to attr->addr.
+ *
+ * Returns 0 on success, -ENXIO for any other attribute and -EFAULT if the
+ * copy to user space fails.
+ */
+static int kvm_s390_vm_get_migration(struct kvm *kvm,
+                                    struct kvm_device_attr *attr)
+{
+       void __user *uaddr = (void __user *)attr->addr;
+       u64 status = kvm->arch.migration_state ? 1 : 0;
+
+       if (attr->attr != KVM_S390_VM_MIGRATION_STATUS)
+               return -ENXIO;
+
+       return copy_to_user(uaddr, &status, sizeof(status)) ? -EFAULT : 0;
+}
+
 static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
 {
        u8 gtod_high;
@@ -1090,6 +1214,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
        case KVM_S390_VM_CRYPTO:
                ret = kvm_s390_vm_set_crypto(kvm, attr);
                break;
+       case KVM_S390_VM_MIGRATION:
+               ret = kvm_s390_vm_set_migration(kvm, attr);
+               break;
        default:
                ret = -ENXIO;
                break;
@@ -1112,6 +1239,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr)
        case KVM_S390_VM_CPU_MODEL:
                ret = kvm_s390_get_cpu_model(kvm, attr);
                break;
+       case KVM_S390_VM_MIGRATION:
+               ret = kvm_s390_vm_get_migration(kvm, attr);
+               break;
        default:
                ret = -ENXIO;
                break;
@@ -1179,6 +1309,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
                        break;
                }
                break;
+       case KVM_S390_VM_MIGRATION:
+               ret = 0;
+               break;
        default:
                ret = -ENXIO;
                break;
@@ -1633,6 +1766,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kvm_s390_destroy_adapters(kvm);
        kvm_s390_clear_float_irqs(kvm);
        kvm_s390_vsie_destroy(kvm);
+       if (kvm->arch.migration_state) {
+               vfree(kvm->arch.migration_state->pgste_bitmap);
+               kfree(kvm->arch.migration_state);
+       }
        KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
 }
 
@@ -1977,7 +2114,6 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu)
        if (!vcpu->arch.sie_block->cbrlo)
                return -ENOMEM;
 
-       vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
        vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI;
        return 0;
 }
@@ -2489,6 +2625,27 @@ retry:
                goto retry;
        }
 
+       if (kvm_check_request(KVM_REQ_START_MIGRATION, vcpu)) {
+               /*
+                * Disable CMMA virtualization; we will emulate the ESSA
+                * instruction manually, in order to provide additional
+                * functionalities needed for live migration.
+                */
+               vcpu->arch.sie_block->ecb2 &= ~ECB2_CMMA;
+               goto retry;
+       }
+
+       if (kvm_check_request(KVM_REQ_STOP_MIGRATION, vcpu)) {
+               /*
+                * Re-enable CMMA virtualization if CMMA is available and
+                * was used.
+                */
+               if ((vcpu->kvm->arch.use_cmma) &&
+                   (vcpu->kvm->mm->context.use_cmma))
+                       vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
+               goto retry;
+       }
+
        /* nothing to do, just clear the request */
        kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 
index c03106c428cfa89e1f25297efbd2d667a73b5fd7..a226c459809bf0d16657cec3b578c4fe43cd729b 100644 (file)
@@ -24,6 +24,7 @@
 #include <asm/ebcdic.h>
 #include <asm/sysinfo.h>
 #include <asm/pgtable.h>
+#include <asm/page-states.h>
 #include <asm/pgalloc.h>
 #include <asm/gmap.h>
 #include <asm/io.h>
@@ -949,13 +950,72 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+/*
+ * Emulate one ESSA instruction in software (used while in migration mode).
+ *
+ * Performs operation request code @orc on the guest page addressed by the
+ * r2 register, stores the resulting state in the r1 register and, for a
+ * non-zero @orc, marks the page dirty in the migration bitmap.
+ *
+ * Returns the result of injecting an addressing exception if the guest
+ * address has no valid host mapping, 0 if pgste_perform_essa() reported a
+ * failure (the exception indication is then stored in r1), and otherwise
+ * the value returned by pgste_perform_essa(); when that value is positive
+ * an entry was appended to the CBRL, which the caller has to account for.
+ */
+static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
+{
+       struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state;
+       int r1, r2, nappended, entries;
+       unsigned long gfn, hva, res, pgstev, ptev;
+       unsigned long *cbrlo;
+
+       /*
+        * We don't need to set SD.FPF.SK to 1 here, because if we have a
+        * machine check here we either handle it or crash
+        */
+
+       kvm_s390_get_regs_rre(vcpu, &r1, &r2);
+       gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT;
+       hva = gfn_to_hva(vcpu->kvm, gfn);
+       /* index of the next free CBRL slot, taken from the offset in cbrlo */
+       entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
+
+       if (kvm_is_error_hva(hva))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+       nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev);
+       if (nappended < 0) {
+               res = orc ? 0x10 : 0;
+               vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */
+               return 0;
+       }
+       res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22;
+       /*
+        * Set the block-content state part of the result. 0 means resident, so
+        * nothing to do if the page is valid. 2 is for preserved pages
+        * (non-present and non-zero), and 3 for zero pages (non-present and
+        * zero).
+        */
+       if (ptev & _PAGE_INVALID) {
+               res |= 2;
+               if (pgstev & _PGSTE_GPS_ZERO)
+                       res |= 1;
+       }
+       vcpu->run->s.regs.gprs[r1] = res;
+       /*
+        * It is possible that all the normal 511 slots were full, in which case
+        * we will now write in the 512th slot, which is reserved for host use.
+        * In both cases we let the normal essa handling code process all the
+        * slots, including the reserved one, if needed.
+        */
+       if (nappended > 0) {
+               cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo & PAGE_MASK);
+               cbrlo[entries] = gfn << PAGE_SHIFT;
+       }
+
+       /*
+        * Only track the page if the migration state is still there (it is
+        * re-read here after the caller's check and may have been cleared by
+        * a concurrent stop) and if the page lies within the bitmap. The
+        * bitmap is sized when migration mode starts and never resized, so a
+        * page beyond it must not be touched.
+        */
+       if (orc && ms && gfn < ms->bitmap_size) {
+               /* increment only if we are really flipping the bit to 1 */
+               if (!test_and_set_bit(gfn, ms->pgste_bitmap))
+                       atomic64_inc(&ms->dirty_pages);
+       }
+
+       return nappended;
+}
+
 static int handle_essa(struct kvm_vcpu *vcpu)
 {
        /* entries expected to be 1FF */
        int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
        unsigned long *cbrlo;
        struct gmap *gmap;
-       int i;
+       int i, orc;
 
        VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries);
        gmap = vcpu->arch.gmap;
@@ -965,12 +1025,45 @@ static int handle_essa(struct kvm_vcpu *vcpu)
 
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
-
-       if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6)
+       /* Check for invalid operation request code */
+       orc = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
+       if (orc > ESSA_MAX)
                return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       /* Retry the ESSA instruction */
-       kvm_s390_retry_instr(vcpu);
+       if (likely(!vcpu->kvm->arch.migration_state)) {
+               /*
+                * CMMA is enabled in the KVM settings, but is disabled in
+                * the SIE block and in the mm_context, and we are not doing
+                * a migration. Enable CMMA in the mm_context.
+                * Since we need to take a write lock to write to the context
+                * to avoid races with storage keys handling, we check if the
+                * value really needs to be written to; if the value is
+                * already correct, we do nothing and avoid the lock.
+                */
+               if (vcpu->kvm->mm->context.use_cmma == 0) {
+                       down_write(&vcpu->kvm->mm->mmap_sem);
+                       vcpu->kvm->mm->context.use_cmma = 1;
+                       up_write(&vcpu->kvm->mm->mmap_sem);
+               }
+               /*
+                * If we are here, we are supposed to have CMMA enabled in
+                * the SIE block. Enabling CMMA works on a per-CPU basis,
+                * while the context use_cmma flag is per process.
+                * It's possible that the context flag is enabled and the
+                * SIE flag is not, so we set the flag always; if it was
+                * already set, nothing changes, otherwise we enable it
+                * on this CPU too.
+                */
+               vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
+               /* Retry the ESSA instruction */
+               kvm_s390_retry_instr(vcpu);
+       } else {
+               /* Account for the possible extra cbrl entry */
+               i = do_essa(vcpu, orc);
+               if (i < 0)
+                       return i;
+               entries += i;
+       }
        vcpu->arch.sie_block->cbrlo &= PAGE_MASK;       /* reset nceo */
        cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
        down_read(&gmap->mm->mmap_sem);