KVM: MMU: Don't use RCU for lockless shadow walking
Author:     Avi Kivity <avi@redhat.com>
AuthorDate: Mon, 14 May 2012 12:44:06 +0000 (15:44 +0300)
Commit:     Marcelo Tosatti <mtosatti@redhat.com>
CommitDate: Wed, 16 May 2012 19:08:28 +0000 (16:08 -0300)
Using RCU for lockless shadow walking can increase the amount of memory
in use by the system, since RCU grace periods are unpredictable.  We also
have an unconditional write to a shared variable (reader_counter), which
isn't good for scaling.

Replace that with a scheme similar to x86's get_user_pages_fast(): disable
interrupts for the duration of the lockless shadow walk.  The freer
(kvm_mmu_commit_zap_page()) then has to wait for its TLB flush IPI, which
cannot be delivered to a walking processor until that processor re-enables
interrupts.
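
In outline (an illustrative sketch of the idea, not the patch itself; the
real helpers are in the mmu.c hunks below):

    /* walker: lockless shadow walk */
    local_irq_disable();            /* the flush IPI cannot be taken here */
    /* ... read sptes ... */
    local_irq_enable();

    /* freer: kvm_mmu_commit_zap_page() */
    /* ... unlink pages from the shadow page tables ... */
    kvm_flush_remote_tlbs(kvm);     /* sends the flush IPI and waits for it;
                                       a walking cpu can only acknowledge it
                                       after local_irq_enable(), i.e. after
                                       its walk has finished */
    /* ... free the pages immediately, no grace period needed ... */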

We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, so that
kvm_flush_remote_tlbs() cannot skip the IPI for a vcpu that is walking its
shadow page tables while outside guest mode.
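
For context, kvm_flush_remote_tlbs() goes through make_all_cpus_request(),
which only IPIs vcpus whose mode is not OUTSIDE_GUEST_MODE.  Abridged and
approximate (existing code in virt/kvm/kvm_main.c, not part of this patch):

    kvm_for_each_vcpu(i, vcpu, kvm) {
            kvm_make_request(req, vcpu);
            cpu = vcpu->cpu;

            /* set the request bit before reading ->mode */
            smp_mb();

            if (cpus != NULL && cpu != -1 && cpu != me &&
                kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
                    cpumask_set_cpu(cpu, cpus);
    }
    /* ... followed by smp_call_function_many(cpus, ..., wait=1) ... */

A walker that merely disabled interrupts would still read as
OUTSIDE_GUEST_MODE and would never be put on the IPI mask; publishing
READING_SHADOW_PAGE_TABLES before touching any spte closes that gap.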

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/mmu.c
include/linux/kvm_host.h

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 69e39bc7e36fe9bef09a6c797cc1f98fff09b247..64c8989263f626779160b3c439551a065da1a6b9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -240,8 +240,6 @@ struct kvm_mmu_page {
 #endif
 
        int write_flooding_count;
-
-       struct rcu_head rcu;
 };
 
 struct kvm_pio_request {
@@ -540,8 +538,6 @@ struct kvm_arch {
        u64 hv_guest_os_id;
        u64 hv_hypercall;
 
-       atomic_t reader_counter;
-
        #ifdef CONFIG_KVM_MMU_AUDIT
        int audit_point;
        #endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 07424cf6043460d3230627caf55249f6f8b8b5e2..72102e0ab7cb3a0ae2302aa10eadd6bdb73d939a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -551,19 +551,29 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 
 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 {
-       rcu_read_lock();
-       atomic_inc(&vcpu->kvm->arch.reader_counter);
-
-       /* Increase the counter before walking shadow page table */
-       smp_mb__after_atomic_inc();
+       /*
+        * Prevent page table teardown by making any free-er wait during
+        * kvm_flush_remote_tlbs() IPI to all active vcpus.
+        */
+       local_irq_disable();
+       vcpu->mode = READING_SHADOW_PAGE_TABLES;
+       /*
+        * Make sure a following spte read is not reordered ahead of the write
+        * to vcpu->mode.
+        */
+       smp_mb();
 }
 
 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 {
-       /* Decrease the counter after walking shadow page table finished */
-       smp_mb__before_atomic_dec();
-       atomic_dec(&vcpu->kvm->arch.reader_counter);
-       rcu_read_unlock();
+       /*
+        * Make sure the write to vcpu->mode is not reordered in front of
+        * reads to sptes.  If it is, kvm_mmu_commit_zap_page() can see us
+        * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+        */
+       smp_mb();
+       vcpu->mode = OUTSIDE_GUEST_MODE;
+       local_irq_enable();
 }
 
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -1989,30 +1999,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
        return ret;
 }
 
-static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
-{
-       struct kvm_mmu_page *sp;
-
-       list_for_each_entry(sp, invalid_list, link)
-               kvm_mmu_isolate_page(sp);
-}
-
-static void free_pages_rcu(struct rcu_head *head)
-{
-       struct kvm_mmu_page *next, *sp;
-
-       sp = container_of(head, struct kvm_mmu_page, rcu);
-       while (sp) {
-               if (!list_empty(&sp->link))
-                       next = list_first_entry(&sp->link,
-                                     struct kvm_mmu_page, link);
-               else
-                       next = NULL;
-               kvm_mmu_free_page(sp);
-               sp = next;
-       }
-}
-
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                    struct list_head *invalid_list)
 {
@@ -2021,17 +2007,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
        if (list_empty(invalid_list))
                return;
 
-       kvm_flush_remote_tlbs(kvm);
-
-       if (atomic_read(&kvm->arch.reader_counter)) {
-               kvm_mmu_isolate_pages(invalid_list);
-               sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
-               list_del_init(invalid_list);
+       /*
+        * wmb: make sure everyone sees our modifications to the page tables
+        * rmb: make sure we see changes to vcpu->mode
+        */
+       smp_mb();
 
-               trace_kvm_mmu_delay_free_pages(sp);
-               call_rcu(&sp->rcu, free_pages_rcu);
-               return;
-       }
+       /*
+        * Wait for all vcpus to exit guest mode and/or lockless shadow
+        * page table walks.
+        */
+       kvm_flush_remote_tlbs(kvm);
 
        do {
                sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
@@ -2039,7 +2025,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                kvm_mmu_isolate_page(sp);
                kvm_mmu_free_page(sp);
        } while (!list_empty(invalid_list));
-
 }
 
 /*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cae342d29d1bd7e08fd9ec3a4e625aa6986f1ccf..c4464356b35b0af21eaafe6cbd1d2d7b4f549814 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -128,7 +128,8 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 enum {
        OUTSIDE_GUEST_MODE,
        IN_GUEST_MODE,
-       EXITING_GUEST_MODE
+       EXITING_GUEST_MODE,
+       READING_SHADOW_PAGE_TABLES,
 };
 
 /*
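
A caller of the new helpers looks roughly like this (hypothetical example:
the function name is invented for illustration, but the iterator macros
already exist in mmu.c):

    static u64 get_last_spte_lockless(struct kvm_vcpu *vcpu, u64 addr)
    {
            struct kvm_shadow_walk_iterator iterator;
            u64 spte = 0ull;

            /* irq-off window and vcpu->mode change happen in _begin() */
            walk_shadow_page_lockless_begin(vcpu);
            for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
                    if (!is_shadow_present_pte(spte))
                            break;
            walk_shadow_page_lockless_end(vcpu);

            return spte;
    }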