From 0a61b222df75a6a69dc34816f7db2f61fee8c935 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 18 Oct 2013 12:03:41 +0200 Subject: [PATCH] KVM: s390/mm: use software dirty bit detection for user dirty tracking Switch the user dirty bit detection used for migration from the hardware provided host change-bit in the pgste to a fault based detection method. This reduced the dependency of the host from the storage key to a point where it becomes possible to enable the RCP bypass for KVM guests. The fault based dirty detection will only indicate changes caused by accesses via the guest address space. The hardware based method can detect all changes, even those caused by I/O or accesses via the kernel page table. The KVM/qemu code needs to take this into account. Signed-off-by: Martin Schwidefsky Signed-off-by: Dominik Dingel Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/pgtable.h | 135 +++++++++++++------------------- arch/s390/mm/pgtable.c | 6 +- 2 files changed, 59 insertions(+), 82 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 51b002b5667e..b2c630df0ca5 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -309,7 +309,8 @@ extern unsigned long MODULES_END; #define PGSTE_HC_BIT 0x00200000UL #define PGSTE_GR_BIT 0x00040000UL #define PGSTE_GC_BIT 0x00020000UL -#define PGSTE_IN_BIT 0x00008000UL /* IPTE notify bit */ +#define PGSTE_UC_BIT 0x00008000UL /* user dirty (migration) */ +#define PGSTE_IN_BIT 0x00004000UL /* IPTE notify bit */ #else /* CONFIG_64BIT */ @@ -391,7 +392,8 @@ extern unsigned long MODULES_END; #define PGSTE_HC_BIT 0x0020000000000000UL #define PGSTE_GR_BIT 0x0004000000000000UL #define PGSTE_GC_BIT 0x0002000000000000UL -#define PGSTE_IN_BIT 0x0000800000000000UL /* IPTE notify bit */ +#define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */ +#define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */ #endif /* CONFIG_64BIT */ @@ -720,16 +722,6 @@ static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste, address = pte_val(*ptep) & PAGE_MASK; skey = (unsigned long) page_get_storage_key(address); bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); - if (!(pgste_val(pgste) & PGSTE_HC_BIT) && (bits & _PAGE_CHANGED)) { - /* Transfer dirty + referenced bit to host bits in pgste */ - pgste_val(pgste) |= bits << 52; - page_set_storage_key(address, skey ^ bits, 0); - } else if (!(pgste_val(pgste) & PGSTE_HR_BIT) && - (bits & _PAGE_REFERENCED)) { - /* Transfer referenced bit to host bit in pgste */ - pgste_val(pgste) |= PGSTE_HR_BIT; - page_reset_referenced(address); - } /* Transfer page changed & referenced bit to guest bits in pgste */ pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */ /* Copy page access key and fetch protection bit to pgste */ @@ -740,19 +732,6 @@ static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste, } -static inline pgste_t pgste_update_young(pte_t *ptep, pgste_t pgste, - struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - if (!mm_use_skey(mm) || pte_val(*ptep) & _PAGE_INVALID) - return pgste; - /* Get referenced bit from storage key */ - if (page_reset_referenced(pte_val(*ptep) & PAGE_MASK)) - pgste_val(pgste) |= PGSTE_HR_BIT | PGSTE_GR_BIT; -#endif - return pgste; -} - static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry, struct mm_struct *mm) { @@ -770,23 +749,30 @@ static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry, * key C/R to 0. */ nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; page_set_storage_key(address, nkey, 0); #endif } -static inline void pgste_set_pte(pte_t *ptep, pte_t entry) +static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) { - if (!MACHINE_HAS_ESOP && - (pte_val(entry) & _PAGE_PRESENT) && - (pte_val(entry) & _PAGE_WRITE)) { - /* - * Without enhanced suppression-on-protection force - * the dirty bit on for all writable ptes. - */ - pte_val(entry) |= _PAGE_DIRTY; - pte_val(entry) &= ~_PAGE_PROTECT; + if ((pte_val(entry) & _PAGE_PRESENT) && + (pte_val(entry) & _PAGE_WRITE) && + !(pte_val(entry) & _PAGE_INVALID)) { + if (!MACHINE_HAS_ESOP) { + /* + * Without enhanced suppression-on-protection force + * the dirty bit on for all writable ptes. + */ + pte_val(entry) |= _PAGE_DIRTY; + pte_val(entry) &= ~_PAGE_PROTECT; + } + if (!(pte_val(entry) & _PAGE_PROTECT)) + /* This pte allows write access, set user-dirty */ + pgste_val(pgste) |= PGSTE_UC_BIT; } *ptep = entry; + return pgste; } /** @@ -884,7 +870,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pgste = pgste_get_lock(ptep); pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; pgste_set_key(ptep, pgste, entry, mm); - pgste_set_pte(ptep, entry); + pgste = pgste_set_pte(ptep, pgste, entry); pgste_set_unlock(ptep, pgste); } else { if (!(pte_val(entry) & _PAGE_INVALID) && MACHINE_HAS_EDAT1) @@ -1030,45 +1016,6 @@ static inline pte_t pte_mkhuge(pte_t pte) } #endif -/* - * Get (and clear) the user dirty bit for a pte. - */ -static inline int ptep_test_and_clear_user_dirty(struct mm_struct *mm, - pte_t *ptep) -{ - pgste_t pgste; - int dirty = 0; - - if (mm_has_pgste(mm)) { - pgste = pgste_get_lock(ptep); - pgste = pgste_update_all(ptep, pgste, mm); - dirty = !!(pgste_val(pgste) & PGSTE_HC_BIT); - pgste_val(pgste) &= ~PGSTE_HC_BIT; - pgste_set_unlock(ptep, pgste); - return dirty; - } - return dirty; -} - -/* - * Get (and clear) the user referenced bit for a pte. - */ -static inline int ptep_test_and_clear_user_young(struct mm_struct *mm, - pte_t *ptep) -{ - pgste_t pgste; - int young = 0; - - if (mm_has_pgste(mm)) { - pgste = pgste_get_lock(ptep); - pgste = pgste_update_young(ptep, pgste, mm); - young = !!(pgste_val(pgste) & PGSTE_HR_BIT); - pgste_val(pgste) &= ~PGSTE_HR_BIT; - pgste_set_unlock(ptep, pgste); - } - return young; -} - static inline void __ptep_ipte(unsigned long address, pte_t *ptep) { unsigned long pto = (unsigned long) ptep; @@ -1131,6 +1078,36 @@ static inline void ptep_flush_lazy(struct mm_struct *mm, atomic_sub(0x10000, &mm->context.attach_count); } +/* + * Get (and clear) the user dirty bit for a pte. + */ +static inline int ptep_test_and_clear_user_dirty(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep) +{ + pgste_t pgste; + pte_t pte; + int dirty; + + if (!mm_has_pgste(mm)) + return 0; + pgste = pgste_get_lock(ptep); + dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); + pgste_val(pgste) &= ~PGSTE_UC_BIT; + pte = *ptep; + if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { + pgste = pgste_ipte_notify(mm, ptep, pgste); + __ptep_ipte(addr, ptep); + if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) + pte_val(pte) |= _PAGE_PROTECT; + else + pte_val(pte) |= _PAGE_INVALID; + *ptep = pte; + } + pgste_set_unlock(ptep, pgste); + return dirty; +} + #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) @@ -1150,7 +1127,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, pte = pte_mkold(pte); if (mm_has_pgste(vma->vm_mm)) { - pgste_set_pte(ptep, pte); + pgste = pgste_set_pte(ptep, pgste, pte); pgste_set_unlock(ptep, pgste); } else *ptep = pte; @@ -1233,7 +1210,7 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, if (mm_has_pgste(mm)) { pgste = pgste_get(ptep); pgste_set_key(ptep, pgste, pte, mm); - pgste_set_pte(ptep, pte); + pgste = pgste_set_pte(ptep, pgste, pte); pgste_set_unlock(ptep, pgste); } else *ptep = pte; @@ -1314,7 +1291,7 @@ static inline pte_t ptep_set_wrprotect(struct mm_struct *mm, pte = pte_wrprotect(pte); if (mm_has_pgste(mm)) { - pgste_set_pte(ptep, pte); + pgste = pgste_set_pte(ptep, pgste, pte); pgste_set_unlock(ptep, pgste); } else *ptep = pte; @@ -1339,7 +1316,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, ptep_flush_direct(vma->vm_mm, address, ptep); if (mm_has_pgste(vma->vm_mm)) { - pgste_set_pte(ptep, entry); + pgste = pgste_set_pte(ptep, pgste, entry); pgste_set_unlock(ptep, pgste); } else *ptep = entry; diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 02a8607bbeb5..1ddf975352a0 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -832,6 +832,7 @@ void gmap_do_ipte_notify(struct mm_struct *mm, pte_t *pte) } spin_unlock(&gmap_notifier_lock); } +EXPORT_SYMBOL_GPL(gmap_do_ipte_notify); static inline int page_table_with_pgste(struct page *page) { @@ -864,8 +865,7 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, atomic_set(&page->_mapcount, 0); table = (unsigned long *) page_to_phys(page); clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); - clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT, - PAGE_SIZE/2); + clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); return table; } @@ -1005,7 +1005,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, /* changing the guest storage key is considered a change of the page */ if ((pgste_val(new) ^ pgste_val(old)) & (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) - pgste_val(new) |= PGSTE_HC_BIT; + pgste_val(new) |= PGSTE_UC_BIT; pgste_set_unlock(ptep, new); pte_unmap_unlock(*ptep, ptl); -- 2.20.1