s390/pageattr: allow kernel page table splitting
author Heiko Carstens <heiko.carstens@de.ibm.com>
Tue, 17 May 2016 08:50:15 +0000 (10:50 +0200)
committer Martin Schwidefsky <schwidefsky@de.ibm.com>
Mon, 13 Jun 2016 13:58:15 +0000 (15:58 +0200)
set_memory_ro() and set_memory_rw() currently only work on 4k
mappings, which is good enough for module code aka the vmalloc area.

However, we have already run twice into the need to make this work on
larger mappings as well:
- the ro after init patch set
- the crash kernel resize code

Therefore this patch implements automatic kernel page table splitting
if e.g. set_memory_ro() is called on only part of a 2G mapping.
This works much the same way as the x86 code, but is much simpler.

In order to make this work and to be architecturally compliant we now
always use the csp, cspg or crdte instructions to replace valid page
table entries. This means that set_memory_ro() and set_memory_rw()
will be much more expensive than before. In order to avoid huge
latencies the code contains a couple of cond_resched() calls.

The current code only splits page tables, but does not merge them again
even where that would be possible. The reason for this is that currently
there is no real-life scenario where this would actually happen. All
current use cases that I know of change access rights only once during
the lifetime. If that should change, we can still implement kernel page
table merging at a later time.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
arch/s390/include/asm/pgtable.h
arch/s390/mm/pageattr.c
arch/s390/mm/vmem.c

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 882d6f4aad252859c74514974967836e92ccc3c9..9133388edd1f88b6f7db88624e00c9e890579c85 100644 (file)
@@ -34,6 +34,8 @@
 extern pgd_t swapper_pg_dir[] __attribute__ ((aligned (4096)));
 extern void paging_init(void);
 extern void vmem_map_init(void);
+pmd_t *vmem_pmd_alloc(void);
+pte_t *vmem_pte_alloc(void);
 
 /*
  * The S390 doesn't have any external MMU info: the kernel page
@@ -477,6 +479,40 @@ static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new)
                : "cc");
 }
 
+static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new)
+{
+       register unsigned long reg2 asm("2") = old;
+       register unsigned long reg3 asm("3") = new;
+       unsigned long address = (unsigned long)ptr | 1;
+
+       asm volatile(
+               "       .insn   rre,0xb98a0000,%0,%3"
+               : "+d" (reg2), "+m" (*ptr)
+               : "d" (reg3), "d" (address)
+               : "cc");
+}
+
+#define CRDTE_DTT_PAGE         0x00UL
+#define CRDTE_DTT_SEGMENT      0x10UL
+#define CRDTE_DTT_REGION3      0x14UL
+#define CRDTE_DTT_REGION2      0x18UL
+#define CRDTE_DTT_REGION1      0x1cUL
+
+static inline void crdte(unsigned long old, unsigned long new,
+                        unsigned long table, unsigned long dtt,
+                        unsigned long address, unsigned long asce)
+{
+       register unsigned long reg2 asm("2") = old;
+       register unsigned long reg3 asm("3") = new;
+       register unsigned long reg4 asm("4") = table | dtt;
+       register unsigned long reg5 asm("5") = address;
+
+       asm volatile(".insn rrf,0xb98f0000,%0,%2,%4,0"
+                    : "+d" (reg2)
+                    : "d" (reg3), "d" (reg4), "d" (reg5), "a" (asce)
+                    : "memory", "cc");
+}
+
 /*
  * pgd/pmd/pte query functions
  */
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index e67a8f712e192691a40b2ffcf6c7b36dd1cd6a89..91e5e29c1f5c8608abc1bebd0cffa8553a16755b 100644 (file)
@@ -40,54 +40,235 @@ void __storage_key_init_range(unsigned long start, unsigned long end)
 }
 #endif
 
-static pte_t *walk_page_table(unsigned long addr)
+static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
+                   unsigned long dtt)
 {
-       pgd_t *pgdp;
-       pud_t *pudp;
+       unsigned long table, mask;
+
+       mask = 0;
+       if (MACHINE_HAS_EDAT2) {
+               switch (dtt) {
+               case CRDTE_DTT_REGION3:
+                       mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1);
+                       break;
+               case CRDTE_DTT_SEGMENT:
+                       mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
+                       break;
+               case CRDTE_DTT_PAGE:
+                       mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1);
+                       break;
+               }
+               table = (unsigned long)old & mask;
+               crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce);
+       } else if (MACHINE_HAS_IDTE) {
+               cspg(old, *old, new);
+       } else {
+               csp((unsigned int *)old + 1, *old, new);
+       }
+}
+
+struct cpa {
+       unsigned int set_ro     : 1;
+       unsigned int clear_ro   : 1;
+};
+
+static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
+                         struct cpa cpa)
+{
+       pte_t *ptep, new;
+
+       ptep = pte_offset(pmdp, addr);
+       do {
+               if (pte_none(*ptep))
+                       return -EINVAL;
+               if (cpa.set_ro)
+                       new = pte_wrprotect(*ptep);
+               else if (cpa.clear_ro)
+                       new = pte_mkwrite(pte_mkdirty(*ptep));
+               pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE);
+               ptep++;
+               addr += PAGE_SIZE;
+               cond_resched();
+       } while (addr < end);
+       return 0;
+}
+
+static int split_pmd_page(pmd_t *pmdp, unsigned long addr)
+{
+       unsigned long pte_addr, prot;
+       pte_t *pt_dir, *ptep;
+       pmd_t new;
+       int i, ro;
+
+       pt_dir = vmem_pte_alloc();
+       if (!pt_dir)
+               return -ENOMEM;
+       pte_addr = pmd_pfn(*pmdp) << PAGE_SHIFT;
+       ro = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT);
+       prot = pgprot_val(ro ? PAGE_KERNEL_RO : PAGE_KERNEL);
+       ptep = pt_dir;
+       for (i = 0; i < PTRS_PER_PTE; i++) {
+               pte_val(*ptep) = pte_addr | prot;
+               pte_addr += PAGE_SIZE;
+               ptep++;
+       }
+       pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY;
+       pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
+       return 0;
+}
+
+static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, struct cpa cpa)
+{
+       pmd_t new;
+
+       if (cpa.set_ro)
+               new = pmd_wrprotect(*pmdp);
+       else if (cpa.clear_ro)
+               new = pmd_mkwrite(pmd_mkdirty(*pmdp));
+       pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
+}
+
+static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end,
+                         struct cpa cpa)
+{
+       unsigned long next;
        pmd_t *pmdp;
-       pte_t *ptep;
+       int rc = 0;
 
-       pgdp = pgd_offset_k(addr);
-       if (pgd_none(*pgdp))
-               return NULL;
-       pudp = pud_offset(pgdp, addr);
-       if (pud_none(*pudp) || pud_large(*pudp))
-               return NULL;
        pmdp = pmd_offset(pudp, addr);
-       if (pmd_none(*pmdp) || pmd_large(*pmdp))
-               return NULL;
-       ptep = pte_offset_kernel(pmdp, addr);
-       if (pte_none(*ptep))
-               return NULL;
-       return ptep;
+       do {
+               if (pmd_none(*pmdp))
+                       return -EINVAL;
+               next = pmd_addr_end(addr, end);
+               if (pmd_large(*pmdp)) {
+                       if (addr & ~PMD_MASK || addr + PMD_SIZE > next) {
+                               rc = split_pmd_page(pmdp, addr);
+                               if (rc)
+                                       return rc;
+                               continue;
+                       }
+                       modify_pmd_page(pmdp, addr, cpa);
+               } else {
+                       rc = walk_pte_level(pmdp, addr, next, cpa);
+                       if (rc)
+                               return rc;
+               }
+               pmdp++;
+               addr = next;
+               cond_resched();
+       } while (addr < end);
+       return rc;
 }
 
-static void change_page_attr(unsigned long addr, int numpages,
-                            pte_t (*set) (pte_t))
+static int split_pud_page(pud_t *pudp, unsigned long addr)
 {
-       pte_t *ptep;
-       int i;
+       unsigned long pmd_addr, prot;
+       pmd_t *pm_dir, *pmdp;
+       pud_t new;
+       int i, ro;
 
-       for (i = 0; i < numpages; i++) {
-               ptep = walk_page_table(addr);
-               if (WARN_ON_ONCE(!ptep))
-                       break;
-               *ptep = set(*ptep);
-               addr += PAGE_SIZE;
+       pm_dir = vmem_pmd_alloc();
+       if (!pm_dir)
+               return -ENOMEM;
+       pmd_addr = pud_pfn(*pudp) << PAGE_SHIFT;
+       ro = !!(pud_val(*pudp) & _REGION_ENTRY_PROTECT);
+       prot = pgprot_val(ro ? SEGMENT_KERNEL_RO : SEGMENT_KERNEL);
+       pmdp = pm_dir;
+       for (i = 0; i < PTRS_PER_PMD; i++) {
+               pmd_val(*pmdp) = pmd_addr | prot;
+               pmd_addr += PMD_SIZE;
+               pmdp++;
        }
-       __tlb_flush_kernel();
+       pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY;
+       pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
+       return 0;
+}
+
+static void modify_pud_page(pud_t *pudp, unsigned long addr, struct cpa cpa)
+{
+       pud_t new;
+
+       if (cpa.set_ro)
+               new = pud_wrprotect(*pudp);
+       else if (cpa.clear_ro)
+               new = pud_mkwrite(pud_mkdirty(*pudp));
+       pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
+}
+
+static int walk_pud_level(pgd_t *pgd, unsigned long addr, unsigned long end,
+                         struct cpa cpa)
+{
+       unsigned long next;
+       pud_t *pudp;
+       int rc = 0;
+
+       pudp = pud_offset(pgd, addr);
+       do {
+               if (pud_none(*pudp))
+                       return -EINVAL;
+               next = pud_addr_end(addr, end);
+               if (pud_large(*pudp)) {
+                       if (addr & ~PUD_MASK || addr + PUD_SIZE > next) {
+                               rc = split_pud_page(pudp, addr);
+                               if (rc)
+                                       break;
+                               continue;
+                       }
+                       modify_pud_page(pudp, addr, cpa);
+               } else {
+                       rc = walk_pmd_level(pudp, addr, next, cpa);
+               }
+               pudp++;
+               addr = next;
+               cond_resched();
+       } while (addr < end && !rc);
+       return rc;
+}
+
+static DEFINE_MUTEX(cpa_mutex);
+
+static int change_page_attr(unsigned long addr, unsigned long end,
+                           struct cpa cpa)
+{
+       unsigned long next;
+       int rc = -EINVAL;
+       pgd_t *pgdp;
+
+       if (end >= MODULES_END)
+               return -EINVAL;
+       mutex_lock(&cpa_mutex);
+       pgdp = pgd_offset_k(addr);
+       do {
+               if (pgd_none(*pgdp))
+                       break;
+               next = pgd_addr_end(addr, end);
+               rc = walk_pud_level(pgdp, addr, next, cpa);
+               if (rc)
+                       break;
+               cond_resched();
+       } while (pgdp++, addr = next, addr < end && !rc);
+       mutex_unlock(&cpa_mutex);
+       return rc;
 }
 
 int set_memory_ro(unsigned long addr, int numpages)
 {
-       change_page_attr(addr, numpages, pte_wrprotect);
-       return 0;
+       struct cpa cpa = {
+               .set_ro = 1,
+       };
+
+       addr &= PAGE_MASK;
+       return change_page_attr(addr, addr + numpages * PAGE_SIZE, cpa);
 }
 
 int set_memory_rw(unsigned long addr, int numpages)
 {
-       change_page_attr(addr, numpages, pte_mkwrite);
-       return 0;
+       struct cpa cpa = {
+               .clear_ro = 1,
+       };
+
+       addr &= PAGE_MASK;
+       return change_page_attr(addr, addr + numpages * PAGE_SIZE, cpa);
 }
 
 /* not possible */
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 4badd8252e3c73043b1d64e62fc5dc48c37e4ee9..0a7b03496f67848d2994350be1c5c5edb3a93f12 100644 (file)
@@ -47,7 +47,7 @@ static inline pud_t *vmem_pud_alloc(void)
        return pud;
 }
 
-static inline pmd_t *vmem_pmd_alloc(void)
+pmd_t *vmem_pmd_alloc(void)
 {
        pmd_t *pmd = NULL;
 
@@ -58,7 +58,7 @@ static inline pmd_t *vmem_pmd_alloc(void)
        return pmd;
 }
 
-static pte_t __ref *vmem_pte_alloc(void)
+pte_t __ref *vmem_pte_alloc(void)
 {
        pte_t *pte;