[PATCH] shared page table for hugetlb page
authorChen, Kenneth W <kenneth.w.chen@intel.com>
Thu, 7 Dec 2006 04:32:03 +0000 (20:32 -0800)
committerLinus Torvalds <torvalds@woody.osdl.org>
Thu, 7 Dec 2006 16:39:21 +0000 (08:39 -0800)
Following up with the work on shared page table done by Dave McCracken.  This
set of patch target shared page table for hugetlb memory only.

The shared page table is particular useful in the situation of large number of
independent processes sharing large shared memory segments.  In the normal
page case, the amount of memory saved from process' page table is quite
significant.  For hugetlb, the saving on page table memory is not the primary
objective (as hugetlb itself already cuts down page table overhead
significantly), instead, the purpose of using shared page table on hugetlb is
to allow faster TLB refill and smaller cache pollution upon TLB miss.

With PT sharing, pte entries are shared among hundreds of processes, the cache
consumption used by all the page table is smaller and in return, application
gets much higher cache hit ratio.  One other effect is that cache hit ratio
with hardware page walker hitting on pte in cache will be higher and this
helps to reduce tlb miss latency.  These two effects contribute to higher
application performance.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Dave McCracken <dmccr@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
arch/i386/mm/hugetlbpage.c
arch/ia64/mm/hugetlbpage.c
arch/powerpc/mm/hugetlbpage.c
arch/sh/mm/hugetlbpage.c
arch/sh64/mm/hugetlbpage.c
arch/sparc64/mm/hugetlbpage.c
include/linux/hugetlb.h
mm/hugetlb.c

index 1719a8141f81aee9f480087be3e54a68eef4adef..34728e4afe4806ad88982922ebbf9654023e4b64 100644 (file)
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+                               struct vm_area_struct *vma,
+                               unsigned long addr, pgoff_t idx)
+{
+       unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+                               svma->vm_start;
+       unsigned long sbase = saddr & PUD_MASK;
+       unsigned long s_end = sbase + PUD_SIZE;
+
+       /*
+        * match the virtual addresses, permission and the alignment of the
+        * page table page.
+        */
+       if (pmd_index(addr) != pmd_index(saddr) ||
+           vma->vm_flags != svma->vm_flags ||
+           sbase < svma->vm_start || svma->vm_end < s_end)
+               return 0;
+
+       return saddr;
+}
+
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+       unsigned long base = addr & PUD_MASK;
+       unsigned long end = base + PUD_SIZE;
+
+       /*
+        * check on proper vm_flags and page table alignment
+        */
+       if (vma->vm_flags & VM_MAYSHARE &&
+           vma->vm_start <= base && end <= vma->vm_end)
+               return 1;
+       return 0;
+}
+
+/*
+ * search for a shareable pmd page for hugetlb.
+ */
+static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+       struct vm_area_struct *vma = find_vma(mm, addr);
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+                       vma->vm_pgoff;
+       struct prio_tree_iter iter;
+       struct vm_area_struct *svma;
+       unsigned long saddr;
+       pte_t *spte = NULL;
+
+       if (!vma_shareable(vma, addr))
+               return;
+
+       spin_lock(&mapping->i_mmap_lock);
+       vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
+               if (svma == vma)
+                       continue;
+
+               saddr = page_table_shareable(svma, vma, addr, idx);
+               if (saddr) {
+                       spte = huge_pte_offset(svma->vm_mm, saddr);
+                       if (spte) {
+                               get_page(virt_to_page(spte));
+                               break;
+                       }
+               }
+       }
+
+       if (!spte)
+               goto out;
+
+       spin_lock(&mm->page_table_lock);
+       if (pud_none(*pud))
+               pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
+       else
+               put_page(virt_to_page(spte));
+       spin_unlock(&mm->page_table_lock);
+out:
+       spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ *         0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       pgd_t *pgd = pgd_offset(mm, *addr);
+       pud_t *pud = pud_offset(pgd, *addr);
+
+       BUG_ON(page_count(virt_to_page(ptep)) == 0);
+       if (page_count(virt_to_page(ptep)) == 1)
+               return 0;
+
+       pud_clear(pud);
+       put_page(virt_to_page(ptep));
+       *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+       return 1;
+}
+
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
        pgd_t *pgd;
@@ -25,8 +132,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 
        pgd = pgd_offset(mm, addr);
        pud = pud_alloc(mm, pgd, addr);
-       if (pud)
+       if (pud) {
+               if (pud_none(*pud))
+                       huge_pmd_share(mm, addr, pud);
                pte = (pte_t *) pmd_alloc(mm, pud, addr);
+       }
        BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
 
        return pte;
index f3a9585e98a8337a8cbd8b06c6916d5a8b3a8612..0c7e94edc20e9eaf9b30217246bbe5de3bafd191 100644 (file)
@@ -64,6 +64,11 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       return 0;
+}
+
 #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
 
 /*
index 506d89768d455ba8625ed480d4122c7d1c8264c9..424a8f57e15534d74d17823c84715a1161fad475 100644 (file)
@@ -146,6 +146,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
        return hugepte_offset(hpdp, addr);
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       return 0;
+}
+
 static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
 {
        pte_t *hugepte = hugepd_page(*hpdp);
index 329059d6b54a79b2bdfb2eacdb2c4477f5936386..cf2c2ee35a376dab1ef632f27fd1be51bf5e38cd 100644 (file)
@@ -63,6 +63,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       return 0;
+}
+
 struct page *follow_huge_addr(struct mm_struct *mm,
                              unsigned long address, int write)
 {
index 187cf01750b80fe43ce2f8f32375b360a16a819f..4b455f61114670282914c7b6bf7354974d63864e 100644 (file)
@@ -53,6 +53,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       return 0;
+}
+
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t entry)
 {
index 53b9b1f528e54ae75d9caeb90e3fa7bd90b67c8a..33fd0b265e707afdee93ec63322e9bff01220882 100644 (file)
@@ -235,6 +235,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       return 0;
+}
+
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t entry)
 {
index ace64e57e17f4291a813c7a9f61d1779fb560d43..a60995afe3348a7ce3ebd0808252679a17164c98 100644 (file)
@@ -35,6 +35,7 @@ extern int sysctl_hugetlb_shm_group;
 
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr);
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
                              int write);
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
index f7355bf2f285e359d93e41149338f70f267fb144..9244971b67912257442294e553863593e8c35d7a 100644 (file)
@@ -386,6 +386,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                if (!ptep)
                        continue;
 
+               if (huge_pmd_unshare(mm, &address, ptep))
+                       continue;
+
                pte = huge_ptep_get_and_clear(mm, address, ptep);
                if (pte_none(pte))
                        continue;
@@ -658,11 +661,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
        BUG_ON(address >= end);
        flush_cache_range(vma, address, end);
 
+       spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
        spin_lock(&mm->page_table_lock);
        for (; address < end; address += HPAGE_SIZE) {
                ptep = huge_pte_offset(mm, address);
                if (!ptep)
                        continue;
+               if (huge_pmd_unshare(mm, &address, ptep))
+                       continue;
                if (!pte_none(*ptep)) {
                        pte = huge_ptep_get_and_clear(mm, address, ptep);
                        pte = pte_mkhuge(pte_modify(pte, newprot));
@@ -671,6 +677,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
                }
        }
        spin_unlock(&mm->page_table_lock);
+       spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
 
        flush_tlb_range(vma, start, end);
 }