mm, devm_memremap_pages: use multi-order radix for ZONE_DEVICE lookups
author Dan Williams <dan.j.williams@intel.com>
Wed, 6 Sep 2017 23:24:13 +0000 (16:24 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 7 Sep 2017 00:27:29 +0000 (17:27 -0700)
devm_memremap_pages() records mapped ranges in pgmap_radix with an entry
per section's worth of memory (128MB).  The key for each of those
entries is a section number.

This leads to false positives when devm_memremap_pages() is passed a
section-unaligned range: lookups for addresses that fall in the
misaligned remainder of a section, outside the mapped range, return an
entry when they should return NULL.  We can close this hole by using the
pfn as the key for entries in the tree.  The number of entries required
to describe a remapped range is reduced by leveraging multi-order
entries.

In practice this approach usually yields just one entry in the tree if
the size and starting address are of the same power-of-2 alignment.
Previously we always needed nr_entries = mapping_size / 128MB.

Link: https://lists.01.org/pipermail/linux-nvdimm/2016-August/006666.html
Link: http://lkml.kernel.org/r/150215410565.39310.13767886055248249438.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
kernel/memremap.c
mm/Kconfig

index 9afdc434fb490a3384d847bc50647fa3dd3ab16a..066e73c2fcc9dbbd89c13923e203ed47aaff7d58 100644 (file)
@@ -194,18 +194,41 @@ struct page_map {
        struct vmem_altmap altmap;
 };
 
-static void pgmap_radix_release(struct resource *res)
+static unsigned long order_at(struct resource *res, unsigned long pgoff)
 {
-       resource_size_t key, align_start, align_size, align_end;
+       unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
+       unsigned long nr_pages, mask;
 
-       align_start = res->start & ~(SECTION_SIZE - 1);
-       align_size = ALIGN(resource_size(res), SECTION_SIZE);
-       align_end = align_start + align_size - 1;
+       nr_pages = PHYS_PFN(resource_size(res));
+       if (nr_pages == pgoff)
+               return ULONG_MAX;
+
+       /*
+        * What is the largest aligned power-of-2 range available from
+        * this resource pgoff to the end of the resource range,
+        * considering the alignment of the current pgoff?
+        */
+       mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
+       if (!mask)
+               return ULONG_MAX;
+
+       return find_first_bit(&mask, BITS_PER_LONG);
+}
+
+#define foreach_order_pgoff(res, order, pgoff) \
+       for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
+                       pgoff += 1UL << order, order = order_at((res), pgoff))
+
+static void pgmap_radix_release(struct resource *res)
+{
+       unsigned long pgoff, order;
 
        mutex_lock(&pgmap_lock);
-       for (key = res->start; key <= res->end; key += SECTION_SIZE)
-               radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
+       foreach_order_pgoff(res, order, pgoff)
+               radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
        mutex_unlock(&pgmap_lock);
+
+       synchronize_rcu();
 }
 
 static unsigned long pfn_first(struct page_map *page_map)
@@ -268,7 +291,7 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
 
        WARN_ON_ONCE(!rcu_read_lock_held());
 
-       page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
+       page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
        return page_map ? &page_map->pgmap : NULL;
 }
 
@@ -293,12 +316,12 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
 void *devm_memremap_pages(struct device *dev, struct resource *res,
                struct percpu_ref *ref, struct vmem_altmap *altmap)
 {
-       resource_size_t key, align_start, align_size, align_end;
+       resource_size_t align_start, align_size, align_end;
+       unsigned long pfn, pgoff, order;
        pgprot_t pgprot = PAGE_KERNEL;
        struct dev_pagemap *pgmap;
        struct page_map *page_map;
        int error, nid, is_ram;
-       unsigned long pfn;
 
        align_start = res->start & ~(SECTION_SIZE - 1);
        align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -337,11 +360,12 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
        mutex_lock(&pgmap_lock);
        error = 0;
        align_end = align_start + align_size - 1;
-       for (key = align_start; key <= align_end; key += SECTION_SIZE) {
+
+       foreach_order_pgoff(res, order, pgoff) {
                struct dev_pagemap *dup;
 
                rcu_read_lock();
-               dup = find_dev_pagemap(key);
+               dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff));
                rcu_read_unlock();
                if (dup) {
                        dev_err(dev, "%s: %pr collides with mapping for %s\n",
@@ -349,8 +373,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
                        error = -EBUSY;
                        break;
                }
-               error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
-                               page_map);
+               error = __radix_tree_insert(&pgmap_radix,
+                               PHYS_PFN(res->start) + pgoff, order, page_map);
                if (error) {
                        dev_err(dev, "%s: failed: %d\n", __func__, error);
                        break;
index 48b1af447fa749c78c74217f2a09b1b1ade0fd46..0ded10a22639472daf48453d4f6754fc5a626c4b 100644 (file)
@@ -678,6 +678,7 @@ config ZONE_DEVICE
        depends on MEMORY_HOTREMOVE
        depends on SPARSEMEM_VMEMMAP
        depends on ARCH_HAS_ZONE_DEVICE
+       select RADIX_TREE_MULTIORDER
 
        help
          Device memory hotplug support allows for establishing pmem,