x86,percpu: use embedding for 64bit NUMA and page for 32bit NUMA
authorTejun Heo <tj@kernel.org>
Fri, 14 Aug 2009 06:00:52 +0000 (15:00 +0900)
committerTejun Heo <tj@kernel.org>
Fri, 14 Aug 2009 06:00:52 +0000 (15:00 +0900)
Embedding percpu first chunk allocator can now handle very sparse unit
mapping.  Use embedding allocator instead of lpage for 64bit NUMA.
This removes extra TLB pressure and the need to do complex and fragile
dancing when changing page attributes.

For 32bit, using very sparse unit mapping isn't a good idea because
the vmalloc space is very constrained.  32bit NUMA machines aren't
exactly the focus of optimization and it isn't very clear whether
lpage performs better than page.  Use page first chunk allocator for
32bit NUMAs.

As this leaves setup_pcpu_*() functions pretty much empty, fold them
into setup_per_cpu_areas().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <andi@firstfloor.org>
arch/x86/Kconfig
arch/x86/kernel/setup_percpu.c

index f7ac2721551209de969c2c86952132910c5ac56a..869d7d301448874ebb65f62a9270cd5a8174d462 100644 (file)
@@ -156,10 +156,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK
 config NEED_PER_CPU_PAGE_FIRST_CHUNK
        def_bool y
 
-config NEED_PER_CPU_LPAGE_FIRST_CHUNK
-       def_bool y
-       depends on NEED_MULTIPLE_NODES
-
 config HAVE_CPUMASK_OF_CPU_MAP
        def_bool X86_64_SMP
 
index 67f6314de9f170492bdd7e3011a8da9a86c49cab..d559af913e1f89c40de010c1a936cdeab15f9751 100644 (file)
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
 #define PERCPU_FIRST_CHUNK_RESERVE     0
 #endif
 
+#ifdef CONFIG_X86_32
 /**
  * pcpu_need_numa - determine percpu allocation needs to consider NUMA
  *
@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void)
 #endif
        return false;
 }
+#endif
 
 /**
  * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
@@ -136,128 +138,23 @@ static void __init pcpu_fc_free(void *ptr, size_t size)
        free_bootmem(__pa(ptr), size);
 }
 
-/*
- * Large page remapping allocator
- */
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-static void __init pcpul_map(void *ptr, size_t size, void *addr)
-{
-       pmd_t *pmd, pmd_v;
-
-       pmd = populate_extra_pmd((unsigned long)addr);
-       pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE);
-       set_pmd(pmd, pmd_v);
-}
-
-static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
+static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
 {
+#ifdef CONFIG_NEED_MULTIPLE_NODES
        if (early_cpu_to_node(from) == early_cpu_to_node(to))
                return LOCAL_DISTANCE;
        else
                return REMOTE_DISTANCE;
-}
-
-static int __init setup_pcpu_lpage(bool chosen)
-{
-       size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
-       size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
-       struct pcpu_alloc_info *ai;
-       int rc;
-
-       /* on non-NUMA, embedding is better */
-       if (!chosen && !pcpu_need_numa())
-               return -EINVAL;
-
-       /* need PSE */
-       if (!cpu_has_pse) {
-               pr_warning("PERCPU: lpage allocator requires PSE\n");
-               return -EINVAL;
-       }
-
-       /* allocate and build unit_map */
-       ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
-                                  PMD_SIZE, pcpu_lpage_cpu_distance);
-       if (IS_ERR(ai)) {
-               pr_warning("PERCPU: failed to build unit_map (%ld)\n",
-                          PTR_ERR(ai));
-               return PTR_ERR(ai);
-       }
-
-       /* do the parameters look okay? */
-       if (!chosen) {
-               size_t vm_size = VMALLOC_END - VMALLOC_START;
-               size_t tot_size = 0;
-               int group;
-
-               for (group = 0; group < ai->nr_groups; group++)
-                       tot_size += ai->unit_size * ai->groups[group].nr_units;
-
-               /* don't consume more than 20% of vmalloc area */
-               if (tot_size > vm_size / 5) {
-                       pr_info("PERCPU: too large chunk size %zuMB for "
-                               "large page remap\n", tot_size >> 20);
-                       rc = -EINVAL;
-                       goto out_free;
-               }
-       }
-
-       rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
-out_free:
-       pcpu_free_alloc_info(ai);
-       return rc;
-}
 #else
-static int __init setup_pcpu_lpage(bool chosen)
-{
-       return -EINVAL;
-}
+       return LOCAL_DISTANCE;
 #endif
-
-/*
- * Embedding allocator
- *
- * The first chunk is sized to just contain the static area plus
- * module and dynamic reserves and embedded into linear physical
- * mapping so that it can use PMD mapping without additional TLB
- * pressure.
- */
-static int __init setup_pcpu_embed(bool chosen)
-{
-       size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
-
-       /*
-        * If large page isn't supported, there's no benefit in doing
-        * this.  Also, embedding allocation doesn't play well with
-        * NUMA.
-        */
-       if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
-               return -EINVAL;
-
-       return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
-                                     reserve - PERCPU_FIRST_CHUNK_RESERVE,
-                                     PAGE_SIZE, NULL, pcpu_fc_alloc,
-                                     pcpu_fc_free);
 }
 
-/*
- * Page allocator
- *
- * Boring fallback 4k page allocator.  This allocator puts more
- * pressure on PTE TLBs but other than that behaves nicely on both UMA
- * and NUMA.
- */
 static void __init pcpup_populate_pte(unsigned long addr)
 {
        populate_extra_pte(addr);
 }
 
-static int __init setup_pcpu_page(void)
-{
-       return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
-                                    pcpu_fc_alloc, pcpu_fc_free,
-                                    pcpup_populate_pte);
-}
-
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
@@ -281,30 +178,34 @@ void __init setup_per_cpu_areas(void)
                NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
 
        /*
-        * Allocate percpu area.  If PSE is supported, try to make use
-        * of large page mappings.  Please read comments on top of
-        * each allocator for details.
+        * Allocate percpu area.  Embedding allocator is our favorite;
+        * however, on NUMA configurations, it can result in very
+        * sparse unit mapping and vmalloc area isn't spacious enough
+        * on 32bit.  Use page in that case.
         */
+#ifdef CONFIG_X86_32
+       if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
+               pcpu_chosen_fc = PCPU_FC_PAGE;
+#endif
        rc = -EINVAL;
-       if (pcpu_chosen_fc != PCPU_FC_AUTO) {
-               if (pcpu_chosen_fc != PCPU_FC_PAGE) {
-                       if (pcpu_chosen_fc == PCPU_FC_LPAGE)
-                               rc = setup_pcpu_lpage(true);
-                       else
-                               rc = setup_pcpu_embed(true);
-
-                       if (rc < 0)
-                               pr_warning("PERCPU: %s allocator failed (%d), "
-                                          "falling back to page size\n",
-                                          pcpu_fc_names[pcpu_chosen_fc], rc);
-               }
-       } else {
-               rc = setup_pcpu_lpage(false);
+       if (pcpu_chosen_fc != PCPU_FC_PAGE) {
+               const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
+               const size_t dyn_size = PERCPU_MODULE_RESERVE +
+                       PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+
+               rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+                                           dyn_size, atom_size,
+                                           pcpu_cpu_distance,
+                                           pcpu_fc_alloc, pcpu_fc_free);
                if (rc < 0)
-                       rc = setup_pcpu_embed(false);
+                       pr_warning("PERCPU: %s allocator failed (%d), "
+                                  "falling back to page size\n",
+                                  pcpu_fc_names[pcpu_chosen_fc], rc);
        }
        if (rc < 0)
-               rc = setup_pcpu_page();
+               rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+                                          pcpu_fc_alloc, pcpu_fc_free,
+                                          pcpup_populate_pte);
        if (rc < 0)
                panic("cannot initialize percpu area (err=%d)", rc);