return -EINVAL;
}
- /*
- * Currently supports only single page. Supporting multiple
- * pages won't be too difficult if it ever becomes necessary.
- */
- pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
- PERCPU_DYNAMIC_RESERVE);
- if (pcpul_size > PMD_SIZE) {
- pr_warning("PERCPU: static data is larger than large page, "
- "can't use large page\n");
- return -EINVAL;
- }
- dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
-
- /* allocate pointer array and alloc large pages */
- map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
- pcpul_map = alloc_bootmem(map_size);
-
- for_each_possible_cpu(cpu) {
- pcpul_map[cpu].cpu = cpu;
- pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
- PMD_SIZE);
- if (!pcpul_map[cpu].ptr) {
- pr_warning("PERCPU: failed to allocate large page "
- "for cpu%u\n", cpu);
- goto enomem;
- }
-
- /*
- * Only use pcpul_size bytes and give back the rest.
- *
- * Ingo: The 2MB up-rounding bootmem is needed to make
- * sure the partial 2MB page is still fully RAM - it's
- * not well-specified to have a PAT-incompatible area
- * (unmapped RAM, device memory, etc.) in that hole.
- */
- free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
- PMD_SIZE - pcpul_size);
-
- memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
+ /* allocate and build unit_map */
- unit_map_size = num_possible_cpus() * sizeof(int);
++ unit_map_size = nr_cpu_ids * sizeof(int);
+ unit_map = alloc_bootmem_nopanic(unit_map_size);
+ if (!unit_map) {
+ pr_warning("PERCPU: failed to allocate unit_map\n");
+ return -ENOMEM;
}
- /* allocate address and map */
- pcpul_vm.flags = VM_ALLOC;
- pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
- vm_area_register_early(&pcpul_vm, PMD_SIZE);
-
- for_each_possible_cpu(cpu) {
- pmd_t *pmd, pmd_v;
-
- pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
- cpu * PMD_SIZE);
- pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
- PAGE_KERNEL_LARGE);
- set_pmd(pmd, pmd_v);
+ ret = pcpu_lpage_build_unit_map(static_size,
+ PERCPU_FIRST_CHUNK_RESERVE,
+ &dyn_size, &unit_size, PMD_SIZE,
+ unit_map, pcpu_lpage_cpu_distance);
+ if (ret < 0) {
+ pr_warning("PERCPU: failed to build unit_map\n");
+ goto out_free;
}
+ nr_units = ret;
- /* we're ready, commit */
- pr_info("PERCPU: Remapped at %p with large pages, static data "
- "%zu bytes\n", pcpul_vm.addr, static_size);
-
- ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
- PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
- PMD_SIZE, pcpul_vm.addr, NULL);
-
- /* sort pcpul_map array for pcpu_lpage_remapped() */
- for (i = 0; i < nr_cpu_ids - 1; i++)
- for (j = i + 1; j < nr_cpu_ids; j++)
- if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
- struct pcpul_ent tmp = pcpul_map[i];
- pcpul_map[i] = pcpul_map[j];
- pcpul_map[j] = tmp;
- }
-
- return ret;
-
-enomem:
- for_each_possible_cpu(cpu)
- if (pcpul_map[cpu].ptr)
- free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
- free_bootmem(__pa(pcpul_map), map_size);
- return -ENOMEM;
-}
+ /* do the parameters look okay? */
+ if (!chosen) {
+ size_t vm_size = VMALLOC_END - VMALLOC_START;
+ size_t tot_size = nr_units * unit_size;
-/**
- * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
- * @kaddr: the kernel address in question
- *
- * Determine whether @kaddr falls in the pcpul recycled area. This is
- * used by pageattr to detect VM aliases and break up the pcpu PMD
- * mapping such that the same physical page is not mapped under
- * different attributes.
- *
- * The recycled area is always at the tail of a partially used PMD
- * page.
- *
- * RETURNS:
- * Address of corresponding remapped pcpu address if match is found;
- * otherwise, NULL.
- */
-void *pcpu_lpage_remapped(void *kaddr)
-{
- void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
- unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
- int left = 0, right = nr_cpu_ids - 1;
- int pos;
-
- /* pcpul in use at all? */
- if (!pcpul_map)
- return NULL;
-
- /* okay, perform binary search */
- while (left <= right) {
- pos = (left + right) / 2;
-
- if (pcpul_map[pos].ptr < pmd_addr)
- left = pos + 1;
- else if (pcpul_map[pos].ptr > pmd_addr)
- right = pos - 1;
- else {
- /* it shouldn't be in the area for the first chunk */
- WARN_ON(offset < pcpul_size);
-
- return pcpul_vm.addr +
- pcpul_map[pos].cpu * PMD_SIZE + offset;
+ /* don't consume more than 20% of vmalloc area */
+ if (tot_size > vm_size / 5) {
+ pr_info("PERCPU: too large chunk size %zuMB for "
+ "large page remap\n", tot_size >> 20);
+ ret = -EINVAL;
+ goto out_free;
}
}
chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size;
- chunk->page = chunk->page_ar;
- chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+ chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
if (!chunk->vm) {
free_pcpu_chunk(chunk);
return NULL;
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
BUG_ON(!static_size);
- if (unit_size >= 0) {
- BUG_ON(unit_size < size_sum);
- BUG_ON(unit_size & ~PAGE_MASK);
- BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
- } else
- BUG_ON(base_addr);
- BUG_ON(base_addr && populate_pte_fn);
-
- if (unit_size >= 0)
- pcpu_unit_pages = unit_size >> PAGE_SHIFT;
- else
- pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
- PFN_UP(size_sum));
+ BUG_ON(!base_addr);
+ BUG_ON(unit_size < size_sum);
+ BUG_ON(unit_size & ~PAGE_MASK);
+ BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
+
+ /* determine number of units and verify and initialize pcpu_unit_map */
+ if (unit_map) {
+ int first_unit = INT_MAX, last_unit = INT_MIN;
+
+ for_each_possible_cpu(cpu) {
+ int unit = unit_map[cpu];
+
+ BUG_ON(unit < 0);
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ /* the mapping should be one-to-one */
+ BUG_ON(unit_map[tcpu] == unit);
+ }
+
+ if (unit < first_unit) {
+ pcpu_first_unit_cpu = cpu;
+ first_unit = unit;
+ }
+ if (unit > last_unit) {
+ pcpu_last_unit_cpu = cpu;
+ last_unit = unit;
+ }
+ }
+ pcpu_nr_units = last_unit + 1;
+ pcpu_unit_map = unit_map;
+ } else {
+ int *identity_map;
+
+ /* #units == #cpus, identity mapped */
- identity_map = alloc_bootmem(num_possible_cpus() *
++ identity_map = alloc_bootmem(nr_cpu_ids *
+ sizeof(identity_map[0]));
- pcpu_nr_units = num_possible_cpus();
++ pcpu_nr_units = nr_cpu_ids;
+ for_each_possible_cpu(cpu)
+ identity_map[cpu] = cpu;
+
+ pcpu_first_unit_cpu = 0;
+ pcpu_last_unit_cpu = pcpu_nr_units - 1;
+ pcpu_unit_map = identity_map;
+ }
+
+ /* determine basic parameters */
+ pcpu_unit_pages = unit_size >> PAGE_SHIFT;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
- pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
- pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
- + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
+ pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
+ pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
+ BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
if (dyn_size < 0)
dyn_size = pcpu_unit_size - static_size - reserved_size;
unsigned int cpu;
/* determine parameters and allocate */
- pcpue_size = PFN_ALIGN(static_size + reserved_size +
- (dyn_size >= 0 ? dyn_size : 0));
- if (dyn_size != 0)
- dyn_size = pcpue_size - static_size - reserved_size;
-
- if (unit_size >= 0) {
- BUG_ON(unit_size < pcpue_size);
- pcpue_unit_size = unit_size;
- } else
- pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
-
- chunk_size = pcpue_unit_size * nr_cpu_ids;
-
- pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
- __pa(MAX_DMA_ADDRESS));
- if (!pcpue_ptr) {
+ size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+
+ unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
- chunk_size = unit_size * num_possible_cpus();
++ chunk_size = unit_size * nr_cpu_ids;
+
+ base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
+ __pa(MAX_DMA_ADDRESS));
+ if (!base) {
pr_warning("PERCPU: failed to allocate %zu bytes for "
"embedding\n", chunk_size);
return -ENOMEM;
}
/* return the leftover and copy */
- for_each_possible_cpu(cpu) {
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
- void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
+ void *ptr = base + cpu * unit_size;
- free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
- memcpy(ptr, __per_cpu_load, static_size);
+ if (cpu_possible(cpu)) {
- free_bootmem(__pa(ptr + pcpue_size),
- pcpue_unit_size - pcpue_size);
++ free_bootmem(__pa(ptr + size_sum),
++ unit_size - size_sum);
+ memcpy(ptr, __per_cpu_load, static_size);
+ } else
- free_bootmem(__pa(ptr), pcpue_unit_size);
++ free_bootmem(__pa(ptr), unit_size);
}
/* we're ready, commit */
pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
- pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+ size_sum >> PAGE_SHIFT, base, static_size);
+
+ return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
+ unit_size, base, NULL);
+}
+
+/**
+ * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
+ * @free_fn: function to free percpu page, always called with PAGE_SIZE
+ * @populate_pte_fn: function to populate pte
+ *
+ * This is a helper to ease setting up the 4k first percpu chunk which
+ * can be called where pcpu_setup_first_chunk() is expected.
+ *
+ * This is the basic allocator. The static percpu area is allocated
+ * page-by-page into the vmalloc area.
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access on success, -errno on failure.
+ */
+ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
+ pcpu_fc_alloc_fn_t alloc_fn,
+ pcpu_fc_free_fn_t free_fn,
+ pcpu_fc_populate_pte_fn_t populate_pte_fn)
+{
+ static struct vm_struct vm;
+ int unit_pages;
+ size_t pages_size;
+ struct page **pages;
+ unsigned int cpu;
+ int i, j;
+ ssize_t ret;
+
+ unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
+ PCPU_MIN_UNIT_SIZE));
+
+ /* unaligned allocations can't be freed, round up to page size */
- pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
- sizeof(pages[0]));
++ pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
+ pages = alloc_bootmem(pages_size);
+
+ /* allocate pages */
+ j = 0;
+ for_each_possible_cpu(cpu)
+ for (i = 0; i < unit_pages; i++) {
+ void *ptr;
+
+ ptr = alloc_fn(cpu, PAGE_SIZE);
+ if (!ptr) {
+ pr_warning("PERCPU: failed to allocate "
+ "4k page for cpu%u\n", cpu);
+ goto enomem;
+ }
+ pages[j++] = virt_to_page(ptr);
+ }
+
+ /* allocate vm area, map the pages and copy static data */
+ vm.flags = VM_ALLOC;
- vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
++ vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT;
+ vm_area_register_early(&vm, PAGE_SIZE);
+
+ for_each_possible_cpu(cpu) {
+ unsigned long unit_addr = (unsigned long)vm.addr +
+ (cpu * unit_pages << PAGE_SHIFT);
+
+ for (i = 0; i < unit_pages; i++)
+ populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
+
+ /* pte already populated, the following shouldn't fail */
+ ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
+ unit_pages);
+ if (ret < 0)
+ panic("failed to map percpu area, err=%zd\n", ret);
+
+ /*
+ * FIXME: Archs with virtual cache should flush local
+ * cache for the linear mapping here - something
+ * equivalent to flush_cache_vmap() on the local cpu.
+ * flush_cache_vmap() can't be used as most supporting
+ * data structures are not set up yet.
+ */
+
+ /* copy static data */
+ memcpy((void *)unit_addr, __per_cpu_load, static_size);
+ }
+
+ /* we're ready, commit */
+ pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
+ unit_pages, static_size);
+
+ ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
+ unit_pages << PAGE_SHIFT, vm.addr, NULL);
+ goto out_free_ar;
+
+enomem:
+ while (--j >= 0)
+ free_fn(page_address(pages[j]), PAGE_SIZE);
+ ret = -ENOMEM;
+out_free_ar:
+ free_bootmem(__pa(pages), pages_size);
+ return ret;
+}
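
For reference, a minimal sketch (not part of the patch) of how arch setup code could wire this helper up. The wrapper names below are illustrative only; the sketch assumes the x86 helpers pcpu_alloc_bootmem() and populate_extra_pte() referenced elsewhere in this patch.

static void * __init pcpu4k_alloc(unsigned int cpu, size_t size)
{
        /* node-local bootmem backing for @cpu, one PAGE_SIZE page at a time */
        return pcpu_alloc_bootmem(cpu, size, size);
}

static void __init pcpu4k_free(void *ptr, size_t size)
{
        free_bootmem(__pa(ptr), size);
}

static void __init pcpu4k_populate_pte(unsigned long addr)
{
        populate_extra_pte(addr);
}

static ssize_t __init setup_pcpu_4k(size_t static_size)
{
        return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
                                   pcpu4k_alloc, pcpu4k_free,
                                   pcpu4k_populate_pte);
}
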
+
+/*
+ * Large page remapping first chunk setup helper
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+
+/**
+ * pcpu_lpage_build_unit_map - build unit_map for large page remapping
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
+ * @unit_sizep: out parameter for unit size
+ * @lpage_size: the size of a large page
+ * @unit_map: unit_map to be filled
+ * @cpu_distance_fn: callback to determine distance between cpus
+ *
+ * This function builds the cpu -> unit map and determines the other
+ * parameters considering the needed percpu size, the large page size
+ * and the distances between CPUs on a NUMA machine.
+ *
+ * CPUs which are within LOCAL_DISTANCE of each other in both
+ * directions are grouped together and may share units in the same
+ * large page. The returned configuration is guaranteed to have CPUs
+ * on different nodes on different large pages and >=75% usage of the
+ * allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
+ * returns the number of units to be allocated. -errno on failure.
+ */
+int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
+ ssize_t *dyn_sizep, size_t *unit_sizep,
+ size_t lpage_size, int *unit_map,
+ pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+ static int group_map[NR_CPUS] __initdata;
+ static int group_cnt[NR_CPUS] __initdata;
+ int group_cnt_max = 0;
+ size_t size_sum, min_unit_size, alloc_size;
+ int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
+ int last_allocs;
+ unsigned int cpu, tcpu;
+ int group, unit;
+
+ /*
+ * Determine min_unit_size, alloc_size and max_upa such that
+ * alloc_size is a multiple of lpage_size and is the smallest
+ * size which can accommodate 4k aligned segments which are equal
+ * to or larger than min_unit_size.
+ */
+ size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
+ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+ alloc_size = roundup(min_unit_size, lpage_size);
+ upa = alloc_size / min_unit_size;
+ while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ upa--;
+ max_upa = upa;
+
+ /* group cpus according to their proximity */
+ for_each_possible_cpu(cpu) {
+ group = 0;
+ next_group:
+ for_each_possible_cpu(tcpu) {
+ if (cpu == tcpu)
+ break;
+ if (group_map[tcpu] == group &&
+ (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+ cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+ group++;
+ goto next_group;
+ }
+ }
+ group_map[cpu] = group;
+ group_cnt[group]++;
+ group_cnt_max = max(group_cnt_max, group_cnt[group]);
+ }
+
+ /*
+ * Expand unit size until address space usage goes over 75%
+ * and then as much as possible without using more address
+ * space.
+ */
+ last_allocs = INT_MAX;
+ for (upa = max_upa; upa; upa--) {
+ int allocs = 0, wasted = 0;
+
+ if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ continue;
+
+ for (group = 0; group_cnt[group]; group++) {
+ int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+ allocs += this_allocs;
+ wasted += this_allocs * upa - group_cnt[group];
+ }
+
+ /*
+ * Don't accept if wastage is over 25%. The
+ * greater-than comparison ensures upa==1 always
+ * passes the following check.
+ */
+ if (wasted > num_possible_cpus() / 3)
+ continue;
+
+ /* and then don't consume more memory */
+ if (allocs > last_allocs)
+ break;
+ last_allocs = allocs;
+ best_upa = upa;
+ }
+ *unit_sizep = alloc_size / best_upa;
- return pcpu_setup_first_chunk(pcpue_get_page, static_size,
- reserved_size, dyn_size,
- pcpue_unit_size, pcpue_ptr, NULL);
+ /* assign units to cpus accordingly */
+ unit = 0;
+ for (group = 0; group_cnt[group]; group++) {
+ for_each_possible_cpu(cpu)
+ if (group_map[cpu] == group)
+ unit_map[cpu] = unit++;
+ unit = roundup(unit, best_upa);
+ }
+
+ return unit; /* unit contains aligned number of units */
+}
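
To make the sizing heuristics above concrete, here is a small standalone userspace model of the upa selection (a toy, not kernel code). It assumes four possible CPUs split evenly across two NUMA nodes, a 2MB large page and size_sum of 448KB; all of these numbers are made up for illustration.

#include <stdio.h>
#include <limits.h>

#define PAGE_SIZE       4096UL
#define PAGE_MASK       (~(PAGE_SIZE - 1))
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long lpage_size = 2UL << 20;           /* 2MB large page */
        unsigned long min_unit_size = 448UL << 10;      /* assumed size_sum */
        unsigned long alloc_size, upa, max_upa, best_upa = 0;
        int group_cnt[] = { 2, 2, 0 };                  /* cpus 0-1 node 0, cpus 2-3 node 1 */
        int nr_cpus = 4, last_allocs = INT_MAX, group;

        /* smallest lpage multiple which can hold >= min_unit_size units */
        alloc_size = (min_unit_size + lpage_size - 1) / lpage_size * lpage_size;
        upa = alloc_size / min_unit_size;
        while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
                upa--;
        max_upa = upa;                                  /* 4 for this config */

        for (upa = max_upa; upa; upa--) {
                int allocs = 0, wasted = 0;

                if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
                        continue;
                for (group = 0; group_cnt[group]; group++) {
                        int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
                        allocs += this_allocs;
                        wasted += this_allocs * upa - group_cnt[group];
                }
                if (wasted > nr_cpus / 3)               /* >25% of the space wasted */
                        continue;
                if (allocs > last_allocs)
                        break;
                last_allocs = allocs;
                best_upa = upa;
        }

        /* prints: best_upa=2 unit_size=1048576, i.e. 1MB units, one 2MB lpage per node */
        printf("best_upa=%lu unit_size=%lu\n", best_upa, alloc_size / best_upa);
        return 0;
}
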
+
+struct pcpul_ent {
+ void *ptr;
+ void *map_addr;
+};
+
+static size_t pcpul_size;
+static size_t pcpul_lpage_size;
+static int pcpul_nr_lpages;
+static struct pcpul_ent *pcpul_map;
+
+static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
+ unsigned int *cpup)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ if (unit_map[cpu] == unit) {
+ if (cpup)
+ *cpup = cpu;
+ return true;
+ }
+
+ return false;
+}
+
+static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
+ size_t reserved_size, size_t dyn_size,
+ size_t unit_size, size_t lpage_size,
+ const int *unit_map, int nr_units)
+{
+ int width = 1, v = nr_units;
+ char empty_str[] = "--------";
+ int upl, lpl; /* units per lpage, lpage per line */
+ unsigned int cpu;
+ int lpage, unit;
+
+ while (v /= 10)
+ width++;
+ empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
+
+ upl = max_t(int, lpage_size / unit_size, 1);
+ lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
+
+ printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
+ static_size, reserved_size, dyn_size, unit_size, lpage_size);
+
+ for (lpage = 0, unit = 0; unit < nr_units; unit++) {
+ if (!(unit % upl)) {
+ if (!(lpage++ % lpl)) {
+ printk("\n");
+ printk("%spcpu-lpage: ", lvl);
+ } else
+ printk("| ");
+ }
+ if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+ printk("%0*d ", width, cpu);
+ else
+ printk("%s ", empty_str);
+ }
+ printk("\n");
+}
+
+/**
+ * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: free size for dynamic allocation in bytes
+ * @unit_size: unit size in bytes
+ * @lpage_size: the size of a large page
+ * @unit_map: cpu -> unit mapping
+ * @nr_units: the number of units
+ * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
+ * @free_fn: function to free percpu memory, @size <= lpage_size
+ * @map_fn: function to map percpu lpage, always called with lpage_size
+ *
+ * This allocator uses large pages to build and map the first chunk.
+ * Unlike other helpers, the caller should always specify @dyn_size
+ * and @unit_size. These parameters along with @unit_map and
+ * @nr_units can be determined using pcpu_lpage_build_unit_map().
+ * This two-stage initialization allows the arch code to evaluate the
+ * parameters before committing to them.
+ *
+ * Large pages are allocated as directed by @unit_map and other
+ * parameters and mapped to vmalloc space. Unused holes are returned
+ * to the page allocator. Note that these holes end up being actively
+ * mapped twice - once in the kernel linear mapping and once in the
+ * vmalloc area used for the first percpu chunk. Depending on the
+ * architecture, this might cause problems when changing the page
+ * attributes of the returned area. These double-mapped areas can be
+ * detected using pcpu_lpage_remapped().
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access on success, -errno on failure.
+ */
+ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
+ size_t dyn_size, size_t unit_size,
+ size_t lpage_size, const int *unit_map,
+ int nr_units,
+ pcpu_fc_alloc_fn_t alloc_fn,
+ pcpu_fc_free_fn_t free_fn,
+ pcpu_fc_map_fn_t map_fn)
+{
+ static struct vm_struct vm;
+ size_t chunk_size = unit_size * nr_units;
+ size_t map_size;
+ unsigned int cpu;
+ ssize_t ret;
+ int i, j, unit;
+
+ pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
+ unit_size, lpage_size, unit_map, nr_units);
+
+ BUG_ON(chunk_size % lpage_size);
+
+ pcpul_size = static_size + reserved_size + dyn_size;
+ pcpul_lpage_size = lpage_size;
+ pcpul_nr_lpages = chunk_size / lpage_size;
+
+ /* allocate pointer array and alloc large pages */
+ map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
+ pcpul_map = alloc_bootmem(map_size);
+
+ /* allocate all pages */
+ for (i = 0; i < pcpul_nr_lpages; i++) {
+ size_t offset = i * lpage_size;
+ int first_unit = offset / unit_size;
+ int last_unit = (offset + lpage_size - 1) / unit_size;
+ void *ptr;
+
+ /* find out which cpu is mapped to this unit */
+ for (unit = first_unit; unit <= last_unit; unit++)
+ if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+ goto found;
+ continue;
+ found:
+ ptr = alloc_fn(cpu, lpage_size);
+ if (!ptr) {
+ pr_warning("PERCPU: failed to allocate large page "
+ "for cpu%u\n", cpu);
+ goto enomem;
+ }
+
+ pcpul_map[i].ptr = ptr;
+ }
+
+ /* return unused holes */
+ for (unit = 0; unit < nr_units; unit++) {
+ size_t start = unit * unit_size;
+ size_t end = start + unit_size;
+ size_t off, next;
+
+ /* don't free used part of occupied unit */
+ if (pcpul_unit_to_cpu(unit, unit_map, NULL))
+ start += pcpul_size;
+
+ /* a unit can span more than one lpage, punch the holes */
+ for (off = start; off < end; off = next) {
+ void *ptr = pcpul_map[off / lpage_size].ptr;
+ next = min(roundup(off + 1, lpage_size), end);
+ if (ptr)
+ free_fn(ptr + off % lpage_size, next - off);
+ }
+ }
+
+ /* allocate address, map and copy */
+ vm.flags = VM_ALLOC;
+ vm.size = chunk_size;
+ vm_area_register_early(&vm, unit_size);
+
+ for (i = 0; i < pcpul_nr_lpages; i++) {
+ if (!pcpul_map[i].ptr)
+ continue;
+ pcpul_map[i].map_addr = vm.addr + i * lpage_size;
+ map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
+ }
+
+ for_each_possible_cpu(cpu)
+ memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
+ static_size);
+
+ /* we're ready, commit */
+ pr_info("PERCPU: Remapped at %p with large pages, static data "
+ "%zu bytes\n", vm.addr, static_size);
+
+ ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
+ unit_size, vm.addr, unit_map);
+
+ /*
+ * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped
+ * lpages are pushed to the end and trimmed.
+ */
+ for (i = 0; i < pcpul_nr_lpages - 1; i++)
+ for (j = i + 1; j < pcpul_nr_lpages; j++) {
+ struct pcpul_ent tmp;
+
+ if (!pcpul_map[j].ptr)
+ continue;
+ if (pcpul_map[i].ptr &&
+ pcpul_map[i].ptr < pcpul_map[j].ptr)
+ continue;
+
+ tmp = pcpul_map[i];
+ pcpul_map[i] = pcpul_map[j];
+ pcpul_map[j] = tmp;
+ }
+
+ while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
+ pcpul_nr_lpages--;
+
+ return ret;
+
+enomem:
+ for (i = 0; i < pcpul_nr_lpages; i++)
+ if (pcpul_map[i].ptr)
+ free_fn(pcpul_map[i].ptr, lpage_size);
+ free_bootmem(__pa(pcpul_map), map_size);
+ return -ENOMEM;
+}
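
As a reading aid, a hedged sketch of the intended two-stage calling sequence on the arch side; the x86 hunks near the top of this patch perform the first stage, and pcpu_lpage_alloc(), pcpu_lpage_free() and pcpul_map() below are placeholder names for arch-provided callbacks.

static ssize_t __init setup_first_chunk_lpage(size_t static_size)
{
        size_t unit_size;
        ssize_t dyn_size = -1;
        int *unit_map, nr_units;

        unit_map = alloc_bootmem_nopanic(nr_cpu_ids * sizeof(int));
        if (!unit_map)
                return -ENOMEM;

        /* stage 1: plan - cpu grouping, unit size and the cpu -> unit map */
        nr_units = pcpu_lpage_build_unit_map(static_size,
                                             PERCPU_FIRST_CHUNK_RESERVE,
                                             &dyn_size, &unit_size, PMD_SIZE,
                                             unit_map, pcpu_lpage_cpu_distance);
        if (nr_units < 0)
                return nr_units;

        /* the arch can sanity check unit_size * nr_units before committing */

        /* stage 2: commit - allocate lpages, map them and set up the chunk */
        return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
                                      dyn_size, unit_size, PMD_SIZE,
                                      unit_map, nr_units,
                                      pcpu_lpage_alloc, pcpu_lpage_free,
                                      pcpul_map);
}
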
+
+/**
+ * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
+ * @kaddr: the kernel address in question
+ *
+ * Determine whether @kaddr falls in the pcpul recycled area. This is
+ * used by pageattr to detect VM aliases and break up the pcpu large
+ * page mapping such that the same physical page is not mapped under
+ * different attributes.
+ *
+ * The recycled area is always at the tail of a partially used large
+ * page.
+ *
+ * RETURNS:
+ * Address of corresponding remapped pcpu address if match is found;
+ * otherwise, NULL.
+ */
+void *pcpu_lpage_remapped(void *kaddr)
+{
+ unsigned long lpage_mask = pcpul_lpage_size - 1;
+ void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
+ unsigned long offset = (unsigned long)kaddr & lpage_mask;
+ int left = 0, right = pcpul_nr_lpages - 1;
+ int pos;
+
+ /* pcpul in use at all? */
+ if (!pcpul_map)
+ return NULL;
+
+ /* okay, perform binary search */
+ while (left <= right) {
+ pos = (left + right) / 2;
+
+ if (pcpul_map[pos].ptr < lpage_addr)
+ left = pos + 1;
+ else if (pcpul_map[pos].ptr > lpage_addr)
+ right = pos - 1;
+ else
+ return pcpul_map[pos].map_addr + offset;
+ }
+
+ return NULL;
+}
+#endif
+
+/*
+ * Generic percpu area setup.
+ *
+ * The embedding helper is used because its behavior closely resembles
+ * the original non-dynamic generic percpu area setup. This is
+ * important because many archs have addressing restrictions and might
+ * fail if the percpu area is located far away from the previous
+ * location. As an added bonus, in non-NUMA cases, embedding is
+ * generally a good idea TLB-wise because the percpu area can piggyback
+ * on the physical linear memory mapping, which uses large page
+ * mappings on applicable archs.
+ */
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+
+void __init setup_per_cpu_areas(void)
+{
+ size_t static_size = __per_cpu_end - __per_cpu_start;
+ ssize_t unit_size;
+ unsigned long delta;
+ unsigned int cpu;
+
+ /*
+ * Always reserve area for module percpu variables. That's
+ * what the legacy allocator did.
+ */
+ unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
+ PERCPU_DYNAMIC_RESERVE);
+ if (unit_size < 0)
+ panic("Failed to initialized percpu areas.");
+
+ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+ for_each_possible_cpu(cpu)
+ __per_cpu_offset[cpu] = delta + cpu * unit_size;
}
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */