[PATCH] sparsemem base: simple NUMA remap space allocator
author Dave Hansen <haveblue@us.ibm.com>
Thu, 23 Jun 2005 07:07:39 +0000 (00:07 -0700)
committer Linus Torvalds <torvalds@ppc970.osdl.org>
Thu, 23 Jun 2005 16:45:01 +0000 (09:45 -0700)
Introduce a simple allocator for the NUMA remap space.  This space is very
scarce and is used for structures which are best allocated node-local.

This mechanism is also used on non-NUMA ia64 systems with a vmem_map to keep
the pgdat->node_mem_map initialized in a consistent place for all
architectures.

Issues:
o alloc_remap takes a node_id where we might expect a pgdat.  This was
  intended to allow us to allocate the pgdats themselves using this mechanism,
  which we do not yet do.  We could have alloc_remap_node() and
  alloc_remap_nid() for this purpose.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
arch/i386/Kconfig
arch/i386/mm/discontig.c
include/linux/bootmem.h
mm/page_alloc.c

index dfd904f6883b29820a19c5793240ac0e10103755..35ca3a17ed209c81310ae74f1ae77b90d101493d 100644 (file)
@@ -803,6 +803,11 @@ config NEED_NODE_MEMMAP_SIZE
        depends on DISCONTIGMEM
        default y
 
+config HAVE_ARCH_ALLOC_REMAP
+       bool
+       depends on NUMA
+       default y
+
 config HIGHPTE
        bool "Allocate 3rd-level pagetables from highmem"
        depends on HIGHMEM4G || HIGHMEM64G
index 85d2fcbe10793db952aca8556a0f93d1b9a11444..dcc71f969b01089a0b09969f014588de606ff61c 100644 (file)
@@ -108,6 +108,9 @@ unsigned long node_remap_offset[MAX_NUMNODES];
 void *node_remap_start_vaddr[MAX_NUMNODES];
 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
+void *node_remap_end_vaddr[MAX_NUMNODES];
+void *node_remap_alloc_vaddr[MAX_NUMNODES];
+
 /*
  * FLAT - support for basic PC memory model with discontig enabled, essentially
  *        a single node with all available processors in it with a flat
@@ -178,6 +181,21 @@ static void __init allocate_pgdat(int nid)
        }
 }
 
+/*
+ * alloc_remap - carve a zeroed chunk out of a node's remap space.
+ * @nid:  node whose remap region to allocate from; used unchecked as an
+ *        index into the node_remap_*_vaddr[] arrays.
+ * @size: number of bytes wanted; rounded up to a multiple of
+ *        L1_CACHE_BYTES.
+ *
+ * Simple bump allocator: the per-node cursor node_remap_alloc_vaddr[nid]
+ * is advanced past each allocation and nothing is ever handed back.
+ *
+ * Returns the start of the zero-filled allocation, or NULL when the
+ * node's cursor is unset or the rounded request does not fit.  The fit
+ * test is conservative: a request ending exactly at
+ * node_remap_end_vaddr[nid] is refused as well.
+ *
+ * NOTE(review): no locking here; presumably only used during
+ * single-threaded early boot - confirm against callers.
+ */
+void *alloc_remap(int nid, unsigned long size)
+{
+       void *allocation = node_remap_alloc_vaddr[nid];
+
+       size = ALIGN(size, L1_CACHE_BYTES);
+
+       if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
+               return NULL;
+
+       node_remap_alloc_vaddr[nid] += size;
+       memset(allocation, 0, size);
+
+       return allocation;
+}
+
 void __init remap_numa_kva(void)
 {
        void *vaddr;
@@ -185,8 +203,6 @@ void __init remap_numa_kva(void)
        int node;
 
        for_each_online_node(node) {
-               if (node == 0)
-                       continue;
                for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
                        vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
                        set_pmd_pfn((ulong) vaddr, 
@@ -202,11 +218,6 @@ static unsigned long calculate_numa_remap_pages(void)
        unsigned long size, reserve_pages = 0;
 
        for_each_online_node(nid) {
-               if (nid == 0)
-                       continue;
-               if (!node_remap_size[nid])
-                       continue;
-
                /*
                 * The acpi/srat node info can show hot-add memory zones
                 * where memory could be added but not currently present.
@@ -226,8 +237,8 @@ static unsigned long calculate_numa_remap_pages(void)
                printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
                                size, nid);
                node_remap_size[nid] = size;
-               reserve_pages += size;
                node_remap_offset[nid] = reserve_pages;
+               reserve_pages += size;
                printk("Shrinking node %d from %ld pages to %ld pages\n",
                        nid, node_end_pfn[nid], node_end_pfn[nid] - size);
                node_end_pfn[nid] -= size;
@@ -280,12 +291,18 @@ unsigned long __init setup_memory(void)
                        (ulong) pfn_to_kaddr(max_low_pfn));
        for_each_online_node(nid) {
                node_remap_start_vaddr[nid] = pfn_to_kaddr(
-                       (highstart_pfn + reserve_pages) - node_remap_offset[nid]);
+                               highstart_pfn + node_remap_offset[nid]);
+               /* Init the node remap allocator */
+               node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
+                       (node_remap_size[nid] * PAGE_SIZE);
+               node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
+                       ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+
                allocate_pgdat(nid);
                printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
                        (ulong) node_remap_start_vaddr[nid],
-                       (ulong) pfn_to_kaddr(highstart_pfn + reserve_pages
-                           - node_remap_offset[nid] + node_remap_size[nid]));
+                       (ulong) pfn_to_kaddr(highstart_pfn
+                          + node_remap_offset[nid] + node_remap_size[nid]));
        }
        printk("High memory starts at vaddr %08lx\n",
                        (ulong) pfn_to_kaddr(highstart_pfn));
@@ -348,23 +365,9 @@ void __init zone_sizes_init(void)
                }
 
                zholes_size = get_zholes_size(nid);
-               /*
-                * We let the lmem_map for node 0 be allocated from the
-                * normal bootmem allocator, but other nodes come from the
-                * remapped KVA area - mbligh
-                */
-               if (!nid)
-                       free_area_init_node(nid, NODE_DATA(nid),
-                                       zones_size, start, zholes_size);
-               else {
-                       unsigned long lmem_map;
-                       lmem_map = (unsigned long)node_remap_start_vaddr[nid];
-                       lmem_map += sizeof(pg_data_t) + PAGE_SIZE - 1;
-                       lmem_map &= PAGE_MASK;
-                       NODE_DATA(nid)->node_mem_map = (struct page *)lmem_map;
-                       free_area_init_node(nid, NODE_DATA(nid), zones_size,
-                               start, zholes_size);
-               }
+
+               free_area_init_node(nid, NODE_DATA(nid), zones_size, start,
+                               zholes_size);
        }
        return;
 }
index 0dd8ca1a3d5ac516bdaac0572d23e424a0099ebd..500f451ce0c0012dfe533ddd4f05b2a28db5b78f 100644 (file)
@@ -67,6 +67,15 @@ extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size,
        __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
+#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
+extern void *alloc_remap(int nid, unsigned long size);
+#else /* !CONFIG_HAVE_ARCH_ALLOC_REMAP: no remap space, stub never allocates */
+static inline void *alloc_remap(int nid, unsigned long size)
+{
+       return NULL;	/* always "fails"; nid and size are ignored */
+}
+#endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */
+
 extern unsigned long __initdata nr_kernel_pages;
 extern unsigned long __initdata nr_all_pages;
 
index 559336de968713921d5265587604c9f1240be8ef..bf1dd88190972bce44043098ca7be8105c53c436 100644 (file)
@@ -1936,6 +1936,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 {
        unsigned long size;
+       struct page *map;
 
        /* Skip empty nodes */
        if (!pgdat->node_spanned_pages)
@@ -1944,7 +1945,10 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
        /* ia64 gets its own node_mem_map, before this, without bootmem */
        if (!pgdat->node_mem_map) {
                size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-               pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
+               map = alloc_remap(pgdat->node_id, size);
+               if (!map)
+                       map = alloc_bootmem_node(pgdat, size);
+               pgdat->node_mem_map = map;
        }
 #ifndef CONFIG_DISCONTIGMEM
        /*