slab: add memory hotplug support

author David Rientjes <rientjes@google.com>

Sun, 28 Mar 2010 02:40:47 +0000 (19:40 -0700)

committer Pekka Enberg <penberg@cs.helsinki.fi>

Wed, 7 Apr 2010 16:28:31 +0000 (19:28 +0300)
author David Rientjes <rientjes@google.com>
Sun, 28 Mar 2010 02:40:47 +0000 (19:40 -0700)
committer Pekka Enberg <penberg@cs.helsinki.fi>
Wed, 7 Apr 2010 16:28:31 +0000 (19:28 +0300)
diff --git a/mm/slab.c b/mm/slab.c

index a9f325b28bed145a4aab6ff8414b074d3b233280..3230cd2c6b3b2051a1dd35b902dc6db2b5e92259 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
  #include       <linux/reciprocal_div.h>
  #include       <linux/debugobjects.h>
  #include       <linux/kmemcheck.h>
+#include       <linux/memory.h>
  
  #include       <asm/cacheflush.h>
  #include       <asm/tlbflush.h>
@@ -1102,6 +1103,52 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
  }
  #endif
  
+/*
+ * Allocates and initializes nodelists for a node on each slab cache, used for
+ * either memory or cpu hotplug.  If memory is being hot-added, the kmem_list3
+ * will be allocated off-node since memory is not yet online for the new node.
+ * When hotplugging memory or a cpu, existing nodelists are not replaced if
+ * already in use.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int init_cache_nodelists_node(int node)
+{
+       struct kmem_cache *cachep;
+       struct kmem_list3 *l3;
+       const int memsize = sizeof(struct kmem_list3);
+
+       list_for_each_entry(cachep, &cache_chain, next) {
+               /*
+                * Set up the size64 kmemlist for cpu before we can
+                * begin anything. Make sure some other cpu on this
+                * node has not already allocated this
+                */
+               if (!cachep->nodelists[node]) {
+                       l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+                       if (!l3)
+                               return -ENOMEM;
+                       kmem_list3_init(l3);
+                       l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+                           ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+                       /*
+                        * The l3s don't come and go as CPUs come and
+                        * go.  cache_chain_mutex is sufficient
+                        * protection here.
+                        */
+                       cachep->nodelists[node] = l3;
+               }
+
+               spin_lock_irq(&cachep->nodelists[node]->list_lock);
+               cachep->nodelists[node]->free_limit =
+                       (1 + nr_cpus_node(node)) *
+                       cachep->batchcount + cachep->num;
+               spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+       }
+       return 0;
+}
+
  static void __cpuinit cpuup_canceled(long cpu)
  {
         struct kmem_cache *cachep;
@@ -1172,7 +1219,7 @@ static int __cpuinit cpuup_prepare(long cpu)
         struct kmem_cache *cachep;
         struct kmem_list3 *l3 = NULL;
         int node = cpu_to_node(cpu);
-       const int memsize = sizeof(struct kmem_list3);
+       int err;
  
         /*
          * We need to do this right in the beginning since
@@ -1180,35 +1227,9 @@ static int __cpuinit cpuup_prepare(long cpu)
          * kmalloc_node allows us to add the slab to the right
          * kmem_list3 and not this cpu's kmem_list3
          */
-
-       list_for_each_entry(cachep, &cache_chain, next) {
-               /*
-                * Set up the size64 kmemlist for cpu before we can
-                * begin anything. Make sure some other cpu on this
-                * node has not already allocated this
-                */
-               if (!cachep->nodelists[node]) {
-                       l3 = kmalloc_node(memsize, GFP_KERNEL, node);
-                       if (!l3)
-                               goto bad;
-                       kmem_list3_init(l3);
-                       l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-                           ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
-                       /*
-                        * The l3s don't come and go as CPUs come and
-                        * go.  cache_chain_mutex is sufficient
-                        * protection here.
-                        */
-                       cachep->nodelists[node] = l3;
-               }
-
-               spin_lock_irq(&cachep->nodelists[node]->list_lock);
-               cachep->nodelists[node]->free_limit =
-                       (1 + nr_cpus_node(node)) *
-                       cachep->batchcount + cachep->num;
-               spin_unlock_irq(&cachep->nodelists[node]->list_lock);
-       }
+       err = init_cache_nodelists_node(node);
+       if (err < 0)
+               goto bad;
  
         /*
          * Now we can go ahead with allocating the shared arrays and
@@ -1331,11 +1352,75 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
         &cpuup_callback, NULL, 0
  };
  
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Drains freelist for a node on each slab cache, used for memory hot-remove.
+ * Returns -EBUSY if all objects cannot be drained so that the node is not
+ * removed.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int __meminit drain_cache_nodelists_node(int node)
+{
+       struct kmem_cache *cachep;
+       int ret = 0;
+
+       list_for_each_entry(cachep, &cache_chain, next) {
+               struct kmem_list3 *l3;
+
+               l3 = cachep->nodelists[node];
+               if (!l3)
+                       continue;
+
+               drain_freelist(cachep, l3, l3->free_objects);
+
+               if (!list_empty(&l3->slabs_full) ||
+                   !list_empty(&l3->slabs_partial)) {
+                       ret = -EBUSY;
+                       break;
+               }
+       }
+       return ret;
+}
+
+static int __meminit slab_memory_callback(struct notifier_block *self,
+                                       unsigned long action, void *arg)
+{
+       struct memory_notify *mnb = arg;
+       int ret = 0;
+       int nid;
+
+       nid = mnb->status_change_nid;
+       if (nid < 0)
+               goto out;
+
+       switch (action) {
+       case MEM_GOING_ONLINE:
+               mutex_lock(&cache_chain_mutex);
+               ret = init_cache_nodelists_node(nid);
+               mutex_unlock(&cache_chain_mutex);
+               break;
+       case MEM_GOING_OFFLINE:
+               mutex_lock(&cache_chain_mutex);
+               ret = drain_cache_nodelists_node(nid);
+               mutex_unlock(&cache_chain_mutex);
+               break;
+       case MEM_ONLINE:
+       case MEM_OFFLINE:
+       case MEM_CANCEL_ONLINE:
+       case MEM_CANCEL_OFFLINE:
+               break;
+       }
+out:
+       return ret ? notifier_from_errno(ret) : NOTIFY_OK;
+}
+#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
+
  /*
   * swap the static kmem_list3 with kmalloced memory
   */
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
-                       int nodeid)
+static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+                               int nodeid)
  {
         struct kmem_list3 *ptr;
  
@@ -1580,6 +1665,14 @@ void __init kmem_cache_init_late(void)
          */
         register_cpu_notifier(&cpucache_notifier);
  
+#ifdef CONFIG_NUMA
+       /*
+        * Register a memory hotplug callback that initializes and frees
+        * nodelists.
+        */
+       hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
+
         /*
          * The reap timers are started later, with a module init call: That part
          * of the kernel is not yet operational.
author	David Rientjes <rientjes@google.com>
	Sun, 28 Mar 2010 02:40:47 +0000 (19:40 -0700)
committer	Pekka Enberg <penberg@cs.helsinki.fi>
	Wed, 7 Apr 2010 16:28:31 +0000 (19:28 +0300)