[COMMON] mm: introduce High-order Pages Allocator
author Sunyoung Kang <sy0816.kang@samsung.com>
Mon, 4 Jul 2016 06:50:57 +0000 (15:50 +0900)
committer Sangwook Ju <sw.ju@samsung.com>
Mon, 14 May 2018 10:45:25 +0000 (19:45 +0900)
The high-order pages allocator (HPA) is introduced to guarantee that
allocations of high-order pages of a given order and count succeed.
HPA first tries to take pages directly from the buddy allocator; if
not enough free high-order pages are available, it reclaims and
migrates in-use pages to build the required contiguous chunks.
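
For illustration, a minimal caller could look like the sketch below.
This is an assumption about typical usage, not part of the patch; note
that free_pages_highorder() is implemented in mm/hpa.c but not declared
in include/linux/gfp.h by this change, so the sketch frees each entry
with __free_pages() instead.

    /* Illustrative only: allocate, use and free nents order-sized chunks. */
    #include <linux/gfp.h>
    #include <linux/slab.h>

    static int hpa_usage_example(int order, int nents)
    {
            struct page **pages;
            int ret, i;

            pages = kcalloc(nents, sizeof(*pages), GFP_KERNEL);
            if (!pages)
                    return -ENOMEM;

            /* Returns 0 on success; reclaims/migrates in-use pages if needed. */
            ret = alloc_pages_highorder(order, pages, nents);
            if (ret) {
                    kfree(pages);
                    return ret;
            }

            /* Each pages[i] now refers to 1 << order physically contiguous pages. */

            for (i = 0; i < nents; i++)
                    __free_pages(pages[i], order);
            kfree(pages);
            return 0;
    }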

Change-Id: I61a7fe76d4a1c6a1058d2fe731bbfef8d3e6959a
Signed-off-by: Jinsung Yang <jsgood.yang@samsung.com>
Signed-off-by: Sunyoung Kang <sy0816.kang@samsung.com>
Signed-off-by: Cho KyongHo <pullip.cho@samsung.com>
include/linux/gfp.h
mm/Kconfig
mm/Makefile
mm/hpa.c [new file with mode: 0644]
mm/internal.h
mm/page_alloc.c

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index b041f94678de5ac8889f22eedc3f5b01ed656f01..16cf067bcbe7afb9fd55a4547a8290ca0299caae 100644
@@ -585,4 +585,13 @@ extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
 extern void init_cma_reserved_pageblock(struct page *page);
 #endif
 
+#ifdef CONFIG_HPA
+int alloc_pages_highorder(int order, struct page **pages, int nents);
+#else
+static inline int alloc_pages_highorder(int order, struct page **pages, int nents)
+{
+       return 0;
+}
+#endif
+
 #endif /* __LINUX_GFP_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 9c4bdddd80c2123ac522c7d6c08453cca6c8445f..c35c02d0c0b94db3a747bd91b4129bac0074455d 100644
@@ -639,6 +639,12 @@ config MAX_STACK_SIZE_MB
 
          A sane initial value is 80 MB.
 
+config HPA
+       bool "High-order Pages Allocator"
+       select CMA
+       help
+         Turns on the High-order Pages Allocator based on page migration.
+
 # For architectures that support deferred memory initialisation
 config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
        bool
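
For reference, enabling the allocator is a one-line change in a board
configuration; the defconfig path below is illustrative, and CMA is
pulled in automatically through the "select CMA" above:

    # arch/arm64/configs/<board>_defconfig (illustrative)
    CONFIG_HPA=y
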
diff --git a/mm/Makefile b/mm/Makefile
index e7ebd176fb935e382c9da55c39fde038f256c9e5..3c72e1ba63484af3791fe640ca570243f90f7c05 100644
@@ -94,6 +94,7 @@ obj-$(CONFIG_ZSMALLOC)        += zsmalloc.o
 obj-$(CONFIG_Z3FOLD)   += z3fold.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)      += cma.o
+obj-$(CONFIG_HPA) += hpa.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
diff --git a/mm/hpa.c b/mm/hpa.c
new file mode 100644
index 0000000..0c1580b
--- /dev/null
+++ b/mm/hpa.c
@@ -0,0 +1,338 @@
+/*
+ * linux/mm/hpa.c
+ *
+ * Copyright (C) 2015 Samsung Electronics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Does its best to allocate the required high-order pages.
+ */
+
+#include <linux/list.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/mmzone.h>
+#include <linux/migrate.h>
+#include <linux/memcontrol.h>
+#include <linux/page-isolation.h>
+#include <linux/mm_inline.h>
+#include <linux/swap.h>
+#include <linux/scatterlist.h>
+#include <linux/debugfs.h>
+#include <linux/vmalloc.h>
+#include <linux/device.h>
+#include <linux/oom.h>
+
+#include "internal.h"
+
+#define MAX_SCAN_TRY           (2)
+
+static unsigned long start_pfn, end_pfn;
+static unsigned long cached_scan_pfn;
+
+#define HPA_MIN_OOMADJ 100
+static unsigned long hpa_deathpending_timeout;
+
+static int test_task_flag(struct task_struct *p, int flag)
+{
+       struct task_struct *t = p;
+
+       do {
+               task_lock(t);
+               if (test_tsk_thread_flag(t, flag)) {
+                       task_unlock(t);
+                       return 1;
+               }
+               task_unlock(t);
+       } while_each_thread(p, t);
+
+       return 0;
+}
+
+static int hpa_killer(void)
+{
+       struct task_struct *tsk;
+       struct task_struct *selected = NULL;
+       unsigned long rem = 0;
+       int tasksize;
+       int selected_tasksize = 0;
+       short selected_oom_score_adj;
+       int ret = 0;
+
+       rcu_read_lock();
+       for_each_process(tsk) {
+               struct task_struct *p;
+               short oom_score_adj;
+
+               if (tsk->flags & PF_KTHREAD)
+                       continue;
+
+               if (test_task_flag(tsk, TIF_MEMALLOC))
+                       continue;
+
+               p = find_lock_task_mm(tsk);
+               if (!p)
+                       continue;
+
+               if (task_lmk_waiting(p) && p->mm &&
+                   time_before_eq(jiffies, hpa_deathpending_timeout)) {
+                       task_unlock(p);
+                       rcu_read_unlock();
+                       return ret;
+               }
+               oom_score_adj = p->signal->oom_score_adj;
+               tasksize = get_mm_rss(p->mm);
+               task_unlock(p);
+               if (tasksize <= 0 || oom_score_adj <= HPA_MIN_OOMADJ)
+                       continue;
+               if (same_thread_group(p, current))
+                       continue;
+               if (selected) {
+                       if (oom_score_adj < selected_oom_score_adj)
+                               continue;
+                       if (oom_score_adj == selected_oom_score_adj &&
+                           tasksize <= selected_tasksize)
+                               continue;
+               }
+               selected = p;
+               selected_tasksize = tasksize;
+               selected_oom_score_adj = oom_score_adj;
+       }
+
+       if (selected) {
+               pr_info("HPA: Killing '%s' (%d), adj %hd freed %ldkB\n",
+                               selected->comm, selected->pid,
+                               selected_oom_score_adj,
+                               selected_tasksize * (long)(PAGE_SIZE / 1024));
+               hpa_deathpending_timeout = jiffies + HZ;
+               task_set_lmk_waiting(selected);
+               send_sig(SIGKILL, selected, 0);
+               rem += selected_tasksize;
+       } else {
+               pr_info("HPA: no killable task\n");
+               ret = -ESRCH;
+       }
+       rcu_read_unlock();
+
+       return ret;
+}
+
+static bool is_movable_chunk(unsigned long start_pfn, unsigned int order)
+{
+       unsigned long pfn = start_pfn;
+       struct page *page = pfn_to_page(start_pfn);
+
+       for (pfn = start_pfn; pfn < start_pfn + (1 << order); pfn++) {
+               page = pfn_to_page(pfn);
+               if (PageBuddy(page)) {
+                       pfn += (1 << page_order(page)) - 1;
+                       continue;
+               }
+               if (PageCompound(page))
+                       return false;
+               if (PageReserved(page))
+                       return false;
+               if (!PageLRU(page))
+                       return false;
+       }
+       return true;
+}
+
+static int alloc_freepages_range(struct zone *zone, unsigned int order,
+                                struct page **pages, int required)
+
+{
+       unsigned int current_order;
+       unsigned int mt;
+       unsigned long wmark;
+       unsigned long flags;
+       struct free_area *area;
+       struct page *page;
+       int i;
+       int count = 0;
+
+       spin_lock_irqsave(&zone->lock, flags);
+
+       for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+               area = &(zone->free_area[current_order]);
+               wmark = min_wmark_pages(zone) + (1 << current_order);
+
+               for (mt = MIGRATE_UNMOVABLE; mt < MIGRATE_PCPTYPES; ++mt) {
+                       while (!list_empty(&area->free_list[mt])) {
+                               if (!zone_watermark_ok(zone, current_order,
+                                                       wmark, 0, 0))
+                                       goto wmark_fail;
+                               /*
+                                * expanding the current free chunk is not
+                                * supported here due to the complex logic of
+                                * expand().
+                                */
+                               if ((required << order) < (1 << current_order))
+                                       break;
+
+                               page = list_entry(area->free_list[mt].next,
+                                                       struct page, lru);
+                               list_del(&page->lru);
+                               __ClearPageBuddy(page);
+                               set_page_private(page, 0);
+                               set_pcppage_migratetype(page, mt);
+                               /*
+                                * skip checking bad page state
+                                * for fast allocation
+                                */
+                               area->nr_free--;
+                               __mod_zone_page_state(zone, NR_FREE_PAGES,
+                                                       -(1 << current_order));
+
+                               required -= 1 << (current_order - order);
+
+                               for (i = 1 << (current_order - order); i > 0; i--) {
+                                       post_alloc_hook(page, order, GFP_KERNEL);
+                                       pages[count++] = page;
+                                       page += 1 << order;
+                               }
+                       }
+               }
+       }
+
+wmark_fail:
+       spin_unlock_irqrestore(&zone->lock, flags);
+
+       return count;
+}
+
+static void prep_highorder_pages(unsigned long start_pfn, int order)
+{
+       int nr_pages = 1 << order;
+       unsigned long pfn;
+
+       for (pfn = start_pfn + 1; pfn < start_pfn + nr_pages; pfn++)
+               set_page_count(pfn_to_page(pfn), 0);
+}
+
+int alloc_pages_highorder(int order, struct page **pages, int nents)
+{
+       struct zone *zone;
+       unsigned int nr_pages = 1 << order;
+       unsigned long total_scanned = 0;
+       unsigned long pfn, tmp;
+       int remained = nents;
+       int ret;
+       int retry_count = 0;
+       int allocated;
+
+retry:
+       for_each_zone(zone) {
+               if (zone->spanned_pages == 0)
+                       continue;
+
+               allocated = alloc_freepages_range(zone, order,
+                                       pages + nents - remained, remained);
+               remained -= allocated;
+
+               if (remained == 0)
+                       return 0;
+       }
+
+       migrate_prep();
+
+       for (pfn = ALIGN(cached_scan_pfn, nr_pages);
+                       (total_scanned < (end_pfn - start_pfn) * MAX_SCAN_TRY)
+                       && (remained > 0);
+                       pfn += nr_pages, total_scanned += nr_pages) {
+               if (pfn + nr_pages > end_pfn) {
+                       pfn = start_pfn;
+                       continue;
+               }
+
+               /* pfn validation check in the range */
+               tmp = pfn;
+               do {
+                       if (!pfn_valid(tmp))
+                               break;
+               } while (++tmp < (pfn + nr_pages));
+
+               if (tmp < (pfn + nr_pages))
+                       continue;
+
+               if (!is_movable_chunk(pfn, order))
+                       continue;
+
+               ret = alloc_contig_range(pfn, pfn + nr_pages,
+                               get_pageblock_migratetype(pfn_to_page(pfn)),
+                               GFP_KERNEL);
+               if (ret == 0)
+                       prep_highorder_pages(pfn, order);
+               else
+                       continue;
+
+               pages[nents - remained] = pfn_to_page(pfn);
+               remained--;
+       }
+
+       /* save latest scanned pfn */
+       cached_scan_pfn = pfn;
+
+       if (remained) {
+               int i;
+
+               drop_slab();
+               count_vm_event(DROP_SLAB);
+               ret = hpa_killer();
+               if (ret == 0) {
+                       total_scanned = 0;
+                       pr_info("HPA: drop_slab and killer retry %d count\n",
+                               retry_count++);
+                       goto retry;
+               }
+
+               /* undo both the buddy and the migration allocations */
+               for (i = 0; i < nents - remained; i++)
+                       __free_pages(pages[i], order);
+
+               pr_info("%s: remained=%d / %d, not enough memory in order %d\n",
+                                __func__, remained, nents, order);
+
+               ret = -ENOMEM;
+       }
+
+       return ret;
+}
+
+int free_pages_highorder(int order, struct page **pages, int nents)
+{
+       int i;
+
+       for (i = 0; i < nents; i++)
+               __free_pages(pages[i], order);
+
+       return 0;
+}
+
+static int __init init_highorder_pages_allocator(void)
+{
+       struct zone *zone;
+
+       for_each_zone(zone) {
+               if (zone->spanned_pages == 0)
+                       continue;
+               if (zone_idx(zone) == ZONE_MOVABLE) {
+                       start_pfn = zone->zone_start_pfn;
+                       end_pfn = start_pfn + zone->present_pages;
+               }
+       }
+
+       if (!start_pfn) {
+               start_pfn = __phys_to_pfn(memblock_start_of_DRAM());
+               end_pfn = max_pfn;
+       }
+
+       cached_scan_pfn = start_pfn;
+
+       return 0;
+}
+late_initcall(init_highorder_pages_allocator);
diff --git a/mm/internal.h b/mm/internal.h
index 1df011f624801ffbdf3379a6d17218d9b32280e1..b0302094d4bb4e23e96fbe2fcdb53522cdcaf1d2 100644
@@ -537,4 +537,23 @@ static inline bool is_migrate_highatomic_page(struct page *page)
 }
 
 void setup_zone_pageset(struct zone *zone);
+
+/*
+ * A cached value of the page's pageblock's migratetype, used when the page is
+ * put on a pcplist. Used to avoid the pageblock migratetype lookup when
+ * freeing from pcplists in most cases, at the cost of possibly becoming stale.
+ * Also the migratetype set in the page does not necessarily match the pcplist
+ * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
+ * other index - this ensures that it will be put on the correct CMA freelist.
+ */
+static inline int get_pcppage_migratetype(struct page *page)
+{
+       return page->index;
+}
+
+static inline void set_pcppage_migratetype(struct page *page, int migratetype)
+{
+       page->index = migratetype;
+}
+
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1d7693c3542442f4c24cad8059195593e854c43c..a200d3a3e58917eb372622a2c95159faf30c956b 100644
@@ -130,24 +130,6 @@ unsigned long totalcma_pages __read_mostly;
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
-/*
- * A cached value of the page's pageblock's migratetype, used when the page is
- * put on a pcplist. Used to avoid the pageblock migratetype lookup when
- * freeing from pcplists in most cases, at the cost of possibly becoming stale.
- * Also the migratetype set in the page does not necessarily match the pcplist
- * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
- * other index - this ensures that it will be put on the correct CMA freelist.
- */
-static inline int get_pcppage_migratetype(struct page *page)
-{
-       return page->index;
-}
-
-static inline void set_pcppage_migratetype(struct page *page, int migratetype)
-{
-       page->index = migratetype;
-}
-
 #ifdef CONFIG_PM_SLEEP
 /*
  * The following functions are used by the suspend/hibernate code to temporarily