--- /dev/null
+++ b/mm/hpa.c
+/*
+ * linux/mm/hpa.c
+ *
+ * Copyright (C) 2015 Samsung Electronics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Makes a best effort to allocate the required high-order pages.
+ */
+
+#include <linux/list.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/mmzone.h>
+#include <linux/migrate.h>
+#include <linux/memcontrol.h>
+#include <linux/page-isolation.h>
+#include <linux/mm_inline.h>
+#include <linux/swap.h>
+#include <linux/scatterlist.h>
+#include <linux/debugfs.h>
+#include <linux/vmalloc.h>
+#include <linux/device.h>
+#include <linux/oom.h>
+
+#include "internal.h"
+
+#define MAX_SCAN_TRY (2)
+
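+/* pfn range to scan for movable chunks; the scan resumes at cached_scan_pfn */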
+static unsigned long start_pfn, end_pfn;
+static unsigned long cached_scan_pfn;
+
+#define HPA_MIN_OOMADJ 100
+static unsigned long hpa_deathpending_timeout;
+
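+/*
+ * Returns 1 if any thread in the thread group of @p has @flag set in its
+ * thread_info flags, 0 otherwise.
+ */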
+static int test_task_flag(struct task_struct *p, int flag)
+{
+ struct task_struct *t = p;
+
+ do {
+ task_lock(t);
+ if (test_tsk_thread_flag(t, flag)) {
+ task_unlock(t);
+ return 1;
+ }
+ task_unlock(t);
+ } while_each_thread(p, t);
+
+ return 0;
+}
+
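+/*
+ * Picks the task with the highest oom_score_adj above HPA_MIN_OOMADJ
+ * (largest RSS on a tie) and sends it SIGKILL, lowmemorykiller-style.
+ * Returns 0 if a victim was signalled or is already exiting, -ESRCH if
+ * no killable task was found.
+ */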
+static int hpa_killer(void)
+{
+ struct task_struct *tsk;
+ struct task_struct *selected = NULL;
+ int tasksize;
+ int selected_tasksize = 0;
+ short selected_oom_score_adj;
+ int ret = 0;
+
+ rcu_read_lock();
+ for_each_process(tsk) {
+ struct task_struct *p;
+ short oom_score_adj;
+
+ if (tsk->flags & PF_KTHREAD)
+ continue;
+
+ if (test_task_flag(tsk, TIF_MEMDIE))
+ continue;
+
+ p = find_lock_task_mm(tsk);
+ if (!p)
+ continue;
+
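+ /* a victim from a previous kill is still exiting; don't pick another */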
+ if (task_lmk_waiting(p) && p->mm &&
+ time_before_eq(jiffies, hpa_deathpending_timeout)) {
+ task_unlock(p);
+ rcu_read_unlock();
+ return ret;
+ }
+ oom_score_adj = p->signal->oom_score_adj;
+ tasksize = get_mm_rss(p->mm);
+ task_unlock(p);
+ if (tasksize <= 0 || oom_score_adj <= HPA_MIN_OOMADJ)
+ continue;
+ if (same_thread_group(p, current))
+ continue;
+ if (selected) {
+ if (oom_score_adj < selected_oom_score_adj)
+ continue;
+ if (oom_score_adj == selected_oom_score_adj &&
+ tasksize <= selected_tasksize)
+ continue;
+ }
+ selected = p;
+ selected_tasksize = tasksize;
+ selected_oom_score_adj = oom_score_adj;
+ }
+
+ if (selected) {
+ pr_info("HPA: Killing '%s' (%d), adj %hd freed %ldkB\n",
+ selected->comm, selected->pid,
+ selected_oom_score_adj,
+ selected_tasksize * (long)(PAGE_SIZE / 1024));
+ hpa_deathpending_timeout = jiffies + HZ;
+ task_set_lmk_waiting(selected);
+ send_sig(SIGKILL, selected, 0);
+ } else {
+ pr_info("HPA: no killable task\n");
+ ret = -ESRCH;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
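+/*
+ * Checks whether every page in the 2^order chunk at @start_pfn is either
+ * free in the buddy allocator or on an LRU list, i.e. likely migratable
+ * by alloc_contig_range().  The check is racy (zone->lock is not held),
+ * so it is only a heuristic to avoid pointless migration attempts.
+ */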
+static bool is_movable_chunk(unsigned long start_pfn, unsigned int order)
+{
+ unsigned long pfn;
+ struct page *page;
+
+ for (pfn = start_pfn; pfn < start_pfn + (1 << order); pfn++) {
+ page = pfn_to_page(pfn);
+ if (PageBuddy(page)) {
+ pfn += (1 << page_order(page)) - 1;
+ continue;
+ }
+ if (PageCompound(page))
+ return false;
+ if (PageReserved(page))
+ return false;
+ if (!PageLRU(page))
+ return false;
+ }
+ return true;
+}
+
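+/*
+ * Takes up to @required order-@order chunks directly off the free lists
+ * of @zone, carving larger free blocks into order-@order pieces.  Gives
+ * up when @zone would fall below its minimum watermark.  Returns the
+ * number of chunks stored in @pages.
+ */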
+static int alloc_freepages_range(struct zone *zone, unsigned int order,
+ struct page **pages, int required)
+{
+ unsigned int current_order;
+ unsigned int mt;
+ unsigned long wmark;
+ unsigned long flags;
+ struct free_area *area;
+ struct page *page;
+ int i;
+ int count = 0;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+ area = &(zone->free_area[current_order]);
+ wmark = min_wmark_pages(zone) + (1 << current_order);
+
+ for (mt = MIGRATE_UNMOVABLE; mt < MIGRATE_PCPTYPES; ++mt) {
+ while (!list_empty(&area->free_list[mt])) {
+ if (!zone_watermark_ok(zone, current_order,
+ wmark, 0, 0))
+ goto wmark_fail;
+ /*
+ * expanding the current free chunk is not
+ * supported here due to the complex logic of
+ * expand().
+ */
+ if ((required << order) < (1 << current_order))
+ break;
+
+ page = list_entry(area->free_list[mt].next,
+ struct page, lru);
+ list_del(&page->lru);
+ __ClearPageBuddy(page);
+ set_page_private(page, 0);
+ set_pcppage_migratetype(page, mt);
+ /*
+ * skip checking bad page state
+ * for fast allocation
+ */
+ area->nr_free--;
+ __mod_zone_page_state(zone, NR_FREE_PAGES,
+ -(1 << current_order));
+
+ required -= 1 << (current_order - order);
+
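+ /* hand the block out as 1 << (current_order - order) chunks */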
+ for (i = 1 << (current_order - order); i > 0; i--) {
+ post_alloc_hook(page, order, GFP_KERNEL);
+ pages[count++] = page;
+ page += 1 << order;
+ }
+ }
+ }
+ }
+
+wmark_fail:
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ return count;
+}
+
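+/*
+ * alloc_contig_range() hands back every page with a reference held.
+ * Drop the references on the tail pages so that the chunk can later be
+ * freed as a single high-order page with __free_pages().
+ */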
+static void prep_highorder_pages(unsigned long start_pfn, int order)
+{
+ int nr_pages = 1 << order;
+ unsigned long pfn;
+
+ for (pfn = start_pfn + 1; pfn < start_pfn + nr_pages; pfn++)
+ set_page_count(pfn_to_page(pfn), 0);
+}
+
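+/*
+ * Allocates @nents order-@order page chunks into @pages.  First grabs
+ * what it can straight from the buddy free lists, then scans for movable
+ * chunks that alloc_contig_range() can migrate out.  As a last resort it
+ * drops slab caches and kills a low-priority task, then retries.
+ * Returns 0 on success, -ENOMEM if the request could not be satisfied.
+ */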
+int alloc_pages_highorder(int order, struct page **pages, int nents)
+{
+ struct zone *zone;
+ unsigned int nr_pages = 1 << order;
+ unsigned long total_scanned = 0;
+ unsigned long pfn, tmp;
+ int remained = nents;
+ int ret = 0;
+ int mt;
+ int retry_count = 0;
+ int allocated;
+
+retry:
+ for_each_zone(zone) {
+ if (zone->spanned_pages == 0)
+ continue;
+
+ allocated = alloc_freepages_range(zone, order,
+ pages + nents - remained, remained);
+ remained -= allocated;
+
+ if (remained == 0)
+ return 0;
+ }
+
+ migrate_prep();
+
+ for (pfn = ALIGN(cached_scan_pfn, nr_pages);
+ (total_scanned < (end_pfn - start_pfn) * MAX_SCAN_TRY)
+ && (remained > 0);
+ pfn += nr_pages, total_scanned += nr_pages) {
+ if (pfn + nr_pages > end_pfn) {
+ /* wrap to the start; the loop increment realigns pfn */
+ pfn = ALIGN(start_pfn, nr_pages) - nr_pages;
+ continue;
+ }
+
+ /* pfn validation check in the range */
+ tmp = pfn;
+ do {
+ if (!pfn_valid(tmp))
+ break;
+ } while (++tmp < (pfn + nr_pages));
+
+ if (tmp < (pfn + nr_pages))
+ continue;
+
+ mt = get_pageblock_migratetype(pfn_to_page(pfn));
+ /* avoid stealing CMA reserves or racing with page isolation */
+ if (is_migrate_cma(mt) || is_migrate_isolate(mt))
+ continue;
+
+ if (!is_movable_chunk(pfn, order))
+ continue;
+
+ ret = alloc_contig_range(pfn, pfn + nr_pages, mt);
+ if (ret == 0)
+ prep_highorder_pages(pfn, order);
+ else
+ continue;
+
+ pages[nents - remained] = pfn_to_page(pfn);
+ remained--;
+ }
+
+ /* save latest scanned pfn */
+ cached_scan_pfn = pfn;
+
+ if (remained) {
+ int i;
+
+ drop_slab();
+ count_vm_event(DROP_SLAB);
+ ret = hpa_killer();
+ if (ret == 0) {
+ total_scanned = 0;
+ pr_info("HPA: drop_slab and killer retry %d count\n",
+ retry_count++);
+ goto retry;
+ }
+
+ for (i = 0; i < nents - remained; i++)
+ __free_pages(pages[i], order);
+
+ pr_info("%s: remained=%d / %d, not enough memory in order %d\n",
+ __func__, remained, nents, order);
+
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+
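+/*
+ * Returns @nents order-@order chunks previously obtained from
+ * alloc_pages_highorder() back to the page allocator.
+ */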
+int free_pages_highorder(int order, struct page **pages, int nents)
+{
+ int i;
+
+ for (i = 0; i < nents; i++)
+ __free_pages(pages[i], order);
+
+ return 0;
+}
+
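+/*
+ * Prefers ZONE_MOVABLE as the migration scan range; if no movable zone
+ * exists, all of DRAM is scanned.
+ */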
+static int __init init_highorder_pages_allocator(void)
+{
+ struct zone *zone;
+
+ for_each_zone(zone) {
+ if (zone->spanned_pages == 0)
+ continue;
+ if (zone_idx(zone) == ZONE_MOVABLE) {
+ start_pfn = zone->zone_start_pfn;
+ end_pfn = zone_end_pfn(zone);
+ }
+ }
+
+ if (!start_pfn) {
+ start_pfn = __phys_to_pfn(memblock_start_of_DRAM());
+ end_pfn = max_pfn;
+ }
+
+ cached_scan_pfn = start_pfn;
+
+ return 0;
+}
+late_initcall(init_highorder_pages_allocator);