From 0d3f347e10196157c27158dd6b49aa59ec33d22c Mon Sep 17 00:00:00 2001
From: tao zeng
Date: Mon, 15 Oct 2018 15:20:38 +0800
Subject: [PATCH] mm: optimize thread stack usage on arm64 [1/1]

PD#SWPL-1219

Problem:
On arm64, the thread stack is 16KB per task. When many tasks are
running, this memory can exceed 40MB (e.g. 2560 tasks * 16KB), which
is a large amount on small-memory platforms. In most cases a thread
uses less than 4KB of stack, so much of this memory is wasted and
worth optimizing.

Solution:
1. Pre-allocate a vmalloc address space for task stacks;
2. Map only the first (top) page of each stack, and handle the EL1
   page fault raised when stack growth touches an unmapped page;
3. Handle the stack switch for exceptions.

Verify: p212

Change-Id: I47f511ccfa2868d982bc10a820ed6435b6d52ba9
Signed-off-by: tao zeng
---
 MAINTAINERS                             |   2 +
 arch/arm64/kernel/entry.S               | 130 ++++++
 arch/arm64/kernel/hw_breakpoint.c       |   5 +
 arch/arm64/kernel/smp.c                 |  10 +
 arch/arm64/kernel/stacktrace.c          |  13 +
 arch/arm64/kernel/traps.c               |  24 ++
 drivers/amlogic/memory_ext/Kconfig      |  11 +
 drivers/amlogic/memory_ext/Makefile     |   1 +
 drivers/amlogic/memory_ext/vmap_stack.c | 536 ++++++++++++++++++++++++
 drivers/amlogic/pm/gx_pm.c              |   6 +
 fs/proc/meminfo.c                       |   6 +
 include/linux/amlogic/vmap_stack.h      |  66 +++
 include/linux/sched.h                   |   4 +
 include/linux/vmalloc.h                 |   6 +
 kernel/fork.c                           |  27 ++
 mm/vmalloc.c                            |   6 +
 16 files changed, 853 insertions(+)
 create mode 100644 drivers/amlogic/memory_ext/vmap_stack.c
 create mode 100644 include/linux/amlogic/vmap_stack.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 925114c88a9b..77edf2f35947 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13520,6 +13520,8 @@ AMLOGIC driver for memory extend
 M: Tao Zeng
 F: drivers/amlogic/memory_ext/*
 F: include/linux/amlogic/ramdump.h
+F: include/linux/amlogic/vmap_stack.h
+F: drivers/amlogic/memory_ext/vmap_stack.c

 AMLOGIC driver for memory extend
 M: Tao Zeng
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index fead7137c8d2..8c075154b8e9 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -189,7 +189,11 @@ alternative_else_nop_endif
	 */
	.endm

+#ifdef CONFIG_AMLOGIC_VMAP
+	.macro	kernel_exit, el, swap = 0
+#else
	.macro	kernel_exit, el
+#endif /* CONFIG_AMLOGIC_VMAP */
	.if	\el != 0
	/* Restore the task's original addr_limit. */
	ldr	x20, [sp, #S_ORIG_ADDR_LIMIT]
@@ -271,6 +275,18 @@ alternative_else_nop_endif
	ldp	x26, x27, [sp, #16 * 13]
	ldp	x28, x29, [sp, #16 * 14]
	ldr	lr, [sp, #S_LR]
+#ifdef CONFIG_AMLOGIC_VMAP
+	/* restore context sp and per-cpu vmap stack */
+	.if	\swap == 1
+	stp	x19, x20, [sp]
+	mov	x20, sp
+	add	x19, x20, #S_FRAME_SIZE
+	msr	DBGWVR3_EL1, x19
+	mrs	x19, DBGWVR2_EL1
+	mov	sp, x19
+	ldp	x19, x20, [x20]
+	.endif
+#endif /* CONFIG_AMLOGIC_VMAP */
	add	sp, sp, #S_FRAME_SIZE		// restore sp

	.if	\el == 0
@@ -313,8 +329,10 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
	 * Add a dummy stack frame, this non-standard format is fixed up
	 * by unwind_frame()
	 */
+#ifndef CONFIG_AMLOGIC_VMAP /* we need the right stack for el1_preempt */
	stp	x29, x19, [sp, #-16]!
mov x29, sp +#endif /* !CONFIG_AMLOGIC_VMAP */ 9998: .endm @@ -447,6 +465,26 @@ ENDPROC(el1_error_invalid) */ .align 6 el1_sync: +#ifdef CONFIG_AMLOGIC_VMAP + /* + * register using: + * DBGWVR2_EL1: temp register and back up for sp_el1 of exception + * DBGWVR3_EL1: always point to per-cpu vmap stack + * switch sp_el1 to per-cpu vmap stack and using DBGWVR2_EL1 + * to back up sp_el1 under exception + */ + msr DBGWVR2_EL1, x29 + mrs x29, DBGWVR3_EL1 + sub x29, x29, #S_FRAME_SIZE + msr DBGWVR3_EL1, x29 + stp x19, x20, [x29] + mov x19, sp + mrs x20, DBGWVR2_EL1 + msr DBGWVR2_EL1, x19 + mov sp, x29 + mov x29, x20 + ldp x19, x20, [sp] +#endif /* CONFIG_AMLOGIC_VMAP */ kernel_entry 1 mrs x1, esr_el1 // read the syndrome register lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class @@ -474,6 +512,38 @@ el1_da: /* * Data abort handling */ +#ifdef CONFIG_AMLOGIC_VMAP + /* + * first handle vmap page fault, if result is not ok(eg, fault address + * is not in vmap range), then do normal data abort + */ + mrs x0, far_el1 + mov x2, sp + stp x29, x30, [sp, #-16]! /* add a stack frame for backtrace */ + mov x29, sp + bl handle_vmap_fault + ldp x29, x22, [sp], #16 + cmp x0, #0 + b.ne 888888f + kernel_exit 1, 1 /* exit for vmap fault */ +888888: + /* + * Not a vmap fault, copy context saved in per-cpu vmap stack + * to task stack, then switch stack back to task stack under + * exception + */ + mrs x0, DBGWVR2_EL1 + mov x1, sp + mov x2, #S_FRAME_SIZE + bl memcpy + add x17, x0, #S_FRAME_SIZE + str x17, [x0, #S_SP] + mrs x1, esr_el1 /* rebuild parameter for normal handler */ + mrs x18, DBGWVR3_EL1 + add x18, x18, #S_FRAME_SIZE + msr DBGWVR3_EL1, x18 + mov sp, x0 +#endif /* CONFIG_AMLOGIC_VMAP */ mrs x3, far_el1 enable_dbg // re-enable interrupts if they were enabled in the aborted context @@ -503,6 +573,20 @@ el1_undef: mov x0, sp b do_undefinstr el1_dbg: +#ifdef CONFIG_AMLOGIC_VMAP + /* switch back to task stack pointer */ + mrs x0, DBGWVR2_EL1 + mov x1, sp + mov x2, #S_FRAME_SIZE + bl memcpy + add x17, x0, #S_FRAME_SIZE + str x17, [x0, #S_SP] + mrs x1, esr_el1 /* rebuild parameter for normal handler */ + mrs x18, DBGWVR3_EL1 + add x18, x18, #S_FRAME_SIZE + msr DBGWVR3_EL1, x18 + mov sp, x0 +#endif /* CONFIG_AMLOGIC_VMAP */ /* * Debug exception handling */ @@ -527,7 +611,36 @@ ENDPROC(el1_sync) .align 6 el1_irq: +#ifdef CONFIG_AMLOGIC_VMAP + /* switch stack to avoid ELR lost if el1_da + * happen when saving context + */ + msr DBGWVR2_EL1, x29 + mrs x29, DBGWVR3_EL1 + sub x29, x29, #S_FRAME_SIZE + msr DBGWVR3_EL1, x29 + stp x19, x20, [x29] + mov x19, sp + mrs x20, DBGWVR2_EL1 + msr DBGWVR2_EL1, x19 + mov sp, x29 + mov x29, x20 + ldp x19, x20, [sp] +#endif /* CONFIG_AMLOGIC_VMAP */ kernel_entry 1 +#ifdef CONFIG_AMLOGIC_VMAP + /* switch back to task stack pointer */ + mrs x0, DBGWVR2_EL1 + mov x1, sp + mov x2, #S_FRAME_SIZE + mov sp, x0 + bl memcpy + add x17, sp, #S_FRAME_SIZE + str x17, [sp, #S_SP] + mrs x18, DBGWVR3_EL1 + add x18, x18, #S_FRAME_SIZE + msr DBGWVR3_EL1, x18 +#endif /* CONFIG_AMLOGIC_VMAP */ enable_dbg #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off @@ -1035,3 +1148,20 @@ ENTRY(sys_rt_sigreturn_wrapper) mov x0, sp b sys_rt_sigreturn ENDPROC(sys_rt_sigreturn_wrapper) + +#ifdef CONFIG_AMLOGIC_VMAP +ENTRY(__setup_vmap_stack) + ldr x18, =vmap_stack + add x18, x18, x0 + mov x0, x18 + mov x1, #0 + mov x2, #THREAD_SIZE + mov x17, lr + bl memset /* clear stack buffer */ + mov lr, x17 + mov x0, #THREAD_START_SP + add x18, x18, x0 /* set stack top */ + msr DBGWVR3_EL1, x18 + ret +ENDPROC(__setup_vmap_stack) 
+#endif /* CONFIG_AMLOGIC_VMAP */ diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index fb0082ab40a7..0798abd4d692 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -133,6 +133,11 @@ NOKPROBE_SYMBOL(read_wb_reg); static void write_wb_reg(int reg, int n, u64 val) { +#ifdef CONFIG_AMLOGIC_VMAP + /* avoid write DBGWVR since we use it for special purpose */ + if (reg >= AARCH64_DBG_REG_WVR && reg < AARCH64_DBG_REG_WCR) + return; +#endif switch (reg + n) { GEN_WRITE_WB_REG_CASES(AARCH64_DBG_REG_BVR, AARCH64_DBG_REG_NAME_BVR, val); GEN_WRITE_WB_REG_CASES(AARCH64_DBG_REG_BCR, AARCH64_DBG_REG_NAME_BCR, val); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 4097031ea407..0185f898dced 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -59,6 +59,10 @@ #include #endif +#ifdef CONFIG_AMLOGIC_VMAP +#include +#endif + #define CREATE_TRACE_POINTS #include @@ -226,6 +230,9 @@ asmlinkage void secondary_start_kernel(void) cpu = task_cpu(current); set_my_cpu_offset(per_cpu_offset(cpu)); +#ifdef CONFIG_AMLOGIC_VMAP + __setup_vmap_stack(my_cpu_offset); +#endif /* * All kernel threads share the same mm context; grab a @@ -446,6 +453,9 @@ void __init smp_cpus_done(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { set_my_cpu_offset(per_cpu_offset(smp_processor_id())); +#ifdef CONFIG_AMLOGIC_VMAP + __setup_vmap_stack(my_cpu_offset); +#endif /* * Initialise the static keys early as they may be enabled by the * cpufeature code. diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 5201bebcec07..28dee26114bb 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -25,6 +25,10 @@ #include #include +#ifdef CONFIG_AMLOGIC_VMAP +#include +#endif + /* * AArch64 PCS assigns the frame pointer to x29. 
* @@ -117,6 +121,15 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) return -EINVAL; } } +#ifdef CONFIG_AMLOGIC_VMAP + /* + * keep search stack for task + */ + if (on_vmap_stack(frame->sp, raw_smp_processor_id()) && + !on_vmap_stack(frame->fp, raw_smp_processor_id())) { + frame->sp = frame->fp; + } +#endif return 0; } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index b2b036beb019..6e2d130f3306 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -97,6 +97,21 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom, set_fs(fs); } +#ifdef CONFIG_AMLOGIC_VMAP +static void dump_backtrace_entry(unsigned long ip, unsigned long fp) +{ + unsigned long fp_size = 0; + + if (fp >= VMALLOC_START) { + fp_size = *((unsigned long *)fp) - fp; + /* fp cross IRQ or vmap stack */ + if (fp_size >= THREAD_SIZE) + fp_size = 0; + } + printk("[%016lx+%4ld][<%p>] %pS\n", + fp, fp_size, (void *) ip, (void *) ip); +} +#else static void dump_backtrace_entry(unsigned long where) { /* @@ -104,6 +119,7 @@ static void dump_backtrace_entry(unsigned long where) */ print_ip_sym(where); } +#endif static void __dump_instr(const char *lvl, struct pt_regs *regs) { @@ -186,7 +202,11 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) /* skip until specified stack frame */ if (!skip) { + #ifdef CONFIG_AMLOGIC_VMAP + dump_backtrace_entry(where, frame.fp); + #else dump_backtrace_entry(where); + #endif } else if (frame.fp == regs->regs[29]) { skip = 0; /* @@ -196,7 +216,11 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) * at which an exception has taken place, use regs->pc * instead. */ + #ifdef CONFIG_AMLOGIC_VMAP + dump_backtrace_entry(regs->pc, frame.fp); + #else dump_backtrace_entry(regs->pc); + #endif } ret = unwind_frame(tsk, &frame); if (ret < 0) diff --git a/drivers/amlogic/memory_ext/Kconfig b/drivers/amlogic/memory_ext/Kconfig index cef724ab3cf4..5b6f2fc42f7d 100644 --- a/drivers/amlogic/memory_ext/Kconfig +++ b/drivers/amlogic/memory_ext/Kconfig @@ -39,6 +39,17 @@ config AMLOGIC_CMA Amlogic CMA optimization for cma alloc/free problems Including policy change of CMA usage +config AMLOGIC_VMAP + bool "Amlogic kernel stack" + depends on AMLOGIC_MEMORY_EXTEND + depends on 64BIT + default y + help + This config is used to enable amlogic kernel stack + usage optimization with vmalloc. It depends on + AMLOGIC_MEMORY_EXTEND. This config only opened + on 64 bit platform. + config AMLOGIC_SLUB_DEBUG bool "Amlogic debug for trace all slub objects" depends on AMLOGIC_PAGE_TRACE diff --git a/drivers/amlogic/memory_ext/Makefile b/drivers/amlogic/memory_ext/Makefile index f3c121638eb0..8d3c2666ac00 100644 --- a/drivers/amlogic/memory_ext/Makefile +++ b/drivers/amlogic/memory_ext/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_AMLOGIC_PAGE_TRACE) += page_trace.o obj-$(CONFIG_AMLOGIC_CMA) += aml_cma.o obj-$(CONFIG_AMLOGIC_SLUB_DEBUG) += aml_slub_debug.o obj-$(CONFIG_AMLOGIC_RAMDUMP) += ram_dump.o +obj-$(CONFIG_AMLOGIC_VMAP) += vmap_stack.o diff --git a/drivers/amlogic/memory_ext/vmap_stack.c b/drivers/amlogic/memory_ext/vmap_stack.c new file mode 100644 index 000000000000..687a1a610e8d --- /dev/null +++ b/drivers/amlogic/memory_ext/vmap_stack.c @@ -0,0 +1,536 @@ +/* + * drivers/amlogic/memory_ext/vmap_stack.c + * + * Copyright (C) 2017 Amlogic, Inc. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG 0 + +#define D(format, args...) \ + { if (DEBUG) \ + pr_info("VMAP:%s "format, __func__, ##args); \ + } + +#define E(format, args...) pr_err("VMAP:%s "format, __func__, ##args) + +static unsigned long stack_shrink_jiffies; +static unsigned char vmap_shrink_enable; +static atomic_t vmap_stack_size; +static struct aml_vmap *avmap; + +DEFINE_PER_CPU(unsigned long [THREAD_SIZE/sizeof(long)], vmap_stack) + __aligned(16); + +void update_vmap_stack(int diff) +{ + atomic_add(diff, &vmap_stack_size); +} +EXPORT_SYMBOL(update_vmap_stack); + +int get_vmap_stack_size(void) +{ + return atomic_read(&vmap_stack_size); +} +EXPORT_SYMBOL(get_vmap_stack_size); + +static int is_vmap_addr(unsigned long addr) +{ + unsigned long start, end; + + start = (unsigned long)avmap->root_vm->addr; + end = (unsigned long)avmap->root_vm->addr + avmap->root_vm->size; + if ((addr >= start) && (addr < end)) + return 1; + else + return 0; +} + +static struct page *get_vmap_cached_page(int *remain) +{ + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&avmap->page_lock, flags); + if (unlikely(!avmap->cached_pages)) { + spin_unlock_irqrestore(&avmap->page_lock, flags); + return NULL; + } + page = list_first_entry(&avmap->list, struct page, lru); + list_del(&page->lru); + avmap->cached_pages--; + *remain = avmap->cached_pages; + spin_unlock_irqrestore(&avmap->page_lock, flags); + + return page; +} + +static int vmap_mmu_set(struct page *page, unsigned long addr, int set) +{ + pgd_t *pgd = NULL; + pud_t *pud = NULL; + pmd_t *pmd = NULL; + pte_t *pte = NULL; + + pgd = pgd_offset_k(addr); + pud = pud_alloc(&init_mm, pgd, addr); + if (!pud) + goto nomem; + + if (pud_none(*pud)) { + pmd = pmd_alloc(&init_mm, pud, addr); + if (!pmd) + goto nomem; + } + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + pte = pte_alloc_kernel(pmd, addr); + if (!pte) + goto nomem; + } + + pte = pte_offset_map(pmd, addr); + if (set) + set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); + else + pte_clear(&init_mm, addr, pte); + pte_unmap(pte); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + D("add:%lx, pgd:%p %llx, pmd:%p %llx, pte:%p %llx\n", + addr, pgd, pgd_val(*pgd), pmd, pmd_val(*pmd), + pte, pte_val(*pte)); + return 0; +nomem: + E("allocation page talbe failed, G:%p, U:%p, M:%p, T:%p", + pgd, pud, pmd, pte); + return -ENOMEM; +} + +static int stack_floor_page(unsigned long addr) +{ + /* + * stack address must align to THREAD_SIZE + */ + return ((addr & (THREAD_SIZE - 1)) < PAGE_SIZE); +} + +static int check_addr_up_flow(unsigned long addr) +{ + /* + * It's the first page of 4 contigours virtual address + * rage(aligned to THREAD_SIZE) but next page of this + * addr is not mapped + */ + if (stack_floor_page(addr) && + !vmalloc_to_page((const void *)(addr + PAGE_SIZE))) + return 1; + return 
0; +} + +#if DEBUG +static void dump_backtrace_entry(unsigned long ip, unsigned long fp) +{ + unsigned long fp_size = 0; + + if (fp >= VMALLOC_START) { + fp_size = *((unsigned long *)fp) - fp; + /* fp cross IRQ or vmap stack */ + if (fp_size >= THREAD_SIZE) + fp_size = 0; + } + pr_info("[%016lx+%4ld][<%p>] %pS\n", + fp, fp_size, (void *) ip, (void *) ip); +} + +static void show_fault_stack(unsigned long addr, struct pt_regs *regs) +{ + struct stackframe frame; + + frame.fp = regs->regs[29]; + frame.sp = addr; + frame.pc = (unsigned long)regs->regs[30]; + + pr_info("Call trace:\n"); + pr_info("[%016lx+%4ld][<%p>] %pS\n", + addr, frame.fp - addr, (void *)regs->pc, (void *) regs->pc); + while (1) { + int ret; + + dump_backtrace_entry(frame.pc, frame.fp); + ret = unwind_frame(current, &frame); + if (ret < 0) + break; + } +} +#endif + +/* + * IRQ should *NEVER* been opened in this handler + */ +int handle_vmap_fault(unsigned long addr, unsigned int esr, + struct pt_regs *regs) +{ + struct page *page; + int cache = 0; + + if (!is_vmap_addr(addr)) + return -EINVAL; + + D("addr:%lx, esr:%x, task:%5d %s\n", + addr, esr, current->pid, current->comm); + D("pc:%pf, %llx, lr:%pf, %llx, sp:%llx, %lx\n", + (void *)regs->pc, regs->pc, + (void *)regs->regs[30], regs->regs[30], regs->sp, + current_stack_pointer); + + if (check_addr_up_flow(addr)) { + E("address %lx out of range\n", addr); + E("PC is:%llx, %pf, LR is:%llx %pf\n", + regs->pc, (void *)regs->pc, + regs->regs[30], (void *)regs->regs[30]); + E("task:%d %s, stack:%p, %lx\n", + current->pid, current->comm, current->stack, + current_stack_pointer); + dump_stack(); + return -ERANGE; + } + + /* + * allocate a new page for vmap + */ + page = get_vmap_cached_page(&cache); + WARN_ON(!page); + vmap_mmu_set(page, addr, 1); + update_vmap_stack(1); + if ((THREAD_SIZE_ORDER > 1) && stack_floor_page(addr)) { + E("task:%d %s, stack near overflow, addr:%lx\n", + current->pid, current->comm, addr); + dump_stack(); + } + + /* cache is not enough */ + if (cache <= (VMAP_CACHE_PAGE / 2)) + mod_delayed_work(system_highpri_wq, &avmap->mwork, 0); + + D("map page:%5lx for addr:%lx\n", page_to_pfn(page), addr); +#if DEBUG + show_fault_stack(addr, regs); +#endif + + return 0; +} +EXPORT_SYMBOL(handle_vmap_fault); + +static unsigned long vmap_shrink_count(struct shrinker *s, + struct shrink_control *sc) +{ + return global_page_state(NR_KERNEL_STACK_KB); +} + +static int shrink_vm_stack(unsigned long low, unsigned long high) +{ + int pages = 0; + struct page *page; + + for (; low < (high & PAGE_MASK); low += PAGE_SIZE) { + page = vmalloc_to_page((const void *)low); + vmap_mmu_set(page, low, 0); + update_vmap_stack(-1); + __free_page(page); + pages++; + } + return pages; +} + +static unsigned long get_task_stack_floor(unsigned long sp) +{ + unsigned long end; + + end = sp & (THREAD_SIZE - 1); + while (sp > end) { + if (!vmalloc_to_page((const void *)sp)) + break; + sp -= PAGE_SIZE; + } + return PAGE_ALIGN(sp); +} + +static unsigned long vmap_shrink_scan(struct shrinker *s, + struct shrink_control *sc) +{ + struct task_struct *tsk; + unsigned long thread_sp; + unsigned long stack_floor; + unsigned long rem = 0; + + if (!vmap_shrink_enable) + return 0; + + /* + * sleep for a while if shrink too ofen + */ + if (jiffies - stack_shrink_jiffies <= STACK_SHRINK_SLEEP) + return 0; + + rcu_read_lock(); + for_each_process(tsk) { + thread_sp = thread_saved_sp(tsk); + stack_floor = get_task_stack_floor(thread_sp); + /* + * Make sure selected task is sleeping + */ + D("r:%3ld, 
sp:[%lx-%lx], s:%5ld, tsk:%lx %d %s\n", + rem, thread_sp, stack_floor, + thread_sp - stack_floor, + tsk->state, tsk->pid, tsk->comm); + task_lock(tsk); + if (tsk->state == TASK_RUNNING) { + task_unlock(tsk); + continue; + } + if (thread_sp - stack_floor >= STACK_SHRINK_THRESHOLD) + rem += shrink_vm_stack(stack_floor, thread_sp); + task_unlock(tsk); + } + rcu_read_unlock(); + stack_shrink_jiffies = jiffies; + + return rem; +} + +static struct shrinker vmap_shrinker = { + .scan_objects = vmap_shrink_scan, + .count_objects = vmap_shrink_count, + .seeks = DEFAULT_SEEKS * 16 +}; + +/* FOR debug */ +static unsigned long vmap_debug_jiff; + +void aml_account_task_stack(struct task_struct *tsk, int account) +{ + unsigned long stack = (unsigned long)task_stack_page(tsk); + struct page *first_page; + + stack += STACK_TOP_PAGE_OFF; + first_page = vmalloc_to_page((void *)stack); + mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, + THREAD_SIZE / 1024 * account); + + memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + if (time_after(jiffies, vmap_debug_jiff + HZ * 5)) { + vmap_debug_jiff = jiffies; + D("KERNEL_STACK:%ld KB, vmap stack:%d KB, cached:%d KB\n", + global_page_state(NR_KERNEL_STACK_KB), + get_vmap_stack_size() << (PAGE_SHIFT - 10), + avmap->cached_pages << (PAGE_SHIFT - 10)); + } +} + +void *aml_stack_alloc(int node, struct task_struct *tsk) +{ + unsigned long bitmap_no, raw_start; + struct page *page; + unsigned long addr, map_addr, flags; + + spin_lock_irqsave(&avmap->vmap_lock, flags); + raw_start = avmap->start_bit; + bitmap_no = find_next_zero_bit(avmap->bitmap, MAX_TASKS, + avmap->start_bit); + avmap->start_bit = bitmap_no + 1; /* next idle address space */ + if (bitmap_no >= MAX_TASKS) { + spin_unlock_irqrestore(&avmap->vmap_lock, flags); + E("BITMAP FULL!!!\n"); + return NULL; + } + bitmap_set(avmap->bitmap, bitmap_no, 1); + spin_unlock_irqrestore(&avmap->vmap_lock, flags); + + page = alloc_page(THREADINFO_GFP | __GFP_ZERO); + if (!page) { + spin_lock_irqsave(&avmap->vmap_lock, flags); + bitmap_clear(avmap->bitmap, bitmap_no, 1); + spin_unlock_irqrestore(&avmap->vmap_lock, flags); + E("alloction page failed\n"); + return NULL; + } + /* + * map first page only + */ + addr = (unsigned long)avmap->root_vm->addr + THREAD_SIZE * bitmap_no; + map_addr = addr + STACK_TOP_PAGE_OFF; + vmap_mmu_set(page, map_addr, 1); + update_vmap_stack(1); + D("bit idx:%5ld, start:%5ld, addr:%lx, page:%lx\n", + bitmap_no, raw_start, addr, page_to_pfn(page)); + + return (void *)addr; +} + +void aml_stack_free(struct task_struct *tsk) +{ + unsigned long stack = (unsigned long)tsk->stack; + unsigned long addr, bitmap_no; + struct page *page; + unsigned long flags; + + addr = stack + STACK_TOP_PAGE_OFF; + for (; addr >= stack; addr -= PAGE_SIZE) { + page = vmalloc_to_page((const void *)addr); + if (!page) + break; + vmap_mmu_set(page, addr, 0); + /* supplement for stack page cache first */ + spin_lock_irqsave(&avmap->page_lock, flags); + if (avmap->cached_pages < VMAP_CACHE_PAGE) { + list_add_tail(&page->lru, &avmap->list); + avmap->cached_pages++; + spin_unlock_irqrestore(&avmap->page_lock, flags); + clear_highpage(page); /* clear for next use */ + } else { + spin_unlock_irqrestore(&avmap->page_lock, flags); + __free_page(page); + } + update_vmap_stack(-1); + } + bitmap_no = (stack - (unsigned long)avmap->root_vm->addr) / THREAD_SIZE; + spin_lock_irqsave(&avmap->vmap_lock, flags); + bitmap_clear(avmap->bitmap, bitmap_no, 1); + if (bitmap_no < 
avmap->start_bit) + avmap->start_bit = bitmap_no; + spin_unlock_irqrestore(&avmap->vmap_lock, flags); +} + +static void page_cache_maintain_work(struct work_struct *work) +{ + struct page *page; + struct list_head head; + int i, cnt; + unsigned long flags; + + spin_lock_irqsave(&avmap->page_lock, flags); + cnt = avmap->cached_pages; + spin_unlock_irqrestore(&avmap->page_lock, flags); + if (cnt >= VMAP_CACHE_PAGE) { + D("cache full cnt:%d\n", cnt); + schedule_delayed_work(&avmap->mwork, CACHE_MAINTAIN_DELAY); + return; + } + + INIT_LIST_HEAD(&head); + for (i = 0; i < VMAP_CACHE_PAGE - cnt; i++) { + page = alloc_page(GFP_KERNEL | __GFP_HIGH); + if (!page) { + E("get page failed, allocated:%d, cnt:%d\n", i, cnt); + break; + } + list_add(&page->lru, &head); + } + spin_lock_irqsave(&avmap->page_lock, flags); + list_splice(&head, &avmap->list); + avmap->cached_pages += i; + spin_unlock_irqrestore(&avmap->page_lock, flags); + D("add %d pages, cnt:%d\n", i, cnt); + schedule_delayed_work(&avmap->mwork, CACHE_MAINTAIN_DELAY); +} + +int __init start_thread_work(void) +{ + schedule_delayed_work(&avmap->mwork, CACHE_MAINTAIN_DELAY); + return 0; +} +arch_initcall(start_thread_work); + +void __init thread_stack_cache_init(void) +{ + int i; + unsigned long addr; + struct page *page; + + page = alloc_pages(GFP_KERNEL, VMAP_CACHE_PAGE_ORDER); + if (!page) + return; + + avmap = kzalloc(sizeof(struct aml_vmap), GFP_KERNEL); + if (!avmap) { + __free_pages(page, VMAP_CACHE_PAGE_ORDER); + return; + } + + avmap->bitmap = kzalloc(MAX_TASKS / 8, GFP_KERNEL); + if (!avmap->bitmap) { + __free_pages(page, VMAP_CACHE_PAGE_ORDER); + kfree(avmap); + return; + } + pr_info("%s, vmap:%p, bitmap:%p, cache page:%lx\n", + __func__, avmap, avmap->bitmap, page_to_pfn(page)); + avmap->root_vm = __get_vm_area_node(VM_STACK_AREA_SIZE, + VM_STACK_AREA_SIZE, + 0, VMALLOC_START, VMALLOC_END, + NUMA_NO_NODE, GFP_KERNEL, + __builtin_return_address(0)); + if (!avmap->root_vm) { + __free_pages(page, VMAP_CACHE_PAGE_ORDER); + kfree(avmap->bitmap); + kfree(avmap); + return; + } + pr_info("%s, allocation vm area:%p, addr:%p, size:%lx\n", __func__, + avmap->root_vm, avmap->root_vm->addr, + avmap->root_vm->size); + + INIT_LIST_HEAD(&avmap->list); + spin_lock_init(&avmap->page_lock); + spin_lock_init(&avmap->vmap_lock); + + for (i = 0; i < VMAP_CACHE_PAGE; i++) { + list_add(&page->lru, &avmap->list); + page++; + } + avmap->cached_pages = VMAP_CACHE_PAGE; + INIT_DELAYED_WORK(&avmap->mwork, page_cache_maintain_work); + + for_each_possible_cpu(i) { + addr = (unsigned long)per_cpu_ptr(vmap_stack, i); + pr_info("cpu %d, vmap_stack:[%lx-%lx]\n", + i, addr, addr + THREAD_START_SP); + addr = (unsigned long)per_cpu_ptr(irq_stack, i); + pr_info("cpu %d, irq_stack: [%lx-%lx]\n", + i, addr, addr + THREAD_START_SP); + } + register_shrinker(&vmap_shrinker); +} diff --git a/drivers/amlogic/pm/gx_pm.c b/drivers/amlogic/pm/gx_pm.c index 8228001d15b3..dbf91515e34e 100644 --- a/drivers/amlogic/pm/gx_pm.c +++ b/drivers/amlogic/pm/gx_pm.c @@ -42,6 +42,9 @@ #include #include <../kernel/power/power.h> #include +#ifdef CONFIG_AMLOGIC_VMAP +#include +#endif typedef unsigned long (psci_fn)(unsigned long, unsigned long, unsigned long, unsigned long); @@ -82,6 +85,9 @@ static void meson_gx_suspend(void) /* cpu_suspend(0, meson_system_suspend); */ pr_info("... 
wake up\n"); +#ifdef CONFIG_AMLOGIC_VMAP + __setup_vmap_stack(my_cpu_offset); +#endif } static int meson_pm_prepare(void) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8a428498d6b2..58bd1745012b 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -15,6 +15,9 @@ #ifdef CONFIG_CMA #include #endif +#ifdef CONFIG_AMLOGIC_VMAP +#include +#endif #include #include #include "internal.h" @@ -153,6 +156,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "CmaFree: ", global_page_state(NR_FREE_CMA_PAGES)); #endif +#ifdef CONFIG_AMLOGIC_VMAP + show_val_kb(m, "VmapStack: ", get_vmap_stack_size()); +#endif hugetlb_report_meminfo(m); diff --git a/include/linux/amlogic/vmap_stack.h b/include/linux/amlogic/vmap_stack.h new file mode 100644 index 000000000000..8f7a36f5110e --- /dev/null +++ b/include/linux/amlogic/vmap_stack.h @@ -0,0 +1,66 @@ +/* + * include/linux/amlogic/vmap_stack.h + * + * Copyright (C) 2017 Amlogic, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef __VMAP_STACK_H__ +#define __VMAP_STACK_H__ + +#define STACK_SHRINK_THRESHOLD (PAGE_SIZE + 1024) +#define STACK_SHRINK_SLEEP (HZ) +#define VM_STACK_AREA_SIZE SZ_512M + +#define STACK_TOP_PAGE_OFF (THREAD_SIZE - PAGE_SIZE) + +#define MAX_TASKS (VM_STACK_AREA_SIZE / THREAD_SIZE) + +#define VMAP_PAGE_FLAG (__GFP_ZERO | __GFP_HIGH |\ + __GFP_ATOMIC | __GFP_REPEAT) + +#define VMAP_CACHE_PAGE_ORDER 6 +#define VMAP_CACHE_PAGE (1 << VMAP_CACHE_PAGE_ORDER) +#define CACHE_MAINTAIN_DELAY (HZ) + +struct aml_vmap { + unsigned int start_bit; + int cached_pages; + struct vm_struct *root_vm; + unsigned long *bitmap; + struct list_head list; + spinlock_t vmap_lock; + spinlock_t page_lock; + struct delayed_work mwork; +}; + +extern int handle_vmap_fault(unsigned long addr, + unsigned int esr, struct pt_regs *regs); + +extern DEFINE_PER_CPU(unsigned long [THREAD_SIZE/sizeof(long)], vmap_stack); +static inline bool on_vmap_stack(unsigned long sp, int cpu) +{ + /* variable names the same as kernel/stacktrace.c */ + unsigned long low = (unsigned long)per_cpu(vmap_stack, cpu); + unsigned long high = low + THREAD_START_SP; + + return (low <= sp && sp <= high); +} + +extern void __setup_vmap_stack(unsigned long off); +extern void update_vmap_stack(int diff); +extern int get_vmap_stack_size(void); +extern void aml_stack_free(struct task_struct *tsk); +extern void *aml_stack_alloc(int node, struct task_struct *tsk); +extern void aml_account_task_stack(struct task_struct *tsk, int account); +#endif /* __VMAP_STACK_H__ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3e3f559f9a39..9ab2bf1de373 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3377,8 +3377,12 @@ static inline void *try_get_task_stack(struct task_struct *tsk) static inline void put_task_stack(struct task_struct *tsk) {} #endif +#ifdef CONFIG_AMLOGIC_VMAP +#define task_stack_end_corrupted(task) (false) +#else #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) +#endif static inline int 
object_is_on_stack(void *obj) { diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3d9d786a943c..2f4fc62a7bd6 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -192,4 +192,10 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) int register_vmap_purge_notifier(struct notifier_block *nb); int unregister_vmap_purge_notifier(struct notifier_block *nb); +#ifdef CONFIG_AMLOGIC_VMAP +extern struct vm_struct *__get_vm_area_node(unsigned long size, + unsigned long align, unsigned long flags, unsigned long start, + unsigned long end, int node, + gfp_t gfp_mask, const void *caller); +#endif #endif /* _LINUX_VMALLOC_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 24ce22c41f21..232244644042 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -86,6 +86,10 @@ #include #include +#ifdef CONFIG_AMLOGIC_VMAP +#include +#endif + #include #define CREATE_TRACE_POINTS @@ -206,15 +210,22 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) tsk->stack_vm_area = find_vm_area(stack); return stack; #else +#ifdef CONFIG_AMLOGIC_VMAP + return aml_stack_alloc(node, tsk); +#else /* CONFIG_AMLOGIC_VMAP */ struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; +#endif /* CONFIG_AMLOGIC_VMAP */ #endif } static inline void free_thread_stack(struct task_struct *tsk) { +#ifdef CONFIG_AMLOGIC_VMAP + aml_stack_free(tsk); +#else /* CONFIG_AMLOGIC_VMAP */ kaiser_unmap_thread_stack(tsk->stack); #ifdef CONFIG_VMAP_STACK if (task_stack_vm_area(tsk)) { @@ -238,6 +249,7 @@ static inline void free_thread_stack(struct task_struct *tsk) #endif __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); +#endif /* CONFIG_AMLOGIC_VMAP */ } # else static struct kmem_cache *thread_stack_cache; @@ -282,6 +294,9 @@ static struct kmem_cache *mm_cachep; static void account_kernel_stack(struct task_struct *tsk, int account) { +#ifdef CONFIG_AMLOGIC_VMAP + aml_account_task_stack(tsk, account); +#else void *stack = task_stack_page(tsk); struct vm_struct *vm = task_stack_vm_area(tsk); @@ -314,6 +329,7 @@ static void account_kernel_stack(struct task_struct *tsk, int account) memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } +#endif /* CONFIG_AMLOGIC_VMAP*/ } static void release_task_stack(struct task_struct *tsk) @@ -465,12 +481,23 @@ int __weak arch_dup_task_struct(struct task_struct *dst, return 0; } +#ifdef CONFIG_AMLOGIC_VMAP +static bool first_magic __read_mostly; +#endif + void set_task_stack_end_magic(struct task_struct *tsk) { unsigned long *stackend; stackend = end_of_stack(tsk); +#ifdef CONFIG_AMLOGIC_VMAP + if (unlikely(!first_magic)) { + *stackend = STACK_END_MAGIC; /* for overflow detection */ + first_magic = 1; + } +#else *stackend = STACK_END_MAGIC; /* for overflow detection */ +#endif } static struct task_struct *dup_task_struct(struct task_struct *orig, int node) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 195de42bea1f..1323e1af39fd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1361,9 +1361,15 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm) vm->flags &= ~VM_UNINITIALIZED; } +#ifdef CONFIG_AMLOGIC_VMAP +struct vm_struct *__get_vm_area_node(unsigned long size, + unsigned long align, unsigned long flags, unsigned long start, + unsigned long end, int node, gfp_t gfp_mask, const void *caller) +#else static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, 
unsigned long end, int node, gfp_t gfp_mask, const void *caller) +#endif { struct vmap_area *va; struct vm_struct *area; -- 2.20.1
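
For illustration only (this note and the sketch below are not part of the patch):
a minimal user-space sketch of the allocation idea implemented by
aml_stack_alloc()/aml_stack_free() above. It assumes that reserving address
space with mmap(PROT_NONE, MAP_NORESERVE) and populating single pages with
mprotect() is an acceptable stand-in for the kernel's __get_vm_area_node()
reservation and vmap_mmu_set() page-table setup. Constants mirror the patch
(16KB stacks, a 512MB stack area, one resident page per new stack); everything
else is simplified.

/* vmap_stack_sketch.c -- illustrative only, not kernel code */
#include <stdio.h>
#include <sys/mman.h>

#define THREAD_SIZE	(16UL * 1024)		/* 16KB stack per task */
#define AREA_SIZE	(512UL * 1024 * 1024)	/* VM_STACK_AREA_SIZE  */
#define MAX_TASKS	(AREA_SIZE / THREAD_SIZE)
#define PAGE_SZ		4096UL

static unsigned char bitmap[MAX_TASKS / 8];	/* one bit per stack slot */
static char *area;				/* reserved, unmapped area */

static long find_free_slot(void)
{
	for (unsigned long i = 0; i < MAX_TASKS; i++)
		if (!(bitmap[i / 8] & (1 << (i % 8))))
			return i;
	return -1;
}

/* take a slot and back only its topmost page, like aml_stack_alloc() */
static void *stack_alloc(void)
{
	long slot = find_free_slot();
	char *base;

	if (slot < 0)
		return NULL;
	bitmap[slot / 8] |= 1 << (slot % 8);
	base = area + slot * THREAD_SIZE;
	/*
	 * In the kernel the lower pages are populated later by
	 * handle_vmap_fault(); in this sketch touching them would fault.
	 */
	mprotect(base + THREAD_SIZE - PAGE_SZ, PAGE_SZ,
		 PROT_READ | PROT_WRITE);
	return base;
}

int main(void)
{
	/* reserve address space only, like __get_vm_area_node() */
	area = mmap(NULL, AREA_SIZE, PROT_NONE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
	if (area == MAP_FAILED)
		return 1;

	printf("%lu stack slots, new stack at %p (top page resident)\n",
	       MAX_TASKS, stack_alloc());
	return 0;
}

The patch additionally keeps a small cache of stack pages refilled from a
delayed work item and registers a shrinker that reclaims pages below the saved
stack pointer of sleeping tasks; both are omitted from the sketch.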