From: Paul Mackerras Date: Mon, 10 Oct 2005 11:58:35 +0000 (+1000) Subject: powerpc: Merge arch/ppc64/mm to arch/powerpc/mm X-Git-Url: https://git.stricted.de/?a=commitdiff_plain;h=ab1f9dac6eea25ee59e4c8e1cf0b7476afbbfe07;p=GitHub%2FLineageOS%2Fandroid_kernel_motorola_exynos9610.git powerpc: Merge arch/ppc64/mm to arch/powerpc/mm This moves the remaining files in arch/ppc64/mm to arch/powerpc/mm, and arranges that we use them when compiling with ARCH=ppc64. Signed-off-by: Paul Mackerras --- diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 35497deeb4b2..612bc4ec72b1 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -5,8 +5,14 @@ obj-y := fault.o mem.o lmb.o obj-$(CONFIG_PPC32) += init_32.o pgtable_32.o mmu_context_32.o \ tlb_32.o -obj-$(CONFIG_PPC64) += init_64.o pgtable_64.o mmu_context_64.o +hash-$(CONFIG_PPC_MULTIPLATFORM) := hash_native_64.o +obj-$(CONFIG_PPC64) += init_64.o pgtable_64.o mmu_context_64.o \ + hash_utils_64.o hash_low_64.o tlb_64.o \ + slb_low.o slb.o stab.o mmap.o imalloc.o \ + $(hash-y) obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o obj-$(CONFIG_40x) += 4xx_mmu.o obj-$(CONFIG_44x) += 44x_mmu.o obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o +obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S new file mode 100644 index 000000000000..d6ed9102eeea --- /dev/null +++ b/arch/powerpc/mm/hash_low_64.S @@ -0,0 +1,288 @@ +/* + * ppc64 MMU hashtable management routines + * + * (c) Copyright IBM Corp. 2003 + * + * Maintained by: Benjamin Herrenschmidt + * + * + * This file is covered by the GNU Public Licence v2 as + * described in the kernel's COPYING file. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + + .text + +/* + * Stackframe: + * + * +-> Back chain (SP + 256) + * | General register save area (SP + 112) + * | Parameter save area (SP + 48) + * | TOC save area (SP + 40) + * | link editor doubleword (SP + 32) + * | compiler doubleword (SP + 24) + * | LR save area (SP + 16) + * | CR save area (SP + 8) + * SP ---> +-- Back chain (SP + 0) + */ +#define STACKFRAMESIZE 256 + +/* Save parameters offsets */ +#define STK_PARM(i) (STACKFRAMESIZE + 48 + ((i)-3)*8) + +/* Save non-volatile offsets */ +#define STK_REG(i) (112 + ((i)-14)*8) + +/* + * _hash_page(unsigned long ea, unsigned long access, unsigned long vsid, + * pte_t *ptep, unsigned long trap, int local) + * + * Adds a page to the hash table. This is the non-LPAR version for now + */ + +_GLOBAL(__hash_page) + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + /* Save all params that we need after a function call */ + std r6,STK_PARM(r6)(r1) + std r8,STK_PARM(r8)(r1) + + /* Add _PAGE_PRESENT to access */ + ori r4,r4,_PAGE_PRESENT + + /* Save non-volatile registers. + * r31 will hold "old PTE" + * r30 is "new PTE" + * r29 is "va" + * r28 is a hash value + * r27 is hashtab mask (maybe dynamic patched instead ?) + */ + std r27,STK_REG(r27)(r1) + std r28,STK_REG(r28)(r1) + std r29,STK_REG(r29)(r1) + std r30,STK_REG(r30)(r1) + std r31,STK_REG(r31)(r1) + + /* Step 1: + * + * Check permissions, atomically mark the linux PTE busy + * and hashed. + */ +1: + ldarx r31,0,r6 + /* Check access rights (access & ~(pte_val(*ptep))) */ + andc. r0,r4,r31 + bne- htab_wrong_access + /* Check if PTE is busy */ + andi. r0,r31,_PAGE_BUSY + /* If so, just bail out and refault if needed. Someone else + * is changing this PTE anyway and might hash it. 
+ */ + bne- bail_ok + /* Prepare new PTE value (turn access RW into DIRTY, then + * add BUSY,HASHPTE and ACCESSED) + */ + rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ + or r30,r30,r31 + ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE + /* Write the linux PTE atomically (setting busy) */ + stdcx. r30,0,r6 + bne- 1b + isync + + /* Step 2: + * + * Insert/Update the HPTE in the hash table. At this point, + * r4 (access) is re-useable, we use it for the new HPTE flags + */ + + /* Calc va and put it in r29 */ + rldicr r29,r5,28,63-28 + rldicl r3,r3,0,36 + or r29,r3,r29 + + /* Calculate hash value for primary slot and store it in r28 */ + rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ + rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */ + xor r28,r5,r0 + + /* Convert linux PTE bits into HW equivalents */ + andi. r3,r30,0x1fe /* Get basic set of flags */ + xori r3,r3,HW_NO_EXEC /* _PAGE_EXEC -> NOEXEC */ + rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ + rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ + and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */ + andc r0,r30,r0 /* r0 = pte & ~r0 */ + rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ + + /* We eventually do the icache sync here (maybe inline that + * code rather than call a C function...) + */ +BEGIN_FTR_SECTION + mr r4,r30 + mr r5,r7 + bl .hash_page_do_lazy_icache +END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) + + /* At this point, r3 contains new PP bits, save them in + * place of "access" in the param area (sic) + */ + std r3,STK_PARM(r4)(r1) + + /* Get htab_hash_mask */ + ld r4,htab_hash_mask@got(2) + ld r27,0(r4) /* htab_hash_mask -> r27 */ + + /* Check if we may already be in the hashtable, in this case, we + * go to out-of-line code to try to modify the HPTE + */ + andi. 
r0,r31,_PAGE_HASHPTE + bne htab_modify_pte + +htab_insert_pte: + /* Clear hpte bits in new pte (we also clear BUSY btw) and + * add _PAGE_HASHPTE + */ + lis r0,_PAGE_HPTEFLAGS@h + ori r0,r0,_PAGE_HPTEFLAGS@l + andc r30,r30,r0 + ori r30,r30,_PAGE_HASHPTE + + /* page number in r5 */ + rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT + + /* Calculate primary group hash */ + and r0,r28,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r7,STK_PARM(r4)(r1) /* Retreive new pp bits */ + mr r4,r29 /* Retreive va */ + li r6,0 /* no vflags */ +_GLOBAL(htab_call_hpte_insert1) + bl . /* Will be patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge htab_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- htab_pte_insert_failure + + /* Now try secondary slot */ + + /* page number in r5 */ + rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT + + /* Calculate secondary group hash */ + andc r0,r27,r28 + rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r7,STK_PARM(r4)(r1) /* Retreive new pp bits */ + mr r4,r29 /* Retreive va */ + li r6,HPTE_V_SECONDARY@l /* secondary slot */ +_GLOBAL(htab_call_hpte_insert2) + bl . /* Will be patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge+ htab_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- htab_pte_insert_failure + + /* Both are full, we need to evict something */ + mftb r0 + /* Pick a random group based on TB */ + andi. r0,r0,1 + mr r5,r28 + bne 2f + not r5,r5 +2: and r0,r5,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + /* Call ppc_md.hpte_remove */ +_GLOBAL(htab_call_hpte_remove) + bl . /* Will be patched by htab_finish_init() */ + + /* Try all again */ + b htab_insert_pte + +bail_ok: + li r3,0 + b bail + +htab_pte_insert_ok: + /* Insert slot number & secondary bit in PTE */ + rldimi r30,r3,12,63-15 + + /* Write out the PTE with a normal write + * (maybe add eieio may be good still ?) 
+ */ +htab_write_out_pte: + ld r6,STK_PARM(r6)(r1) + std r30,0(r6) + li r3, 0 +bail: + ld r27,STK_REG(r27)(r1) + ld r28,STK_REG(r28)(r1) + ld r29,STK_REG(r29)(r1) + ld r30,STK_REG(r30)(r1) + ld r31,STK_REG(r31)(r1) + addi r1,r1,STACKFRAMESIZE + ld r0,16(r1) + mtlr r0 + blr + +htab_modify_pte: + /* Keep PP bits in r4 and slot idx from the PTE around in r3 */ + mr r4,r3 + rlwinm r3,r31,32-12,29,31 + + /* Secondary group ? if yes, get a inverted hash value */ + mr r5,r28 + andi. r0,r31,_PAGE_SECONDARY + beq 1f + not r5,r5 +1: + /* Calculate proper slot value for ppc_md.hpte_updatepp */ + and r0,r5,r27 + rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + add r3,r0,r3 /* add slot idx */ + + /* Call ppc_md.hpte_updatepp */ + mr r5,r29 /* va */ + li r6,0 /* large is 0 */ + ld r7,STK_PARM(r8)(r1) /* get "local" param */ +_GLOBAL(htab_call_hpte_updatepp) + bl . /* Will be patched by htab_finish_init() */ + + /* if we failed because typically the HPTE wasn't really here + * we try an insertion. + */ + cmpdi 0,r3,-1 + beq- htab_insert_pte + + /* Clear the BUSY bit and Write out the PTE */ + li r0,_PAGE_BUSY + andc r30,r30,r0 + b htab_write_out_pte + +htab_wrong_access: + /* Bail out clearing reservation */ + stdcx. r31,0,r6 + li r3,1 + b bail + +htab_pte_insert_failure: + /* Bail out restoring old PTE */ + ld r6,STK_PARM(r6)(r1) + std r31,0(r6) + li r3,-1 + b bail + + diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c new file mode 100644 index 000000000000..174d14576c28 --- /dev/null +++ b/arch/powerpc/mm/hash_native_64.c @@ -0,0 +1,446 @@ +/* + * native hashtable management. + * + * SMP scalability work: + * Copyright (C) 2001 Anton Blanchard , IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define HPTE_LOCK_BIT 3 + +static DEFINE_SPINLOCK(native_tlbie_lock); + +static inline void native_lock_hpte(hpte_t *hptep) +{ + unsigned long *word = &hptep->v; + + while (1) { + if (!test_and_set_bit(HPTE_LOCK_BIT, word)) + break; + while(test_bit(HPTE_LOCK_BIT, word)) + cpu_relax(); + } +} + +static inline void native_unlock_hpte(hpte_t *hptep) +{ + unsigned long *word = &hptep->v; + + asm volatile("lwsync":::"memory"); + clear_bit(HPTE_LOCK_BIT, word); +} + +long native_hpte_insert(unsigned long hpte_group, unsigned long va, + unsigned long prpn, unsigned long vflags, + unsigned long rflags) +{ + hpte_t *hptep = htab_address + hpte_group; + unsigned long hpte_v, hpte_r; + int i; + + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (! (hptep->v & HPTE_V_VALID)) { + /* retry with lock held */ + native_lock_hpte(hptep); + if (! (hptep->v & HPTE_V_VALID)) + break; + native_unlock_hpte(hptep); + } + + hptep++; + } + + if (i == HPTES_PER_GROUP) + return -1; + + hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID; + if (vflags & HPTE_V_LARGE) + hpte_v &= ~(1UL << HPTE_V_AVPN_SHIFT); /* large page: clear the low AVPN bit in the HPTE itself (masking va after it was consumed was a dead store); matches the avpn &= ~1 done on lookup */ + hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags; + + hptep->r = hpte_r; + /* Guarantee the second dword is visible before the valid bit */ + __asm__ __volatile__ ("eieio" : : : "memory"); + /* + * Now set the first dword including the valid bit + * NOTE: this also unlocks the hpte + */ + hptep->v = hpte_v; + + __asm__ __volatile__ ("ptesync" : : : "memory"); + + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); +} + +static long native_hpte_remove(unsigned long hpte_group) +{ + hpte_t *hptep; + int i; + int slot_offset; + unsigned long hpte_v; + + /* pick a random entry to start at */ + slot_offset = mftb() & 0x7; + + for (i = 0; i < HPTES_PER_GROUP; i++) { + hptep = htab_address + hpte_group + slot_offset; + hpte_v = hptep->v; + + if ((hpte_v & HPTE_V_VALID) 
&& !(hpte_v & HPTE_V_BOLTED)) { + /* retry with lock held */ + native_lock_hpte(hptep); + hpte_v = hptep->v; + if ((hpte_v & HPTE_V_VALID) + && !(hpte_v & HPTE_V_BOLTED)) + break; + native_unlock_hpte(hptep); + } + + slot_offset++; + slot_offset &= 0x7; + } + + if (i == HPTES_PER_GROUP) + return -1; + + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + + return i; +} + +static inline void set_pp_bit(unsigned long pp, hpte_t *addr) +{ + unsigned long old; + unsigned long *p = &addr->r; + + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + rldimi %0,%2,0,61\n\ + stdcx. %0,0,%3\n\ + bne 1b" + : "=&r" (old), "=m" (*p) + : "r" (pp), "r" (p), "m" (*p) + : "cc"); +} + +/* + * Only works on small pages. Yes its ugly to have to check each slot in + * the group but we only use this during bootup. + */ +static long native_hpte_find(unsigned long vpn) +{ + hpte_t *hptep; + unsigned long hash; + unsigned long i, j; + long slot; + unsigned long hpte_v; + + hash = hpt_hash(vpn, 0); + + for (j = 0; j < 2; j++) { + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + for (i = 0; i < HPTES_PER_GROUP; i++) { + hptep = htab_address + slot; + hpte_v = hptep->v; + + if ((HPTE_V_AVPN_VAL(hpte_v) == (vpn >> 11)) + && (hpte_v & HPTE_V_VALID) + && ( !!(hpte_v & HPTE_V_SECONDARY) == j)) { + /* HPTE matches */ + if (j) + slot = -slot; + return slot; + } + ++slot; + } + hash = ~hash; + } + + return -1; +} + +static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, + unsigned long va, int large, int local) +{ + hpte_t *hptep = htab_address + slot; + unsigned long hpte_v; + unsigned long avpn = va >> 23; + int ret = 0; + + if (large) + avpn &= ~1; + + native_lock_hpte(hptep); + + hpte_v = hptep->v; + + /* Even if we miss, we need to invalidate the TLB */ + if ((HPTE_V_AVPN_VAL(hpte_v) != avpn) + || !(hpte_v & HPTE_V_VALID)) { + native_unlock_hpte(hptep); + ret = -1; + } else { + set_pp_bit(newpp, hptep); + native_unlock_hpte(hptep); + } + + /* Ensure it is out 
of the tlb too */ + if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) { + tlbiel(va); + } else { + int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + spin_lock(&native_tlbie_lock); + tlbie(va, large); + if (lock_tlbie) + spin_unlock(&native_tlbie_lock); + } + + return ret; +} + +/* + * Update the page protection bits. Intended to be used to create + * guard pages for kernel data structures on pages which are bolted + * in the HPT. Assumes pages being operated on will not be stolen. + * Does not work on large pages. + * + * No need to lock here because we should be the only user. + */ +static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea) +{ + unsigned long vsid, va, vpn, flags = 0; + long slot; + hpte_t *hptep; + int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); + + vsid = get_kernel_vsid(ea); + va = (vsid << 28) | (ea & 0x0fffffff); + vpn = va >> PAGE_SHIFT; + + slot = native_hpte_find(vpn); + if (slot == -1) + panic("could not find page to bolt\n"); + hptep = htab_address + slot; + + set_pp_bit(newpp, hptep); + + /* Ensure it is out of the tlb too */ + if (lock_tlbie) + spin_lock_irqsave(&native_tlbie_lock, flags); + tlbie(va, 0); + if (lock_tlbie) + spin_unlock_irqrestore(&native_tlbie_lock, flags); +} + +static void native_hpte_invalidate(unsigned long slot, unsigned long va, + int large, int local) +{ + hpte_t *hptep = htab_address + slot; + unsigned long hpte_v; + unsigned long avpn = va >> 23; + unsigned long flags; + int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); + + if (large) + avpn &= ~1; + + local_irq_save(flags); + native_lock_hpte(hptep); + + hpte_v = hptep->v; + + /* Even if we miss, we need to invalidate the TLB */ + if ((HPTE_V_AVPN_VAL(hpte_v) != avpn) + || !(hpte_v & HPTE_V_VALID)) { + native_unlock_hpte(hptep); + } else { + /* Invalidate the hpte. 
NOTE: this also unlocks it */ + hptep->v = 0; + } + + /* Invalidate the tlb */ + if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) { + tlbiel(va); + } else { + if (lock_tlbie) + spin_lock(&native_tlbie_lock); + tlbie(va, large); + if (lock_tlbie) + spin_unlock(&native_tlbie_lock); + } + local_irq_restore(flags); +} + +/* + * clear all mappings on kexec. All cpus are in real mode (or they will + * be when they isi), and we are the only one left. We rely on our kernel + * mapping being 0xC0's and the hardware ignoring those two real bits. + * + * TODO: add batching support when enabled. remember, no dynamic memory here, + * although there is the control page available... + */ +static void native_hpte_clear(void) +{ + unsigned long slot, slots, flags; + hpte_t *hptep = htab_address; + unsigned long hpte_v; + unsigned long pteg_count; + + pteg_count = htab_hash_mask + 1; + + local_irq_save(flags); + + /* we take the tlbie lock and hold it. Some hardware will + * deadlock if we try to tlbie from two processors at once. + */ + spin_lock(&native_tlbie_lock); + + slots = pteg_count * HPTES_PER_GROUP; + + for (slot = 0; slot < slots; slot++, hptep++) { + /* + * we could lock the pte here, but we are the only cpu + * running, right? and for crash dump, we probably + * don't want to wait for a maybe bad cpu. 
+ */ + hpte_v = hptep->v; + + if (hpte_v & HPTE_V_VALID) { + hptep->v = 0; + tlbie(slot2va(hpte_v, slot), hpte_v & HPTE_V_LARGE); + } + } + + spin_unlock(&native_tlbie_lock); + local_irq_restore(flags); +} + +static void native_flush_hash_range(unsigned long number, int local) +{ + unsigned long va, vpn, hash, secondary, slot, flags, avpn; + int i, j; + hpte_t *hptep; + unsigned long hpte_v; + struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + unsigned long large = batch->large; + + local_irq_save(flags); + + j = 0; + for (i = 0; i < number; i++) { + va = batch->vaddr[j]; + if (large) + vpn = va >> HPAGE_SHIFT; + else + vpn = va >> PAGE_SHIFT; + hash = hpt_hash(vpn, large); + secondary = (pte_val(batch->pte[i]) & _PAGE_SECONDARY) >> 15; + if (secondary) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += (pte_val(batch->pte[i]) & _PAGE_GROUP_IX) >> 12; + + hptep = htab_address + slot; + + avpn = va >> 23; + if (large) + avpn &= ~0x1UL; + + native_lock_hpte(hptep); + + hpte_v = hptep->v; + + /* Even if we miss, we need to invalidate the TLB */ + if ((HPTE_V_AVPN_VAL(hpte_v) != avpn) + || !(hpte_v & HPTE_V_VALID)) { + native_unlock_hpte(hptep); + } else { + /* Invalidate the hpte. 
NOTE: this also unlocks it */ + hptep->v = 0; + } + + j++; + } + + if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) { + asm volatile("ptesync":::"memory"); + + for (i = 0; i < j; i++) + __tlbiel(batch->vaddr[i]); + + asm volatile("ptesync":::"memory"); + } else { + int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + spin_lock(&native_tlbie_lock); + + asm volatile("ptesync":::"memory"); + + for (i = 0; i < j; i++) + __tlbie(batch->vaddr[i], large); + + asm volatile("eieio; tlbsync; ptesync":::"memory"); + + if (lock_tlbie) + spin_unlock(&native_tlbie_lock); + } + + local_irq_restore(flags); +} + +#ifdef CONFIG_PPC_PSERIES +/* Disable TLB batching on nighthawk */ +static inline int tlb_batching_enabled(void) +{ + struct device_node *root = of_find_node_by_path("/"); + int enabled = 1; + + if (root) { + const char *model = get_property(root, "model", NULL); + if (model && !strcmp(model, "IBM,9076-N81")) + enabled = 0; + of_node_put(root); + } + + return enabled; +} +#else +static inline int tlb_batching_enabled(void) +{ + return 1; +} +#endif + +void hpte_init_native(void) +{ + ppc_md.hpte_invalidate = native_hpte_invalidate; + ppc_md.hpte_updatepp = native_hpte_updatepp; + ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp; + ppc_md.hpte_insert = native_hpte_insert; + ppc_md.hpte_remove = native_hpte_remove; + ppc_md.hpte_clear_all = native_hpte_clear; + if (tlb_batching_enabled()) + ppc_md.flush_hash_range = native_flush_hash_range; + htab_finish_init(); +} diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c new file mode 100644 index 000000000000..35dd93eeaf4b --- /dev/null +++ b/arch/powerpc/mm/hash_utils_64.c @@ -0,0 +1,438 @@ +/* + * PowerPC64 port by Mike Corrigan and Dave Engebretsen + * {mikejc|engebret}@us.ibm.com + * + * Copyright (c) 2000 Mike Corrigan + * + * SMP scalability work: + * Copyright (C) 2001 Anton Blanchard , IBM + * + * Module name: htab.c + * + * Description: + * PowerPC 
Hashed Page Table functions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DEBUG +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +/* + * Note: pte --> Linux PTE + * HPTE --> PowerPC Hashed Page Table Entry + * + * Execution context: + * htab_initialize is called with the MMU off (of course), but + * the kernel has been copied down to zero so it can directly + * reference global data. At this point it is very difficult + * to print debug info. 
+ * + */ + +#ifdef CONFIG_U3_DART +extern unsigned long dart_tablebase; +#endif /* CONFIG_U3_DART */ + +hpte_t *htab_address; +unsigned long htab_hash_mask; + +unsigned long _SDR1; + +#define KB (1024) +#define MB (1024*KB) + +static inline void loop_forever(void) +{ + volatile unsigned long x = 1; + for(;x;x|=1) + ; +} + +static inline void create_pte_mapping(unsigned long start, unsigned long end, + unsigned long mode, int large) +{ + unsigned long addr; + unsigned int step; + unsigned long tmp_mode; + unsigned long vflags; + + if (large) { + step = 16*MB; + vflags = HPTE_V_BOLTED | HPTE_V_LARGE; + } else { + step = 4*KB; + vflags = HPTE_V_BOLTED; + } + + for (addr = start; addr < end; addr += step) { + unsigned long vpn, hash, hpteg; + unsigned long vsid = get_kernel_vsid(addr); + unsigned long va = (vsid << 28) | (addr & 0xfffffff); + int ret = -1; + + if (large) + vpn = va >> HPAGE_SHIFT; + else + vpn = va >> PAGE_SHIFT; + + + tmp_mode = mode; + + /* Make non-kernel text non-executable */ + if (!in_kernel_text(addr)) + tmp_mode = mode | HW_NO_EXEC; + + hash = hpt_hash(vpn, large); + + hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + +#ifdef CONFIG_PPC_ISERIES + if (systemcfg->platform & PLATFORM_ISERIES_LPAR) + ret = iSeries_hpte_bolt_or_insert(hpteg, va, + virt_to_abs(addr) >> PAGE_SHIFT, + vflags, tmp_mode); + else +#endif +#ifdef CONFIG_PPC_PSERIES + if (systemcfg->platform & PLATFORM_LPAR) + ret = pSeries_lpar_hpte_insert(hpteg, va, + virt_to_abs(addr) >> PAGE_SHIFT, + vflags, tmp_mode); + else +#endif +#ifdef CONFIG_PPC_MULTIPLATFORM + ret = native_hpte_insert(hpteg, va, + virt_to_abs(addr) >> PAGE_SHIFT, + vflags, tmp_mode); +#endif + + if (ret == -1) { + ppc64_terminate_msg(0x20, "create_pte_mapping"); + loop_forever(); + } + } +} + +void __init htab_initialize(void) +{ + unsigned long table, htab_size_bytes; + unsigned long pteg_count; + unsigned long mode_rw; + int i, use_largepages = 0; + unsigned long base = 0, size = 0; + extern unsigned long 
tce_alloc_start, tce_alloc_end; + + DBG(" -> htab_initialize()\n"); + + /* + * Calculate the required size of the htab. We want the number of + * PTEGs to equal one half the number of real pages. + */ + htab_size_bytes = 1UL << ppc64_pft_size; + pteg_count = htab_size_bytes >> 7; + + /* For debug, make the HTAB 1/8 as big as it normally would be. */ + ifppcdebug(PPCDBG_HTABSIZE) { + pteg_count >>= 3; + htab_size_bytes = pteg_count << 7; + } + + htab_hash_mask = pteg_count - 1; + + if (systemcfg->platform & PLATFORM_LPAR) { + /* Using a hypervisor which owns the htab */ + htab_address = NULL; + _SDR1 = 0; + } else { + /* Find storage for the HPT. Must be contiguous in + * the absolute address space. + */ + table = lmb_alloc(htab_size_bytes, htab_size_bytes); + + DBG("Hash table allocated at %lx, size: %lx\n", table, + htab_size_bytes); + + if ( !table ) { + ppc64_terminate_msg(0x20, "hpt space"); + loop_forever(); + } + htab_address = abs_to_virt(table); + + /* htab absolute addr + encoded htabsize */ + _SDR1 = table + __ilog2(pteg_count) - 11; + + /* Initialize the HPT with no entries */ + memset((void *)table, 0, htab_size_bytes); + } + + mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX; + + /* On U3 based machines, we need to reserve the DART area and + * _NOT_ map it to avoid cache paradoxes as it's remapped non + * cacheable later on + */ + if (cpu_has_feature(CPU_FTR_16M_PAGE)) + use_largepages = 1; + + /* create bolted the linear mapping in the hash table */ + for (i=0; i < lmb.memory.cnt; i++) { + base = lmb.memory.region[i].base + KERNELBASE; + size = lmb.memory.region[i].size; + + DBG("creating mapping for region: %lx : %lx\n", base, size); + +#ifdef CONFIG_U3_DART + /* Do not map the DART space. Fortunately, it will be aligned + * in such a way that it will not cross two lmb regions and will + * fit within a single 16Mb page. + * The DART space is assumed to be a full 16Mb region even if we + * only use 2Mb of that space. 
We will use more of it later for + * AGP GART. We have to use a full 16Mb large page. + */ + DBG("DART base: %lx\n", dart_tablebase); + + if (dart_tablebase != 0 && dart_tablebase >= base + && dart_tablebase < (base + size)) { + if (base != dart_tablebase) + create_pte_mapping(base, dart_tablebase, mode_rw, + use_largepages); + if ((base + size) > (dart_tablebase + 16*MB)) + create_pte_mapping(dart_tablebase + 16*MB, base + size, + mode_rw, use_largepages); + continue; + } +#endif /* CONFIG_U3_DART */ + create_pte_mapping(base, base + size, mode_rw, use_largepages); + } + + /* + * If we have a memory_limit and we've allocated TCEs then we need to + * explicitly map the TCE area at the top of RAM. We also cope with the + * case that the TCEs start below memory_limit. + * tce_alloc_start/end are 16MB aligned so the mapping should work + * for either 4K or 16MB pages. + */ + if (tce_alloc_start) { + tce_alloc_start += KERNELBASE; + tce_alloc_end += KERNELBASE; + + if (base + size >= tce_alloc_start) + tce_alloc_start = base + size + 1; + + create_pte_mapping(tce_alloc_start, tce_alloc_end, + mode_rw, use_largepages); + } + + DBG(" <- htab_initialize()\n"); +} +#undef KB +#undef MB + +/* + * Called by asm hashtable.S for doing lazy icache flush + */ +unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) +{ + struct page *page; + + if (!pfn_valid(pte_pfn(pte))) + return pp; + + page = pte_page(pte); + + /* page is dirty */ + if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { + if (trap == 0x400) { + __flush_dcache_icache(page_address(page)); + set_bit(PG_arch_1, &page->flags); + } else + pp |= HW_NO_EXEC; + } + return pp; +} + +/* Result code is: + * 0 - handled + * 1 - normal page fault + * -1 - critical hash insertion error + */ +int hash_page(unsigned long ea, unsigned long access, unsigned long trap) +{ + void *pgdir; + unsigned long vsid; + struct mm_struct *mm; + pte_t *ptep; + int ret; + int user_region = 0; + int local = 0; + 
cpumask_t tmp; + + if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) + return 1; + + switch (REGION_ID(ea)) { + case USER_REGION_ID: + user_region = 1; + mm = current->mm; + if (! mm) + return 1; + + vsid = get_vsid(mm->context.id, ea); + break; + case VMALLOC_REGION_ID: + mm = &init_mm; + vsid = get_kernel_vsid(ea); + break; +#if 0 + case KERNEL_REGION_ID: + /* + * Should never get here - entire 0xC0... region is bolted. + * Send the problem up to do_page_fault + */ +#endif + default: + /* Not a valid range + * Send the problem up to do_page_fault + */ + return 1; + break; + } + + pgdir = mm->pgd; + + if (pgdir == NULL) + return 1; + + tmp = cpumask_of_cpu(smp_processor_id()); + if (user_region && cpus_equal(mm->cpu_vm_mask, tmp)) + local = 1; + + /* Is this a huge page ? */ + if (unlikely(in_hugepage_area(mm->context, ea))) + ret = hash_huge_page(mm, access, ea, vsid, local); + else { + ptep = find_linux_pte(pgdir, ea); + if (ptep == NULL) + return 1; + ret = __hash_page(ea, access, vsid, ptep, trap, local); + } + + return ret; +} + +void flush_hash_page(unsigned long va, pte_t pte, int local) +{ + unsigned long vpn, hash, secondary, slot; + unsigned long huge = pte_huge(pte); + + if (huge) + vpn = va >> HPAGE_SHIFT; + else + vpn = va >> PAGE_SHIFT; + hash = hpt_hash(vpn, huge); + secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15; + if (secondary) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12; + + ppc_md.hpte_invalidate(slot, va, huge, local); +} + +void flush_hash_range(unsigned long number, int local) +{ + if (ppc_md.flush_hash_range) { + ppc_md.flush_hash_range(number, local); + } else { + int i; + struct ppc64_tlb_batch *batch = + &__get_cpu_var(ppc64_tlb_batch); + + for (i = 0; i < number; i++) + flush_hash_page(batch->vaddr[i], batch->pte[i], local); + } +} + +static inline void make_bl(unsigned int *insn_addr, void *func) +{ + unsigned long funcp = *((unsigned long *)func); + int offset = funcp 
- (unsigned long)insn_addr; + + *insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc)); + flush_icache_range((unsigned long)insn_addr, 4+ + (unsigned long)insn_addr); +} + +/* + * low_hash_fault is called when the low level hash code failed + * to insert a PTE due to a hypervisor error + */ +void low_hash_fault(struct pt_regs *regs, unsigned long address) +{ + if (user_mode(regs)) { + siginfo_t info; + + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void __user *)address; + force_sig_info(SIGBUS, &info, current); + return; + } + bad_page_fault(regs, address, SIGBUS); +} + +void __init htab_finish_init(void) +{ + extern unsigned int *htab_call_hpte_insert1; + extern unsigned int *htab_call_hpte_insert2; + extern unsigned int *htab_call_hpte_remove; + extern unsigned int *htab_call_hpte_updatepp; + + make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert); + make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert); + make_bl(htab_call_hpte_remove, ppc_md.hpte_remove); + make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp); +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c new file mode 100644 index 000000000000..0ea0994ed974 --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage.c @@ -0,0 +1,745 @@ +/* + * PPC64 (POWER4) Huge TLB Page Support for Kernel. + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + * + * Based on the IA-32 version: + * Copyright (C) 2002, Rohit Seth + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT) +#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT) + +/* Modelled after find_linux_pte() */ +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + pte_t *pt; + + BUG_ON(! 
in_hugepage_area(mm->context, addr)); + + addr &= HPAGE_MASK; + + pg = pgd_offset(mm, addr); + if (!pgd_none(*pg)) { + pu = pud_offset(pg, addr); + if (!pud_none(*pu)) { + pm = pmd_offset(pu, addr); + pt = (pte_t *)pm; + BUG_ON(!pmd_none(*pm) + && !(pte_present(*pt) && pte_huge(*pt))); + return pt; + } + } + + return NULL; +} + +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + pte_t *pt; + + BUG_ON(! in_hugepage_area(mm->context, addr)); + + addr &= HPAGE_MASK; + + pg = pgd_offset(mm, addr); + pu = pud_alloc(mm, pg, addr); + + if (pu) { + pm = pmd_alloc(mm, pu, addr); + if (pm) { + pt = (pte_t *)pm; + BUG_ON(!pmd_none(*pm) + && !(pte_present(*pt) && pte_huge(*pt))); + return pt; + } + } + + return NULL; +} + +#define HUGEPTE_BATCH_SIZE (HPAGE_SIZE / PMD_SIZE) + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + int i; + + if (pte_present(*ptep)) { + pte_clear(mm, addr, ptep); + flush_tlb_pending(); + } + + for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) { + *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); + ptep++; + } +} + +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + unsigned long old = pte_update(ptep, ~0UL); + int i; + + if (old & _PAGE_HASHPTE) + hpte_update(mm, addr, old, 0); + + for (i = 1; i < HUGEPTE_BATCH_SIZE; i++) + ptep[i] = __pte(0); + + return __pte(old); +} + +/* + * This function checks for proper alignment of input addr and len parameters. + */ +int is_aligned_hugepage_range(unsigned long addr, unsigned long len) +{ + if (len & ~HPAGE_MASK) + return -EINVAL; + if (addr & ~HPAGE_MASK) + return -EINVAL; + if (! 
(within_hugepage_low_range(addr, len) + || within_hugepage_high_range(addr, len)) ) + return -EINVAL; + return 0; +} + +static void flush_low_segments(void *parm) +{ + u16 areas = (unsigned long) parm; + unsigned long i; + + asm volatile("isync" : : : "memory"); + + BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS); + + for (i = 0; i < NUM_LOW_AREAS; i++) { + if (! (areas & (1U << i))) + continue; + asm volatile("slbie %0" + : : "r" ((i << SID_SHIFT) | SLBIE_C)); + } + + asm volatile("isync" : : : "memory"); +} + +static void flush_high_segments(void *parm) +{ + u16 areas = (unsigned long) parm; + unsigned long i, j; + + asm volatile("isync" : : : "memory"); + + BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS); + + for (i = 0; i < NUM_HIGH_AREAS; i++) { + if (! (areas & (1U << i))) + continue; + for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++) + asm volatile("slbie %0" + :: "r" (((i << HTLB_AREA_SHIFT) + + (j << SID_SHIFT)) | SLBIE_C)); + } + + asm volatile("isync" : : : "memory"); +} + +static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area) +{ + unsigned long start = area << SID_SHIFT; + unsigned long end = (area+1) << SID_SHIFT; + struct vm_area_struct *vma; + + BUG_ON(area >= NUM_LOW_AREAS); + + /* Check no VMAs are in the region */ + vma = find_vma(mm, start); + if (vma && (vma->vm_start < end)) + return -EBUSY; + + return 0; +} + +static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area) +{ + unsigned long start = area << HTLB_AREA_SHIFT; + unsigned long end = (area+1) << HTLB_AREA_SHIFT; + struct vm_area_struct *vma; + + BUG_ON(area >= NUM_HIGH_AREAS); + + /* Check no VMAs are in the region */ + vma = find_vma(mm, start); + if (vma && (vma->vm_start < end)) + return -EBUSY; + + return 0; +} + +static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas) +{ + unsigned long i; + + BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS); + BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != 
NUM_LOW_AREAS); + + newareas &= ~(mm->context.low_htlb_areas); + if (! newareas) + return 0; /* The segments we want are already open */ + + for (i = 0; i < NUM_LOW_AREAS; i++) + if ((1 << i) & newareas) + if (prepare_low_area_for_htlb(mm, i) != 0) + return -EBUSY; + + mm->context.low_htlb_areas |= newareas; + + /* update the paca copy of the context struct */ + get_paca()->context = mm->context; + + /* the context change must make it to memory before the flush, + * so that further SLB misses do the right thing. */ + mb(); + on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1); + + return 0; +} + +static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas) +{ + unsigned long i; + + BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS); + BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8) + != NUM_HIGH_AREAS); + + newareas &= ~(mm->context.high_htlb_areas); + if (! newareas) + return 0; /* The areas we want are already open */ + + for (i = 0; i < NUM_HIGH_AREAS; i++) + if ((1 << i) & newareas) + if (prepare_high_area_for_htlb(mm, i) != 0) + return -EBUSY; + + mm->context.high_htlb_areas |= newareas; + + /* update the paca copy of the context struct */ + get_paca()->context = mm->context; + + /* the context change must make it to memory before the flush, + * so that further SLB misses do the right thing. 
*/ + mb(); + on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1); + + return 0; +} + +int prepare_hugepage_range(unsigned long addr, unsigned long len) +{ + int err; + + if ( (addr+len) < addr ) + return -EINVAL; + + if ((addr + len) < 0x100000000UL) + err = open_low_hpage_areas(current->mm, + LOW_ESID_MASK(addr, len)); + else + err = open_high_hpage_areas(current->mm, + HTLB_AREA_MASK(addr, len)); + if (err) { + printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)" + " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n", + addr, len, + LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len)); + return err; + } + + return 0; +} + +struct page * +follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) +{ + pte_t *ptep; + struct page *page; + + if (! in_hugepage_area(mm->context, address)) + return ERR_PTR(-EINVAL); + + ptep = huge_pte_offset(mm, address); + page = pte_page(*ptep); + if (page) + page += (address % HPAGE_SIZE) / PAGE_SIZE; + + return page; +} + +int pmd_huge(pmd_t pmd) +{ + return 0; +} + +struct page * +follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int write) +{ + BUG(); + return NULL; +} + +/* Because we have an exclusive hugepage region which lies within the + * normal user address space, we have to take special measures to make + * non-huge mmap()s evade the hugepage reserved regions. 
*/ +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long start_addr; + + if (len > TASK_SIZE) + return -ENOMEM; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (((TASK_SIZE - len) >= addr) + && (!vma || (addr+len) <= vma->vm_start) + && !is_hugepage_only_range(mm, addr,len)) + return addr; + } + if (len > mm->cached_hole_size) { + start_addr = addr = mm->free_area_cache; + } else { + start_addr = addr = TASK_UNMAPPED_BASE; + mm->cached_hole_size = 0; + } + +full_search: + vma = find_vma(mm, addr); + while (TASK_SIZE - len >= addr) { + BUG_ON(vma && (addr >= vma->vm_end)); + + if (touches_hugepage_low_range(mm, addr, len)) { + addr = ALIGN(addr+1, 1<vm_start) { + /* + * Remember the place where we stopped the search: + */ + mm->free_area_cache = addr + len; + return addr; + } + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + addr = vma->vm_end; + vma = vma->vm_next; + } + + /* Make sure we didn't miss any holes */ + if (start_addr != TASK_UNMAPPED_BASE) { + start_addr = addr = TASK_UNMAPPED_BASE; + mm->cached_hole_size = 0; + goto full_search; + } + return -ENOMEM; +} + +/* + * This mmap-allocator allocates new areas top-down from below the + * stack's low limit (the base): + * + * Because we have an exclusive hugepage region which lies within the + * normal user address space, we have to take special measures to make + * non-huge mmap()s evade the hugepage reserved regions. 
+ */ +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + const unsigned long len, const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma, *prev_vma; + struct mm_struct *mm = current->mm; + unsigned long base = mm->mmap_base, addr = addr0; + unsigned long largest_hole = mm->cached_hole_size; + int first_time = 1; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE) + return -ENOMEM; + + /* dont allow allocations above current base */ + if (mm->free_area_cache > base) + mm->free_area_cache = base; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start) + && !is_hugepage_only_range(mm, addr,len)) + return addr; + } + + if (len <= largest_hole) { + largest_hole = 0; + mm->free_area_cache = base; + } +try_again: + /* make sure it can fit in the remaining address space */ + if (mm->free_area_cache < len) + goto fail; + + /* either no address requested or cant fit in requested address hole */ + addr = (mm->free_area_cache - len) & PAGE_MASK; + do { +hugepage_recheck: + if (touches_hugepage_low_range(mm, addr, len)) { + addr = (addr & ((~0) << SID_SHIFT)) - len; + goto hugepage_recheck; + } else if (touches_hugepage_high_range(mm, addr, len)) { + addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len; + goto hugepage_recheck; + } + + /* + * Lookup failure means no vma is above this address, + * i.e. 
return with success: + */ + if (!(vma = find_vma_prev(mm, addr, &prev_vma))) + return addr; + + /* + * new region fits between prev_vma->vm_end and + * vma->vm_start, use it: + */ + if (addr+len <= vma->vm_start && + (!prev_vma || (addr >= prev_vma->vm_end))) { + /* remember the address as a hint for next time */ + mm->cached_hole_size = largest_hole; + return (mm->free_area_cache = addr); + } else { + /* pull free_area_cache down to the first hole */ + if (mm->free_area_cache == vma->vm_end) { + mm->free_area_cache = vma->vm_start; + mm->cached_hole_size = largest_hole; + } + } + + /* remember the largest hole we saw so far */ + if (addr + largest_hole < vma->vm_start) + largest_hole = vma->vm_start - addr; + + /* try just below the current vma->vm_start */ + addr = vma->vm_start-len; + } while (len <= vma->vm_start); + +fail: + /* + * if hint left us with no space for the requested + * mapping then try again: + */ + if (first_time) { + mm->free_area_cache = base; + largest_hole = 0; + first_time = 0; + goto try_again; + } + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->cached_hole_size = ~0UL; + addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); + /* + * Restore the topdown base: + */ + mm->free_area_cache = base; + mm->cached_hole_size = ~0UL; + + return addr; +} + +static unsigned long htlb_get_low_area(unsigned long len, u16 segmask) +{ + unsigned long addr = 0; + struct vm_area_struct *vma; + + vma = find_vma(current->mm, addr); + while (addr + len <= 0x100000000UL) { + BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ + + if (! 
__within_hugepage_low_range(addr, len, segmask)) { + addr = ALIGN(addr+1, 1<mm, addr); + continue; + } + + if (!vma || (addr + len) <= vma->vm_start) + return addr; + addr = ALIGN(vma->vm_end, HPAGE_SIZE); + /* Depending on segmask this might not be a confirmed + * hugepage region, so the ALIGN could have skipped + * some VMAs */ + vma = find_vma(current->mm, addr); + } + + return -ENOMEM; +} + +static unsigned long htlb_get_high_area(unsigned long len, u16 areamask) +{ + unsigned long addr = 0x100000000UL; + struct vm_area_struct *vma; + + vma = find_vma(current->mm, addr); + while (addr + len <= TASK_SIZE_USER64) { + BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ + + if (! __within_hugepage_high_range(addr, len, areamask)) { + addr = ALIGN(addr+1, 1UL<mm, addr); + continue; + } + + if (!vma || (addr + len) <= vma->vm_start) + return addr; + addr = ALIGN(vma->vm_end, HPAGE_SIZE); + /* Depending on segmask this might not be a confirmed + * hugepage region, so the ALIGN could have skipped + * some VMAs */ + vma = find_vma(current->mm, addr); + } + + return -ENOMEM; +} + +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + int lastshift; + u16 areamask, curareas; + + if (len & ~HPAGE_MASK) + return -EINVAL; + + if (!cpu_has_feature(CPU_FTR_16M_PAGE)) + return -EINVAL; + + if (test_thread_flag(TIF_32BIT)) { + curareas = current->mm->context.low_htlb_areas; + + /* First see if we can do the mapping in the existing + * low areas */ + addr = htlb_get_low_area(len, curareas); + if (addr != -ENOMEM) + return addr; + + lastshift = 0; + for (areamask = LOW_ESID_MASK(0x100000000UL-len, len); + ! 
lastshift; areamask >>=1) { + if (areamask & 1) + lastshift = 1; + + addr = htlb_get_low_area(len, curareas | areamask); + if ((addr != -ENOMEM) + && open_low_hpage_areas(current->mm, areamask) == 0) + return addr; + } + } else { + curareas = current->mm->context.high_htlb_areas; + + /* First see if we can do the mapping in the existing + * high areas */ + addr = htlb_get_high_area(len, curareas); + if (addr != -ENOMEM) + return addr; + + lastshift = 0; + for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len); + ! lastshift; areamask >>=1) { + if (areamask & 1) + lastshift = 1; + + addr = htlb_get_high_area(len, curareas | areamask); + if ((addr != -ENOMEM) + && open_high_hpage_areas(current->mm, areamask) == 0) + return addr; + } + } + printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open" + " enough areas\n"); + return -ENOMEM; +} + +int hash_huge_page(struct mm_struct *mm, unsigned long access, + unsigned long ea, unsigned long vsid, int local) +{ + pte_t *ptep; + unsigned long va, vpn; + pte_t old_pte, new_pte; + unsigned long rflags, prpn; + long slot; + int err = 1; + + spin_lock(&mm->page_table_lock); + + ptep = huge_pte_offset(mm, ea); + + /* Search the Linux page table for a match with va */ + va = (vsid << 28) | (ea & 0x0fffffff); + vpn = va >> HPAGE_SHIFT; + + /* + * If no pte found or not present, send the problem up to + * do_page_fault + */ + if (unlikely(!ptep || pte_none(*ptep))) + goto out; + +/* BUG_ON(pte_bad(*ptep)); */ + + /* + * Check the user's access rights to the page. If access should be + * prevented then send the problem up to do_page_fault. + */ + if (unlikely(access & ~pte_val(*ptep))) + goto out; + /* + * At this point, we have a pte (old_pte) which can be used to build + * or update an HPTE. There are 2 cases: + * + * 1. There is a valid (present) pte with no associated HPTE (this is + * the most common case) + * 2. There is a valid (present) pte with an associated HPTE. 
The + * current values of the pp bits in the HPTE prevent access + * because we are doing software DIRTY bit management and the + * page is currently not DIRTY. + */ + + + old_pte = *ptep; + new_pte = old_pte; + + rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW)); + /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ + rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC); + + /* Check if pte already has an hpte (case 2) */ + if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) { + /* There MIGHT be an HPTE for this pte */ + unsigned long hash, slot; + + hash = hpt_hash(vpn, 1); + if (pte_val(old_pte) & _PAGE_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12; + + if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1) + pte_val(old_pte) &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) { + unsigned long hash = hpt_hash(vpn, 1); + unsigned long hpte_group; + + prpn = pte_pfn(old_pte); + +repeat: + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + + /* Update the linux pte with the HPTE slot */ + pte_val(new_pte) &= ~_PAGE_HPTEFLAGS; + pte_val(new_pte) |= _PAGE_HASHPTE; + + /* Add in WIMG bits */ + /* XXX We should store these in the pte */ + rflags |= _PAGE_COHERENT; + + slot = ppc_md.hpte_insert(hpte_group, va, prpn, + HPTE_V_LARGE, rflags); + + /* Primary is full, try the secondary */ + if (unlikely(slot == -1)) { + pte_val(new_pte) |= _PAGE_SECONDARY; + hpte_group = ((~hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + slot = ppc_md.hpte_insert(hpte_group, va, prpn, + HPTE_V_LARGE | + HPTE_V_SECONDARY, + rflags); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP)&~0x7UL; + + ppc_md.hpte_remove(hpte_group); + goto repeat; + } + } + + if (unlikely(slot == -2)) + panic("hash_huge_page: pte_insert failed\n"); + + pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX; + + /* + * No need 
to use ldarx/stdcx here because all who + * might be updating the pte will hold the + * page_table_lock + */ + *ptep = new_pte; + } + + err = 0; + + out: + spin_unlock(&mm->page_table_lock); + + return err; +} diff --git a/arch/powerpc/mm/imalloc.c b/arch/powerpc/mm/imalloc.c new file mode 100644 index 000000000000..c65b87b92756 --- /dev/null +++ b/arch/powerpc/mm/imalloc.c @@ -0,0 +1,317 @@ +/* + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +static DECLARE_MUTEX(imlist_sem); +struct vm_struct * imlist = NULL; + +static int get_free_im_addr(unsigned long size, unsigned long *im_addr) +{ + unsigned long addr; + struct vm_struct **p, *tmp; + + addr = ioremap_bot; + for (p = &imlist; (tmp = *p) ; p = &tmp->next) { + if (size + addr < (unsigned long) tmp->addr) + break; + if ((unsigned long)tmp->addr >= ioremap_bot) + addr = tmp->size + (unsigned long) tmp->addr; + if (addr >= IMALLOC_END-size) + return 1; + } + *im_addr = addr; + + return 0; +} + +/* Return whether the region described by v_addr and size is a subset + * of the region described by parent + */ +static inline int im_region_is_subset(unsigned long v_addr, unsigned long size, + struct vm_struct *parent) +{ + return (int) (v_addr >= (unsigned long) parent->addr && + v_addr < (unsigned long) parent->addr + parent->size && + size < parent->size); +} + +/* Return whether the region described by v_addr and size is a superset + * of the region described by child + */ +static int im_region_is_superset(unsigned long v_addr, unsigned long size, + struct vm_struct *child) +{ + struct vm_struct parent; + + parent.addr = (void *) v_addr; + parent.size = size; + + return 
im_region_is_subset((unsigned long) child->addr, child->size, + &parent); +} + +/* Return whether the region described by v_addr and size overlaps + * the region described by vm. Overlapping regions meet the + * following conditions: + * 1) The regions share some part of the address space + * 2) The regions aren't identical + * 3) Neither region is a subset of the other + */ +static int im_region_overlaps(unsigned long v_addr, unsigned long size, + struct vm_struct *vm) +{ + if (im_region_is_superset(v_addr, size, vm)) + return 0; + + return (v_addr + size > (unsigned long) vm->addr + vm->size && + v_addr < (unsigned long) vm->addr + vm->size) || + (v_addr < (unsigned long) vm->addr && + v_addr + size > (unsigned long) vm->addr); +} + +/* Determine imalloc status of region described by v_addr and size. + * Can return one of the following: + * IM_REGION_UNUSED - Entire region is unallocated in imalloc space. + * IM_REGION_SUBSET - Region is a subset of a region that is already + * allocated in imalloc space. + * vm will be assigned to a ptr to the parent region. + * IM_REGION_EXISTS - Exact region already allocated in imalloc space. + * vm will be assigned to a ptr to the existing imlist + * member. + * IM_REGION_OVERLAPS - Region overlaps an allocated region in imalloc space. + * IM_REGION_SUPERSET - Region is a superset of a region that is already + * allocated in imalloc space. 
+ */ +static int im_region_status(unsigned long v_addr, unsigned long size, + struct vm_struct **vm) +{ + struct vm_struct *tmp; + + for (tmp = imlist; tmp; tmp = tmp->next) + if (v_addr < (unsigned long) tmp->addr + tmp->size) + break; + + if (tmp) { + if (im_region_overlaps(v_addr, size, tmp)) + return IM_REGION_OVERLAP; + + *vm = tmp; + if (im_region_is_subset(v_addr, size, tmp)) { + /* Return with tmp pointing to superset */ + return IM_REGION_SUBSET; + } + if (im_region_is_superset(v_addr, size, tmp)) { + /* Return with tmp pointing to first subset */ + return IM_REGION_SUPERSET; + } + else if (v_addr == (unsigned long) tmp->addr && + size == tmp->size) { + /* Return with tmp pointing to exact region */ + return IM_REGION_EXISTS; + } + } + + *vm = NULL; + return IM_REGION_UNUSED; +} + +static struct vm_struct * split_im_region(unsigned long v_addr, + unsigned long size, struct vm_struct *parent) +{ + struct vm_struct *vm1 = NULL; + struct vm_struct *vm2 = NULL; + struct vm_struct *new_vm = NULL; + + vm1 = (struct vm_struct *) kmalloc(sizeof(*vm1), GFP_KERNEL); + if (vm1 == NULL) { + printk(KERN_ERR "%s() out of memory\n", __FUNCTION__); + return NULL; + } + + if (v_addr == (unsigned long) parent->addr) { + /* Use existing parent vm_struct to represent child, allocate + * new one for the remainder of parent range + */ + vm1->size = parent->size - size; + vm1->addr = (void *) (v_addr + size); + vm1->next = parent->next; + + parent->size = size; + parent->next = vm1; + new_vm = parent; + } else if (v_addr + size == (unsigned long) parent->addr + + parent->size) { + /* Allocate new vm_struct to represent child, use existing + * parent one for remainder of parent range + */ + vm1->size = size; + vm1->addr = (void *) v_addr; + vm1->next = parent->next; + new_vm = vm1; + + parent->size -= size; + parent->next = vm1; + } else { + /* Allocate two new vm_structs for the new child and + * uppermost remainder, and use existing parent one for the + * lower remainder of 
parent range + */ + vm2 = (struct vm_struct *) kmalloc(sizeof(*vm2), GFP_KERNEL); + if (vm2 == NULL) { + printk(KERN_ERR "%s() out of memory\n", __FUNCTION__); + kfree(vm1); + return NULL; + } + + vm1->size = size; + vm1->addr = (void *) v_addr; + vm1->next = vm2; + new_vm = vm1; + + vm2->size = ((unsigned long) parent->addr + parent->size) - + (v_addr + size); + vm2->addr = (void *) v_addr + size; + vm2->next = parent->next; + + parent->size = v_addr - (unsigned long) parent->addr; + parent->next = vm1; + } + + return new_vm; +} + +static struct vm_struct * __add_new_im_area(unsigned long req_addr, + unsigned long size) +{ + struct vm_struct **p, *tmp, *area; + + for (p = &imlist; (tmp = *p) ; p = &tmp->next) { + if (req_addr + size <= (unsigned long)tmp->addr) + break; + } + + area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + area->flags = 0; + area->addr = (void *)req_addr; + area->size = size; + area->next = *p; + *p = area; + + return area; +} + +static struct vm_struct * __im_get_area(unsigned long req_addr, + unsigned long size, + int criteria) +{ + struct vm_struct *tmp; + int status; + + status = im_region_status(req_addr, size, &tmp); + if ((criteria & status) == 0) { + return NULL; + } + + switch (status) { + case IM_REGION_UNUSED: + tmp = __add_new_im_area(req_addr, size); + break; + case IM_REGION_SUBSET: + tmp = split_im_region(req_addr, size, tmp); + break; + case IM_REGION_EXISTS: + /* Return requested region */ + break; + case IM_REGION_SUPERSET: + /* Return first existing subset of requested region */ + break; + default: + printk(KERN_ERR "%s() unexpected imalloc region status\n", + __FUNCTION__); + tmp = NULL; + } + + return tmp; +} + +struct vm_struct * im_get_free_area(unsigned long size) +{ + struct vm_struct *area; + unsigned long addr; + + down(&imlist_sem); + if (get_free_im_addr(size, &addr)) { + printk(KERN_ERR "%s() cannot obtain addr for size 0x%lx\n", + __FUNCTION__, size); + area = NULL; + 
goto next_im_done; + } + + area = __im_get_area(addr, size, IM_REGION_UNUSED); + if (area == NULL) { + printk(KERN_ERR + "%s() cannot obtain area for addr 0x%lx size 0x%lx\n", + __FUNCTION__, addr, size); + } +next_im_done: + up(&imlist_sem); + return area; +} + +struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size, + int criteria) +{ + struct vm_struct *area; + + down(&imlist_sem); + area = __im_get_area(v_addr, size, criteria); + up(&imlist_sem); + return area; +} + +void im_free(void * addr) +{ + struct vm_struct **p, *tmp; + + if (!addr) + return; + if ((unsigned long) addr & ~PAGE_MASK) { + printk(KERN_ERR "Trying to %s bad address (%p)\n", __FUNCTION__, addr); + return; + } + down(&imlist_sem); + for (p = &imlist ; (tmp = *p) ; p = &tmp->next) { + if (tmp->addr == addr) { + *p = tmp->next; + + /* XXX: do we need the lock? */ + spin_lock(&init_mm.page_table_lock); + unmap_vm_area(tmp); + spin_unlock(&init_mm.page_table_lock); + + kfree(tmp); + up(&imlist_sem); + return; + } + } + up(&imlist_sem); + printk(KERN_ERR "Trying to %s nonexistent area (%p)\n", __FUNCTION__, + addr); +} diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index c0ce6a7af3c7..b0fc822ec29f 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -73,18 +73,8 @@ #warning TASK_SIZE is smaller than it needs to be. 
#endif -int mem_init_done; -unsigned long ioremap_bot = IMALLOC_BASE; -static unsigned long phbs_io_bot = PHBS_IO_BASE; - -extern pgd_t swapper_pg_dir[]; -extern struct task_struct *current_set[NR_CPUS]; - unsigned long klimit = (unsigned long)_end; -unsigned long _SDR1=0; -unsigned long _ASR=0; - /* max amount of RAM to use */ unsigned long __max_memory; @@ -193,19 +183,6 @@ static int __init setup_kcore(void) } module_init(setup_kcore); -void __iomem * reserve_phb_iospace(unsigned long size) -{ - void __iomem *virt_addr; - - if (phbs_io_bot >= IMALLOC_BASE) - panic("reserve_phb_iospace(): phb io space overflow\n"); - - virt_addr = (void __iomem *) phbs_io_bot; - phbs_io_bot += size; - - return virt_addr; -} - static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags) { memset(addr, 0, kmem_cache_size(cache)); @@ -244,16 +221,3 @@ void pgtable_cache_init(void) name); } } - -pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, - unsigned long size, pgprot_t vma_prot) -{ - if (ppc_md.phys_mem_access_prot) - return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot); - - if (!page_is_ram(addr >> PAGE_SHIFT)) - vma_prot = __pgprot(pgprot_val(vma_prot) - | _PAGE_GUARDED | _PAGE_NO_CACHE); - return vma_prot; -} -EXPORT_SYMBOL(phys_mem_access_prot); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 0650de74d0b3..55b5860ed3c9 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -47,6 +47,9 @@ #include #include #include +#ifdef CONFIG_PPC64 +#include +#endif #include "mmu_decl.h" @@ -334,7 +337,7 @@ void flush_dcache_icache_page(struct page *page) void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE); __flush_dcache_icache(start); kunmap_atomic(start, KM_PPC_SYNC_ICACHE); -#elif defined(CONFIG_8xx) +#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64) /* On 8xx there is no need to kmap since highmem is not supported */ __flush_dcache_icache(page_address(page)); #else @@ -463,18 +466,18 @@ void 
update_mmu_cache(struct vm_area_struct *vma, unsigned long address, if (pgdir == NULL) return; - ptep = find_linux_pte(pgdir, ea); + ptep = find_linux_pte(pgdir, address); if (!ptep) return; - vsid = get_vsid(vma->vm_mm->context.id, ea); + vsid = get_vsid(vma->vm_mm->context.id, address); local_irq_save(flags); tmp = cpumask_of_cpu(smp_processor_id()); if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp)) local = 1; - __hash_page(ea, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep, + __hash_page(address, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep, 0x300, local); local_irq_restore(flags); #endif diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c new file mode 100644 index 000000000000..fe65f522aff3 --- /dev/null +++ b/arch/powerpc/mm/mmap.c @@ -0,0 +1,86 @@ +/* + * linux/arch/ppc64/mm/mmap.c + * + * flexible mmap layout support + * + * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * Started by Ingo Molnar + */ + +#include +#include + +/* + * Top of mmap area (just below the process stack). + * + * Leave an at least ~128 MB hole. 
+ */ +#define MIN_GAP (128*1024*1024) +#define MAX_GAP (TASK_SIZE/6*5) + +static inline unsigned long mmap_base(void) +{ + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; + + if (gap < MIN_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + + return TASK_SIZE - (gap & PAGE_MASK); +} + +static inline int mmap_is_legacy(void) +{ + /* + * Force standard allocation for 64 bit programs. + */ + if (!test_thread_flag(TIF_32BIT)) + return 1; + + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) + return 1; + + return sysctl_legacy_va_layout; +} + +/* + * This function, called very early during the creation of a new + * process VM image, sets up which VM layout function to use: + */ +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + /* + * Fall back to the standard layout if the personality + * bit is set, or if the expected stack growth is unlimited: + */ + if (mmap_is_legacy()) { + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { + mm->mmap_base = mmap_base(); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + mm->unmap_area = arch_unmap_area_topdown; + } +} diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 06fe8af3af55..a4d7a327c0e5 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -22,11 +22,11 @@ #include #include +#ifdef CONFIG_PPC32 extern void mapin_ram(void); extern int map_page(unsigned long va, phys_addr_t pa, int flags); extern void setbat(int index, unsigned long virt, unsigned long phys, unsigned int size, int flags); -extern void reserve_phys_mem(unsigned long start, unsigned long size); extern void settlbcam(int index, unsigned long virt, phys_addr_t phys, unsigned int size, int flags, unsigned int pid); extern void invalidate_tlbcam_entry(int index); @@ -36,16 +36,16 @@ extern unsigned long 
ioremap_base; extern unsigned long ioremap_bot; extern unsigned int rtas_data, rtas_size; -extern unsigned long __max_low_memory; -extern unsigned long __initial_memory_limit; -extern unsigned long total_memory; -extern unsigned long total_lowmem; -extern int mem_init_done; - extern PTE *Hash, *Hash_end; extern unsigned long Hash_size, Hash_mask; extern unsigned int num_tlbcam_entries; +#endif + +extern unsigned long __max_low_memory; +extern unsigned long __initial_memory_limit; +extern unsigned long total_memory; +extern unsigned long total_lowmem; /* ...and now those things that may be slightly different between processor * architectures. -- Dan @@ -66,8 +66,8 @@ extern void MMU_init_hw(void); extern unsigned long mmu_mapin_ram(void); extern void adjust_total_lowmem(void); -#else -/* anything except 4xx or 8xx */ +#elif defined(CONFIG_PPC32) +/* anything 32-bit except 4xx or 8xx */ extern void MMU_init_hw(void); extern unsigned long mmu_mapin_ram(void); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c new file mode 100644 index 000000000000..cb864b8f2750 --- /dev/null +++ b/arch/powerpc/mm/numa.c @@ -0,0 +1,779 @@ +/* + * pSeries NUMA support + * + * Copyright (C) 2002 Anton Blanchard , IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int numa_enabled = 1; + +static int numa_debug; +#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } + +#ifdef DEBUG_NUMA +#define ARRAY_INITIALISER -1 +#else +#define ARRAY_INITIALISER 0 +#endif + +int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... 
(NR_CPUS - 1)] = + ARRAY_INITIALISER}; +char *numa_memory_lookup_table; +cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES]; +int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0}; + +struct pglist_data *node_data[MAX_NUMNODES]; +bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; +static int min_common_depth; + +/* + * We need somewhere to store start/span for each node until we have + * allocated the real node_data structures. + */ +static struct { + unsigned long node_start_pfn; + unsigned long node_end_pfn; + unsigned long node_present_pages; +} init_node_data[MAX_NUMNODES] __initdata; + +EXPORT_SYMBOL(node_data); +EXPORT_SYMBOL(numa_cpu_lookup_table); +EXPORT_SYMBOL(numa_memory_lookup_table); +EXPORT_SYMBOL(numa_cpumask_lookup_table); +EXPORT_SYMBOL(nr_cpus_in_node); + +static inline void map_cpu_to_node(int cpu, int node) +{ + numa_cpu_lookup_table[cpu] = node; + if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) { + cpu_set(cpu, numa_cpumask_lookup_table[node]); + nr_cpus_in_node[node]++; + } +} + +#ifdef CONFIG_HOTPLUG_CPU +static void unmap_cpu_from_node(unsigned long cpu) +{ + int node = numa_cpu_lookup_table[cpu]; + + dbg("removing cpu %lu from node %d\n", cpu, node); + + if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) { + cpu_clear(cpu, numa_cpumask_lookup_table[node]); + nr_cpus_in_node[node]--; + } else { + printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", + cpu, node); + } +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static struct device_node * __devinit find_cpu_node(unsigned int cpu) +{ + unsigned int hw_cpuid = get_hard_smp_processor_id(cpu); + struct device_node *cpu_node = NULL; + unsigned int *interrupt_server, *reg; + int len; + + while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) { + /* Try interrupt server first */ + interrupt_server = (unsigned int *)get_property(cpu_node, + "ibm,ppc-interrupt-server#s", &len); + + len = len / sizeof(u32); + + if (interrupt_server && (len > 0)) { + while 
(len--) { + if (interrupt_server[len] == hw_cpuid) + return cpu_node; + } + } else { + reg = (unsigned int *)get_property(cpu_node, + "reg", &len); + if (reg && (len > 0) && (reg[0] == hw_cpuid)) + return cpu_node; + } + } + + return NULL; +} + +/* must hold reference to node during call */ +static unsigned int *of_get_associativity(struct device_node *dev) +{ + return (unsigned int *)get_property(dev, "ibm,associativity", NULL); +} + +static int of_node_numa_domain(struct device_node *device) +{ + int numa_domain; + unsigned int *tmp; + + if (min_common_depth == -1) + return 0; + + tmp = of_get_associativity(device); + if (tmp && (tmp[0] >= min_common_depth)) { + numa_domain = tmp[min_common_depth]; + } else { + dbg("WARNING: no NUMA information for %s\n", + device->full_name); + numa_domain = 0; + } + return numa_domain; +} + +/* + * In theory, the "ibm,associativity" property may contain multiple + * associativity lists because a resource may be multiply connected + * into the machine. This resource then has different associativity + * characteristics relative to its multiple connections. We ignore + * this for now. We also assume that all cpu and memory sets have + * their distances represented at a common level. This won't be + * true for hierarchical NUMA. + * + * In any case the ibm,associativity-reference-points should give + * the correct depth for a normal NUMA system. + * + * - Dave Hansen + */ +static int __init find_min_common_depth(void) +{ + int depth; + unsigned int *ref_points; + struct device_node *rtas_root; + unsigned int len; + + rtas_root = of_find_node_by_path("/rtas"); + + if (!rtas_root) + return -1; + + /* + * this property is 2 32-bit integers, each representing a level of + * depth in the associativity nodes. The first is for an SMP + * configuration (should be all 0's) and the second is for a normal + * NUMA configuration.
+ */ + ref_points = (unsigned int *)get_property(rtas_root, + "ibm,associativity-reference-points", &len); + + if (ref_points && (len >= 2 * sizeof(unsigned int))) { /* len is a byte count; both cells must be present */ + depth = ref_points[1]; + } else { + dbg("WARNING: could not find NUMA " + "associativity reference point\n"); + depth = -1; + } + of_node_put(rtas_root); + + return depth; +} + +static int __init get_mem_addr_cells(void) +{ + struct device_node *memory = NULL; + int rc; + + memory = of_find_node_by_type(memory, "memory"); + if (!memory) + return 0; /* it won't matter */ + + rc = prom_n_addr_cells(memory); + return rc; +} + +static int __init get_mem_size_cells(void) +{ + struct device_node *memory = NULL; + int rc; + + memory = of_find_node_by_type(memory, "memory"); + if (!memory) + return 0; /* it won't matter */ + rc = prom_n_size_cells(memory); + return rc; +} + +static unsigned long read_n_cells(int n, unsigned int **buf) +{ + unsigned long result = 0; + + while (n--) { + result = (result << 32) | **buf; + (*buf)++; + } + return result; +} + +/* + * Figure out to which domain a cpu belongs and stick it there. + * Return the id of the domain used. + */ +static int numa_setup_cpu(unsigned long lcpu) +{ + int numa_domain = 0; + struct device_node *cpu = find_cpu_node(lcpu); + + if (!cpu) { + WARN_ON(1); + goto out; + } + + numa_domain = of_node_numa_domain(cpu); + + if (numa_domain >= num_online_nodes()) { + /* + * POWER4 LPAR uses 0xffff as invalid node, + * don't warn in this case.
+ */ + if (numa_domain != 0xffff) + printk(KERN_ERR "WARNING: cpu %ld " + "maps to invalid NUMA node %d\n", + lcpu, numa_domain); + numa_domain = 0; + } +out: + node_set_online(numa_domain); + + map_cpu_to_node(lcpu, numa_domain); + + of_node_put(cpu); + + return numa_domain; +} + +static int cpu_numa_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned long lcpu = (unsigned long)hcpu; + int ret = NOTIFY_DONE; + + switch (action) { + case CPU_UP_PREPARE: + if (min_common_depth == -1 || !numa_enabled) + map_cpu_to_node(lcpu, 0); + else + numa_setup_cpu(lcpu); + ret = NOTIFY_OK; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + case CPU_UP_CANCELED: + unmap_cpu_from_node(lcpu); + ret = NOTIFY_OK; + break; +#endif + } + return ret; +} + +/* + * Check and possibly modify a memory region to enforce the memory limit. + * + * Returns the size the region should have to enforce the memory limit. + * This will either be the original value of size, a truncated value, + * or zero. If the returned value of size is 0 the region should be + * discarded as it lies wholly above the memory limit. + */ +static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size) +{ + /* + * We use lmb_end_of_DRAM() in here instead of memory_limit because + * we've already adjusted it for the limit and it takes care of + * having memory holes below the limit. + */ + extern unsigned long memory_limit; + + if (!
memory_limit) + return size; + + if (start + size <= lmb_end_of_DRAM()) + return size; + + if (start >= lmb_end_of_DRAM()) + return 0; + + return lmb_end_of_DRAM() - start; +} + +static int __init parse_numa_properties(void) +{ + struct device_node *cpu = NULL; + struct device_node *memory = NULL; + int addr_cells, size_cells; + int max_domain = 0; + long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT; + unsigned long i; + + if (numa_enabled == 0) { + printk(KERN_WARNING "NUMA disabled by user\n"); + return -1; + } + + numa_memory_lookup_table = + (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1)); + memset(numa_memory_lookup_table, 0, entries * sizeof(char)); + + for (i = 0; i < entries ; i++) + numa_memory_lookup_table[i] = ARRAY_INITIALISER; + + min_common_depth = find_min_common_depth(); + + dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); + if (min_common_depth < 0) + return min_common_depth; + + max_domain = numa_setup_cpu(boot_cpuid); + + /* + * Even though we connect cpus to numa domains later in SMP init, + * we need to know the maximum node id now. This is because each + * node id must have NODE_DATA etc backing it. + * As a result of hotplug we could still have cpus appear later on + * with larger node ids. In that case we force the cpu into node 0. 
+ */ + for_each_cpu(i) { + int numa_domain; + + cpu = find_cpu_node(i); + + if (cpu) { + numa_domain = of_node_numa_domain(cpu); + of_node_put(cpu); + + if (numa_domain < MAX_NUMNODES && + max_domain < numa_domain) + max_domain = numa_domain; + } + } + + addr_cells = get_mem_addr_cells(); + size_cells = get_mem_size_cells(); + memory = NULL; + while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + unsigned long start; + unsigned long size; + int numa_domain; + int ranges; + unsigned int *memcell_buf; + unsigned int len; + + memcell_buf = (unsigned int *)get_property(memory, "reg", &len); + if (!memcell_buf || len <= 0) + continue; + + ranges = memory->n_addrs; +new_range: + /* these are order-sensitive, and modify the buffer pointer */ + start = read_n_cells(addr_cells, &memcell_buf); + size = read_n_cells(size_cells, &memcell_buf); + + start = _ALIGN_DOWN(start, MEMORY_INCREMENT); + size = _ALIGN_UP(size, MEMORY_INCREMENT); + + numa_domain = of_node_numa_domain(memory); + + if (numa_domain >= MAX_NUMNODES) { + if (numa_domain != 0xffff) + printk(KERN_ERR "WARNING: memory at %lx maps " + "to invalid NUMA node %d\n", start, + numa_domain); + numa_domain = 0; + } + + if (max_domain < numa_domain) + max_domain = numa_domain; + + if (! (size = numa_enforce_memory_limit(start, size))) { + if (--ranges) + goto new_range; + else + continue; + } + + /* + * Initialize new node struct, or add to an existing one. 
+ */ + if (init_node_data[numa_domain].node_end_pfn) { + if ((start / PAGE_SIZE) < + init_node_data[numa_domain].node_start_pfn) + init_node_data[numa_domain].node_start_pfn = + start / PAGE_SIZE; + if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) > + init_node_data[numa_domain].node_end_pfn) + init_node_data[numa_domain].node_end_pfn = + (start / PAGE_SIZE) + + (size / PAGE_SIZE); + + init_node_data[numa_domain].node_present_pages += + size / PAGE_SIZE; + } else { + node_set_online(numa_domain); + + init_node_data[numa_domain].node_start_pfn = + start / PAGE_SIZE; + init_node_data[numa_domain].node_end_pfn = + init_node_data[numa_domain].node_start_pfn + + size / PAGE_SIZE; + init_node_data[numa_domain].node_present_pages = + size / PAGE_SIZE; + } + + for (i = start ; i < (start+size); i += MEMORY_INCREMENT) + numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = + numa_domain; + + if (--ranges) + goto new_range; + } + + for (i = 0; i <= max_domain; i++) + node_set_online(i); + + return 0; +} + +static void __init setup_nonnuma(void) +{ + unsigned long top_of_ram = lmb_end_of_DRAM(); + unsigned long total_ram = lmb_phys_mem_size(); + unsigned long i; + + printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", + top_of_ram, total_ram); + printk(KERN_INFO "Memory hole size: %ldMB\n", + (top_of_ram - total_ram) >> 20); + + if (!numa_memory_lookup_table) { + long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT; + numa_memory_lookup_table = + (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1)); + memset(numa_memory_lookup_table, 0, entries * sizeof(char)); + for (i = 0; i < entries ; i++) + numa_memory_lookup_table[i] = ARRAY_INITIALISER; + } + + map_cpu_to_node(boot_cpuid, 0); + + node_set_online(0); + + init_node_data[0].node_start_pfn = 0; + init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE; + init_node_data[0].node_present_pages = total_ram / PAGE_SIZE; + + for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT) + numa_memory_lookup_table[i >> 
MEMORY_INCREMENT_SHIFT] = 0; +} + +static void __init dump_numa_topology(void) +{ + unsigned int node; + unsigned int count; + + if (min_common_depth == -1 || !numa_enabled) + return; + + for_each_online_node(node) { + unsigned long i; + + printk(KERN_INFO "Node %d Memory:", node); + + count = 0; + + for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) { + if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) { + if (count == 0) + printk(" 0x%lx", i); + ++count; + } else { + if (count > 0) + printk("-0x%lx", i); + count = 0; + } + } + + if (count > 0) + printk("-0x%lx", i); + printk("\n"); + } + return; +} + +/* + * Allocate some memory, satisfying the lmb or bootmem allocator where + * required. nid is the preferred node and end is the physical address of + * the highest address in the node. + * + * Returns the physical address of the memory. + */ +static unsigned long careful_allocation(int nid, unsigned long size, + unsigned long align, unsigned long end) +{ + unsigned long ret = lmb_alloc_base(size, align, end); + + /* retry over all memory */ + if (!ret) + ret = lmb_alloc_base(size, align, lmb_end_of_DRAM()); + + if (!ret) + panic("numa.c: cannot allocate %lu bytes on node %d", + size, nid); + + /* + * If the memory came from a previously allocated node, we must + * retry with the bootmem allocator. + */ + if (pa_to_nid(ret) < nid) { + nid = pa_to_nid(ret); + ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid), + size, align, 0); + + if (!ret) + panic("numa.c: cannot allocate %lu bytes on node %d", + size, nid); + + ret = virt_to_abs(ret); + + dbg("alloc_bootmem %lx %lx\n", ret, size); + } + + return ret; +} + +void __init do_init_bootmem(void) +{ + int nid; + int addr_cells, size_cells; + struct device_node *memory = NULL; + static struct notifier_block ppc64_numa_nb = { + .notifier_call = cpu_numa_callback, + .priority = 1 /* Must run before sched domains notifier. 
*/ + }; + + min_low_pfn = 0; + max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT; + max_pfn = max_low_pfn; + + if (parse_numa_properties()) + setup_nonnuma(); + else + dump_numa_topology(); + + register_cpu_notifier(&ppc64_numa_nb); + + for_each_online_node(nid) { + unsigned long start_paddr, end_paddr; + int i; + unsigned long bootmem_paddr; + unsigned long bootmap_pages; + + start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE; + end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE; + + /* Allocate the node structure node local if possible */ + NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid, + sizeof(struct pglist_data), + SMP_CACHE_BYTES, end_paddr); + NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid)); + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + + dbg("node %d\n", nid); + dbg("NODE_DATA() = %p\n", NODE_DATA(nid)); + + NODE_DATA(nid)->bdata = &plat_node_bdata[nid]; + NODE_DATA(nid)->node_start_pfn = + init_node_data[nid].node_start_pfn; + NODE_DATA(nid)->node_spanned_pages = + (end_paddr - start_paddr) >> PAGE_SHIFT; /* a page count, not bytes */ + + if (NODE_DATA(nid)->node_spanned_pages == 0) + continue; + + dbg("start_paddr = %lx\n", start_paddr); + dbg("end_paddr = %lx\n", end_paddr); + + bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT); + + bootmem_paddr = careful_allocation(nid, + bootmap_pages << PAGE_SHIFT, + PAGE_SIZE, end_paddr); + memset(abs_to_virt(bootmem_paddr), 0, + bootmap_pages << PAGE_SHIFT); + dbg("bootmap_paddr = %lx\n", bootmem_paddr); + + init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT, + start_paddr >> PAGE_SHIFT, + end_paddr >> PAGE_SHIFT); + + /* + * We need to do another scan of all memory sections to + * associate memory with the correct node.
+ */ + addr_cells = get_mem_addr_cells(); + size_cells = get_mem_size_cells(); + memory = NULL; + while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + unsigned long mem_start, mem_size; + int numa_domain, ranges; + unsigned int *memcell_buf; + unsigned int len; + + memcell_buf = (unsigned int *)get_property(memory, "reg", &len); + if (!memcell_buf || len <= 0) + continue; + + ranges = memory->n_addrs; /* ranges in cell */ +new_range: + mem_start = read_n_cells(addr_cells, &memcell_buf); + mem_size = read_n_cells(size_cells, &memcell_buf); + if (numa_enabled) { + numa_domain = of_node_numa_domain(memory); + if (numa_domain >= MAX_NUMNODES) + numa_domain = 0; + } else + numa_domain = 0; + + if (numa_domain != nid) + continue; + + mem_size = numa_enforce_memory_limit(mem_start, mem_size); + if (mem_size) { + dbg("free_bootmem %lx %lx\n", mem_start, mem_size); + free_bootmem_node(NODE_DATA(nid), mem_start, mem_size); + } + + if (--ranges) /* process all ranges in cell */ + goto new_range; + } + + /* + * Mark reserved regions on this node + */ + for (i = 0; i < lmb.reserved.cnt; i++) { + unsigned long physbase = lmb.reserved.region[i].base; + unsigned long size = lmb.reserved.region[i].size; + + if (pa_to_nid(physbase) != nid && + pa_to_nid(physbase+size-1) != nid) + continue; + + if (physbase < end_paddr && + (physbase+size) > start_paddr) { + /* overlaps */ + if (physbase < start_paddr) { + size -= start_paddr - physbase; + physbase = start_paddr; + } + + if (size > end_paddr - physbase) + size = end_paddr - physbase; + + dbg("reserve_bootmem %lx %lx\n", physbase, + size); + reserve_bootmem_node(NODE_DATA(nid), physbase, + size); + } + } + /* + * This loop may look familiar, but we have to do it again + * after marking our reserved memory to mark memory present + * for sparsemem.
+ */ + addr_cells = get_mem_addr_cells(); + size_cells = get_mem_size_cells(); + memory = NULL; + while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + unsigned long mem_start, mem_size; + int numa_domain, ranges; + unsigned int *memcell_buf; + unsigned int len; + + memcell_buf = (unsigned int *)get_property(memory, "reg", &len); + if (!memcell_buf || len <= 0) + continue; + + ranges = memory->n_addrs; /* ranges in cell */ +new_range2: + mem_start = read_n_cells(addr_cells, &memcell_buf); + mem_size = read_n_cells(size_cells, &memcell_buf); + if (numa_enabled) { + numa_domain = of_node_numa_domain(memory); + if (numa_domain >= MAX_NUMNODES) + numa_domain = 0; + } else + numa_domain = 0; + + if (numa_domain != nid) + continue; + + mem_size = numa_enforce_memory_limit(mem_start, mem_size); + memory_present(numa_domain, mem_start >> PAGE_SHIFT, + (mem_start + mem_size) >> PAGE_SHIFT); + + if (--ranges) /* process all ranges in cell */ + goto new_range2; + } + + } +} + +void __init paging_init(void) +{ + unsigned long zones_size[MAX_NR_ZONES]; + unsigned long zholes_size[MAX_NR_ZONES]; + int nid; + + memset(zones_size, 0, sizeof(zones_size)); + memset(zholes_size, 0, sizeof(zholes_size)); + + for_each_online_node(nid) { + unsigned long start_pfn; + unsigned long end_pfn; + + start_pfn = init_node_data[nid].node_start_pfn; + end_pfn = init_node_data[nid].node_end_pfn; + + zones_size[ZONE_DMA] = end_pfn - start_pfn; + zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - + init_node_data[nid].node_present_pages; + + dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid, + zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]); + + free_area_init_node(nid, NODE_DATA(nid), zones_size, + start_pfn, zholes_size); + } +} + +static int __init early_numa(char *p) +{ + if (!p) + return 0; + + if (strstr(p, "off")) + numa_enabled = 0; + + if (strstr(p, "debug")) + numa_debug = 1; + + return 0; +} +early_param("numa", early_numa); diff --git 
a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 724f97e5dee5..484d24f9208b 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -67,30 +67,9 @@ #include #include -#if PGTABLE_RANGE > USER_VSID_RANGE -#warning Limited user VSID range means pagetable space is wasted -#endif - -#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) -#warning TASK_SIZE is smaller than it needs to be. -#endif - -int mem_init_done; unsigned long ioremap_bot = IMALLOC_BASE; static unsigned long phbs_io_bot = PHBS_IO_BASE; -extern pgd_t swapper_pg_dir[]; -extern struct task_struct *current_set[NR_CPUS]; - -unsigned long klimit = (unsigned long)_end; - -/* max amount of RAM to use */ -unsigned long __max_memory; - -/* info on what we think the IO hole is */ -unsigned long io_hole_start; -unsigned long io_hole_size; - #ifdef CONFIG_PPC_ISERIES void __iomem *ioremap(unsigned long addr, unsigned long size) @@ -355,3 +334,16 @@ int iounmap_explicit(volatile void __iomem *start, unsigned long size) EXPORT_SYMBOL(ioremap); EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); + +void __iomem * reserve_phb_iospace(unsigned long size) +{ + void __iomem *virt_addr; + + if (phbs_io_bot >= IMALLOC_BASE) + panic("reserve_phb_iospace(): phb io space overflow\n"); + + virt_addr = (void __iomem *) phbs_io_bot; + phbs_io_bot += size; + + return virt_addr; +} diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c new file mode 100644 index 000000000000..0473953f6a37 --- /dev/null +++ b/arch/powerpc/mm/slb.c @@ -0,0 +1,158 @@ +/* + * PowerPC64 SLB support. 
+ * + * Copyright (C) 2004 David Gibson , IBM + * Based on earlier code written by: + * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com + * Copyright (c) 2001 Dave Engebretsen + * Copyright (C) 2002 Anton Blanchard , IBM + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +extern void slb_allocate(unsigned long ea); + +static inline unsigned long mk_esid_data(unsigned long ea, unsigned long slot) +{ + return (ea & ESID_MASK) | SLB_ESID_V | slot; +} + +static inline unsigned long mk_vsid_data(unsigned long ea, unsigned long flags) +{ + return (get_kernel_vsid(ea) << SLB_VSID_SHIFT) | flags; +} + +static inline void create_slbe(unsigned long ea, unsigned long flags, + unsigned long entry) +{ + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data(ea, flags)), + "r" (mk_esid_data(ea, entry)) + : "memory" ); +} + +static void slb_flush_and_rebolt(void) +{ + /* If you change this make sure you change SLB_NUM_BOLTED + * appropriately too. */ + unsigned long ksp_flags = SLB_VSID_KERNEL; + unsigned long ksp_esid_data; + + WARN_ON(!irqs_disabled()); + + if (cpu_has_feature(CPU_FTR_16M_PAGE)) + ksp_flags |= SLB_VSID_L; + + ksp_esid_data = mk_esid_data(get_paca()->kstack, 2); + if ((ksp_esid_data & ESID_MASK) == KERNELBASE) + ksp_esid_data &= ~SLB_ESID_V; + + /* We need to do this all in asm, so we're sure we don't touch + * the stack between the slbia and rebolting it.
*/ + asm volatile("isync\n" + "slbia\n" + /* Slot 1 - first VMALLOC segment */ + "slbmte %0,%1\n" + /* Slot 2 - kernel stack */ + "slbmte %2,%3\n" + "isync" + :: "r"(mk_vsid_data(VMALLOCBASE, SLB_VSID_KERNEL)), + "r"(mk_esid_data(VMALLOCBASE, 1)), + "r"(mk_vsid_data(ksp_esid_data, ksp_flags)), + "r"(ksp_esid_data) + : "memory"); +} + +/* Flush all user entries from the segment table of the current processor. */ +void switch_slb(struct task_struct *tsk, struct mm_struct *mm) +{ + unsigned long offset = get_paca()->slb_cache_ptr; + unsigned long esid_data = 0; + unsigned long pc = KSTK_EIP(tsk); + unsigned long stack = KSTK_ESP(tsk); + unsigned long unmapped_base; + + if (offset <= SLB_CACHE_ENTRIES) { + int i; + asm volatile("isync" : : : "memory"); + for (i = 0; i < offset; i++) { + esid_data = ((unsigned long)get_paca()->slb_cache[i] + << SID_SHIFT) | SLBIE_C; + asm volatile("slbie %0" : : "r" (esid_data)); + } + asm volatile("isync" : : : "memory"); + } else { + slb_flush_and_rebolt(); + } + + /* Workaround POWER5 < DD2.1 issue */ + if (offset == 1 || offset > SLB_CACHE_ENTRIES) + asm volatile("slbie %0" : : "r" (esid_data)); + + get_paca()->slb_cache_ptr = 0; + get_paca()->context = mm->context; + + /* + * preload some userspace segments into the SLB. 
+ */ + if (test_tsk_thread_flag(tsk, TIF_32BIT)) + unmapped_base = TASK_UNMAPPED_BASE_USER32; + else + unmapped_base = TASK_UNMAPPED_BASE_USER64; + + if (pc >= KERNELBASE) + return; + slb_allocate(pc); + + if (GET_ESID(pc) == GET_ESID(stack)) + return; + + if (stack >= KERNELBASE) + return; + slb_allocate(stack); + + if ((GET_ESID(pc) == GET_ESID(unmapped_base)) + || (GET_ESID(stack) == GET_ESID(unmapped_base))) + return; + + if (unmapped_base >= KERNELBASE) + return; + slb_allocate(unmapped_base); +} + +void slb_initialize(void) +{ + /* On iSeries the bolted entries have already been set up by + * the hypervisor from the lparMap data in head.S */ +#ifndef CONFIG_PPC_ISERIES + unsigned long flags = SLB_VSID_KERNEL; + + /* Invalidate the entire SLB (even slot 0) & all the ERATS */ + if (cpu_has_feature(CPU_FTR_16M_PAGE)) + flags |= SLB_VSID_L; + + asm volatile("isync":::"memory"); + asm volatile("slbmte %0,%0"::"r" (0) : "memory"); + asm volatile("isync; slbia; isync":::"memory"); + create_slbe(KERNELBASE, flags, 0); + create_slbe(VMALLOCBASE, SLB_VSID_KERNEL, 1); + /* We don't bolt the stack for the time being - we're in boot, + * so the stack is in the bolted segment. By the time it goes + * elsewhere, we'll call _switch() which will bolt in the new + * one. 
*/ + asm volatile("isync":::"memory"); +#endif + + get_paca()->stab_rr = SLB_NUM_BOLTED; +} diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S new file mode 100644 index 000000000000..a3a03da503bc --- /dev/null +++ b/arch/powerpc/mm/slb_low.S @@ -0,0 +1,151 @@ +/* + * arch/ppc64/mm/slb_low.S + * + * Low-level SLB routines + * + * Copyright (C) 2004 David Gibson , IBM + * + * Based on earlier C version: + * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com + * Copyright (c) 2001 Dave Engebretsen + * Copyright (C) 2002 Anton Blanchard , IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* void slb_allocate(unsigned long ea); + * + * Create an SLB entry for the given EA (user or kernel). + * r3 = faulting address, r13 = PACA + * r9, r10, r11 are clobbered by this function + * No other registers are examined or changed. + */ +_GLOBAL(slb_allocate) + /* + * First find a slot, round robin. Previously we tried to find + * a free slot first but that took too long. Unfortunately we + * dont have any LRU information to help us choose a slot. + */ +#ifdef CONFIG_PPC_ISERIES + /* + * On iSeries, the "bolted" stack segment can be cast out on + * shared processor switch so we need to check for a miss on + * it and restore it to the right slot. 
+ */ + ld r9,PACAKSAVE(r13) + clrrdi r9,r9,28 + clrrdi r11,r3,28 + li r10,SLB_NUM_BOLTED-1 /* Stack goes in last bolted slot */ + cmpld r9,r11 + beq 3f +#endif /* CONFIG_PPC_ISERIES */ + + ld r10,PACASTABRR(r13) + addi r10,r10,1 + /* use a cpu feature mask if we ever change our slb size */ + cmpldi r10,SLB_NUM_ENTRIES + + blt+ 4f + li r10,SLB_NUM_BOLTED + +4: + std r10,PACASTABRR(r13) +3: + /* r3 = faulting address, r10 = entry */ + + srdi r9,r3,60 /* get region */ + srdi r3,r3,28 /* get esid */ + cmpldi cr7,r9,0xc /* cmp KERNELBASE for later use */ + + rldimi r10,r3,28,0 /* r10= ESID<<28 | entry */ + oris r10,r10,SLB_ESID_V@h /* r10 |= SLB_ESID_V */ + + /* r3 = esid, r10 = esid_data, cr7 = <>KERNELBASE */ + + blt cr7,0f /* user or kernel? */ + + /* kernel address: proto-VSID = ESID */ + /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but + * this code will generate the protoVSID 0xfffffffff for the + * top segment. That's ok, the scramble below will translate + * it to VSID 0, which is reserved as a bad VSID - one which + * will never have any pages in it. */ + li r11,SLB_VSID_KERNEL +BEGIN_FTR_SECTION + bne cr7,9f + li r11,(SLB_VSID_KERNEL|SLB_VSID_L) +END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) + b 9f + +0: /* user address: proto-VSID = context<<15 | ESID */ + srdi. 
r9,r3,USER_ESID_BITS + bne- 8f /* invalid ea bits set */ + +#ifdef CONFIG_HUGETLB_PAGE +BEGIN_FTR_SECTION + lhz r9,PACAHIGHHTLBAREAS(r13) + srdi r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT) + srd r9,r9,r11 + lhz r11,PACALOWHTLBAREAS(r13) + srd r11,r11,r3 + or r9,r9,r11 +END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) +#endif /* CONFIG_HUGETLB_PAGE */ + + li r11,SLB_VSID_USER + +#ifdef CONFIG_HUGETLB_PAGE +BEGIN_FTR_SECTION + rldimi r11,r9,8,55 /* shift masked bit into SLB_VSID_L */ +END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) +#endif /* CONFIG_HUGETLB_PAGE */ + + ld r9,PACACONTEXTID(r13) + rldimi r3,r9,USER_ESID_BITS,0 + +9: /* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */ + ASM_VSID_SCRAMBLE(r3,r9) + + rldimi r11,r3,SLB_VSID_SHIFT,16 /* combine VSID and flags */ + + /* + * No need for an isync before or after this slbmte. The exception + * we enter with and the rfid we exit with are context synchronizing. + */ + slbmte r11,r10 + + bgelr cr7 /* we're done for kernel addresses */ + + /* Update the slb cache */ + lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ + cmpldi r3,SLB_CACHE_ENTRIES + bge 1f + + /* still room in the slb cache */ + sldi r11,r3,1 /* r11 = offset * sizeof(u16) */ + rldicl r10,r10,36,28 /* get low 16 bits of the ESID */ + add r11,r11,r13 /* r11 = (u16 *)paca + offset */ + sth r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ + addi r3,r3,1 /* offset++ */ + b 2f +1: /* offset >= SLB_CACHE_ENTRIES */ + li r3,SLB_CACHE_ENTRIES+1 +2: + sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ + blr + +8: /* invalid EA */ + li r3,0 /* BAD_VSID */ + li r11,SLB_VSID_USER /* flags don't much matter */ + b 9b diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c new file mode 100644 index 000000000000..1b83f002bf27 --- /dev/null +++ b/arch/powerpc/mm/stab.c @@ -0,0 +1,279 @@ +/* + * PowerPC64 Segment Translation Support. 
+ * + * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com + * Copyright (c) 2001 Dave Engebretsen + * + * Copyright (C) 2002 Anton Blanchard , IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct stab_entry { + unsigned long esid_data; + unsigned long vsid_data; +}; + +/* Both the segment table and SLB code uses the following cache */ +#define NR_STAB_CACHE_ENTRIES 8 +DEFINE_PER_CPU(long, stab_cache_ptr); +DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); + +/* + * Create a segment table entry for the given esid/vsid pair. + */ +static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid) +{ + unsigned long esid_data, vsid_data; + unsigned long entry, group, old_esid, castout_entry, i; + unsigned int global_entry; + struct stab_entry *ste, *castout_ste; + unsigned long kernel_segment = (esid << SID_SHIFT) >= KERNELBASE; + + vsid_data = vsid << STE_VSID_SHIFT; + esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V; + if (! kernel_segment) + esid_data |= STE_ESID_KS; + + /* Search the primary group first. */ + global_entry = (esid & 0x1f) << 3; + ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7)); + + /* Find an empty entry, if one exists. */ + for (group = 0; group < 2; group++) { + for (entry = 0; entry < 8; entry++, ste++) { + if (!(ste->esid_data & STE_ESID_V)) { + ste->vsid_data = vsid_data; + asm volatile("eieio":::"memory"); + ste->esid_data = esid_data; + return (global_entry | entry); + } + } + /* Now search the secondary group. 
*/ + global_entry = ((~esid) & 0x1f) << 3; + ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7)); + } + + /* + * Could not find empty entry, pick one with a round robin selection. + * Search all entries in the two groups. + */ + castout_entry = get_paca()->stab_rr; + for (i = 0; i < 16; i++) { + if (castout_entry < 8) { + global_entry = (esid & 0x1f) << 3; + ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7)); + castout_ste = ste + castout_entry; + } else { + global_entry = ((~esid) & 0x1f) << 3; + ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7)); + castout_ste = ste + (castout_entry - 8); + } + + /* Dont cast out the first kernel segment */ + if ((castout_ste->esid_data & ESID_MASK) != KERNELBASE) + break; + + castout_entry = (castout_entry + 1) & 0xf; + } + + get_paca()->stab_rr = (castout_entry + 1) & 0xf; + + /* Modify the old entry to the new value. */ + + /* Force previous translations to complete. DRENG */ + asm volatile("isync" : : : "memory"); + + old_esid = castout_ste->esid_data >> SID_SHIFT; + castout_ste->esid_data = 0; /* Invalidate old entry */ + + asm volatile("sync" : : : "memory"); /* Order update */ + + castout_ste->vsid_data = vsid_data; + asm volatile("eieio" : : : "memory"); /* Order update */ + castout_ste->esid_data = esid_data; + + asm volatile("slbie %0" : : "r" (old_esid << SID_SHIFT)); + /* Ensure completion of slbie */ + asm volatile("sync" : : : "memory"); + + return (global_entry | (castout_entry & 0x7)); +} + +/* + * Allocate a segment table entry for the given ea and mm + */ +static int __ste_allocate(unsigned long ea, struct mm_struct *mm) +{ + unsigned long vsid; + unsigned char stab_entry; + unsigned long offset; + + /* Kernel or user address? */ + if (ea >= KERNELBASE) { + vsid = get_kernel_vsid(ea); + } else { + if ((ea >= TASK_SIZE_USER64) || (! 
mm)) + return 1; + + vsid = get_vsid(mm->context.id, ea); + } + + stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid); + + if (ea < KERNELBASE) { + offset = __get_cpu_var(stab_cache_ptr); + if (offset < NR_STAB_CACHE_ENTRIES) + __get_cpu_var(stab_cache[offset++]) = stab_entry; + else + offset = NR_STAB_CACHE_ENTRIES+1; + __get_cpu_var(stab_cache_ptr) = offset; + + /* Order update */ + asm volatile("sync":::"memory"); + } + + return 0; +} + +int ste_allocate(unsigned long ea) +{ + return __ste_allocate(ea, current->mm); +} + +/* + * Do the segment table work for a context switch: flush all user + * entries from the table, then preload some probably useful entries + * for the new task + */ +void switch_stab(struct task_struct *tsk, struct mm_struct *mm) +{ + struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr; + struct stab_entry *ste; + unsigned long offset = __get_cpu_var(stab_cache_ptr); + unsigned long pc = KSTK_EIP(tsk); + unsigned long stack = KSTK_ESP(tsk); + unsigned long unmapped_base; + + /* Force previous translations to complete. DRENG */ + asm volatile("isync" : : : "memory"); + + if (offset <= NR_STAB_CACHE_ENTRIES) { + int i; + + for (i = 0; i < offset; i++) { + ste = stab + __get_cpu_var(stab_cache[i]); + ste->esid_data = 0; /* invalidate entry */ + } + } else { + unsigned long entry; + + /* Invalidate all entries. */ + ste = stab; + + /* Never flush the first entry. 
*/ + ste += 1; + for (entry = 1; + entry < (PAGE_SIZE / sizeof(struct stab_entry)); + entry++, ste++) { + unsigned long ea; + ea = ste->esid_data & ESID_MASK; + if (ea < KERNELBASE) { + ste->esid_data = 0; + } + } + } + + asm volatile("sync; slbia; sync":::"memory"); + + __get_cpu_var(stab_cache_ptr) = 0; + + /* Now preload some entries for the new task */ + if (test_tsk_thread_flag(tsk, TIF_32BIT)) + unmapped_base = TASK_UNMAPPED_BASE_USER32; + else + unmapped_base = TASK_UNMAPPED_BASE_USER64; + + __ste_allocate(pc, mm); + + if (GET_ESID(pc) == GET_ESID(stack)) + return; + + __ste_allocate(stack, mm); + + if ((GET_ESID(pc) == GET_ESID(unmapped_base)) + || (GET_ESID(stack) == GET_ESID(unmapped_base))) + return; + + __ste_allocate(unmapped_base, mm); + + /* Order update */ + asm volatile("sync" : : : "memory"); +} + +extern void slb_initialize(void); + +/* + * Allocate segment tables for secondary CPUs. These must all go in + * the first (bolted) segment, so that do_stab_bolted won't get a + * recursive segment miss on the segment table itself. + */ +void stabs_alloc(void) +{ + int cpu; + + if (cpu_has_feature(CPU_FTR_SLB)) + return; + + for_each_cpu(cpu) { + unsigned long newstab; + + if (cpu == 0) + continue; /* stab for CPU 0 is statically allocated */ + + newstab = lmb_alloc_base(PAGE_SIZE, PAGE_SIZE, 1< + * Rework for PPC64 port. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); + +/* This is declared as we are using the more or less generic + * include/asm-ppc64/tlb.h file -- tgall + */ +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); +unsigned long pte_freelist_forced_free; + +struct pte_freelist_batch +{ + struct rcu_head rcu; + unsigned int index; + pgtable_free_t tables[0]; +}; + +DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); +unsigned long pte_freelist_forced_free; + +#define PTE_FREELIST_SIZE \ + ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ + / sizeof(pgtable_free_t)) + +#ifdef CONFIG_SMP +static void pte_free_smp_sync(void *arg) +{ + /* Do nothing, just ensure we sync with all CPUs */ +} +#endif + +/* This is only called when we are critically out of memory + * (and fail to get a page in pte_free_tlb). + */ +static void pgtable_free_now(pgtable_free_t pgf) +{ + pte_freelist_forced_free++; + + smp_call_function(pte_free_smp_sync, NULL, 0, 1); + + pgtable_free(pgf); +} + +static void pte_free_rcu_callback(struct rcu_head *head) +{ + struct pte_freelist_batch *batch = + container_of(head, struct pte_freelist_batch, rcu); + unsigned int i; + + for (i = 0; i < batch->index; i++) + pgtable_free(batch->tables[i]); + + free_page((unsigned long)batch); +} + +static void pte_free_submit(struct pte_freelist_batch *batch) +{ + INIT_RCU_HEAD(&batch->rcu); + call_rcu(&batch->rcu, pte_free_rcu_callback); +} + +void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) +{ + /* This is safe as we are holding page_table_lock */ + cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); + struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); + + if (atomic_read(&tlb->mm->mm_users) < 2 || + cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) { + pgtable_free(pgf); + return; + } 
+ + if (*batchp == NULL) { + *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); + if (*batchp == NULL) { + pgtable_free_now(pgf); + return; + } + (*batchp)->index = 0; + } + (*batchp)->tables[(*batchp)->index++] = pgf; + if ((*batchp)->index == PTE_FREELIST_SIZE) { + pte_free_submit(*batchp); + *batchp = NULL; + } +} + +/* + * Update the MMU hash table to correspond with a change to + * a Linux PTE. If wrprot is true, it is permissible to + * change the existing HPTE to read-only rather than removing it + * (if we remove it we should clear the _PTE_HPTEFLAGS bits). + */ +void hpte_update(struct mm_struct *mm, unsigned long addr, + unsigned long pte, int wrprot) +{ + struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + unsigned long vsid; + int i; + + i = batch->index; + + /* + * This can happen when we are in the middle of a TLB batch and + * we encounter memory pressure (eg copy_page_range when it tries + * to allocate a new pte). If we have to reclaim memory and end + * up scanning and resetting referenced bits then our batch context + * will change mid stream. 
+ */ + if (i != 0 && (mm != batch->mm || batch->large != pte_huge(pte))) { + flush_tlb_pending(); + i = 0; + } + if (i == 0) { + batch->mm = mm; + batch->large = pte_huge(pte); + } + if (addr < KERNELBASE) { + vsid = get_vsid(mm->context.id, addr); + WARN_ON(vsid == 0); + } else + vsid = get_kernel_vsid(addr); + batch->vaddr[i] = (vsid << 28 ) | (addr & 0x0fffffff); + batch->pte[i] = __pte(pte); + batch->index = ++i; + if (i >= PPC64_TLB_BATCH_NR) + flush_tlb_pending(); +} + +void __flush_tlb_pending(struct ppc64_tlb_batch *batch) +{ + int i; + int cpu; + cpumask_t tmp; + int local = 0; + + BUG_ON(in_interrupt()); + + cpu = get_cpu(); + i = batch->index; + tmp = cpumask_of_cpu(cpu); + if (cpus_equal(batch->mm->cpu_vm_mask, tmp)) + local = 1; + + if (i == 1) + flush_hash_page(batch->vaddr[0], batch->pte[0], local); + else + flush_hash_range(i, local); + batch->index = 0; + put_cpu(); +} + +void pte_free_finish(void) +{ + /* This is safe as we are holding page_table_lock */ + struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); + + if (*batchp == NULL) + return; + pte_free_submit(*batchp); + *batchp = NULL; +} diff --git a/arch/ppc64/Makefile b/arch/ppc64/Makefile index fa889204d6ae..4a9928ef3032 100644 --- a/arch/ppc64/Makefile +++ b/arch/ppc64/Makefile @@ -83,7 +83,7 @@ head-y := arch/ppc64/kernel/head.o libs-y += arch/ppc64/lib/ core-y += arch/ppc64/kernel/ arch/powerpc/kernel/ -core-y += arch/ppc64/mm/ +core-y += arch/powerpc/mm/ core-y += arch/powerpc/platforms/ core-$(CONFIG_XMON) += arch/ppc64/xmon/ drivers-$(CONFIG_OPROFILE) += arch/powerpc/oprofile/ diff --git a/arch/ppc64/mm/Makefile b/arch/ppc64/mm/Makefile deleted file mode 100644 index 3695d00d347f..000000000000 --- a/arch/ppc64/mm/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# -# Makefile for the linux ppc-specific parts of the memory manager. 
-# - -EXTRA_CFLAGS += -mno-minimal-toc - -obj-y := fault.o init.o imalloc.o hash_utils.o hash_low.o tlb.o \ - slb_low.o slb.o stab.o mmap.o -obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_PPC_MULTIPLATFORM) += hash_native.o diff --git a/arch/ppc64/mm/fault.c b/arch/ppc64/mm/fault.c deleted file mode 100644 index be3f25cf3e9f..000000000000 --- a/arch/ppc64/mm/fault.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * arch/ppc/mm/fault.c - * - * PowerPC version - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Derived from "arch/i386/mm/fault.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * Modified by Cort Dougan and Paul Mackerras. - * - * Modified for PPC64 by Dave Engebretsen (engebret@ibm.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Check whether the instruction at regs->nip is a store using - * an update addressing form which will update r1. 
- */ -static int store_updates_sp(struct pt_regs *regs) -{ - unsigned int inst; - - if (get_user(inst, (unsigned int __user *)regs->nip)) - return 0; - /* check for 1 in the rA field */ - if (((inst >> 16) & 0x1f) != 1) - return 0; - /* check major opcode */ - switch (inst >> 26) { - case 37: /* stwu */ - case 39: /* stbu */ - case 45: /* sthu */ - case 53: /* stfsu */ - case 55: /* stfdu */ - return 1; - case 62: /* std or stdu */ - return (inst & 3) == 1; - case 31: - /* check minor opcode */ - switch ((inst >> 1) & 0x3ff) { - case 181: /* stdux */ - case 183: /* stwux */ - case 247: /* stbux */ - case 439: /* sthux */ - case 695: /* stfsux */ - case 759: /* stfdux */ - return 1; - } - } - return 0; -} - -static void do_dabr(struct pt_regs *regs, unsigned long error_code) -{ - siginfo_t info; - - if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code, - 11, SIGSEGV) == NOTIFY_STOP) - return; - - if (debugger_dabr_match(regs)) - return; - - /* Clear the DABR */ - set_dabr(0); - - /* Deliver the signal to userspace */ - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_HWBKPT; - info.si_addr = (void __user *)regs->nip; - force_sig_info(SIGTRAP, &info, current); -} - -/* - * The error_code parameter is - * - DSISR for a non-SLB data access fault, - * - SRR1 & 0x08000000 for a non-SLB instruction access fault - * - 0 any SLB fault. - * The return value is 0 if the fault was handled, or the signal - * number if this is a kernel fault that can't be handled here. 
- */ -int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) -{ - struct vm_area_struct * vma; - struct mm_struct *mm = current->mm; - siginfo_t info; - unsigned long code = SEGV_MAPERR; - unsigned long is_write = error_code & DSISR_ISSTORE; - unsigned long trap = TRAP(regs); - unsigned long is_exec = trap == 0x400; - - BUG_ON((trap == 0x380) || (trap == 0x480)); - - if (notify_die(DIE_PAGE_FAULT, "page_fault", regs, error_code, - 11, SIGSEGV) == NOTIFY_STOP) - return 0; - - if (trap == 0x300) { - if (debugger_fault_handler(regs)) - return 0; - } - - /* On a kernel SLB miss we can only check for a valid exception entry */ - if (!user_mode(regs) && (address >= TASK_SIZE)) - return SIGSEGV; - - if (error_code & DSISR_DABRMATCH) { - do_dabr(regs, error_code); - return 0; - } - - if (in_atomic() || mm == NULL) { - if (!user_mode(regs)) - return SIGSEGV; - /* in_atomic() in user mode is really bad, - as is current->mm == NULL. */ - printk(KERN_EMERG "Page fault in user mode with" - "in_atomic() = %d mm = %p\n", in_atomic(), mm); - printk(KERN_EMERG "NIP = %lx MSR = %lx\n", - regs->nip, regs->msr); - die("Weird page fault", regs, SIGSEGV); - } - - /* When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunatly, in the case of an - * erroneous fault occuring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibilty of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. 
If this is invalid we can skip the address space check, - * thus avoiding the deadlock. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if (!user_mode(regs) && !search_exception_tables(regs->nip)) - goto bad_area_nosemaphore; - - down_read(&mm->mmap_sem); - } - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - - if (vma->vm_start <= address) { - goto good_area; - } - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - - /* - * N.B. The POWER/Open ABI allows programs to access up to - * 288 bytes below the stack pointer. - * The kernel signal delivery code writes up to about 1.5kB - * below the stack pointer (r1) before decrementing it. - * The exec code can write slightly over 640kB to the stack - * before setting the user r1. Thus we allow the stack to - * expand to 1MB without further checks. - */ - if (address + 0x100000 < vma->vm_end) { - /* get user regs even if this fault is in kernel mode */ - struct pt_regs *uregs = current->thread.regs; - if (uregs == NULL) - goto bad_area; - - /* - * A user-mode access to an address a long way below - * the stack pointer is only valid if the instruction - * is one which would update the stack pointer to the - * address accessed if the instruction completed, - * i.e. either stwu rs,n(r1) or stwux rs,r1,rb - * (or the byte, halfword, float or double forms). - * - * If we don't check this then any write to the area - * between the last mapped region and the stack will - * expand the stack rather than segfaulting. 
- */ - if (address + 2048 < uregs->gpr[1] - && (!user_mode(regs) || !store_updates_sp(regs))) - goto bad_area; - } - - if (expand_stack(vma, address)) - goto bad_area; - -good_area: - code = SEGV_ACCERR; - - if (is_exec) { - /* protection fault */ - if (error_code & DSISR_PROTFAULT) - goto bad_area; - if (!(vma->vm_flags & VM_EXEC)) - goto bad_area; - /* a write */ - } else if (is_write) { - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - /* a read */ - } else { - if (!(vma->vm_flags & VM_READ)) - goto bad_area; - } - - survive: - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - switch (handle_mm_fault(mm, vma, address, is_write)) { - - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: - goto out_of_memory; - default: - BUG(); - } - - up_read(&mm->mmap_sem); - return 0; - -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses cause a SIGSEGV */ - if (user_mode(regs)) { - info.si_signo = SIGSEGV; - info.si_errno = 0; - info.si_code = code; - info.si_addr = (void __user *) address; - force_sig_info(SIGSEGV, &info, current); - return 0; - } - - if (trap == 0x400 && (error_code & DSISR_PROTFAULT) - && printk_ratelimit()) - printk(KERN_CRIT "kernel tried to execute NX-protected" - " page (%lx) - exploit attempt? (uid: %d)\n", - address, current->uid); - - return SIGSEGV; - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. 
- */ -out_of_memory: - up_read(&mm->mmap_sem); - if (current->pid == 1) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", current->comm); - if (user_mode(regs)) - do_exit(SIGKILL); - return SIGKILL; - -do_sigbus: - up_read(&mm->mmap_sem); - if (user_mode(regs)) { - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, current); - return 0; - } - return SIGBUS; -} - -/* - * bad_page_fault is called when we have a bad access from the kernel. - * It is called from do_page_fault above and from some of the procedures - * in traps.c. - */ -void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) -{ - const struct exception_table_entry *entry; - - /* Are we prepared to handle this fault? */ - if ((entry = search_exception_tables(regs->nip)) != NULL) { - regs->nip = entry->fixup; - return; - } - - /* kernel has accessed a bad area */ - die("Kernel access of bad area", regs, sig); -} diff --git a/arch/ppc64/mm/hash_low.S b/arch/ppc64/mm/hash_low.S deleted file mode 100644 index ee5a5d36bfa8..000000000000 --- a/arch/ppc64/mm/hash_low.S +++ /dev/null @@ -1,288 +0,0 @@ -/* - * ppc64 MMU hashtable management routines - * - * (c) Copyright IBM Corp. 2003 - * - * Maintained by: Benjamin Herrenschmidt - * - * - * This file is covered by the GNU Public Licence v2 as - * described in the kernel's COPYING file. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - - .text - -/* - * Stackframe: - * - * +-> Back chain (SP + 256) - * | General register save area (SP + 112) - * | Parameter save area (SP + 48) - * | TOC save area (SP + 40) - * | link editor doubleword (SP + 32) - * | compiler doubleword (SP + 24) - * | LR save area (SP + 16) - * | CR save area (SP + 8) - * SP ---> +-- Back chain (SP + 0) - */ -#define STACKFRAMESIZE 256 - -/* Save parameters offsets */ -#define STK_PARM(i) (STACKFRAMESIZE + 48 + ((i)-3)*8) - -/* Save non-volatile offsets */ -#define STK_REG(i) (112 + ((i)-14)*8) - -/* - * _hash_page(unsigned long ea, unsigned long access, unsigned long vsid, - * pte_t *ptep, unsigned long trap, int local) - * - * Adds a page to the hash table. This is the non-LPAR version for now - */ - -_GLOBAL(__hash_page) - mflr r0 - std r0,16(r1) - stdu r1,-STACKFRAMESIZE(r1) - /* Save all params that we need after a function call */ - std r6,STK_PARM(r6)(r1) - std r8,STK_PARM(r8)(r1) - - /* Add _PAGE_PRESENT to access */ - ori r4,r4,_PAGE_PRESENT - - /* Save non-volatile registers. - * r31 will hold "old PTE" - * r30 is "new PTE" - * r29 is "va" - * r28 is a hash value - * r27 is hashtab mask (maybe dynamic patched instead ?) - */ - std r27,STK_REG(r27)(r1) - std r28,STK_REG(r28)(r1) - std r29,STK_REG(r29)(r1) - std r30,STK_REG(r30)(r1) - std r31,STK_REG(r31)(r1) - - /* Step 1: - * - * Check permissions, atomically mark the linux PTE busy - * and hashed. - */ -1: - ldarx r31,0,r6 - /* Check access rights (access & ~(pte_val(*ptep))) */ - andc. r0,r4,r31 - bne- htab_wrong_access - /* Check if PTE is busy */ - andi. r0,r31,_PAGE_BUSY - /* If so, just bail out and refault if needed. Someone else - * is changing this PTE anyway and might hash it. 
- */ - bne- bail_ok - /* Prepare new PTE value (turn access RW into DIRTY, then - * add BUSY,HASHPTE and ACCESSED) - */ - rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ - or r30,r30,r31 - ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE - /* Write the linux PTE atomically (setting busy) */ - stdcx. r30,0,r6 - bne- 1b - isync - - /* Step 2: - * - * Insert/Update the HPTE in the hash table. At this point, - * r4 (access) is re-useable, we use it for the new HPTE flags - */ - - /* Calc va and put it in r29 */ - rldicr r29,r5,28,63-28 - rldicl r3,r3,0,36 - or r29,r3,r29 - - /* Calculate hash value for primary slot and store it in r28 */ - rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ - rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */ - xor r28,r5,r0 - - /* Convert linux PTE bits into HW equivalents */ - andi. r3,r30,0x1fe /* Get basic set of flags */ - xori r3,r3,HW_NO_EXEC /* _PAGE_EXEC -> NOEXEC */ - rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ - rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ - and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */ - andc r0,r30,r0 /* r0 = pte & ~r0 */ - rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ - - /* We eventually do the icache sync here (maybe inline that - * code rather than call a C function...) - */ -BEGIN_FTR_SECTION - mr r4,r30 - mr r5,r7 - bl .hash_page_do_lazy_icache -END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) - - /* At this point, r3 contains new PP bits, save them in - * place of "access" in the param area (sic) - */ - std r3,STK_PARM(r4)(r1) - - /* Get htab_hash_mask */ - ld r4,htab_hash_mask@got(2) - ld r27,0(r4) /* htab_hash_mask -> r27 */ - - /* Check if we may already be in the hashtable, in this case, we - * go to out-of-line code to try to modify the HPTE - */ - andi. 
r0,r31,_PAGE_HASHPTE - bne htab_modify_pte - -htab_insert_pte: - /* Clear hpte bits in new pte (we also clear BUSY btw) and - * add _PAGE_HASHPTE - */ - lis r0,_PAGE_HPTEFLAGS@h - ori r0,r0,_PAGE_HPTEFLAGS@l - andc r30,r30,r0 - ori r30,r30,_PAGE_HASHPTE - - /* page number in r5 */ - rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT - - /* Calculate primary group hash */ - and r0,r28,r27 - rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ - - /* Call ppc_md.hpte_insert */ - ld r7,STK_PARM(r4)(r1) /* Retreive new pp bits */ - mr r4,r29 /* Retreive va */ - li r6,0 /* no vflags */ -_GLOBAL(htab_call_hpte_insert1) - bl . /* Will be patched by htab_finish_init() */ - cmpdi 0,r3,0 - bge htab_pte_insert_ok /* Insertion successful */ - cmpdi 0,r3,-2 /* Critical failure */ - beq- htab_pte_insert_failure - - /* Now try secondary slot */ - - /* page number in r5 */ - rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT - - /* Calculate secondary group hash */ - andc r0,r27,r28 - rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ - - /* Call ppc_md.hpte_insert */ - ld r7,STK_PARM(r4)(r1) /* Retreive new pp bits */ - mr r4,r29 /* Retreive va */ - li r6,HPTE_V_SECONDARY@l /* secondary slot */ -_GLOBAL(htab_call_hpte_insert2) - bl . /* Will be patched by htab_finish_init() */ - cmpdi 0,r3,0 - bge+ htab_pte_insert_ok /* Insertion successful */ - cmpdi 0,r3,-2 /* Critical failure */ - beq- htab_pte_insert_failure - - /* Both are full, we need to evict something */ - mftb r0 - /* Pick a random group based on TB */ - andi. r0,r0,1 - mr r5,r28 - bne 2f - not r5,r5 -2: and r0,r5,r27 - rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ - /* Call ppc_md.hpte_remove */ -_GLOBAL(htab_call_hpte_remove) - bl . /* Will be patched by htab_finish_init() */ - - /* Try all again */ - b htab_insert_pte - -bail_ok: - li r3,0 - b bail - -htab_pte_insert_ok: - /* Insert slot number & secondary bit in PTE */ - rldimi r30,r3,12,63-15 - - /* Write out the PTE with a normal write - * (maybe add eieio may be good still ?) 
- */ -htab_write_out_pte: - ld r6,STK_PARM(r6)(r1) - std r30,0(r6) - li r3, 0 -bail: - ld r27,STK_REG(r27)(r1) - ld r28,STK_REG(r28)(r1) - ld r29,STK_REG(r29)(r1) - ld r30,STK_REG(r30)(r1) - ld r31,STK_REG(r31)(r1) - addi r1,r1,STACKFRAMESIZE - ld r0,16(r1) - mtlr r0 - blr - -htab_modify_pte: - /* Keep PP bits in r4 and slot idx from the PTE around in r3 */ - mr r4,r3 - rlwinm r3,r31,32-12,29,31 - - /* Secondary group ? if yes, get a inverted hash value */ - mr r5,r28 - andi. r0,r31,_PAGE_SECONDARY - beq 1f - not r5,r5 -1: - /* Calculate proper slot value for ppc_md.hpte_updatepp */ - and r0,r5,r27 - rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */ - add r3,r0,r3 /* add slot idx */ - - /* Call ppc_md.hpte_updatepp */ - mr r5,r29 /* va */ - li r6,0 /* large is 0 */ - ld r7,STK_PARM(r8)(r1) /* get "local" param */ -_GLOBAL(htab_call_hpte_updatepp) - bl . /* Will be patched by htab_finish_init() */ - - /* if we failed because typically the HPTE wasn't really here - * we try an insertion. - */ - cmpdi 0,r3,-1 - beq- htab_insert_pte - - /* Clear the BUSY bit and Write out the PTE */ - li r0,_PAGE_BUSY - andc r30,r30,r0 - b htab_write_out_pte - -htab_wrong_access: - /* Bail out clearing reservation */ - stdcx. r31,0,r6 - li r3,1 - b bail - -htab_pte_insert_failure: - /* Bail out restoring old PTE */ - ld r6,STK_PARM(r6)(r1) - std r31,0(r6) - li r3,-1 - b bail - - diff --git a/arch/ppc64/mm/hash_native.c b/arch/ppc64/mm/hash_native.c deleted file mode 100644 index 174d14576c28..000000000000 --- a/arch/ppc64/mm/hash_native.c +++ /dev/null @@ -1,446 +0,0 @@ -/* - * native hashtable management. - * - * SMP scalability work: - * Copyright (C) 2001 Anton Blanchard , IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#define HPTE_LOCK_BIT 3 - -static DEFINE_SPINLOCK(native_tlbie_lock); - -static inline void native_lock_hpte(hpte_t *hptep) -{ - unsigned long *word = &hptep->v; - - while (1) { - if (!test_and_set_bit(HPTE_LOCK_BIT, word)) - break; - while(test_bit(HPTE_LOCK_BIT, word)) - cpu_relax(); - } -} - -static inline void native_unlock_hpte(hpte_t *hptep) -{ - unsigned long *word = &hptep->v; - - asm volatile("lwsync":::"memory"); - clear_bit(HPTE_LOCK_BIT, word); -} - -long native_hpte_insert(unsigned long hpte_group, unsigned long va, - unsigned long prpn, unsigned long vflags, - unsigned long rflags) -{ - hpte_t *hptep = htab_address + hpte_group; - unsigned long hpte_v, hpte_r; - int i; - - for (i = 0; i < HPTES_PER_GROUP; i++) { - if (! (hptep->v & HPTE_V_VALID)) { - /* retry with lock held */ - native_lock_hpte(hptep); - if (! (hptep->v & HPTE_V_VALID)) - break; - native_unlock_hpte(hptep); - } - - hptep++; - } - - if (i == HPTES_PER_GROUP) - return -1; - - hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID; - if (vflags & HPTE_V_LARGE) - va &= ~(1UL << HPTE_V_AVPN_SHIFT); - hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags; - - hptep->r = hpte_r; - /* Guarantee the second dword is visible before the valid bit */ - __asm__ __volatile__ ("eieio" : : : "memory"); - /* - * Now set the first dword including the valid bit - * NOTE: this also unlocks the hpte - */ - hptep->v = hpte_v; - - __asm__ __volatile__ ("ptesync" : : : "memory"); - - return i | (!!(vflags & HPTE_V_SECONDARY) << 3); -} - -static long native_hpte_remove(unsigned long hpte_group) -{ - hpte_t *hptep; - int i; - int slot_offset; - unsigned long hpte_v; - - /* pick a random entry to start at */ - slot_offset = mftb() & 0x7; - - for (i = 0; i < HPTES_PER_GROUP; i++) { - hptep = htab_address + hpte_group + slot_offset; - hpte_v = hptep->v; - - if ((hpte_v & HPTE_V_VALID) 
&& !(hpte_v & HPTE_V_BOLTED)) { - /* retry with lock held */ - native_lock_hpte(hptep); - hpte_v = hptep->v; - if ((hpte_v & HPTE_V_VALID) - && !(hpte_v & HPTE_V_BOLTED)) - break; - native_unlock_hpte(hptep); - } - - slot_offset++; - slot_offset &= 0x7; - } - - if (i == HPTES_PER_GROUP) - return -1; - - /* Invalidate the hpte. NOTE: this also unlocks it */ - hptep->v = 0; - - return i; -} - -static inline void set_pp_bit(unsigned long pp, hpte_t *addr) -{ - unsigned long old; - unsigned long *p = &addr->r; - - __asm__ __volatile__( - "1: ldarx %0,0,%3\n\ - rldimi %0,%2,0,61\n\ - stdcx. %0,0,%3\n\ - bne 1b" - : "=&r" (old), "=m" (*p) - : "r" (pp), "r" (p), "m" (*p) - : "cc"); -} - -/* - * Only works on small pages. Yes its ugly to have to check each slot in - * the group but we only use this during bootup. - */ -static long native_hpte_find(unsigned long vpn) -{ - hpte_t *hptep; - unsigned long hash; - unsigned long i, j; - long slot; - unsigned long hpte_v; - - hash = hpt_hash(vpn, 0); - - for (j = 0; j < 2; j++) { - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - for (i = 0; i < HPTES_PER_GROUP; i++) { - hptep = htab_address + slot; - hpte_v = hptep->v; - - if ((HPTE_V_AVPN_VAL(hpte_v) == (vpn >> 11)) - && (hpte_v & HPTE_V_VALID) - && ( !!(hpte_v & HPTE_V_SECONDARY) == j)) { - /* HPTE matches */ - if (j) - slot = -slot; - return slot; - } - ++slot; - } - hash = ~hash; - } - - return -1; -} - -static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long va, int large, int local) -{ - hpte_t *hptep = htab_address + slot; - unsigned long hpte_v; - unsigned long avpn = va >> 23; - int ret = 0; - - if (large) - avpn &= ~1; - - native_lock_hpte(hptep); - - hpte_v = hptep->v; - - /* Even if we miss, we need to invalidate the TLB */ - if ((HPTE_V_AVPN_VAL(hpte_v) != avpn) - || !(hpte_v & HPTE_V_VALID)) { - native_unlock_hpte(hptep); - ret = -1; - } else { - set_pp_bit(newpp, hptep); - native_unlock_hpte(hptep); - } - - /* Ensure it is out 
of the tlb too */ - if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) { - tlbiel(va); - } else { - int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); - - if (lock_tlbie) - spin_lock(&native_tlbie_lock); - tlbie(va, large); - if (lock_tlbie) - spin_unlock(&native_tlbie_lock); - } - - return ret; -} - -/* - * Update the page protection bits. Intended to be used to create - * guard pages for kernel data structures on pages which are bolted - * in the HPT. Assumes pages being operated on will not be stolen. - * Does not work on large pages. - * - * No need to lock here because we should be the only user. - */ -static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea) -{ - unsigned long vsid, va, vpn, flags = 0; - long slot; - hpte_t *hptep; - int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); - - vsid = get_kernel_vsid(ea); - va = (vsid << 28) | (ea & 0x0fffffff); - vpn = va >> PAGE_SHIFT; - - slot = native_hpte_find(vpn); - if (slot == -1) - panic("could not find page to bolt\n"); - hptep = htab_address + slot; - - set_pp_bit(newpp, hptep); - - /* Ensure it is out of the tlb too */ - if (lock_tlbie) - spin_lock_irqsave(&native_tlbie_lock, flags); - tlbie(va, 0); - if (lock_tlbie) - spin_unlock_irqrestore(&native_tlbie_lock, flags); -} - -static void native_hpte_invalidate(unsigned long slot, unsigned long va, - int large, int local) -{ - hpte_t *hptep = htab_address + slot; - unsigned long hpte_v; - unsigned long avpn = va >> 23; - unsigned long flags; - int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); - - if (large) - avpn &= ~1; - - local_irq_save(flags); - native_lock_hpte(hptep); - - hpte_v = hptep->v; - - /* Even if we miss, we need to invalidate the TLB */ - if ((HPTE_V_AVPN_VAL(hpte_v) != avpn) - || !(hpte_v & HPTE_V_VALID)) { - native_unlock_hpte(hptep); - } else { - /* Invalidate the hpte. 
NOTE: this also unlocks it */ - hptep->v = 0; - } - - /* Invalidate the tlb */ - if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) { - tlbiel(va); - } else { - if (lock_tlbie) - spin_lock(&native_tlbie_lock); - tlbie(va, large); - if (lock_tlbie) - spin_unlock(&native_tlbie_lock); - } - local_irq_restore(flags); -} - -/* - * clear all mappings on kexec. All cpus are in real mode (or they will - * be when they isi), and we are the only one left. We rely on our kernel - * mapping being 0xC0's and the hardware ignoring those two real bits. - * - * TODO: add batching support when enabled. remember, no dynamic memory here, - * athough there is the control page available... - */ -static void native_hpte_clear(void) -{ - unsigned long slot, slots, flags; - hpte_t *hptep = htab_address; - unsigned long hpte_v; - unsigned long pteg_count; - - pteg_count = htab_hash_mask + 1; - - local_irq_save(flags); - - /* we take the tlbie lock and hold it. Some hardware will - * deadlock if we try to tlbie from two processors at once. - */ - spin_lock(&native_tlbie_lock); - - slots = pteg_count * HPTES_PER_GROUP; - - for (slot = 0; slot < slots; slot++, hptep++) { - /* - * we could lock the pte here, but we are the only cpu - * running, right? and for crash dump, we probably - * don't want to wait for a maybe bad cpu. 
- */ - hpte_v = hptep->v; - - if (hpte_v & HPTE_V_VALID) { - hptep->v = 0; - tlbie(slot2va(hpte_v, slot), hpte_v & HPTE_V_LARGE); - } - } - - spin_unlock(&native_tlbie_lock); - local_irq_restore(flags); -} - -static void native_flush_hash_range(unsigned long number, int local) -{ - unsigned long va, vpn, hash, secondary, slot, flags, avpn; - int i, j; - hpte_t *hptep; - unsigned long hpte_v; - struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); - unsigned long large = batch->large; - - local_irq_save(flags); - - j = 0; - for (i = 0; i < number; i++) { - va = batch->vaddr[j]; - if (large) - vpn = va >> HPAGE_SHIFT; - else - vpn = va >> PAGE_SHIFT; - hash = hpt_hash(vpn, large); - secondary = (pte_val(batch->pte[i]) & _PAGE_SECONDARY) >> 15; - if (secondary) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (pte_val(batch->pte[i]) & _PAGE_GROUP_IX) >> 12; - - hptep = htab_address + slot; - - avpn = va >> 23; - if (large) - avpn &= ~0x1UL; - - native_lock_hpte(hptep); - - hpte_v = hptep->v; - - /* Even if we miss, we need to invalidate the TLB */ - if ((HPTE_V_AVPN_VAL(hpte_v) != avpn) - || !(hpte_v & HPTE_V_VALID)) { - native_unlock_hpte(hptep); - } else { - /* Invalidate the hpte. 
NOTE: this also unlocks it */ - hptep->v = 0; - } - - j++; - } - - if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) { - asm volatile("ptesync":::"memory"); - - for (i = 0; i < j; i++) - __tlbiel(batch->vaddr[i]); - - asm volatile("ptesync":::"memory"); - } else { - int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); - - if (lock_tlbie) - spin_lock(&native_tlbie_lock); - - asm volatile("ptesync":::"memory"); - - for (i = 0; i < j; i++) - __tlbie(batch->vaddr[i], large); - - asm volatile("eieio; tlbsync; ptesync":::"memory"); - - if (lock_tlbie) - spin_unlock(&native_tlbie_lock); - } - - local_irq_restore(flags); -} - -#ifdef CONFIG_PPC_PSERIES -/* Disable TLB batching on nighthawk */ -static inline int tlb_batching_enabled(void) -{ - struct device_node *root = of_find_node_by_path("/"); - int enabled = 1; - - if (root) { - const char *model = get_property(root, "model", NULL); - if (model && !strcmp(model, "IBM,9076-N81")) - enabled = 0; - of_node_put(root); - } - - return enabled; -} -#else -static inline int tlb_batching_enabled(void) -{ - return 1; -} -#endif - -void hpte_init_native(void) -{ - ppc_md.hpte_invalidate = native_hpte_invalidate; - ppc_md.hpte_updatepp = native_hpte_updatepp; - ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp; - ppc_md.hpte_insert = native_hpte_insert; - ppc_md.hpte_remove = native_hpte_remove; - ppc_md.hpte_clear_all = native_hpte_clear; - if (tlb_batching_enabled()) - ppc_md.flush_hash_range = native_flush_hash_range; - htab_finish_init(); -} diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c deleted file mode 100644 index 83507438d6a0..000000000000 --- a/arch/ppc64/mm/hash_utils.c +++ /dev/null @@ -1,438 +0,0 @@ -/* - * PowerPC64 port by Mike Corrigan and Dave Engebretsen - * {mikejc|engebret}@us.ibm.com - * - * Copyright (c) 2000 Mike Corrigan - * - * SMP scalability work: - * Copyright (C) 2001 Anton Blanchard , IBM - * - * Module name: htab.c - * - * Description: - * PowerPC Hashed Page 
Table functions - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#undef DEBUG - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef DEBUG -#define DBG(fmt...) udbg_printf(fmt) -#else -#define DBG(fmt...) -#endif - -/* - * Note: pte --> Linux PTE - * HPTE --> PowerPC Hashed Page Table Entry - * - * Execution context: - * htab_initialize is called with the MMU off (of course), but - * the kernel has been copied down to zero so it can directly - * reference global data. At this point it is very difficult - * to print debug info. 
- * - */ - -#ifdef CONFIG_U3_DART -extern unsigned long dart_tablebase; -#endif /* CONFIG_U3_DART */ - -hpte_t *htab_address; -unsigned long htab_hash_mask; - -extern unsigned long _SDR1; - -#define KB (1024) -#define MB (1024*KB) - -static inline void loop_forever(void) -{ - volatile unsigned long x = 1; - for(;x;x|=1) - ; -} - -static inline void create_pte_mapping(unsigned long start, unsigned long end, - unsigned long mode, int large) -{ - unsigned long addr; - unsigned int step; - unsigned long tmp_mode; - unsigned long vflags; - - if (large) { - step = 16*MB; - vflags = HPTE_V_BOLTED | HPTE_V_LARGE; - } else { - step = 4*KB; - vflags = HPTE_V_BOLTED; - } - - for (addr = start; addr < end; addr += step) { - unsigned long vpn, hash, hpteg; - unsigned long vsid = get_kernel_vsid(addr); - unsigned long va = (vsid << 28) | (addr & 0xfffffff); - int ret = -1; - - if (large) - vpn = va >> HPAGE_SHIFT; - else - vpn = va >> PAGE_SHIFT; - - - tmp_mode = mode; - - /* Make non-kernel text non-executable */ - if (!in_kernel_text(addr)) - tmp_mode = mode | HW_NO_EXEC; - - hash = hpt_hash(vpn, large); - - hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); - -#ifdef CONFIG_PPC_ISERIES - if (systemcfg->platform & PLATFORM_ISERIES_LPAR) - ret = iSeries_hpte_bolt_or_insert(hpteg, va, - virt_to_abs(addr) >> PAGE_SHIFT, - vflags, tmp_mode); - else -#endif -#ifdef CONFIG_PPC_PSERIES - if (systemcfg->platform & PLATFORM_LPAR) - ret = pSeries_lpar_hpte_insert(hpteg, va, - virt_to_abs(addr) >> PAGE_SHIFT, - vflags, tmp_mode); - else -#endif -#ifdef CONFIG_PPC_MULTIPLATFORM - ret = native_hpte_insert(hpteg, va, - virt_to_abs(addr) >> PAGE_SHIFT, - vflags, tmp_mode); -#endif - - if (ret == -1) { - ppc64_terminate_msg(0x20, "create_pte_mapping"); - loop_forever(); - } - } -} - -void __init htab_initialize(void) -{ - unsigned long table, htab_size_bytes; - unsigned long pteg_count; - unsigned long mode_rw; - int i, use_largepages = 0; - unsigned long base = 0, size = 0; - extern 
unsigned long tce_alloc_start, tce_alloc_end; - - DBG(" -> htab_initialize()\n"); - - /* - * Calculate the required size of the htab. We want the number of - * PTEGs to equal one half the number of real pages. - */ - htab_size_bytes = 1UL << ppc64_pft_size; - pteg_count = htab_size_bytes >> 7; - - /* For debug, make the HTAB 1/8 as big as it normally would be. */ - ifppcdebug(PPCDBG_HTABSIZE) { - pteg_count >>= 3; - htab_size_bytes = pteg_count << 7; - } - - htab_hash_mask = pteg_count - 1; - - if (systemcfg->platform & PLATFORM_LPAR) { - /* Using a hypervisor which owns the htab */ - htab_address = NULL; - _SDR1 = 0; - } else { - /* Find storage for the HPT. Must be contiguous in - * the absolute address space. - */ - table = lmb_alloc(htab_size_bytes, htab_size_bytes); - - DBG("Hash table allocated at %lx, size: %lx\n", table, - htab_size_bytes); - - if ( !table ) { - ppc64_terminate_msg(0x20, "hpt space"); - loop_forever(); - } - htab_address = abs_to_virt(table); - - /* htab absolute addr + encoded htabsize */ - _SDR1 = table + __ilog2(pteg_count) - 11; - - /* Initialize the HPT with no entries */ - memset((void *)table, 0, htab_size_bytes); - } - - mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX; - - /* On U3 based machines, we need to reserve the DART area and - * _NOT_ map it to avoid cache paradoxes as it's remapped non - * cacheable later on - */ - if (cpu_has_feature(CPU_FTR_16M_PAGE)) - use_largepages = 1; - - /* create bolted the linear mapping in the hash table */ - for (i=0; i < lmb.memory.cnt; i++) { - base = lmb.memory.region[i].base + KERNELBASE; - size = lmb.memory.region[i].size; - - DBG("creating mapping for region: %lx : %lx\n", base, size); - -#ifdef CONFIG_U3_DART - /* Do not map the DART space. Fortunately, it will be aligned - * in such a way that it will not cross two lmb regions and will - * fit within a single 16Mb page. - * The DART space is assumed to be a full 16Mb region even if we - * only use 2Mb of that space. 
We will use more of it later for - * AGP GART. We have to use a full 16Mb large page. - */ - DBG("DART base: %lx\n", dart_tablebase); - - if (dart_tablebase != 0 && dart_tablebase >= base - && dart_tablebase < (base + size)) { - if (base != dart_tablebase) - create_pte_mapping(base, dart_tablebase, mode_rw, - use_largepages); - if ((base + size) > (dart_tablebase + 16*MB)) - create_pte_mapping(dart_tablebase + 16*MB, base + size, - mode_rw, use_largepages); - continue; - } -#endif /* CONFIG_U3_DART */ - create_pte_mapping(base, base + size, mode_rw, use_largepages); - } - - /* - * If we have a memory_limit and we've allocated TCEs then we need to - * explicitly map the TCE area at the top of RAM. We also cope with the - * case that the TCEs start below memory_limit. - * tce_alloc_start/end are 16MB aligned so the mapping should work - * for either 4K or 16MB pages. - */ - if (tce_alloc_start) { - tce_alloc_start += KERNELBASE; - tce_alloc_end += KERNELBASE; - - if (base + size >= tce_alloc_start) - tce_alloc_start = base + size + 1; - - create_pte_mapping(tce_alloc_start, tce_alloc_end, - mode_rw, use_largepages); - } - - DBG(" <- htab_initialize()\n"); -} -#undef KB -#undef MB - -/* - * Called by asm hashtable.S for doing lazy icache flush - */ -unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) -{ - struct page *page; - - if (!pfn_valid(pte_pfn(pte))) - return pp; - - page = pte_page(pte); - - /* page is dirty */ - if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { - if (trap == 0x400) { - __flush_dcache_icache(page_address(page)); - set_bit(PG_arch_1, &page->flags); - } else - pp |= HW_NO_EXEC; - } - return pp; -} - -/* Result code is: - * 0 - handled - * 1 - normal page fault - * -1 - critical hash insertion error - */ -int hash_page(unsigned long ea, unsigned long access, unsigned long trap) -{ - void *pgdir; - unsigned long vsid; - struct mm_struct *mm; - pte_t *ptep; - int ret; - int user_region = 0; - int local = 0; - 
cpumask_t tmp; - - if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) - return 1; - - switch (REGION_ID(ea)) { - case USER_REGION_ID: - user_region = 1; - mm = current->mm; - if (! mm) - return 1; - - vsid = get_vsid(mm->context.id, ea); - break; - case VMALLOC_REGION_ID: - mm = &init_mm; - vsid = get_kernel_vsid(ea); - break; -#if 0 - case KERNEL_REGION_ID: - /* - * Should never get here - entire 0xC0... region is bolted. - * Send the problem up to do_page_fault - */ -#endif - default: - /* Not a valid range - * Send the problem up to do_page_fault - */ - return 1; - break; - } - - pgdir = mm->pgd; - - if (pgdir == NULL) - return 1; - - tmp = cpumask_of_cpu(smp_processor_id()); - if (user_region && cpus_equal(mm->cpu_vm_mask, tmp)) - local = 1; - - /* Is this a huge page ? */ - if (unlikely(in_hugepage_area(mm->context, ea))) - ret = hash_huge_page(mm, access, ea, vsid, local); - else { - ptep = find_linux_pte(pgdir, ea); - if (ptep == NULL) - return 1; - ret = __hash_page(ea, access, vsid, ptep, trap, local); - } - - return ret; -} - -void flush_hash_page(unsigned long va, pte_t pte, int local) -{ - unsigned long vpn, hash, secondary, slot; - unsigned long huge = pte_huge(pte); - - if (huge) - vpn = va >> HPAGE_SHIFT; - else - vpn = va >> PAGE_SHIFT; - hash = hpt_hash(vpn, huge); - secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15; - if (secondary) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12; - - ppc_md.hpte_invalidate(slot, va, huge, local); -} - -void flush_hash_range(unsigned long number, int local) -{ - if (ppc_md.flush_hash_range) { - ppc_md.flush_hash_range(number, local); - } else { - int i; - struct ppc64_tlb_batch *batch = - &__get_cpu_var(ppc64_tlb_batch); - - for (i = 0; i < number; i++) - flush_hash_page(batch->vaddr[i], batch->pte[i], local); - } -} - -static inline void make_bl(unsigned int *insn_addr, void *func) -{ - unsigned long funcp = *((unsigned long *)func); - int offset = funcp 
- (unsigned long)insn_addr; - - *insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc)); - flush_icache_range((unsigned long)insn_addr, 4+ - (unsigned long)insn_addr); -} - -/* - * low_hash_fault is called when we the low level hash code failed - * to instert a PTE due to an hypervisor error - */ -void low_hash_fault(struct pt_regs *regs, unsigned long address) -{ - if (user_mode(regs)) { - siginfo_t info; - - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, current); - return; - } - bad_page_fault(regs, address, SIGBUS); -} - -void __init htab_finish_init(void) -{ - extern unsigned int *htab_call_hpte_insert1; - extern unsigned int *htab_call_hpte_insert2; - extern unsigned int *htab_call_hpte_remove; - extern unsigned int *htab_call_hpte_updatepp; - - make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert); - make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert); - make_bl(htab_call_hpte_remove, ppc_md.hpte_remove); - make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp); -} diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c deleted file mode 100644 index 0ea0994ed974..000000000000 --- a/arch/ppc64/mm/hugetlbpage.c +++ /dev/null @@ -1,745 +0,0 @@ -/* - * PPC64 (POWER4) Huge TLB Page Support for Kernel. - * - * Copyright (C) 2003 David Gibson, IBM Corporation. - * - * Based on the IA-32 version: - * Copyright (C) 2002, Rohit Seth - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT) -#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT) - -/* Modelled after find_linux_pte() */ -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pg; - pud_t *pu; - pmd_t *pm; - pte_t *pt; - - BUG_ON(! 
in_hugepage_area(mm->context, addr)); - - addr &= HPAGE_MASK; - - pg = pgd_offset(mm, addr); - if (!pgd_none(*pg)) { - pu = pud_offset(pg, addr); - if (!pud_none(*pu)) { - pm = pmd_offset(pu, addr); - pt = (pte_t *)pm; - BUG_ON(!pmd_none(*pm) - && !(pte_present(*pt) && pte_huge(*pt))); - return pt; - } - } - - return NULL; -} - -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pg; - pud_t *pu; - pmd_t *pm; - pte_t *pt; - - BUG_ON(! in_hugepage_area(mm->context, addr)); - - addr &= HPAGE_MASK; - - pg = pgd_offset(mm, addr); - pu = pud_alloc(mm, pg, addr); - - if (pu) { - pm = pmd_alloc(mm, pu, addr); - if (pm) { - pt = (pte_t *)pm; - BUG_ON(!pmd_none(*pm) - && !(pte_present(*pt) && pte_huge(*pt))); - return pt; - } - } - - return NULL; -} - -#define HUGEPTE_BATCH_SIZE (HPAGE_SIZE / PMD_SIZE) - -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - int i; - - if (pte_present(*ptep)) { - pte_clear(mm, addr, ptep); - flush_tlb_pending(); - } - - for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) { - *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); - ptep++; - } -} - -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - unsigned long old = pte_update(ptep, ~0UL); - int i; - - if (old & _PAGE_HASHPTE) - hpte_update(mm, addr, old, 0); - - for (i = 1; i < HUGEPTE_BATCH_SIZE; i++) - ptep[i] = __pte(0); - - return __pte(old); -} - -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - if (! 
(within_hugepage_low_range(addr, len) - || within_hugepage_high_range(addr, len)) ) - return -EINVAL; - return 0; -} - -static void flush_low_segments(void *parm) -{ - u16 areas = (unsigned long) parm; - unsigned long i; - - asm volatile("isync" : : : "memory"); - - BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS); - - for (i = 0; i < NUM_LOW_AREAS; i++) { - if (! (areas & (1U << i))) - continue; - asm volatile("slbie %0" - : : "r" ((i << SID_SHIFT) | SLBIE_C)); - } - - asm volatile("isync" : : : "memory"); -} - -static void flush_high_segments(void *parm) -{ - u16 areas = (unsigned long) parm; - unsigned long i, j; - - asm volatile("isync" : : : "memory"); - - BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS); - - for (i = 0; i < NUM_HIGH_AREAS; i++) { - if (! (areas & (1U << i))) - continue; - for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++) - asm volatile("slbie %0" - :: "r" (((i << HTLB_AREA_SHIFT) - + (j << SID_SHIFT)) | SLBIE_C)); - } - - asm volatile("isync" : : : "memory"); -} - -static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area) -{ - unsigned long start = area << SID_SHIFT; - unsigned long end = (area+1) << SID_SHIFT; - struct vm_area_struct *vma; - - BUG_ON(area >= NUM_LOW_AREAS); - - /* Check no VMAs are in the region */ - vma = find_vma(mm, start); - if (vma && (vma->vm_start < end)) - return -EBUSY; - - return 0; -} - -static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area) -{ - unsigned long start = area << HTLB_AREA_SHIFT; - unsigned long end = (area+1) << HTLB_AREA_SHIFT; - struct vm_area_struct *vma; - - BUG_ON(area >= NUM_HIGH_AREAS); - - /* Check no VMAs are in the region */ - vma = find_vma(mm, start); - if (vma && (vma->vm_start < end)) - return -EBUSY; - - return 0; -} - -static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas) -{ - unsigned long i; - - BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS); - BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != 
NUM_LOW_AREAS); - - newareas &= ~(mm->context.low_htlb_areas); - if (! newareas) - return 0; /* The segments we want are already open */ - - for (i = 0; i < NUM_LOW_AREAS; i++) - if ((1 << i) & newareas) - if (prepare_low_area_for_htlb(mm, i) != 0) - return -EBUSY; - - mm->context.low_htlb_areas |= newareas; - - /* update the paca copy of the context struct */ - get_paca()->context = mm->context; - - /* the context change must make it to memory before the flush, - * so that further SLB misses do the right thing. */ - mb(); - on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1); - - return 0; -} - -static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas) -{ - unsigned long i; - - BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS); - BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8) - != NUM_HIGH_AREAS); - - newareas &= ~(mm->context.high_htlb_areas); - if (! newareas) - return 0; /* The areas we want are already open */ - - for (i = 0; i < NUM_HIGH_AREAS; i++) - if ((1 << i) & newareas) - if (prepare_high_area_for_htlb(mm, i) != 0) - return -EBUSY; - - mm->context.high_htlb_areas |= newareas; - - /* update the paca copy of the context struct */ - get_paca()->context = mm->context; - - /* the context change must make it to memory before the flush, - * so that further SLB misses do the right thing. 
*/ - mb(); - on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1); - - return 0; -} - -int prepare_hugepage_range(unsigned long addr, unsigned long len) -{ - int err; - - if ( (addr+len) < addr ) - return -EINVAL; - - if ((addr + len) < 0x100000000UL) - err = open_low_hpage_areas(current->mm, - LOW_ESID_MASK(addr, len)); - else - err = open_high_hpage_areas(current->mm, - HTLB_AREA_MASK(addr, len)); - if (err) { - printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)" - " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n", - addr, len, - LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len)); - return err; - } - - return 0; -} - -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - pte_t *ptep; - struct page *page; - - if (! in_hugepage_area(mm->context, address)) - return ERR_PTR(-EINVAL); - - ptep = huge_pte_offset(mm, address); - page = pte_page(*ptep); - if (page) - page += (address % HPAGE_SIZE) / PAGE_SIZE; - - return page; -} - -int pmd_huge(pmd_t pmd) -{ - return 0; -} - -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - BUG(); - return NULL; -} - -/* Because we have an exclusive hugepage region which lies within the - * normal user address space, we have to take special measures to make - * non-huge mmap()s evade the hugepage reserved regions. 
*/ -unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long start_addr; - - if (len > TASK_SIZE) - return -ENOMEM; - - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); - if (((TASK_SIZE - len) >= addr) - && (!vma || (addr+len) <= vma->vm_start) - && !is_hugepage_only_range(mm, addr,len)) - return addr; - } - if (len > mm->cached_hole_size) { - start_addr = addr = mm->free_area_cache; - } else { - start_addr = addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - } - -full_search: - vma = find_vma(mm, addr); - while (TASK_SIZE - len >= addr) { - BUG_ON(vma && (addr >= vma->vm_end)); - - if (touches_hugepage_low_range(mm, addr, len)) { - addr = ALIGN(addr+1, 1<<SID_SHIFT); - vma = find_vma(mm, addr); - continue; - } - if (touches_hugepage_high_range(mm, addr, len)) { - addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT); - vma = find_vma(mm, addr); - continue; - } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = vma->vm_end; - vma = vma->vm_next; - } - - /* Make sure we didn't miss any holes */ - if (start_addr != TASK_UNMAPPED_BASE) { - start_addr = addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; -} - -/* - * This mmap-allocator allocates new areas top-down from below the - * stack's low limit (the base): - * - * Because we have an exclusive hugepage region which lies within the - * normal user address space, we have to take special measures to make - * non-huge mmap()s evade the hugepage reserved regions.
- */ -unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) -{ - struct vm_area_struct *vma, *prev_vma; - struct mm_struct *mm = current->mm; - unsigned long base = mm->mmap_base, addr = addr0; - unsigned long largest_hole = mm->cached_hole_size; - int first_time = 1; - - /* requested length too big for entire address space */ - if (len > TASK_SIZE) - return -ENOMEM; - - /* dont allow allocations above current base */ - if (mm->free_area_cache > base) - mm->free_area_cache = base; - - /* requesting a specific address */ - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start) - && !is_hugepage_only_range(mm, addr,len)) - return addr; - } - - if (len <= largest_hole) { - largest_hole = 0; - mm->free_area_cache = base; - } -try_again: - /* make sure it can fit in the remaining address space */ - if (mm->free_area_cache < len) - goto fail; - - /* either no address requested or cant fit in requested address hole */ - addr = (mm->free_area_cache - len) & PAGE_MASK; - do { -hugepage_recheck: - if (touches_hugepage_low_range(mm, addr, len)) { - addr = (addr & ((~0) << SID_SHIFT)) - len; - goto hugepage_recheck; - } else if (touches_hugepage_high_range(mm, addr, len)) { - addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len; - goto hugepage_recheck; - } - - /* - * Lookup failure means no vma is above this address, - * i.e. 
return with success: - */ - if (!(vma = find_vma_prev(mm, addr, &prev_vma))) - return addr; - - /* - * new region fits between prev_vma->vm_end and - * vma->vm_start, use it: - */ - if (addr+len <= vma->vm_start && - (!prev_vma || (addr >= prev_vma->vm_end))) { - /* remember the address as a hint for next time */ - mm->cached_hole_size = largest_hole; - return (mm->free_area_cache = addr); - } else { - /* pull free_area_cache down to the first hole */ - if (mm->free_area_cache == vma->vm_end) { - mm->free_area_cache = vma->vm_start; - mm->cached_hole_size = largest_hole; - } - } - - /* remember the largest hole we saw so far */ - if (addr + largest_hole < vma->vm_start) - largest_hole = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = vma->vm_start-len; - } while (len <= vma->vm_start); - -fail: - /* - * if hint left us with no space for the requested - * mapping then try again: - */ - if (first_time) { - mm->free_area_cache = base; - largest_hole = 0; - first_time = 0; - goto try_again; - } - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; - addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); - /* - * Restore the topdown base: - */ - mm->free_area_cache = base; - mm->cached_hole_size = ~0UL; - - return addr; -} - -static unsigned long htlb_get_low_area(unsigned long len, u16 segmask) -{ - unsigned long addr = 0; - struct vm_area_struct *vma; - - vma = find_vma(current->mm, addr); - while (addr + len <= 0x100000000UL) { - BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ - - if (! 
__within_hugepage_low_range(addr, len, segmask)) { - addr = ALIGN(addr+1, 1<<SID_SHIFT); - vma = find_vma(current->mm, addr); - continue; - } - - if (!vma || (addr + len) <= vma->vm_start) - return addr; - addr = ALIGN(vma->vm_end, HPAGE_SIZE); - /* Depending on segmask this might not be a confirmed - * hugepage region, so the ALIGN could have skipped - * some VMAs */ - vma = find_vma(current->mm, addr); - } - - return -ENOMEM; -} - -static unsigned long htlb_get_high_area(unsigned long len, u16 areamask) -{ - unsigned long addr = 0x100000000UL; - struct vm_area_struct *vma; - - vma = find_vma(current->mm, addr); - while (addr + len <= TASK_SIZE_USER64) { - BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ - - if (! __within_hugepage_high_range(addr, len, areamask)) { - addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT); - vma = find_vma(current->mm, addr); - continue; - } - - if (!vma || (addr + len) <= vma->vm_start) - return addr; - addr = ALIGN(vma->vm_end, HPAGE_SIZE); - /* Depending on segmask this might not be a confirmed - * hugepage region, so the ALIGN could have skipped - * some VMAs */ - vma = find_vma(current->mm, addr); - } - - return -ENOMEM; -} - -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - int lastshift; - u16 areamask, curareas; - - if (len & ~HPAGE_MASK) - return -EINVAL; - - if (!cpu_has_feature(CPU_FTR_16M_PAGE)) - return -EINVAL; - - if (test_thread_flag(TIF_32BIT)) { - curareas = current->mm->context.low_htlb_areas; - - /* First see if we can do the mapping in the existing - * low areas */ - addr = htlb_get_low_area(len, curareas); - if (addr != -ENOMEM) - return addr; - - lastshift = 0; - for (areamask = LOW_ESID_MASK(0x100000000UL-len, len); - !
lastshift; areamask >>=1) { - if (areamask & 1) - lastshift = 1; - - addr = htlb_get_low_area(len, curareas | areamask); - if ((addr != -ENOMEM) - && open_low_hpage_areas(current->mm, areamask) == 0) - return addr; - } - } else { - curareas = current->mm->context.high_htlb_areas; - - /* First see if we can do the mapping in the existing - * high areas */ - addr = htlb_get_high_area(len, curareas); - if (addr != -ENOMEM) - return addr; - - lastshift = 0; - for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len); - ! lastshift; areamask >>=1) { - if (areamask & 1) - lastshift = 1; - - addr = htlb_get_high_area(len, curareas | areamask); - if ((addr != -ENOMEM) - && open_high_hpage_areas(current->mm, areamask) == 0) - return addr; - } - } - printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open" - " enough areas\n"); - return -ENOMEM; -} - -int hash_huge_page(struct mm_struct *mm, unsigned long access, - unsigned long ea, unsigned long vsid, int local) -{ - pte_t *ptep; - unsigned long va, vpn; - pte_t old_pte, new_pte; - unsigned long rflags, prpn; - long slot; - int err = 1; - - spin_lock(&mm->page_table_lock); - - ptep = huge_pte_offset(mm, ea); - - /* Search the Linux page table for a match with va */ - va = (vsid << 28) | (ea & 0x0fffffff); - vpn = va >> HPAGE_SHIFT; - - /* - * If no pte found or not present, send the problem up to - * do_page_fault - */ - if (unlikely(!ptep || pte_none(*ptep))) - goto out; - -/* BUG_ON(pte_bad(*ptep)); */ - - /* - * Check the user's access rights to the page. If access should be - * prevented then send the problem up to do_page_fault. - */ - if (unlikely(access & ~pte_val(*ptep))) - goto out; - /* - * At this point, we have a pte (old_pte) which can be used to build - * or update an HPTE. There are 2 cases: - * - * 1. There is a valid (present) pte with no associated HPTE (this is - * the most common case) - * 2. There is a valid (present) pte with an associated HPTE. 
The - * current values of the pp bits in the HPTE prevent access - * because we are doing software DIRTY bit management and the - * page is currently not DIRTY. - */ - - - old_pte = *ptep; - new_pte = old_pte; - - rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW)); - /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ - rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC); - - /* Check if pte already has an hpte (case 2) */ - if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) { - /* There MIGHT be an HPTE for this pte */ - unsigned long hash, slot; - - hash = hpt_hash(vpn, 1); - if (pte_val(old_pte) & _PAGE_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12; - - if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1) - pte_val(old_pte) &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) { - unsigned long hash = hpt_hash(vpn, 1); - unsigned long hpte_group; - - prpn = pte_pfn(old_pte); - -repeat: - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; - - /* Update the linux pte with the HPTE slot */ - pte_val(new_pte) &= ~_PAGE_HPTEFLAGS; - pte_val(new_pte) |= _PAGE_HASHPTE; - - /* Add in WIMG bits */ - /* XXX We should store these in the pte */ - rflags |= _PAGE_COHERENT; - - slot = ppc_md.hpte_insert(hpte_group, va, prpn, - HPTE_V_LARGE, rflags); - - /* Primary is full, try the secondary */ - if (unlikely(slot == -1)) { - pte_val(new_pte) |= _PAGE_SECONDARY; - hpte_group = ((~hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; - slot = ppc_md.hpte_insert(hpte_group, va, prpn, - HPTE_V_LARGE | - HPTE_V_SECONDARY, - rflags); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP)&~0x7UL; - - ppc_md.hpte_remove(hpte_group); - goto repeat; - } - } - - if (unlikely(slot == -2)) - panic("hash_huge_page: pte_insert failed\n"); - - pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX; - - /* - * No need 
to use ldarx/stdcx here because all who - * might be updating the pte will hold the - * page_table_lock - */ - *ptep = new_pte; - } - - err = 0; - - out: - spin_unlock(&mm->page_table_lock); - - return err; -} diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c deleted file mode 100644 index c65b87b92756..000000000000 --- a/arch/ppc64/mm/imalloc.c +++ /dev/null @@ -1,317 +0,0 @@ -/* - * c 2001 PPC 64 Team, IBM Corp - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include - -static DECLARE_MUTEX(imlist_sem); -struct vm_struct * imlist = NULL; - -static int get_free_im_addr(unsigned long size, unsigned long *im_addr) -{ - unsigned long addr; - struct vm_struct **p, *tmp; - - addr = ioremap_bot; - for (p = &imlist; (tmp = *p) ; p = &tmp->next) { - if (size + addr < (unsigned long) tmp->addr) - break; - if ((unsigned long)tmp->addr >= ioremap_bot) - addr = tmp->size + (unsigned long) tmp->addr; - if (addr >= IMALLOC_END-size) - return 1; - } - *im_addr = addr; - - return 0; -} - -/* Return whether the region described by v_addr and size is a subset - * of the region described by parent - */ -static inline int im_region_is_subset(unsigned long v_addr, unsigned long size, - struct vm_struct *parent) -{ - return (int) (v_addr >= (unsigned long) parent->addr && - v_addr < (unsigned long) parent->addr + parent->size && - size < parent->size); -} - -/* Return whether the region described by v_addr and size is a superset - * of the region described by child - */ -static int im_region_is_superset(unsigned long v_addr, unsigned long size, - struct vm_struct *child) -{ - struct vm_struct parent; - - parent.addr = (void *) v_addr; - parent.size = size; - - return 
im_region_is_subset((unsigned long) child->addr, child->size, - &parent); -} - -/* Return whether the region described by v_addr and size overlaps - * the region described by vm. Overlapping regions meet the - * following conditions: - * 1) The regions share some part of the address space - * 2) The regions aren't identical - * 3) Neither region is a subset of the other - */ -static int im_region_overlaps(unsigned long v_addr, unsigned long size, - struct vm_struct *vm) -{ - if (im_region_is_superset(v_addr, size, vm)) - return 0; - - return (v_addr + size > (unsigned long) vm->addr + vm->size && - v_addr < (unsigned long) vm->addr + vm->size) || - (v_addr < (unsigned long) vm->addr && - v_addr + size > (unsigned long) vm->addr); -} - -/* Determine imalloc status of region described by v_addr and size. - * Can return one of the following: - * IM_REGION_UNUSED - Entire region is unallocated in imalloc space. - * IM_REGION_SUBSET - Region is a subset of a region that is already - * allocated in imalloc space. - * vm will be assigned to a ptr to the parent region. - * IM_REGION_EXISTS - Exact region already allocated in imalloc space. - * vm will be assigned to a ptr to the existing imlist - * member. - * IM_REGION_OVERLAPS - Region overlaps an allocated region in imalloc space. - * IM_REGION_SUPERSET - Region is a superset of a region that is already - * allocated in imalloc space. 
- */ -static int im_region_status(unsigned long v_addr, unsigned long size, - struct vm_struct **vm) -{ - struct vm_struct *tmp; - - for (tmp = imlist; tmp; tmp = tmp->next) - if (v_addr < (unsigned long) tmp->addr + tmp->size) - break; - - if (tmp) { - if (im_region_overlaps(v_addr, size, tmp)) - return IM_REGION_OVERLAP; - - *vm = tmp; - if (im_region_is_subset(v_addr, size, tmp)) { - /* Return with tmp pointing to superset */ - return IM_REGION_SUBSET; - } - if (im_region_is_superset(v_addr, size, tmp)) { - /* Return with tmp pointing to first subset */ - return IM_REGION_SUPERSET; - } - else if (v_addr == (unsigned long) tmp->addr && - size == tmp->size) { - /* Return with tmp pointing to exact region */ - return IM_REGION_EXISTS; - } - } - - *vm = NULL; - return IM_REGION_UNUSED; -} - -static struct vm_struct * split_im_region(unsigned long v_addr, - unsigned long size, struct vm_struct *parent) -{ - struct vm_struct *vm1 = NULL; - struct vm_struct *vm2 = NULL; - struct vm_struct *new_vm = NULL; - - vm1 = (struct vm_struct *) kmalloc(sizeof(*vm1), GFP_KERNEL); - if (vm1 == NULL) { - printk(KERN_ERR "%s() out of memory\n", __FUNCTION__); - return NULL; - } - - if (v_addr == (unsigned long) parent->addr) { - /* Use existing parent vm_struct to represent child, allocate - * new one for the remainder of parent range - */ - vm1->size = parent->size - size; - vm1->addr = (void *) (v_addr + size); - vm1->next = parent->next; - - parent->size = size; - parent->next = vm1; - new_vm = parent; - } else if (v_addr + size == (unsigned long) parent->addr + - parent->size) { - /* Allocate new vm_struct to represent child, use existing - * parent one for remainder of parent range - */ - vm1->size = size; - vm1->addr = (void *) v_addr; - vm1->next = parent->next; - new_vm = vm1; - - parent->size -= size; - parent->next = vm1; - } else { - /* Allocate two new vm_structs for the new child and - * uppermost remainder, and use existing parent one for the - * lower remainder of 
parent range - */ - vm2 = (struct vm_struct *) kmalloc(sizeof(*vm2), GFP_KERNEL); - if (vm2 == NULL) { - printk(KERN_ERR "%s() out of memory\n", __FUNCTION__); - kfree(vm1); - return NULL; - } - - vm1->size = size; - vm1->addr = (void *) v_addr; - vm1->next = vm2; - new_vm = vm1; - - vm2->size = ((unsigned long) parent->addr + parent->size) - - (v_addr + size); - vm2->addr = (void *) v_addr + size; - vm2->next = parent->next; - - parent->size = v_addr - (unsigned long) parent->addr; - parent->next = vm1; - } - - return new_vm; -} - -static struct vm_struct * __add_new_im_area(unsigned long req_addr, - unsigned long size) -{ - struct vm_struct **p, *tmp, *area; - - for (p = &imlist; (tmp = *p) ; p = &tmp->next) { - if (req_addr + size <= (unsigned long)tmp->addr) - break; - } - - area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); - if (!area) - return NULL; - area->flags = 0; - area->addr = (void *)req_addr; - area->size = size; - area->next = *p; - *p = area; - - return area; -} - -static struct vm_struct * __im_get_area(unsigned long req_addr, - unsigned long size, - int criteria) -{ - struct vm_struct *tmp; - int status; - - status = im_region_status(req_addr, size, &tmp); - if ((criteria & status) == 0) { - return NULL; - } - - switch (status) { - case IM_REGION_UNUSED: - tmp = __add_new_im_area(req_addr, size); - break; - case IM_REGION_SUBSET: - tmp = split_im_region(req_addr, size, tmp); - break; - case IM_REGION_EXISTS: - /* Return requested region */ - break; - case IM_REGION_SUPERSET: - /* Return first existing subset of requested region */ - break; - default: - printk(KERN_ERR "%s() unexpected imalloc region status\n", - __FUNCTION__); - tmp = NULL; - } - - return tmp; -} - -struct vm_struct * im_get_free_area(unsigned long size) -{ - struct vm_struct *area; - unsigned long addr; - - down(&imlist_sem); - if (get_free_im_addr(size, &addr)) { - printk(KERN_ERR "%s() cannot obtain addr for size 0x%lx\n", - __FUNCTION__, size); - area = NULL; - 
goto next_im_done; - } - - area = __im_get_area(addr, size, IM_REGION_UNUSED); - if (area == NULL) { - printk(KERN_ERR - "%s() cannot obtain area for addr 0x%lx size 0x%lx\n", - __FUNCTION__, addr, size); - } -next_im_done: - up(&imlist_sem); - return area; -} - -struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size, - int criteria) -{ - struct vm_struct *area; - - down(&imlist_sem); - area = __im_get_area(v_addr, size, criteria); - up(&imlist_sem); - return area; -} - -void im_free(void * addr) -{ - struct vm_struct **p, *tmp; - - if (!addr) - return; - if ((unsigned long) addr & ~PAGE_MASK) { - printk(KERN_ERR "Trying to %s bad address (%p)\n", __FUNCTION__, addr); - return; - } - down(&imlist_sem); - for (p = &imlist ; (tmp = *p) ; p = &tmp->next) { - if (tmp->addr == addr) { - *p = tmp->next; - - /* XXX: do we need the lock? */ - spin_lock(&init_mm.page_table_lock); - unmap_vm_area(tmp); - spin_unlock(&init_mm.page_table_lock); - - kfree(tmp); - up(&imlist_sem); - return; - } - } - up(&imlist_sem); - printk(KERN_ERR "Trying to %s nonexistent area (%p)\n", __FUNCTION__, - addr); -} diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c deleted file mode 100644 index c2157c9c3acb..000000000000 --- a/arch/ppc64/mm/init.c +++ /dev/null @@ -1,870 +0,0 @@ -/* - * PowerPC version - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) - * and Cort Dougan (PReP) (cort@cs.nmt.edu) - * Copyright (C) 1996 Paul Mackerras - * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk). - * - * Derived from "arch/i386/mm/init.c" - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * - * Dave Engebretsen - * Rework for PPC64 port. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if PGTABLE_RANGE > USER_VSID_RANGE -#warning Limited user VSID range means pagetable space is wasted -#endif - -#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) -#warning TASK_SIZE is smaller than it needs to be. -#endif - -int mem_init_done; -unsigned long ioremap_bot = IMALLOC_BASE; -static unsigned long phbs_io_bot = PHBS_IO_BASE; - -extern pgd_t swapper_pg_dir[]; -extern struct task_struct *current_set[NR_CPUS]; - -unsigned long klimit = (unsigned long)_end; - -unsigned long _SDR1=0; -unsigned long _ASR=0; - -/* max amount of RAM to use */ -unsigned long __max_memory; - -/* info on what we think the IO hole is */ -unsigned long io_hole_start; -unsigned long io_hole_size; - -void show_mem(void) -{ - unsigned long total = 0, reserved = 0; - unsigned long shared = 0, cached = 0; - struct page *page; - pg_data_t *pgdat; - unsigned long i; - - printk("Mem-info:\n"); - show_free_areas(); - printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - for_each_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; i++) { - page = pgdat_page_nr(pgdat, i); - total++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } - } - printk("%ld pages 
of RAM\n", total); - printk("%ld reserved pages\n", reserved); - printk("%ld pages shared\n", shared); - printk("%ld pages swap cached\n", cached); -} - -#ifdef CONFIG_PPC_ISERIES - -void __iomem *ioremap(unsigned long addr, unsigned long size) -{ - return (void __iomem *)addr; -} - -extern void __iomem *__ioremap(unsigned long addr, unsigned long size, - unsigned long flags) -{ - return (void __iomem *)addr; -} - -void iounmap(volatile void __iomem *addr) -{ - return; -} - -#else - -/* - * map_io_page currently only called by __ioremap - * map_io_page adds an entry to the ioremap page table - * and adds an entry to the HPT, possibly bolting it - */ -static int map_io_page(unsigned long ea, unsigned long pa, int flags) -{ - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - unsigned long vsid; - - if (mem_init_done) { - spin_lock(&init_mm.page_table_lock); - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - ptep = pte_alloc_kernel(&init_mm, pmdp, ea); - if (!ptep) - return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); - spin_unlock(&init_mm.page_table_lock); - } else { - unsigned long va, vpn, hash, hpteg; - - /* - * If the mm subsystem is not fully up, we cannot create a - * linux page table entry for this mapping. Simply bolt an - * entry in the hardware page table. 
- */ - vsid = get_kernel_vsid(ea); - va = (vsid << 28) | (ea & 0xFFFFFFF); - vpn = va >> PAGE_SHIFT; - - hash = hpt_hash(vpn, 0); - - hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); - - /* Panic if a pte grpup is full */ - if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT, - HPTE_V_BOLTED, - _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX) - == -1) { - panic("map_io_page: could not insert mapping"); - } - } - return 0; -} - - -static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa, - unsigned long ea, unsigned long size, - unsigned long flags) -{ - unsigned long i; - - if ((flags & _PAGE_PRESENT) == 0) - flags |= pgprot_val(PAGE_KERNEL); - - for (i = 0; i < size; i += PAGE_SIZE) - if (map_io_page(ea+i, pa+i, flags)) - return NULL; - - return (void __iomem *) (ea + (addr & ~PAGE_MASK)); -} - - -void __iomem * -ioremap(unsigned long addr, unsigned long size) -{ - return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED); -} - -void __iomem * __ioremap(unsigned long addr, unsigned long size, - unsigned long flags) -{ - unsigned long pa, ea; - void __iomem *ret; - - /* - * Choose an address to map it to. - * Once the imalloc system is running, we use it. - * Before that, we map using addresses going - * up from ioremap_bot. 
imalloc will use - * the addresses from ioremap_bot through - * IMALLOC_END - * - */ - pa = addr & PAGE_MASK; - size = PAGE_ALIGN(addr + size) - pa; - - if (size == 0) - return NULL; - - if (mem_init_done) { - struct vm_struct *area; - area = im_get_free_area(size); - if (area == NULL) - return NULL; - ea = (unsigned long)(area->addr); - ret = __ioremap_com(addr, pa, ea, size, flags); - if (!ret) - im_free(area->addr); - } else { - ea = ioremap_bot; - ret = __ioremap_com(addr, pa, ea, size, flags); - if (ret) - ioremap_bot += size; - } - return ret; -} - -#define IS_PAGE_ALIGNED(_val) ((_val) == ((_val) & PAGE_MASK)) - -int __ioremap_explicit(unsigned long pa, unsigned long ea, - unsigned long size, unsigned long flags) -{ - struct vm_struct *area; - void __iomem *ret; - - /* For now, require page-aligned values for pa, ea, and size */ - if (!IS_PAGE_ALIGNED(pa) || !IS_PAGE_ALIGNED(ea) || - !IS_PAGE_ALIGNED(size)) { - printk(KERN_ERR "unaligned value in %s\n", __FUNCTION__); - return 1; - } - - if (!mem_init_done) { - /* Two things to consider in this case: - * 1) No records will be kept (imalloc, etc) that the region - * has been remapped - * 2) It won't be easy to iounmap() the region later (because - * of 1) - */ - ; - } else { - area = im_get_area(ea, size, - IM_REGION_UNUSED|IM_REGION_SUBSET|IM_REGION_EXISTS); - if (area == NULL) { - /* Expected when PHB-dlpar is in play */ - return 1; - } - if (ea != (unsigned long) area->addr) { - printk(KERN_ERR "unexpected addr return from " - "im_get_area\n"); - return 1; - } - } - - ret = __ioremap_com(pa, pa, ea, size, flags); - if (ret == NULL) { - printk(KERN_ERR "ioremap_explicit() allocation failure !\n"); - return 1; - } - if (ret != (void *) ea) { - printk(KERN_ERR "__ioremap_com() returned unexpected addr\n"); - return 1; - } - - return 0; -} - -/* - * Unmap an IO region and remove it from imalloc'd list. - * Access to IO memory should be serialized by driver. 
- * This code is modeled after vmalloc code - unmap_vm_area() - * - * XXX what about calls before mem_init_done (ie python_countermeasures()) - */ -void iounmap(volatile void __iomem *token) -{ - void *addr; - - if (!mem_init_done) - return; - - addr = (void *) ((unsigned long __force) token & PAGE_MASK); - - im_free(addr); -} - -static int iounmap_subset_regions(unsigned long addr, unsigned long size) -{ - struct vm_struct *area; - - /* Check whether subsets of this region exist */ - area = im_get_area(addr, size, IM_REGION_SUPERSET); - if (area == NULL) - return 1; - - while (area) { - iounmap((void __iomem *) area->addr); - area = im_get_area(addr, size, - IM_REGION_SUPERSET); - } - - return 0; -} - -int iounmap_explicit(volatile void __iomem *start, unsigned long size) -{ - struct vm_struct *area; - unsigned long addr; - int rc; - - addr = (unsigned long __force) start & PAGE_MASK; - - /* Verify that the region either exists or is a subset of an existing - * region. In the latter case, split the parent region to create - * the exact region - */ - area = im_get_area(addr, size, - IM_REGION_EXISTS | IM_REGION_SUBSET); - if (area == NULL) { - /* Determine whether subset regions exist. If so, unmap */ - rc = iounmap_subset_regions(addr, size); - if (rc) { - printk(KERN_ERR - "%s() cannot unmap nonexistent range 0x%lx\n", - __FUNCTION__, addr); - return 1; - } - } else { - iounmap((void __iomem *) area->addr); - } - /* - * FIXME! 
This can't be right: - iounmap(area->addr); - * Maybe it should be "iounmap(area);" - */ - return 0; -} - -#endif - -EXPORT_SYMBOL(ioremap); -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(iounmap); - -void free_initmem(void) -{ - unsigned long addr; - - addr = (unsigned long)__init_begin; - for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { - memset((void *)addr, 0xcc, PAGE_SIZE); - ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); - free_page(addr); - totalram_pages++; - } - printk ("Freeing unused kernel memory: %luk freed\n", - ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10); -} - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - if (start < end) - printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { - ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); - free_page(start); - totalram_pages++; - } -} -#endif - -static DEFINE_SPINLOCK(mmu_context_lock); -static DEFINE_IDR(mmu_context_idr); - -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - int index; - int err; - -again: - if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL)) - return -ENOMEM; - - spin_lock(&mmu_context_lock); - err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index); - spin_unlock(&mmu_context_lock); - - if (err == -EAGAIN) - goto again; - else if (err) - return err; - - if (index > MAX_CONTEXT) { - idr_remove(&mmu_context_idr, index); - return -ENOMEM; - } - - mm->context.id = index; - - return 0; -} - -void destroy_context(struct mm_struct *mm) -{ - spin_lock(&mmu_context_lock); - idr_remove(&mmu_context_idr, mm->context.id); - spin_unlock(&mmu_context_lock); - - mm->context.id = NO_CONTEXT; -} - -/* - * Do very early mm setup. 
- */ -void __init mm_init_ppc64(void) -{ -#ifndef CONFIG_PPC_ISERIES - unsigned long i; -#endif - - ppc64_boot_msg(0x100, "MM Init"); - - /* This is the story of the IO hole... please, keep seated, - * unfortunately, we are out of oxygen masks at the moment. - * So we need some rough way to tell where your big IO hole - * is. On pmac, it's between 2G and 4G, on POWER3, it's around - * that area as well, on POWER4 we don't have one, etc... - * We need that as a "hint" when sizing the TCE table on POWER3 - * So far, the simplest way that seem work well enough for us it - * to just assume that the first discontinuity in our physical - * RAM layout is the IO hole. That may not be correct in the future - * (and isn't on iSeries but then we don't care ;) - */ - -#ifndef CONFIG_PPC_ISERIES - for (i = 1; i < lmb.memory.cnt; i++) { - unsigned long base, prevbase, prevsize; - - prevbase = lmb.memory.region[i-1].base; - prevsize = lmb.memory.region[i-1].size; - base = lmb.memory.region[i].base; - if (base > (prevbase + prevsize)) { - io_hole_start = prevbase + prevsize; - io_hole_size = base - (prevbase + prevsize); - break; - } - } -#endif /* CONFIG_PPC_ISERIES */ - if (io_hole_start) - printk("IO Hole assumed to be %lx -> %lx\n", - io_hole_start, io_hole_start + io_hole_size - 1); - - ppc64_boot_msg(0x100, "MM Init Done"); -} - -/* - * This is called by /dev/mem to know if a given address has to - * be mapped non-cacheable or not - */ -int page_is_ram(unsigned long pfn) -{ - int i; - unsigned long paddr = (pfn << PAGE_SHIFT); - - for (i=0; i < lmb.memory.cnt; i++) { - unsigned long base; - - base = lmb.memory.region[i].base; - - if ((paddr >= base) && - (paddr < (base + lmb.memory.region[i].size))) { - return 1; - } - } - - return 0; -} -EXPORT_SYMBOL(page_is_ram); - -/* - * Initialize the bootmem system and give it all the memory we - * have available. 
- */ -#ifndef CONFIG_NEED_MULTIPLE_NODES -void __init do_init_bootmem(void) -{ - unsigned long i; - unsigned long start, bootmap_pages; - unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT; - int boot_mapsize; - - /* - * Find an area to use for the bootmem bitmap. Calculate the size of - * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE. - * Add 1 additional page in case the address isn't page-aligned. - */ - bootmap_pages = bootmem_bootmap_pages(total_pages); - - start = lmb_alloc(bootmap_pages<> PAGE_SHIFT, total_pages); - - max_pfn = max_low_pfn; - - /* Add all physical memory to the bootmem map, mark each area - * present. - */ - for (i=0; i < lmb.memory.cnt; i++) - free_bootmem(lmb.memory.region[i].base, - lmb_size_bytes(&lmb.memory, i)); - - /* reserve the sections we're already using */ - for (i=0; i < lmb.reserved.cnt; i++) - reserve_bootmem(lmb.reserved.region[i].base, - lmb_size_bytes(&lmb.reserved, i)); - - for (i=0; i < lmb.memory.cnt; i++) - memory_present(0, lmb_start_pfn(&lmb.memory, i), - lmb_end_pfn(&lmb.memory, i)); -} - -/* - * paging_init() sets up the page tables - in fact we've already done this. - */ -void __init paging_init(void) -{ - unsigned long zones_size[MAX_NR_ZONES]; - unsigned long zholes_size[MAX_NR_ZONES]; - unsigned long total_ram = lmb_phys_mem_size(); - unsigned long top_of_ram = lmb_end_of_DRAM(); - - printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", - top_of_ram, total_ram); - printk(KERN_INFO "Memory hole size: %ldMB\n", - (top_of_ram - total_ram) >> 20); - /* - * All pages are DMA-able so we put them all in the DMA zone. - */ - memset(zones_size, 0, sizeof(zones_size)); - memset(zholes_size, 0, sizeof(zholes_size)); - - zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT; - zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT; - - free_area_init_node(0, NODE_DATA(0), zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size); -} -#endif /* ! 
CONFIG_NEED_MULTIPLE_NODES */ - -static struct kcore_list kcore_vmem; - -static int __init setup_kcore(void) -{ - int i; - - for (i=0; i < lmb.memory.cnt; i++) { - unsigned long base, size; - struct kcore_list *kcore_mem; - - base = lmb.memory.region[i].base; - size = lmb.memory.region[i].size; - - /* GFP_ATOMIC to avoid might_sleep warnings during boot */ - kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC); - if (!kcore_mem) - panic("mem_init: kmalloc failed\n"); - - kclist_add(kcore_mem, __va(base), size); - } - - kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); - - return 0; -} -module_init(setup_kcore); - -void __init mem_init(void) -{ -#ifdef CONFIG_NEED_MULTIPLE_NODES - int nid; -#endif - pg_data_t *pgdat; - unsigned long i; - struct page *page; - unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize; - - num_physpages = max_low_pfn; /* RAM is assumed contiguous */ - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - -#ifdef CONFIG_NEED_MULTIPLE_NODES - for_each_online_node(nid) { - if (NODE_DATA(nid)->node_spanned_pages != 0) { - printk("freeing bootmem node %x\n", nid); - totalram_pages += - free_all_bootmem_node(NODE_DATA(nid)); - } - } -#else - max_mapnr = num_physpages; - totalram_pages += free_all_bootmem(); -#endif - - for_each_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; i++) { - page = pgdat_page_nr(pgdat, i); - if (PageReserved(page)) - reservedpages++; - } - } - - codesize = (unsigned long)&_etext - (unsigned long)&_stext; - initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin; - datasize = (unsigned long)&_edata - (unsigned long)&__init_end; - bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start; - - printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, " - "%luk reserved, %luk data, %luk bss, %luk init)\n", - (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), - num_physpages << (PAGE_SHIFT-10), - codesize >> 10, - reservedpages << 
(PAGE_SHIFT-10), - datasize >> 10, - bsssize >> 10, - initsize >> 10); - - mem_init_done = 1; - - /* Initialize the vDSO */ - vdso_init(); -} - -/* - * This is called when a page has been modified by the kernel. - * It just marks the page as not i-cache clean. We do the i-cache - * flush later when the page is given to a user process, if necessary. - */ -void flush_dcache_page(struct page *page) -{ - if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - return; - /* avoid an atomic op if possible */ - if (test_bit(PG_arch_1, &page->flags)) - clear_bit(PG_arch_1, &page->flags); -} -EXPORT_SYMBOL(flush_dcache_page); - -void clear_user_page(void *page, unsigned long vaddr, struct page *pg) -{ - clear_page(page); - - if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - return; - /* - * We shouldnt have to do this, but some versions of glibc - * require it (ld.so assumes zero filled pages are icache clean) - * - Anton - */ - - /* avoid an atomic op if possible */ - if (test_bit(PG_arch_1, &pg->flags)) - clear_bit(PG_arch_1, &pg->flags); -} -EXPORT_SYMBOL(clear_user_page); - -void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, - struct page *pg) -{ - copy_page(vto, vfrom); - - /* - * We should be able to use the following optimisation, however - * there are two problems. - * Firstly a bug in some versions of binutils meant PLT sections - * were not marked executable. - * Secondly the first word in the GOT section is blrl, used - * to establish the GOT address. Until recently the GOT was - * not marked executable. 
- * - Anton - */ -#if 0 - if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0)) - return; -#endif - - if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - return; - - /* avoid an atomic op if possible */ - if (test_bit(PG_arch_1, &pg->flags)) - clear_bit(PG_arch_1, &pg->flags); -} - -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, - unsigned long addr, int len) -{ - unsigned long maddr; - - maddr = (unsigned long)page_address(page) + (addr & ~PAGE_MASK); - flush_icache_range(maddr, maddr + len); -} -EXPORT_SYMBOL(flush_icache_user_range); - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a PTE in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux PTE. - * - * This must always be called with the mm->page_table_lock held - */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea, - pte_t pte) -{ - unsigned long vsid; - void *pgdir; - pte_t *ptep; - int local = 0; - cpumask_t tmp; - unsigned long flags; - - /* handle i-cache coherency */ - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) && - !cpu_has_feature(CPU_FTR_NOEXECUTE)) { - unsigned long pfn = pte_pfn(pte); - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - if (!PageReserved(page) - && !test_bit(PG_arch_1, &page->flags)) { - __flush_dcache_icache(page_address(page)); - set_bit(PG_arch_1, &page->flags); - } - } - } - - /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ - if (!pte_young(pte)) - return; - - pgdir = vma->vm_mm->pgd; - if (pgdir == NULL) - return; - - ptep = find_linux_pte(pgdir, ea); - if (!ptep) - return; - - vsid = get_vsid(vma->vm_mm->context.id, ea); - - local_irq_save(flags); - tmp = cpumask_of_cpu(smp_processor_id()); - if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp)) - local = 1; - - __hash_page(ea, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep, - 0x300, local); - local_irq_restore(flags); -} - 
-void __iomem * reserve_phb_iospace(unsigned long size) -{ - void __iomem *virt_addr; - - if (phbs_io_bot >= IMALLOC_BASE) - panic("reserve_phb_iospace(): phb io space overflow\n"); - - virt_addr = (void __iomem *) phbs_io_bot; - phbs_io_bot += size; - - return virt_addr; -} - -static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags) -{ - memset(addr, 0, kmem_cache_size(cache)); -} - -static const int pgtable_cache_size[2] = { - PTE_TABLE_SIZE, PMD_TABLE_SIZE -}; -static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { - "pgd_pte_cache", "pud_pmd_cache", -}; - -kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; - -void pgtable_cache_init(void) -{ - int i; - - BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]); - BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]); - BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]); - BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]); - - for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) { - int size = pgtable_cache_size[i]; - const char *name = pgtable_cache_name[i]; - - pgtable_cache[i] = kmem_cache_create(name, - size, size, - SLAB_HWCACHE_ALIGN - | SLAB_MUST_HWCACHE_ALIGN, - zero_ctor, - NULL); - if (! 
pgtable_cache[i]) - panic("pgtable_cache_init(): could not create %s!\n", - name); - } -} - -pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, - unsigned long size, pgprot_t vma_prot) -{ - if (ppc_md.phys_mem_access_prot) - return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot); - - if (!page_is_ram(addr >> PAGE_SHIFT)) - vma_prot = __pgprot(pgprot_val(vma_prot) - | _PAGE_GUARDED | _PAGE_NO_CACHE); - return vma_prot; -} -EXPORT_SYMBOL(phys_mem_access_prot); diff --git a/arch/ppc64/mm/mmap.c b/arch/ppc64/mm/mmap.c deleted file mode 100644 index fe65f522aff3..000000000000 --- a/arch/ppc64/mm/mmap.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * linux/arch/ppc64/mm/mmap.c - * - * flexible mmap layout support - * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * - * Started by Ingo Molnar - */ - -#include -#include - -/* - * Top of mmap area (just below the process stack). - * - * Leave an at least ~128 MB hole. 
- */ -#define MIN_GAP (128*1024*1024) -#define MAX_GAP (TASK_SIZE/6*5) - -static inline unsigned long mmap_base(void) -{ - unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return TASK_SIZE - (gap & PAGE_MASK); -} - -static inline int mmap_is_legacy(void) -{ - /* - * Force standard allocation for 64 bit programs. - */ - if (!test_thread_flag(TIF_32BIT)) - return 1; - - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -/* - * This function, called very early during the creation of a new - * process VM image, sets up which VM layout function to use: - */ -void arch_pick_mmap_layout(struct mm_struct *mm) -{ - /* - * Fall back to the standard layout if the personality - * bit is set, or if the expected stack growth is unlimited: - */ - if (mmap_is_legacy()) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; - } else { - mm->mmap_base = mmap_base(); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; - } -} diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c deleted file mode 100644 index cb864b8f2750..000000000000 --- a/arch/ppc64/mm/numa.c +++ /dev/null @@ -1,779 +0,0 @@ -/* - * pSeries NUMA support - * - * Copyright (C) 2002 Anton Blanchard , IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int numa_enabled = 1; - -static int numa_debug; -#define dbg(args...) 
if (numa_debug) { printk(KERN_INFO args); } - -#ifdef DEBUG_NUMA -#define ARRAY_INITIALISER -1 -#else -#define ARRAY_INITIALISER 0 -#endif - -int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] = - ARRAY_INITIALISER}; -char *numa_memory_lookup_table; -cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES]; -int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0}; - -struct pglist_data *node_data[MAX_NUMNODES]; -bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; -static int min_common_depth; - -/* - * We need somewhere to store start/span for each node until we have - * allocated the real node_data structures. - */ -static struct { - unsigned long node_start_pfn; - unsigned long node_end_pfn; - unsigned long node_present_pages; -} init_node_data[MAX_NUMNODES] __initdata; - -EXPORT_SYMBOL(node_data); -EXPORT_SYMBOL(numa_cpu_lookup_table); -EXPORT_SYMBOL(numa_memory_lookup_table); -EXPORT_SYMBOL(numa_cpumask_lookup_table); -EXPORT_SYMBOL(nr_cpus_in_node); - -static inline void map_cpu_to_node(int cpu, int node) -{ - numa_cpu_lookup_table[cpu] = node; - if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) { - cpu_set(cpu, numa_cpumask_lookup_table[node]); - nr_cpus_in_node[node]++; - } -} - -#ifdef CONFIG_HOTPLUG_CPU -static void unmap_cpu_from_node(unsigned long cpu) -{ - int node = numa_cpu_lookup_table[cpu]; - - dbg("removing cpu %lu from node %d\n", cpu, node); - - if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) { - cpu_clear(cpu, numa_cpumask_lookup_table[node]); - nr_cpus_in_node[node]--; - } else { - printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", - cpu, node); - } -} -#endif /* CONFIG_HOTPLUG_CPU */ - -static struct device_node * __devinit find_cpu_node(unsigned int cpu) -{ - unsigned int hw_cpuid = get_hard_smp_processor_id(cpu); - struct device_node *cpu_node = NULL; - unsigned int *interrupt_server, *reg; - int len; - - while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) { - /* Try interrupt server 
first */ - interrupt_server = (unsigned int *)get_property(cpu_node, - "ibm,ppc-interrupt-server#s", &len); - - len = len / sizeof(u32); - - if (interrupt_server && (len > 0)) { - while (len--) { - if (interrupt_server[len] == hw_cpuid) - return cpu_node; - } - } else { - reg = (unsigned int *)get_property(cpu_node, - "reg", &len); - if (reg && (len > 0) && (reg[0] == hw_cpuid)) - return cpu_node; - } - } - - return NULL; -} - -/* must hold reference to node during call */ -static int *of_get_associativity(struct device_node *dev) -{ - return (unsigned int *)get_property(dev, "ibm,associativity", NULL); -} - -static int of_node_numa_domain(struct device_node *device) -{ - int numa_domain; - unsigned int *tmp; - - if (min_common_depth == -1) - return 0; - - tmp = of_get_associativity(device); - if (tmp && (tmp[0] >= min_common_depth)) { - numa_domain = tmp[min_common_depth]; - } else { - dbg("WARNING: no NUMA information for %s\n", - device->full_name); - numa_domain = 0; - } - return numa_domain; -} - -/* - * In theory, the "ibm,associativity" property may contain multiple - * associativity lists because a resource may be multiply connected - * into the machine. This resource then has different associativity - * characteristics relative to its multiple connections. We ignore - * this for now. We also assume that all cpu and memory sets have - * their distances represented at a common level. This won't be - * true for heirarchical NUMA. - * - * In any case the ibm,associativity-reference-points should give - * the correct depth for a normal NUMA system. - * - * - Dave Hansen - */ -static int __init find_min_common_depth(void) -{ - int depth; - unsigned int *ref_points; - struct device_node *rtas_root; - unsigned int len; - - rtas_root = of_find_node_by_path("/rtas"); - - if (!rtas_root) - return -1; - - /* - * this property is 2 32-bit integers, each representing a level of - * depth in the associativity nodes. 
The first is for an SMP - * configuration (should be all 0's) and the second is for a normal - * NUMA configuration. - */ - ref_points = (unsigned int *)get_property(rtas_root, - "ibm,associativity-reference-points", &len); - - if ((len >= 1) && ref_points) { - depth = ref_points[1]; - } else { - dbg("WARNING: could not find NUMA " - "associativity reference point\n"); - depth = -1; - } - of_node_put(rtas_root); - - return depth; -} - -static int __init get_mem_addr_cells(void) -{ - struct device_node *memory = NULL; - int rc; - - memory = of_find_node_by_type(memory, "memory"); - if (!memory) - return 0; /* it won't matter */ - - rc = prom_n_addr_cells(memory); - return rc; -} - -static int __init get_mem_size_cells(void) -{ - struct device_node *memory = NULL; - int rc; - - memory = of_find_node_by_type(memory, "memory"); - if (!memory) - return 0; /* it won't matter */ - rc = prom_n_size_cells(memory); - return rc; -} - -static unsigned long read_n_cells(int n, unsigned int **buf) -{ - unsigned long result = 0; - - while (n--) { - result = (result << 32) | **buf; - (*buf)++; - } - return result; -} - -/* - * Figure out to which domain a cpu belongs and stick it there. - * Return the id of the domain used. - */ -static int numa_setup_cpu(unsigned long lcpu) -{ - int numa_domain = 0; - struct device_node *cpu = find_cpu_node(lcpu); - - if (!cpu) { - WARN_ON(1); - goto out; - } - - numa_domain = of_node_numa_domain(cpu); - - if (numa_domain >= num_online_nodes()) { - /* - * POWER4 LPAR uses 0xffff as invalid node, - * dont warn in this case. 
- */ - if (numa_domain != 0xffff) - printk(KERN_ERR "WARNING: cpu %ld " - "maps to invalid NUMA node %d\n", - lcpu, numa_domain); - numa_domain = 0; - } -out: - node_set_online(numa_domain); - - map_cpu_to_node(lcpu, numa_domain); - - of_node_put(cpu); - - return numa_domain; -} - -static int cpu_numa_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - unsigned long lcpu = (unsigned long)hcpu; - int ret = NOTIFY_DONE; - - switch (action) { - case CPU_UP_PREPARE: - if (min_common_depth == -1 || !numa_enabled) - map_cpu_to_node(lcpu, 0); - else - numa_setup_cpu(lcpu); - ret = NOTIFY_OK; - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_UP_CANCELED: - unmap_cpu_from_node(lcpu); - break; - ret = NOTIFY_OK; -#endif - } - return ret; -} - -/* - * Check and possibly modify a memory region to enforce the memory limit. - * - * Returns the size the region should have to enforce the memory limit. - * This will either be the original value of size, a truncated value, - * or zero. If the returned value of size is 0 the region should be - * discarded as it lies wholy above the memory limit. - */ -static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size) -{ - /* - * We use lmb_end_of_DRAM() in here instead of memory_limit because - * we've already adjusted it for the limit and it takes care of - * having memory holes below the limit. - */ - extern unsigned long memory_limit; - - if (! 
memory_limit) - return size; - - if (start + size <= lmb_end_of_DRAM()) - return size; - - if (start >= lmb_end_of_DRAM()) - return 0; - - return lmb_end_of_DRAM() - start; -} - -static int __init parse_numa_properties(void) -{ - struct device_node *cpu = NULL; - struct device_node *memory = NULL; - int addr_cells, size_cells; - int max_domain = 0; - long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT; - unsigned long i; - - if (numa_enabled == 0) { - printk(KERN_WARNING "NUMA disabled by user\n"); - return -1; - } - - numa_memory_lookup_table = - (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1)); - memset(numa_memory_lookup_table, 0, entries * sizeof(char)); - - for (i = 0; i < entries ; i++) - numa_memory_lookup_table[i] = ARRAY_INITIALISER; - - min_common_depth = find_min_common_depth(); - - dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); - if (min_common_depth < 0) - return min_common_depth; - - max_domain = numa_setup_cpu(boot_cpuid); - - /* - * Even though we connect cpus to numa domains later in SMP init, - * we need to know the maximum node id now. This is because each - * node id must have NODE_DATA etc backing it. - * As a result of hotplug we could still have cpus appear later on - * with larger node ids. In that case we force the cpu into node 0. 
- */ - for_each_cpu(i) { - int numa_domain; - - cpu = find_cpu_node(i); - - if (cpu) { - numa_domain = of_node_numa_domain(cpu); - of_node_put(cpu); - - if (numa_domain < MAX_NUMNODES && - max_domain < numa_domain) - max_domain = numa_domain; - } - } - - addr_cells = get_mem_addr_cells(); - size_cells = get_mem_size_cells(); - memory = NULL; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { - unsigned long start; - unsigned long size; - int numa_domain; - int ranges; - unsigned int *memcell_buf; - unsigned int len; - - memcell_buf = (unsigned int *)get_property(memory, "reg", &len); - if (!memcell_buf || len <= 0) - continue; - - ranges = memory->n_addrs; -new_range: - /* these are order-sensitive, and modify the buffer pointer */ - start = read_n_cells(addr_cells, &memcell_buf); - size = read_n_cells(size_cells, &memcell_buf); - - start = _ALIGN_DOWN(start, MEMORY_INCREMENT); - size = _ALIGN_UP(size, MEMORY_INCREMENT); - - numa_domain = of_node_numa_domain(memory); - - if (numa_domain >= MAX_NUMNODES) { - if (numa_domain != 0xffff) - printk(KERN_ERR "WARNING: memory at %lx maps " - "to invalid NUMA node %d\n", start, - numa_domain); - numa_domain = 0; - } - - if (max_domain < numa_domain) - max_domain = numa_domain; - - if (! (size = numa_enforce_memory_limit(start, size))) { - if (--ranges) - goto new_range; - else - continue; - } - - /* - * Initialize new node struct, or add to an existing one. 
- */ - if (init_node_data[numa_domain].node_end_pfn) { - if ((start / PAGE_SIZE) < - init_node_data[numa_domain].node_start_pfn) - init_node_data[numa_domain].node_start_pfn = - start / PAGE_SIZE; - if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) > - init_node_data[numa_domain].node_end_pfn) - init_node_data[numa_domain].node_end_pfn = - (start / PAGE_SIZE) + - (size / PAGE_SIZE); - - init_node_data[numa_domain].node_present_pages += - size / PAGE_SIZE; - } else { - node_set_online(numa_domain); - - init_node_data[numa_domain].node_start_pfn = - start / PAGE_SIZE; - init_node_data[numa_domain].node_end_pfn = - init_node_data[numa_domain].node_start_pfn + - size / PAGE_SIZE; - init_node_data[numa_domain].node_present_pages = - size / PAGE_SIZE; - } - - for (i = start ; i < (start+size); i += MEMORY_INCREMENT) - numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = - numa_domain; - - if (--ranges) - goto new_range; - } - - for (i = 0; i <= max_domain; i++) - node_set_online(i); - - return 0; -} - -static void __init setup_nonnuma(void) -{ - unsigned long top_of_ram = lmb_end_of_DRAM(); - unsigned long total_ram = lmb_phys_mem_size(); - unsigned long i; - - printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", - top_of_ram, total_ram); - printk(KERN_INFO "Memory hole size: %ldMB\n", - (top_of_ram - total_ram) >> 20); - - if (!numa_memory_lookup_table) { - long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT; - numa_memory_lookup_table = - (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1)); - memset(numa_memory_lookup_table, 0, entries * sizeof(char)); - for (i = 0; i < entries ; i++) - numa_memory_lookup_table[i] = ARRAY_INITIALISER; - } - - map_cpu_to_node(boot_cpuid, 0); - - node_set_online(0); - - init_node_data[0].node_start_pfn = 0; - init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE; - init_node_data[0].node_present_pages = total_ram / PAGE_SIZE; - - for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT) - numa_memory_lookup_table[i >> 
MEMORY_INCREMENT_SHIFT] = 0; -} - -static void __init dump_numa_topology(void) -{ - unsigned int node; - unsigned int count; - - if (min_common_depth == -1 || !numa_enabled) - return; - - for_each_online_node(node) { - unsigned long i; - - printk(KERN_INFO "Node %d Memory:", node); - - count = 0; - - for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) { - if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) { - if (count == 0) - printk(" 0x%lx", i); - ++count; - } else { - if (count > 0) - printk("-0x%lx", i); - count = 0; - } - } - - if (count > 0) - printk("-0x%lx", i); - printk("\n"); - } - return; -} - -/* - * Allocate some memory, satisfying the lmb or bootmem allocator where - * required. nid is the preferred node and end is the physical address of - * the highest address in the node. - * - * Returns the physical address of the memory. - */ -static unsigned long careful_allocation(int nid, unsigned long size, - unsigned long align, unsigned long end) -{ - unsigned long ret = lmb_alloc_base(size, align, end); - - /* retry over all memory */ - if (!ret) - ret = lmb_alloc_base(size, align, lmb_end_of_DRAM()); - - if (!ret) - panic("numa.c: cannot allocate %lu bytes on node %d", - size, nid); - - /* - * If the memory came from a previously allocated node, we must - * retry with the bootmem allocator. - */ - if (pa_to_nid(ret) < nid) { - nid = pa_to_nid(ret); - ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid), - size, align, 0); - - if (!ret) - panic("numa.c: cannot allocate %lu bytes on node %d", - size, nid); - - ret = virt_to_abs(ret); - - dbg("alloc_bootmem %lx %lx\n", ret, size); - } - - return ret; -} - -void __init do_init_bootmem(void) -{ - int nid; - int addr_cells, size_cells; - struct device_node *memory = NULL; - static struct notifier_block ppc64_numa_nb = { - .notifier_call = cpu_numa_callback, - .priority = 1 /* Must run before sched domains notifier. 
*/ - }; - - min_low_pfn = 0; - max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT; - max_pfn = max_low_pfn; - - if (parse_numa_properties()) - setup_nonnuma(); - else - dump_numa_topology(); - - register_cpu_notifier(&ppc64_numa_nb); - - for_each_online_node(nid) { - unsigned long start_paddr, end_paddr; - int i; - unsigned long bootmem_paddr; - unsigned long bootmap_pages; - - start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE; - end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE; - - /* Allocate the node structure node local if possible */ - NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid, - sizeof(struct pglist_data), - SMP_CACHE_BYTES, end_paddr); - NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid)); - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - - dbg("node %d\n", nid); - dbg("NODE_DATA() = %p\n", NODE_DATA(nid)); - - NODE_DATA(nid)->bdata = &plat_node_bdata[nid]; - NODE_DATA(nid)->node_start_pfn = - init_node_data[nid].node_start_pfn; - NODE_DATA(nid)->node_spanned_pages = - end_paddr - start_paddr; - - if (NODE_DATA(nid)->node_spanned_pages == 0) - continue; - - dbg("start_paddr = %lx\n", start_paddr); - dbg("end_paddr = %lx\n", end_paddr); - - bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT); - - bootmem_paddr = careful_allocation(nid, - bootmap_pages << PAGE_SHIFT, - PAGE_SIZE, end_paddr); - memset(abs_to_virt(bootmem_paddr), 0, - bootmap_pages << PAGE_SHIFT); - dbg("bootmap_paddr = %lx\n", bootmem_paddr); - - init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT, - start_paddr >> PAGE_SHIFT, - end_paddr >> PAGE_SHIFT); - - /* - * We need to do another scan of all memory sections to - * associate memory with the correct node. 
- */ - addr_cells = get_mem_addr_cells(); - size_cells = get_mem_size_cells(); - memory = NULL; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { - unsigned long mem_start, mem_size; - int numa_domain, ranges; - unsigned int *memcell_buf; - unsigned int len; - - memcell_buf = (unsigned int *)get_property(memory, "reg", &len); - if (!memcell_buf || len <= 0) - continue; - - ranges = memory->n_addrs; /* ranges in cell */ -new_range: - mem_start = read_n_cells(addr_cells, &memcell_buf); - mem_size = read_n_cells(size_cells, &memcell_buf); - if (numa_enabled) { - numa_domain = of_node_numa_domain(memory); - if (numa_domain >= MAX_NUMNODES) - numa_domain = 0; - } else - numa_domain = 0; - - if (numa_domain != nid) - continue; - - mem_size = numa_enforce_memory_limit(mem_start, mem_size); - if (mem_size) { - dbg("free_bootmem %lx %lx\n", mem_start, mem_size); - free_bootmem_node(NODE_DATA(nid), mem_start, mem_size); - } - - if (--ranges) /* process all ranges in cell */ - goto new_range; - } - - /* - * Mark reserved regions on this node - */ - for (i = 0; i < lmb.reserved.cnt; i++) { - unsigned long physbase = lmb.reserved.region[i].base; - unsigned long size = lmb.reserved.region[i].size; - - if (pa_to_nid(physbase) != nid && - pa_to_nid(physbase+size-1) != nid) - continue; - - if (physbase < end_paddr && - (physbase+size) > start_paddr) { - /* overlaps */ - if (physbase < start_paddr) { - size -= start_paddr - physbase; - physbase = start_paddr; - } - - if (size > end_paddr - physbase) - size = end_paddr - physbase; - - dbg("reserve_bootmem %lx %lx\n", physbase, - size); - reserve_bootmem_node(NODE_DATA(nid), physbase, - size); - } - } - /* - * This loop may look famaliar, but we have to do it again - * after marking our reserved memory to mark memory present - * for sparsemem. 
- */ - addr_cells = get_mem_addr_cells(); - size_cells = get_mem_size_cells(); - memory = NULL; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { - unsigned long mem_start, mem_size; - int numa_domain, ranges; - unsigned int *memcell_buf; - unsigned int len; - - memcell_buf = (unsigned int *)get_property(memory, "reg", &len); - if (!memcell_buf || len <= 0) - continue; - - ranges = memory->n_addrs; /* ranges in cell */ -new_range2: - mem_start = read_n_cells(addr_cells, &memcell_buf); - mem_size = read_n_cells(size_cells, &memcell_buf); - if (numa_enabled) { - numa_domain = of_node_numa_domain(memory); - if (numa_domain >= MAX_NUMNODES) - numa_domain = 0; - } else - numa_domain = 0; - - if (numa_domain != nid) - continue; - - mem_size = numa_enforce_memory_limit(mem_start, mem_size); - memory_present(numa_domain, mem_start >> PAGE_SHIFT, - (mem_start + mem_size) >> PAGE_SHIFT); - - if (--ranges) /* process all ranges in cell */ - goto new_range2; - } - - } -} - -void __init paging_init(void) -{ - unsigned long zones_size[MAX_NR_ZONES]; - unsigned long zholes_size[MAX_NR_ZONES]; - int nid; - - memset(zones_size, 0, sizeof(zones_size)); - memset(zholes_size, 0, sizeof(zholes_size)); - - for_each_online_node(nid) { - unsigned long start_pfn; - unsigned long end_pfn; - - start_pfn = init_node_data[nid].node_start_pfn; - end_pfn = init_node_data[nid].node_end_pfn; - - zones_size[ZONE_DMA] = end_pfn - start_pfn; - zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - - init_node_data[nid].node_present_pages; - - dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid, - zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]); - - free_area_init_node(nid, NODE_DATA(nid), zones_size, - start_pfn, zholes_size); - } -} - -static int __init early_numa(char *p) -{ - if (!p) - return 0; - - if (strstr(p, "off")) - numa_enabled = 0; - - if (strstr(p, "debug")) - numa_debug = 1; - - return 0; -} -early_param("numa", early_numa); diff --git a/arch/ppc64/mm/slb.c 
b/arch/ppc64/mm/slb.c deleted file mode 100644 index 0473953f6a37..000000000000 --- a/arch/ppc64/mm/slb.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - * PowerPC64 SLB support. - * - * Copyright (C) 2004 David Gibson , IBM - * Based on earlier code writteh by: - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * Copyright (C) 2002 Anton Blanchard , IBM - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include - -extern void slb_allocate(unsigned long ea); - -static inline unsigned long mk_esid_data(unsigned long ea, unsigned long slot) -{ - return (ea & ESID_MASK) | SLB_ESID_V | slot; -} - -static inline unsigned long mk_vsid_data(unsigned long ea, unsigned long flags) -{ - return (get_kernel_vsid(ea) << SLB_VSID_SHIFT) | flags; -} - -static inline void create_slbe(unsigned long ea, unsigned long flags, - unsigned long entry) -{ - asm volatile("slbmte %0,%1" : - : "r" (mk_vsid_data(ea, flags)), - "r" (mk_esid_data(ea, entry)) - : "memory" ); -} - -static void slb_flush_and_rebolt(void) -{ - /* If you change this make sure you change SLB_NUM_BOLTED - * appropriately too. */ - unsigned long ksp_flags = SLB_VSID_KERNEL; - unsigned long ksp_esid_data; - - WARN_ON(!irqs_disabled()); - - if (cpu_has_feature(CPU_FTR_16M_PAGE)) - ksp_flags |= SLB_VSID_L; - - ksp_esid_data = mk_esid_data(get_paca()->kstack, 2); - if ((ksp_esid_data & ESID_MASK) == KERNELBASE) - ksp_esid_data &= ~SLB_ESID_V; - - /* We need to do this all in asm, so we're sure we don't touch - * the stack between the slbia and rebolting it. 
*/ - asm volatile("isync\n" - "slbia\n" - /* Slot 1 - first VMALLOC segment */ - "slbmte %0,%1\n" - /* Slot 2 - kernel stack */ - "slbmte %2,%3\n" - "isync" - :: "r"(mk_vsid_data(VMALLOCBASE, SLB_VSID_KERNEL)), - "r"(mk_esid_data(VMALLOCBASE, 1)), - "r"(mk_vsid_data(ksp_esid_data, ksp_flags)), - "r"(ksp_esid_data) - : "memory"); -} - -/* Flush all user entries from the segment table of the current processor. */ -void switch_slb(struct task_struct *tsk, struct mm_struct *mm) -{ - unsigned long offset = get_paca()->slb_cache_ptr; - unsigned long esid_data = 0; - unsigned long pc = KSTK_EIP(tsk); - unsigned long stack = KSTK_ESP(tsk); - unsigned long unmapped_base; - - if (offset <= SLB_CACHE_ENTRIES) { - int i; - asm volatile("isync" : : : "memory"); - for (i = 0; i < offset; i++) { - esid_data = ((unsigned long)get_paca()->slb_cache[i] - << SID_SHIFT) | SLBIE_C; - asm volatile("slbie %0" : : "r" (esid_data)); - } - asm volatile("isync" : : : "memory"); - } else { - slb_flush_and_rebolt(); - } - - /* Workaround POWER5 < DD2.1 issue */ - if (offset == 1 || offset > SLB_CACHE_ENTRIES) - asm volatile("slbie %0" : : "r" (esid_data)); - - get_paca()->slb_cache_ptr = 0; - get_paca()->context = mm->context; - - /* - * preload some userspace segments into the SLB. 
- */ - if (test_tsk_thread_flag(tsk, TIF_32BIT)) - unmapped_base = TASK_UNMAPPED_BASE_USER32; - else - unmapped_base = TASK_UNMAPPED_BASE_USER64; - - if (pc >= KERNELBASE) - return; - slb_allocate(pc); - - if (GET_ESID(pc) == GET_ESID(stack)) - return; - - if (stack >= KERNELBASE) - return; - slb_allocate(stack); - - if ((GET_ESID(pc) == GET_ESID(unmapped_base)) - || (GET_ESID(stack) == GET_ESID(unmapped_base))) - return; - - if (unmapped_base >= KERNELBASE) - return; - slb_allocate(unmapped_base); -} - -void slb_initialize(void) -{ - /* On iSeries the bolted entries have already been set up by - * the hypervisor from the lparMap data in head.S */ -#ifndef CONFIG_PPC_ISERIES - unsigned long flags = SLB_VSID_KERNEL; - - /* Invalidate the entire SLB (even slot 0) & all the ERATS */ - if (cpu_has_feature(CPU_FTR_16M_PAGE)) - flags |= SLB_VSID_L; - - asm volatile("isync":::"memory"); - asm volatile("slbmte %0,%0"::"r" (0) : "memory"); - asm volatile("isync; slbia; isync":::"memory"); - create_slbe(KERNELBASE, flags, 0); - create_slbe(VMALLOCBASE, SLB_VSID_KERNEL, 1); - /* We don't bolt the stack for the time being - we're in boot, - * so the stack is in the bolted segment. By the time it goes - * elsewhere, we'll call _switch() which will bolt in the new - * one. 
*/ - asm volatile("isync":::"memory"); -#endif - - get_paca()->stab_rr = SLB_NUM_BOLTED; -} diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S deleted file mode 100644 index a3a03da503bc..000000000000 --- a/arch/ppc64/mm/slb_low.S +++ /dev/null @@ -1,151 +0,0 @@ -/* - * arch/ppc64/mm/slb_low.S - * - * Low-level SLB routines - * - * Copyright (C) 2004 David Gibson , IBM - * - * Based on earlier C version: - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * Copyright (C) 2002 Anton Blanchard , IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include - -/* void slb_allocate(unsigned long ea); - * - * Create an SLB entry for the given EA (user or kernel). - * r3 = faulting address, r13 = PACA - * r9, r10, r11 are clobbered by this function - * No other registers are examined or changed. - */ -_GLOBAL(slb_allocate) - /* - * First find a slot, round robin. Previously we tried to find - * a free slot first but that took too long. Unfortunately we - * dont have any LRU information to help us choose a slot. - */ -#ifdef CONFIG_PPC_ISERIES - /* - * On iSeries, the "bolted" stack segment can be cast out on - * shared processor switch so we need to check for a miss on - * it and restore it to the right slot. 
- */ - ld r9,PACAKSAVE(r13) - clrrdi r9,r9,28 - clrrdi r11,r3,28 - li r10,SLB_NUM_BOLTED-1 /* Stack goes in last bolted slot */ - cmpld r9,r11 - beq 3f -#endif /* CONFIG_PPC_ISERIES */ - - ld r10,PACASTABRR(r13) - addi r10,r10,1 - /* use a cpu feature mask if we ever change our slb size */ - cmpldi r10,SLB_NUM_ENTRIES - - blt+ 4f - li r10,SLB_NUM_BOLTED - -4: - std r10,PACASTABRR(r13) -3: - /* r3 = faulting address, r10 = entry */ - - srdi r9,r3,60 /* get region */ - srdi r3,r3,28 /* get esid */ - cmpldi cr7,r9,0xc /* cmp KERNELBASE for later use */ - - rldimi r10,r3,28,0 /* r10= ESID<<28 | entry */ - oris r10,r10,SLB_ESID_V@h /* r10 |= SLB_ESID_V */ - - /* r3 = esid, r10 = esid_data, cr7 = <>KERNELBASE */ - - blt cr7,0f /* user or kernel? */ - - /* kernel address: proto-VSID = ESID */ - /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but - * this code will generate the protoVSID 0xfffffffff for the - * top segment. That's ok, the scramble below will translate - * it to VSID 0, which is reserved as a bad VSID - one which - * will never have any pages in it. */ - li r11,SLB_VSID_KERNEL -BEGIN_FTR_SECTION - bne cr7,9f - li r11,(SLB_VSID_KERNEL|SLB_VSID_L) -END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) - b 9f - -0: /* user address: proto-VSID = context<<15 | ESID */ - srdi. 
r9,r3,USER_ESID_BITS - bne- 8f /* invalid ea bits set */ - -#ifdef CONFIG_HUGETLB_PAGE -BEGIN_FTR_SECTION - lhz r9,PACAHIGHHTLBAREAS(r13) - srdi r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT) - srd r9,r9,r11 - lhz r11,PACALOWHTLBAREAS(r13) - srd r11,r11,r3 - or r9,r9,r11 -END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) -#endif /* CONFIG_HUGETLB_PAGE */ - - li r11,SLB_VSID_USER - -#ifdef CONFIG_HUGETLB_PAGE -BEGIN_FTR_SECTION - rldimi r11,r9,8,55 /* shift masked bit into SLB_VSID_L */ -END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) -#endif /* CONFIG_HUGETLB_PAGE */ - - ld r9,PACACONTEXTID(r13) - rldimi r3,r9,USER_ESID_BITS,0 - -9: /* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */ - ASM_VSID_SCRAMBLE(r3,r9) - - rldimi r11,r3,SLB_VSID_SHIFT,16 /* combine VSID and flags */ - - /* - * No need for an isync before or after this slbmte. The exception - * we enter with and the rfid we exit with are context synchronizing. - */ - slbmte r11,r10 - - bgelr cr7 /* we're done for kernel addresses */ - - /* Update the slb cache */ - lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ - cmpldi r3,SLB_CACHE_ENTRIES - bge 1f - - /* still room in the slb cache */ - sldi r11,r3,1 /* r11 = offset * sizeof(u16) */ - rldicl r10,r10,36,28 /* get low 16 bits of the ESID */ - add r11,r11,r13 /* r11 = (u16 *)paca + offset */ - sth r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ - addi r3,r3,1 /* offset++ */ - b 2f -1: /* offset >= SLB_CACHE_ENTRIES */ - li r3,SLB_CACHE_ENTRIES+1 -2: - sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ - blr - -8: /* invalid EA */ - li r3,0 /* BAD_VSID */ - li r11,SLB_VSID_USER /* flags don't much matter */ - b 9b diff --git a/arch/ppc64/mm/stab.c b/arch/ppc64/mm/stab.c deleted file mode 100644 index 1b83f002bf27..000000000000 --- a/arch/ppc64/mm/stab.c +++ /dev/null @@ -1,279 +0,0 @@ -/* - * PowerPC64 Segment Translation Support. 
- * - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * - * Copyright (C) 2002 Anton Blanchard , IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -struct stab_entry { - unsigned long esid_data; - unsigned long vsid_data; -}; - -/* Both the segment table and SLB code uses the following cache */ -#define NR_STAB_CACHE_ENTRIES 8 -DEFINE_PER_CPU(long, stab_cache_ptr); -DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); - -/* - * Create a segment table entry for the given esid/vsid pair. - */ -static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid) -{ - unsigned long esid_data, vsid_data; - unsigned long entry, group, old_esid, castout_entry, i; - unsigned int global_entry; - struct stab_entry *ste, *castout_ste; - unsigned long kernel_segment = (esid << SID_SHIFT) >= KERNELBASE; - - vsid_data = vsid << STE_VSID_SHIFT; - esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V; - if (! kernel_segment) - esid_data |= STE_ESID_KS; - - /* Search the primary group first. */ - global_entry = (esid & 0x1f) << 3; - ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7)); - - /* Find an empty entry, if one exists. */ - for (group = 0; group < 2; group++) { - for (entry = 0; entry < 8; entry++, ste++) { - if (!(ste->esid_data & STE_ESID_V)) { - ste->vsid_data = vsid_data; - asm volatile("eieio":::"memory"); - ste->esid_data = esid_data; - return (global_entry | entry); - } - } - /* Now search the secondary group. 
*/ - global_entry = ((~esid) & 0x1f) << 3; - ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7)); - } - - /* - * Could not find empty entry, pick one with a round robin selection. - * Search all entries in the two groups. - */ - castout_entry = get_paca()->stab_rr; - for (i = 0; i < 16; i++) { - if (castout_entry < 8) { - global_entry = (esid & 0x1f) << 3; - ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7)); - castout_ste = ste + castout_entry; - } else { - global_entry = ((~esid) & 0x1f) << 3; - ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7)); - castout_ste = ste + (castout_entry - 8); - } - - /* Dont cast out the first kernel segment */ - if ((castout_ste->esid_data & ESID_MASK) != KERNELBASE) - break; - - castout_entry = (castout_entry + 1) & 0xf; - } - - get_paca()->stab_rr = (castout_entry + 1) & 0xf; - - /* Modify the old entry to the new value. */ - - /* Force previous translations to complete. DRENG */ - asm volatile("isync" : : : "memory"); - - old_esid = castout_ste->esid_data >> SID_SHIFT; - castout_ste->esid_data = 0; /* Invalidate old entry */ - - asm volatile("sync" : : : "memory"); /* Order update */ - - castout_ste->vsid_data = vsid_data; - asm volatile("eieio" : : : "memory"); /* Order update */ - castout_ste->esid_data = esid_data; - - asm volatile("slbie %0" : : "r" (old_esid << SID_SHIFT)); - /* Ensure completion of slbie */ - asm volatile("sync" : : : "memory"); - - return (global_entry | (castout_entry & 0x7)); -} - -/* - * Allocate a segment table entry for the given ea and mm - */ -static int __ste_allocate(unsigned long ea, struct mm_struct *mm) -{ - unsigned long vsid; - unsigned char stab_entry; - unsigned long offset; - - /* Kernel or user address? */ - if (ea >= KERNELBASE) { - vsid = get_kernel_vsid(ea); - } else { - if ((ea >= TASK_SIZE_USER64) || (! 
mm)) - return 1; - - vsid = get_vsid(mm->context.id, ea); - } - - stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid); - - if (ea < KERNELBASE) { - offset = __get_cpu_var(stab_cache_ptr); - if (offset < NR_STAB_CACHE_ENTRIES) - __get_cpu_var(stab_cache[offset++]) = stab_entry; - else - offset = NR_STAB_CACHE_ENTRIES+1; - __get_cpu_var(stab_cache_ptr) = offset; - - /* Order update */ - asm volatile("sync":::"memory"); - } - - return 0; -} - -int ste_allocate(unsigned long ea) -{ - return __ste_allocate(ea, current->mm); -} - -/* - * Do the segment table work for a context switch: flush all user - * entries from the table, then preload some probably useful entries - * for the new task - */ -void switch_stab(struct task_struct *tsk, struct mm_struct *mm) -{ - struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr; - struct stab_entry *ste; - unsigned long offset = __get_cpu_var(stab_cache_ptr); - unsigned long pc = KSTK_EIP(tsk); - unsigned long stack = KSTK_ESP(tsk); - unsigned long unmapped_base; - - /* Force previous translations to complete. DRENG */ - asm volatile("isync" : : : "memory"); - - if (offset <= NR_STAB_CACHE_ENTRIES) { - int i; - - for (i = 0; i < offset; i++) { - ste = stab + __get_cpu_var(stab_cache[i]); - ste->esid_data = 0; /* invalidate entry */ - } - } else { - unsigned long entry; - - /* Invalidate all entries. */ - ste = stab; - - /* Never flush the first entry. 
*/ - ste += 1; - for (entry = 1; - entry < (PAGE_SIZE / sizeof(struct stab_entry)); - entry++, ste++) { - unsigned long ea; - ea = ste->esid_data & ESID_MASK; - if (ea < KERNELBASE) { - ste->esid_data = 0; - } - } - } - - asm volatile("sync; slbia; sync":::"memory"); - - __get_cpu_var(stab_cache_ptr) = 0; - - /* Now preload some entries for the new task */ - if (test_tsk_thread_flag(tsk, TIF_32BIT)) - unmapped_base = TASK_UNMAPPED_BASE_USER32; - else - unmapped_base = TASK_UNMAPPED_BASE_USER64; - - __ste_allocate(pc, mm); - - if (GET_ESID(pc) == GET_ESID(stack)) - return; - - __ste_allocate(stack, mm); - - if ((GET_ESID(pc) == GET_ESID(unmapped_base)) - || (GET_ESID(stack) == GET_ESID(unmapped_base))) - return; - - __ste_allocate(unmapped_base, mm); - - /* Order update */ - asm volatile("sync" : : : "memory"); -} - -extern void slb_initialize(void); - -/* - * Allocate segment tables for secondary CPUs. These must all go in - * the first (bolted) segment, so that do_stab_bolted won't get a - * recursive segment miss on the segment table itself. - */ -void stabs_alloc(void) -{ - int cpu; - - if (cpu_has_feature(CPU_FTR_SLB)) - return; - - for_each_cpu(cpu) { - unsigned long newstab; - - if (cpu == 0) - continue; /* stab for CPU 0 is statically allocated */ - - newstab = lmb_alloc_base(PAGE_SIZE, PAGE_SIZE, 1< - * Rework for PPC64 port. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); - -/* This is declared as we are using the more or less generic - * include/asm-ppc64/tlb.h file -- tgall - */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); -unsigned long pte_freelist_forced_free; - -struct pte_freelist_batch -{ - struct rcu_head rcu; - unsigned int index; - pgtable_free_t tables[0]; -}; - -DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); -unsigned long pte_freelist_forced_free; - -#define PTE_FREELIST_SIZE \ - ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ - / sizeof(pgtable_free_t)) - -#ifdef CONFIG_SMP -static void pte_free_smp_sync(void *arg) -{ - /* Do nothing, just ensure we sync with all CPUs */ -} -#endif - -/* This is only called when we are critically out of memory - * (and fail to get a page in pte_free_tlb). - */ -static void pgtable_free_now(pgtable_free_t pgf) -{ - pte_freelist_forced_free++; - - smp_call_function(pte_free_smp_sync, NULL, 0, 1); - - pgtable_free(pgf); -} - -static void pte_free_rcu_callback(struct rcu_head *head) -{ - struct pte_freelist_batch *batch = - container_of(head, struct pte_freelist_batch, rcu); - unsigned int i; - - for (i = 0; i < batch->index; i++) - pgtable_free(batch->tables[i]); - - free_page((unsigned long)batch); -} - -static void pte_free_submit(struct pte_freelist_batch *batch) -{ - INIT_RCU_HEAD(&batch->rcu); - call_rcu(&batch->rcu, pte_free_rcu_callback); -} - -void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) -{ - /* This is safe as we are holding page_table_lock */ - cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); - struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); - - if (atomic_read(&tlb->mm->mm_users) < 2 || - cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) { - pgtable_free(pgf); - return; - } 
- - if (*batchp == NULL) { - *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); - if (*batchp == NULL) { - pgtable_free_now(pgf); - return; - } - (*batchp)->index = 0; - } - (*batchp)->tables[(*batchp)->index++] = pgf; - if ((*batchp)->index == PTE_FREELIST_SIZE) { - pte_free_submit(*batchp); - *batchp = NULL; - } -} - -/* - * Update the MMU hash table to correspond with a change to - * a Linux PTE. If wrprot is true, it is permissible to - * change the existing HPTE to read-only rather than removing it - * (if we remove it we should clear the _PTE_HPTEFLAGS bits). - */ -void hpte_update(struct mm_struct *mm, unsigned long addr, - unsigned long pte, int wrprot) -{ - struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); - unsigned long vsid; - int i; - - i = batch->index; - - /* - * This can happen when we are in the middle of a TLB batch and - * we encounter memory pressure (eg copy_page_range when it tries - * to allocate a new pte). If we have to reclaim memory and end - * up scanning and resetting referenced bits then our batch context - * will change mid stream. 
- */ - if (i != 0 && (mm != batch->mm || batch->large != pte_huge(pte))) { - flush_tlb_pending(); - i = 0; - } - if (i == 0) { - batch->mm = mm; - batch->large = pte_huge(pte); - } - if (addr < KERNELBASE) { - vsid = get_vsid(mm->context.id, addr); - WARN_ON(vsid == 0); - } else - vsid = get_kernel_vsid(addr); - batch->vaddr[i] = (vsid << 28 ) | (addr & 0x0fffffff); - batch->pte[i] = __pte(pte); - batch->index = ++i; - if (i >= PPC64_TLB_BATCH_NR) - flush_tlb_pending(); -} - -void __flush_tlb_pending(struct ppc64_tlb_batch *batch) -{ - int i; - int cpu; - cpumask_t tmp; - int local = 0; - - BUG_ON(in_interrupt()); - - cpu = get_cpu(); - i = batch->index; - tmp = cpumask_of_cpu(cpu); - if (cpus_equal(batch->mm->cpu_vm_mask, tmp)) - local = 1; - - if (i == 1) - flush_hash_page(batch->vaddr[0], batch->pte[0], local); - else - flush_hash_range(i, local); - batch->index = 0; - put_cpu(); -} - -void pte_free_finish(void) -{ - /* This is safe as we are holding page_table_lock */ - struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); - - if (*batchp == NULL) - return; - pte_free_submit(*batchp); - *batchp = NULL; -}