powerpc: Merge arch/ppc64/mm to arch/powerpc/mm
author    Paul Mackerras <paulus@samba.org>
          Mon, 10 Oct 2005 11:58:35 +0000 (21:58 +1000)
committer Paul Mackerras <paulus@samba.org>
          Mon, 10 Oct 2005 11:58:35 +0000 (21:58 +1000)
This moves the remaining files in arch/ppc64/mm to arch/powerpc/mm,
and arranges that we use them when compiling with ARCH=ppc64.

Signed-off-by: Paul Mackerras <paulus@samba.org>
31 files changed:
arch/powerpc/mm/Makefile
arch/powerpc/mm/hash_low_64.S [new file with mode: 0644]
arch/powerpc/mm/hash_native_64.c [new file with mode: 0644]
arch/powerpc/mm/hash_utils_64.c [new file with mode: 0644]
arch/powerpc/mm/hugetlbpage.c [new file with mode: 0644]
arch/powerpc/mm/imalloc.c [new file with mode: 0644]
arch/powerpc/mm/init_64.c
arch/powerpc/mm/mem.c
arch/powerpc/mm/mmap.c [new file with mode: 0644]
arch/powerpc/mm/mmu_decl.h
arch/powerpc/mm/numa.c [new file with mode: 0644]
arch/powerpc/mm/pgtable_64.c
arch/powerpc/mm/slb.c [new file with mode: 0644]
arch/powerpc/mm/slb_low.S [new file with mode: 0644]
arch/powerpc/mm/stab.c [new file with mode: 0644]
arch/powerpc/mm/tlb_64.c [new file with mode: 0644]
arch/ppc64/Makefile
arch/ppc64/mm/Makefile [deleted file]
arch/ppc64/mm/fault.c [deleted file]
arch/ppc64/mm/hash_low.S [deleted file]
arch/ppc64/mm/hash_native.c [deleted file]
arch/ppc64/mm/hash_utils.c [deleted file]
arch/ppc64/mm/hugetlbpage.c [deleted file]
arch/ppc64/mm/imalloc.c [deleted file]
arch/ppc64/mm/init.c [deleted file]
arch/ppc64/mm/mmap.c [deleted file]
arch/ppc64/mm/numa.c [deleted file]
arch/ppc64/mm/slb.c [deleted file]
arch/ppc64/mm/slb_low.S [deleted file]
arch/ppc64/mm/stab.c [deleted file]
arch/ppc64/mm/tlb.c [deleted file]

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 35497deeb4b21522e0bfcd0e015b8dca2242f3a4..612bc4ec72b1e1ec23e094ca7fd3517fe783d6a2 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -5,8 +5,14 @@
 obj-y                          := fault.o mem.o lmb.o
 obj-$(CONFIG_PPC32)            += init_32.o pgtable_32.o mmu_context_32.o \
                                   tlb_32.o
-obj-$(CONFIG_PPC64)            += init_64.o pgtable_64.o mmu_context_64.o
+hash-$(CONFIG_PPC_MULTIPLATFORM) := hash_native_64.o
+obj-$(CONFIG_PPC64)            += init_64.o pgtable_64.o mmu_context_64.o \
+                                  hash_utils_64.o hash_low_64.o tlb_64.o \
+                                  slb_low.o slb.o stab.o mmap.o imalloc.o \
+                                  $(hash-y)
 obj-$(CONFIG_PPC_STD_MMU_32)   += ppc_mmu_32.o hash_low_32.o
 obj-$(CONFIG_40x)              += 4xx_mmu.o
 obj-$(CONFIG_44x)              += 44x_mmu.o
 obj-$(CONFIG_FSL_BOOKE)                += fsl_booke_mmu.o
+obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
+obj-$(CONFIG_HUGETLB_PAGE)     += hugetlbpage.o
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
new file mode 100644
index 0000000..d6ed910
--- /dev/null
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -0,0 +1,288 @@
+/*
+ * ppc64 MMU hashtable management routines
+ *
+ * (c) Copyright IBM Corp. 2003
+ *
+ * Maintained by: Benjamin Herrenschmidt
+ *                <benh@kernel.crashing.org>
+ *
+ * This file is covered by the GNU Public Licence v2 as
+ * described in the kernel's COPYING file.
+ */
+
+#include <asm/reg.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/types.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+
+       .text
+
+/*
+ * Stackframe:
+ *             
+ *         +-> Back chain                      (SP + 256)
+ *         |   General register save area      (SP + 112)
+ *         |   Parameter save area             (SP + 48)
+ *         |   TOC save area                   (SP + 40)
+ *         |   link editor doubleword          (SP + 32)
+ *         |   compiler doubleword             (SP + 24)
+ *         |   LR save area                    (SP + 16)
+ *         |   CR save area                    (SP + 8)
+ * SP ---> +-- Back chain                      (SP + 0)
+ */
+#define STACKFRAMESIZE 256
+
+/* Save parameters offsets */
+#define STK_PARM(i)    (STACKFRAMESIZE + 48 + ((i)-3)*8)
+
+/* Save non-volatile offsets */
+#define STK_REG(i)     (112 + ((i)-14)*8)
+
+/*
+ * _hash_page(unsigned long ea, unsigned long access, unsigned long vsid,
+ *             pte_t *ptep, unsigned long trap, int local)
+ *
+ * Adds a page to the hash table. This is the non-LPAR version for now
+ */
+
+_GLOBAL(__hash_page)
+       mflr    r0
+       std     r0,16(r1)
+       stdu    r1,-STACKFRAMESIZE(r1)
+       /* Save all params that we need after a function call */
+       std     r6,STK_PARM(r6)(r1)
+       std     r8,STK_PARM(r8)(r1)
+       
+       /* Add _PAGE_PRESENT to access */
+       ori     r4,r4,_PAGE_PRESENT
+
+       /* Save non-volatile registers.
+        * r31 will hold "old PTE"
+        * r30 is "new PTE"
+        * r29 is "va"
+        * r28 is a hash value
+        * r27 is hashtab mask (maybe dynamically patched instead?)
+        */
+       std     r27,STK_REG(r27)(r1)
+       std     r28,STK_REG(r28)(r1)
+       std     r29,STK_REG(r29)(r1)
+       std     r30,STK_REG(r30)(r1)
+       std     r31,STK_REG(r31)(r1)
+       
+       /* Step 1:
+        *
+        * Check permissions, atomically mark the linux PTE busy
+        * and hashed.
+        */ 
+1:
+       ldarx   r31,0,r6
+       /* Check access rights (access & ~(pte_val(*ptep))) */
+       andc.   r0,r4,r31
+       bne-    htab_wrong_access
+       /* Check if PTE is busy */
+       andi.   r0,r31,_PAGE_BUSY
+       /* If so, just bail out and refault if needed. Someone else
+        * is changing this PTE anyway and might hash it.
+        */
+       bne-    bail_ok
+       /* Prepare new PTE value (turn access RW into DIRTY, then
+        * add BUSY,HASHPTE and ACCESSED)
+        */
+       rlwinm  r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */
+       or      r30,r30,r31
+       ori     r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+       /* Write the linux PTE atomically (setting busy) */
+       stdcx.  r30,0,r6
+       bne-    1b
+       isync
+
+       /* Step 2:
+        *
+        * Insert/Update the HPTE in the hash table. At this point,
+        * r4 (access) is re-useable, we use it for the new HPTE flags
+        */
+
+       /* Calc va and put it in r29 */
+       rldicr  r29,r5,28,63-28
+       rldicl  r3,r3,0,36
+       or      r29,r3,r29
+
+       /* Calculate hash value for primary slot and store it in r28 */
+       rldicl  r5,r5,0,25              /* vsid & 0x0000007fffffffff */
+       rldicl  r0,r3,64-12,48          /* (ea >> 12) & 0xffff */
+       xor     r28,r5,r0
+
+       /* Convert linux PTE bits into HW equivalents */
+       andi.   r3,r30,0x1fe            /* Get basic set of flags */
+       xori    r3,r3,HW_NO_EXEC        /* _PAGE_EXEC -> NOEXEC */
+       rlwinm  r0,r30,32-9+1,30,30     /* _PAGE_RW -> _PAGE_USER (r0) */
+       rlwinm  r4,r30,32-7+1,30,30     /* _PAGE_DIRTY -> _PAGE_USER (r4) */
+       and     r0,r0,r4                /* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */
+       andc    r0,r30,r0               /* r0 = pte & ~r0 */
+       rlwimi  r3,r0,32-1,31,31        /* Insert result into PP lsb */
+
+       /* We eventually do the icache sync here (maybe inline that
+        * code rather than call a C function...) 
+        */
+BEGIN_FTR_SECTION
+       mr      r4,r30
+       mr      r5,r7
+       bl      .hash_page_do_lazy_icache
+END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
+
+       /* At this point, r3 contains new PP bits, save them in
+        * place of "access" in the param area (sic)
+        */
+       std     r3,STK_PARM(r4)(r1)
+
+       /* Get htab_hash_mask */
+       ld      r4,htab_hash_mask@got(2)
+       ld      r27,0(r4)       /* htab_hash_mask -> r27 */
+
+       /* Check if we may already be in the hashtable, in this case, we
+        * go to out-of-line code to try to modify the HPTE
+        */
+       andi.   r0,r31,_PAGE_HASHPTE
+       bne     htab_modify_pte
+
+htab_insert_pte:
+       /* Clear hpte bits in new pte (we also clear BUSY btw) and
+        * add _PAGE_HASHPTE
+        */
+       lis     r0,_PAGE_HPTEFLAGS@h
+       ori     r0,r0,_PAGE_HPTEFLAGS@l
+       andc    r30,r30,r0
+       ori     r30,r30,_PAGE_HASHPTE
+
+       /* page number in r5 */
+       rldicl  r5,r31,64-PTE_SHIFT,PTE_SHIFT
+
+       /* Calculate primary group hash */
+       and     r0,r28,r27
+       rldicr  r3,r0,3,63-3    /* r0 = (hash & mask) << 3 */
+
+       /* Call ppc_md.hpte_insert */
+       ld      r7,STK_PARM(r4)(r1)     /* Retrieve new pp bits */
+       mr      r4,r29                  /* Retrieve va */
+       li      r6,0                    /* no vflags */
+_GLOBAL(htab_call_hpte_insert1)
+       bl      .                       /* Will be patched by htab_finish_init() */
+       cmpdi   0,r3,0
+       bge     htab_pte_insert_ok      /* Insertion successful */
+       cmpdi   0,r3,-2                 /* Critical failure */
+       beq-    htab_pte_insert_failure
+
+       /* Now try secondary slot */
+       
+       /* page number in r5 */
+       rldicl  r5,r31,64-PTE_SHIFT,PTE_SHIFT
+
+       /* Calculate secondary group hash */
+       andc    r0,r27,r28
+       rldicr  r3,r0,3,63-3    /* r0 = (~hash & mask) << 3 */
+       
+       /* Call ppc_md.hpte_insert */
+       ld      r7,STK_PARM(r4)(r1)     /* Retrieve new pp bits */
+       mr      r4,r29                  /* Retrieve va */
+       li      r6,HPTE_V_SECONDARY@l   /* secondary slot */
+_GLOBAL(htab_call_hpte_insert2)
+       bl      .                       /* Will be patched by htab_finish_init() */
+       cmpdi   0,r3,0
+       bge+    htab_pte_insert_ok      /* Insertion successful */
+       cmpdi   0,r3,-2                 /* Critical failure */
+       beq-    htab_pte_insert_failure
+
+       /* Both are full, we need to evict something */
+       mftb    r0
+       /* Pick a random group based on TB */
+       andi.   r0,r0,1
+       mr      r5,r28
+       bne     2f
+       not     r5,r5
+2:     and     r0,r5,r27
+       rldicr  r3,r0,3,63-3    /* r0 = (hash & mask) << 3 */   
+       /* Call ppc_md.hpte_remove */
+_GLOBAL(htab_call_hpte_remove)
+       bl      .                       /* Will be patched by htab_finish_init() */
+
+       /* Try all again */
+       b       htab_insert_pte 
+
+bail_ok:
+       li      r3,0
+       b       bail
+
+htab_pte_insert_ok:
+       /* Insert slot number & secondary bit in PTE */
+       rldimi  r30,r3,12,63-15
+               
+       /* Write out the PTE with a normal write
+        * (maybe adding an eieio would still be good?)
+        */
+htab_write_out_pte:
+       ld      r6,STK_PARM(r6)(r1)
+       std     r30,0(r6)
+       li      r3, 0
+bail:
+       ld      r27,STK_REG(r27)(r1)
+       ld      r28,STK_REG(r28)(r1)
+       ld      r29,STK_REG(r29)(r1)
+       ld      r30,STK_REG(r30)(r1)
+       ld      r31,STK_REG(r31)(r1)
+       addi    r1,r1,STACKFRAMESIZE
+       ld      r0,16(r1)
+       mtlr    r0
+       blr
+
+htab_modify_pte:
+       /* Keep PP bits in r4 and slot idx from the PTE around in r3 */
+       mr      r4,r3
+       rlwinm  r3,r31,32-12,29,31
+
+       /* Secondary group? If yes, get an inverted hash value */
+       mr      r5,r28
+       andi.   r0,r31,_PAGE_SECONDARY
+       beq     1f
+       not     r5,r5
+1:
+       /* Calculate proper slot value for ppc_md.hpte_updatepp */
+       and     r0,r5,r27
+       rldicr  r0,r0,3,63-3    /* r0 = (hash & mask) << 3 */
+       add     r3,r0,r3        /* add slot idx */
+
+       /* Call ppc_md.hpte_updatepp */
+       mr      r5,r29                  /* va */
+       li      r6,0                    /* large is 0 */
+       ld      r7,STK_PARM(r8)(r1)     /* get "local" param */
+_GLOBAL(htab_call_hpte_updatepp)
+       bl      .                       /* Will be patched by htab_finish_init() */
+
+       /* If we failed, it's typically because the HPTE wasn't really
+        * there, so we try an insertion.
+        */
+       cmpdi   0,r3,-1
+       beq-    htab_insert_pte
+
+       /* Clear the BUSY bit and Write out the PTE */
+       li      r0,_PAGE_BUSY
+       andc    r30,r30,r0
+       b       htab_write_out_pte
+
+htab_wrong_access:
+       /* Bail out clearing reservation */
+       stdcx.  r31,0,r6
+       li      r3,1
+       b       bail
+
+htab_pte_insert_failure:
+       /* Bail out restoring old PTE */
+       ld      r6,STK_PARM(r6)(r1)
+       std     r31,0(r6)
+       li      r3,-1
+       b       bail
+
+
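
The assembly above computes the virtual address and the primary/secondary
PTE-group offsets that are handed to ppc_md.hpte_insert. As a rough C
restatement of that arithmetic (an illustrative sketch only, not part of the
patch; the EA, VSID and hash-mask values are made up, and the << 3 assumes
HPTES_PER_GROUP == 8):

#include <stdio.h>

int main(void)
{
        unsigned long ea   = 0xd0001000ul;              /* example effective address (r3) */
        unsigned long vsid = 0x123456789ul;             /* example VSID (r5) */
        unsigned long mask = 0xffffful;                 /* example htab_hash_mask (r27) */

        /* va = (vsid << 28) | (ea & 0x0fffffff) -- the rldicr/rldicl/or sequence */
        unsigned long va   = (vsid << 28) | (ea & 0x0ffffffful);

        /* hash = (vsid & 0x7fffffffff) ^ ((ea >> 12) & 0xffff) */
        unsigned long hash = (vsid & 0x0000007ffffffffful) ^ ((ea >> 12) & 0xfffful);

        /* primary and secondary PTE-group offsets passed to hpte_insert */
        unsigned long primary   = ( hash & mask) << 3;  /* (hash & mask) * 8 */
        unsigned long secondary = (~hash & mask) << 3;  /* (~hash & mask) * 8 */

        printf("va=%#lx primary=%#lx secondary=%#lx\n", va, primary, secondary);
        return 0;
}
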
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
new file mode 100644
index 0000000..174d145
--- /dev/null
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -0,0 +1,446 @@
+/*
+ * native hashtable management.
+ *
+ * SMP scalability work:
+ *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/spinlock.h>
+#include <linux/bitops.h>
+#include <linux/threads.h>
+#include <linux/smp.h>
+
+#include <asm/abs_addr.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/cputable.h>
+
+#define HPTE_LOCK_BIT 3
+
+static DEFINE_SPINLOCK(native_tlbie_lock);
+
+static inline void native_lock_hpte(hpte_t *hptep)
+{
+       unsigned long *word = &hptep->v;
+
+       while (1) {
+               if (!test_and_set_bit(HPTE_LOCK_BIT, word))
+                       break;
+               while(test_bit(HPTE_LOCK_BIT, word))
+                       cpu_relax();
+       }
+}
+
+static inline void native_unlock_hpte(hpte_t *hptep)
+{
+       unsigned long *word = &hptep->v;
+
+       asm volatile("lwsync":::"memory");
+       clear_bit(HPTE_LOCK_BIT, word);
+}
+
+long native_hpte_insert(unsigned long hpte_group, unsigned long va,
+                       unsigned long prpn, unsigned long vflags,
+                       unsigned long rflags)
+{
+       hpte_t *hptep = htab_address + hpte_group;
+       unsigned long hpte_v, hpte_r;
+       int i;
+
+       for (i = 0; i < HPTES_PER_GROUP; i++) {
+               if (! (hptep->v & HPTE_V_VALID)) {
+                       /* retry with lock held */
+                       native_lock_hpte(hptep);
+                       if (! (hptep->v & HPTE_V_VALID))
+                               break;
+                       native_unlock_hpte(hptep);
+               }
+
+               hptep++;
+       }
+
+       if (i == HPTES_PER_GROUP)
+               return -1;
+
+       hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID;
+       if (vflags & HPTE_V_LARGE)
+               va &= ~(1UL << HPTE_V_AVPN_SHIFT);
+       hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags;
+
+       hptep->r = hpte_r;
+       /* Guarantee the second dword is visible before the valid bit */
+       __asm__ __volatile__ ("eieio" : : : "memory");
+       /*
+        * Now set the first dword including the valid bit
+        * NOTE: this also unlocks the hpte
+        */
+       hptep->v = hpte_v;
+
+       __asm__ __volatile__ ("ptesync" : : : "memory");
+
+       return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
+}
+
+static long native_hpte_remove(unsigned long hpte_group)
+{
+       hpte_t *hptep;
+       int i;
+       int slot_offset;
+       unsigned long hpte_v;
+
+       /* pick a random entry to start at */
+       slot_offset = mftb() & 0x7;
+
+       for (i = 0; i < HPTES_PER_GROUP; i++) {
+               hptep = htab_address + hpte_group + slot_offset;
+               hpte_v = hptep->v;
+
+               if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
+                       /* retry with lock held */
+                       native_lock_hpte(hptep);
+                       hpte_v = hptep->v;
+                       if ((hpte_v & HPTE_V_VALID)
+                           && !(hpte_v & HPTE_V_BOLTED))
+                               break;
+                       native_unlock_hpte(hptep);
+               }
+
+               slot_offset++;
+               slot_offset &= 0x7;
+       }
+
+       if (i == HPTES_PER_GROUP)
+               return -1;
+
+       /* Invalidate the hpte. NOTE: this also unlocks it */
+       hptep->v = 0;
+
+       return i;
+}
+
+static inline void set_pp_bit(unsigned long pp, hpte_t *addr)
+{
+       unsigned long old;
+       unsigned long *p = &addr->r;
+
+       __asm__ __volatile__(
+       "1:     ldarx   %0,0,%3\n\
+               rldimi  %0,%2,0,61\n\
+               stdcx.  %0,0,%3\n\
+               bne     1b"
+       : "=&r" (old), "=m" (*p)
+       : "r" (pp), "r" (p), "m" (*p)
+       : "cc");
+}
+
+/*
+ * Only works on small pages. Yes, it's ugly to have to check each slot in
+ * the group, but we only use this during bootup.
+ */
+static long native_hpte_find(unsigned long vpn)
+{
+       hpte_t *hptep;
+       unsigned long hash;
+       unsigned long i, j;
+       long slot;
+       unsigned long hpte_v;
+
+       hash = hpt_hash(vpn, 0);
+
+       for (j = 0; j < 2; j++) {
+               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+               for (i = 0; i < HPTES_PER_GROUP; i++) {
+                       hptep = htab_address + slot;
+                       hpte_v = hptep->v;
+
+                       if ((HPTE_V_AVPN_VAL(hpte_v) == (vpn >> 11))
+                           && (hpte_v & HPTE_V_VALID)
+                           && ( !!(hpte_v & HPTE_V_SECONDARY) == j)) {
+                               /* HPTE matches */
+                               if (j)
+                                       slot = -slot;
+                               return slot;
+                       }
+                       ++slot;
+               }
+               hash = ~hash;
+       }
+
+       return -1;
+}
+
+static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
+                                unsigned long va, int large, int local)
+{
+       hpte_t *hptep = htab_address + slot;
+       unsigned long hpte_v;
+       unsigned long avpn = va >> 23;
+       int ret = 0;
+
+       if (large)
+               avpn &= ~1;
+
+       native_lock_hpte(hptep);
+
+       hpte_v = hptep->v;
+
+       /* Even if we miss, we need to invalidate the TLB */
+       if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
+           || !(hpte_v & HPTE_V_VALID)) {
+               native_unlock_hpte(hptep);
+               ret = -1;
+       } else {
+               set_pp_bit(newpp, hptep);
+               native_unlock_hpte(hptep);
+       }
+
+       /* Ensure it is out of the tlb too */
+       if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
+               tlbiel(va);
+       } else {
+               int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+               if (lock_tlbie)
+                       spin_lock(&native_tlbie_lock);
+               tlbie(va, large);
+               if (lock_tlbie)
+                       spin_unlock(&native_tlbie_lock);
+       }
+
+       return ret;
+}
+
+/*
+ * Update the page protection bits. Intended to be used to create
+ * guard pages for kernel data structures on pages which are bolted
+ * in the HPT. Assumes pages being operated on will not be stolen.
+ * Does not work on large pages.
+ *
+ * No need to lock here because we should be the only user.
+ */
+static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea)
+{
+       unsigned long vsid, va, vpn, flags = 0;
+       long slot;
+       hpte_t *hptep;
+       int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+       vsid = get_kernel_vsid(ea);
+       va = (vsid << 28) | (ea & 0x0fffffff);
+       vpn = va >> PAGE_SHIFT;
+
+       slot = native_hpte_find(vpn);
+       if (slot == -1)
+               panic("could not find page to bolt\n");
+       hptep = htab_address + slot;
+
+       set_pp_bit(newpp, hptep);
+
+       /* Ensure it is out of the tlb too */
+       if (lock_tlbie)
+               spin_lock_irqsave(&native_tlbie_lock, flags);
+       tlbie(va, 0);
+       if (lock_tlbie)
+               spin_unlock_irqrestore(&native_tlbie_lock, flags);
+}
+
+static void native_hpte_invalidate(unsigned long slot, unsigned long va,
+                                   int large, int local)
+{
+       hpte_t *hptep = htab_address + slot;
+       unsigned long hpte_v;
+       unsigned long avpn = va >> 23;
+       unsigned long flags;
+       int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+       if (large)
+               avpn &= ~1;
+
+       local_irq_save(flags);
+       native_lock_hpte(hptep);
+
+       hpte_v = hptep->v;
+
+       /* Even if we miss, we need to invalidate the TLB */
+       if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
+           || !(hpte_v & HPTE_V_VALID)) {
+               native_unlock_hpte(hptep);
+       } else {
+               /* Invalidate the hpte. NOTE: this also unlocks it */
+               hptep->v = 0;
+       }
+
+       /* Invalidate the tlb */
+       if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
+               tlbiel(va);
+       } else {
+               if (lock_tlbie)
+                       spin_lock(&native_tlbie_lock);
+               tlbie(va, large);
+               if (lock_tlbie)
+                       spin_unlock(&native_tlbie_lock);
+       }
+       local_irq_restore(flags);
+}
+
+/*
+ * clear all mappings on kexec.  All cpus are in real mode (or they will
+ * be when they isi), and we are the only one left.  We rely on our kernel
+ * mapping being 0xC0's and the hardware ignoring those two real bits.
+ *
+ * TODO: add batching support when enabled.  Remember, no dynamic memory here,
+ * although there is the control page available...
+ */
+static void native_hpte_clear(void)
+{
+       unsigned long slot, slots, flags;
+       hpte_t *hptep = htab_address;
+       unsigned long hpte_v;
+       unsigned long pteg_count;
+
+       pteg_count = htab_hash_mask + 1;
+
+       local_irq_save(flags);
+
+       /* we take the tlbie lock and hold it.  Some hardware will
+        * deadlock if we try to tlbie from two processors at once.
+        */
+       spin_lock(&native_tlbie_lock);
+
+       slots = pteg_count * HPTES_PER_GROUP;
+
+       for (slot = 0; slot < slots; slot++, hptep++) {
+               /*
+                * we could lock the pte here, but we are the only cpu
+                * running,  right?  and for crash dump, we probably
+                * don't want to wait for a maybe bad cpu.
+                */
+               hpte_v = hptep->v;
+
+               if (hpte_v & HPTE_V_VALID) {
+                       hptep->v = 0;
+                       tlbie(slot2va(hpte_v, slot), hpte_v & HPTE_V_LARGE);
+               }
+       }
+
+       spin_unlock(&native_tlbie_lock);
+       local_irq_restore(flags);
+}
+
+static void native_flush_hash_range(unsigned long number, int local)
+{
+       unsigned long va, vpn, hash, secondary, slot, flags, avpn;
+       int i, j;
+       hpte_t *hptep;
+       unsigned long hpte_v;
+       struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+       unsigned long large = batch->large;
+
+       local_irq_save(flags);
+
+       j = 0;
+       for (i = 0; i < number; i++) {
+               va = batch->vaddr[j];
+               if (large)
+                       vpn = va >> HPAGE_SHIFT;
+               else
+                       vpn = va >> PAGE_SHIFT;
+               hash = hpt_hash(vpn, large);
+               secondary = (pte_val(batch->pte[i]) & _PAGE_SECONDARY) >> 15;
+               if (secondary)
+                       hash = ~hash;
+               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot += (pte_val(batch->pte[i]) & _PAGE_GROUP_IX) >> 12;
+
+               hptep = htab_address + slot;
+
+               avpn = va >> 23;
+               if (large)
+                       avpn &= ~0x1UL;
+
+               native_lock_hpte(hptep);
+
+               hpte_v = hptep->v;
+
+               /* Even if we miss, we need to invalidate the TLB */
+               if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
+                   || !(hpte_v & HPTE_V_VALID)) {
+                       native_unlock_hpte(hptep);
+               } else {
+                       /* Invalidate the hpte. NOTE: this also unlocks it */
+                       hptep->v = 0;
+               }
+
+               j++;
+       }
+
+       if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
+               asm volatile("ptesync":::"memory");
+
+               for (i = 0; i < j; i++)
+                       __tlbiel(batch->vaddr[i]);
+
+               asm volatile("ptesync":::"memory");
+       } else {
+               int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+               if (lock_tlbie)
+                       spin_lock(&native_tlbie_lock);
+
+               asm volatile("ptesync":::"memory");
+
+               for (i = 0; i < j; i++)
+                       __tlbie(batch->vaddr[i], large);
+
+               asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+               if (lock_tlbie)
+                       spin_unlock(&native_tlbie_lock);
+       }
+
+       local_irq_restore(flags);
+}
+
+#ifdef CONFIG_PPC_PSERIES
+/* Disable TLB batching on nighthawk */
+static inline int tlb_batching_enabled(void)
+{
+       struct device_node *root = of_find_node_by_path("/");
+       int enabled = 1;
+
+       if (root) {
+               const char *model = get_property(root, "model", NULL);
+               if (model && !strcmp(model, "IBM,9076-N81"))
+                       enabled = 0;
+               of_node_put(root);
+       }
+
+       return enabled;
+}
+#else
+static inline int tlb_batching_enabled(void)
+{
+       return 1;
+}
+#endif
+
+void hpte_init_native(void)
+{
+       ppc_md.hpte_invalidate  = native_hpte_invalidate;
+       ppc_md.hpte_updatepp    = native_hpte_updatepp;
+       ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp;
+       ppc_md.hpte_insert      = native_hpte_insert;
+       ppc_md.hpte_remove      = native_hpte_remove;
+       ppc_md.hpte_clear_all   = native_hpte_clear;
+       if (tlb_batching_enabled())
+               ppc_md.flush_hash_range = native_flush_hash_range;
+       htab_finish_init();
+}
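
native_hpte_insert() above returns the slot index within the PTE group in
bits 0-2 and the secondary-hash flag in bit 3; hash_low_64.S then folds that
nibble into bits 12-15 of the Linux PTE (rldimi r30,r3,12,63-15), and
flush_hash_page() in hash_utils_64.c decodes it again. A minimal sketch of
that round trip (illustrative only; the two mask values are inferred from
the >> 12 and >> 15 shifts used in the patch, not taken from the headers):

#include <stdio.h>

#define PTE_GROUP_IX    (0x7ul << 12)   /* slot within group, cf. _PAGE_GROUP_IX >> 12 */
#define PTE_SECONDARY   (0x1ul << 15)   /* secondary-hash flag, cf. _PAGE_SECONDARY >> 15 */

int main(void)
{
        /* say hpte_insert placed the entry in slot 5 of the secondary group */
        unsigned long rc  = 5 | (1 << 3);
        /* what the rldimi in hash_low_64.S stores into the Linux PTE */
        unsigned long pte = rc << 12;

        unsigned long slot      = (pte & PTE_GROUP_IX) >> 12;
        unsigned long secondary = (pte & PTE_SECONDARY) >> 15;

        printf("slot=%lu secondary=%lu\n", slot, secondary);   /* slot=5 secondary=1 */
        return 0;
}
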
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
new file mode 100644
index 0000000..35dd93e
--- /dev/null
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -0,0 +1,438 @@
+/*
+ * PowerPC64 port by Mike Corrigan and Dave Engebretsen
+ *   {mikejc|engebret}@us.ibm.com
+ *
+ *    Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
+ *
+ * SMP scalability work:
+ *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ * 
+ *    Module name: htab.c
+ *
+ *    Description:
+ *      PowerPC Hashed Page Table functions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include <linux/ctype.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/signal.h>
+
+#include <asm/ppcdebug.h>
+#include <asm/processor.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/page.h>
+#include <asm/types.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/machdep.h>
+#include <asm/lmb.h>
+#include <asm/abs_addr.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/eeh.h>
+#include <asm/tlb.h>
+#include <asm/cacheflush.h>
+#include <asm/cputable.h>
+#include <asm/abs_addr.h>
+#include <asm/sections.h>
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+/*
+ * Note:  pte   --> Linux PTE
+ *        HPTE  --> PowerPC Hashed Page Table Entry
+ *
+ * Execution context:
+ *   htab_initialize is called with the MMU off (of course), but
+ *   the kernel has been copied down to zero so it can directly
+ *   reference global data.  At this point it is very difficult
+ *   to print debug info.
+ *
+ */
+
+#ifdef CONFIG_U3_DART
+extern unsigned long dart_tablebase;
+#endif /* CONFIG_U3_DART */
+
+hpte_t *htab_address;
+unsigned long htab_hash_mask;
+
+unsigned long _SDR1;
+
+#define KB (1024)
+#define MB (1024*KB)
+
+static inline void loop_forever(void)
+{
+       volatile unsigned long x = 1;
+       for(;x;x|=1)
+               ;
+}
+
+static inline void create_pte_mapping(unsigned long start, unsigned long end,
+                                     unsigned long mode, int large)
+{
+       unsigned long addr;
+       unsigned int step;
+       unsigned long tmp_mode;
+       unsigned long vflags;
+
+       if (large) {
+               step = 16*MB;
+               vflags = HPTE_V_BOLTED | HPTE_V_LARGE;
+       } else {
+               step = 4*KB;
+               vflags = HPTE_V_BOLTED;
+       }
+
+       for (addr = start; addr < end; addr += step) {
+               unsigned long vpn, hash, hpteg;
+               unsigned long vsid = get_kernel_vsid(addr);
+               unsigned long va = (vsid << 28) | (addr & 0xfffffff);
+               int ret = -1;
+
+               if (large)
+                       vpn = va >> HPAGE_SHIFT;
+               else
+                       vpn = va >> PAGE_SHIFT;
+
+
+               tmp_mode = mode;
+               
+               /* Make non-kernel text non-executable */
+               if (!in_kernel_text(addr))
+                       tmp_mode = mode | HW_NO_EXEC;
+
+               hash = hpt_hash(vpn, large);
+
+               hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+#ifdef CONFIG_PPC_ISERIES
+               if (systemcfg->platform & PLATFORM_ISERIES_LPAR)
+                       ret = iSeries_hpte_bolt_or_insert(hpteg, va,
+                               virt_to_abs(addr) >> PAGE_SHIFT,
+                               vflags, tmp_mode);
+               else
+#endif
+#ifdef CONFIG_PPC_PSERIES
+               if (systemcfg->platform & PLATFORM_LPAR)
+                       ret = pSeries_lpar_hpte_insert(hpteg, va,
+                               virt_to_abs(addr) >> PAGE_SHIFT,
+                               vflags, tmp_mode);
+               else
+#endif
+#ifdef CONFIG_PPC_MULTIPLATFORM
+                       ret = native_hpte_insert(hpteg, va,
+                               virt_to_abs(addr) >> PAGE_SHIFT,
+                               vflags, tmp_mode);
+#endif
+
+               if (ret == -1) {
+                       ppc64_terminate_msg(0x20, "create_pte_mapping");
+                       loop_forever();
+               }
+       }
+}
+
+void __init htab_initialize(void)
+{
+       unsigned long table, htab_size_bytes;
+       unsigned long pteg_count;
+       unsigned long mode_rw;
+       int i, use_largepages = 0;
+       unsigned long base = 0, size = 0;
+       extern unsigned long tce_alloc_start, tce_alloc_end;
+
+       DBG(" -> htab_initialize()\n");
+
+       /*
+        * Calculate the required size of the htab.  We want the number of
+        * PTEGs to equal one half the number of real pages.
+        */ 
+       htab_size_bytes = 1UL << ppc64_pft_size;
+       pteg_count = htab_size_bytes >> 7;
+
+       /* For debug, make the HTAB 1/8 as big as it normally would be. */
+       ifppcdebug(PPCDBG_HTABSIZE) {
+               pteg_count >>= 3;
+               htab_size_bytes = pteg_count << 7;
+       }
+
+       htab_hash_mask = pteg_count - 1;
+
+       if (systemcfg->platform & PLATFORM_LPAR) {
+               /* Using a hypervisor which owns the htab */
+               htab_address = NULL;
+               _SDR1 = 0; 
+       } else {
+               /* Find storage for the HPT.  Must be contiguous in
+                * the absolute address space.
+                */
+               table = lmb_alloc(htab_size_bytes, htab_size_bytes);
+
+               DBG("Hash table allocated at %lx, size: %lx\n", table,
+                   htab_size_bytes);
+
+               if ( !table ) {
+                       ppc64_terminate_msg(0x20, "hpt space");
+                       loop_forever();
+               }
+               htab_address = abs_to_virt(table);
+
+               /* htab absolute addr + encoded htabsize */
+               _SDR1 = table + __ilog2(pteg_count) - 11;
+
+               /* Initialize the HPT with no entries */
+               memset((void *)table, 0, htab_size_bytes);
+       }
+
+       mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
+
+       /* On U3 based machines, we need to reserve the DART area and
+        * _NOT_ map it to avoid cache paradoxes as it's remapped non
+        * cacheable later on
+        */
+       if (cpu_has_feature(CPU_FTR_16M_PAGE))
+               use_largepages = 1;
+
+       /* create the bolted linear mapping in the hash table */
+       for (i=0; i < lmb.memory.cnt; i++) {
+               base = lmb.memory.region[i].base + KERNELBASE;
+               size = lmb.memory.region[i].size;
+
+               DBG("creating mapping for region: %lx : %lx\n", base, size);
+
+#ifdef CONFIG_U3_DART
+               /* Do not map the DART space. Fortunately, it will be aligned
+                * in such a way that it will not cross two lmb regions and will
+                * fit within a single 16Mb page.
+                * The DART space is assumed to be a full 16Mb region even if we
+                * only use 2Mb of that space. We will use more of it later for
+                * AGP GART. We have to use a full 16Mb large page.
+                */
+               DBG("DART base: %lx\n", dart_tablebase);
+
+               if (dart_tablebase != 0 && dart_tablebase >= base
+                   && dart_tablebase < (base + size)) {
+                       if (base != dart_tablebase)
+                               create_pte_mapping(base, dart_tablebase, mode_rw,
+                                                  use_largepages);
+                       if ((base + size) > (dart_tablebase + 16*MB))
+                               create_pte_mapping(dart_tablebase + 16*MB, base + size,
+                                                  mode_rw, use_largepages);
+                       continue;
+               }
+#endif /* CONFIG_U3_DART */
+               create_pte_mapping(base, base + size, mode_rw, use_largepages);
+       }
+
+       /*
+        * If we have a memory_limit and we've allocated TCEs then we need to
+        * explicitly map the TCE area at the top of RAM. We also cope with the
+        * case that the TCEs start below memory_limit.
+        * tce_alloc_start/end are 16MB aligned so the mapping should work
+        * for either 4K or 16MB pages.
+        */
+       if (tce_alloc_start) {
+               tce_alloc_start += KERNELBASE;
+               tce_alloc_end += KERNELBASE;
+
+               if (base + size >= tce_alloc_start)
+                       tce_alloc_start = base + size + 1;
+
+               create_pte_mapping(tce_alloc_start, tce_alloc_end,
+                       mode_rw, use_largepages);
+       }
+
+       DBG(" <- htab_initialize()\n");
+}
+#undef KB
+#undef MB
+
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
+{
+       struct page *page;
+
+       if (!pfn_valid(pte_pfn(pte)))
+               return pp;
+
+       page = pte_page(pte);
+
+       /* page is dirty */
+       if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+               if (trap == 0x400) {
+                       __flush_dcache_icache(page_address(page));
+                       set_bit(PG_arch_1, &page->flags);
+               } else
+                       pp |= HW_NO_EXEC;
+       }
+       return pp;
+}
+
+/* Result code is:
+ *  0 - handled
+ *  1 - normal page fault
+ * -1 - critical hash insertion error
+ */
+int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
+{
+       void *pgdir;
+       unsigned long vsid;
+       struct mm_struct *mm;
+       pte_t *ptep;
+       int ret;
+       int user_region = 0;
+       int local = 0;
+       cpumask_t tmp;
+
+       if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
+               return 1;
+
+       switch (REGION_ID(ea)) {
+       case USER_REGION_ID:
+               user_region = 1;
+               mm = current->mm;
+               if (! mm)
+                       return 1;
+
+               vsid = get_vsid(mm->context.id, ea);
+               break;
+       case VMALLOC_REGION_ID:
+               mm = &init_mm;
+               vsid = get_kernel_vsid(ea);
+               break;
+#if 0
+       case KERNEL_REGION_ID:
+               /*
+                * Should never get here - entire 0xC0... region is bolted.
+                * Send the problem up to do_page_fault 
+                */
+#endif
+       default:
+               /* Not a valid range
+                * Send the problem up to do_page_fault 
+                */
+               return 1;
+               break;
+       }
+
+       pgdir = mm->pgd;
+
+       if (pgdir == NULL)
+               return 1;
+
+       tmp = cpumask_of_cpu(smp_processor_id());
+       if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
+               local = 1;
+
+       /* Is this a huge page ? */
+       if (unlikely(in_hugepage_area(mm->context, ea)))
+               ret = hash_huge_page(mm, access, ea, vsid, local);
+       else {
+               ptep = find_linux_pte(pgdir, ea);
+               if (ptep == NULL)
+                       return 1;
+               ret = __hash_page(ea, access, vsid, ptep, trap, local);
+       }
+
+       return ret;
+}
+
+void flush_hash_page(unsigned long va, pte_t pte, int local)
+{
+       unsigned long vpn, hash, secondary, slot;
+       unsigned long huge = pte_huge(pte);
+
+       if (huge)
+               vpn = va >> HPAGE_SHIFT;
+       else
+               vpn = va >> PAGE_SHIFT;
+       hash = hpt_hash(vpn, huge);
+       secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
+       if (secondary)
+               hash = ~hash;
+       slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+       slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12;
+
+       ppc_md.hpte_invalidate(slot, va, huge, local);
+}
+
+void flush_hash_range(unsigned long number, int local)
+{
+       if (ppc_md.flush_hash_range) {
+               ppc_md.flush_hash_range(number, local);
+       } else {
+               int i;
+               struct ppc64_tlb_batch *batch =
+                       &__get_cpu_var(ppc64_tlb_batch);
+
+               for (i = 0; i < number; i++)
+                       flush_hash_page(batch->vaddr[i], batch->pte[i], local);
+       }
+}
+
+static inline void make_bl(unsigned int *insn_addr, void *func)
+{
+       unsigned long funcp = *((unsigned long *)func);
+       int offset = funcp - (unsigned long)insn_addr;
+
+       *insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc));
+       flush_icache_range((unsigned long)insn_addr, 4+
+                          (unsigned long)insn_addr);
+}
+
+/*
+ * low_hash_fault is called when the low-level hash code failed
+ * to insert a PTE due to a hypervisor error
+ */
+void low_hash_fault(struct pt_regs *regs, unsigned long address)
+{
+       if (user_mode(regs)) {
+               siginfo_t info;
+
+               info.si_signo = SIGBUS;
+               info.si_errno = 0;
+               info.si_code = BUS_ADRERR;
+               info.si_addr = (void __user *)address;
+               force_sig_info(SIGBUS, &info, current);
+               return;
+       }
+       bad_page_fault(regs, address, SIGBUS);
+}
+
+void __init htab_finish_init(void)
+{
+       extern unsigned int *htab_call_hpte_insert1;
+       extern unsigned int *htab_call_hpte_insert2;
+       extern unsigned int *htab_call_hpte_remove;
+       extern unsigned int *htab_call_hpte_updatepp;
+
+       make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
+       make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
+       make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
+       make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp);
+}
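
make_bl() above patches the four htab_call_hpte_* call sites with a relative
"bl": the 0x48000001 branch-and-link template with the byte offset to the
target masked into the displacement field. A stand-alone sketch of just the
encoding, using made-up addresses (illustrative only; the real routine first
loads the target address through a ppc64 function descriptor):

#include <stdint.h>
#include <stdio.h>

static uint32_t encode_bl(unsigned long insn_addr, unsigned long target)
{
        long offset = (long)(target - insn_addr);       /* byte displacement */
        return 0x48000001u | ((uint32_t)offset & 0x03fffffcu);
}

int main(void)
{
        /* a call site at 0x100000 branching to a handler 0x1000 bytes later */
        printf("%#x\n", (unsigned)encode_bl(0x100000, 0x101000));      /* prints 0x48001001 */
        return 0;
}
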
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
new file mode 100644
index 0000000..0ea0994
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -0,0 +1,745 @@
+/*
+ * PPC64 (POWER4) Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/sysctl.h>
+#include <asm/mman.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/tlb.h>
+
+#include <linux/sysctl.h>
+
+#define NUM_LOW_AREAS  (0x100000000UL >> SID_SHIFT)
+#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
+
+/* Modelled after find_linux_pte() */
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+       pgd_t *pg;
+       pud_t *pu;
+       pmd_t *pm;
+       pte_t *pt;
+
+       BUG_ON(! in_hugepage_area(mm->context, addr));
+
+       addr &= HPAGE_MASK;
+
+       pg = pgd_offset(mm, addr);
+       if (!pgd_none(*pg)) {
+               pu = pud_offset(pg, addr);
+               if (!pud_none(*pu)) {
+                       pm = pmd_offset(pu, addr);
+                       pt = (pte_t *)pm;
+                       BUG_ON(!pmd_none(*pm)
+                              && !(pte_present(*pt) && pte_huge(*pt)));
+                       return pt;
+               }
+       }
+
+       return NULL;
+}
+
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+{
+       pgd_t *pg;
+       pud_t *pu;
+       pmd_t *pm;
+       pte_t *pt;
+
+       BUG_ON(! in_hugepage_area(mm->context, addr));
+
+       addr &= HPAGE_MASK;
+
+       pg = pgd_offset(mm, addr);
+       pu = pud_alloc(mm, pg, addr);
+
+       if (pu) {
+               pm = pmd_alloc(mm, pu, addr);
+               if (pm) {
+                       pt = (pte_t *)pm;
+                       BUG_ON(!pmd_none(*pm)
+                              && !(pte_present(*pt) && pte_huge(*pt)));
+                       return pt;
+               }
+       }
+
+       return NULL;
+}
+
+#define HUGEPTE_BATCH_SIZE     (HPAGE_SIZE / PMD_SIZE)
+
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+                    pte_t *ptep, pte_t pte)
+{
+       int i;
+
+       if (pte_present(*ptep)) {
+               pte_clear(mm, addr, ptep);
+               flush_tlb_pending();
+       }
+
+       for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
+               *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+               ptep++;
+       }
+}
+
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+                             pte_t *ptep)
+{
+       unsigned long old = pte_update(ptep, ~0UL);
+       int i;
+
+       if (old & _PAGE_HASHPTE)
+               hpte_update(mm, addr, old, 0);
+
+       for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
+               ptep[i] = __pte(0);
+
+       return __pte(old);
+}
+
+/*
+ * This function checks for proper alignment of input addr and len parameters.
+ */
+int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
+{
+       if (len & ~HPAGE_MASK)
+               return -EINVAL;
+       if (addr & ~HPAGE_MASK)
+               return -EINVAL;
+       if (! (within_hugepage_low_range(addr, len)
+              || within_hugepage_high_range(addr, len)) )
+               return -EINVAL;
+       return 0;
+}
+
+static void flush_low_segments(void *parm)
+{
+       u16 areas = (unsigned long) parm;
+       unsigned long i;
+
+       asm volatile("isync" : : : "memory");
+
+       BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
+
+       for (i = 0; i < NUM_LOW_AREAS; i++) {
+               if (! (areas & (1U << i)))
+                       continue;
+               asm volatile("slbie %0"
+                            : : "r" ((i << SID_SHIFT) | SLBIE_C));
+       }
+
+       asm volatile("isync" : : : "memory");
+}
+
+static void flush_high_segments(void *parm)
+{
+       u16 areas = (unsigned long) parm;
+       unsigned long i, j;
+
+       asm volatile("isync" : : : "memory");
+
+       BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
+
+       for (i = 0; i < NUM_HIGH_AREAS; i++) {
+               if (! (areas & (1U << i)))
+                       continue;
+               for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
+                       asm volatile("slbie %0"
+                                    :: "r" (((i << HTLB_AREA_SHIFT)
+                                            + (j << SID_SHIFT)) | SLBIE_C));
+       }
+
+       asm volatile("isync" : : : "memory");
+}
+
+static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
+{
+       unsigned long start = area << SID_SHIFT;
+       unsigned long end = (area+1) << SID_SHIFT;
+       struct vm_area_struct *vma;
+
+       BUG_ON(area >= NUM_LOW_AREAS);
+
+       /* Check no VMAs are in the region */
+       vma = find_vma(mm, start);
+       if (vma && (vma->vm_start < end))
+               return -EBUSY;
+
+       return 0;
+}
+
+static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
+{
+       unsigned long start = area << HTLB_AREA_SHIFT;
+       unsigned long end = (area+1) << HTLB_AREA_SHIFT;
+       struct vm_area_struct *vma;
+
+       BUG_ON(area >= NUM_HIGH_AREAS);
+
+       /* Check no VMAs are in the region */
+       vma = find_vma(mm, start);
+       if (vma && (vma->vm_start < end))
+               return -EBUSY;
+
+       return 0;
+}
+
+static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
+{
+       unsigned long i;
+
+       BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
+       BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
+
+       newareas &= ~(mm->context.low_htlb_areas);
+       if (! newareas)
+               return 0; /* The segments we want are already open */
+
+       for (i = 0; i < NUM_LOW_AREAS; i++)
+               if ((1 << i) & newareas)
+                       if (prepare_low_area_for_htlb(mm, i) != 0)
+                               return -EBUSY;
+
+       mm->context.low_htlb_areas |= newareas;
+
+       /* update the paca copy of the context struct */
+       get_paca()->context = mm->context;
+
+       /* the context change must make it to memory before the flush,
+        * so that further SLB misses do the right thing. */
+       mb();
+       on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
+
+       return 0;
+}
+
+static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
+{
+       unsigned long i;
+
+       BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
+       BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
+                    != NUM_HIGH_AREAS);
+
+       newareas &= ~(mm->context.high_htlb_areas);
+       if (! newareas)
+               return 0; /* The areas we want are already open */
+
+       for (i = 0; i < NUM_HIGH_AREAS; i++)
+               if ((1 << i) & newareas)
+                       if (prepare_high_area_for_htlb(mm, i) != 0)
+                               return -EBUSY;
+
+       mm->context.high_htlb_areas |= newareas;
+
+       /* update the paca copy of the context struct */
+       get_paca()->context = mm->context;
+
+       /* the context change must make it to memory before the flush,
+        * so that further SLB misses do the right thing. */
+       mb();
+       on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
+
+       return 0;
+}
+
+int prepare_hugepage_range(unsigned long addr, unsigned long len)
+{
+       int err;
+
+       if ( (addr+len) < addr )
+               return -EINVAL;
+
+       if ((addr + len) < 0x100000000UL)
+               err = open_low_hpage_areas(current->mm,
+                                         LOW_ESID_MASK(addr, len));
+       else
+               err = open_high_hpage_areas(current->mm,
+                                           HTLB_AREA_MASK(addr, len));
+       if (err) {
+               printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
+                      " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
+                      addr, len,
+                      LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
+               return err;
+       }
+
+       return 0;
+}
+
+struct page *
+follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
+{
+       pte_t *ptep;
+       struct page *page;
+
+       if (! in_hugepage_area(mm->context, address))
+               return ERR_PTR(-EINVAL);
+
+       ptep = huge_pte_offset(mm, address);
+       page = pte_page(*ptep);
+       if (page)
+               page += (address % HPAGE_SIZE) / PAGE_SIZE;
+
+       return page;
+}
+
+int pmd_huge(pmd_t pmd)
+{
+       return 0;
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+               pmd_t *pmd, int write)
+{
+       BUG();
+       return NULL;
+}
+
+/* Because we have an exclusive hugepage region which lies within the
+ * normal user address space, we have to take special measures to make
+ * non-huge mmap()s evade the hugepage reserved regions. */
+unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
+                                    unsigned long len, unsigned long pgoff,
+                                    unsigned long flags)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       unsigned long start_addr;
+
+       if (len > TASK_SIZE)
+               return -ENOMEM;
+
+       if (addr) {
+               addr = PAGE_ALIGN(addr);
+               vma = find_vma(mm, addr);
+               if (((TASK_SIZE - len) >= addr)
+                   && (!vma || (addr+len) <= vma->vm_start)
+                   && !is_hugepage_only_range(mm, addr,len))
+                       return addr;
+       }
+       if (len > mm->cached_hole_size) {
+               start_addr = addr = mm->free_area_cache;
+       } else {
+               start_addr = addr = TASK_UNMAPPED_BASE;
+               mm->cached_hole_size = 0;
+       }
+
+full_search:
+       vma = find_vma(mm, addr);
+       while (TASK_SIZE - len >= addr) {
+               BUG_ON(vma && (addr >= vma->vm_end));
+
+               if (touches_hugepage_low_range(mm, addr, len)) {
+                       addr = ALIGN(addr+1, 1<<SID_SHIFT);
+                       vma = find_vma(mm, addr);
+                       continue;
+               }
+               if (touches_hugepage_high_range(mm, addr, len)) {
+                       addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
+                       vma = find_vma(mm, addr);
+                       continue;
+               }
+               if (!vma || addr + len <= vma->vm_start) {
+                       /*
+                        * Remember the place where we stopped the search:
+                        */
+                       mm->free_area_cache = addr + len;
+                       return addr;
+               }
+               if (addr + mm->cached_hole_size < vma->vm_start)
+                       mm->cached_hole_size = vma->vm_start - addr;
+               addr = vma->vm_end;
+               vma = vma->vm_next;
+       }
+
+       /* Make sure we didn't miss any holes */
+       if (start_addr != TASK_UNMAPPED_BASE) {
+               start_addr = addr = TASK_UNMAPPED_BASE;
+               mm->cached_hole_size = 0;
+               goto full_search;
+       }
+       return -ENOMEM;
+}
+
+/*
+ * This mmap-allocator allocates new areas top-down from below the
+ * stack's low limit (the base):
+ *
+ * Because we have an exclusive hugepage region which lies within the
+ * normal user address space, we have to take special measures to make
+ * non-huge mmap()s evade the hugepage reserved regions.
+ */
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+                         const unsigned long len, const unsigned long pgoff,
+                         const unsigned long flags)
+{
+       struct vm_area_struct *vma, *prev_vma;
+       struct mm_struct *mm = current->mm;
+       unsigned long base = mm->mmap_base, addr = addr0;
+       unsigned long largest_hole = mm->cached_hole_size;
+       int first_time = 1;
+
+       /* requested length too big for entire address space */
+       if (len > TASK_SIZE)
+               return -ENOMEM;
+
+       /* dont allow allocations above current base */
+       if (mm->free_area_cache > base)
+               mm->free_area_cache = base;
+
+       /* requesting a specific address */
+       if (addr) {
+               addr = PAGE_ALIGN(addr);
+               vma = find_vma(mm, addr);
+               if (TASK_SIZE - len >= addr &&
+                               (!vma || addr + len <= vma->vm_start)
+                               && !is_hugepage_only_range(mm, addr,len))
+                       return addr;
+       }
+
+       if (len <= largest_hole) {
+               largest_hole = 0;
+               mm->free_area_cache = base;
+       }
+try_again:
+       /* make sure it can fit in the remaining address space */
+       if (mm->free_area_cache < len)
+               goto fail;
+
+       /* either no address requested or cant fit in requested address hole */
+       addr = (mm->free_area_cache - len) & PAGE_MASK;
+       do {
+hugepage_recheck:
+               if (touches_hugepage_low_range(mm, addr, len)) {
+                       addr = (addr & ((~0) << SID_SHIFT)) - len;
+                       goto hugepage_recheck;
+               } else if (touches_hugepage_high_range(mm, addr, len)) {
+                       addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
+                       goto hugepage_recheck;
+               }
+
+               /*
+                * Lookup failure means no vma is above this address,
+                * i.e. return with success:
+                */
+               if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
+                       return addr;
+
+               /*
+                * new region fits between prev_vma->vm_end and
+                * vma->vm_start, use it:
+                */
+               if (addr+len <= vma->vm_start &&
+                         (!prev_vma || (addr >= prev_vma->vm_end))) {
+                       /* remember the address as a hint for next time */
+                       mm->cached_hole_size = largest_hole;
+                       return (mm->free_area_cache = addr);
+               } else {
+                       /* pull free_area_cache down to the first hole */
+                       if (mm->free_area_cache == vma->vm_end) {
+                               mm->free_area_cache = vma->vm_start;
+                               mm->cached_hole_size = largest_hole;
+                       }
+               }
+
+               /* remember the largest hole we saw so far */
+               if (addr + largest_hole < vma->vm_start)
+                       largest_hole = vma->vm_start - addr;
+
+               /* try just below the current vma->vm_start */
+               addr = vma->vm_start-len;
+       } while (len <= vma->vm_start);
+
+fail:
+       /*
+        * if hint left us with no space for the requested
+        * mapping then try again:
+        */
+       if (first_time) {
+               mm->free_area_cache = base;
+               largest_hole = 0;
+               first_time = 0;
+               goto try_again;
+       }
+       /*
+        * A failed mmap() very likely causes application failure,
+        * so fall back to the bottom-up function here. This scenario
+        * can happen with large stack limits and large mmap()
+        * allocations.
+        */
+       mm->free_area_cache = TASK_UNMAPPED_BASE;
+       mm->cached_hole_size = ~0UL;
+       addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+       /*
+        * Restore the topdown base:
+        */
+       mm->free_area_cache = base;
+       mm->cached_hole_size = ~0UL;
+
+       return addr;
+}
+
+static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
+{
+       unsigned long addr = 0;
+       struct vm_area_struct *vma;
+
+       vma = find_vma(current->mm, addr);
+       while (addr + len <= 0x100000000UL) {
+               BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
+
+               if (! __within_hugepage_low_range(addr, len, segmask)) {
+                       addr = ALIGN(addr+1, 1<<SID_SHIFT);
+                       vma = find_vma(current->mm, addr);
+                       continue;
+               }
+
+               if (!vma || (addr + len) <= vma->vm_start)
+                       return addr;
+               addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+               /* Depending on segmask this might not be a confirmed
+                * hugepage region, so the ALIGN could have skipped
+                * some VMAs */
+               vma = find_vma(current->mm, addr);
+       }
+
+       return -ENOMEM;
+}
+
+static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
+{
+       unsigned long addr = 0x100000000UL;
+       struct vm_area_struct *vma;
+
+       vma = find_vma(current->mm, addr);
+       while (addr + len <= TASK_SIZE_USER64) {
+               BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
+
+               if (! __within_hugepage_high_range(addr, len, areamask)) {
+                       addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
+                       vma = find_vma(current->mm, addr);
+                       continue;
+               }
+
+               if (!vma || (addr + len) <= vma->vm_start)
+                       return addr;
+               addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+               /* Depending on areamask this might not be a confirmed
+                * hugepage region, so the ALIGN could have skipped
+                * some VMAs */
+               vma = find_vma(current->mm, addr);
+       }
+
+       return -ENOMEM;
+}
+
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+                                       unsigned long len, unsigned long pgoff,
+                                       unsigned long flags)
+{
+       int lastshift;
+       u16 areamask, curareas;
+
+       if (len & ~HPAGE_MASK)
+               return -EINVAL;
+
+       if (!cpu_has_feature(CPU_FTR_16M_PAGE))
+               return -EINVAL;
+
+       if (test_thread_flag(TIF_32BIT)) {
+               curareas = current->mm->context.low_htlb_areas;
+
+               /* First see if we can do the mapping in the existing
+                * low areas */
+               addr = htlb_get_low_area(len, curareas);
+               if (addr != -ENOMEM)
+                       return addr;
+
+               lastshift = 0;
+               for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
+                    ! lastshift; areamask >>=1) {
+                       if (areamask & 1)
+                               lastshift = 1;
+
+                       addr = htlb_get_low_area(len, curareas | areamask);
+                       if ((addr != -ENOMEM)
+                           && open_low_hpage_areas(current->mm, areamask) == 0)
+                               return addr;
+               }
+       } else {
+               curareas = current->mm->context.high_htlb_areas;
+
+               /* First see if we can do the mapping in the existing
+                * high areas */
+               addr = htlb_get_high_area(len, curareas);
+               if (addr != -ENOMEM)
+                       return addr;
+
+               lastshift = 0;
+               for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
+                    ! lastshift; areamask >>=1) {
+                       if (areamask & 1)
+                               lastshift = 1;
+
+                       addr = htlb_get_high_area(len, curareas | areamask);
+                       if ((addr != -ENOMEM)
+                           && open_high_hpage_areas(current->mm, areamask) == 0)
+                               return addr;
+               }
+       }
+       printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
+              " enough areas\n");
+       return -ENOMEM;
+}
+
+int hash_huge_page(struct mm_struct *mm, unsigned long access,
+                  unsigned long ea, unsigned long vsid, int local)
+{
+       pte_t *ptep;
+       unsigned long va, vpn;
+       pte_t old_pte, new_pte;
+       unsigned long rflags, prpn;
+       long slot;
+       int err = 1;
+
+       spin_lock(&mm->page_table_lock);
+
+       ptep = huge_pte_offset(mm, ea);
+
+       /* Search the Linux page table for a match with va */
+       va = (vsid << 28) | (ea & 0x0fffffff);
+       vpn = va >> HPAGE_SHIFT;
+
+       /*
+        * If no pte found or not present, send the problem up to
+        * do_page_fault
+        */
+       if (unlikely(!ptep || pte_none(*ptep)))
+               goto out;
+
+/*     BUG_ON(pte_bad(*ptep)); */
+
+       /* 
+        * Check the user's access rights to the page.  If access should be
+        * prevented then send the problem up to do_page_fault.
+        */
+       if (unlikely(access & ~pte_val(*ptep)))
+               goto out;
+       /*
+        * At this point, we have a pte (old_pte) which can be used to build
+        * or update an HPTE. There are 2 cases:
+        *
+        * 1. There is a valid (present) pte with no associated HPTE (this is 
+        *      the most common case)
+        * 2. There is a valid (present) pte with an associated HPTE. The
+        *      current values of the pp bits in the HPTE prevent access
+        *      because we are doing software DIRTY bit management and the
+        *      page is currently not DIRTY. 
+        */
+
+
+       old_pte = *ptep;
+       new_pte = old_pte;
+
+       rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
+       /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
+       rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
+
+       /* Check if pte already has an hpte (case 2) */
+       if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
+               /* There MIGHT be an HPTE for this pte */
+               unsigned long hash, slot;
+
+               hash = hpt_hash(vpn, 1);
+               if (pte_val(old_pte) & _PAGE_SECONDARY)
+                       hash = ~hash;
+               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
+
+               if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
+                       pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
+       }
+
+       if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
+               unsigned long hash = hpt_hash(vpn, 1);
+               unsigned long hpte_group;
+
+               prpn = pte_pfn(old_pte);
+
+repeat:
+               hpte_group = ((hash & htab_hash_mask) *
+                             HPTES_PER_GROUP) & ~0x7UL;
+
+               /* Update the linux pte with the HPTE slot */
+               pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
+               pte_val(new_pte) |= _PAGE_HASHPTE;
+
+               /* Add in WIMG bits */
+               /* XXX We should store these in the pte */
+               rflags |= _PAGE_COHERENT;
+
+               slot = ppc_md.hpte_insert(hpte_group, va, prpn,
+                                         HPTE_V_LARGE, rflags);
+
+               /* Primary is full, try the secondary */
+               if (unlikely(slot == -1)) {
+                       pte_val(new_pte) |= _PAGE_SECONDARY;
+                       hpte_group = ((~hash & htab_hash_mask) *
+                                     HPTES_PER_GROUP) & ~0x7UL; 
+                       slot = ppc_md.hpte_insert(hpte_group, va, prpn,
+                                                 HPTE_V_LARGE |
+                                                 HPTE_V_SECONDARY,
+                                                 rflags);
+                       if (slot == -1) {
+                               if (mftb() & 0x1)
+                                       hpte_group = ((hash & htab_hash_mask) *
+                                                     HPTES_PER_GROUP)&~0x7UL;
+
+                               ppc_md.hpte_remove(hpte_group);
+                               goto repeat;
+                        }
+               }
+
+               if (unlikely(slot == -2))
+                       panic("hash_huge_page: pte_insert failed\n");
+
+               pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
+
+               /* 
+                * No need to use ldarx/stdcx here because all who
+                * might be updating the pte will hold the
+                * page_table_lock
+                */
+               *ptep = new_pte;
+       }
+
+       err = 0;
+
+ out:
+       spin_unlock(&mm->page_table_lock);
+
+       return err;
+}
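
The HPTE slot selection in hash_huge_page() above follows the usual hashed page table scheme: the hashed VPN picks a primary group of eight slots, and the complement of the hash picks the secondary group tried when the primary is full. Below is a minimal standalone sketch of that arithmetic only; toy_hpt_hash(), HTAB_HASH_MASK and the sample VPN are assumed stand-ins for the real hpt_hash(), hash mask and virtual page number, not the kernel's implementation.

#include <stdio.h>

#define HPTES_PER_GROUP 8
#define HTAB_HASH_MASK  0xfffffUL               /* assumed mask, example only */

static unsigned long toy_hpt_hash(unsigned long vpn)
{
        return vpn ^ (vpn >> 5);                /* stand-in for hpt_hash(vpn, 1) */
}

int main(void)
{
        unsigned long vpn = 0x12345UL;
        unsigned long hash = toy_hpt_hash(vpn);

        /* primary group: masked hash selects a group of 8 HPTE slots */
        unsigned long primary = (hash & HTAB_HASH_MASK) * HPTES_PER_GROUP;
        /* secondary group: same calculation on the complemented hash */
        unsigned long secondary = (~hash & HTAB_HASH_MASK) * HPTES_PER_GROUP;

        printf("primary group %#lx, secondary group %#lx\n", primary, secondary);
        return 0;
}
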
diff --git a/arch/powerpc/mm/imalloc.c b/arch/powerpc/mm/imalloc.c
new file mode 100644 (file)
index 0000000..c65b87b
--- /dev/null
@@ -0,0 +1,317 @@
+/*
+ * c 2001 PPC 64 Team, IBM Corp
+ * 
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/semaphore.h>
+#include <asm/imalloc.h>
+#include <asm/cacheflush.h>
+
+static DECLARE_MUTEX(imlist_sem);
+struct vm_struct * imlist = NULL;
+
+static int get_free_im_addr(unsigned long size, unsigned long *im_addr)
+{
+       unsigned long addr;
+       struct vm_struct **p, *tmp;
+
+       addr = ioremap_bot;
+       for (p = &imlist; (tmp = *p) ; p = &tmp->next) {
+               if (size + addr < (unsigned long) tmp->addr)
+                       break;
+               if ((unsigned long)tmp->addr >= ioremap_bot)
+                       addr = tmp->size + (unsigned long) tmp->addr;
+               if (addr >= IMALLOC_END-size)
+                       return 1;
+       }
+       *im_addr = addr;
+
+       return 0;
+}
+
+/* Return whether the region described by v_addr and size is a subset
+ * of the region described by parent
+ */
+static inline int im_region_is_subset(unsigned long v_addr, unsigned long size,
+                       struct vm_struct *parent)
+{
+       return (int) (v_addr >= (unsigned long) parent->addr &&
+                     v_addr < (unsigned long) parent->addr + parent->size &&
+                     size < parent->size);
+}
+
+/* Return whether the region described by v_addr and size is a superset
+ * of the region described by child
+ */
+static int im_region_is_superset(unsigned long v_addr, unsigned long size,
+               struct vm_struct *child)
+{
+       struct vm_struct parent;
+
+       parent.addr = (void *) v_addr;
+       parent.size = size;
+
+       return im_region_is_subset((unsigned long) child->addr, child->size,
+                       &parent);
+}
+
+/* Return whether the region described by v_addr and size overlaps
+ * the region described by vm.  Overlapping regions meet the
+ * following conditions:
+ * 1) The regions share some part of the address space
+ * 2) The regions aren't identical
+ * 3) Neither region is a subset of the other
+ */
+static int im_region_overlaps(unsigned long v_addr, unsigned long size,
+                    struct vm_struct *vm)
+{
+       if (im_region_is_superset(v_addr, size, vm))
+               return 0;
+
+       return (v_addr + size > (unsigned long) vm->addr + vm->size &&
+               v_addr < (unsigned long) vm->addr + vm->size) ||
+              (v_addr < (unsigned long) vm->addr &&
+               v_addr + size > (unsigned long) vm->addr);
+}
+
+/* Determine imalloc status of region described by v_addr and size.
+ * Can return one of the following:
+ * IM_REGION_UNUSED   -  Entire region is unallocated in imalloc space.
+ * IM_REGION_SUBSET -    Region is a subset of a region that is already
+ *                      allocated in imalloc space.
+ *                      vm will be assigned to a ptr to the parent region.
+ * IM_REGION_EXISTS -    Exact region already allocated in imalloc space.
+ *                       vm will be assigned to a ptr to the existing imlist
+ *                       member.
+ * IM_REGION_OVERLAP -   Region overlaps an allocated region in imalloc space.
+ * IM_REGION_SUPERSET -  Region is a superset of a region that is already
+ *                       allocated in imalloc space.
+ */
+static int im_region_status(unsigned long v_addr, unsigned long size,
+                   struct vm_struct **vm)
+{
+       struct vm_struct *tmp;
+
+       for (tmp = imlist; tmp; tmp = tmp->next)
+               if (v_addr < (unsigned long) tmp->addr + tmp->size)
+                       break;
+
+       if (tmp) {
+               if (im_region_overlaps(v_addr, size, tmp))
+                       return IM_REGION_OVERLAP;
+
+               *vm = tmp;
+               if (im_region_is_subset(v_addr, size, tmp)) {
+                       /* Return with tmp pointing to superset */
+                       return IM_REGION_SUBSET;
+               }
+               if (im_region_is_superset(v_addr, size, tmp)) {
+                       /* Return with tmp pointing to first subset */
+                       return IM_REGION_SUPERSET;
+               }
+               else if (v_addr == (unsigned long) tmp->addr &&
+                        size == tmp->size) {
+                       /* Return with tmp pointing to exact region */
+                       return IM_REGION_EXISTS;
+               }
+       }
+
+       *vm = NULL;
+       return IM_REGION_UNUSED;
+}
+
+static struct vm_struct * split_im_region(unsigned long v_addr, 
+               unsigned long size, struct vm_struct *parent)
+{
+       struct vm_struct *vm1 = NULL;
+       struct vm_struct *vm2 = NULL;
+       struct vm_struct *new_vm = NULL;
+       
+       vm1 = (struct vm_struct *) kmalloc(sizeof(*vm1), GFP_KERNEL);
+       if (vm1 == NULL) {
+               printk(KERN_ERR "%s() out of memory\n", __FUNCTION__);
+               return NULL;
+       }
+
+       if (v_addr == (unsigned long) parent->addr) {
+               /* Use existing parent vm_struct to represent child, allocate
+                * new one for the remainder of parent range
+                */
+               vm1->size = parent->size - size;
+               vm1->addr = (void *) (v_addr + size);
+               vm1->next = parent->next;
+
+               parent->size = size;
+               parent->next = vm1;
+               new_vm = parent;
+       } else if (v_addr + size == (unsigned long) parent->addr + 
+                       parent->size) {
+               /* Allocate new vm_struct to represent child, use existing
+                * parent one for remainder of parent range
+                */
+               vm1->size = size;
+               vm1->addr = (void *) v_addr;
+               vm1->next = parent->next;
+               new_vm = vm1;
+
+               parent->size -= size;
+               parent->next = vm1;
+       } else {
+               /* Allocate two new vm_structs for the new child and 
+                * uppermost remainder, and use existing parent one for the
+                * lower remainder of parent range
+                */
+               vm2 = (struct vm_struct *) kmalloc(sizeof(*vm2), GFP_KERNEL);
+               if (vm2 == NULL) {
+                       printk(KERN_ERR "%s() out of memory\n", __FUNCTION__);
+                       kfree(vm1);
+                       return NULL;
+               }
+
+               vm1->size = size;
+               vm1->addr = (void *) v_addr;
+               vm1->next = vm2;
+               new_vm = vm1;
+
+               vm2->size = ((unsigned long) parent->addr + parent->size) - 
+                               (v_addr + size);
+               vm2->addr = (void *) v_addr + size;
+               vm2->next = parent->next;
+
+               parent->size = v_addr - (unsigned long) parent->addr;
+               parent->next = vm1;
+       }
+
+       return new_vm;
+}
+
+static struct vm_struct * __add_new_im_area(unsigned long req_addr, 
+                                           unsigned long size)
+{
+       struct vm_struct **p, *tmp, *area;
+               
+       for (p = &imlist; (tmp = *p) ; p = &tmp->next) {
+               if (req_addr + size <= (unsigned long)tmp->addr)
+                       break;
+       }
+       
+       area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
+       if (!area)
+               return NULL;
+       area->flags = 0;
+       area->addr = (void *)req_addr;
+       area->size = size;
+       area->next = *p;
+       *p = area;
+
+       return area;
+}
+
+static struct vm_struct * __im_get_area(unsigned long req_addr, 
+                                       unsigned long size,
+                                       int criteria)
+{
+       struct vm_struct *tmp;
+       int status;
+
+       status = im_region_status(req_addr, size, &tmp);
+       if ((criteria & status) == 0) {
+               return NULL;
+       }
+       
+       switch (status) {
+       case IM_REGION_UNUSED:
+               tmp = __add_new_im_area(req_addr, size);
+               break;
+       case IM_REGION_SUBSET:
+               tmp = split_im_region(req_addr, size, tmp);
+               break;
+       case IM_REGION_EXISTS:
+               /* Return requested region */
+               break;
+       case IM_REGION_SUPERSET:
+               /* Return first existing subset of requested region */
+               break;
+       default:
+               printk(KERN_ERR "%s() unexpected imalloc region status\n",
+                               __FUNCTION__);
+               tmp = NULL;
+       }
+
+       return tmp;
+}
+
+struct vm_struct * im_get_free_area(unsigned long size)
+{
+       struct vm_struct *area;
+       unsigned long addr;
+       
+       down(&imlist_sem);
+       if (get_free_im_addr(size, &addr)) {
+               printk(KERN_ERR "%s() cannot obtain addr for size 0x%lx\n",
+                               __FUNCTION__, size);
+               area = NULL;
+               goto next_im_done;
+       }
+
+       area = __im_get_area(addr, size, IM_REGION_UNUSED);
+       if (area == NULL) {
+               printk(KERN_ERR 
+                      "%s() cannot obtain area for addr 0x%lx size 0x%lx\n",
+                       __FUNCTION__, addr, size);
+       }
+next_im_done:
+       up(&imlist_sem);
+       return area;
+}
+
+struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size,
+               int criteria)
+{
+       struct vm_struct *area;
+
+       down(&imlist_sem);
+       area = __im_get_area(v_addr, size, criteria);
+       up(&imlist_sem);
+       return area;
+}
+
+void im_free(void * addr)
+{
+       struct vm_struct **p, *tmp;
+  
+       if (!addr)
+               return;
+       if ((unsigned long) addr & ~PAGE_MASK) {
+               printk(KERN_ERR "Trying to %s bad address (%p)\n", __FUNCTION__,
+                               addr);
+               return;
+       }
+       down(&imlist_sem);
+       for (p = &imlist ; (tmp = *p) ; p = &tmp->next) {
+               if (tmp->addr == addr) {
+                       *p = tmp->next;
+
+                       /* XXX: do we need the lock? */
+                       spin_lock(&init_mm.page_table_lock);
+                       unmap_vm_area(tmp);
+                       spin_unlock(&init_mm.page_table_lock);
+
+                       kfree(tmp);
+                       up(&imlist_sem);
+                       return;
+               }
+       }
+       up(&imlist_sem);
+       printk(KERN_ERR "Trying to %s nonexistent area (%p)\n", __FUNCTION__,
+                       addr);
+}
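
im_region_status() above boils down to classifying how a requested [addr, addr+size) range relates to an existing region: disjoint, identical, subset, superset or partial overlap. The following self-contained sketch shows that classification in isolation; the local enum names only loosely mirror the IM_REGION_* constants, and the boundary conventions are simplified relative to the kernel helpers.

#include <stdio.h>

enum region_rel { REL_DISJOINT, REL_SUBSET, REL_SUPERSET, REL_EXISTS, REL_OVERLAP };

/* Classify range a against range b; half-open [start, start+size) ranges. */
static enum region_rel classify(unsigned long a, unsigned long asz,
                                unsigned long b, unsigned long bsz)
{
        unsigned long aend = a + asz, bend = b + bsz;

        if (aend <= b || bend <= a)
                return REL_DISJOINT;            /* no common addresses */
        if (a == b && asz == bsz)
                return REL_EXISTS;              /* identical regions */
        if (a >= b && aend <= bend)
                return REL_SUBSET;              /* a lies inside b */
        if (b >= a && bend <= aend)
                return REL_SUPERSET;            /* b lies inside a */
        return REL_OVERLAP;                     /* partial overlap */
}

int main(void)
{
        /* a inside b -> 1 (REL_SUBSET) */
        printf("%d\n", classify(0x1000, 0x1000, 0x0800, 0x2000));
        /* a straddles b's end -> 4 (REL_OVERLAP) */
        printf("%d\n", classify(0x1000, 0x1000, 0x0800, 0x1000));
        return 0;
}
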
index c0ce6a7af3c74371875f991ffbfb2c0d7382381e..b0fc822ec29f5b660825315a260317bd1a4e9114 100644 (file)
 #warning TASK_SIZE is smaller than it needs to be.
 #endif
 
-int mem_init_done;
-unsigned long ioremap_bot = IMALLOC_BASE;
-static unsigned long phbs_io_bot = PHBS_IO_BASE;
-
-extern pgd_t swapper_pg_dir[];
-extern struct task_struct *current_set[NR_CPUS];
-
 unsigned long klimit = (unsigned long)_end;
 
-unsigned long _SDR1=0;
-unsigned long _ASR=0;
-
 /* max amount of RAM to use */
 unsigned long __max_memory;
 
@@ -193,19 +183,6 @@ static int __init setup_kcore(void)
 }
 module_init(setup_kcore);
 
-void __iomem * reserve_phb_iospace(unsigned long size)
-{
-       void __iomem *virt_addr;
-               
-       if (phbs_io_bot >= IMALLOC_BASE) 
-               panic("reserve_phb_iospace(): phb io space overflow\n");
-                       
-       virt_addr = (void __iomem *) phbs_io_bot;
-       phbs_io_bot += size;
-
-       return virt_addr;
-}
-
 static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
 {
        memset(addr, 0, kmem_cache_size(cache));
@@ -244,16 +221,3 @@ void pgtable_cache_init(void)
                              name);
        }
 }
-
-pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
-                             unsigned long size, pgprot_t vma_prot)
-{
-       if (ppc_md.phys_mem_access_prot)
-               return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot);
-
-       if (!page_is_ram(addr >> PAGE_SHIFT))
-               vma_prot = __pgprot(pgprot_val(vma_prot)
-                                   | _PAGE_GUARDED | _PAGE_NO_CACHE);
-       return vma_prot;
-}
-EXPORT_SYMBOL(phys_mem_access_prot);
index 0650de74d0b3db7c0f90ef0b2d55fe938e737084..55b5860ed3c98c21c1633f915bdb0b0be94de1c7 100644 (file)
@@ -47,6 +47,9 @@
 #include <asm/prom.h>
 #include <asm/lmb.h>
 #include <asm/sections.h>
+#ifdef CONFIG_PPC64
+#include <asm/vdso.h>
+#endif
 
 #include "mmu_decl.h"
 
@@ -334,7 +337,7 @@ void flush_dcache_icache_page(struct page *page)
        void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
        __flush_dcache_icache(start);
        kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
-#elif defined(CONFIG_8xx)
+#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
        /* On 8xx there is no need to kmap since highmem is not supported */
        __flush_dcache_icache(page_address(page)); 
 #else
@@ -463,18 +466,18 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
        if (pgdir == NULL)
                return;
 
-       ptep = find_linux_pte(pgdir, ea);
+       ptep = find_linux_pte(pgdir, address);
        if (!ptep)
                return;
 
-       vsid = get_vsid(vma->vm_mm->context.id, ea);
+       vsid = get_vsid(vma->vm_mm->context.id, address);
 
        local_irq_save(flags);
        tmp = cpumask_of_cpu(smp_processor_id());
        if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp))
                local = 1;
 
-       __hash_page(ea, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep,
+       __hash_page(address, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep,
                    0x300, local);
        local_irq_restore(flags);
 #endif
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
new file mode 100644 (file)
index 0000000..fe65f52
--- /dev/null
@@ -0,0 +1,86 @@
+/*
+ *  linux/arch/ppc64/mm/mmap.c
+ *
+ *  flexible mmap layout support
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ * Started by Ingo Molnar <mingo@elte.hu>
+ */
+
+#include <linux/personality.h>
+#include <linux/mm.h>
+
+/*
+ * Top of mmap area (just below the process stack).
+ *
+ * Leave at least a ~128 MB hole.
+ */
+#define MIN_GAP (128*1024*1024)
+#define MAX_GAP (TASK_SIZE/6*5)
+
+static inline unsigned long mmap_base(void)
+{
+       unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
+
+       if (gap < MIN_GAP)
+               gap = MIN_GAP;
+       else if (gap > MAX_GAP)
+               gap = MAX_GAP;
+
+       return TASK_SIZE - (gap & PAGE_MASK);
+}
+
+static inline int mmap_is_legacy(void)
+{
+       /*
+        * Force standard allocation for 64 bit programs.
+        */
+       if (!test_thread_flag(TIF_32BIT))
+               return 1;
+
+       if (current->personality & ADDR_COMPAT_LAYOUT)
+               return 1;
+
+       if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
+               return 1;
+
+       return sysctl_legacy_va_layout;
+}
+
+/*
+ * This function, called very early during the creation of a new
+ * process VM image, sets up which VM layout function to use:
+ */
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+       /*
+        * Fall back to the standard layout if the personality
+        * bit is set, or if the expected stack growth is unlimited:
+        */
+       if (mmap_is_legacy()) {
+               mm->mmap_base = TASK_UNMAPPED_BASE;
+               mm->get_unmapped_area = arch_get_unmapped_area;
+               mm->unmap_area = arch_unmap_area;
+       } else {
+               mm->mmap_base = mmap_base();
+               mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+               mm->unmap_area = arch_unmap_area_topdown;
+       }
+}
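
mmap_base() above derives the top-down mmap base by clamping the stack rlimit between a 128 MB minimum gap and 5/6 of the task size, then page-aligning the result below TASK_SIZE. Here is a userspace sketch of just that clamping; the EX_* constants are invented example values standing in for the real TASK_SIZE and PAGE_MASK, and this is an illustration of the calculation rather than the kernel code path, which also handles the legacy layouts chosen by mmap_is_legacy().

#include <stdio.h>
#include <sys/resource.h>

#define EX_TASK_SIZE  (1UL << 46)                 /* assumed 64-bit task size */
#define EX_PAGE_MASK  (~((1UL << 16) - 1))        /* assumed 64K page size */
#define EX_MIN_GAP    (128UL * 1024 * 1024)
#define EX_MAX_GAP    (EX_TASK_SIZE / 6 * 5)

int main(void)
{
        struct rlimit rl;
        unsigned long gap;

        if (getrlimit(RLIMIT_STACK, &rl))
                return 1;

        /* clamp the stack gap, then place the base that far below the task size */
        gap = rl.rlim_cur;
        if (gap < EX_MIN_GAP)
                gap = EX_MIN_GAP;
        else if (gap > EX_MAX_GAP)
                gap = EX_MAX_GAP;

        printf("top-down mmap base would be 0x%lx\n",
               EX_TASK_SIZE - (gap & EX_PAGE_MASK));
        return 0;
}
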
index 06fe8af3af558417bf8ee60a9e0b44dd71db6ca9..a4d7a327c0e52bc409e8890a3b91bee4799dda32 100644 (file)
 #include <asm/tlbflush.h>
 #include <asm/mmu.h>
 
+#ifdef CONFIG_PPC32
 extern void mapin_ram(void);
 extern int map_page(unsigned long va, phys_addr_t pa, int flags);
 extern void setbat(int index, unsigned long virt, unsigned long phys,
                   unsigned int size, int flags);
-extern void reserve_phys_mem(unsigned long start, unsigned long size);
 extern void settlbcam(int index, unsigned long virt, phys_addr_t phys,
                      unsigned int size, int flags, unsigned int pid);
 extern void invalidate_tlbcam_entry(int index);
@@ -36,16 +36,16 @@ extern unsigned long ioremap_base;
 extern unsigned long ioremap_bot;
 extern unsigned int rtas_data, rtas_size;
 
-extern unsigned long __max_low_memory;
-extern unsigned long __initial_memory_limit;
-extern unsigned long total_memory;
-extern unsigned long total_lowmem;
-extern int mem_init_done;
-
 extern PTE *Hash, *Hash_end;
 extern unsigned long Hash_size, Hash_mask;
 
 extern unsigned int num_tlbcam_entries;
+#endif
+
+extern unsigned long __max_low_memory;
+extern unsigned long __initial_memory_limit;
+extern unsigned long total_memory;
+extern unsigned long total_lowmem;
 
 /* ...and now those things that may be slightly different between processor
  * architectures.  -- Dan
@@ -66,8 +66,8 @@ extern void MMU_init_hw(void);
 extern unsigned long mmu_mapin_ram(void);
 extern void adjust_total_lowmem(void);
 
-#else
-/* anything except 4xx or 8xx */
+#elif defined(CONFIG_PPC32)
+/* anything 32-bit except 4xx or 8xx */
 extern void MMU_init_hw(void);
 extern unsigned long mmu_mapin_ram(void);
 
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
new file mode 100644 (file)
index 0000000..cb864b8
--- /dev/null
@@ -0,0 +1,779 @@
+/*
+ * pSeries NUMA support
+ *
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/threads.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <asm/lmb.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+
+static int numa_enabled = 1;
+
+static int numa_debug;
+#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
+
+#ifdef DEBUG_NUMA
+#define ARRAY_INITIALISER -1
+#else
+#define ARRAY_INITIALISER 0
+#endif
+
+int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
+       ARRAY_INITIALISER};
+char *numa_memory_lookup_table;
+cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
+int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};
+
+struct pglist_data *node_data[MAX_NUMNODES];
+bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
+static int min_common_depth;
+
+/*
+ * We need somewhere to store start/span for each node until we have
+ * allocated the real node_data structures.
+ */
+static struct {
+       unsigned long node_start_pfn;
+       unsigned long node_end_pfn;
+       unsigned long node_present_pages;
+} init_node_data[MAX_NUMNODES] __initdata;
+
+EXPORT_SYMBOL(node_data);
+EXPORT_SYMBOL(numa_cpu_lookup_table);
+EXPORT_SYMBOL(numa_memory_lookup_table);
+EXPORT_SYMBOL(numa_cpumask_lookup_table);
+EXPORT_SYMBOL(nr_cpus_in_node);
+
+static inline void map_cpu_to_node(int cpu, int node)
+{
+       numa_cpu_lookup_table[cpu] = node;
+       if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
+               cpu_set(cpu, numa_cpumask_lookup_table[node]);
+               nr_cpus_in_node[node]++;
+       }
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void unmap_cpu_from_node(unsigned long cpu)
+{
+       int node = numa_cpu_lookup_table[cpu];
+
+       dbg("removing cpu %lu from node %d\n", cpu, node);
+
+       if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
+               cpu_clear(cpu, numa_cpumask_lookup_table[node]);
+               nr_cpus_in_node[node]--;
+       } else {
+               printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
+                      cpu, node);
+       }
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static struct device_node * __devinit find_cpu_node(unsigned int cpu)
+{
+       unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
+       struct device_node *cpu_node = NULL;
+       unsigned int *interrupt_server, *reg;
+       int len;
+
+       while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
+               /* Try interrupt server first */
+               interrupt_server = (unsigned int *)get_property(cpu_node,
+                                       "ibm,ppc-interrupt-server#s", &len);
+
+               len = len / sizeof(u32);
+
+               if (interrupt_server && (len > 0)) {
+                       while (len--) {
+                               if (interrupt_server[len] == hw_cpuid)
+                                       return cpu_node;
+                       }
+               } else {
+                       reg = (unsigned int *)get_property(cpu_node,
+                                                          "reg", &len);
+                       if (reg && (len > 0) && (reg[0] == hw_cpuid))
+                               return cpu_node;
+               }
+       }
+
+       return NULL;
+}
+
+/* must hold reference to node during call */
+static int *of_get_associativity(struct device_node *dev)
+{
+       return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
+}
+
+static int of_node_numa_domain(struct device_node *device)
+{
+       int numa_domain;
+       unsigned int *tmp;
+
+       if (min_common_depth == -1)
+               return 0;
+
+       tmp = of_get_associativity(device);
+       if (tmp && (tmp[0] >= min_common_depth)) {
+               numa_domain = tmp[min_common_depth];
+       } else {
+               dbg("WARNING: no NUMA information for %s\n",
+                   device->full_name);
+               numa_domain = 0;
+       }
+       return numa_domain;
+}
+
+/*
+ * In theory, the "ibm,associativity" property may contain multiple
+ * associativity lists because a resource may be multiply connected
+ * into the machine.  This resource then has different associativity
+ * characteristics relative to its multiple connections.  We ignore
+ * this for now.  We also assume that all cpu and memory sets have
+ * their distances represented at a common level.  This won't be
+ * true for hierarchical NUMA.
+ *
+ * In any case the ibm,associativity-reference-points should give
+ * the correct depth for a normal NUMA system.
+ *
+ * - Dave Hansen <haveblue@us.ibm.com>
+ */
+static int __init find_min_common_depth(void)
+{
+       int depth;
+       unsigned int *ref_points;
+       struct device_node *rtas_root;
+       unsigned int len;
+
+       rtas_root = of_find_node_by_path("/rtas");
+
+       if (!rtas_root)
+               return -1;
+
+       /*
+        * this property is 2 32-bit integers, each representing a level of
+        * depth in the associativity nodes.  The first is for an SMP
+        * configuration (should be all 0's) and the second is for a normal
+        * NUMA configuration.
+        */
+       ref_points = (unsigned int *)get_property(rtas_root,
+                       "ibm,associativity-reference-points", &len);
+
+       if (ref_points && (len >= 2 * sizeof(unsigned int))) {
+               depth = ref_points[1];
+       } else {
+               dbg("WARNING: could not find NUMA "
+                   "associativity reference point\n");
+               depth = -1;
+       }
+       of_node_put(rtas_root);
+
+       return depth;
+}
+
+static int __init get_mem_addr_cells(void)
+{
+       struct device_node *memory = NULL;
+       int rc;
+
+       memory = of_find_node_by_type(memory, "memory");
+       if (!memory)
+               return 0; /* it won't matter */
+
+       rc = prom_n_addr_cells(memory);
+       return rc;
+}
+
+static int __init get_mem_size_cells(void)
+{
+       struct device_node *memory = NULL;
+       int rc;
+
+       memory = of_find_node_by_type(memory, "memory");
+       if (!memory)
+               return 0; /* it won't matter */
+       rc = prom_n_size_cells(memory);
+       return rc;
+}
+
+static unsigned long read_n_cells(int n, unsigned int **buf)
+{
+       unsigned long result = 0;
+
+       while (n--) {
+               result = (result << 32) | **buf;
+               (*buf)++;
+       }
+       return result;
+}
+
+/*
+ * Figure out to which domain a cpu belongs and stick it there.
+ * Return the id of the domain used.
+ */
+static int numa_setup_cpu(unsigned long lcpu)
+{
+       int numa_domain = 0;
+       struct device_node *cpu = find_cpu_node(lcpu);
+
+       if (!cpu) {
+               WARN_ON(1);
+               goto out;
+       }
+
+       numa_domain = of_node_numa_domain(cpu);
+
+       if (numa_domain >= num_online_nodes()) {
+               /*
+                * POWER4 LPAR uses 0xffff as invalid node,
+                * don't warn in this case.
+                */
+               if (numa_domain != 0xffff)
+                       printk(KERN_ERR "WARNING: cpu %ld "
+                              "maps to invalid NUMA node %d\n",
+                              lcpu, numa_domain);
+               numa_domain = 0;
+       }
+out:
+       node_set_online(numa_domain);
+
+       map_cpu_to_node(lcpu, numa_domain);
+
+       of_node_put(cpu);
+
+       return numa_domain;
+}
+
+static int cpu_numa_callback(struct notifier_block *nfb,
+                            unsigned long action,
+                            void *hcpu)
+{
+       unsigned long lcpu = (unsigned long)hcpu;
+       int ret = NOTIFY_DONE;
+
+       switch (action) {
+       case CPU_UP_PREPARE:
+               if (min_common_depth == -1 || !numa_enabled)
+                       map_cpu_to_node(lcpu, 0);
+               else
+                       numa_setup_cpu(lcpu);
+               ret = NOTIFY_OK;
+               break;
+#ifdef CONFIG_HOTPLUG_CPU
+       case CPU_DEAD:
+       case CPU_UP_CANCELED:
+               unmap_cpu_from_node(lcpu);
+               ret = NOTIFY_OK;
+               break;
+#endif
+       }
+       return ret;
+}
+
+/*
+ * Check and possibly modify a memory region to enforce the memory limit.
+ *
+ * Returns the size the region should have to enforce the memory limit.
+ * This will either be the original value of size, a truncated value,
+ * or zero. If the returned value of size is 0 the region should be
+ * discarded as it lies wholly above the memory limit.
+ */
+static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size)
+{
+       /*
+        * We use lmb_end_of_DRAM() in here instead of memory_limit because
+        * we've already adjusted it for the limit and it takes care of
+        * having memory holes below the limit.
+        */
+       extern unsigned long memory_limit;
+
+       if (! memory_limit)
+               return size;
+
+       if (start + size <= lmb_end_of_DRAM())
+               return size;
+
+       if (start >= lmb_end_of_DRAM())
+               return 0;
+
+       return lmb_end_of_DRAM() - start;
+}
+
+static int __init parse_numa_properties(void)
+{
+       struct device_node *cpu = NULL;
+       struct device_node *memory = NULL;
+       int addr_cells, size_cells;
+       int max_domain = 0;
+       long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
+       unsigned long i;
+
+       if (numa_enabled == 0) {
+               printk(KERN_WARNING "NUMA disabled by user\n");
+               return -1;
+       }
+
+       numa_memory_lookup_table =
+               (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
+       memset(numa_memory_lookup_table, 0, entries * sizeof(char));
+
+       for (i = 0; i < entries ; i++)
+               numa_memory_lookup_table[i] = ARRAY_INITIALISER;
+
+       min_common_depth = find_min_common_depth();
+
+       dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+       if (min_common_depth < 0)
+               return min_common_depth;
+
+       max_domain = numa_setup_cpu(boot_cpuid);
+
+       /*
+        * Even though we connect cpus to numa domains later in SMP init,
+        * we need to know the maximum node id now. This is because each
+        * node id must have NODE_DATA etc backing it.
+        * As a result of hotplug we could still have cpus appear later on
+        * with larger node ids. In that case we force the cpu into node 0.
+        */
+       for_each_cpu(i) {
+               int numa_domain;
+
+               cpu = find_cpu_node(i);
+
+               if (cpu) {
+                       numa_domain = of_node_numa_domain(cpu);
+                       of_node_put(cpu);
+
+                       if (numa_domain < MAX_NUMNODES &&
+                           max_domain < numa_domain)
+                               max_domain = numa_domain;
+               }
+       }
+
+       addr_cells = get_mem_addr_cells();
+       size_cells = get_mem_size_cells();
+       memory = NULL;
+       while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+               unsigned long start;
+               unsigned long size;
+               int numa_domain;
+               int ranges;
+               unsigned int *memcell_buf;
+               unsigned int len;
+
+               memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
+               if (!memcell_buf || len <= 0)
+                       continue;
+
+               ranges = memory->n_addrs;
+new_range:
+               /* these are order-sensitive, and modify the buffer pointer */
+               start = read_n_cells(addr_cells, &memcell_buf);
+               size = read_n_cells(size_cells, &memcell_buf);
+
+               start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
+               size = _ALIGN_UP(size, MEMORY_INCREMENT);
+
+               numa_domain = of_node_numa_domain(memory);
+
+               if (numa_domain >= MAX_NUMNODES) {
+                       if (numa_domain != 0xffff)
+                               printk(KERN_ERR "WARNING: memory at %lx maps "
+                                      "to invalid NUMA node %d\n", start,
+                                      numa_domain);
+                       numa_domain = 0;
+               }
+
+               if (max_domain < numa_domain)
+                       max_domain = numa_domain;
+
+               if (! (size = numa_enforce_memory_limit(start, size))) {
+                       if (--ranges)
+                               goto new_range;
+                       else
+                               continue;
+               }
+
+               /*
+                * Initialize new node struct, or add to an existing one.
+                */
+               if (init_node_data[numa_domain].node_end_pfn) {
+                       if ((start / PAGE_SIZE) <
+                           init_node_data[numa_domain].node_start_pfn)
+                               init_node_data[numa_domain].node_start_pfn =
+                                       start / PAGE_SIZE;
+                       if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
+                           init_node_data[numa_domain].node_end_pfn)
+                               init_node_data[numa_domain].node_end_pfn =
+                                       (start / PAGE_SIZE) +
+                                       (size / PAGE_SIZE);
+
+                       init_node_data[numa_domain].node_present_pages +=
+                               size / PAGE_SIZE;
+               } else {
+                       node_set_online(numa_domain);
+
+                       init_node_data[numa_domain].node_start_pfn =
+                               start / PAGE_SIZE;
+                       init_node_data[numa_domain].node_end_pfn =
+                               init_node_data[numa_domain].node_start_pfn +
+                               size / PAGE_SIZE;
+                       init_node_data[numa_domain].node_present_pages =
+                               size / PAGE_SIZE;
+               }
+
+               for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
+                       numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
+                               numa_domain;
+
+               if (--ranges)
+                       goto new_range;
+       }
+
+       for (i = 0; i <= max_domain; i++)
+               node_set_online(i);
+
+       return 0;
+}
+
+static void __init setup_nonnuma(void)
+{
+       unsigned long top_of_ram = lmb_end_of_DRAM();
+       unsigned long total_ram = lmb_phys_mem_size();
+       unsigned long i;
+
+       printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
+              top_of_ram, total_ram);
+       printk(KERN_INFO "Memory hole size: %ldMB\n",
+              (top_of_ram - total_ram) >> 20);
+
+       if (!numa_memory_lookup_table) {
+               long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
+               numa_memory_lookup_table =
+                       (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
+               memset(numa_memory_lookup_table, 0, entries * sizeof(char));
+               for (i = 0; i < entries ; i++)
+                       numa_memory_lookup_table[i] = ARRAY_INITIALISER;
+       }
+
+       map_cpu_to_node(boot_cpuid, 0);
+
+       node_set_online(0);
+
+       init_node_data[0].node_start_pfn = 0;
+       init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
+       init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;
+
+       for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
+               numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
+}
+
+static void __init dump_numa_topology(void)
+{
+       unsigned int node;
+       unsigned int count;
+
+       if (min_common_depth == -1 || !numa_enabled)
+               return;
+
+       for_each_online_node(node) {
+               unsigned long i;
+
+               printk(KERN_INFO "Node %d Memory:", node);
+
+               count = 0;
+
+               for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
+                       if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
+                               if (count == 0)
+                                       printk(" 0x%lx", i);
+                               ++count;
+                       } else {
+                               if (count > 0)
+                                       printk("-0x%lx", i);
+                               count = 0;
+                       }
+               }
+
+               if (count > 0)
+                       printk("-0x%lx", i);
+               printk("\n");
+       }
+       return;
+}
+
+/*
+ * Allocate some memory, satisfying the lmb or bootmem allocator where
+ * required. nid is the preferred node and end is the physical address of
+ * the highest address in the node.
+ *
+ * Returns the physical address of the memory.
+ */
+static unsigned long careful_allocation(int nid, unsigned long size,
+                                       unsigned long align, unsigned long end)
+{
+       unsigned long ret = lmb_alloc_base(size, align, end);
+
+       /* retry over all memory */
+       if (!ret)
+               ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());
+
+       if (!ret)
+               panic("numa.c: cannot allocate %lu bytes on node %d",
+                     size, nid);
+
+       /*
+        * If the memory came from a previously allocated node, we must
+        * retry with the bootmem allocator.
+        */
+       if (pa_to_nid(ret) < nid) {
+               nid = pa_to_nid(ret);
+               ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
+                               size, align, 0);
+
+               if (!ret)
+                       panic("numa.c: cannot allocate %lu bytes on node %d",
+                             size, nid);
+
+               ret = virt_to_abs(ret);
+
+               dbg("alloc_bootmem %lx %lx\n", ret, size);
+       }
+
+       return ret;
+}
+
+void __init do_init_bootmem(void)
+{
+       int nid;
+       int addr_cells, size_cells;
+       struct device_node *memory = NULL;
+       static struct notifier_block ppc64_numa_nb = {
+               .notifier_call = cpu_numa_callback,
+               .priority = 1 /* Must run before sched domains notifier. */
+       };
+
+       min_low_pfn = 0;
+       max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
+       max_pfn = max_low_pfn;
+
+       if (parse_numa_properties())
+               setup_nonnuma();
+       else
+               dump_numa_topology();
+
+       register_cpu_notifier(&ppc64_numa_nb);
+
+       for_each_online_node(nid) {
+               unsigned long start_paddr, end_paddr;
+               int i;
+               unsigned long bootmem_paddr;
+               unsigned long bootmap_pages;
+
+               start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
+               end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;
+
+               /* Allocate the node structure node local if possible */
+               NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
+                                       sizeof(struct pglist_data),
+                                       SMP_CACHE_BYTES, end_paddr);
+               NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
+               memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+
+               dbg("node %d\n", nid);
+               dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
+
+               NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
+               NODE_DATA(nid)->node_start_pfn =
+                       init_node_data[nid].node_start_pfn;
+               NODE_DATA(nid)->node_spanned_pages =
+                       end_paddr - start_paddr;
+
+               if (NODE_DATA(nid)->node_spanned_pages == 0)
+                       continue;
+
+               dbg("start_paddr = %lx\n", start_paddr);
+               dbg("end_paddr = %lx\n", end_paddr);
+
+               bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
+
+               bootmem_paddr = careful_allocation(nid,
+                               bootmap_pages << PAGE_SHIFT,
+                               PAGE_SIZE, end_paddr);
+               memset(abs_to_virt(bootmem_paddr), 0,
+                      bootmap_pages << PAGE_SHIFT);
+               dbg("bootmap_paddr = %lx\n", bootmem_paddr);
+
+               init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
+                                 start_paddr >> PAGE_SHIFT,
+                                 end_paddr >> PAGE_SHIFT);
+
+               /*
+                * We need to do another scan of all memory sections to
+                * associate memory with the correct node.
+                */
+               addr_cells = get_mem_addr_cells();
+               size_cells = get_mem_size_cells();
+               memory = NULL;
+               while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+                       unsigned long mem_start, mem_size;
+                       int numa_domain, ranges;
+                       unsigned int *memcell_buf;
+                       unsigned int len;
+
+                       memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
+                       if (!memcell_buf || len <= 0)
+                               continue;
+
+                       ranges = memory->n_addrs;       /* ranges in cell */
+new_range:
+                       mem_start = read_n_cells(addr_cells, &memcell_buf);
+                       mem_size = read_n_cells(size_cells, &memcell_buf);
+                       if (numa_enabled) {
+                               numa_domain = of_node_numa_domain(memory);
+                               if (numa_domain  >= MAX_NUMNODES)
+                                       numa_domain = 0;
+                       } else
+                               numa_domain =  0;
+
+                       if (numa_domain != nid)
+                               continue;
+
+                       mem_size = numa_enforce_memory_limit(mem_start, mem_size);
+                       if (mem_size) {
+                               dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
+                               free_bootmem_node(NODE_DATA(nid), mem_start, mem_size);
+                       }
+
+                       if (--ranges)           /* process all ranges in cell */
+                               goto new_range;
+               }
+
+               /*
+                * Mark reserved regions on this node
+                */
+               for (i = 0; i < lmb.reserved.cnt; i++) {
+                       unsigned long physbase = lmb.reserved.region[i].base;
+                       unsigned long size = lmb.reserved.region[i].size;
+
+                       if (pa_to_nid(physbase) != nid &&
+                           pa_to_nid(physbase+size-1) != nid)
+                               continue;
+
+                       if (physbase < end_paddr &&
+                           (physbase+size) > start_paddr) {
+                               /* overlaps */
+                               if (physbase < start_paddr) {
+                                       size -= start_paddr - physbase;
+                                       physbase = start_paddr;
+                               }
+
+                               if (size > end_paddr - physbase)
+                                       size = end_paddr - physbase;
+
+                               dbg("reserve_bootmem %lx %lx\n", physbase,
+                                   size);
+                               reserve_bootmem_node(NODE_DATA(nid), physbase,
+                                                    size);
+                       }
+               }
+               /*
+                * This loop may look familiar, but we have to do it again
+                * after marking our reserved memory to mark memory present
+                * for sparsemem.
+                */
+               addr_cells = get_mem_addr_cells();
+               size_cells = get_mem_size_cells();
+               memory = NULL;
+               while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+                       unsigned long mem_start, mem_size;
+                       int numa_domain, ranges;
+                       unsigned int *memcell_buf;
+                       unsigned int len;
+
+                       memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
+                       if (!memcell_buf || len <= 0)
+                               continue;
+
+                       ranges = memory->n_addrs;       /* ranges in cell */
+new_range2:
+                       mem_start = read_n_cells(addr_cells, &memcell_buf);
+                       mem_size = read_n_cells(size_cells, &memcell_buf);
+                       if (numa_enabled) {
+                               numa_domain = of_node_numa_domain(memory);
+                               if (numa_domain  >= MAX_NUMNODES)
+                                       numa_domain = 0;
+                       } else
+                               numa_domain =  0;
+
+                       if (numa_domain != nid)
+                               continue;
+
+                       mem_size = numa_enforce_memory_limit(mem_start, mem_size);
+                       memory_present(numa_domain, mem_start >> PAGE_SHIFT,
+                                      (mem_start + mem_size) >> PAGE_SHIFT);
+
+                       if (--ranges)           /* process all ranges in cell */
+                               goto new_range2;
+               }
+
+       }
+}
+
+void __init paging_init(void)
+{
+       unsigned long zones_size[MAX_NR_ZONES];
+       unsigned long zholes_size[MAX_NR_ZONES];
+       int nid;
+
+       memset(zones_size, 0, sizeof(zones_size));
+       memset(zholes_size, 0, sizeof(zholes_size));
+
+       for_each_online_node(nid) {
+               unsigned long start_pfn;
+               unsigned long end_pfn;
+
+               start_pfn = init_node_data[nid].node_start_pfn;
+               end_pfn = init_node_data[nid].node_end_pfn;
+
+               zones_size[ZONE_DMA] = end_pfn - start_pfn;
+               zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
+                       init_node_data[nid].node_present_pages;
+
+               dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
+                   zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
+
+               free_area_init_node(nid, NODE_DATA(nid), zones_size,
+                                                       start_pfn, zholes_size);
+       }
+}
+
+static int __init early_numa(char *p)
+{
+       if (!p)
+               return 0;
+
+       if (strstr(p, "off"))
+               numa_enabled = 0;
+
+       if (strstr(p, "debug"))
+               numa_debug = 1;
+
+       return 0;
+}
+early_param("numa", early_numa);
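
read_n_cells() above is worth calling out: device-tree "reg" properties encode 64-bit addresses and sizes as one or two big-endian 32-bit cells, and successive calls walk the same buffer pointer forward. A standalone sketch of the same folding follows; the sample property is made up for illustration and a 64-bit unsigned long is assumed, as on ppc64.

#include <stdio.h>

/* Fold n 32-bit cells into one value, advancing the caller's pointer. */
static unsigned long read_n_cells(int n, const unsigned int **buf)
{
        unsigned long result = 0;

        while (n--) {
                result = (result << 32) | **buf;
                (*buf)++;
        }
        return result;
}

int main(void)
{
        /* #address-cells = 2, #size-cells = 2: start 4GB, size 256MB (sample data) */
        const unsigned int reg[] = { 0x1, 0x00000000, 0x0, 0x10000000 };
        const unsigned int *p = reg;

        unsigned long start = read_n_cells(2, &p);
        unsigned long size  = read_n_cells(2, &p);

        printf("start 0x%lx size 0x%lx\n", start, size);
        return 0;
}
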
index 724f97e5dee5571389ad568045f3d786668746a4..484d24f9208bcf839b129249ab64d640c94bcf7b 100644 (file)
 #include <asm/vdso.h>
 #include <asm/imalloc.h>
 
-#if PGTABLE_RANGE > USER_VSID_RANGE
-#warning Limited user VSID range means pagetable space is wasted
-#endif
-
-#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
-#warning TASK_SIZE is smaller than it needs to be.
-#endif
-
-int mem_init_done;
 unsigned long ioremap_bot = IMALLOC_BASE;
 static unsigned long phbs_io_bot = PHBS_IO_BASE;
 
-extern pgd_t swapper_pg_dir[];
-extern struct task_struct *current_set[NR_CPUS];
-
-unsigned long klimit = (unsigned long)_end;
-
-/* max amount of RAM to use */
-unsigned long __max_memory;
-
-/* info on what we think the IO hole is */
-unsigned long  io_hole_start;
-unsigned long  io_hole_size;
-
 #ifdef CONFIG_PPC_ISERIES
 
 void __iomem *ioremap(unsigned long addr, unsigned long size)
@@ -355,3 +334,16 @@ int iounmap_explicit(volatile void __iomem *start, unsigned long size)
 EXPORT_SYMBOL(ioremap);
 EXPORT_SYMBOL(__ioremap);
 EXPORT_SYMBOL(iounmap);
+
+void __iomem * reserve_phb_iospace(unsigned long size)
+{
+       void __iomem *virt_addr;
+               
+       if (phbs_io_bot >= IMALLOC_BASE) 
+               panic("reserve_phb_iospace(): phb io space overflow\n");
+                       
+       virt_addr = (void __iomem *) phbs_io_bot;
+       phbs_io_bot += size;
+
+       return virt_addr;
+}
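
reserve_phb_iospace(), dropped from the earlier file in this diff and re-added here, is a plain bump allocator over a fixed virtual window: a cursor advances by the requested size and the only failure mode is running into IMALLOC_BASE. A toy userspace sketch of the same pattern, with invented window bounds and a NULL-style return in place of the kernel's panic:

#include <stdio.h>

#define EX_IO_BOT    0x1000UL                   /* assumed window start */
#define EX_IO_LIMIT  0x10000UL                  /* assumed window end */

static unsigned long io_cursor = EX_IO_BOT;

/* Bump allocation: hand out the cursor, advance it, never free. */
static unsigned long reserve_io(unsigned long size)
{
        unsigned long addr;

        if (io_cursor + size > EX_IO_LIMIT)
                return 0;                       /* window exhausted */

        addr = io_cursor;
        io_cursor += size;
        return addr;
}

int main(void)
{
        printf("first  0x%lx\n", reserve_io(0x800));
        printf("second 0x%lx\n", reserve_io(0x800));
        return 0;
}
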
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
new file mode 100644 (file)
index 0000000..0473953
--- /dev/null
@@ -0,0 +1,158 @@
+/*
+ * PowerPC64 SLB support.
+ *
+ * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
+ * Based on earlier code written by:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ *    Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/paca.h>
+#include <asm/cputable.h>
+
+extern void slb_allocate(unsigned long ea);
+
+static inline unsigned long mk_esid_data(unsigned long ea, unsigned long slot)
+{
+       return (ea & ESID_MASK) | SLB_ESID_V | slot;
+}
+
+static inline unsigned long mk_vsid_data(unsigned long ea, unsigned long flags)
+{
+       return (get_kernel_vsid(ea) << SLB_VSID_SHIFT) | flags;
+}
+
+static inline void create_slbe(unsigned long ea, unsigned long flags,
+                              unsigned long entry)
+{
+       asm volatile("slbmte  %0,%1" :
+                    : "r" (mk_vsid_data(ea, flags)),
+                      "r" (mk_esid_data(ea, entry))
+                    : "memory" );
+}
+
+static void slb_flush_and_rebolt(void)
+{
+       /* If you change this make sure you change SLB_NUM_BOLTED
+        * appropriately too. */
+       unsigned long ksp_flags = SLB_VSID_KERNEL;
+       unsigned long ksp_esid_data;
+
+       WARN_ON(!irqs_disabled());
+
+       if (cpu_has_feature(CPU_FTR_16M_PAGE))
+               ksp_flags |= SLB_VSID_L;
+
+       ksp_esid_data = mk_esid_data(get_paca()->kstack, 2);
+       if ((ksp_esid_data & ESID_MASK) == KERNELBASE)
+               ksp_esid_data &= ~SLB_ESID_V;
+
+       /* We need to do this all in asm, so we're sure we don't touch
+        * the stack between the slbia and rebolting it. */
+       asm volatile("isync\n"
+                    "slbia\n"
+                    /* Slot 1 - first VMALLOC segment */
+                    "slbmte    %0,%1\n"
+                    /* Slot 2 - kernel stack */
+                    "slbmte    %2,%3\n"
+                    "isync"
+                    :: "r"(mk_vsid_data(VMALLOCBASE, SLB_VSID_KERNEL)),
+                       "r"(mk_esid_data(VMALLOCBASE, 1)),
+                       "r"(mk_vsid_data(ksp_esid_data, ksp_flags)),
+                       "r"(ksp_esid_data)
+                    : "memory");
+}
+
+/* Flush all user entries from the segment table of the current processor. */
+void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
+{
+       unsigned long offset = get_paca()->slb_cache_ptr;
+       unsigned long esid_data = 0;
+       unsigned long pc = KSTK_EIP(tsk);
+       unsigned long stack = KSTK_ESP(tsk);
+       unsigned long unmapped_base;
+
+       if (offset <= SLB_CACHE_ENTRIES) {
+               int i;
+               asm volatile("isync" : : : "memory");
+               for (i = 0; i < offset; i++) {
+                       esid_data = ((unsigned long)get_paca()->slb_cache[i]
+                               << SID_SHIFT) | SLBIE_C;
+                       asm volatile("slbie %0" : : "r" (esid_data));
+               }
+               asm volatile("isync" : : : "memory");
+       } else {
+               slb_flush_and_rebolt();
+       }
+
+       /* Workaround POWER5 < DD2.1 issue */
+       if (offset == 1 || offset > SLB_CACHE_ENTRIES)
+               asm volatile("slbie %0" : : "r" (esid_data));
+
+       get_paca()->slb_cache_ptr = 0;
+       get_paca()->context = mm->context;
+
+       /*
+        * preload some userspace segments into the SLB.
+        */
+       if (test_tsk_thread_flag(tsk, TIF_32BIT))
+               unmapped_base = TASK_UNMAPPED_BASE_USER32;
+       else
+               unmapped_base = TASK_UNMAPPED_BASE_USER64;
+
+       if (pc >= KERNELBASE)
+               return;
+       slb_allocate(pc);
+
+       if (GET_ESID(pc) == GET_ESID(stack))
+               return;
+
+       if (stack >= KERNELBASE)
+               return;
+       slb_allocate(stack);
+
+       if ((GET_ESID(pc) == GET_ESID(unmapped_base))
+           || (GET_ESID(stack) == GET_ESID(unmapped_base)))
+               return;
+
+       if (unmapped_base >= KERNELBASE)
+               return;
+       slb_allocate(unmapped_base);
+}
+
+void slb_initialize(void)
+{
+       /* On iSeries the bolted entries have already been set up by
+        * the hypervisor from the lparMap data in head.S */
+#ifndef CONFIG_PPC_ISERIES
+       unsigned long flags = SLB_VSID_KERNEL;
+
+       /* Invalidate the entire SLB (even slot 0) & all the ERATS */
+       if (cpu_has_feature(CPU_FTR_16M_PAGE))
+               flags |= SLB_VSID_L;
+
+       asm volatile("isync":::"memory");
+       asm volatile("slbmte  %0,%0"::"r" (0) : "memory");
+       asm volatile("isync; slbia; isync":::"memory");
+       create_slbe(KERNELBASE, flags, 0);
+       create_slbe(VMALLOCBASE, SLB_VSID_KERNEL, 1);
+       /* We don't bolt the stack for the time being - we're in boot,
+        * so the stack is in the bolted segment.  By the time it goes
+        * elsewhere, we'll call _switch() which will bolt in the new
+        * one. */
+       asm volatile("isync":::"memory");
+#endif
+
+       get_paca()->stab_rr = SLB_NUM_BOLTED;
+}
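To summarise the flush policy switch_slb() implements above: a small per-CPU cache records which user ESIDs were inserted since the last context switch; if the cache did not overflow, only those entries are invalidated with slbie, otherwise the whole SLB is wiped and the bolted entries re-created. A user-space sketch of just that decision; SLB_CACHE_ENTRIES here is an illustrative value and the printf calls stand in for the real invalidate operations.

#include <stdio.h>

#define SLB_CACHE_ENTRIES 8             /* illustrative value */

static void flush_cached(const unsigned long *cache, unsigned long n)
{
        unsigned long i;

        for (i = 0; i < n; i++)
                printf("slbie esid %#lx\n", cache[i]);  /* targeted invalidate */
}

int main(void)
{
        unsigned long slb_cache[SLB_CACHE_ENTRIES] = { 0x10, 0x11, 0x12 };
        unsigned long slb_cache_ptr = 3;        /* entries recorded since last switch */

        if (slb_cache_ptr <= SLB_CACHE_ENTRIES)
                flush_cached(slb_cache, slb_cache_ptr);
        else
                printf("slbia, then rebolt kernel entries\n");  /* cache overflowed */
        return 0;
}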
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
new file mode 100644 (file)
index 0000000..a3a03da
--- /dev/null
@@ -0,0 +1,151 @@
+/*
+ * arch/powerpc/mm/slb_low.S
+ *
+ * Low-level SLB routines
+ *
+ * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
+ *
+ * Based on earlier C version:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ *    Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+
+/* void slb_allocate(unsigned long ea);
+ *
+ * Create an SLB entry for the given EA (user or kernel).
+ *     r3 = faulting address, r13 = PACA
+ *     r9, r10, r11 are clobbered by this function
+ * No other registers are examined or changed.
+ */
+_GLOBAL(slb_allocate)
+       /*
+        * First find a slot, round robin. Previously we tried to find
+        * a free slot first but that took too long. Unfortunately we
+        * don't have any LRU information to help us choose a slot.
+        */
+#ifdef CONFIG_PPC_ISERIES
+       /*
+        * On iSeries, the "bolted" stack segment can be cast out on
+        * shared processor switch so we need to check for a miss on
+        * it and restore it to the right slot.
+        */
+       ld      r9,PACAKSAVE(r13)
+       clrrdi  r9,r9,28
+       clrrdi  r11,r3,28
+       li      r10,SLB_NUM_BOLTED-1    /* Stack goes in last bolted slot */
+       cmpld   r9,r11
+       beq     3f
+#endif /* CONFIG_PPC_ISERIES */
+
+       ld      r10,PACASTABRR(r13)
+       addi    r10,r10,1
+       /* use a cpu feature mask if we ever change our slb size */
+       cmpldi  r10,SLB_NUM_ENTRIES
+
+       blt+    4f
+       li      r10,SLB_NUM_BOLTED
+
+4:
+       std     r10,PACASTABRR(r13)
+3:
+       /* r3 = faulting address, r10 = entry */
+
+       srdi    r9,r3,60                /* get region */
+       srdi    r3,r3,28                /* get esid */
+       cmpldi  cr7,r9,0xc              /* cmp KERNELBASE for later use */
+
+       rldimi  r10,r3,28,0             /* r10= ESID<<28 | entry */
+       oris    r10,r10,SLB_ESID_V@h    /* r10 |= SLB_ESID_V */
+
+       /* r3 = esid, r10 = esid_data, cr7 = <>KERNELBASE */
+
+       blt     cr7,0f                  /* user or kernel? */
+
+       /* kernel address: proto-VSID = ESID */
+       /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
+        * this code will generate the protoVSID 0xfffffffff for the
+        * top segment.  That's ok, the scramble below will translate
+        * it to VSID 0, which is reserved as a bad VSID - one which
+        * will never have any pages in it.  */
+       li      r11,SLB_VSID_KERNEL
+BEGIN_FTR_SECTION
+       bne     cr7,9f
+       li      r11,(SLB_VSID_KERNEL|SLB_VSID_L)
+END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
+       b       9f
+
+0:     /* user address: proto-VSID = context<<15 | ESID */
+       srdi.   r9,r3,USER_ESID_BITS
+       bne-    8f                      /* invalid ea bits set */
+
+#ifdef CONFIG_HUGETLB_PAGE
+BEGIN_FTR_SECTION
+       lhz     r9,PACAHIGHHTLBAREAS(r13)
+       srdi    r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT)
+       srd     r9,r9,r11
+       lhz     r11,PACALOWHTLBAREAS(r13)
+       srd     r11,r11,r3
+       or      r9,r9,r11
+END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
+#endif /* CONFIG_HUGETLB_PAGE */
+
+       li      r11,SLB_VSID_USER
+
+#ifdef CONFIG_HUGETLB_PAGE
+BEGIN_FTR_SECTION
+       rldimi  r11,r9,8,55             /* shift masked bit into SLB_VSID_L */
+END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
+#endif /* CONFIG_HUGETLB_PAGE */
+
+       ld      r9,PACACONTEXTID(r13)
+       rldimi  r3,r9,USER_ESID_BITS,0
+
+9:     /* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */
+       ASM_VSID_SCRAMBLE(r3,r9)
+
+       rldimi  r11,r3,SLB_VSID_SHIFT,16        /* combine VSID and flags */
+
+       /*
+        * No need for an isync before or after this slbmte. The exception
+        * we enter with and the rfid we exit with are context synchronizing.
+        */
+       slbmte  r11,r10
+
+       bgelr   cr7                     /* we're done for kernel addresses */
+
+       /* Update the slb cache */
+       lhz     r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */
+       cmpldi  r3,SLB_CACHE_ENTRIES
+       bge     1f
+
+       /* still room in the slb cache */
+       sldi    r11,r3,1                /* r11 = offset * sizeof(u16) */
+       rldicl  r10,r10,36,28           /* get low 16 bits of the ESID */
+       add     r11,r11,r13             /* r11 = (u16 *)paca + offset */
+       sth     r10,PACASLBCACHE(r11)   /* paca->slb_cache[offset] = esid */
+       addi    r3,r3,1                 /* offset++ */
+       b       2f
+1:                                     /* offset >= SLB_CACHE_ENTRIES */
+       li      r3,SLB_CACHE_ENTRIES+1
+2:
+       sth     r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */
+       blr
+
+8:     /* invalid EA */
+       li      r3,0                    /* BAD_VSID */
+       li      r11,SLB_VSID_USER       /* flags don't much matter */
+       b       9b
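The slot selection at the top of slb_allocate is a simple round robin over the non-bolted SLB entries, kept in paca->stab_rr. A C rendering of that logic as a sketch; the EX_SLB_NUM_ENTRIES / EX_SLB_NUM_BOLTED values are assumed for the demo, not taken from the headers.

#include <stdio.h>

#define EX_SLB_NUM_ENTRIES 64           /* assumed SLB size */
#define EX_SLB_NUM_BOLTED  3            /* kernel text, vmalloc, kernel stack */

static unsigned long stab_rr = EX_SLB_NUM_BOLTED;       /* paca->stab_rr */

static unsigned long next_slb_slot(void)
{
        unsigned long slot = stab_rr + 1;

        if (slot >= EX_SLB_NUM_ENTRIES)         /* wrap, skipping bolted slots */
                slot = EX_SLB_NUM_BOLTED;
        stab_rr = slot;
        return slot;
}

int main(void)
{
        int i;

        for (i = 0; i < 8; i++)
                printf("%lu ", next_slb_slot());
        printf("...\n");
        return 0;
}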
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
new file mode 100644 (file)
index 0000000..1b83f00
--- /dev/null
@@ -0,0 +1,279 @@
+/*
+ * PowerPC64 Segment Translation Support.
+ *
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ *    Copyright (c) 2001 Dave Engebretsen
+ *
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/paca.h>
+#include <asm/cputable.h>
+#include <asm/lmb.h>
+#include <asm/abs_addr.h>
+
+struct stab_entry {
+       unsigned long esid_data;
+       unsigned long vsid_data;
+};
+
+/* Both the segment table and SLB code use the following cache */
+#define NR_STAB_CACHE_ENTRIES 8
+DEFINE_PER_CPU(long, stab_cache_ptr);
+DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
+
+/*
+ * Create a segment table entry for the given esid/vsid pair.
+ */
+static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid)
+{
+       unsigned long esid_data, vsid_data;
+       unsigned long entry, group, old_esid, castout_entry, i;
+       unsigned int global_entry;
+       struct stab_entry *ste, *castout_ste;
+       unsigned long kernel_segment = (esid << SID_SHIFT) >= KERNELBASE;
+
+       vsid_data = vsid << STE_VSID_SHIFT;
+       esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V;
+       if (! kernel_segment)
+               esid_data |= STE_ESID_KS;
+
+       /* Search the primary group first. */
+       global_entry = (esid & 0x1f) << 3;
+       ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
+
+       /* Find an empty entry, if one exists. */
+       for (group = 0; group < 2; group++) {
+               for (entry = 0; entry < 8; entry++, ste++) {
+                       if (!(ste->esid_data & STE_ESID_V)) {
+                               ste->vsid_data = vsid_data;
+                               asm volatile("eieio":::"memory");
+                               ste->esid_data = esid_data;
+                               return (global_entry | entry);
+                       }
+               }
+               /* Now search the secondary group. */
+               global_entry = ((~esid) & 0x1f) << 3;
+               ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
+       }
+
+       /*
+        * Could not find empty entry, pick one with a round robin selection.
+        * Search all entries in the two groups.
+        */
+       castout_entry = get_paca()->stab_rr;
+       for (i = 0; i < 16; i++) {
+               if (castout_entry < 8) {
+                       global_entry = (esid & 0x1f) << 3;
+                       ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
+                       castout_ste = ste + castout_entry;
+               } else {
+                       global_entry = ((~esid) & 0x1f) << 3;
+                       ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
+                       castout_ste = ste + (castout_entry - 8);
+               }
+
+               /* Don't cast out the first kernel segment */
+               if ((castout_ste->esid_data & ESID_MASK) != KERNELBASE)
+                       break;
+
+               castout_entry = (castout_entry + 1) & 0xf;
+       }
+
+       get_paca()->stab_rr = (castout_entry + 1) & 0xf;
+
+       /* Modify the old entry to the new value. */
+
+       /* Force previous translations to complete. DRENG */
+       asm volatile("isync" : : : "memory");
+
+       old_esid = castout_ste->esid_data >> SID_SHIFT;
+       castout_ste->esid_data = 0;             /* Invalidate old entry */
+
+       asm volatile("sync" : : : "memory");    /* Order update */
+
+       castout_ste->vsid_data = vsid_data;
+       asm volatile("eieio" : : : "memory");   /* Order update */
+       castout_ste->esid_data = esid_data;
+
+       asm volatile("slbie  %0" : : "r" (old_esid << SID_SHIFT));
+       /* Ensure completion of slbie */
+       asm volatile("sync" : : : "memory");
+
+       return (global_entry | (castout_entry & 0x7));
+}
+
+/*
+ * Allocate a segment table entry for the given ea and mm
+ */
+static int __ste_allocate(unsigned long ea, struct mm_struct *mm)
+{
+       unsigned long vsid;
+       unsigned char stab_entry;
+       unsigned long offset;
+
+       /* Kernel or user address? */
+       if (ea >= KERNELBASE) {
+               vsid = get_kernel_vsid(ea);
+       } else {
+               if ((ea >= TASK_SIZE_USER64) || (! mm))
+                       return 1;
+
+               vsid = get_vsid(mm->context.id, ea);
+       }
+
+       stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid);
+
+       if (ea < KERNELBASE) {
+               offset = __get_cpu_var(stab_cache_ptr);
+               if (offset < NR_STAB_CACHE_ENTRIES)
+                       __get_cpu_var(stab_cache[offset++]) = stab_entry;
+               else
+                       offset = NR_STAB_CACHE_ENTRIES+1;
+               __get_cpu_var(stab_cache_ptr) = offset;
+
+               /* Order update */
+               asm volatile("sync":::"memory");
+       }
+
+       return 0;
+}
+
+int ste_allocate(unsigned long ea)
+{
+       return __ste_allocate(ea, current->mm);
+}
+
+/*
+ * Do the segment table work for a context switch: flush all user
+ * entries from the table, then preload some probably useful entries
+ * for the new task
+ */
+void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
+{
+       struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
+       struct stab_entry *ste;
+       unsigned long offset = __get_cpu_var(stab_cache_ptr);
+       unsigned long pc = KSTK_EIP(tsk);
+       unsigned long stack = KSTK_ESP(tsk);
+       unsigned long unmapped_base;
+
+       /* Force previous translations to complete. DRENG */
+       asm volatile("isync" : : : "memory");
+
+       if (offset <= NR_STAB_CACHE_ENTRIES) {
+               int i;
+
+               for (i = 0; i < offset; i++) {
+                       ste = stab + __get_cpu_var(stab_cache[i]);
+                       ste->esid_data = 0; /* invalidate entry */
+               }
+       } else {
+               unsigned long entry;
+
+               /* Invalidate all entries. */
+               ste = stab;
+
+               /* Never flush the first entry. */
+               ste += 1;
+               for (entry = 1;
+                    entry < (PAGE_SIZE / sizeof(struct stab_entry));
+                    entry++, ste++) {
+                       unsigned long ea;
+                       ea = ste->esid_data & ESID_MASK;
+                       if (ea < KERNELBASE) {
+                               ste->esid_data = 0;
+                       }
+               }
+       }
+
+       asm volatile("sync; slbia; sync":::"memory");
+
+       __get_cpu_var(stab_cache_ptr) = 0;
+
+       /* Now preload some entries for the new task */
+       if (test_tsk_thread_flag(tsk, TIF_32BIT))
+               unmapped_base = TASK_UNMAPPED_BASE_USER32;
+       else
+               unmapped_base = TASK_UNMAPPED_BASE_USER64;
+
+       __ste_allocate(pc, mm);
+
+       if (GET_ESID(pc) == GET_ESID(stack))
+               return;
+
+       __ste_allocate(stack, mm);
+
+       if ((GET_ESID(pc) == GET_ESID(unmapped_base))
+           || (GET_ESID(stack) == GET_ESID(unmapped_base)))
+               return;
+
+       __ste_allocate(unmapped_base, mm);
+
+       /* Order update */
+       asm volatile("sync" : : : "memory");
+}
+
+extern void slb_initialize(void);
+
+/*
+ * Allocate segment tables for secondary CPUs.  These must all go in
+ * the first (bolted) segment, so that do_stab_bolted won't get a
+ * recursive segment miss on the segment table itself.
+ */
+void stabs_alloc(void)
+{
+       int cpu;
+
+       if (cpu_has_feature(CPU_FTR_SLB))
+               return;
+
+       for_each_cpu(cpu) {
+               unsigned long newstab;
+
+               if (cpu == 0)
+                       continue; /* stab for CPU 0 is statically allocated */
+
+               newstab = lmb_alloc_base(PAGE_SIZE, PAGE_SIZE, 1<<SID_SHIFT);
+               if (! newstab)
+                       panic("Unable to allocate segment table for CPU %d.\n",
+                             cpu);
+
+               newstab += KERNELBASE;
+
+               memset((void *)newstab, 0, PAGE_SIZE);
+
+               paca[cpu].stab_addr = newstab;
+               paca[cpu].stab_real = virt_to_abs(newstab);
+               printk(KERN_DEBUG "Segment table for CPU %d at 0x%lx virtual, 0x%lx absolute\n", cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
+       }
+}
+
+/*
+ * Build an entry for the base kernel segment and put it into
+ * the segment table or SLB.  All other segment table or SLB
+ * entries are faulted in.
+ */
+void stab_initialize(unsigned long stab)
+{
+       unsigned long vsid = get_kernel_vsid(KERNELBASE);
+
+       if (cpu_has_feature(CPU_FTR_SLB)) {
+               slb_initialize();
+       } else {
+               asm volatile("isync; slbia; isync":::"memory");
+               make_ste(stab, GET_ESID(KERNELBASE), vsid);
+
+               /* Order update */
+               asm volatile("sync":::"memory");
+       }
+}
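make_ste() above addresses the segment table as 32 groups of 8 sixteen-byte entries: the primary group is selected by the low five ESID bits and the secondary group by their complement, which is where the shift by 7 (128-byte group stride) comes from. A worked example with an arbitrary ESID, as a small standalone sketch:

#include <stdio.h>

int main(void)
{
        unsigned long stab = 0;                 /* pretend segment table base */
        unsigned long esid = 0x13;              /* arbitrary example ESID */

        /* 8 entries * 16 bytes = 128 bytes per group, hence the shift by 7 */
        unsigned long primary   = stab | ((esid & 0x1f) << 7);
        unsigned long secondary = stab | (((~esid) & 0x1f) << 7);

        printf("primary group at stab+%#lx, secondary at stab+%#lx\n",
               primary, secondary);
        return 0;
}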
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
new file mode 100644 (file)
index 0000000..09ab81a
--- /dev/null
@@ -0,0 +1,196 @@
+/*
+ * This file contains the routines for flushing entries from the
+ * TLB and MMU hash table.
+ *
+ *  Derived from arch/ppc64/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *  Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Dave Engebretsen <engebret@us.ibm.com>
+ *      Rework for PPC64 port.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <linux/highmem.h>
+
+DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
+
+/* This is declared as we are using the more or less generic
+ * include/asm-ppc64/tlb.h file -- tgall
+ */
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+unsigned long pte_freelist_forced_free;
+
+struct pte_freelist_batch
+{
+       struct rcu_head rcu;
+       unsigned int    index;
+       pgtable_free_t  tables[0];
+};
+
+#define PTE_FREELIST_SIZE \
+       ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
+         / sizeof(pgtable_free_t))
+
+#ifdef CONFIG_SMP
+static void pte_free_smp_sync(void *arg)
+{
+       /* Do nothing, just ensure we sync with all CPUs */
+}
+#endif
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+static void pgtable_free_now(pgtable_free_t pgf)
+{
+       pte_freelist_forced_free++;
+
+       smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+       pgtable_free(pgf);
+}
+
+static void pte_free_rcu_callback(struct rcu_head *head)
+{
+       struct pte_freelist_batch *batch =
+               container_of(head, struct pte_freelist_batch, rcu);
+       unsigned int i;
+
+       for (i = 0; i < batch->index; i++)
+               pgtable_free(batch->tables[i]);
+
+       free_page((unsigned long)batch);
+}
+
+static void pte_free_submit(struct pte_freelist_batch *batch)
+{
+       INIT_RCU_HEAD(&batch->rcu);
+       call_rcu(&batch->rcu, pte_free_rcu_callback);
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+{
+       /* This is safe as we are holding page_table_lock */
+        cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
+       struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+       if (atomic_read(&tlb->mm->mm_users) < 2 ||
+           cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
+               pgtable_free(pgf);
+               return;
+       }
+
+       if (*batchp == NULL) {
+               *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
+               if (*batchp == NULL) {
+                       pgtable_free_now(pgf);
+                       return;
+               }
+               (*batchp)->index = 0;
+       }
+       (*batchp)->tables[(*batchp)->index++] = pgf;
+       if ((*batchp)->index == PTE_FREELIST_SIZE) {
+               pte_free_submit(*batchp);
+               *batchp = NULL;
+       }
+}
+
+/*
+ * Update the MMU hash table to correspond with a change to
+ * a Linux PTE.  If wrprot is true, it is permissible to
+ * change the existing HPTE to read-only rather than removing it
+ * (if we remove it we should clear the _PAGE_HPTEFLAGS bits).
+ */
+void hpte_update(struct mm_struct *mm, unsigned long addr,
+                unsigned long pte, int wrprot)
+{
+       struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+       unsigned long vsid;
+       int i;
+
+       i = batch->index;
+
+       /*
+        * This can happen when we are in the middle of a TLB batch and
+        * we encounter memory pressure (eg copy_page_range when it tries
+        * to allocate a new pte). If we have to reclaim memory and end
+        * up scanning and resetting referenced bits then our batch context
+        * will change mid stream.
+        */
+       if (i != 0 && (mm != batch->mm || batch->large != pte_huge(pte))) {
+               flush_tlb_pending();
+               i = 0;
+       }
+       if (i == 0) {
+               batch->mm = mm;
+               batch->large = pte_huge(pte);
+       }
+       if (addr < KERNELBASE) {
+               vsid = get_vsid(mm->context.id, addr);
+               WARN_ON(vsid == 0);
+       } else
+               vsid = get_kernel_vsid(addr);
+       batch->vaddr[i] = (vsid << 28) | (addr & 0x0fffffff);
+       batch->pte[i] = __pte(pte);
+       batch->index = ++i;
+       if (i >= PPC64_TLB_BATCH_NR)
+               flush_tlb_pending();
+}
+
+void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
+{
+       int i;
+       int cpu;
+       cpumask_t tmp;
+       int local = 0;
+
+       BUG_ON(in_interrupt());
+
+       cpu = get_cpu();
+       i = batch->index;
+       tmp = cpumask_of_cpu(cpu);
+       if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
+               local = 1;
+
+       if (i == 1)
+               flush_hash_page(batch->vaddr[0], batch->pte[0], local);
+       else
+               flush_hash_range(i, local);
+       batch->index = 0;
+       put_cpu();
+}
+
+void pte_free_finish(void)
+{
+       /* This is safe as we are holding page_table_lock */
+       struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+       if (*batchp == NULL)
+               return;
+       pte_free_submit(*batchp);
+       *batchp = NULL;
+}
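Both pgtable_free_tlb() and hpte_update() in tlb_64.c follow the same pattern: queue work into a per-CPU batch and flush it either when the batch fills or at an explicit flush point. A user-space sketch of that pattern; EX_BATCH_NR and the printf flush are placeholders, not kernel interfaces.

#include <stdio.h>

#define EX_BATCH_NR 4   /* stands in for PPC64_TLB_BATCH_NR / PTE_FREELIST_SIZE */

static unsigned long batch[EX_BATCH_NR];
static int batch_index;

static void flush_batch(void)
{
        int i;

        for (i = 0; i < batch_index; i++)
                printf("flush %#lx\n", batch[i]);       /* e.g. flush_hash_range() */
        batch_index = 0;
}

static void batch_add(unsigned long va)
{
        batch[batch_index++] = va;
        if (batch_index == EX_BATCH_NR)         /* batch full: flush eagerly */
                flush_batch();
}

int main(void)
{
        unsigned long va;

        for (va = 0x1000; va < 0x7000; va += 0x1000)
                batch_add(va);
        flush_batch();                          /* flush the remainder */
        return 0;
}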
index fa889204d6ae2a2ddcbd0c97d342f64ba44d08c0..4a9928ef3032de74900cfcb9e79147046987eb84 100644 (file)
@@ -83,7 +83,7 @@ head-y := arch/ppc64/kernel/head.o
 
 libs-y                         += arch/ppc64/lib/
 core-y                         += arch/ppc64/kernel/ arch/powerpc/kernel/
-core-y                         += arch/ppc64/mm/
+core-y                         += arch/powerpc/mm/
 core-y                         += arch/powerpc/platforms/
 core-$(CONFIG_XMON)            += arch/ppc64/xmon/
 drivers-$(CONFIG_OPROFILE)     += arch/powerpc/oprofile/
diff --git a/arch/ppc64/mm/Makefile b/arch/ppc64/mm/Makefile
deleted file mode 100644 (file)
index 3695d00..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#
-# Makefile for the linux ppc-specific parts of the memory manager.
-#
-
-EXTRA_CFLAGS += -mno-minimal-toc
-
-obj-y := fault.o init.o imalloc.o hash_utils.o hash_low.o tlb.o \
-       slb_low.o slb.o stab.o mmap.o
-obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_PPC_MULTIPLATFORM) += hash_native.o
diff --git a/arch/ppc64/mm/fault.c b/arch/ppc64/mm/fault.c
deleted file mode 100644 (file)
index be3f25c..0000000
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- *  arch/ppc/mm/fault.c
- *
- *  PowerPC version 
- *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- *  Derived from "arch/i386/mm/fault.c"
- *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
- *
- *  Modified by Cort Dougan and Paul Mackerras.
- *
- *  Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- */
-
-#include <linux/config.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/smp_lock.h>
-#include <linux/module.h>
-#include <linux/kprobes.h>
-
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/kdebug.h>
-#include <asm/siginfo.h>
-
-/*
- * Check whether the instruction at regs->nip is a store using
- * an update addressing form which will update r1.
- */
-static int store_updates_sp(struct pt_regs *regs)
-{
-       unsigned int inst;
-
-       if (get_user(inst, (unsigned int __user *)regs->nip))
-               return 0;
-       /* check for 1 in the rA field */
-       if (((inst >> 16) & 0x1f) != 1)
-               return 0;
-       /* check major opcode */
-       switch (inst >> 26) {
-       case 37:        /* stwu */
-       case 39:        /* stbu */
-       case 45:        /* sthu */
-       case 53:        /* stfsu */
-       case 55:        /* stfdu */
-               return 1;
-       case 62:        /* std or stdu */
-               return (inst & 3) == 1;
-       case 31:
-               /* check minor opcode */
-               switch ((inst >> 1) & 0x3ff) {
-               case 181:       /* stdux */
-               case 183:       /* stwux */
-               case 247:       /* stbux */
-               case 439:       /* sthux */
-               case 695:       /* stfsux */
-               case 759:       /* stfdux */
-                       return 1;
-               }
-       }
-       return 0;
-}
-
-static void do_dabr(struct pt_regs *regs, unsigned long error_code)
-{
-       siginfo_t info;
-
-       if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
-                       11, SIGSEGV) == NOTIFY_STOP)
-               return;
-
-       if (debugger_dabr_match(regs))
-               return;
-
-       /* Clear the DABR */
-       set_dabr(0);
-
-       /* Deliver the signal to userspace */
-       info.si_signo = SIGTRAP;
-       info.si_errno = 0;
-       info.si_code = TRAP_HWBKPT;
-       info.si_addr = (void __user *)regs->nip;
-       force_sig_info(SIGTRAP, &info, current);
-}
-
-/*
- * The error_code parameter is
- *  - DSISR for a non-SLB data access fault,
- *  - SRR1 & 0x08000000 for a non-SLB instruction access fault
- *  - 0 any SLB fault.
- * The return value is 0 if the fault was handled, or the signal
- * number if this is a kernel fault that can't be handled here.
- */
-int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
-                           unsigned long error_code)
-{
-       struct vm_area_struct * vma;
-       struct mm_struct *mm = current->mm;
-       siginfo_t info;
-       unsigned long code = SEGV_MAPERR;
-       unsigned long is_write = error_code & DSISR_ISSTORE;
-       unsigned long trap = TRAP(regs);
-       unsigned long is_exec = trap == 0x400;
-
-       BUG_ON((trap == 0x380) || (trap == 0x480));
-
-       if (notify_die(DIE_PAGE_FAULT, "page_fault", regs, error_code,
-                               11, SIGSEGV) == NOTIFY_STOP)
-               return 0;
-
-       if (trap == 0x300) {
-               if (debugger_fault_handler(regs))
-                       return 0;
-       }
-
-       /* On a kernel SLB miss we can only check for a valid exception entry */
-       if (!user_mode(regs) && (address >= TASK_SIZE))
-               return SIGSEGV;
-
-       if (error_code & DSISR_DABRMATCH) {
-               do_dabr(regs, error_code);
-               return 0;
-       }
-
-       if (in_atomic() || mm == NULL) {
-               if (!user_mode(regs))
-                       return SIGSEGV;
-               /* in_atomic() in user mode is really bad,
-                  as is current->mm == NULL. */
-               printk(KERN_EMERG "Page fault in user mode with"
-                      "in_atomic() = %d mm = %p\n", in_atomic(), mm);
-               printk(KERN_EMERG "NIP = %lx  MSR = %lx\n",
-                      regs->nip, regs->msr);
-               die("Weird page fault", regs, SIGSEGV);
-       }
-
-       /* When running in the kernel we expect faults to occur only to
-        * addresses in user space.  All other faults represent errors in the
-        * kernel and should generate an OOPS.  Unfortunatly, in the case of an
-        * erroneous fault occuring in a code path which already holds mmap_sem
-        * we will deadlock attempting to validate the fault against the
-        * address space.  Luckily the kernel only validly references user
-        * space from well defined areas of code, which are listed in the
-        * exceptions table.
-        *
-        * As the vast majority of faults will be valid we will only perform
-        * the source reference check when there is a possibilty of a deadlock.
-        * Attempt to lock the address space, if we cannot we then validate the
-        * source.  If this is invalid we can skip the address space check,
-        * thus avoiding the deadlock.
-        */
-       if (!down_read_trylock(&mm->mmap_sem)) {
-               if (!user_mode(regs) && !search_exception_tables(regs->nip))
-                       goto bad_area_nosemaphore;
-
-               down_read(&mm->mmap_sem);
-       }
-
-       vma = find_vma(mm, address);
-       if (!vma)
-               goto bad_area;
-
-       if (vma->vm_start <= address) {
-               goto good_area;
-       }
-       if (!(vma->vm_flags & VM_GROWSDOWN))
-               goto bad_area;
-
-       /*
-        * N.B. The POWER/Open ABI allows programs to access up to
-        * 288 bytes below the stack pointer.
-        * The kernel signal delivery code writes up to about 1.5kB
-        * below the stack pointer (r1) before decrementing it.
-        * The exec code can write slightly over 640kB to the stack
-        * before setting the user r1.  Thus we allow the stack to
-        * expand to 1MB without further checks.
-        */
-       if (address + 0x100000 < vma->vm_end) {
-               /* get user regs even if this fault is in kernel mode */
-               struct pt_regs *uregs = current->thread.regs;
-               if (uregs == NULL)
-                       goto bad_area;
-
-               /*
-                * A user-mode access to an address a long way below
-                * the stack pointer is only valid if the instruction
-                * is one which would update the stack pointer to the
-                * address accessed if the instruction completed,
-                * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
-                * (or the byte, halfword, float or double forms).
-                *
-                * If we don't check this then any write to the area
-                * between the last mapped region and the stack will
-                * expand the stack rather than segfaulting.
-                */
-               if (address + 2048 < uregs->gpr[1]
-                   && (!user_mode(regs) || !store_updates_sp(regs)))
-                       goto bad_area;
-       }
-
-       if (expand_stack(vma, address))
-               goto bad_area;
-
-good_area:
-       code = SEGV_ACCERR;
-
-       if (is_exec) {
-               /* protection fault */
-               if (error_code & DSISR_PROTFAULT)
-                       goto bad_area;
-               if (!(vma->vm_flags & VM_EXEC))
-                       goto bad_area;
-       /* a write */
-       } else if (is_write) {
-               if (!(vma->vm_flags & VM_WRITE))
-                       goto bad_area;
-       /* a read */
-       } else {
-               if (!(vma->vm_flags & VM_READ))
-                       goto bad_area;
-       }
-
- survive:
-       /*
-        * If for any reason at all we couldn't handle the fault,
-        * make sure we exit gracefully rather than endlessly redo
-        * the fault.
-        */
-       switch (handle_mm_fault(mm, vma, address, is_write)) {
-
-       case VM_FAULT_MINOR:
-               current->min_flt++;
-               break;
-       case VM_FAULT_MAJOR:
-               current->maj_flt++;
-               break;
-       case VM_FAULT_SIGBUS:
-               goto do_sigbus;
-       case VM_FAULT_OOM:
-               goto out_of_memory;
-       default:
-               BUG();
-       }
-
-       up_read(&mm->mmap_sem);
-       return 0;
-
-bad_area:
-       up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-       /* User mode accesses cause a SIGSEGV */
-       if (user_mode(regs)) {
-               info.si_signo = SIGSEGV;
-               info.si_errno = 0;
-               info.si_code = code;
-               info.si_addr = (void __user *) address;
-               force_sig_info(SIGSEGV, &info, current);
-               return 0;
-       }
-
-       if (trap == 0x400 && (error_code & DSISR_PROTFAULT)
-           && printk_ratelimit())
-               printk(KERN_CRIT "kernel tried to execute NX-protected"
-                      " page (%lx) - exploit attempt? (uid: %d)\n",
-                      address, current->uid);
-
-       return SIGSEGV;
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
-       up_read(&mm->mmap_sem);
-       if (current->pid == 1) {
-               yield();
-               down_read(&mm->mmap_sem);
-               goto survive;
-       }
-       printk("VM: killing process %s\n", current->comm);
-       if (user_mode(regs))
-               do_exit(SIGKILL);
-       return SIGKILL;
-
-do_sigbus:
-       up_read(&mm->mmap_sem);
-       if (user_mode(regs)) {
-               info.si_signo = SIGBUS;
-               info.si_errno = 0;
-               info.si_code = BUS_ADRERR;
-               info.si_addr = (void __user *)address;
-               force_sig_info(SIGBUS, &info, current);
-               return 0;
-       }
-       return SIGBUS;
-}
-
-/*
- * bad_page_fault is called when we have a bad access from the kernel.
- * It is called from do_page_fault above and from some of the procedures
- * in traps.c.
- */
-void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
-{
-       const struct exception_table_entry *entry;
-
-       /* Are we prepared to handle this fault?  */
-       if ((entry = search_exception_tables(regs->nip)) != NULL) {
-               regs->nip = entry->fixup;
-               return;
-       }
-
-       /* kernel has accessed a bad area */
-       die("Kernel access of bad area", regs, sig);
-}
diff --git a/arch/ppc64/mm/hash_low.S b/arch/ppc64/mm/hash_low.S
deleted file mode 100644 (file)
index ee5a5d3..0000000
+++ /dev/null
@@ -1,288 +0,0 @@
-/*
- * ppc64 MMU hashtable management routines
- *
- * (c) Copyright IBM Corp. 2003
- *
- * Maintained by: Benjamin Herrenschmidt
- *                <benh@kernel.crashing.org>
- *
- * This file is covered by the GNU Public Licence v2 as
- * described in the kernel's COPYING file.
- */
-
-#include <asm/processor.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/page.h>
-#include <asm/types.h>
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/cputable.h>
-
-       .text
-
-/*
- * Stackframe:
- *             
- *         +-> Back chain                      (SP + 256)
- *         |   General register save area      (SP + 112)
- *         |   Parameter save area             (SP + 48)
- *         |   TOC save area                   (SP + 40)
- *         |   link editor doubleword          (SP + 32)
- *         |   compiler doubleword             (SP + 24)
- *         |   LR save area                    (SP + 16)
- *         |   CR save area                    (SP + 8)
- * SP ---> +-- Back chain                      (SP + 0)
- */
-#define STACKFRAMESIZE 256
-
-/* Save parameters offsets */
-#define STK_PARM(i)    (STACKFRAMESIZE + 48 + ((i)-3)*8)
-
-/* Save non-volatile offsets */
-#define STK_REG(i)     (112 + ((i)-14)*8)
-
-/*
- * _hash_page(unsigned long ea, unsigned long access, unsigned long vsid,
- *             pte_t *ptep, unsigned long trap, int local)
- *
- * Adds a page to the hash table. This is the non-LPAR version for now
- */
-
-_GLOBAL(__hash_page)
-       mflr    r0
-       std     r0,16(r1)
-       stdu    r1,-STACKFRAMESIZE(r1)
-       /* Save all params that we need after a function call */
-       std     r6,STK_PARM(r6)(r1)
-       std     r8,STK_PARM(r8)(r1)
-       
-       /* Add _PAGE_PRESENT to access */
-       ori     r4,r4,_PAGE_PRESENT
-
-       /* Save non-volatile registers.
-        * r31 will hold "old PTE"
-        * r30 is "new PTE"
-        * r29 is "va"
-        * r28 is a hash value
-        * r27 is hashtab mask (maybe dynamic patched instead ?)
-        */
-       std     r27,STK_REG(r27)(r1)
-       std     r28,STK_REG(r28)(r1)
-       std     r29,STK_REG(r29)(r1)
-       std     r30,STK_REG(r30)(r1)
-       std     r31,STK_REG(r31)(r1)
-       
-       /* Step 1:
-        *
-        * Check permissions, atomically mark the linux PTE busy
-        * and hashed.
-        */ 
-1:
-       ldarx   r31,0,r6
-       /* Check access rights (access & ~(pte_val(*ptep))) */
-       andc.   r0,r4,r31
-       bne-    htab_wrong_access
-       /* Check if PTE is busy */
-       andi.   r0,r31,_PAGE_BUSY
-       /* If so, just bail out and refault if needed. Someone else
-        * is changing this PTE anyway and might hash it.
-        */
-       bne-    bail_ok
-       /* Prepare new PTE value (turn access RW into DIRTY, then
-        * add BUSY,HASHPTE and ACCESSED)
-        */
-       rlwinm  r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */
-       or      r30,r30,r31
-       ori     r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
-       /* Write the linux PTE atomically (setting busy) */
-       stdcx.  r30,0,r6
-       bne-    1b
-       isync
-
-       /* Step 2:
-        *
-        * Insert/Update the HPTE in the hash table. At this point,
-        * r4 (access) is re-useable, we use it for the new HPTE flags
-        */
-
-       /* Calc va and put it in r29 */
-       rldicr  r29,r5,28,63-28
-       rldicl  r3,r3,0,36
-       or      r29,r3,r29
-
-       /* Calculate hash value for primary slot and store it in r28 */
-       rldicl  r5,r5,0,25              /* vsid & 0x0000007fffffffff */
-       rldicl  r0,r3,64-12,48          /* (ea >> 12) & 0xffff */
-       xor     r28,r5,r0
-
-       /* Convert linux PTE bits into HW equivalents */
-       andi.   r3,r30,0x1fe            /* Get basic set of flags */
-       xori    r3,r3,HW_NO_EXEC        /* _PAGE_EXEC -> NOEXEC */
-       rlwinm  r0,r30,32-9+1,30,30     /* _PAGE_RW -> _PAGE_USER (r0) */
-       rlwinm  r4,r30,32-7+1,30,30     /* _PAGE_DIRTY -> _PAGE_USER (r4) */
-       and     r0,r0,r4                /* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */
-       andc    r0,r30,r0               /* r0 = pte & ~r0 */
-       rlwimi  r3,r0,32-1,31,31        /* Insert result into PP lsb */
-
-       /* We eventually do the icache sync here (maybe inline that
-        * code rather than call a C function...) 
-        */
-BEGIN_FTR_SECTION
-       mr      r4,r30
-       mr      r5,r7
-       bl      .hash_page_do_lazy_icache
-END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
-
-       /* At this point, r3 contains new PP bits, save them in
-        * place of "access" in the param area (sic)
-        */
-       std     r3,STK_PARM(r4)(r1)
-
-       /* Get htab_hash_mask */
-       ld      r4,htab_hash_mask@got(2)
-       ld      r27,0(r4)       /* htab_hash_mask -> r27 */
-
-       /* Check if we may already be in the hashtable, in this case, we
-        * go to out-of-line code to try to modify the HPTE
-        */
-       andi.   r0,r31,_PAGE_HASHPTE
-       bne     htab_modify_pte
-
-htab_insert_pte:
-       /* Clear hpte bits in new pte (we also clear BUSY btw) and
-        * add _PAGE_HASHPTE
-        */
-       lis     r0,_PAGE_HPTEFLAGS@h
-       ori     r0,r0,_PAGE_HPTEFLAGS@l
-       andc    r30,r30,r0
-       ori     r30,r30,_PAGE_HASHPTE
-
-       /* page number in r5 */
-       rldicl  r5,r31,64-PTE_SHIFT,PTE_SHIFT
-
-       /* Calculate primary group hash */
-       and     r0,r28,r27
-       rldicr  r3,r0,3,63-3    /* r0 = (hash & mask) << 3 */
-
-       /* Call ppc_md.hpte_insert */
-       ld      r7,STK_PARM(r4)(r1)     /* Retreive new pp bits */
-       mr      r4,r29                  /* Retreive va */
-       li      r6,0                    /* no vflags */
-_GLOBAL(htab_call_hpte_insert1)
-       bl      .                       /* Will be patched by htab_finish_init() */
-       cmpdi   0,r3,0
-       bge     htab_pte_insert_ok      /* Insertion successful */
-       cmpdi   0,r3,-2                 /* Critical failure */
-       beq-    htab_pte_insert_failure
-
-       /* Now try secondary slot */
-       
-       /* page number in r5 */
-       rldicl  r5,r31,64-PTE_SHIFT,PTE_SHIFT
-
-       /* Calculate secondary group hash */
-       andc    r0,r27,r28
-       rldicr  r3,r0,3,63-3    /* r0 = (~hash & mask) << 3 */
-       
-       /* Call ppc_md.hpte_insert */
-       ld      r7,STK_PARM(r4)(r1)     /* Retreive new pp bits */
-       mr      r4,r29                  /* Retreive va */
-       li      r6,HPTE_V_SECONDARY@l   /* secondary slot */
-_GLOBAL(htab_call_hpte_insert2)
-       bl      .                       /* Will be patched by htab_finish_init() */
-       cmpdi   0,r3,0
-       bge+    htab_pte_insert_ok      /* Insertion successful */
-       cmpdi   0,r3,-2                 /* Critical failure */
-       beq-    htab_pte_insert_failure
-
-       /* Both are full, we need to evict something */
-       mftb    r0
-       /* Pick a random group based on TB */
-       andi.   r0,r0,1
-       mr      r5,r28
-       bne     2f
-       not     r5,r5
-2:     and     r0,r5,r27
-       rldicr  r3,r0,3,63-3    /* r0 = (hash & mask) << 3 */   
-       /* Call ppc_md.hpte_remove */
-_GLOBAL(htab_call_hpte_remove)
-       bl      .                       /* Will be patched by htab_finish_init() */
-
-       /* Try all again */
-       b       htab_insert_pte 
-
-bail_ok:
-       li      r3,0
-       b       bail
-
-htab_pte_insert_ok:
-       /* Insert slot number & secondary bit in PTE */
-       rldimi  r30,r3,12,63-15
-               
-       /* Write out the PTE with a normal write
-        * (maybe add eieio may be good still ?)
-        */
-htab_write_out_pte:
-       ld      r6,STK_PARM(r6)(r1)
-       std     r30,0(r6)
-       li      r3, 0
-bail:
-       ld      r27,STK_REG(r27)(r1)
-       ld      r28,STK_REG(r28)(r1)
-       ld      r29,STK_REG(r29)(r1)
-       ld      r30,STK_REG(r30)(r1)
-       ld      r31,STK_REG(r31)(r1)
-       addi    r1,r1,STACKFRAMESIZE
-       ld      r0,16(r1)
-       mtlr    r0
-       blr
-
-htab_modify_pte:
-       /* Keep PP bits in r4 and slot idx from the PTE around in r3 */
-       mr      r4,r3
-       rlwinm  r3,r31,32-12,29,31
-
-       /* Secondary group ? if yes, get a inverted hash value */
-       mr      r5,r28
-       andi.   r0,r31,_PAGE_SECONDARY
-       beq     1f
-       not     r5,r5
-1:
-       /* Calculate proper slot value for ppc_md.hpte_updatepp */
-       and     r0,r5,r27
-       rldicr  r0,r0,3,63-3    /* r0 = (hash & mask) << 3 */
-       add     r3,r0,r3        /* add slot idx */
-
-       /* Call ppc_md.hpte_updatepp */
-       mr      r5,r29                  /* va */
-       li      r6,0                    /* large is 0 */
-       ld      r7,STK_PARM(r8)(r1)     /* get "local" param */
-_GLOBAL(htab_call_hpte_updatepp)
-       bl      .                       /* Will be patched by htab_finish_init() */
-
-       /* if we failed because typically the HPTE wasn't really here
-        * we try an insertion. 
-        */
-       cmpdi   0,r3,-1
-       beq-    htab_insert_pte
-
-       /* Clear the BUSY bit and Write out the PTE */
-       li      r0,_PAGE_BUSY
-       andc    r30,r30,r0
-       b       htab_write_out_pte
-
-htab_wrong_access:
-       /* Bail out clearing reservation */
-       stdcx.  r31,0,r6
-       li      r3,1
-       b       bail
-
-htab_pte_insert_failure:
-       /* Bail out restoring old PTE */
-       ld      r6,STK_PARM(r6)(r1)
-       std     r31,0(r6)
-       li      r3,-1
-       b       bail
-
-
diff --git a/arch/ppc64/mm/hash_native.c b/arch/ppc64/mm/hash_native.c
deleted file mode 100644 (file)
index 174d145..0000000
+++ /dev/null
@@ -1,446 +0,0 @@
-/*
- * native hashtable management.
- *
- * SMP scalability work:
- *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- * 
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/spinlock.h>
-#include <linux/bitops.h>
-#include <linux/threads.h>
-#include <linux/smp.h>
-
-#include <asm/abs_addr.h>
-#include <asm/machdep.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/tlb.h>
-#include <asm/cputable.h>
-
-#define HPTE_LOCK_BIT 3
-
-static DEFINE_SPINLOCK(native_tlbie_lock);
-
-static inline void native_lock_hpte(hpte_t *hptep)
-{
-       unsigned long *word = &hptep->v;
-
-       while (1) {
-               if (!test_and_set_bit(HPTE_LOCK_BIT, word))
-                       break;
-               while(test_bit(HPTE_LOCK_BIT, word))
-                       cpu_relax();
-       }
-}
-
-static inline void native_unlock_hpte(hpte_t *hptep)
-{
-       unsigned long *word = &hptep->v;
-
-       asm volatile("lwsync":::"memory");
-       clear_bit(HPTE_LOCK_BIT, word);
-}
-
-long native_hpte_insert(unsigned long hpte_group, unsigned long va,
-                       unsigned long prpn, unsigned long vflags,
-                       unsigned long rflags)
-{
-       hpte_t *hptep = htab_address + hpte_group;
-       unsigned long hpte_v, hpte_r;
-       int i;
-
-       for (i = 0; i < HPTES_PER_GROUP; i++) {
-               if (! (hptep->v & HPTE_V_VALID)) {
-                       /* retry with lock held */
-                       native_lock_hpte(hptep);
-                       if (! (hptep->v & HPTE_V_VALID))
-                               break;
-                       native_unlock_hpte(hptep);
-               }
-
-               hptep++;
-       }
-
-       if (i == HPTES_PER_GROUP)
-               return -1;
-
-       hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID;
-       if (vflags & HPTE_V_LARGE)
-               va &= ~(1UL << HPTE_V_AVPN_SHIFT);
-       hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags;
-
-       hptep->r = hpte_r;
-       /* Guarantee the second dword is visible before the valid bit */
-       __asm__ __volatile__ ("eieio" : : : "memory");
-       /*
-        * Now set the first dword including the valid bit
-        * NOTE: this also unlocks the hpte
-        */
-       hptep->v = hpte_v;
-
-       __asm__ __volatile__ ("ptesync" : : : "memory");
-
-       return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
-}
-
-static long native_hpte_remove(unsigned long hpte_group)
-{
-       hpte_t *hptep;
-       int i;
-       int slot_offset;
-       unsigned long hpte_v;
-
-       /* pick a random entry to start at */
-       slot_offset = mftb() & 0x7;
-
-       for (i = 0; i < HPTES_PER_GROUP; i++) {
-               hptep = htab_address + hpte_group + slot_offset;
-               hpte_v = hptep->v;
-
-               if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
-                       /* retry with lock held */
-                       native_lock_hpte(hptep);
-                       hpte_v = hptep->v;
-                       if ((hpte_v & HPTE_V_VALID)
-                           && !(hpte_v & HPTE_V_BOLTED))
-                               break;
-                       native_unlock_hpte(hptep);
-               }
-
-               slot_offset++;
-               slot_offset &= 0x7;
-       }
-
-       if (i == HPTES_PER_GROUP)
-               return -1;
-
-       /* Invalidate the hpte. NOTE: this also unlocks it */
-       hptep->v = 0;
-
-       return i;
-}
-
-static inline void set_pp_bit(unsigned long pp, hpte_t *addr)
-{
-       unsigned long old;
-       unsigned long *p = &addr->r;
-
-       __asm__ __volatile__(
-       "1:     ldarx   %0,0,%3\n\
-               rldimi  %0,%2,0,61\n\
-               stdcx.  %0,0,%3\n\
-               bne     1b"
-       : "=&r" (old), "=m" (*p)
-       : "r" (pp), "r" (p), "m" (*p)
-       : "cc");
-}
-
-/*
- * Only works on small pages. Yes its ugly to have to check each slot in
- * the group but we only use this during bootup.
- */
-static long native_hpte_find(unsigned long vpn)
-{
-       hpte_t *hptep;
-       unsigned long hash;
-       unsigned long i, j;
-       long slot;
-       unsigned long hpte_v;
-
-       hash = hpt_hash(vpn, 0);
-
-       for (j = 0; j < 2; j++) {
-               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               for (i = 0; i < HPTES_PER_GROUP; i++) {
-                       hptep = htab_address + slot;
-                       hpte_v = hptep->v;
-
-                       if ((HPTE_V_AVPN_VAL(hpte_v) == (vpn >> 11))
-                           && (hpte_v & HPTE_V_VALID)
-                           && ( !!(hpte_v & HPTE_V_SECONDARY) == j)) {
-                               /* HPTE matches */
-                               if (j)
-                                       slot = -slot;
-                               return slot;
-                       }
-                       ++slot;
-               }
-               hash = ~hash;
-       }
-
-       return -1;
-}
-
-static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
-                                unsigned long va, int large, int local)
-{
-       hpte_t *hptep = htab_address + slot;
-       unsigned long hpte_v;
-       unsigned long avpn = va >> 23;
-       int ret = 0;
-
-       if (large)
-               avpn &= ~1;
-
-       native_lock_hpte(hptep);
-
-       hpte_v = hptep->v;
-
-       /* Even if we miss, we need to invalidate the TLB */
-       if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
-           || !(hpte_v & HPTE_V_VALID)) {
-               native_unlock_hpte(hptep);
-               ret = -1;
-       } else {
-               set_pp_bit(newpp, hptep);
-               native_unlock_hpte(hptep);
-       }
-
-       /* Ensure it is out of the tlb too */
-       if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
-               tlbiel(va);
-       } else {
-               int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
-
-               if (lock_tlbie)
-                       spin_lock(&native_tlbie_lock);
-               tlbie(va, large);
-               if (lock_tlbie)
-                       spin_unlock(&native_tlbie_lock);
-       }
-
-       return ret;
-}
-
-/*
- * Update the page protection bits. Intended to be used to create
- * guard pages for kernel data structures on pages which are bolted
- * in the HPT. Assumes pages being operated on will not be stolen.
- * Does not work on large pages.
- *
- * No need to lock here because we should be the only user.
- */
-static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea)
-{
-       unsigned long vsid, va, vpn, flags = 0;
-       long slot;
-       hpte_t *hptep;
-       int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
-
-       vsid = get_kernel_vsid(ea);
-       va = (vsid << 28) | (ea & 0x0fffffff);
-       vpn = va >> PAGE_SHIFT;
-
-       slot = native_hpte_find(vpn);
-       if (slot == -1)
-               panic("could not find page to bolt\n");
-       hptep = htab_address + slot;
-
-       set_pp_bit(newpp, hptep);
-
-       /* Ensure it is out of the tlb too */
-       if (lock_tlbie)
-               spin_lock_irqsave(&native_tlbie_lock, flags);
-       tlbie(va, 0);
-       if (lock_tlbie)
-               spin_unlock_irqrestore(&native_tlbie_lock, flags);
-}
-
-static void native_hpte_invalidate(unsigned long slot, unsigned long va,
-                                   int large, int local)
-{
-       hpte_t *hptep = htab_address + slot;
-       unsigned long hpte_v;
-       unsigned long avpn = va >> 23;
-       unsigned long flags;
-       int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
-
-       if (large)
-               avpn &= ~1;
-
-       local_irq_save(flags);
-       native_lock_hpte(hptep);
-
-       hpte_v = hptep->v;
-
-       /* Even if we miss, we need to invalidate the TLB */
-       if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
-           || !(hpte_v & HPTE_V_VALID)) {
-               native_unlock_hpte(hptep);
-       } else {
-               /* Invalidate the hpte. NOTE: this also unlocks it */
-               hptep->v = 0;
-       }
-
-       /* Invalidate the tlb */
-       if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
-               tlbiel(va);
-       } else {
-               if (lock_tlbie)
-                       spin_lock(&native_tlbie_lock);
-               tlbie(va, large);
-               if (lock_tlbie)
-                       spin_unlock(&native_tlbie_lock);
-       }
-       local_irq_restore(flags);
-}
-
-/*
- * clear all mappings on kexec.  All cpus are in real mode (or they will
- * be when they isi), and we are the only one left.  We rely on our kernel
- * mapping being 0xC0's and the hardware ignoring those two real bits.
- *
- * TODO: add batching support when enabled.  Remember, no dynamic memory here,
- * although there is the control page available...
- */
-static void native_hpte_clear(void)
-{
-       unsigned long slot, slots, flags;
-       hpte_t *hptep = htab_address;
-       unsigned long hpte_v;
-       unsigned long pteg_count;
-
-       pteg_count = htab_hash_mask + 1;
-
-       local_irq_save(flags);
-
-       /* we take the tlbie lock and hold it.  Some hardware will
-        * deadlock if we try to tlbie from two processors at once.
-        */
-       spin_lock(&native_tlbie_lock);
-
-       slots = pteg_count * HPTES_PER_GROUP;
-
-       for (slot = 0; slot < slots; slot++, hptep++) {
-               /*
-                * We could lock the pte here, but we are the only cpu
-                * running, right?  And for a crash dump, we probably
-                * don't want to wait for a maybe-bad cpu.
-                */
-               hpte_v = hptep->v;
-
-               if (hpte_v & HPTE_V_VALID) {
-                       hptep->v = 0;
-                       tlbie(slot2va(hpte_v, slot), hpte_v & HPTE_V_LARGE);
-               }
-       }
-
-       spin_unlock(&native_tlbie_lock);
-       local_irq_restore(flags);
-}
-
-static void native_flush_hash_range(unsigned long number, int local)
-{
-       unsigned long va, vpn, hash, secondary, slot, flags, avpn;
-       int i, j;
-       hpte_t *hptep;
-       unsigned long hpte_v;
-       struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
-       unsigned long large = batch->large;
-
-       local_irq_save(flags);
-
-       j = 0;
-       for (i = 0; i < number; i++) {
-               va = batch->vaddr[j];
-               if (large)
-                       vpn = va >> HPAGE_SHIFT;
-               else
-                       vpn = va >> PAGE_SHIFT;
-               hash = hpt_hash(vpn, large);
-               secondary = (pte_val(batch->pte[i]) & _PAGE_SECONDARY) >> 15;
-               if (secondary)
-                       hash = ~hash;
-               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot += (pte_val(batch->pte[i]) & _PAGE_GROUP_IX) >> 12;
-
-               hptep = htab_address + slot;
-
-               avpn = va >> 23;
-               if (large)
-                       avpn &= ~0x1UL;
-
-               native_lock_hpte(hptep);
-
-               hpte_v = hptep->v;
-
-               /* Even if we miss, we need to invalidate the TLB */
-               if ((HPTE_V_AVPN_VAL(hpte_v) != avpn)
-                   || !(hpte_v & HPTE_V_VALID)) {
-                       native_unlock_hpte(hptep);
-               } else {
-                       /* Invalidate the hpte. NOTE: this also unlocks it */
-                       hptep->v = 0;
-               }
-
-               j++;
-       }
-
-       if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) {
-               asm volatile("ptesync":::"memory");
-
-               for (i = 0; i < j; i++)
-                       __tlbiel(batch->vaddr[i]);
-
-               asm volatile("ptesync":::"memory");
-       } else {
-               int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
-
-               if (lock_tlbie)
-                       spin_lock(&native_tlbie_lock);
-
-               asm volatile("ptesync":::"memory");
-
-               for (i = 0; i < j; i++)
-                       __tlbie(batch->vaddr[i], large);
-
-               asm volatile("eieio; tlbsync; ptesync":::"memory");
-
-               if (lock_tlbie)
-                       spin_unlock(&native_tlbie_lock);
-       }
-
-       local_irq_restore(flags);
-}
-
-#ifdef CONFIG_PPC_PSERIES
-/* Disable TLB batching on nighthawk */
-static inline int tlb_batching_enabled(void)
-{
-       struct device_node *root = of_find_node_by_path("/");
-       int enabled = 1;
-
-       if (root) {
-               const char *model = get_property(root, "model", NULL);
-               if (model && !strcmp(model, "IBM,9076-N81"))
-                       enabled = 0;
-               of_node_put(root);
-       }
-
-       return enabled;
-}
-#else
-static inline int tlb_batching_enabled(void)
-{
-       return 1;
-}
-#endif
-
-void hpte_init_native(void)
-{
-       ppc_md.hpte_invalidate  = native_hpte_invalidate;
-       ppc_md.hpte_updatepp    = native_hpte_updatepp;
-       ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp;
-       ppc_md.hpte_insert      = native_hpte_insert;
-       ppc_md.hpte_remove      = native_hpte_remove;
-       ppc_md.hpte_clear_all   = native_hpte_clear;
-       if (tlb_batching_enabled())
-               ppc_md.flush_hash_range = native_flush_hash_range;
-       htab_finish_init();
-}
diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c
deleted file mode 100644 (file)
index 8350743..0000000
+++ /dev/null
@@ -1,438 +0,0 @@
-/*
- * PowerPC64 port by Mike Corrigan and Dave Engebretsen
- *   {mikejc|engebret}@us.ibm.com
- *
- *    Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
- *
- * SMP scalability work:
- *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- * 
- *    Module name: htab.c
- *
- *    Description:
- *      PowerPC Hashed Page Table functions
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#undef DEBUG
-
-#include <linux/config.h>
-#include <linux/spinlock.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
-#include <linux/sysctl.h>
-#include <linux/ctype.h>
-#include <linux/cache.h>
-#include <linux/init.h>
-#include <linux/signal.h>
-
-#include <asm/ppcdebug.h>
-#include <asm/processor.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/page.h>
-#include <asm/types.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/machdep.h>
-#include <asm/lmb.h>
-#include <asm/abs_addr.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-#include <asm/eeh.h>
-#include <asm/tlb.h>
-#include <asm/cacheflush.h>
-#include <asm/cputable.h>
-#include <asm/abs_addr.h>
-#include <asm/sections.h>
-
-#ifdef DEBUG
-#define DBG(fmt...) udbg_printf(fmt)
-#else
-#define DBG(fmt...)
-#endif
-
-/*
- * Note:  pte   --> Linux PTE
- *        HPTE  --> PowerPC Hashed Page Table Entry
- *
- * Execution context:
- *   htab_initialize is called with the MMU off (of course), but
- *   the kernel has been copied down to zero so it can directly
- *   reference global data.  At this point it is very difficult
- *   to print debug info.
- *
- */
-
-#ifdef CONFIG_U3_DART
-extern unsigned long dart_tablebase;
-#endif /* CONFIG_U3_DART */
-
-hpte_t *htab_address;
-unsigned long htab_hash_mask;
-
-extern unsigned long _SDR1;
-
-#define KB (1024)
-#define MB (1024*KB)
-
-static inline void loop_forever(void)
-{
-       volatile unsigned long x = 1;
-       for(;x;x|=1)
-               ;
-}
-
-static inline void create_pte_mapping(unsigned long start, unsigned long end,
-                                     unsigned long mode, int large)
-{
-       unsigned long addr;
-       unsigned int step;
-       unsigned long tmp_mode;
-       unsigned long vflags;
-
-       if (large) {
-               step = 16*MB;
-               vflags = HPTE_V_BOLTED | HPTE_V_LARGE;
-       } else {
-               step = 4*KB;
-               vflags = HPTE_V_BOLTED;
-       }
-
-       for (addr = start; addr < end; addr += step) {
-               unsigned long vpn, hash, hpteg;
-               unsigned long vsid = get_kernel_vsid(addr);
-               unsigned long va = (vsid << 28) | (addr & 0xfffffff);
-               int ret = -1;
-
-               if (large)
-                       vpn = va >> HPAGE_SHIFT;
-               else
-                       vpn = va >> PAGE_SHIFT;
-
-
-               tmp_mode = mode;
-               
-               /* Make non-kernel text non-executable */
-               if (!in_kernel_text(addr))
-                       tmp_mode = mode | HW_NO_EXEC;
-
-               hash = hpt_hash(vpn, large);
-
-               hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
-
-#ifdef CONFIG_PPC_ISERIES
-               if (systemcfg->platform & PLATFORM_ISERIES_LPAR)
-                       ret = iSeries_hpte_bolt_or_insert(hpteg, va,
-                               virt_to_abs(addr) >> PAGE_SHIFT,
-                               vflags, tmp_mode);
-               else
-#endif
-#ifdef CONFIG_PPC_PSERIES
-               if (systemcfg->platform & PLATFORM_LPAR)
-                       ret = pSeries_lpar_hpte_insert(hpteg, va,
-                               virt_to_abs(addr) >> PAGE_SHIFT,
-                               vflags, tmp_mode);
-               else
-#endif
-#ifdef CONFIG_PPC_MULTIPLATFORM
-                       ret = native_hpte_insert(hpteg, va,
-                               virt_to_abs(addr) >> PAGE_SHIFT,
-                               vflags, tmp_mode);
-#endif
-
-               if (ret == -1) {
-                       ppc64_terminate_msg(0x20, "create_pte_mapping");
-                       loop_forever();
-               }
-       }
-}
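
    /*
     * Sketch only, not kernel code: the VA/VPN construction used by
     * create_pte_mapping() above.  The VSID selects a 256MB segment, the low
     * 28 bits come from the effective address, and the VPN is the VA shifted
     * by the page size (12 for 4KB pages, HPAGE_SHIFT, i.e. 24, for 16MB
     * pages).  example_vpn() is a hypothetical name.
     */
    static unsigned long example_vpn(unsigned long vsid, unsigned long ea,
                                     unsigned int page_shift)
    {
            unsigned long va = (vsid << 28) | (ea & 0x0fffffff);

            return va >> page_shift;
    }
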
-
-void __init htab_initialize(void)
-{
-       unsigned long table, htab_size_bytes;
-       unsigned long pteg_count;
-       unsigned long mode_rw;
-       int i, use_largepages = 0;
-       unsigned long base = 0, size = 0;
-       extern unsigned long tce_alloc_start, tce_alloc_end;
-
-       DBG(" -> htab_initialize()\n");
-
-       /*
-        * Calculate the required size of the htab.  We want the number of
-        * PTEGs to equal one half the number of real pages.
-        */ 
-       htab_size_bytes = 1UL << ppc64_pft_size;
-       pteg_count = htab_size_bytes >> 7;
-
-       /* For debug, make the HTAB 1/8 as big as it normally would be. */
-       ifppcdebug(PPCDBG_HTABSIZE) {
-               pteg_count >>= 3;
-               htab_size_bytes = pteg_count << 7;
-       }
-
-       htab_hash_mask = pteg_count - 1;
-
-       if (systemcfg->platform & PLATFORM_LPAR) {
-               /* Using a hypervisor which owns the htab */
-               htab_address = NULL;
-               _SDR1 = 0; 
-       } else {
-               /* Find storage for the HPT.  Must be contiguous in
-                * the absolute address space.
-                */
-               table = lmb_alloc(htab_size_bytes, htab_size_bytes);
-
-               DBG("Hash table allocated at %lx, size: %lx\n", table,
-                   htab_size_bytes);
-
-               if ( !table ) {
-                       ppc64_terminate_msg(0x20, "hpt space");
-                       loop_forever();
-               }
-               htab_address = abs_to_virt(table);
-
-               /* htab absolute addr + encoded htabsize */
-               _SDR1 = table + __ilog2(pteg_count) - 11;
-
-               /* Initialize the HPT with no entries */
-               memset((void *)table, 0, htab_size_bytes);
-       }
-
-       mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
-
-       /* On U3-based machines, we need to reserve the DART area and
-        * _NOT_ map it, to avoid cache paradoxes, as it's remapped
-        * non-cacheable later on.
-        */
-       if (cpu_has_feature(CPU_FTR_16M_PAGE))
-               use_largepages = 1;
-
-       /* Create the bolted linear mapping in the hash table */
-       for (i=0; i < lmb.memory.cnt; i++) {
-               base = lmb.memory.region[i].base + KERNELBASE;
-               size = lmb.memory.region[i].size;
-
-               DBG("creating mapping for region: %lx : %lx\n", base, size);
-
-#ifdef CONFIG_U3_DART
-               /* Do not map the DART space. Fortunately, it will be aligned
-                * in such a way that it will not span two lmb regions and will
-                * fit within a single 16MB page.
-                * The DART space is assumed to be a full 16MB region even if we
-                * only use 2MB of that space. We will use more of it later for
-                * AGP GART. We have to use a full 16MB large page.
-                */
-               DBG("DART base: %lx\n", dart_tablebase);
-
-               if (dart_tablebase != 0 && dart_tablebase >= base
-                   && dart_tablebase < (base + size)) {
-                       if (base != dart_tablebase)
-                               create_pte_mapping(base, dart_tablebase, mode_rw,
-                                                  use_largepages);
-                       if ((base + size) > (dart_tablebase + 16*MB))
-                               create_pte_mapping(dart_tablebase + 16*MB, base + size,
-                                                  mode_rw, use_largepages);
-                       continue;
-               }
-#endif /* CONFIG_U3_DART */
-               create_pte_mapping(base, base + size, mode_rw, use_largepages);
-       }
-
-       /*
-        * If we have a memory_limit and we've allocated TCEs then we need to
-        * explicitly map the TCE area at the top of RAM. We also cope with the
-        * case that the TCEs start below memory_limit.
-        * tce_alloc_start/end are 16MB aligned so the mapping should work
-        * for either 4K or 16MB pages.
-        */
-       if (tce_alloc_start) {
-               tce_alloc_start += KERNELBASE;
-               tce_alloc_end += KERNELBASE;
-
-               if (base + size >= tce_alloc_start)
-                       tce_alloc_start = base + size + 1;
-
-               create_pte_mapping(tce_alloc_start, tce_alloc_end,
-                       mode_rw, use_largepages);
-       }
-
-       DBG(" <- htab_initialize()\n");
-}
-#undef KB
-#undef MB
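
    /*
     * Sketch only, not kernel code: the HTAB sizing arithmetic used in
     * htab_initialize() above.  A PTEG holds HPTES_PER_GROUP (8) HPTEs of 16
     * bytes each, i.e. 128 bytes, which is why pteg_count is
     * htab_size_bytes >> 7; the low bits of _SDR1 encode
     * log2(pteg_count) - 11, since the minimum table of 2^18 bytes holds
     * 2^11 PTEGs.  example_sdr1() is a hypothetical name.
     */
    static unsigned long example_sdr1(unsigned long table_phys,
                                      unsigned int pft_size_log2)
    {
            unsigned long htab_size_bytes = 1UL << pft_size_log2;
            unsigned long pteg_count = htab_size_bytes >> 7;  /* 128B per PTEG */
            unsigned int log2_ptegs = 0;

            while ((1UL << (log2_ptegs + 1)) <= pteg_count)
                    log2_ptegs++;

            /* table address plus the encoded table size in the low bits */
            return table_phys + log2_ptegs - 11;
    }
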
-
-/*
- * Called by the low-level asm hash code (hash_low.S) to do a lazy icache flush
- */
-unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
-{
-       struct page *page;
-
-       if (!pfn_valid(pte_pfn(pte)))
-               return pp;
-
-       page = pte_page(pte);
-
-       /* page is dirty */
-       if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
-               if (trap == 0x400) {
-                       __flush_dcache_icache(page_address(page));
-                       set_bit(PG_arch_1, &page->flags);
-               } else
-                       pp |= HW_NO_EXEC;
-       }
-       return pp;
-}
-
-/* Result code is:
- *  0 - handled
- *  1 - normal page fault
- * -1 - critical hash insertion error
- */
-int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
-{
-       void *pgdir;
-       unsigned long vsid;
-       struct mm_struct *mm;
-       pte_t *ptep;
-       int ret;
-       int user_region = 0;
-       int local = 0;
-       cpumask_t tmp;
-
-       if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
-               return 1;
-
-       switch (REGION_ID(ea)) {
-       case USER_REGION_ID:
-               user_region = 1;
-               mm = current->mm;
-               if (! mm)
-                       return 1;
-
-               vsid = get_vsid(mm->context.id, ea);
-               break;
-       case VMALLOC_REGION_ID:
-               mm = &init_mm;
-               vsid = get_kernel_vsid(ea);
-               break;
-#if 0
-       case KERNEL_REGION_ID:
-               /*
-                * Should never get here - entire 0xC0... region is bolted.
-                * Send the problem up to do_page_fault 
-                */
-#endif
-       default:
-               /* Not a valid range
-                * Send the problem up to do_page_fault 
-                */
-               return 1;
-               break;
-       }
-
-       pgdir = mm->pgd;
-
-       if (pgdir == NULL)
-               return 1;
-
-       tmp = cpumask_of_cpu(smp_processor_id());
-       if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
-               local = 1;
-
-       /* Is this a huge page ? */
-       if (unlikely(in_hugepage_area(mm->context, ea)))
-               ret = hash_huge_page(mm, access, ea, vsid, local);
-       else {
-               ptep = find_linux_pte(pgdir, ea);
-               if (ptep == NULL)
-                       return 1;
-               ret = __hash_page(ea, access, vsid, ptep, trap, local);
-       }
-
-       return ret;
-}
-
-void flush_hash_page(unsigned long va, pte_t pte, int local)
-{
-       unsigned long vpn, hash, secondary, slot;
-       unsigned long huge = pte_huge(pte);
-
-       if (huge)
-               vpn = va >> HPAGE_SHIFT;
-       else
-               vpn = va >> PAGE_SHIFT;
-       hash = hpt_hash(vpn, huge);
-       secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
-       if (secondary)
-               hash = ~hash;
-       slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-       slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12;
-
-       ppc_md.hpte_invalidate(slot, va, huge, local);
-}
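
    /*
     * Sketch only, not kernel code: flush_hash_page() above recovers the HPTE
     * slot purely from bits cached in the Linux PTE.  _PAGE_SECONDARY (bit 15)
     * selects the primary or secondary bucket and _PAGE_GROUP_IX (bits 12-14)
     * is the index within the 8-entry group; the bit positions follow the
     * shifts used above.  example_hpte_slot() is a hypothetical name.
     */
    static unsigned long example_hpte_slot(unsigned long hash,
                                           unsigned long hash_mask,
                                           unsigned long pte_val)
    {
            unsigned long secondary = (pte_val >> 15) & 1;
            unsigned long group_ix  = (pte_val >> 12) & 7;

            if (secondary)
                    hash = ~hash;

            return (hash & hash_mask) * 8 /* HPTES_PER_GROUP */ + group_ix;
    }
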
-
-void flush_hash_range(unsigned long number, int local)
-{
-       if (ppc_md.flush_hash_range) {
-               ppc_md.flush_hash_range(number, local);
-       } else {
-               int i;
-               struct ppc64_tlb_batch *batch =
-                       &__get_cpu_var(ppc64_tlb_batch);
-
-               for (i = 0; i < number; i++)
-                       flush_hash_page(batch->vaddr[i], batch->pte[i], local);
-       }
-}
-
-static inline void make_bl(unsigned int *insn_addr, void *func)
-{
-       unsigned long funcp = *((unsigned long *)func);
-       int offset = funcp - (unsigned long)insn_addr;
-
-       *insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc));
-       flush_icache_range((unsigned long)insn_addr, 4+
-                          (unsigned long)insn_addr);
-}
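
    /*
     * Sketch only, not kernel code: make_bl() above patches a PowerPC bl
     * (branch-and-link) instruction in place.  0x48000001 is opcode 18 with
     * the LK bit set, and the signed, word-aligned byte displacement is
     * masked into bits 0x03fffffc.  encode_bl() is a hypothetical name.
     */
    #include <stdint.h>

    static uint32_t encode_bl(int32_t byte_offset)
    {
            /* byte_offset must be a multiple of 4 and fit in 26 signed bits */
            return 0x48000001u | ((uint32_t)byte_offset & 0x03fffffcu);
    }
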
-
-/*
- * low_hash_fault is called when the low-level hash code failed
- * to insert a PTE due to a hypervisor error.
- */
-void low_hash_fault(struct pt_regs *regs, unsigned long address)
-{
-       if (user_mode(regs)) {
-               siginfo_t info;
-
-               info.si_signo = SIGBUS;
-               info.si_errno = 0;
-               info.si_code = BUS_ADRERR;
-               info.si_addr = (void __user *)address;
-               force_sig_info(SIGBUS, &info, current);
-               return;
-       }
-       bad_page_fault(regs, address, SIGBUS);
-}
-
-void __init htab_finish_init(void)
-{
-       extern unsigned int *htab_call_hpte_insert1;
-       extern unsigned int *htab_call_hpte_insert2;
-       extern unsigned int *htab_call_hpte_remove;
-       extern unsigned int *htab_call_hpte_updatepp;
-
-       make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
-       make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
-       make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
-       make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp);
-}
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
deleted file mode 100644 (file)
index 0ea0994..0000000
+++ /dev/null
@@ -1,745 +0,0 @@
-/*
- * PPC64 (POWER4) Huge TLB Page Support for Kernel.
- *
- * Copyright (C) 2003 David Gibson, IBM Corporation.
- *
- * Based on the IA-32 version:
- * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/sysctl.h>
-#include <asm/mman.h>
-#include <asm/pgalloc.h>
-#include <asm/tlb.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/machdep.h>
-#include <asm/cputable.h>
-#include <asm/tlb.h>
-
-#include <linux/sysctl.h>
-
-#define NUM_LOW_AREAS  (0x100000000UL >> SID_SHIFT)
-#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
-
-/* Modelled after find_linux_pte() */
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-       pgd_t *pg;
-       pud_t *pu;
-       pmd_t *pm;
-       pte_t *pt;
-
-       BUG_ON(! in_hugepage_area(mm->context, addr));
-
-       addr &= HPAGE_MASK;
-
-       pg = pgd_offset(mm, addr);
-       if (!pgd_none(*pg)) {
-               pu = pud_offset(pg, addr);
-               if (!pud_none(*pu)) {
-                       pm = pmd_offset(pu, addr);
-                       pt = (pte_t *)pm;
-                       BUG_ON(!pmd_none(*pm)
-                              && !(pte_present(*pt) && pte_huge(*pt)));
-                       return pt;
-               }
-       }
-
-       return NULL;
-}
-
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
-{
-       pgd_t *pg;
-       pud_t *pu;
-       pmd_t *pm;
-       pte_t *pt;
-
-       BUG_ON(! in_hugepage_area(mm->context, addr));
-
-       addr &= HPAGE_MASK;
-
-       pg = pgd_offset(mm, addr);
-       pu = pud_alloc(mm, pg, addr);
-
-       if (pu) {
-               pm = pmd_alloc(mm, pu, addr);
-               if (pm) {
-                       pt = (pte_t *)pm;
-                       BUG_ON(!pmd_none(*pm)
-                              && !(pte_present(*pt) && pte_huge(*pt)));
-                       return pt;
-               }
-       }
-
-       return NULL;
-}
-
-#define HUGEPTE_BATCH_SIZE     (HPAGE_SIZE / PMD_SIZE)
-
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-                    pte_t *ptep, pte_t pte)
-{
-       int i;
-
-       if (pte_present(*ptep)) {
-               pte_clear(mm, addr, ptep);
-               flush_tlb_pending();
-       }
-
-       for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
-               *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
-               ptep++;
-       }
-}
-
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
-                             pte_t *ptep)
-{
-       unsigned long old = pte_update(ptep, ~0UL);
-       int i;
-
-       if (old & _PAGE_HASHPTE)
-               hpte_update(mm, addr, old, 0);
-
-       for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
-               ptep[i] = __pte(0);
-
-       return __pte(old);
-}
-
-/*
- * This function checks for proper alignment of input addr and len parameters.
- */
-int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
-{
-       if (len & ~HPAGE_MASK)
-               return -EINVAL;
-       if (addr & ~HPAGE_MASK)
-               return -EINVAL;
-       if (! (within_hugepage_low_range(addr, len)
-              || within_hugepage_high_range(addr, len)) )
-               return -EINVAL;
-       return 0;
-}
-
-static void flush_low_segments(void *parm)
-{
-       u16 areas = (unsigned long) parm;
-       unsigned long i;
-
-       asm volatile("isync" : : : "memory");
-
-       BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
-
-       for (i = 0; i < NUM_LOW_AREAS; i++) {
-               if (! (areas & (1U << i)))
-                       continue;
-               asm volatile("slbie %0"
-                            : : "r" ((i << SID_SHIFT) | SLBIE_C));
-       }
-
-       asm volatile("isync" : : : "memory");
-}
-
-static void flush_high_segments(void *parm)
-{
-       u16 areas = (unsigned long) parm;
-       unsigned long i, j;
-
-       asm volatile("isync" : : : "memory");
-
-       BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
-
-       for (i = 0; i < NUM_HIGH_AREAS; i++) {
-               if (! (areas & (1U << i)))
-                       continue;
-               for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
-                       asm volatile("slbie %0"
-                                    :: "r" (((i << HTLB_AREA_SHIFT)
-                                            + (j << SID_SHIFT)) | SLBIE_C));
-       }
-
-       asm volatile("isync" : : : "memory");
-}
-
-static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
-{
-       unsigned long start = area << SID_SHIFT;
-       unsigned long end = (area+1) << SID_SHIFT;
-       struct vm_area_struct *vma;
-
-       BUG_ON(area >= NUM_LOW_AREAS);
-
-       /* Check no VMAs are in the region */
-       vma = find_vma(mm, start);
-       if (vma && (vma->vm_start < end))
-               return -EBUSY;
-
-       return 0;
-}
-
-static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
-{
-       unsigned long start = area << HTLB_AREA_SHIFT;
-       unsigned long end = (area+1) << HTLB_AREA_SHIFT;
-       struct vm_area_struct *vma;
-
-       BUG_ON(area >= NUM_HIGH_AREAS);
-
-       /* Check no VMAs are in the region */
-       vma = find_vma(mm, start);
-       if (vma && (vma->vm_start < end))
-               return -EBUSY;
-
-       return 0;
-}
-
-static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
-{
-       unsigned long i;
-
-       BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
-       BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
-
-       newareas &= ~(mm->context.low_htlb_areas);
-       if (! newareas)
-               return 0; /* The segments we want are already open */
-
-       for (i = 0; i < NUM_LOW_AREAS; i++)
-               if ((1 << i) & newareas)
-                       if (prepare_low_area_for_htlb(mm, i) != 0)
-                               return -EBUSY;
-
-       mm->context.low_htlb_areas |= newareas;
-
-       /* update the paca copy of the context struct */
-       get_paca()->context = mm->context;
-
-       /* the context change must make it to memory before the flush,
-        * so that further SLB misses do the right thing. */
-       mb();
-       on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
-
-       return 0;
-}
-
-static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
-{
-       unsigned long i;
-
-       BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
-       BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
-                    != NUM_HIGH_AREAS);
-
-       newareas &= ~(mm->context.high_htlb_areas);
-       if (! newareas)
-               return 0; /* The areas we want are already open */
-
-       for (i = 0; i < NUM_HIGH_AREAS; i++)
-               if ((1 << i) & newareas)
-                       if (prepare_high_area_for_htlb(mm, i) != 0)
-                               return -EBUSY;
-
-       mm->context.high_htlb_areas |= newareas;
-
-       /* update the paca copy of the context struct */
-       get_paca()->context = mm->context;
-
-       /* the context change must make it to memory before the flush,
-        * so that further SLB misses do the right thing. */
-       mb();
-       on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
-
-       return 0;
-}
-
-int prepare_hugepage_range(unsigned long addr, unsigned long len)
-{
-       int err;
-
-       if ( (addr+len) < addr )
-               return -EINVAL;
-
-       if ((addr + len) < 0x100000000UL)
-               err = open_low_hpage_areas(current->mm,
-                                         LOW_ESID_MASK(addr, len));
-       else
-               err = open_high_hpage_areas(current->mm,
-                                           HTLB_AREA_MASK(addr, len));
-       if (err) {
-               printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
-                      " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
-                      addr, len,
-                      LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
-               return err;
-       }
-
-       return 0;
-}
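
    /*
     * Sketch only, not kernel code: prepare_hugepage_range() above splits on
     * the 4GB boundary.  Below it, hugepage permission is tracked per 256MB
     * segment and LOW_ESID_MASK() turns an (addr, len) range into a 16-bit
     * segment bitmap (NUM_LOW_AREAS is 4GB >> SID_SHIFT).  This rough sketch
     * of the low-area bitmap assumes a 28-bit segment shift, which is not
     * defined in this diff; example_low_area_mask() is a hypothetical name.
     */
    static unsigned short example_low_area_mask(unsigned long addr,
                                                unsigned long len)
    {
            const unsigned int sid_shift = 28;      /* assumed: 256MB segments */
            unsigned long first, last, i;
            unsigned short mask = 0;

            if (len == 0)
                    return 0;

            first = addr >> sid_shift;
            last  = (addr + len - 1) >> sid_shift;

            for (i = first; i <= last && i < 16; i++)
                    mask |= 1U << i;

            return mask;
    }
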
-
-struct page *
-follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
-{
-       pte_t *ptep;
-       struct page *page;
-
-       if (! in_hugepage_area(mm->context, address))
-               return ERR_PTR(-EINVAL);
-
-       ptep = huge_pte_offset(mm, address);
-       page = pte_page(*ptep);
-       if (page)
-               page += (address % HPAGE_SIZE) / PAGE_SIZE;
-
-       return page;
-}
-
-int pmd_huge(pmd_t pmd)
-{
-       return 0;
-}
-
-struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-               pmd_t *pmd, int write)
-{
-       BUG();
-       return NULL;
-}
-
-/* Because we have an exclusive hugepage region which lies within the
- * normal user address space, we have to take special measures to make
- * non-huge mmap()s evade the hugepage reserved regions. */
-unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
-                                    unsigned long len, unsigned long pgoff,
-                                    unsigned long flags)
-{
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
-       unsigned long start_addr;
-
-       if (len > TASK_SIZE)
-               return -ENOMEM;
-
-       if (addr) {
-               addr = PAGE_ALIGN(addr);
-               vma = find_vma(mm, addr);
-               if (((TASK_SIZE - len) >= addr)
-                   && (!vma || (addr+len) <= vma->vm_start)
-                   && !is_hugepage_only_range(mm, addr,len))
-                       return addr;
-       }
-       if (len > mm->cached_hole_size) {
-               start_addr = addr = mm->free_area_cache;
-       } else {
-               start_addr = addr = TASK_UNMAPPED_BASE;
-               mm->cached_hole_size = 0;
-       }
-
-full_search:
-       vma = find_vma(mm, addr);
-       while (TASK_SIZE - len >= addr) {
-               BUG_ON(vma && (addr >= vma->vm_end));
-
-               if (touches_hugepage_low_range(mm, addr, len)) {
-                       addr = ALIGN(addr+1, 1<<SID_SHIFT);
-                       vma = find_vma(mm, addr);
-                       continue;
-               }
-               if (touches_hugepage_high_range(mm, addr, len)) {
-                       addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
-                       vma = find_vma(mm, addr);
-                       continue;
-               }
-               if (!vma || addr + len <= vma->vm_start) {
-                       /*
-                        * Remember the place where we stopped the search:
-                        */
-                       mm->free_area_cache = addr + len;
-                       return addr;
-               }
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-               addr = vma->vm_end;
-               vma = vma->vm_next;
-       }
-
-       /* Make sure we didn't miss any holes */
-       if (start_addr != TASK_UNMAPPED_BASE) {
-               start_addr = addr = TASK_UNMAPPED_BASE;
-               mm->cached_hole_size = 0;
-               goto full_search;
-       }
-       return -ENOMEM;
-}
-
-/*
- * This mmap-allocator allocates new areas top-down from below the
- * stack's low limit (the base):
- *
- * Because we have an exclusive hugepage region which lies within the
- * normal user address space, we have to take special measures to make
- * non-huge mmap()s evade the hugepage reserved regions.
- */
-unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
-                         const unsigned long len, const unsigned long pgoff,
-                         const unsigned long flags)
-{
-       struct vm_area_struct *vma, *prev_vma;
-       struct mm_struct *mm = current->mm;
-       unsigned long base = mm->mmap_base, addr = addr0;
-       unsigned long largest_hole = mm->cached_hole_size;
-       int first_time = 1;
-
-       /* requested length too big for entire address space */
-       if (len > TASK_SIZE)
-               return -ENOMEM;
-
-       /* don't allow allocations above current base */
-       if (mm->free_area_cache > base)
-               mm->free_area_cache = base;
-
-       /* requesting a specific address */
-       if (addr) {
-               addr = PAGE_ALIGN(addr);
-               vma = find_vma(mm, addr);
-               if (TASK_SIZE - len >= addr &&
-                               (!vma || addr + len <= vma->vm_start)
-                               && !is_hugepage_only_range(mm, addr,len))
-                       return addr;
-       }
-
-       if (len <= largest_hole) {
-               largest_hole = 0;
-               mm->free_area_cache = base;
-       }
-try_again:
-       /* make sure it can fit in the remaining address space */
-       if (mm->free_area_cache < len)
-               goto fail;
-
-       /* either no address requested or can't fit in requested address hole */
-       addr = (mm->free_area_cache - len) & PAGE_MASK;
-       do {
-hugepage_recheck:
-               if (touches_hugepage_low_range(mm, addr, len)) {
-                       addr = (addr & ((~0) << SID_SHIFT)) - len;
-                       goto hugepage_recheck;
-               } else if (touches_hugepage_high_range(mm, addr, len)) {
-                       addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
-                       goto hugepage_recheck;
-               }
-
-               /*
-                * Lookup failure means no vma is above this address,
-                * i.e. return with success:
-                */
-               if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
-                       return addr;
-
-               /*
-                * new region fits between prev_vma->vm_end and
-                * vma->vm_start, use it:
-                */
-               if (addr+len <= vma->vm_start &&
-                         (!prev_vma || (addr >= prev_vma->vm_end))) {
-                       /* remember the address as a hint for next time */
-                       mm->cached_hole_size = largest_hole;
-                       return (mm->free_area_cache = addr);
-               } else {
-                       /* pull free_area_cache down to the first hole */
-                       if (mm->free_area_cache == vma->vm_end) {
-                               mm->free_area_cache = vma->vm_start;
-                               mm->cached_hole_size = largest_hole;
-                       }
-               }
-
-               /* remember the largest hole we saw so far */
-               if (addr + largest_hole < vma->vm_start)
-                       largest_hole = vma->vm_start - addr;
-
-               /* try just below the current vma->vm_start */
-               addr = vma->vm_start-len;
-       } while (len <= vma->vm_start);
-
-fail:
-       /*
-        * if hint left us with no space for the requested
-        * mapping then try again:
-        */
-       if (first_time) {
-               mm->free_area_cache = base;
-               largest_hole = 0;
-               first_time = 0;
-               goto try_again;
-       }
-       /*
-        * A failed mmap() very likely causes application failure,
-        * so fall back to the bottom-up function here. This scenario
-        * can happen with large stack limits and large mmap()
-        * allocations.
-        */
-       mm->free_area_cache = TASK_UNMAPPED_BASE;
-       mm->cached_hole_size = ~0UL;
-       addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
-       /*
-        * Restore the topdown base:
-        */
-       mm->free_area_cache = base;
-       mm->cached_hole_size = ~0UL;
-
-       return addr;
-}
-
-static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
-{
-       unsigned long addr = 0;
-       struct vm_area_struct *vma;
-
-       vma = find_vma(current->mm, addr);
-       while (addr + len <= 0x100000000UL) {
-               BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
-
-               if (! __within_hugepage_low_range(addr, len, segmask)) {
-                       addr = ALIGN(addr+1, 1<<SID_SHIFT);
-                       vma = find_vma(current->mm, addr);
-                       continue;
-               }
-
-               if (!vma || (addr + len) <= vma->vm_start)
-                       return addr;
-               addr = ALIGN(vma->vm_end, HPAGE_SIZE);
-               /* Depending on segmask this might not be a confirmed
-                * hugepage region, so the ALIGN could have skipped
-                * some VMAs */
-               vma = find_vma(current->mm, addr);
-       }
-
-       return -ENOMEM;
-}
-
-static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
-{
-       unsigned long addr = 0x100000000UL;
-       struct vm_area_struct *vma;
-
-       vma = find_vma(current->mm, addr);
-       while (addr + len <= TASK_SIZE_USER64) {
-               BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
-
-               if (! __within_hugepage_high_range(addr, len, areamask)) {
-                       addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
-                       vma = find_vma(current->mm, addr);
-                       continue;
-               }
-
-               if (!vma || (addr + len) <= vma->vm_start)
-                       return addr;
-               addr = ALIGN(vma->vm_end, HPAGE_SIZE);
-               /* Depending on areamask this might not be a confirmed
-                * hugepage region, so the ALIGN could have skipped
-                * some VMAs */
-               vma = find_vma(current->mm, addr);
-       }
-
-       return -ENOMEM;
-}
-
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-                                       unsigned long len, unsigned long pgoff,
-                                       unsigned long flags)
-{
-       int lastshift;
-       u16 areamask, curareas;
-
-       if (len & ~HPAGE_MASK)
-               return -EINVAL;
-
-       if (!cpu_has_feature(CPU_FTR_16M_PAGE))
-               return -EINVAL;
-
-       if (test_thread_flag(TIF_32BIT)) {
-               curareas = current->mm->context.low_htlb_areas;
-
-               /* First see if we can do the mapping in the existing
-                * low areas */
-               addr = htlb_get_low_area(len, curareas);
-               if (addr != -ENOMEM)
-                       return addr;
-
-               lastshift = 0;
-               for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
-                    ! lastshift; areamask >>=1) {
-                       if (areamask & 1)
-                               lastshift = 1;
-
-                       addr = htlb_get_low_area(len, curareas | areamask);
-                       if ((addr != -ENOMEM)
-                           && open_low_hpage_areas(current->mm, areamask) == 0)
-                               return addr;
-               }
-       } else {
-               curareas = current->mm->context.high_htlb_areas;
-
-               /* First see if we can do the mapping in the existing
-                * high areas */
-               addr = htlb_get_high_area(len, curareas);
-               if (addr != -ENOMEM)
-                       return addr;
-
-               lastshift = 0;
-               for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
-                    ! lastshift; areamask >>=1) {
-                       if (areamask & 1)
-                               lastshift = 1;
-
-                       addr = htlb_get_high_area(len, curareas | areamask);
-                       if ((addr != -ENOMEM)
-                           && open_high_hpage_areas(current->mm, areamask) == 0)
-                               return addr;
-               }
-       }
-       printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
-              " enough areas\n");
-       return -ENOMEM;
-}
-
-int hash_huge_page(struct mm_struct *mm, unsigned long access,
-                  unsigned long ea, unsigned long vsid, int local)
-{
-       pte_t *ptep;
-       unsigned long va, vpn;
-       pte_t old_pte, new_pte;
-       unsigned long rflags, prpn;
-       long slot;
-       int err = 1;
-
-       spin_lock(&mm->page_table_lock);
-
-       ptep = huge_pte_offset(mm, ea);
-
-       /* Search the Linux page table for a match with va */
-       va = (vsid << 28) | (ea & 0x0fffffff);
-       vpn = va >> HPAGE_SHIFT;
-
-       /*
-        * If no pte found or not present, send the problem up to
-        * do_page_fault
-        */
-       if (unlikely(!ptep || pte_none(*ptep)))
-               goto out;
-
-/*     BUG_ON(pte_bad(*ptep)); */
-
-       /* 
-        * Check the user's access rights to the page.  If access should be
-        * prevented then send the problem up to do_page_fault.
-        */
-       if (unlikely(access & ~pte_val(*ptep)))
-               goto out;
-       /*
-        * At this point, we have a pte (old_pte) which can be used to build
-        * or update an HPTE. There are 2 cases:
-        *
-        * 1. There is a valid (present) pte with no associated HPTE (this is 
-        *      the most common case)
-        * 2. There is a valid (present) pte with an associated HPTE. The
-        *      current values of the pp bits in the HPTE prevent access
-        *      because we are doing software DIRTY bit management and the
-        *      page is currently not DIRTY. 
-        */
-
-
-       old_pte = *ptep;
-       new_pte = old_pte;
-
-       rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
-       /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
-       rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
-
-       /* Check if pte already has an hpte (case 2) */
-       if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
-               /* There MIGHT be an HPTE for this pte */
-               unsigned long hash, slot;
-
-               hash = hpt_hash(vpn, 1);
-               if (pte_val(old_pte) & _PAGE_SECONDARY)
-                       hash = ~hash;
-               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
-
-               if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
-                       pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
-       }
-
-       if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
-               unsigned long hash = hpt_hash(vpn, 1);
-               unsigned long hpte_group;
-
-               prpn = pte_pfn(old_pte);
-
-repeat:
-               hpte_group = ((hash & htab_hash_mask) *
-                             HPTES_PER_GROUP) & ~0x7UL;
-
-               /* Update the linux pte with the HPTE slot */
-               pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
-               pte_val(new_pte) |= _PAGE_HASHPTE;
-
-               /* Add in WIMG bits */
-               /* XXX We should store these in the pte */
-               rflags |= _PAGE_COHERENT;
-
-               slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-                                         HPTE_V_LARGE, rflags);
-
-               /* Primary is full, try the secondary */
-               if (unlikely(slot == -1)) {
-                       pte_val(new_pte) |= _PAGE_SECONDARY;
-                       hpte_group = ((~hash & htab_hash_mask) *
-                                     HPTES_PER_GROUP) & ~0x7UL; 
-                       slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-                                                 HPTE_V_LARGE |
-                                                 HPTE_V_SECONDARY,
-                                                 rflags);
-                       if (slot == -1) {
-                               if (mftb() & 0x1)
-                                       hpte_group = ((hash & htab_hash_mask) *
-                                                     HPTES_PER_GROUP)&~0x7UL;
-
-                               ppc_md.hpte_remove(hpte_group);
-                               goto repeat;
-                        }
-               }
-
-               if (unlikely(slot == -2))
-                       panic("hash_huge_page: pte_insert failed\n");
-
-               pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
-
-               /* 
-                * No need to use ldarx/stdcx here because all who
-                * might be updating the pte will hold the
-                * page_table_lock
-                */
-               *ptep = new_pte;
-       }
-
-       err = 0;
-
- out:
-       spin_unlock(&mm->page_table_lock);
-
-       return err;
-}
diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c
deleted file mode 100644 (file)
index c65b87b..0000000
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * c 2001 PPC 64 Team, IBM Corp
- * 
- *      This program is free software; you can redistribute it and/or
- *      modify it under the terms of the GNU General Public License
- *      as published by the Free Software Foundation; either version
- *      2 of the License, or (at your option) any later version.
- */
-
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
-#include <asm/semaphore.h>
-#include <asm/imalloc.h>
-#include <asm/cacheflush.h>
-
-static DECLARE_MUTEX(imlist_sem);
-struct vm_struct * imlist = NULL;
-
-static int get_free_im_addr(unsigned long size, unsigned long *im_addr)
-{
-       unsigned long addr;
-       struct vm_struct **p, *tmp;
-
-       addr = ioremap_bot;
-       for (p = &imlist; (tmp = *p) ; p = &tmp->next) {
-               if (size + addr < (unsigned long) tmp->addr)
-                       break;
-               if ((unsigned long)tmp->addr >= ioremap_bot)
-                       addr = tmp->size + (unsigned long) tmp->addr;
-               if (addr >= IMALLOC_END-size)
-                       return 1;
-       }
-       *im_addr = addr;
-
-       return 0;
-}
-
-/* Return whether the region described by v_addr and size is a subset
- * of the region described by parent
- */
-static inline int im_region_is_subset(unsigned long v_addr, unsigned long size,
-                       struct vm_struct *parent)
-{
-       return (int) (v_addr >= (unsigned long) parent->addr &&
-                     v_addr < (unsigned long) parent->addr + parent->size &&
-                     size < parent->size);
-}
-
-/* Return whether the region described by v_addr and size is a superset
- * of the region described by child
- */
-static int im_region_is_superset(unsigned long v_addr, unsigned long size,
-               struct vm_struct *child)
-{
-       struct vm_struct parent;
-
-       parent.addr = (void *) v_addr;
-       parent.size = size;
-
-       return im_region_is_subset((unsigned long) child->addr, child->size,
-                       &parent);
-}
-
-/* Return whether the region described by v_addr and size overlaps
- * the region described by vm.  Overlapping regions meet the
- * following conditions:
- * 1) The regions share some part of the address space
- * 2) The regions aren't identical
- * 3) Neither region is a subset of the other
- */
-static int im_region_overlaps(unsigned long v_addr, unsigned long size,
-                    struct vm_struct *vm)
-{
-       if (im_region_is_superset(v_addr, size, vm))
-               return 0;
-
-       return (v_addr + size > (unsigned long) vm->addr + vm->size &&
-               v_addr < (unsigned long) vm->addr + vm->size) ||
-              (v_addr < (unsigned long) vm->addr &&
-               v_addr + size > (unsigned long) vm->addr);
-}
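
    /*
     * Sketch only, not kernel code: im_region_overlaps() above reports only
     * partial overlaps; identical regions, subsets and supersets are excluded
     * (the superset case explicitly, the others because neither boundary test
     * can hold).  The same predicate over plain [start, end) intervals;
     * ranges_partially_overlap() is a hypothetical name.
     */
    static int ranges_partially_overlap(unsigned long a_start, unsigned long a_size,
                                        unsigned long b_start, unsigned long b_size)
    {
            unsigned long a_end = a_start + a_size;
            unsigned long b_end = b_start + b_size;

            /* a starts inside b and ends past it, or the mirror image */
            return (a_start > b_start && a_start < b_end && a_end > b_end) ||
                   (a_start < b_start && a_end > b_start && a_end < b_end);
    }
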
-
-/* Determine imalloc status of region described by v_addr and size.
- * Can return one of the following:
- * IM_REGION_UNUSED   -  Entire region is unallocated in imalloc space.
- * IM_REGION_SUBSET -    Region is a subset of a region that is already
- *                      allocated in imalloc space.
- *                      vm will be assigned to a ptr to the parent region.
- * IM_REGION_EXISTS -    Exact region already allocated in imalloc space.
- *                       vm will be assigned to a ptr to the existing imlist
- *                       member.
- * IM_REGION_OVERLAP  -  Region overlaps an allocated region in imalloc space.
- * IM_REGION_SUPERSET -  Region is a superset of a region that is already
- *                       allocated in imalloc space.
- */
-static int im_region_status(unsigned long v_addr, unsigned long size,
-                   struct vm_struct **vm)
-{
-       struct vm_struct *tmp;
-
-       for (tmp = imlist; tmp; tmp = tmp->next)
-               if (v_addr < (unsigned long) tmp->addr + tmp->size)
-                       break;
-
-       if (tmp) {
-               if (im_region_overlaps(v_addr, size, tmp))
-                       return IM_REGION_OVERLAP;
-
-               *vm = tmp;
-               if (im_region_is_subset(v_addr, size, tmp)) {
-                       /* Return with tmp pointing to superset */
-                       return IM_REGION_SUBSET;
-               }
-               if (im_region_is_superset(v_addr, size, tmp)) {
-                       /* Return with tmp pointing to first subset */
-                       return IM_REGION_SUPERSET;
-               }
-               else if (v_addr == (unsigned long) tmp->addr &&
-                        size == tmp->size) {
-                       /* Return with tmp pointing to exact region */
-                       return IM_REGION_EXISTS;
-               }
-       }
-
-       *vm = NULL;
-       return IM_REGION_UNUSED;
-}
-
-static struct vm_struct * split_im_region(unsigned long v_addr, 
-               unsigned long size, struct vm_struct *parent)
-{
-       struct vm_struct *vm1 = NULL;
-       struct vm_struct *vm2 = NULL;
-       struct vm_struct *new_vm = NULL;
-       
-       vm1 = (struct vm_struct *) kmalloc(sizeof(*vm1), GFP_KERNEL);
-       if (vm1 == NULL) {
-               printk(KERN_ERR "%s() out of memory\n", __FUNCTION__);
-               return NULL;
-       }
-
-       if (v_addr == (unsigned long) parent->addr) {
-               /* Use existing parent vm_struct to represent child, allocate
-                * new one for the remainder of parent range
-                */
-               vm1->size = parent->size - size;
-               vm1->addr = (void *) (v_addr + size);
-               vm1->next = parent->next;
-
-               parent->size = size;
-               parent->next = vm1;
-               new_vm = parent;
-       } else if (v_addr + size == (unsigned long) parent->addr + 
-                       parent->size) {
-               /* Allocate new vm_struct to represent child, use existing
-                * parent one for remainder of parent range
-                */
-               vm1->size = size;
-               vm1->addr = (void *) v_addr;
-               vm1->next = parent->next;
-               new_vm = vm1;
-
-               parent->size -= size;
-               parent->next = vm1;
-       } else {
-               /* Allocate two new vm_structs for the new child and 
-                * uppermost remainder, and use existing parent one for the
-                * lower remainder of parent range
-                */
-               vm2 = (struct vm_struct *) kmalloc(sizeof(*vm2), GFP_KERNEL);
-               if (vm2 == NULL) {
-                       printk(KERN_ERR "%s() out of memory\n", __FUNCTION__);
-                       kfree(vm1);
-                       return NULL;
-               }
-
-               vm1->size = size;
-               vm1->addr = (void *) v_addr;
-               vm1->next = vm2;
-               new_vm = vm1;
-
-               vm2->size = ((unsigned long) parent->addr + parent->size) - 
-                               (v_addr + size);
-               vm2->addr = (void *) v_addr + size;
-               vm2->next = parent->next;
-
-               parent->size = v_addr - (unsigned long) parent->addr;
-               parent->next = vm1;
-       }
-
-       return new_vm;
-}
-
-static struct vm_struct * __add_new_im_area(unsigned long req_addr, 
-                                           unsigned long size)
-{
-       struct vm_struct **p, *tmp, *area;
-               
-       for (p = &imlist; (tmp = *p) ; p = &tmp->next) {
-               if (req_addr + size <= (unsigned long)tmp->addr)
-                       break;
-       }
-       
-       area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
-       if (!area)
-               return NULL;
-       area->flags = 0;
-       area->addr = (void *)req_addr;
-       area->size = size;
-       area->next = *p;
-       *p = area;
-
-       return area;
-}
-
-static struct vm_struct * __im_get_area(unsigned long req_addr, 
-                                       unsigned long size,
-                                       int criteria)
-{
-       struct vm_struct *tmp;
-       int status;
-
-       status = im_region_status(req_addr, size, &tmp);
-       if ((criteria & status) == 0) {
-               return NULL;
-       }
-       
-       switch (status) {
-       case IM_REGION_UNUSED:
-               tmp = __add_new_im_area(req_addr, size);
-               break;
-       case IM_REGION_SUBSET:
-               tmp = split_im_region(req_addr, size, tmp);
-               break;
-       case IM_REGION_EXISTS:
-               /* Return requested region */
-               break;
-       case IM_REGION_SUPERSET:
-               /* Return first existing subset of requested region */
-               break;
-       default:
-               printk(KERN_ERR "%s() unexpected imalloc region status\n",
-                               __FUNCTION__);
-               tmp = NULL;
-       }
-
-       return tmp;
-}
-
-struct vm_struct * im_get_free_area(unsigned long size)
-{
-       struct vm_struct *area;
-       unsigned long addr;
-       
-       down(&imlist_sem);
-       if (get_free_im_addr(size, &addr)) {
-               printk(KERN_ERR "%s() cannot obtain addr for size 0x%lx\n",
-                               __FUNCTION__, size);
-               area = NULL;
-               goto next_im_done;
-       }
-
-       area = __im_get_area(addr, size, IM_REGION_UNUSED);
-       if (area == NULL) {
-               printk(KERN_ERR 
-                      "%s() cannot obtain area for addr 0x%lx size 0x%lx\n",
-                       __FUNCTION__, addr, size);
-       }
-next_im_done:
-       up(&imlist_sem);
-       return area;
-}
-
-struct vm_struct * im_get_area(unsigned long v_addr, unsigned long size,
-               int criteria)
-{
-       struct vm_struct *area;
-
-       down(&imlist_sem);
-       area = __im_get_area(v_addr, size, criteria);
-       up(&imlist_sem);
-       return area;
-}
-
-void im_free(void * addr)
-{
-       struct vm_struct **p, *tmp;
-  
-       if (!addr)
-               return;
-       if ((unsigned long) addr & ~PAGE_MASK) {
-               printk(KERN_ERR "Trying to %s bad address (%p)\n", __FUNCTION__,
-                       addr);
-               return;
-       }
-       down(&imlist_sem);
-       for (p = &imlist ; (tmp = *p) ; p = &tmp->next) {
-               if (tmp->addr == addr) {
-                       *p = tmp->next;
-
-                       /* XXX: do we need the lock? */
-                       spin_lock(&init_mm.page_table_lock);
-                       unmap_vm_area(tmp);
-                       spin_unlock(&init_mm.page_table_lock);
-
-                       kfree(tmp);
-                       up(&imlist_sem);
-                       return;
-               }
-       }
-       up(&imlist_sem);
-       printk(KERN_ERR "Trying to %s nonexistent area (%p)\n", __FUNCTION__,
-                       addr);
-}
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
deleted file mode 100644 (file)
index c2157c9..0000000
+++ /dev/null
@@ -1,870 +0,0 @@
-/*
- *  PowerPC version 
- *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
- *    Copyright (C) 1996 Paul Mackerras
- *  Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
- *
- *  Derived from "arch/i386/mm/init.c"
- *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
- *
- *  Dave Engebretsen <engebret@us.ibm.com>
- *      Rework for PPC64 port.
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/config.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/stddef.h>
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/highmem.h>
-#include <linux/idr.h>
-#include <linux/nodemask.h>
-#include <linux/module.h>
-
-#include <asm/pgalloc.h>
-#include <asm/page.h>
-#include <asm/prom.h>
-#include <asm/lmb.h>
-#include <asm/rtas.h>
-#include <asm/io.h>
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/uaccess.h>
-#include <asm/smp.h>
-#include <asm/machdep.h>
-#include <asm/tlb.h>
-#include <asm/eeh.h>
-#include <asm/processor.h>
-#include <asm/mmzone.h>
-#include <asm/cputable.h>
-#include <asm/ppcdebug.h>
-#include <asm/sections.h>
-#include <asm/system.h>
-#include <asm/iommu.h>
-#include <asm/abs_addr.h>
-#include <asm/vdso.h>
-#include <asm/imalloc.h>
-
-#if PGTABLE_RANGE > USER_VSID_RANGE
-#warning Limited user VSID range means pagetable space is wasted
-#endif
-
-#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
-#warning TASK_SIZE is smaller than it needs to be.
-#endif
-
-int mem_init_done;
-unsigned long ioremap_bot = IMALLOC_BASE;
-static unsigned long phbs_io_bot = PHBS_IO_BASE;
-
-extern pgd_t swapper_pg_dir[];
-extern struct task_struct *current_set[NR_CPUS];
-
-unsigned long klimit = (unsigned long)_end;
-
-unsigned long _SDR1=0;
-unsigned long _ASR=0;
-
-/* max amount of RAM to use */
-unsigned long __max_memory;
-
-/* info on what we think the IO hole is */
-unsigned long  io_hole_start;
-unsigned long  io_hole_size;
-
-void show_mem(void)
-{
-       unsigned long total = 0, reserved = 0;
-       unsigned long shared = 0, cached = 0;
-       struct page *page;
-       pg_data_t *pgdat;
-       unsigned long i;
-
-       printk("Mem-info:\n");
-       show_free_areas();
-       printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
-       for_each_pgdat(pgdat) {
-               for (i = 0; i < pgdat->node_spanned_pages; i++) {
-                       page = pgdat_page_nr(pgdat, i);
-                       total++;
-                       if (PageReserved(page))
-                               reserved++;
-                       else if (PageSwapCache(page))
-                               cached++;
-                       else if (page_count(page))
-                               shared += page_count(page) - 1;
-               }
-       }
-       printk("%ld pages of RAM\n", total);
-       printk("%ld reserved pages\n", reserved);
-       printk("%ld pages shared\n", shared);
-       printk("%ld pages swap cached\n", cached);
-}
-
-#ifdef CONFIG_PPC_ISERIES
-
-void __iomem *ioremap(unsigned long addr, unsigned long size)
-{
-       return (void __iomem *)addr;
-}
-
-extern void __iomem *__ioremap(unsigned long addr, unsigned long size,
-                      unsigned long flags)
-{
-       return (void __iomem *)addr;
-}
-
-void iounmap(volatile void __iomem *addr)
-{
-       return;
-}
-
-#else
-
-/*
- * map_io_page currently only called by __ioremap
- * map_io_page adds an entry to the ioremap page table
- * and adds an entry to the HPT, possibly bolting it
- */
-static int map_io_page(unsigned long ea, unsigned long pa, int flags)
-{
-       pgd_t *pgdp;
-       pud_t *pudp;
-       pmd_t *pmdp;
-       pte_t *ptep;
-       unsigned long vsid;
-
-       if (mem_init_done) {
-               spin_lock(&init_mm.page_table_lock);
-               pgdp = pgd_offset_k(ea);
-               pudp = pud_alloc(&init_mm, pgdp, ea);
-               if (!pudp)
-                       return -ENOMEM;
-               pmdp = pmd_alloc(&init_mm, pudp, ea);
-               if (!pmdp)
-                       return -ENOMEM;
-               ptep = pte_alloc_kernel(&init_mm, pmdp, ea);
-               if (!ptep)
-                       return -ENOMEM;
-               set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
-                                                         __pgprot(flags)));
-               spin_unlock(&init_mm.page_table_lock);
-       } else {
-               unsigned long va, vpn, hash, hpteg;
-
-               /*
-                * If the mm subsystem is not fully up, we cannot create a
-                * linux page table entry for this mapping.  Simply bolt an
-                * entry in the hardware page table.
-                */
-               vsid = get_kernel_vsid(ea);
-               va = (vsid << 28) | (ea & 0xFFFFFFF);
-               vpn = va >> PAGE_SHIFT;
-
-               hash = hpt_hash(vpn, 0);
-
-               hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
-
-               /* Panic if a pte group is full */
-               if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT,
-                                      HPTE_V_BOLTED,
-                                      _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX)
-                   == -1) {
-                       panic("map_io_page: could not insert mapping");
-               }
-       }
-       return 0;
-}
-
-
-static void __iomem * __ioremap_com(unsigned long addr, unsigned long pa,
-                           unsigned long ea, unsigned long size,
-                           unsigned long flags)
-{
-       unsigned long i;
-
-       if ((flags & _PAGE_PRESENT) == 0)
-               flags |= pgprot_val(PAGE_KERNEL);
-
-       for (i = 0; i < size; i += PAGE_SIZE)
-               if (map_io_page(ea+i, pa+i, flags))
-                       return NULL;
-
-       return (void __iomem *) (ea + (addr & ~PAGE_MASK));
-}
-
-
-void __iomem *
-ioremap(unsigned long addr, unsigned long size)
-{
-       return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED);
-}
-
-void __iomem * __ioremap(unsigned long addr, unsigned long size,
-                        unsigned long flags)
-{
-       unsigned long pa, ea;
-       void __iomem *ret;
-
-       /*
-        * Choose an address to map it to.
-        * Once the imalloc system is running, we use it.
-        * Before that, we map using addresses going
-        * up from ioremap_bot.  imalloc will use
-        * the addresses from ioremap_bot through
-        * IMALLOC_END
-        * 
-        */
-       pa = addr & PAGE_MASK;
-       size = PAGE_ALIGN(addr + size) - pa;
-
-       if (size == 0)
-               return NULL;
-
-       if (mem_init_done) {
-               struct vm_struct *area;
-               area = im_get_free_area(size);
-               if (area == NULL)
-                       return NULL;
-               ea = (unsigned long)(area->addr);
-               ret = __ioremap_com(addr, pa, ea, size, flags);
-               if (!ret)
-                       im_free(area->addr);
-       } else {
-               ea = ioremap_bot;
-               ret = __ioremap_com(addr, pa, ea, size, flags);
-               if (ret)
-                       ioremap_bot += size;
-       }
-       return ret;
-}
-
-#define IS_PAGE_ALIGNED(_val) ((_val) == ((_val) & PAGE_MASK))
-
-int __ioremap_explicit(unsigned long pa, unsigned long ea,
-                      unsigned long size, unsigned long flags)
-{
-       struct vm_struct *area;
-       void __iomem *ret;
-       
-       /* For now, require page-aligned values for pa, ea, and size */
-       if (!IS_PAGE_ALIGNED(pa) || !IS_PAGE_ALIGNED(ea) ||
-           !IS_PAGE_ALIGNED(size)) {
-               printk(KERN_ERR "unaligned value in %s\n", __FUNCTION__);
-               return 1;
-       }
-       
-       if (!mem_init_done) {
-               /* Two things to consider in this case:
-                * 1) No records will be kept (imalloc, etc) that the region
-                *    has been remapped
-                * 2) It won't be easy to iounmap() the region later (because
-                *    of 1)
-                */
-               ;
-       } else {
-               area = im_get_area(ea, size,
-                       IM_REGION_UNUSED|IM_REGION_SUBSET|IM_REGION_EXISTS);
-               if (area == NULL) {
-                       /* Expected when PHB-dlpar is in play */
-                       return 1;
-               }
-               if (ea != (unsigned long) area->addr) {
-                       printk(KERN_ERR "unexpected addr return from "
-                              "im_get_area\n");
-                       return 1;
-               }
-       }
-       
-       ret = __ioremap_com(pa, pa, ea, size, flags);
-       if (ret == NULL) {
-               printk(KERN_ERR "ioremap_explicit() allocation failure !\n");
-               return 1;
-       }
-       if (ret != (void *) ea) {
-               printk(KERN_ERR "__ioremap_com() returned unexpected addr\n");
-               return 1;
-       }
-
-       return 0;
-}
-
-/*  
- * Unmap an IO region and remove it from imalloc'd list.
- * Access to IO memory should be serialized by driver.
- * This code is modeled after vmalloc code - unmap_vm_area()
- *
- * XXX what about calls before mem_init_done (ie python_countermeasures())
- */
-void iounmap(volatile void __iomem *token)
-{
-       void *addr;
-
-       if (!mem_init_done)
-               return;
-       
-       addr = (void *) ((unsigned long __force) token & PAGE_MASK);
-
-       im_free(addr);
-}
-
-static int iounmap_subset_regions(unsigned long addr, unsigned long size)
-{
-       struct vm_struct *area;
-
-       /* Check whether subsets of this region exist */
-       area = im_get_area(addr, size, IM_REGION_SUPERSET);
-       if (area == NULL)
-               return 1;
-
-       while (area) {
-               iounmap((void __iomem *) area->addr);
-               area = im_get_area(addr, size,
-                               IM_REGION_SUPERSET);
-       }
-
-       return 0;
-}
-
-int iounmap_explicit(volatile void __iomem *start, unsigned long size)
-{
-       struct vm_struct *area;
-       unsigned long addr;
-       int rc;
-       
-       addr = (unsigned long __force) start & PAGE_MASK;
-
-       /* Verify that the region either exists or is a subset of an existing
-        * region.  In the latter case, split the parent region to create 
-        * the exact region 
-        */
-       area = im_get_area(addr, size, 
-                           IM_REGION_EXISTS | IM_REGION_SUBSET);
-       if (area == NULL) {
-               /* Determine whether subset regions exist.  If so, unmap */
-               rc = iounmap_subset_regions(addr, size);
-               if (rc) {
-                       printk(KERN_ERR
-                              "%s() cannot unmap nonexistent range 0x%lx\n",
-                               __FUNCTION__, addr);
-                       return 1;
-               }
-       } else {
-               iounmap((void __iomem *) area->addr);
-       }
-       /*
-        * FIXME! This can't be right:
-       iounmap(area->addr);
-        * Maybe it should be "iounmap(area);"
-        */
-       return 0;
-}
-
-#endif
-
-EXPORT_SYMBOL(ioremap);
-EXPORT_SYMBOL(__ioremap);
-EXPORT_SYMBOL(iounmap);
-
-void free_initmem(void)
-{
-       unsigned long addr;
-
-       addr = (unsigned long)__init_begin;
-       for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
-               memset((void *)addr, 0xcc, PAGE_SIZE);
-               ClearPageReserved(virt_to_page(addr));
-               set_page_count(virt_to_page(addr), 1);
-               free_page(addr);
-               totalram_pages++;
-       }
-       printk ("Freeing unused kernel memory: %luk freed\n",
-               ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       if (start < end)
-               printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
-       for (; start < end; start += PAGE_SIZE) {
-               ClearPageReserved(virt_to_page(start));
-               set_page_count(virt_to_page(start), 1);
-               free_page(start);
-               totalram_pages++;
-       }
-}
-#endif
-
-static DEFINE_SPINLOCK(mmu_context_lock);
-static DEFINE_IDR(mmu_context_idr);
-
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-       int index;
-       int err;
-
-again:
-       if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
-               return -ENOMEM;
-
-       spin_lock(&mmu_context_lock);
-       err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
-       spin_unlock(&mmu_context_lock);
-
-       if (err == -EAGAIN)
-               goto again;
-       else if (err)
-               return err;
-
-       if (index > MAX_CONTEXT) {
-               idr_remove(&mmu_context_idr, index);
-               return -ENOMEM;
-       }
-
-       mm->context.id = index;
-
-       return 0;
-}
-
-void destroy_context(struct mm_struct *mm)
-{
-       spin_lock(&mmu_context_lock);
-       idr_remove(&mmu_context_idr, mm->context.id);
-       spin_unlock(&mmu_context_lock);
-
-       mm->context.id = NO_CONTEXT;
-}
-
-/*
- * Do very early mm setup.
- */
-void __init mm_init_ppc64(void)
-{
-#ifndef CONFIG_PPC_ISERIES
-       unsigned long i;
-#endif
-
-       ppc64_boot_msg(0x100, "MM Init");
-
-       /* This is the story of the IO hole... please, keep seated,
-        * unfortunately, we are out of oxygen masks at the moment.
-        * So we need some rough way to tell where your big IO hole
-        * is. On pmac, it's between 2G and 4G, on POWER3, it's around
-        * that area as well, on POWER4 we don't have one, etc...
-        * We need that as a "hint" when sizing the TCE table on POWER3
-        * So far, the simplest way that seems to work well enough for us is
-        * to just assume that the first discontinuity in our physical
-        * RAM layout is the IO hole. That may not be correct in the future
-        * (and isn't on iSeries but then we don't care ;)
-        */
-
-#ifndef CONFIG_PPC_ISERIES
-       for (i = 1; i < lmb.memory.cnt; i++) {
-               unsigned long base, prevbase, prevsize;
-
-               prevbase = lmb.memory.region[i-1].base;
-               prevsize = lmb.memory.region[i-1].size;
-               base = lmb.memory.region[i].base;
-               if (base > (prevbase + prevsize)) {
-                       io_hole_start = prevbase + prevsize;
-                       io_hole_size = base  - (prevbase + prevsize);
-                       break;
-               }
-       }
-#endif /* CONFIG_PPC_ISERIES */
-       if (io_hole_start)
-               printk("IO Hole assumed to be %lx -> %lx\n",
-                      io_hole_start, io_hole_start + io_hole_size - 1);
-
-       ppc64_boot_msg(0x100, "MM Init Done");
-}
-
-/*
- * This is called by /dev/mem to know if a given address has to
- * be mapped non-cacheable or not
- */
-int page_is_ram(unsigned long pfn)
-{
-       int i;
-       unsigned long paddr = (pfn << PAGE_SHIFT);
-
-       for (i=0; i < lmb.memory.cnt; i++) {
-               unsigned long base;
-
-               base = lmb.memory.region[i].base;
-
-               if ((paddr >= base) &&
-                       (paddr < (base + lmb.memory.region[i].size))) {
-                       return 1;
-               }
-       }
-
-       return 0;
-}
-EXPORT_SYMBOL(page_is_ram);
-
-/*
- * Initialize the bootmem system and give it all the memory we
- * have available.
- */
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init do_init_bootmem(void)
-{
-       unsigned long i;
-       unsigned long start, bootmap_pages;
-       unsigned long total_pages = lmb_end_of_DRAM() >> PAGE_SHIFT;
-       int boot_mapsize;
-
-       /*
-        * Find an area to use for the bootmem bitmap.  Calculate the size of
-        * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE.
-        * Add 1 additional page in case the address isn't page-aligned.
-        */
-       bootmap_pages = bootmem_bootmap_pages(total_pages);
-
-       start = lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
-       BUG_ON(!start);
-
-       boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages);
-
-       max_pfn = max_low_pfn;
-
-       /* Add all physical memory to the bootmem map, mark each area
-        * present.
-        */
-       for (i=0; i < lmb.memory.cnt; i++)
-               free_bootmem(lmb.memory.region[i].base,
-                            lmb_size_bytes(&lmb.memory, i));
-
-       /* reserve the sections we're already using */
-       for (i=0; i < lmb.reserved.cnt; i++)
-               reserve_bootmem(lmb.reserved.region[i].base,
-                               lmb_size_bytes(&lmb.reserved, i));
-
-       for (i=0; i < lmb.memory.cnt; i++)
-               memory_present(0, lmb_start_pfn(&lmb.memory, i),
-                              lmb_end_pfn(&lmb.memory, i));
-}
-
-/*
- * paging_init() sets up the page tables - in fact we've already done this.
- */
-void __init paging_init(void)
-{
-       unsigned long zones_size[MAX_NR_ZONES];
-       unsigned long zholes_size[MAX_NR_ZONES];
-       unsigned long total_ram = lmb_phys_mem_size();
-       unsigned long top_of_ram = lmb_end_of_DRAM();
-
-       printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
-              top_of_ram, total_ram);
-       printk(KERN_INFO "Memory hole size: %ldMB\n",
-              (top_of_ram - total_ram) >> 20);
-       /*
-        * All pages are DMA-able so we put them all in the DMA zone.
-        */
-       memset(zones_size, 0, sizeof(zones_size));
-       memset(zholes_size, 0, sizeof(zholes_size));
-
-       zones_size[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
-       zholes_size[ZONE_DMA] = (top_of_ram - total_ram) >> PAGE_SHIFT;
-
-       free_area_init_node(0, NODE_DATA(0), zones_size,
-                           __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size);
-}
-#endif /* ! CONFIG_NEED_MULTIPLE_NODES */
-
-static struct kcore_list kcore_vmem;
-
-static int __init setup_kcore(void)
-{
-       int i;
-
-       for (i=0; i < lmb.memory.cnt; i++) {
-               unsigned long base, size;
-               struct kcore_list *kcore_mem;
-
-               base = lmb.memory.region[i].base;
-               size = lmb.memory.region[i].size;
-
-               /* GFP_ATOMIC to avoid might_sleep warnings during boot */
-               kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC);
-               if (!kcore_mem)
-                       panic("mem_init: kmalloc failed\n");
-
-               kclist_add(kcore_mem, __va(base), size);
-       }
-
-       kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
-
-       return 0;
-}
-module_init(setup_kcore);
-
-void __init mem_init(void)
-{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-       int nid;
-#endif
-       pg_data_t *pgdat;
-       unsigned long i;
-       struct page *page;
-       unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
-
-       num_physpages = max_low_pfn;    /* RAM is assumed contiguous */
-       high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
-
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-        for_each_online_node(nid) {
-               if (NODE_DATA(nid)->node_spanned_pages != 0) {
-                       printk("freeing bootmem node %x\n", nid);
-                       totalram_pages +=
-                               free_all_bootmem_node(NODE_DATA(nid));
-               }
-       }
-#else
-       max_mapnr = num_physpages;
-       totalram_pages += free_all_bootmem();
-#endif
-
-       for_each_pgdat(pgdat) {
-               for (i = 0; i < pgdat->node_spanned_pages; i++) {
-                       page = pgdat_page_nr(pgdat, i);
-                       if (PageReserved(page))
-                               reservedpages++;
-               }
-       }
-
-       codesize = (unsigned long)&_etext - (unsigned long)&_stext;
-       initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
-       datasize = (unsigned long)&_edata - (unsigned long)&__init_end;
-       bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
-
-       printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
-              "%luk reserved, %luk data, %luk bss, %luk init)\n",
-               (unsigned long)nr_free_pages() << (PAGE_SHIFT-10),
-               num_physpages << (PAGE_SHIFT-10),
-               codesize >> 10,
-               reservedpages << (PAGE_SHIFT-10),
-               datasize >> 10,
-               bsssize >> 10,
-               initsize >> 10);
-
-       mem_init_done = 1;
-
-       /* Initialize the vDSO */
-       vdso_init();
-}
-
-/*
- * This is called when a page has been modified by the kernel.
- * It just marks the page as not i-cache clean.  We do the i-cache
- * flush later when the page is given to a user process, if necessary.
- */
-void flush_dcache_page(struct page *page)
-{
-       if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
-               return;
-       /* avoid an atomic op if possible */
-       if (test_bit(PG_arch_1, &page->flags))
-               clear_bit(PG_arch_1, &page->flags);
-}
-EXPORT_SYMBOL(flush_dcache_page);
-
-void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
-{
-       clear_page(page);
-
-       if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
-               return;
-       /*
-        * We shouldn't have to do this, but some versions of glibc
-        * require it (ld.so assumes zero filled pages are icache clean)
-        * - Anton
-        */
-
-       /* avoid an atomic op if possible */
-       if (test_bit(PG_arch_1, &pg->flags))
-               clear_bit(PG_arch_1, &pg->flags);
-}
-EXPORT_SYMBOL(clear_user_page);
-
-void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
-                   struct page *pg)
-{
-       copy_page(vto, vfrom);
-
-       /*
-        * We should be able to use the following optimisation, however
-        * there are two problems.
-        * Firstly a bug in some versions of binutils meant PLT sections
-        * were not marked executable.
-        * Secondly the first word in the GOT section is blrl, used
-        * to establish the GOT address. Until recently the GOT was
-        * not marked executable.
-        * - Anton
-        */
-#if 0
-       if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0))
-               return;
-#endif
-
-       if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
-               return;
-
-       /* avoid an atomic op if possible */
-       if (test_bit(PG_arch_1, &pg->flags))
-               clear_bit(PG_arch_1, &pg->flags);
-}
-
-void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
-                            unsigned long addr, int len)
-{
-       unsigned long maddr;
-
-       maddr = (unsigned long)page_address(page) + (addr & ~PAGE_MASK);
-       flush_icache_range(maddr, maddr + len);
-}
-EXPORT_SYMBOL(flush_icache_user_range);
-
-/*
- * This is called at the end of handling a user page fault, when the
- * fault has been handled by updating a PTE in the linux page tables.
- * We use it to preload an HPTE into the hash table corresponding to
- * the updated linux PTE.
- * 
- * This must always be called with the mm->page_table_lock held
- */
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea,
-                     pte_t pte)
-{
-       unsigned long vsid;
-       void *pgdir;
-       pte_t *ptep;
-       int local = 0;
-       cpumask_t tmp;
-       unsigned long flags;
-
-       /* handle i-cache coherency */
-       if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
-           !cpu_has_feature(CPU_FTR_NOEXECUTE)) {
-               unsigned long pfn = pte_pfn(pte);
-               if (pfn_valid(pfn)) {
-                       struct page *page = pfn_to_page(pfn);
-                       if (!PageReserved(page)
-                           && !test_bit(PG_arch_1, &page->flags)) {
-                               __flush_dcache_icache(page_address(page));
-                               set_bit(PG_arch_1, &page->flags);
-                       }
-               }
-       }
-
-       /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
-       if (!pte_young(pte))
-               return;
-
-       pgdir = vma->vm_mm->pgd;
-       if (pgdir == NULL)
-               return;
-
-       ptep = find_linux_pte(pgdir, ea);
-       if (!ptep)
-               return;
-
-       vsid = get_vsid(vma->vm_mm->context.id, ea);
-
-       local_irq_save(flags);
-       tmp = cpumask_of_cpu(smp_processor_id());
-       if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp))
-               local = 1;
-
-       __hash_page(ea, pte_val(pte) & (_PAGE_USER|_PAGE_RW), vsid, ptep,
-                   0x300, local);
-       local_irq_restore(flags);
-}
-
-void __iomem * reserve_phb_iospace(unsigned long size)
-{
-       void __iomem *virt_addr;
-               
-       if (phbs_io_bot >= IMALLOC_BASE) 
-               panic("reserve_phb_iospace(): phb io space overflow\n");
-                       
-       virt_addr = (void __iomem *) phbs_io_bot;
-       phbs_io_bot += size;
-
-       return virt_addr;
-}
-
-static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
-{
-       memset(addr, 0, kmem_cache_size(cache));
-}
-
-static const int pgtable_cache_size[2] = {
-       PTE_TABLE_SIZE, PMD_TABLE_SIZE
-};
-static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
-       "pgd_pte_cache", "pud_pmd_cache",
-};
-
-kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
-
-void pgtable_cache_init(void)
-{
-       int i;
-
-       BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
-       BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
-       BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
-       BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);
-
-       for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
-               int size = pgtable_cache_size[i];
-               const char *name = pgtable_cache_name[i];
-
-               pgtable_cache[i] = kmem_cache_create(name,
-                                                    size, size,
-                                                    SLAB_HWCACHE_ALIGN
-                                                    | SLAB_MUST_HWCACHE_ALIGN,
-                                                    zero_ctor,
-                                                    NULL);
-               if (! pgtable_cache[i])
-                       panic("pgtable_cache_init(): could not create %s!\n",
-                             name);
-       }
-}
-
-pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
-                             unsigned long size, pgprot_t vma_prot)
-{
-       if (ppc_md.phys_mem_access_prot)
-               return ppc_md.phys_mem_access_prot(file, addr, size, vma_prot);
-
-       if (!page_is_ram(addr >> PAGE_SHIFT))
-               vma_prot = __pgprot(pgprot_val(vma_prot)
-                                   | _PAGE_GUARDED | _PAGE_NO_CACHE);
-       return vma_prot;
-}
-EXPORT_SYMBOL(phys_mem_access_prot);
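
For reference, the IO-hole heuristic in mm_init_ppc64() above simply takes the first discontinuity in the ascending physical memory region list as the hole. A small stand-alone sketch of that scan (the region values and helper names here are made up for illustration):

#include <stdio.h>

struct mem_region { unsigned long long base, size; };

/*
 * Find the first gap in an ascending, non-overlapping region list --
 * the heuristic mm_init_ppc64() uses to guess where the IO hole is.
 */
static int find_first_hole(const struct mem_region *mem, int cnt,
                           unsigned long long *hole_start,
                           unsigned long long *hole_size)
{
        int i;

        for (i = 1; i < cnt; i++) {
                unsigned long long prev_end = mem[i - 1].base + mem[i - 1].size;

                if (mem[i].base > prev_end) {
                        *hole_start = prev_end;
                        *hole_size = mem[i].base - prev_end;
                        return 1;
                }
        }
        return 0;
}

int main(void)
{
        /* Illustrative layout: 2GB of RAM, then more RAM above 4GB. */
        struct mem_region mem[] = {
                { 0x00000000ULL, 0x80000000ULL },
                { 0x100000000ULL, 0x80000000ULL },
        };
        unsigned long long start, size;

        if (find_first_hole(mem, 2, &start, &size))
                printf("IO hole assumed to be %llx -> %llx\n",
                       start, start + size - 1);
        return 0;
}
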
diff --git a/arch/ppc64/mm/mmap.c b/arch/ppc64/mm/mmap.c
deleted file mode 100644 (file)
index fe65f52..0000000
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- *  linux/arch/ppc64/mm/mmap.c
- *
- *  flexible mmap layout support
- *
- * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- *
- * Started by Ingo Molnar <mingo@elte.hu>
- */
-
-#include <linux/personality.h>
-#include <linux/mm.h>
-
-/*
- * Top of mmap area (just below the process stack).
- *
- * Leave an at least ~128 MB hole.
- */
-#define MIN_GAP (128*1024*1024)
-#define MAX_GAP (TASK_SIZE/6*5)
-
-static inline unsigned long mmap_base(void)
-{
-       unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
-
-       if (gap < MIN_GAP)
-               gap = MIN_GAP;
-       else if (gap > MAX_GAP)
-               gap = MAX_GAP;
-
-       return TASK_SIZE - (gap & PAGE_MASK);
-}
-
-static inline int mmap_is_legacy(void)
-{
-       /*
-        * Force standard allocation for 64 bit programs.
-        */
-       if (!test_thread_flag(TIF_32BIT))
-               return 1;
-
-       if (current->personality & ADDR_COMPAT_LAYOUT)
-               return 1;
-
-       if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
-               return 1;
-
-       return sysctl_legacy_va_layout;
-}
-
-/*
- * This function, called very early during the creation of a new
- * process VM image, sets up which VM layout function to use:
- */
-void arch_pick_mmap_layout(struct mm_struct *mm)
-{
-       /*
-        * Fall back to the standard layout if the personality
-        * bit is set, or if the expected stack growth is unlimited:
-        */
-       if (mmap_is_legacy()) {
-               mm->mmap_base = TASK_UNMAPPED_BASE;
-               mm->get_unmapped_area = arch_get_unmapped_area;
-               mm->unmap_area = arch_unmap_area;
-       } else {
-               mm->mmap_base = mmap_base();
-               mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-               mm->unmap_area = arch_unmap_area_topdown;
-       }
-}
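
For reference, mmap_base() above clamps the stack rlimit between MIN_GAP and MAX_GAP, page-aligns it, and places the top-down mmap base that far below TASK_SIZE. A stand-alone sketch with illustrative constants (TASK_SIZE and the page mask here are placeholders, not the kernel's values):

#include <stdio.h>

#define PAGE_MASK_SKETCH  (~0xfffULL)           /* 4 KB pages, illustrative */
#define TASK_SIZE_SKETCH  0x400000000000ULL     /* illustrative user address span */
#define MIN_GAP           (128ULL * 1024 * 1024)
#define MAX_GAP           (TASK_SIZE_SKETCH / 6 * 5)

/* Top-down mmap base: TASK_SIZE minus the clamped, page-aligned stack gap. */
static unsigned long long mmap_base_sketch(unsigned long long stack_rlimit)
{
        unsigned long long gap = stack_rlimit;

        if (gap < MIN_GAP)
                gap = MIN_GAP;
        else if (gap > MAX_GAP)
                gap = MAX_GAP;

        return TASK_SIZE_SKETCH - (gap & PAGE_MASK_SKETCH);
}

int main(void)
{
        /* An 8 MB stack rlimit is below MIN_GAP, so the gap becomes 128 MB. */
        printf("mmap base = %#llx\n", mmap_base_sketch(8ULL * 1024 * 1024));
        return 0;
}
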
diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c
deleted file mode 100644 (file)
index cb864b8..0000000
+++ /dev/null
@@ -1,779 +0,0 @@
-/*
- * pSeries NUMA support
- *
- * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/threads.h>
-#include <linux/bootmem.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/module.h>
-#include <linux/nodemask.h>
-#include <linux/cpu.h>
-#include <linux/notifier.h>
-#include <asm/lmb.h>
-#include <asm/machdep.h>
-#include <asm/abs_addr.h>
-
-static int numa_enabled = 1;
-
-static int numa_debug;
-#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
-
-#ifdef DEBUG_NUMA
-#define ARRAY_INITIALISER -1
-#else
-#define ARRAY_INITIALISER 0
-#endif
-
-int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
-       ARRAY_INITIALISER};
-char *numa_memory_lookup_table;
-cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
-int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};
-
-struct pglist_data *node_data[MAX_NUMNODES];
-bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
-static int min_common_depth;
-
-/*
- * We need somewhere to store start/span for each node until we have
- * allocated the real node_data structures.
- */
-static struct {
-       unsigned long node_start_pfn;
-       unsigned long node_end_pfn;
-       unsigned long node_present_pages;
-} init_node_data[MAX_NUMNODES] __initdata;
-
-EXPORT_SYMBOL(node_data);
-EXPORT_SYMBOL(numa_cpu_lookup_table);
-EXPORT_SYMBOL(numa_memory_lookup_table);
-EXPORT_SYMBOL(numa_cpumask_lookup_table);
-EXPORT_SYMBOL(nr_cpus_in_node);
-
-static inline void map_cpu_to_node(int cpu, int node)
-{
-       numa_cpu_lookup_table[cpu] = node;
-       if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
-               cpu_set(cpu, numa_cpumask_lookup_table[node]);
-               nr_cpus_in_node[node]++;
-       }
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static void unmap_cpu_from_node(unsigned long cpu)
-{
-       int node = numa_cpu_lookup_table[cpu];
-
-       dbg("removing cpu %lu from node %d\n", cpu, node);
-
-       if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
-               cpu_clear(cpu, numa_cpumask_lookup_table[node]);
-               nr_cpus_in_node[node]--;
-       } else {
-               printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
-                      cpu, node);
-       }
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-static struct device_node * __devinit find_cpu_node(unsigned int cpu)
-{
-       unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
-       struct device_node *cpu_node = NULL;
-       unsigned int *interrupt_server, *reg;
-       int len;
-
-       while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
-               /* Try interrupt server first */
-               interrupt_server = (unsigned int *)get_property(cpu_node,
-                                       "ibm,ppc-interrupt-server#s", &len);
-
-               len = len / sizeof(u32);
-
-               if (interrupt_server && (len > 0)) {
-                       while (len--) {
-                               if (interrupt_server[len] == hw_cpuid)
-                                       return cpu_node;
-                       }
-               } else {
-                       reg = (unsigned int *)get_property(cpu_node,
-                                                          "reg", &len);
-                       if (reg && (len > 0) && (reg[0] == hw_cpuid))
-                               return cpu_node;
-               }
-       }
-
-       return NULL;
-}
-
-/* must hold reference to node during call */
-static int *of_get_associativity(struct device_node *dev)
-{
-       return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
-}
-
-static int of_node_numa_domain(struct device_node *device)
-{
-       int numa_domain;
-       unsigned int *tmp;
-
-       if (min_common_depth == -1)
-               return 0;
-
-       tmp = of_get_associativity(device);
-       if (tmp && (tmp[0] >= min_common_depth)) {
-               numa_domain = tmp[min_common_depth];
-       } else {
-               dbg("WARNING: no NUMA information for %s\n",
-                   device->full_name);
-               numa_domain = 0;
-       }
-       return numa_domain;
-}
-
-/*
- * In theory, the "ibm,associativity" property may contain multiple
- * associativity lists because a resource may be multiply connected
- * into the machine.  This resource then has different associativity
- * characteristics relative to its multiple connections.  We ignore
- * this for now.  We also assume that all cpu and memory sets have
- * their distances represented at a common level.  This won't be
- * true for hierarchical NUMA.
- *
- * In any case the ibm,associativity-reference-points should give
- * the correct depth for a normal NUMA system.
- *
- * - Dave Hansen <haveblue@us.ibm.com>
- */
-static int __init find_min_common_depth(void)
-{
-       int depth;
-       unsigned int *ref_points;
-       struct device_node *rtas_root;
-       unsigned int len;
-
-       rtas_root = of_find_node_by_path("/rtas");
-
-       if (!rtas_root)
-               return -1;
-
-       /*
-        * this property is 2 32-bit integers, each representing a level of
-        * depth in the associativity nodes.  The first is for an SMP
-        * configuration (should be all 0's) and the second is for a normal
-        * NUMA configuration.
-        */
-       ref_points = (unsigned int *)get_property(rtas_root,
-                       "ibm,associativity-reference-points", &len);
-
-       if ((len >= 1) && ref_points) {
-               depth = ref_points[1];
-       } else {
-               dbg("WARNING: could not find NUMA "
-                   "associativity reference point\n");
-               depth = -1;
-       }
-       of_node_put(rtas_root);
-
-       return depth;
-}
-
-static int __init get_mem_addr_cells(void)
-{
-       struct device_node *memory = NULL;
-       int rc;
-
-       memory = of_find_node_by_type(memory, "memory");
-       if (!memory)
-               return 0; /* it won't matter */
-
-       rc = prom_n_addr_cells(memory);
-       return rc;
-}
-
-static int __init get_mem_size_cells(void)
-{
-       struct device_node *memory = NULL;
-       int rc;
-
-       memory = of_find_node_by_type(memory, "memory");
-       if (!memory)
-               return 0; /* it won't matter */
-       rc = prom_n_size_cells(memory);
-       return rc;
-}
-
-static unsigned long read_n_cells(int n, unsigned int **buf)
-{
-       unsigned long result = 0;
-
-       while (n--) {
-               result = (result << 32) | **buf;
-               (*buf)++;
-       }
-       return result;
-}
-
-/*
- * Figure out to which domain a cpu belongs and stick it there.
- * Return the id of the domain used.
- */
-static int numa_setup_cpu(unsigned long lcpu)
-{
-       int numa_domain = 0;
-       struct device_node *cpu = find_cpu_node(lcpu);
-
-       if (!cpu) {
-               WARN_ON(1);
-               goto out;
-       }
-
-       numa_domain = of_node_numa_domain(cpu);
-
-       if (numa_domain >= num_online_nodes()) {
-               /*
-                * POWER4 LPAR uses 0xffff as invalid node,
-                * don't warn in this case.
-                */
-               if (numa_domain != 0xffff)
-                       printk(KERN_ERR "WARNING: cpu %ld "
-                              "maps to invalid NUMA node %d\n",
-                              lcpu, numa_domain);
-               numa_domain = 0;
-       }
-out:
-       node_set_online(numa_domain);
-
-       map_cpu_to_node(lcpu, numa_domain);
-
-       of_node_put(cpu);
-
-       return numa_domain;
-}
-
-static int cpu_numa_callback(struct notifier_block *nfb,
-                            unsigned long action,
-                            void *hcpu)
-{
-       unsigned long lcpu = (unsigned long)hcpu;
-       int ret = NOTIFY_DONE;
-
-       switch (action) {
-       case CPU_UP_PREPARE:
-               if (min_common_depth == -1 || !numa_enabled)
-                       map_cpu_to_node(lcpu, 0);
-               else
-                       numa_setup_cpu(lcpu);
-               ret = NOTIFY_OK;
-               break;
-#ifdef CONFIG_HOTPLUG_CPU
-       case CPU_DEAD:
-       case CPU_UP_CANCELED:
-               unmap_cpu_from_node(lcpu);
-               ret = NOTIFY_OK;
-               break;
-#endif
-       }
-       return ret;
-}
-
-/*
- * Check and possibly modify a memory region to enforce the memory limit.
- *
- * Returns the size the region should have to enforce the memory limit.
- * This will either be the original value of size, a truncated value,
- * or zero. If the returned value of size is 0 the region should be
- * discarded as it lies wholly above the memory limit.
- */
-static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size)
-{
-       /*
-        * We use lmb_end_of_DRAM() in here instead of memory_limit because
-        * we've already adjusted it for the limit and it takes care of
-        * having memory holes below the limit.
-        */
-       extern unsigned long memory_limit;
-
-       if (! memory_limit)
-               return size;
-
-       if (start + size <= lmb_end_of_DRAM())
-               return size;
-
-       if (start >= lmb_end_of_DRAM())
-               return 0;
-
-       return lmb_end_of_DRAM() - start;
-}
-
-static int __init parse_numa_properties(void)
-{
-       struct device_node *cpu = NULL;
-       struct device_node *memory = NULL;
-       int addr_cells, size_cells;
-       int max_domain = 0;
-       long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
-       unsigned long i;
-
-       if (numa_enabled == 0) {
-               printk(KERN_WARNING "NUMA disabled by user\n");
-               return -1;
-       }
-
-       numa_memory_lookup_table =
-               (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
-       memset(numa_memory_lookup_table, 0, entries * sizeof(char));
-
-       for (i = 0; i < entries ; i++)
-               numa_memory_lookup_table[i] = ARRAY_INITIALISER;
-
-       min_common_depth = find_min_common_depth();
-
-       dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
-       if (min_common_depth < 0)
-               return min_common_depth;
-
-       max_domain = numa_setup_cpu(boot_cpuid);
-
-       /*
-        * Even though we connect cpus to numa domains later in SMP init,
-        * we need to know the maximum node id now. This is because each
-        * node id must have NODE_DATA etc backing it.
-        * As a result of hotplug we could still have cpus appear later on
-        * with larger node ids. In that case we force the cpu into node 0.
-        */
-       for_each_cpu(i) {
-               int numa_domain;
-
-               cpu = find_cpu_node(i);
-
-               if (cpu) {
-                       numa_domain = of_node_numa_domain(cpu);
-                       of_node_put(cpu);
-
-                       if (numa_domain < MAX_NUMNODES &&
-                           max_domain < numa_domain)
-                               max_domain = numa_domain;
-               }
-       }
-
-       addr_cells = get_mem_addr_cells();
-       size_cells = get_mem_size_cells();
-       memory = NULL;
-       while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
-               unsigned long start;
-               unsigned long size;
-               int numa_domain;
-               int ranges;
-               unsigned int *memcell_buf;
-               unsigned int len;
-
-               memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
-               if (!memcell_buf || len <= 0)
-                       continue;
-
-               ranges = memory->n_addrs;
-new_range:
-               /* these are order-sensitive, and modify the buffer pointer */
-               start = read_n_cells(addr_cells, &memcell_buf);
-               size = read_n_cells(size_cells, &memcell_buf);
-
-               start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
-               size = _ALIGN_UP(size, MEMORY_INCREMENT);
-
-               numa_domain = of_node_numa_domain(memory);
-
-               if (numa_domain >= MAX_NUMNODES) {
-                       if (numa_domain != 0xffff)
-                               printk(KERN_ERR "WARNING: memory at %lx maps "
-                                      "to invalid NUMA node %d\n", start,
-                                      numa_domain);
-                       numa_domain = 0;
-               }
-
-               if (max_domain < numa_domain)
-                       max_domain = numa_domain;
-
-               if (! (size = numa_enforce_memory_limit(start, size))) {
-                       if (--ranges)
-                               goto new_range;
-                       else
-                               continue;
-               }
-
-               /*
-                * Initialize new node struct, or add to an existing one.
-                */
-               if (init_node_data[numa_domain].node_end_pfn) {
-                       if ((start / PAGE_SIZE) <
-                           init_node_data[numa_domain].node_start_pfn)
-                               init_node_data[numa_domain].node_start_pfn =
-                                       start / PAGE_SIZE;
-                       if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
-                           init_node_data[numa_domain].node_end_pfn)
-                               init_node_data[numa_domain].node_end_pfn =
-                                       (start / PAGE_SIZE) +
-                                       (size / PAGE_SIZE);
-
-                       init_node_data[numa_domain].node_present_pages +=
-                               size / PAGE_SIZE;
-               } else {
-                       node_set_online(numa_domain);
-
-                       init_node_data[numa_domain].node_start_pfn =
-                               start / PAGE_SIZE;
-                       init_node_data[numa_domain].node_end_pfn =
-                               init_node_data[numa_domain].node_start_pfn +
-                               size / PAGE_SIZE;
-                       init_node_data[numa_domain].node_present_pages =
-                               size / PAGE_SIZE;
-               }
-
-               for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
-                       numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
-                               numa_domain;
-
-               if (--ranges)
-                       goto new_range;
-       }
-
-       for (i = 0; i <= max_domain; i++)
-               node_set_online(i);
-
-       return 0;
-}
-
-static void __init setup_nonnuma(void)
-{
-       unsigned long top_of_ram = lmb_end_of_DRAM();
-       unsigned long total_ram = lmb_phys_mem_size();
-       unsigned long i;
-
-       printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
-              top_of_ram, total_ram);
-       printk(KERN_INFO "Memory hole size: %ldMB\n",
-              (top_of_ram - total_ram) >> 20);
-
-       if (!numa_memory_lookup_table) {
-               long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
-               numa_memory_lookup_table =
-                       (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
-               memset(numa_memory_lookup_table, 0, entries * sizeof(char));
-               for (i = 0; i < entries ; i++)
-                       numa_memory_lookup_table[i] = ARRAY_INITIALISER;
-       }
-
-       map_cpu_to_node(boot_cpuid, 0);
-
-       node_set_online(0);
-
-       init_node_data[0].node_start_pfn = 0;
-       init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
-       init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;
-
-       for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
-               numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
-}
-
-static void __init dump_numa_topology(void)
-{
-       unsigned int node;
-       unsigned int count;
-
-       if (min_common_depth == -1 || !numa_enabled)
-               return;
-
-       for_each_online_node(node) {
-               unsigned long i;
-
-               printk(KERN_INFO "Node %d Memory:", node);
-
-               count = 0;
-
-               for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
-                       if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
-                               if (count == 0)
-                                       printk(" 0x%lx", i);
-                               ++count;
-                       } else {
-                               if (count > 0)
-                                       printk("-0x%lx", i);
-                               count = 0;
-                       }
-               }
-
-               if (count > 0)
-                       printk("-0x%lx", i);
-               printk("\n");
-       }
-       return;
-}
-
-/*
- * Allocate some memory, using the lmb or bootmem allocator as required.
- * nid is the preferred node and end is the physical address of
- * the highest address in the node.
- *
- * Returns the physical address of the memory.
- */
-static unsigned long careful_allocation(int nid, unsigned long size,
-                                       unsigned long align, unsigned long end)
-{
-       unsigned long ret = lmb_alloc_base(size, align, end);
-
-       /* retry over all memory */
-       if (!ret)
-               ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());
-
-       if (!ret)
-               panic("numa.c: cannot allocate %lu bytes on node %d",
-                     size, nid);
-
-       /*
-        * If the memory came from a previously allocated node, we must
-        * retry with the bootmem allocator.
-        */
-       if (pa_to_nid(ret) < nid) {
-               nid = pa_to_nid(ret);
-               ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
-                               size, align, 0);
-
-               if (!ret)
-                       panic("numa.c: cannot allocate %lu bytes on node %d",
-                             size, nid);
-
-               ret = virt_to_abs(ret);
-
-               dbg("alloc_bootmem %lx %lx\n", ret, size);
-       }
-
-       return ret;
-}
-
-void __init do_init_bootmem(void)
-{
-       int nid;
-       int addr_cells, size_cells;
-       struct device_node *memory = NULL;
-       static struct notifier_block ppc64_numa_nb = {
-               .notifier_call = cpu_numa_callback,
-               .priority = 1 /* Must run before sched domains notifier. */
-       };
-
-       min_low_pfn = 0;
-       max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
-       max_pfn = max_low_pfn;
-
-       if (parse_numa_properties())
-               setup_nonnuma();
-       else
-               dump_numa_topology();
-
-       register_cpu_notifier(&ppc64_numa_nb);
-
-       for_each_online_node(nid) {
-               unsigned long start_paddr, end_paddr;
-               int i;
-               unsigned long bootmem_paddr;
-               unsigned long bootmap_pages;
-
-               start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
-               end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;
-
-               /* Allocate the node structure node local if possible */
-               NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
-                                       sizeof(struct pglist_data),
-                                       SMP_CACHE_BYTES, end_paddr);
-               NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
-               memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
-
-               dbg("node %d\n", nid);
-               dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
-
-               NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
-               NODE_DATA(nid)->node_start_pfn =
-                       init_node_data[nid].node_start_pfn;
-               NODE_DATA(nid)->node_spanned_pages =
-                       end_paddr - start_paddr;
-
-               if (NODE_DATA(nid)->node_spanned_pages == 0)
-                       continue;
-
-               dbg("start_paddr = %lx\n", start_paddr);
-               dbg("end_paddr = %lx\n", end_paddr);
-
-               bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);
-
-               bootmem_paddr = careful_allocation(nid,
-                               bootmap_pages << PAGE_SHIFT,
-                               PAGE_SIZE, end_paddr);
-               memset(abs_to_virt(bootmem_paddr), 0,
-                      bootmap_pages << PAGE_SHIFT);
-               dbg("bootmap_paddr = %lx\n", bootmem_paddr);
-
-               init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
-                                 start_paddr >> PAGE_SHIFT,
-                                 end_paddr >> PAGE_SHIFT);
-
-               /*
-                * We need to do another scan of all memory sections to
-                * associate memory with the correct node.
-                */
-               addr_cells = get_mem_addr_cells();
-               size_cells = get_mem_size_cells();
-               memory = NULL;
-               while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
-                       unsigned long mem_start, mem_size;
-                       int numa_domain, ranges;
-                       unsigned int *memcell_buf;
-                       unsigned int len;
-
-                       memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
-                       if (!memcell_buf || len <= 0)
-                               continue;
-
-                       ranges = memory->n_addrs;       /* ranges in cell */
-new_range:
-                       mem_start = read_n_cells(addr_cells, &memcell_buf);
-                       mem_size = read_n_cells(size_cells, &memcell_buf);
-                       if (numa_enabled) {
-                               numa_domain = of_node_numa_domain(memory);
-                               if (numa_domain  >= MAX_NUMNODES)
-                                       numa_domain = 0;
-                       } else
-                               numa_domain =  0;
-
-                       if (numa_domain != nid)
-                               continue;
-
-                       mem_size = numa_enforce_memory_limit(mem_start, mem_size);
-                       if (mem_size) {
-                               dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
-                               free_bootmem_node(NODE_DATA(nid), mem_start, mem_size);
-                       }
-
-                       if (--ranges)           /* process all ranges in cell */
-                               goto new_range;
-               }
-
-               /*
-                * Mark reserved regions on this node
-                */
-               for (i = 0; i < lmb.reserved.cnt; i++) {
-                       unsigned long physbase = lmb.reserved.region[i].base;
-                       unsigned long size = lmb.reserved.region[i].size;
-
-                       if (pa_to_nid(physbase) != nid &&
-                           pa_to_nid(physbase+size-1) != nid)
-                               continue;
-
-                       if (physbase < end_paddr &&
-                           (physbase+size) > start_paddr) {
-                               /* overlaps */
-                               if (physbase < start_paddr) {
-                                       size -= start_paddr - physbase;
-                                       physbase = start_paddr;
-                               }
-
-                               if (size > end_paddr - physbase)
-                                       size = end_paddr - physbase;
-
-                               dbg("reserve_bootmem %lx %lx\n", physbase,
-                                   size);
-                               reserve_bootmem_node(NODE_DATA(nid), physbase,
-                                                    size);
-                       }
-               }
-               /*
-                * This loop may look familiar, but we have to do it again
-                * after marking our reserved memory to mark memory present
-                * for sparsemem.
-                */
-               addr_cells = get_mem_addr_cells();
-               size_cells = get_mem_size_cells();
-               memory = NULL;
-               while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
-                       unsigned long mem_start, mem_size;
-                       int numa_domain, ranges;
-                       unsigned int *memcell_buf;
-                       unsigned int len;
-
-                       memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
-                       if (!memcell_buf || len <= 0)
-                               continue;
-
-                       ranges = memory->n_addrs;       /* ranges in cell */
-new_range2:
-                       mem_start = read_n_cells(addr_cells, &memcell_buf);
-                       mem_size = read_n_cells(size_cells, &memcell_buf);
-                       if (numa_enabled) {
-                               numa_domain = of_node_numa_domain(memory);
-                               if (numa_domain  >= MAX_NUMNODES)
-                                       numa_domain = 0;
-                       } else
-                               numa_domain =  0;
-
-                       if (numa_domain != nid)
-                               continue;
-
-                       mem_size = numa_enforce_memory_limit(mem_start, mem_size);
-                       memory_present(numa_domain, mem_start >> PAGE_SHIFT,
-                                      (mem_start + mem_size) >> PAGE_SHIFT);
-
-                       if (--ranges)           /* process all ranges in cell */
-                               goto new_range2;
-               }
-
-       }
-}
-
-void __init paging_init(void)
-{
-       unsigned long zones_size[MAX_NR_ZONES];
-       unsigned long zholes_size[MAX_NR_ZONES];
-       int nid;
-
-       memset(zones_size, 0, sizeof(zones_size));
-       memset(zholes_size, 0, sizeof(zholes_size));
-
-       for_each_online_node(nid) {
-               unsigned long start_pfn;
-               unsigned long end_pfn;
-
-               start_pfn = init_node_data[nid].node_start_pfn;
-               end_pfn = init_node_data[nid].node_end_pfn;
-
-               zones_size[ZONE_DMA] = end_pfn - start_pfn;
-               zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
-                       init_node_data[nid].node_present_pages;
-
-               dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
-                   zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);
-
-               free_area_init_node(nid, NODE_DATA(nid), zones_size,
-                                                       start_pfn, zholes_size);
-       }
-}
-
-static int __init early_numa(char *p)
-{
-       if (!p)
-               return 0;
-
-       if (strstr(p, "off"))
-               numa_enabled = 0;
-
-       if (strstr(p, "debug"))
-               numa_debug = 1;
-
-       return 0;
-}
-early_param("numa", early_numa);
diff --git a/arch/ppc64/mm/slb.c b/arch/ppc64/mm/slb.c
deleted file mode 100644 (file)
index 0473953..0000000
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * PowerPC64 SLB support.
- *
- * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
- * Based on earlier code writteh by:
- * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
- *    Copyright (c) 2001 Dave Engebretsen
- * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- *
- *      This program is free software; you can redistribute it and/or
- *      modify it under the terms of the GNU General Public License
- *      as published by the Free Software Foundation; either version
- *      2 of the License, or (at your option) any later version.
- */
-
-#include <linux/config.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/paca.h>
-#include <asm/cputable.h>
-
-extern void slb_allocate(unsigned long ea);
-
-static inline unsigned long mk_esid_data(unsigned long ea, unsigned long slot)
-{
-       return (ea & ESID_MASK) | SLB_ESID_V | slot;
-}
-
-static inline unsigned long mk_vsid_data(unsigned long ea, unsigned long flags)
-{
-       return (get_kernel_vsid(ea) << SLB_VSID_SHIFT) | flags;
-}
-
-static inline void create_slbe(unsigned long ea, unsigned long flags,
-                              unsigned long entry)
-{
-       asm volatile("slbmte  %0,%1" :
-                    : "r" (mk_vsid_data(ea, flags)),
-                      "r" (mk_esid_data(ea, entry))
-                    : "memory" );
-}
-
-static void slb_flush_and_rebolt(void)
-{
-       /* If you change this make sure you change SLB_NUM_BOLTED
-        * appropriately too. */
-       unsigned long ksp_flags = SLB_VSID_KERNEL;
-       unsigned long ksp_esid_data;
-
-       WARN_ON(!irqs_disabled());
-
-       if (cpu_has_feature(CPU_FTR_16M_PAGE))
-               ksp_flags |= SLB_VSID_L;
-
-       ksp_esid_data = mk_esid_data(get_paca()->kstack, 2);
-       if ((ksp_esid_data & ESID_MASK) == KERNELBASE)
-               ksp_esid_data &= ~SLB_ESID_V;
-
-       /* We need to do this all in asm, so we're sure we don't touch
-        * the stack between the slbia and rebolting it. */
-       asm volatile("isync\n"
-                    "slbia\n"
-                    /* Slot 1 - first VMALLOC segment */
-                    "slbmte    %0,%1\n"
-                    /* Slot 2 - kernel stack */
-                    "slbmte    %2,%3\n"
-                    "isync"
-                    :: "r"(mk_vsid_data(VMALLOCBASE, SLB_VSID_KERNEL)),
-                       "r"(mk_esid_data(VMALLOCBASE, 1)),
-                       "r"(mk_vsid_data(ksp_esid_data, ksp_flags)),
-                       "r"(ksp_esid_data)
-                    : "memory");
-}
-
-/* Flush all user entries from the SLB of the current processor. */
-void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
-{
-       unsigned long offset = get_paca()->slb_cache_ptr;
-       unsigned long esid_data = 0;
-       unsigned long pc = KSTK_EIP(tsk);
-       unsigned long stack = KSTK_ESP(tsk);
-       unsigned long unmapped_base;
-
-       if (offset <= SLB_CACHE_ENTRIES) {
-               int i;
-               asm volatile("isync" : : : "memory");
-               for (i = 0; i < offset; i++) {
-                       esid_data = ((unsigned long)get_paca()->slb_cache[i]
-                               << SID_SHIFT) | SLBIE_C;
-                       asm volatile("slbie %0" : : "r" (esid_data));
-               }
-               asm volatile("isync" : : : "memory");
-       } else {
-               slb_flush_and_rebolt();
-       }
-
-       /* Workaround POWER5 < DD2.1 issue */
-       if (offset == 1 || offset > SLB_CACHE_ENTRIES)
-               asm volatile("slbie %0" : : "r" (esid_data));
-
-       get_paca()->slb_cache_ptr = 0;
-       get_paca()->context = mm->context;
-
-       /*
-        * preload some userspace segments into the SLB.
-        */
-       if (test_tsk_thread_flag(tsk, TIF_32BIT))
-               unmapped_base = TASK_UNMAPPED_BASE_USER32;
-       else
-               unmapped_base = TASK_UNMAPPED_BASE_USER64;
-
-       if (pc >= KERNELBASE)
-               return;
-       slb_allocate(pc);
-
-       if (GET_ESID(pc) == GET_ESID(stack))
-               return;
-
-       if (stack >= KERNELBASE)
-               return;
-       slb_allocate(stack);
-
-       if ((GET_ESID(pc) == GET_ESID(unmapped_base))
-           || (GET_ESID(stack) == GET_ESID(unmapped_base)))
-               return;
-
-       if (unmapped_base >= KERNELBASE)
-               return;
-       slb_allocate(unmapped_base);
-}
-
-void slb_initialize(void)
-{
-       /* On iSeries the bolted entries have already been set up by
-        * the hypervisor from the lparMap data in head.S */
-#ifndef CONFIG_PPC_ISERIES
-       unsigned long flags = SLB_VSID_KERNEL;
-
-       /* Invalidate the entire SLB (even slot 0) & all the ERATS */
-       if (cpu_has_feature(CPU_FTR_16M_PAGE))
-               flags |= SLB_VSID_L;
-
-       asm volatile("isync":::"memory");
-       asm volatile("slbmte  %0,%0"::"r" (0) : "memory");
-       asm volatile("isync; slbia; isync":::"memory");
-       create_slbe(KERNELBASE, flags, 0);
-       create_slbe(VMALLOCBASE, SLB_VSID_KERNEL, 1);
-       /* We don't bolt the stack for the time being - we're in boot,
-        * so the stack is in the bolted segment.  By the time it goes
-        * elsewhere, we'll call _switch() which will bolt in the new
-        * one. */
-       asm volatile("isync":::"memory");
-#endif
-
-       get_paca()->stab_rr = SLB_NUM_BOLTED;
-}
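switch_slb() above picks between two invalidation strategies: while the per-cpu slb_cache has not overflowed it issues one slbie per cached user ESID, otherwise it gives up on the cache and calls slb_flush_and_rebolt(). A toy user-space model of just that policy decision, with made-up types and a deliberately small cache (the real SLB_CACHE_ENTRIES differs, and the POWER5 workaround is omitted):

#include <stdio.h>

#define SLB_CACHE_ENTRIES 3			/* assumed demo value */

struct toy_paca {
	unsigned long slb_cache[SLB_CACHE_ENTRIES];
	unsigned long slb_cache_ptr;		/* may run past the array */
};

static void toy_switch_slb(struct toy_paca *paca)
{
	unsigned long offset = paca->slb_cache_ptr;

	if (offset <= SLB_CACHE_ENTRIES) {
		unsigned long i;

		/* cache is trustworthy: invalidate only what was touched */
		for (i = 0; i < offset; i++)
			printf("slbie esid %#lx\n", paca->slb_cache[i]);
	} else {
		/* cache overflowed: flush everything and rebolt */
		printf("slbia + rebolt bolted entries\n");
	}
	paca->slb_cache_ptr = 0;
}

int main(void)
{
	struct toy_paca paca = { { 0x100, 0x101, 0x102 }, 2 };

	toy_switch_slb(&paca);			/* two targeted slbie */
	paca.slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
	toy_switch_slb(&paca);			/* falls back to full flush */
	return 0;
}
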
diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S
deleted file mode 100644 (file)
index a3a03da..0000000
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * arch/ppc64/mm/slb_low.S
- *
- * Low-level SLB routines
- *
- * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
- *
- * Based on earlier C version:
- * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
- *    Copyright (c) 2001 Dave Engebretsen
- * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- */
-
-#include <linux/config.h>
-#include <asm/processor.h>
-#include <asm/page.h>
-#include <asm/mmu.h>
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/cputable.h>
-
-/* void slb_allocate(unsigned long ea);
- *
- * Create an SLB entry for the given EA (user or kernel).
- *     r3 = faulting address, r13 = PACA
- *     r9, r10, r11 are clobbered by this function
- * No other registers are examined or changed.
- */
-_GLOBAL(slb_allocate)
-       /*
-        * First find a slot, round robin. Previously we tried to find
-        * a free slot first but that took too long. Unfortunately we
-        * don't have any LRU information to help us choose a slot.
-        */
-#ifdef CONFIG_PPC_ISERIES
-       /*
-        * On iSeries, the "bolted" stack segment can be cast out on
-        * shared processor switch so we need to check for a miss on
-        * it and restore it to the right slot.
-        */
-       ld      r9,PACAKSAVE(r13)
-       clrrdi  r9,r9,28
-       clrrdi  r11,r3,28
-       li      r10,SLB_NUM_BOLTED-1    /* Stack goes in last bolted slot */
-       cmpld   r9,r11
-       beq     3f
-#endif /* CONFIG_PPC_ISERIES */
-
-       ld      r10,PACASTABRR(r13)
-       addi    r10,r10,1
-       /* use a cpu feature mask if we ever change our slb size */
-       cmpldi  r10,SLB_NUM_ENTRIES
-
-       blt+    4f
-       li      r10,SLB_NUM_BOLTED
-
-4:
-       std     r10,PACASTABRR(r13)
-3:
-       /* r3 = faulting address, r10 = entry */
-
-       srdi    r9,r3,60                /* get region */
-       srdi    r3,r3,28                /* get esid */
-       cmpldi  cr7,r9,0xc              /* cmp KERNELBASE for later use */
-
-       rldimi  r10,r3,28,0             /* r10= ESID<<28 | entry */
-       oris    r10,r10,SLB_ESID_V@h    /* r10 |= SLB_ESID_V */
-
-       /* r3 = esid, r10 = esid_data, cr7 = <>KERNELBASE */
-
-       blt     cr7,0f                  /* user or kernel? */
-
-       /* kernel address: proto-VSID = ESID */
-       /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
-        * this code will generate the protoVSID 0xfffffffff for the
-        * top segment.  That's ok, the scramble below will translate
-        * it to VSID 0, which is reserved as a bad VSID - one which
-        * will never have any pages in it.  */
-       li      r11,SLB_VSID_KERNEL
-BEGIN_FTR_SECTION
-       bne     cr7,9f
-       li      r11,(SLB_VSID_KERNEL|SLB_VSID_L)
-END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
-       b       9f
-
-0:     /* user address: proto-VSID = context<<15 | ESID */
-       srdi.   r9,r3,USER_ESID_BITS
-       bne-    8f                      /* invalid ea bits set */
-
-#ifdef CONFIG_HUGETLB_PAGE
-BEGIN_FTR_SECTION
-       lhz     r9,PACAHIGHHTLBAREAS(r13)
-       srdi    r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT)
-       srd     r9,r9,r11
-       lhz     r11,PACALOWHTLBAREAS(r13)
-       srd     r11,r11,r3
-       or      r9,r9,r11
-END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
-#endif /* CONFIG_HUGETLB_PAGE */
-
-       li      r11,SLB_VSID_USER
-
-#ifdef CONFIG_HUGETLB_PAGE
-BEGIN_FTR_SECTION
-       rldimi  r11,r9,8,55             /* shift masked bit into SLB_VSID_L */
-END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
-#endif /* CONFIG_HUGETLB_PAGE */
-
-       ld      r9,PACACONTEXTID(r13)
-       rldimi  r3,r9,USER_ESID_BITS,0
-
-9:     /* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */
-       ASM_VSID_SCRAMBLE(r3,r9)
-
-       rldimi  r11,r3,SLB_VSID_SHIFT,16        /* combine VSID and flags */
-
-       /*
-        * No need for an isync before or after this slbmte. The exception
-        * we enter with and the rfid we exit with are context synchronizing.
-        */
-       slbmte  r11,r10
-
-       bgelr   cr7                     /* we're done for kernel addresses */
-
-       /* Update the slb cache */
-       lhz     r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */
-       cmpldi  r3,SLB_CACHE_ENTRIES
-       bge     1f
-
-       /* still room in the slb cache */
-       sldi    r11,r3,1                /* r11 = offset * sizeof(u16) */
-       rldicl  r10,r10,36,28           /* get low 16 bits of the ESID */
-       add     r11,r11,r13             /* r11 = (u16 *)paca + offset */
-       sth     r10,PACASLBCACHE(r11)   /* paca->slb_cache[offset] = esid */
-       addi    r3,r3,1                 /* offset++ */
-       b       2f
-1:                                     /* offset >= SLB_CACHE_ENTRIES */
-       li      r3,SLB_CACHE_ENTRIES+1
-2:
-       sth     r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */
-       blr
-
-8:     /* invalid EA */
-       li      r3,0                    /* BAD_VSID */
-       li      r11,SLB_VSID_USER       /* flags don't much matter */
-       b       9b
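The slot selection at the top of slb_allocate above is a plain round-robin over the non-bolted entries: PACASTABRR is incremented and, once it reaches SLB_NUM_ENTRIES, wrapped back to SLB_NUM_BOLTED, so the bolted kernel, vmalloc and stack entries are never chosen as victims. An illustrative C rendering of that counter, with the entry counts assumed here rather than taken from the headers:

#include <stdio.h>

#define SLB_NUM_ENTRIES	64	/* assumed SLB size */
#define SLB_NUM_BOLTED	3	/* assumed: kernel, vmalloc, kernel stack */

/* mirrors PACASTABRR; slb_initialize() starts it at SLB_NUM_BOLTED */
static unsigned long stab_rr = SLB_NUM_BOLTED;

static unsigned long next_slb_slot(void)
{
	unsigned long slot = stab_rr + 1;

	if (slot >= SLB_NUM_ENTRIES)	/* ran off the end ...       */
		slot = SLB_NUM_BOLTED;	/* ... skip the bolted slots */
	stab_rr = slot;
	return slot;
}

int main(void)
{
	int i;

	for (i = 0; i < 70; i++)
		printf("%lu%c", next_slb_slot(), (i % 16 == 15) ? '\n' : ' ');
	printf("\n");
	return 0;
}
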
diff --git a/arch/ppc64/mm/stab.c b/arch/ppc64/mm/stab.c
deleted file mode 100644 (file)
index 1b83f00..0000000
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * PowerPC64 Segment Translation Support.
- *
- * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
- *    Copyright (c) 2001 Dave Engebretsen
- *
- * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- *      This program is free software; you can redistribute it and/or
- *      modify it under the terms of the GNU General Public License
- *      as published by the Free Software Foundation; either version
- *      2 of the License, or (at your option) any later version.
- */
-
-#include <linux/config.h>
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/paca.h>
-#include <asm/cputable.h>
-#include <asm/lmb.h>
-#include <asm/abs_addr.h>
-
-struct stab_entry {
-       unsigned long esid_data;
-       unsigned long vsid_data;
-};
-
-/* Both the segment table and SLB code use the following cache */
-#define NR_STAB_CACHE_ENTRIES 8
-DEFINE_PER_CPU(long, stab_cache_ptr);
-DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
-
-/*
- * Create a segment table entry for the given esid/vsid pair.
- */
-static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid)
-{
-       unsigned long esid_data, vsid_data;
-       unsigned long entry, group, old_esid, castout_entry, i;
-       unsigned int global_entry;
-       struct stab_entry *ste, *castout_ste;
-       unsigned long kernel_segment = (esid << SID_SHIFT) >= KERNELBASE;
-
-       vsid_data = vsid << STE_VSID_SHIFT;
-       esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V;
-       if (! kernel_segment)
-               esid_data |= STE_ESID_KS;
-
-       /* Search the primary group first. */
-       global_entry = (esid & 0x1f) << 3;
-       ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
-
-       /* Find an empty entry, if one exists. */
-       for (group = 0; group < 2; group++) {
-               for (entry = 0; entry < 8; entry++, ste++) {
-                       if (!(ste->esid_data & STE_ESID_V)) {
-                               ste->vsid_data = vsid_data;
-                               asm volatile("eieio":::"memory");
-                               ste->esid_data = esid_data;
-                               return (global_entry | entry);
-                       }
-               }
-               /* Now search the secondary group. */
-               global_entry = ((~esid) & 0x1f) << 3;
-               ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
-       }
-
-       /*
-        * Could not find empty entry, pick one with a round robin selection.
-        * Search all entries in the two groups.
-        */
-       castout_entry = get_paca()->stab_rr;
-       for (i = 0; i < 16; i++) {
-               if (castout_entry < 8) {
-                       global_entry = (esid & 0x1f) << 3;
-                       ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
-                       castout_ste = ste + castout_entry;
-               } else {
-                       global_entry = ((~esid) & 0x1f) << 3;
-                       ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
-                       castout_ste = ste + (castout_entry - 8);
-               }
-
-               /* Don't cast out the first kernel segment */
-               if ((castout_ste->esid_data & ESID_MASK) != KERNELBASE)
-                       break;
-
-               castout_entry = (castout_entry + 1) & 0xf;
-       }
-
-       get_paca()->stab_rr = (castout_entry + 1) & 0xf;
-
-       /* Modify the old entry to the new value. */
-
-       /* Force previous translations to complete. DRENG */
-       asm volatile("isync" : : : "memory");
-
-       old_esid = castout_ste->esid_data >> SID_SHIFT;
-       castout_ste->esid_data = 0;             /* Invalidate old entry */
-
-       asm volatile("sync" : : : "memory");    /* Order update */
-
-       castout_ste->vsid_data = vsid_data;
-       asm volatile("eieio" : : : "memory");   /* Order update */
-       castout_ste->esid_data = esid_data;
-
-       asm volatile("slbie  %0" : : "r" (old_esid << SID_SHIFT));
-       /* Ensure completion of slbie */
-       asm volatile("sync" : : : "memory");
-
-       return (global_entry | (castout_entry & 0x7));
-}
-
-/*
- * Allocate a segment table entry for the given ea and mm
- */
-static int __ste_allocate(unsigned long ea, struct mm_struct *mm)
-{
-       unsigned long vsid;
-       unsigned char stab_entry;
-       unsigned long offset;
-
-       /* Kernel or user address? */
-       if (ea >= KERNELBASE) {
-               vsid = get_kernel_vsid(ea);
-       } else {
-               if ((ea >= TASK_SIZE_USER64) || (! mm))
-                       return 1;
-
-               vsid = get_vsid(mm->context.id, ea);
-       }
-
-       stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid);
-
-       if (ea < KERNELBASE) {
-               offset = __get_cpu_var(stab_cache_ptr);
-               if (offset < NR_STAB_CACHE_ENTRIES)
-                       __get_cpu_var(stab_cache[offset++]) = stab_entry;
-               else
-                       offset = NR_STAB_CACHE_ENTRIES+1;
-               __get_cpu_var(stab_cache_ptr) = offset;
-
-               /* Order update */
-               asm volatile("sync":::"memory");
-       }
-
-       return 0;
-}
-
-int ste_allocate(unsigned long ea)
-{
-       return __ste_allocate(ea, current->mm);
-}
-
-/*
- * Do the segment table work for a context switch: flush all user
- * entries from the table, then preload some probably useful entries
- * for the new task
- */
-void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
-{
-       struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
-       struct stab_entry *ste;
-       unsigned long offset = __get_cpu_var(stab_cache_ptr);
-       unsigned long pc = KSTK_EIP(tsk);
-       unsigned long stack = KSTK_ESP(tsk);
-       unsigned long unmapped_base;
-
-       /* Force previous translations to complete. DRENG */
-       asm volatile("isync" : : : "memory");
-
-       if (offset <= NR_STAB_CACHE_ENTRIES) {
-               int i;
-
-               for (i = 0; i < offset; i++) {
-                       ste = stab + __get_cpu_var(stab_cache[i]);
-                       ste->esid_data = 0; /* invalidate entry */
-               }
-       } else {
-               unsigned long entry;
-
-               /* Invalidate all entries. */
-               ste = stab;
-
-               /* Never flush the first entry. */
-               ste += 1;
-               for (entry = 1;
-                    entry < (PAGE_SIZE / sizeof(struct stab_entry));
-                    entry++, ste++) {
-                       unsigned long ea;
-                       ea = ste->esid_data & ESID_MASK;
-                       if (ea < KERNELBASE) {
-                               ste->esid_data = 0;
-                       }
-               }
-       }
-
-       asm volatile("sync; slbia; sync":::"memory");
-
-       __get_cpu_var(stab_cache_ptr) = 0;
-
-       /* Now preload some entries for the new task */
-       if (test_tsk_thread_flag(tsk, TIF_32BIT))
-               unmapped_base = TASK_UNMAPPED_BASE_USER32;
-       else
-               unmapped_base = TASK_UNMAPPED_BASE_USER64;
-
-       __ste_allocate(pc, mm);
-
-       if (GET_ESID(pc) == GET_ESID(stack))
-               return;
-
-       __ste_allocate(stack, mm);
-
-       if ((GET_ESID(pc) == GET_ESID(unmapped_base))
-           || (GET_ESID(stack) == GET_ESID(unmapped_base)))
-               return;
-
-       __ste_allocate(unmapped_base, mm);
-
-       /* Order update */
-       asm volatile("sync" : : : "memory");
-}
-
-extern void slb_initialize(void);
-
-/*
- * Allocate segment tables for secondary CPUs.  These must all go in
- * the first (bolted) segment, so that do_stab_bolted won't get a
- * recursive segment miss on the segment table itself.
- */
-void stabs_alloc(void)
-{
-       int cpu;
-
-       if (cpu_has_feature(CPU_FTR_SLB))
-               return;
-
-       for_each_cpu(cpu) {
-               unsigned long newstab;
-
-               if (cpu == 0)
-                       continue; /* stab for CPU 0 is statically allocated */
-
-               newstab = lmb_alloc_base(PAGE_SIZE, PAGE_SIZE, 1<<SID_SHIFT);
-               if (! newstab)
-                       panic("Unable to allocate segment table for CPU %d.\n",
-                             cpu);
-
-               newstab += KERNELBASE;
-
-               memset((void *)newstab, 0, PAGE_SIZE);
-
-               paca[cpu].stab_addr = newstab;
-               paca[cpu].stab_real = virt_to_abs(newstab);
-               printk(KERN_DEBUG "Segment table for CPU %d at 0x%lx virtual, 0x%lx absolute\n", cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
-       }
-}
-
-/*
- * Build an entry for the base kernel segment and put it into
- * the segment table or SLB.  All other segment table or SLB
- * entries are faulted in.
- */
-void stab_initialize(unsigned long stab)
-{
-       unsigned long vsid = get_kernel_vsid(KERNELBASE);
-
-       if (cpu_has_feature(CPU_FTR_SLB)) {
-               slb_initialize();
-       } else {
-               asm volatile("isync; slbia; isync":::"memory");
-               make_ste(stab, GET_ESID(KERNELBASE), vsid);
-
-               /* Order update */
-               asm volatile("sync":::"memory");
-       }
-}
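make_ste() above addresses the segment table by group: the low five ESID bits select the primary group, their ones-complement selects the secondary group, and each group holds eight 16-byte stab_entry slots, so a group starts at stab | (group << 7) and the value returned is (group << 3) | slot. A small stand-alone demonstration of that arithmetic (the stab base and ESIDs below are invented):

#include <stdio.h>

static void stab_groups(unsigned long stab, unsigned long esid)
{
	unsigned long primary   = esid & 0x1f;
	unsigned long secondary = (~esid) & 0x1f;

	/* 8 entries * 16 bytes = 128 bytes per group, hence << 7 */
	printf("esid %#lx: primary group at %#lx (global entries %lu..%lu), "
	       "secondary group at %#lx\n",
	       esid,
	       stab | (primary << 7), primary << 3, (primary << 3) + 7,
	       stab | (secondary << 7));
}

int main(void)
{
	stab_groups(0xf0000UL, 0x12UL);
	stab_groups(0xf0000UL, 0x2dUL);
	return 0;
}
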
diff --git a/arch/ppc64/mm/tlb.c b/arch/ppc64/mm/tlb.c
deleted file mode 100644 (file)
index 09ab81a..0000000
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * This file contains the routines for flushing entries from the
- * TLB and MMU hash table.
- *
- *  Derived from arch/ppc64/mm/init.c:
- *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
- *    Copyright (C) 1996 Paul Mackerras
- *  Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk).
- *
- *  Derived from "arch/i386/mm/init.c"
- *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
- *
- *  Dave Engebretsen <engebret@us.ibm.com>
- *      Rework for PPC64 port.
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; either version
- *  2 of the License, or (at your option) any later version.
- */
-#include <linux/config.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/percpu.h>
-#include <linux/hardirq.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-#include <asm/tlb.h>
-#include <linux/highmem.h>
-
-DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
-
-/* This is declared as we are using the more or less generic
- * include/asm-ppc64/tlb.h file -- tgall
- */
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
-struct pte_freelist_batch
-{
-       struct rcu_head rcu;
-       unsigned int    index;
-       pgtable_free_t  tables[0];
-};
-
-DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
-unsigned long pte_freelist_forced_free;
-
-#define PTE_FREELIST_SIZE \
-       ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
-         / sizeof(pgtable_free_t))
-
-#ifdef CONFIG_SMP
-static void pte_free_smp_sync(void *arg)
-{
-       /* Do nothing, just ensure we sync with all CPUs */
-}
-#endif
-
-/* This is only called when we are critically out of memory
- * (and fail to get a page in pte_free_tlb).
- */
-static void pgtable_free_now(pgtable_free_t pgf)
-{
-       pte_freelist_forced_free++;
-
-       smp_call_function(pte_free_smp_sync, NULL, 0, 1);
-
-       pgtable_free(pgf);
-}
-
-static void pte_free_rcu_callback(struct rcu_head *head)
-{
-       struct pte_freelist_batch *batch =
-               container_of(head, struct pte_freelist_batch, rcu);
-       unsigned int i;
-
-       for (i = 0; i < batch->index; i++)
-               pgtable_free(batch->tables[i]);
-
-       free_page((unsigned long)batch);
-}
-
-static void pte_free_submit(struct pte_freelist_batch *batch)
-{
-       INIT_RCU_HEAD(&batch->rcu);
-       call_rcu(&batch->rcu, pte_free_rcu_callback);
-}
-
-void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
-{
-       /* This is safe as we are holding page_table_lock */
-        cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
-       struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
-
-       if (atomic_read(&tlb->mm->mm_users) < 2 ||
-           cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
-               pgtable_free(pgf);
-               return;
-       }
-
-       if (*batchp == NULL) {
-               *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
-               if (*batchp == NULL) {
-                       pgtable_free_now(pgf);
-                       return;
-               }
-               (*batchp)->index = 0;
-       }
-       (*batchp)->tables[(*batchp)->index++] = pgf;
-       if ((*batchp)->index == PTE_FREELIST_SIZE) {
-               pte_free_submit(*batchp);
-               *batchp = NULL;
-       }
-}
-
-/*
- * Update the MMU hash table to correspond with a change to
- * a Linux PTE.  If wrprot is true, it is permissible to
- * change the existing HPTE to read-only rather than removing it
- * (if we remove it we should clear the _PTE_HPTEFLAGS bits).
- */
-void hpte_update(struct mm_struct *mm, unsigned long addr,
-                unsigned long pte, int wrprot)
-{
-       struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
-       unsigned long vsid;
-       int i;
-
-       i = batch->index;
-
-       /*
-        * This can happen when we are in the middle of a TLB batch and
-        * we encounter memory pressure (eg copy_page_range when it tries
-        * to allocate a new pte). If we have to reclaim memory and end
-        * up scanning and resetting referenced bits then our batch context
-        * will change mid stream.
-        */
-       if (i != 0 && (mm != batch->mm || batch->large != pte_huge(pte))) {
-               flush_tlb_pending();
-               i = 0;
-       }
-       if (i == 0) {
-               batch->mm = mm;
-               batch->large = pte_huge(pte);
-       }
-       if (addr < KERNELBASE) {
-               vsid = get_vsid(mm->context.id, addr);
-               WARN_ON(vsid == 0);
-       } else
-               vsid = get_kernel_vsid(addr);
-       batch->vaddr[i] = (vsid << 28 ) | (addr & 0x0fffffff);
-       batch->pte[i] = __pte(pte);
-       batch->index = ++i;
-       if (i >= PPC64_TLB_BATCH_NR)
-               flush_tlb_pending();
-}
-
-void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
-{
-       int i;
-       int cpu;
-       cpumask_t tmp;
-       int local = 0;
-
-       BUG_ON(in_interrupt());
-
-       cpu = get_cpu();
-       i = batch->index;
-       tmp = cpumask_of_cpu(cpu);
-       if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
-               local = 1;
-
-       if (i == 1)
-               flush_hash_page(batch->vaddr[0], batch->pte[0], local);
-       else
-               flush_hash_range(i, local);
-       batch->index = 0;
-       put_cpu();
-}
-
-void pte_free_finish(void)
-{
-       /* This is safe as we are holding page_table_lock */
-       struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
-
-       if (*batchp == NULL)
-               return;
-       pte_free_submit(*batchp);
-       *batchp = NULL;
-}
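
pgtable_free_tlb() above batches freed page tables into a page-sized buffer whose capacity, PTE_FREELIST_SIZE, is simply whatever fits after the header, and submits the whole batch through RCU once it fills. The sketch below reproduces the sizing and fill-then-submit pattern in user space; the types are stand-ins for the kernel ones and the RCU hand-off is replaced by an immediate flush:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL
typedef unsigned long pgtable_free_t;	/* stand-in for the kernel type */

struct pte_freelist_batch {
	unsigned int	index;
	pgtable_free_t	tables[];	/* fills the rest of the page */
};

#define PTE_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
	  / sizeof(pgtable_free_t))

static struct pte_freelist_batch *cur;

static void batch_submit(struct pte_freelist_batch *b)
{
	/* the kernel queues this through call_rcu(); here we just flush */
	printf("submitting batch of %u tables\n", b->index);
	free(b);
}

static void batch_add(pgtable_free_t pgf)
{
	if (!cur) {
		cur = malloc(PAGE_SIZE);
		if (!cur)
			abort();
		cur->index = 0;
	}
	cur->tables[cur->index++] = pgf;
	if (cur->index == PTE_FREELIST_SIZE) {
		batch_submit(cur);
		cur = NULL;
	}
}

int main(void)
{
	unsigned long i;

	printf("batch capacity: %zu entries\n", PTE_FREELIST_SIZE);
	for (i = 0; i < PTE_FREELIST_SIZE + 3; i++)
		batch_add(i);
	if (cur)			/* leftovers not yet submitted */
		batch_submit(cur);
	return 0;
}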