x86_64: move mm

author Thomas Gleixner <tglx@linutronix.de>

Thu, 11 Oct 2007 09:17:18 +0000 (11:17 +0200)

committer Thomas Gleixner <tglx@linutronix.de>

Thu, 11 Oct 2007 09:17:18 +0000 (11:17 +0200)
author Thomas Gleixner <tglx@linutronix.de>
Thu, 11 Oct 2007 09:17:18 +0000 (11:17 +0200)
committer Thomas Gleixner <tglx@linutronix.de>
Thu, 11 Oct 2007 09:17:18 +0000 (11:17 +0200)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile

index 7317648e6587c07e3765f4bc93b5226b44179605..9832910968486f3de246f0f1065021f5de977871 100644 (file)
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
  ifeq ($(CONFIG_X86_32),y)
  include ${srctree}/arch/x86/mm/Makefile_32
  else
-include ${srctree}/arch/x86_64/mm/Makefile_64
+include ${srctree}/arch/x86/mm/Makefile_64
  endif
diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64

new file mode 100644 (file)

index 0000000..6bcb479
--- /dev/null
+++ b/arch/x86/mm/Makefile_64
@@ -0,0 +1,10 @@
+#
+# Makefile for the linux x86_64-specific parts of the memory manager.
+#
+
+obj-y   := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_NUMA) += numa_64.o
+obj-$(CONFIG_K8_NUMA) += k8topology_64.o
+obj-$(CONFIG_ACPI_NUMA) += srat_64.o
+
diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c

new file mode 100644 (file)

index 0000000..79ac6e7
--- /dev/null
+++ b/arch/x86/mm/extable_64.c
@@ -0,0 +1,34 @@
+/*
+ * linux/arch/x86_64/mm/extable.c
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+
+/* Simple binary search */
+const struct exception_table_entry *
+search_extable(const struct exception_table_entry *first,
+              const struct exception_table_entry *last,
+              unsigned long value)
+{
+       /* Work around a B stepping K8 bug */
+       if ((value >> 32) == 0)
+               value |= 0xffffffffUL << 32; 
+
+        while (first <= last) {
+               const struct exception_table_entry *mid;
+               long diff;
+
+               mid = (last - first) / 2 + first;
+               diff = mid->insn - value;
+                if (diff == 0)
+                        return mid;
+                else if (diff < 0)
+                        first = mid+1;
+                else
+                        last = mid-1;
+        }
+        return NULL;
+}
diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c

new file mode 100644 (file)

index 0000000..54816ad
--- /dev/null
+++ b/arch/x86/mm/fault_64.c
@@ -0,0 +1,636 @@
+/*
+ *  linux/arch/x86-64/mm/fault.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>             /* For unblank_screen() */
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+
+#include <asm/system.h>
+#include <asm/pgalloc.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+#include <asm-generic/sections.h>
+
+/* Page fault error code bits */
+#define PF_PROT        (1<<0)          /* or no page found */
+#define PF_WRITE       (1<<1)
+#define PF_USER        (1<<2)
+#define PF_RSVD        (1<<3)
+#define PF_INSTR       (1<<4)
+
+static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
+
+/* Hook to register for page fault notifications */
+int register_page_fault_notifier(struct notifier_block *nb)
+{
+       vmalloc_sync_all();
+       return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_page_fault_notifier);
+
+int unregister_page_fault_notifier(struct notifier_block *nb)
+{
+       return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
+
+static inline int notify_page_fault(struct pt_regs *regs, long err)
+{
+       struct die_args args = {
+               .regs = regs,
+               .str = "page fault",
+               .err = err,
+               .trapnr = 14,
+               .signr = SIGSEGV
+       };
+       return atomic_notifier_call_chain(&notify_page_fault_chain,
+                                         DIE_PAGE_FAULT, &args);
+}
+
+/* Sometimes the CPU reports invalid exceptions on prefetch.
+   Check that here and ignore.
+   Opcode checker based on code by Richard Brunner */
+static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
+                               unsigned long error_code)
+{ 
+       unsigned char *instr;
+       int scan_more = 1;
+       int prefetch = 0; 
+       unsigned char *max_instr;
+
+       /* If it was a exec fault ignore */
+       if (error_code & PF_INSTR)
+               return 0;
+       
+       instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
+       max_instr = instr + 15;
+
+       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+               return 0;
+
+       while (scan_more && instr < max_instr) { 
+               unsigned char opcode;
+               unsigned char instr_hi;
+               unsigned char instr_lo;
+
+               if (probe_kernel_address(instr, opcode))
+                       break; 
+
+               instr_hi = opcode & 0xf0; 
+               instr_lo = opcode & 0x0f; 
+               instr++;
+
+               switch (instr_hi) { 
+               case 0x20:
+               case 0x30:
+                       /* Values 0x26,0x2E,0x36,0x3E are valid x86
+                          prefixes.  In long mode, the CPU will signal
+                          invalid opcode if some of these prefixes are
+                          present so we will never get here anyway */
+                       scan_more = ((instr_lo & 7) == 0x6);
+                       break;
+                       
+               case 0x40:
+                       /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
+                          Need to figure out under what instruction mode the
+                          instruction was issued ... */
+                       /* Could check the LDT for lm, but for now it's good
+                          enough to assume that long mode only uses well known
+                          segments or kernel. */
+                       scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
+                       break;
+                       
+               case 0x60:
+                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
+                       scan_more = (instr_lo & 0xC) == 0x4;
+                       break;          
+               case 0xF0:
+                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
+                       scan_more = !instr_lo || (instr_lo>>1) == 1;
+                       break;                  
+               case 0x00:
+                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
+                       scan_more = 0;
+                       if (probe_kernel_address(instr, opcode))
+                               break;
+                       prefetch = (instr_lo == 0xF) &&
+                               (opcode == 0x0D || opcode == 0x18);
+                       break;                  
+               default:
+                       scan_more = 0;
+                       break;
+               } 
+       }
+       return prefetch;
+}
+
+static int bad_address(void *p) 
+{ 
+       unsigned long dummy;
+       return probe_kernel_address((unsigned long *)p, dummy);
+} 
+
+void dump_pagetable(unsigned long address)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = (pgd_t *)read_cr3();
+
+       pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 
+       pgd += pgd_index(address);
+       if (bad_address(pgd)) goto bad;
+       printk("PGD %lx ", pgd_val(*pgd));
+       if (!pgd_present(*pgd)) goto ret; 
+
+       pud = pud_offset(pgd, address);
+       if (bad_address(pud)) goto bad;
+       printk("PUD %lx ", pud_val(*pud));
+       if (!pud_present(*pud)) goto ret;
+
+       pmd = pmd_offset(pud, address);
+       if (bad_address(pmd)) goto bad;
+       printk("PMD %lx ", pmd_val(*pmd));
+       if (!pmd_present(*pmd)) goto ret;        
+
+       pte = pte_offset_kernel(pmd, address);
+       if (bad_address(pte)) goto bad;
+       printk("PTE %lx", pte_val(*pte)); 
+ret:
+       printk("\n");
+       return;
+bad:
+       printk("BAD\n");
+}
+
+static const char errata93_warning[] = 
+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
+KERN_ERR "******* Please consider a BIOS update.\n"
+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+
+/* Workaround for K8 erratum #93 & buggy BIOS.
+   BIOS SMM functions are required to use a specific workaround
+   to avoid corruption of the 64bit RIP register on C stepping K8. 
+   A lot of BIOS that didn't get tested properly miss this. 
+   The OS sees this as a page fault with the upper 32bits of RIP cleared.
+   Try to work around it here.
+   Note we only handle faults in kernel here. */
+
+static int is_errata93(struct pt_regs *regs, unsigned long address) 
+{
+       static int warned;
+       if (address != regs->rip)
+               return 0;
+       if ((address >> 32) != 0) 
+               return 0;
+       address |= 0xffffffffUL << 32;
+       if ((address >= (u64)_stext && address <= (u64)_etext) || 
+           (address >= MODULES_VADDR && address <= MODULES_END)) { 
+               if (!warned) {
+                       printk(errata93_warning);               
+                       warned = 1;
+               }
+               regs->rip = address;
+               return 1;
+       }
+       return 0;
+} 
+
+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
+                                unsigned long error_code)
+{
+       unsigned long flags = oops_begin();
+       struct task_struct *tsk;
+
+       printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+              current->comm, address);
+       dump_pagetable(address);
+       tsk = current;
+       tsk->thread.cr2 = address;
+       tsk->thread.trap_no = 14;
+       tsk->thread.error_code = error_code;
+       __die("Bad pagetable", regs, error_code);
+       oops_end(flags);
+       do_exit(SIGKILL);
+}
+
+/*
+ * Handle a fault on the vmalloc area
+ *
+ * This assumes no large pages in there.
+ */
+static int vmalloc_fault(unsigned long address)
+{
+       pgd_t *pgd, *pgd_ref;
+       pud_t *pud, *pud_ref;
+       pmd_t *pmd, *pmd_ref;
+       pte_t *pte, *pte_ref;
+
+       /* Copy kernel mappings over when needed. This can also
+          happen within a race in page table update. In the later
+          case just flush. */
+
+       pgd = pgd_offset(current->mm ?: &init_mm, address);
+       pgd_ref = pgd_offset_k(address);
+       if (pgd_none(*pgd_ref))
+               return -1;
+       if (pgd_none(*pgd))
+               set_pgd(pgd, *pgd_ref);
+       else
+               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+       /* Below here mismatches are bugs because these lower tables
+          are shared */
+
+       pud = pud_offset(pgd, address);
+       pud_ref = pud_offset(pgd_ref, address);
+       if (pud_none(*pud_ref))
+               return -1;
+       if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
+               BUG();
+       pmd = pmd_offset(pud, address);
+       pmd_ref = pmd_offset(pud_ref, address);
+       if (pmd_none(*pmd_ref))
+               return -1;
+       if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+               BUG();
+       pte_ref = pte_offset_kernel(pmd_ref, address);
+       if (!pte_present(*pte_ref))
+               return -1;
+       pte = pte_offset_kernel(pmd, address);
+       /* Don't use pte_page here, because the mappings can point
+          outside mem_map, and the NUMA hash lookup cannot handle
+          that. */
+       if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+               BUG();
+       return 0;
+}
+
+static int page_fault_trace;
+int show_unhandled_signals = 1;
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
+                                       unsigned long error_code)
+{
+       struct task_struct *tsk;
+       struct mm_struct *mm;
+       struct vm_area_struct * vma;
+       unsigned long address;
+       const struct exception_table_entry *fixup;
+       int write, fault;
+       unsigned long flags;
+       siginfo_t info;
+
+       tsk = current;
+       mm = tsk->mm;
+       prefetchw(&mm->mmap_sem);
+
+       /* get the address */
+       address = read_cr2();
+
+       info.si_code = SEGV_MAPERR;
+
+
+       /*
+        * We fault-in kernel-space virtual memory on-demand. The
+        * 'reference' page table is init_mm.pgd.
+        *
+        * NOTE! We MUST NOT take any locks for this case. We may
+        * be in an interrupt or a critical region, and should
+        * only copy the information from the master page table,
+        * nothing more.
+        *
+        * This verifies that the fault happens in kernel space
+        * (error_code & 4) == 0, and that the fault was not a
+        * protection error (error_code & 9) == 0.
+        */
+       if (unlikely(address >= TASK_SIZE64)) {
+               /*
+                * Don't check for the module range here: its PML4
+                * is always initialized because it's shared with the main
+                * kernel text. Only vmalloc may need PML4 syncups.
+                */
+               if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+                     ((address >= VMALLOC_START && address < VMALLOC_END))) {
+                       if (vmalloc_fault(address) >= 0)
+                               return;
+               }
+               if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
+                       return;
+               /*
+                * Don't take the mm semaphore here. If we fixup a prefetch
+                * fault we could otherwise deadlock.
+                */
+               goto bad_area_nosemaphore;
+       }
+
+       if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
+               return;
+
+       if (likely(regs->eflags & X86_EFLAGS_IF))
+               local_irq_enable();
+
+       if (unlikely(page_fault_trace))
+               printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
+                      regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); 
+
+       if (unlikely(error_code & PF_RSVD))
+               pgtable_bad(address, regs, error_code);
+
+       /*
+        * If we're in an interrupt or have no user
+        * context, we must not take the fault..
+        */
+       if (unlikely(in_atomic() || !mm))
+               goto bad_area_nosemaphore;
+
+       /*
+        * User-mode registers count as a user access even for any
+        * potential system fault or CPU buglet.
+        */
+       if (user_mode_vm(regs))
+               error_code |= PF_USER;
+
+ again:
+       /* When running in the kernel we expect faults to occur only to
+        * addresses in user space.  All other faults represent errors in the
+        * kernel and should generate an OOPS.  Unfortunatly, in the case of an
+        * erroneous fault occurring in a code path which already holds mmap_sem
+        * we will deadlock attempting to validate the fault against the
+        * address space.  Luckily the kernel only validly references user
+        * space from well defined areas of code, which are listed in the
+        * exceptions table.
+        *
+        * As the vast majority of faults will be valid we will only perform
+        * the source reference check when there is a possibilty of a deadlock.
+        * Attempt to lock the address space, if we cannot we then validate the
+        * source.  If this is invalid we can skip the address space check,
+        * thus avoiding the deadlock.
+        */
+       if (!down_read_trylock(&mm->mmap_sem)) {
+               if ((error_code & PF_USER) == 0 &&
+                   !search_exception_tables(regs->rip))
+                       goto bad_area_nosemaphore;
+               down_read(&mm->mmap_sem);
+       }
+
+       vma = find_vma(mm, address);
+       if (!vma)
+               goto bad_area;
+       if (likely(vma->vm_start <= address))
+               goto good_area;
+       if (!(vma->vm_flags & VM_GROWSDOWN))
+               goto bad_area;
+       if (error_code & 4) {
+               /* Allow userspace just enough access below the stack pointer
+                * to let the 'enter' instruction work.
+                */
+               if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
+                       goto bad_area;
+       }
+       if (expand_stack(vma, address))
+               goto bad_area;
+/*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+       info.si_code = SEGV_ACCERR;
+       write = 0;
+       switch (error_code & (PF_PROT|PF_WRITE)) {
+               default:        /* 3: write, present */
+                       /* fall through */
+               case PF_WRITE:          /* write, not present */
+                       if (!(vma->vm_flags & VM_WRITE))
+                               goto bad_area;
+                       write++;
+                       break;
+               case PF_PROT:           /* read, present */
+                       goto bad_area;
+               case 0:                 /* read, not present */
+                       if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+                               goto bad_area;
+       }
+
+       /*
+        * If for any reason at all we couldn't handle the fault,
+        * make sure we exit gracefully rather than endlessly redo
+        * the fault.
+        */
+       fault = handle_mm_fault(mm, vma, address, write);
+       if (unlikely(fault & VM_FAULT_ERROR)) {
+               if (fault & VM_FAULT_OOM)
+                       goto out_of_memory;
+               else if (fault & VM_FAULT_SIGBUS)
+                       goto do_sigbus;
+               BUG();
+       }
+       if (fault & VM_FAULT_MAJOR)
+               tsk->maj_flt++;
+       else
+               tsk->min_flt++;
+       up_read(&mm->mmap_sem);
+       return;
+
+/*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+bad_area:
+       up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+       /* User mode accesses just cause a SIGSEGV */
+       if (error_code & PF_USER) {
+
+               /*
+                * It's possible to have interrupts off here.
+                */
+               local_irq_enable();
+
+               if (is_prefetch(regs, address, error_code))
+                       return;
+
+               /* Work around K8 erratum #100 K8 in compat mode
+                  occasionally jumps to illegal addresses >4GB.  We
+                  catch this here in the page fault handler because
+                  these addresses are not reachable. Just detect this
+                  case and return.  Any code segment in LDT is
+                  compatibility mode. */
+               if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
+                   (address >> 32))
+                       return;
+
+               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+                   printk_ratelimit()) {
+                       printk(
+                      "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
+                                       tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
+                                       tsk->comm, tsk->pid, address, regs->rip,
+                                       regs->rsp, error_code);
+               }
+       
+               tsk->thread.cr2 = address;
+               /* Kernel addresses are always protection faults */
+               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+               tsk->thread.trap_no = 14;
+               info.si_signo = SIGSEGV;
+               info.si_errno = 0;
+               /* info.si_code has been set above */
+               info.si_addr = (void __user *)address;
+               force_sig_info(SIGSEGV, &info, tsk);
+               return;
+       }
+
+no_context:
+       
+       /* Are we prepared to handle this kernel fault?  */
+       fixup = search_exception_tables(regs->rip);
+       if (fixup) {
+               regs->rip = fixup->fixup;
+               return;
+       }
+
+       /* 
+        * Hall of shame of CPU/BIOS bugs.
+        */
+
+       if (is_prefetch(regs, address, error_code))
+               return;
+
+       if (is_errata93(regs, address))
+               return; 
+
+/*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+
+       flags = oops_begin();
+
+       if (address < PAGE_SIZE)
+               printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
+       else
+               printk(KERN_ALERT "Unable to handle kernel paging request");
+       printk(" at %016lx RIP: \n" KERN_ALERT,address);
+       printk_address(regs->rip);
+       dump_pagetable(address);
+       tsk->thread.cr2 = address;
+       tsk->thread.trap_no = 14;
+       tsk->thread.error_code = error_code;
+       __die("Oops", regs, error_code);
+       /* Executive summary in case the body of the oops scrolled away */
+       printk(KERN_EMERG "CR2: %016lx\n", address);
+       oops_end(flags);
+       do_exit(SIGKILL);
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+       up_read(&mm->mmap_sem);
+       if (is_init(current)) {
+               yield();
+               goto again;
+       }
+       printk("VM: killing process %s\n", tsk->comm);
+       if (error_code & 4)
+               do_group_exit(SIGKILL);
+       goto no_context;
+
+do_sigbus:
+       up_read(&mm->mmap_sem);
+
+       /* Kernel mode? Handle exceptions or die */
+       if (!(error_code & PF_USER))
+               goto no_context;
+
+       tsk->thread.cr2 = address;
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_no = 14;
+       info.si_signo = SIGBUS;
+       info.si_errno = 0;
+       info.si_code = BUS_ADRERR;
+       info.si_addr = (void __user *)address;
+       force_sig_info(SIGBUS, &info, tsk);
+       return;
+}
+
+DEFINE_SPINLOCK(pgd_lock);
+LIST_HEAD(pgd_list);
+
+void vmalloc_sync_all(void)
+{
+       /* Note that races in the updates of insync and start aren't 
+          problematic:
+          insync can only get set bits added, and updates to start are only
+          improving performance (without affecting correctness if undone). */
+       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+       static unsigned long start = VMALLOC_START & PGDIR_MASK;
+       unsigned long address;
+
+       for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+               if (!test_bit(pgd_index(address), insync)) {
+                       const pgd_t *pgd_ref = pgd_offset_k(address);
+                       struct page *page;
+
+                       if (pgd_none(*pgd_ref))
+                               continue;
+                       spin_lock(&pgd_lock);
+                       list_for_each_entry(page, &pgd_list, lru) {
+                               pgd_t *pgd;
+                               pgd = (pgd_t *)page_address(page) + pgd_index(address);
+                               if (pgd_none(*pgd))
+                                       set_pgd(pgd, *pgd_ref);
+                               else
+                                       BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+                       }
+                       spin_unlock(&pgd_lock);
+                       set_bit(pgd_index(address), insync);
+               }
+               if (address == start)
+                       start = address + PGDIR_SIZE;
+       }
+       /* Check that there is no need to do the same for the modules area. */
+       BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+       BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 
+                               (__START_KERNEL & PGDIR_MASK)));
+}
+
+static int __init enable_pagefaulttrace(char *str)
+{
+       page_fault_trace = 1;
+       return 1;
+}
+__setup("pagefaulttrace", enable_pagefaulttrace);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c

new file mode 100644 (file)

index 0000000..458893b
--- /dev/null
+++ b/arch/x86/mm/init_64.c
@@ -0,0 +1,750 @@
+/*
+ *  linux/arch/x86_64/mm/init.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
+ *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/proc_fs.h>
+#include <linux/pci.h>
+#include <linux/pfn.h>
+#include <linux/poison.h>
+#include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <linux/memory_hotplug.h>
+#include <linux/nmi.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+#include <asm/sections.h>
+
+#ifndef Dprintk
+#define Dprintk(x...)
+#endif
+
+const struct dma_mapping_ops* dma_ops;
+EXPORT_SYMBOL(dma_ops);
+
+static unsigned long dma_reserve __initdata;
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+/*
+ * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
+ * physical space so we can cache the place of the first one and move
+ * around without checking the pgd every time.
+ */
+
+void show_mem(void)
+{
+       long i, total = 0, reserved = 0;
+       long shared = 0, cached = 0;
+       pg_data_t *pgdat;
+       struct page *page;
+
+       printk(KERN_INFO "Mem-info:\n");
+       show_free_areas();
+       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+
+       for_each_online_pgdat(pgdat) {
+               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+                       /* this loop can take a while with 256 GB and 4k pages
+                          so update the NMI watchdog */
+                       if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
+                               touch_nmi_watchdog();
+                       }
+                       if (!pfn_valid(pgdat->node_start_pfn + i))
+                               continue;
+                       page = pfn_to_page(pgdat->node_start_pfn + i);
+                       total++;
+                       if (PageReserved(page))
+                               reserved++;
+                       else if (PageSwapCache(page))
+                               cached++;
+                       else if (page_count(page))
+                               shared += page_count(page) - 1;
+               }
+       }
+       printk(KERN_INFO "%lu pages of RAM\n", total);
+       printk(KERN_INFO "%lu reserved pages\n",reserved);
+       printk(KERN_INFO "%lu pages shared\n",shared);
+       printk(KERN_INFO "%lu pages swap cached\n",cached);
+}
+
+int after_bootmem;
+
+static __init void *spp_getpage(void)
+{ 
+       void *ptr;
+       if (after_bootmem)
+               ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
+       else
+               ptr = alloc_bootmem_pages(PAGE_SIZE);
+       if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
+               panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
+
+       Dprintk("spp_getpage %p\n", ptr);
+       return ptr;
+} 
+
+static __init void set_pte_phys(unsigned long vaddr,
+                        unsigned long phys, pgprot_t prot)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte, new_pte;
+
+       Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
+
+       pgd = pgd_offset_k(vaddr);
+       if (pgd_none(*pgd)) {
+               printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
+               return;
+       }
+       pud = pud_offset(pgd, vaddr);
+       if (pud_none(*pud)) {
+               pmd = (pmd_t *) spp_getpage(); 
+               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
+               if (pmd != pmd_offset(pud, 0)) {
+                       printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
+                       return;
+               }
+       }
+       pmd = pmd_offset(pud, vaddr);
+       if (pmd_none(*pmd)) {
+               pte = (pte_t *) spp_getpage();
+               set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
+               if (pte != pte_offset_kernel(pmd, 0)) {
+                       printk("PAGETABLE BUG #02!\n");
+                       return;
+               }
+       }
+       new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
+
+       pte = pte_offset_kernel(pmd, vaddr);
+       if (!pte_none(*pte) &&
+           pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
+               pte_ERROR(*pte);
+       set_pte(pte, new_pte);
+
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+/* NOTE: this is meant to be run only at boot */
+void __init 
+__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+{
+       unsigned long address = __fix_to_virt(idx);
+
+       if (idx >= __end_of_fixed_addresses) {
+               printk("Invalid __set_fixmap\n");
+               return;
+       }
+       set_pte_phys(address, phys, prot);
+}
+
+unsigned long __meminitdata table_start, table_end;
+
+static __meminit void *alloc_low_page(unsigned long *phys)
+{ 
+       unsigned long pfn = table_end++;
+       void *adr;
+
+       if (after_bootmem) {
+               adr = (void *)get_zeroed_page(GFP_ATOMIC);
+               *phys = __pa(adr);
+               return adr;
+       }
+
+       if (pfn >= end_pfn) 
+               panic("alloc_low_page: ran out of memory"); 
+
+       adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
+       memset(adr, 0, PAGE_SIZE);
+       *phys  = pfn * PAGE_SIZE;
+       return adr;
+}
+
+static __meminit void unmap_low_page(void *adr)
+{ 
+
+       if (after_bootmem)
+               return;
+
+       early_iounmap(adr, PAGE_SIZE);
+} 
+
+/* Must run before zap_low_mappings */
+__meminit void *early_ioremap(unsigned long addr, unsigned long size)
+{
+       unsigned long vaddr;
+       pmd_t *pmd, *last_pmd;
+       int i, pmds;
+
+       pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
+       vaddr = __START_KERNEL_map;
+       pmd = level2_kernel_pgt;
+       last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
+       for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
+               for (i = 0; i < pmds; i++) {
+                       if (pmd_present(pmd[i]))
+                               goto next;
+               }
+               vaddr += addr & ~PMD_MASK;
+               addr &= PMD_MASK;
+               for (i = 0; i < pmds; i++, addr += PMD_SIZE)
+                       set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
+               __flush_tlb();
+               return (void *)vaddr;
+       next:
+               ;
+       }
+       printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
+       return NULL;
+}
+
+/* To avoid virtual aliases later */
+__meminit void early_iounmap(void *addr, unsigned long size)
+{
+       unsigned long vaddr;
+       pmd_t *pmd;
+       int i, pmds;
+
+       vaddr = (unsigned long)addr;
+       pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
+       pmd = level2_kernel_pgt + pmd_index(vaddr);
+       for (i = 0; i < pmds; i++)
+               pmd_clear(pmd + i);
+       __flush_tlb();
+}
+
+static void __meminit
+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
+{
+       int i = pmd_index(address);
+
+       for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
+               unsigned long entry;
+               pmd_t *pmd = pmd_page + pmd_index(address);
+
+               if (address >= end) {
+                       if (!after_bootmem)
+                               for (; i < PTRS_PER_PMD; i++, pmd++)
+                                       set_pmd(pmd, __pmd(0));
+                       break;
+               }
+
+               if (pmd_val(*pmd))
+                       continue;
+
+               entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
+               entry &= __supported_pte_mask;
+               set_pmd(pmd, __pmd(entry));
+       }
+}
+
+static void __meminit
+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+{
+       pmd_t *pmd = pmd_offset(pud,0);
+       spin_lock(&init_mm.page_table_lock);
+       phys_pmd_init(pmd, address, end);
+       spin_unlock(&init_mm.page_table_lock);
+       __flush_tlb_all();
+}
+
+static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+{ 
+       int i = pud_index(addr);
+
+
+       for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
+               unsigned long pmd_phys;
+               pud_t *pud = pud_page + pud_index(addr);
+               pmd_t *pmd;
+
+               if (addr >= end)
+                       break;
+
+               if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
+                       set_pud(pud, __pud(0)); 
+                       continue;
+               } 
+
+               if (pud_val(*pud)) {
+                       phys_pmd_update(pud, addr, end);
+                       continue;
+               }
+
+               pmd = alloc_low_page(&pmd_phys);
+               spin_lock(&init_mm.page_table_lock);
+               set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
+               phys_pmd_init(pmd, addr, end);
+               spin_unlock(&init_mm.page_table_lock);
+               unmap_low_page(pmd);
+       }
+       __flush_tlb();
+} 
+
+static void __init find_early_table_space(unsigned long end)
+{
+       unsigned long puds, pmds, tables, start;
+
+       puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+       pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+       tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
+                round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+       /* RED-PEN putting page tables only on node 0 could
+          cause a hotspot and fill up ZONE_DMA. The page tables
+          need roughly 0.5KB per GB. */
+       start = 0x8000;
+       table_start = find_e820_area(start, end, tables);
+       if (table_start == -1UL)
+               panic("Cannot find space for the kernel page tables");
+
+       table_start >>= PAGE_SHIFT;
+       table_end = table_start;
+
+       early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
+               end, table_start << PAGE_SHIFT,
+               (table_start << PAGE_SHIFT) + tables);
+}
+
+/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
+   This runs before bootmem is initialized and gets pages directly from the 
+   physical memory. To access them they are temporarily mapped. */
+void __meminit init_memory_mapping(unsigned long start, unsigned long end)
+{ 
+       unsigned long next; 
+
+       Dprintk("init_memory_mapping\n");
+
+       /* 
+        * Find space for the kernel direct mapping tables.
+        * Later we should allocate these tables in the local node of the memory
+        * mapped.  Unfortunately this is done currently before the nodes are 
+        * discovered.
+        */
+       if (!after_bootmem)
+               find_early_table_space(end);
+
+       start = (unsigned long)__va(start);
+       end = (unsigned long)__va(end);
+
+       for (; start < end; start = next) {
+               unsigned long pud_phys; 
+               pgd_t *pgd = pgd_offset_k(start);
+               pud_t *pud;
+
+               if (after_bootmem)
+                       pud = pud_offset(pgd, start & PGDIR_MASK);
+               else
+                       pud = alloc_low_page(&pud_phys);
+
+               next = start + PGDIR_SIZE;
+               if (next > end) 
+                       next = end; 
+               phys_pud_init(pud, __pa(start), __pa(next));
+               if (!after_bootmem)
+                       set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+               unmap_low_page(pud);
+       } 
+
+       if (!after_bootmem)
+               mmu_cr4_features = read_cr4();
+       __flush_tlb_all();
+}
+
+#ifndef CONFIG_NUMA
+void __init paging_init(void)
+{
+       unsigned long max_zone_pfns[MAX_NR_ZONES];
+       memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+       max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+       max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+       max_zone_pfns[ZONE_NORMAL] = end_pfn;
+
+       memory_present(0, 0, end_pfn);
+       sparse_init();
+       free_area_init_nodes(max_zone_pfns);
+}
+#endif
+
+/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
+   from the CPU leading to inconsistent cache lines. address and size
+   must be aligned to 2MB boundaries. 
+   Does nothing when the mapping doesn't exist. */
+void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
+{
+       unsigned long end = address + size;
+
+       BUG_ON(address & ~LARGE_PAGE_MASK);
+       BUG_ON(size & ~LARGE_PAGE_MASK); 
+       
+       for (; address < end; address += LARGE_PAGE_SIZE) { 
+               pgd_t *pgd = pgd_offset_k(address);
+               pud_t *pud;
+               pmd_t *pmd;
+               if (pgd_none(*pgd))
+                       continue;
+               pud = pud_offset(pgd, address);
+               if (pud_none(*pud))
+                       continue; 
+               pmd = pmd_offset(pud, address);
+               if (!pmd || pmd_none(*pmd))
+                       continue; 
+               if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
+                       /* Could handle this, but it should not happen currently. */
+                       printk(KERN_ERR 
+              "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
+                       pmd_ERROR(*pmd); 
+               }
+               set_pmd(pmd, __pmd(0));                 
+       }
+       __flush_tlb_all();
+} 
+
+/*
+ * Memory hotplug specific functions
+ */
+void online_page(struct page *page)
+{
+       ClearPageReserved(page);
+       init_page_count(page);
+       __free_page(page);
+       totalram_pages++;
+       num_physpages++;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Memory is added always to NORMAL zone. This means you will never get
+ * additional DMA/DMA32 memory.
+ */
+int arch_add_memory(int nid, u64 start, u64 size)
+{
+       struct pglist_data *pgdat = NODE_DATA(nid);
+       struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
+       unsigned long start_pfn = start >> PAGE_SHIFT;
+       unsigned long nr_pages = size >> PAGE_SHIFT;
+       int ret;
+
+       init_memory_mapping(start, (start + size -1));
+
+       ret = __add_pages(zone, start_pfn, nr_pages);
+       if (ret)
+               goto error;
+
+       return ret;
+error:
+       printk("%s: Problem encountered in __add_pages!\n", __func__);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(arch_add_memory);
+
+int remove_memory(u64 start, u64 size)
+{
+       return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(remove_memory);
+
+#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
+int memory_add_physaddr_to_nid(u64 start)
+{
+       return 0;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+/*
+ * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
+ * just online the pages.
+ */
+int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
+{
+       int err = -EIO;
+       unsigned long pfn;
+       unsigned long total = 0, mem = 0;
+       for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
+               if (pfn_valid(pfn)) {
+                       online_page(pfn_to_page(pfn));
+                       err = 0;
+                       mem++;
+               }
+               total++;
+       }
+       if (!err) {
+               z->spanned_pages += total;
+               z->present_pages += mem;
+               z->zone_pgdat->node_spanned_pages += total;
+               z->zone_pgdat->node_present_pages += mem;
+       }
+       return err;
+}
+#endif
+
+static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
+                        kcore_vsyscall;
+
+void __init mem_init(void)
+{
+       long codesize, reservedpages, datasize, initsize;
+
+       pci_iommu_alloc();
+
+       /* clear the zero-page */
+       memset(empty_zero_page, 0, PAGE_SIZE);
+
+       reservedpages = 0;
+
+       /* this will put all low memory onto the freelists */
+#ifdef CONFIG_NUMA
+       totalram_pages = numa_free_all_bootmem();
+#else
+       totalram_pages = free_all_bootmem();
+#endif
+       reservedpages = end_pfn - totalram_pages -
+                                       absent_pages_in_range(0, end_pfn);
+
+       after_bootmem = 1;
+
+       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
+       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
+       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
+
+       /* Register memory areas for /proc/kcore */
+       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
+       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
+                  VMALLOC_END-VMALLOC_START);
+       kclist_add(&kcore_kernel, &_stext, _end - _stext);
+       kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
+       kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
+                                VSYSCALL_END - VSYSCALL_START);
+
+       printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
+               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+               end_pfn << (PAGE_SHIFT-10),
+               codesize >> 10,
+               reservedpages << (PAGE_SHIFT-10),
+               datasize >> 10,
+               initsize >> 10);
+}
+
+void free_init_pages(char *what, unsigned long begin, unsigned long end)
+{
+       unsigned long addr;
+
+       if (begin >= end)
+               return;
+
+       printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+       for (addr = begin; addr < end; addr += PAGE_SIZE) {
+               ClearPageReserved(virt_to_page(addr));
+               init_page_count(virt_to_page(addr));
+               memset((void *)(addr & ~(PAGE_SIZE-1)),
+                       POISON_FREE_INITMEM, PAGE_SIZE);
+               if (addr >= __START_KERNEL_map)
+                       change_page_attr_addr(addr, 1, __pgprot(0));
+               free_page(addr);
+               totalram_pages++;
+       }
+       if (addr > __START_KERNEL_map)
+               global_flush_tlb();
+}
+
+void free_initmem(void)
+{
+       free_init_pages("unused kernel memory",
+                       (unsigned long)(&__init_begin),
+                       (unsigned long)(&__init_end));
+}
+
+#ifdef CONFIG_DEBUG_RODATA
+
+void mark_rodata_ro(void)
+{
+       unsigned long start = (unsigned long)_stext, end;
+
+#ifdef CONFIG_HOTPLUG_CPU
+       /* It must still be possible to apply SMP alternatives. */
+       if (num_possible_cpus() > 1)
+               start = (unsigned long)_etext;
+#endif
+
+#ifdef CONFIG_KPROBES
+       start = (unsigned long)__start_rodata;
+#endif
+       
+       end = (unsigned long)__end_rodata;
+       start = (start + PAGE_SIZE - 1) & PAGE_MASK;
+       end &= PAGE_MASK;
+       if (end <= start)
+               return;
+
+       change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
+
+       printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+              (end - start) >> 10);
+
+       /*
+        * change_page_attr_addr() requires a global_flush_tlb() call after it.
+        * We do this after the printk so that if something went wrong in the
+        * change, the printk gets out at least to give a better debug hint
+        * of who is the culprit.
+        */
+       global_flush_tlb();
+}
+#endif
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+       free_init_pages("initrd memory", start, end);
+}
+#endif
+
+void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
+{ 
+#ifdef CONFIG_NUMA
+       int nid = phys_to_nid(phys);
+#endif
+       unsigned long pfn = phys >> PAGE_SHIFT;
+       if (pfn >= end_pfn) {
+               /* This can happen with kdump kernels when accessing firmware
+                  tables. */
+               if (pfn < end_pfn_map)
+                       return;
+               printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
+                               phys, len);
+               return;
+       }
+
+       /* Should check here against the e820 map to avoid double free */
+#ifdef CONFIG_NUMA
+       reserve_bootmem_node(NODE_DATA(nid), phys, len);
+#else                  
+       reserve_bootmem(phys, len);    
+#endif
+       if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
+               dma_reserve += len / PAGE_SIZE;
+               set_dma_reserve(dma_reserve);
+       }
+}
+
+int kern_addr_valid(unsigned long addr) 
+{ 
+       unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       if (above != 0 && above != -1UL)
+               return 0; 
+       
+       pgd = pgd_offset_k(addr);
+       if (pgd_none(*pgd))
+               return 0;
+
+       pud = pud_offset(pgd, addr);
+       if (pud_none(*pud))
+               return 0; 
+
+       pmd = pmd_offset(pud, addr);
+       if (pmd_none(*pmd))
+               return 0;
+       if (pmd_large(*pmd))
+               return pfn_valid(pmd_pfn(*pmd));
+
+       pte = pte_offset_kernel(pmd, addr);
+       if (pte_none(*pte))
+               return 0;
+       return pfn_valid(pte_pfn(*pte));
+}
+
+/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
+   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
+   not need special handling anymore. */
+
+static struct vm_area_struct gate_vma = {
+       .vm_start = VSYSCALL_START,
+       .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
+       .vm_page_prot = PAGE_READONLY_EXEC,
+       .vm_flags = VM_READ | VM_EXEC
+};
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+#ifdef CONFIG_IA32_EMULATION
+       if (test_tsk_thread_flag(tsk, TIF_IA32))
+               return NULL;
+#endif
+       return &gate_vma;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+       struct vm_area_struct *vma = get_gate_vma(task);
+       if (!vma)
+               return 0;
+       return (addr >= vma->vm_start) && (addr < vma->vm_end);
+}
+
+/* Use this when you have no reliable task/vma, typically from interrupt
+ * context.  It is less reliable than using the task's vma and may give
+ * false positives.
+ */
+int in_gate_area_no_task(unsigned long addr)
+{
+       return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
+}
+
+void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+       return __alloc_bootmem_core(pgdat->bdata, size,
+                       SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
+}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+       if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+               return "[vdso]";
+       if (vma == &gate_vma)
+               return "[vsyscall]";
+       return NULL;
+}
diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c

new file mode 100644 (file)

index 0000000..6cac90a
--- /dev/null
+++ b/arch/x86/mm/ioremap_64.c
@@ -0,0 +1,210 @@
+/*
+ * arch/x86_64/mm/ioremap.c
+ *
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/io.h>
+
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/tlbflush.h>
+#include <asm/cacheflush.h>
+#include <asm/proto.h>
+
+unsigned long __phys_addr(unsigned long x)
+{
+       if (x >= __START_KERNEL_map)
+               return x - __START_KERNEL_map + phys_base;
+       return x - PAGE_OFFSET;
+}
+EXPORT_SYMBOL(__phys_addr);
+
+#define ISA_START_ADDRESS      0xa0000
+#define ISA_END_ADDRESS                0x100000
+
+/*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts.
+ */
+static int
+ioremap_change_attr(unsigned long phys_addr, unsigned long size,
+                                       unsigned long flags)
+{
+       int err = 0;
+       if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
+               unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+               unsigned long vaddr = (unsigned long) __va(phys_addr);
+
+               /*
+                * Must use a address here and not struct page because the phys addr
+                * can be a in hole between nodes and not have an memmap entry.
+                */
+               err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
+               if (!err)
+                       global_flush_tlb();
+       }
+       return err;
+}
+
+/*
+ * Generic mapping function
+ */
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+ * directly.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
+{
+       void * addr;
+       struct vm_struct * area;
+       unsigned long offset, last_addr;
+       pgprot_t pgprot;
+
+       /* Don't allow wraparound or zero size */
+       last_addr = phys_addr + size - 1;
+       if (!size || last_addr < phys_addr)
+               return NULL;
+
+       /*
+        * Don't remap the low PCI/ISA area, it's always mapped..
+        */
+       if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
+               return (__force void __iomem *)phys_to_virt(phys_addr);
+
+#ifdef CONFIG_FLATMEM
+       /*
+        * Don't allow anybody to remap normal RAM that we're using..
+        */
+       if (last_addr < virt_to_phys(high_memory)) {
+               char *t_addr, *t_end;
+               struct page *page;
+
+               t_addr = __va(phys_addr);
+               t_end = t_addr + (size - 1);
+          
+               for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
+                       if(!PageReserved(page))
+                               return NULL;
+       }
+#endif
+
+       pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL
+                         | _PAGE_DIRTY | _PAGE_ACCESSED | flags);
+       /*
+        * Mappings have to be page-aligned
+        */
+       offset = phys_addr & ~PAGE_MASK;
+       phys_addr &= PAGE_MASK;
+       size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+       /*
+        * Ok, go for it..
+        */
+       area = get_vm_area(size, VM_IOREMAP | (flags << 20));
+       if (!area)
+               return NULL;
+       area->phys_addr = phys_addr;
+       addr = area->addr;
+       if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
+                              phys_addr, pgprot)) {
+               remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+               return NULL;
+       }
+       if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) {
+               area->flags &= 0xffffff;
+               vunmap(addr);
+               return NULL;
+       }
+       return (__force void __iomem *) (offset + (char *)addr);
+}
+EXPORT_SYMBOL(__ioremap);
+
+/**
+ * ioremap_nocache     -   map bus memory into CPU space
+ * @offset:    bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address. 
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many 
+ * busses. In particular driver authors should read up on PCI writes
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ * 
+ * Must be freed with iounmap.
+ */
+
+void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
+{
+       return __ioremap(phys_addr, size, _PAGE_PCD);
+}
+EXPORT_SYMBOL(ioremap_nocache);
+
+/**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+ *
+ * Caller must ensure there is only one unmapping for the same pointer.
+ */
+void iounmap(volatile void __iomem *addr)
+{
+       struct vm_struct *p, *o;
+
+       if (addr <= high_memory) 
+               return; 
+       if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
+               addr < phys_to_virt(ISA_END_ADDRESS))
+               return;
+
+       addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
+       /* Use the vm area unlocked, assuming the caller
+          ensures there isn't another iounmap for the same address
+          in parallel. Reuse of the virtual address is prevented by
+          leaving it in the global lists until we're done with it.
+          cpa takes care of the direct mappings. */
+       read_lock(&vmlist_lock);
+       for (p = vmlist; p; p = p->next) {
+               if (p->addr == addr)
+                       break;
+       }
+       read_unlock(&vmlist_lock);
+
+       if (!p) {
+               printk("iounmap: bad address %p\n", addr);
+               dump_stack();
+               return;
+       }
+
+       /* Reset the direct mapping. Can block */
+       if (p->flags >> 20)
+               ioremap_change_attr(p->phys_addr, p->size, 0);
+
+       /* Finally remove it */
+       o = remove_vm_area((void *)addr);
+       BUG_ON(p != o || o == NULL);
+       kfree(p); 
+}
+EXPORT_SYMBOL(iounmap);
+
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c

new file mode 100644 (file)

index 0000000..a96006f
--- /dev/null
+++ b/arch/x86/mm/k8topology_64.c
@@ -0,0 +1,182 @@
+/* 
+ * AMD K8 NUMA support.
+ * Discover the memory map and associated nodes.
+ * 
+ * This version reads it directly from the K8 northbridge.
+ * 
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <asm/io.h>
+#include <linux/pci_ids.h>
+#include <asm/types.h>
+#include <asm/mmzone.h>
+#include <asm/proto.h>
+#include <asm/e820.h>
+#include <asm/pci-direct.h>
+#include <asm/numa.h>
+
+static __init int find_northbridge(void)
+{
+       int num; 
+
+       for (num = 0; num < 32; num++) { 
+               u32 header;
+               
+               header = read_pci_config(0, num, 0, 0x00);  
+               if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)))
+                       continue;       
+
+               header = read_pci_config(0, num, 1, 0x00); 
+               if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)))
+                       continue;       
+               return num; 
+       } 
+
+       return -1;      
+}
+
+int __init k8_scan_nodes(unsigned long start, unsigned long end)
+{ 
+       unsigned long prevbase;
+       struct bootnode nodes[8];
+       int nodeid, i, j, nb;
+       unsigned char nodeids[8];
+       int found = 0;
+       u32 reg;
+       unsigned numnodes;
+       unsigned num_cores;
+
+       if (!early_pci_allowed())
+               return -1;
+
+       nb = find_northbridge(); 
+       if (nb < 0) 
+               return nb;
+
+       printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 
+
+       num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
+       printk(KERN_INFO "CPU has %d num_cores\n", num_cores);
+
+       reg = read_pci_config(0, nb, 0, 0x60); 
+       numnodes = ((reg >> 4) & 0xF) + 1;
+       if (numnodes <= 1)
+               return -1;
+
+       printk(KERN_INFO "Number of nodes %d\n", numnodes);
+
+       memset(&nodes,0,sizeof(nodes)); 
+       prevbase = 0;
+       for (i = 0; i < 8; i++) { 
+               unsigned long base,limit; 
+               u32 nodeid;
+               
+               base = read_pci_config(0, nb, 1, 0x40 + i*8);
+               limit = read_pci_config(0, nb, 1, 0x44 + i*8);
+
+               nodeid = limit & 7; 
+               nodeids[i] = nodeid;
+               if ((base & 3) == 0) { 
+                       if (i < numnodes)
+                               printk("Skipping disabled node %d\n", i); 
+                       continue;
+               } 
+               if (nodeid >= numnodes) {
+                       printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+                              base, limit); 
+                       continue;
+               } 
+
+               if (!limit) { 
+                       printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i,
+                              base);
+                       continue;
+               }
+               if ((base >> 8) & 3 || (limit >> 8) & 3) {
+                       printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 
+                              nodeid, (base>>8)&3, (limit>>8) & 3); 
+                       return -1; 
+               }       
+               if (node_isset(nodeid, node_possible_map)) {
+                       printk(KERN_INFO "Node %d already present. Skipping\n", 
+                              nodeid);
+                       continue;
+               }
+
+               limit >>= 16; 
+               limit <<= 24; 
+               limit |= (1<<24)-1;
+               limit++;
+
+               if (limit > end_pfn << PAGE_SHIFT)
+                       limit = end_pfn << PAGE_SHIFT;
+               if (limit <= base)
+                       continue; 
+                       
+               base >>= 16;
+               base <<= 24; 
+
+               if (base < start) 
+                       base = start; 
+               if (limit > end) 
+                       limit = end; 
+               if (limit == base) { 
+                       printk(KERN_ERR "Empty node %d\n", nodeid); 
+                       continue; 
+               }
+               if (limit < base) { 
+                       printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
+                              nodeid, base, limit);                           
+                       continue;
+               } 
+               
+               /* Could sort here, but pun for now. Should not happen anyroads. */
+               if (prevbase > base) { 
+                       printk(KERN_ERR "Node map not sorted %lx,%lx\n",
+                              prevbase,base);
+                       return -1;
+               }
+                       
+               printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 
+                      nodeid, base, limit); 
+               
+               found++;
+               
+               nodes[nodeid].start = base; 
+               nodes[nodeid].end = limit;
+               e820_register_active_regions(nodeid,
+                               nodes[nodeid].start >> PAGE_SHIFT,
+                               nodes[nodeid].end >> PAGE_SHIFT);
+
+               prevbase = base;
+
+               node_set(nodeid, node_possible_map);
+       } 
+
+       if (!found)
+               return -1; 
+
+       memnode_shift = compute_hash_shift(nodes, 8);
+       if (memnode_shift < 0) { 
+               printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 
+               return -1; 
+       } 
+       printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 
+
+       for (i = 0; i < 8; i++) {
+               if (nodes[i].start != nodes[i].end) { 
+                       nodeid = nodeids[i];
+                       for (j = 0; j < num_cores; j++)
+                               apicid_to_node[(nodeid * num_cores) + j] = i;
+                       setup_node_bootmem(i, nodes[i].start, nodes[i].end); 
+               } 
+       }
+
+       numa_init_array();
+       return 0;
+} 
diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c

new file mode 100644 (file)

index 0000000..80bba0d
--- /dev/null
+++ b/arch/x86/mm/mmap_64.c
@@ -0,0 +1,29 @@
+/* Copyright 2005 Andi Kleen, SuSE Labs.
+ * Licensed under GPL, v.2
+ */
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/random.h>
+#include <asm/ia32.h>
+
+/* Notebook: move the mmap code from sys_x86_64.c over here. */
+
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+#ifdef CONFIG_IA32_EMULATION
+       if (current_thread_info()->flags & _TIF_IA32)
+               return ia32_pick_mmap_layout(mm);
+#endif
+       mm->mmap_base = TASK_UNMAPPED_BASE;
+       if (current->flags & PF_RANDOMIZE) {
+               /* Add 28bit randomness which is about 40bits of address space
+                  because mmap base has to be page aligned.
+                  or ~1/128 of the total user VM
+                  (total user address space is 47bits) */
+               unsigned rnd = get_random_int() & 0xfffffff;
+               mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
+       }
+       mm->get_unmapped_area = arch_get_unmapped_area;
+       mm->unmap_area = arch_unmap_area;
+}
+
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c

new file mode 100644 (file)

index 0000000..6da2355
--- /dev/null
+++ b/arch/x86/mm/numa_64.c
@@ -0,0 +1,648 @@
+/* 
+ * Generic VM initialization for x86-64 NUMA setups.
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */ 
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/numa.h>
+#include <asm/acpi.h>
+
+#ifndef Dprintk
+#define Dprintk(x...)
+#endif
+
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
+bootmem_data_t plat_node_bdata[MAX_NUMNODES];
+
+struct memnode memnode;
+
+unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
+       [0 ... NR_CPUS-1] = NUMA_NO_NODE
+};
+unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+       [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
+
+int numa_off __initdata;
+unsigned long __initdata nodemap_addr;
+unsigned long __initdata nodemap_size;
+
+
+/*
+ * Given a shift value, try to populate memnodemap[]
+ * Returns :
+ * 1 if OK
+ * 0 if memnodmap[] too small (of shift too small)
+ * -1 if node overlap or lost ram (shift too big)
+ */
+static int __init
+populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
+{
+       int i; 
+       int res = -1;
+       unsigned long addr, end;
+
+       memset(memnodemap, 0xff, memnodemapsize);
+       for (i = 0; i < numnodes; i++) {
+               addr = nodes[i].start;
+               end = nodes[i].end;
+               if (addr >= end)
+                       continue;
+               if ((end >> shift) >= memnodemapsize)
+                       return 0;
+               do {
+                       if (memnodemap[addr >> shift] != 0xff)
+                               return -1;
+                       memnodemap[addr >> shift] = i;
+                       addr += (1UL << shift);
+               } while (addr < end);
+               res = 1;
+       } 
+       return res;
+}
+
+static int __init allocate_cachealigned_memnodemap(void)
+{
+       unsigned long pad, pad_addr;
+
+       memnodemap = memnode.embedded_map;
+       if (memnodemapsize <= 48)
+               return 0;
+
+       pad = L1_CACHE_BYTES - 1;
+       pad_addr = 0x8000;
+       nodemap_size = pad + memnodemapsize;
+       nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
+                                     nodemap_size);
+       if (nodemap_addr == -1UL) {
+               printk(KERN_ERR
+                      "NUMA: Unable to allocate Memory to Node hash map\n");
+               nodemap_addr = nodemap_size = 0;
+               return -1;
+       }
+       pad_addr = (nodemap_addr + pad) & ~pad;
+       memnodemap = phys_to_virt(pad_addr);
+
+       printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+              nodemap_addr, nodemap_addr + nodemap_size);
+       return 0;
+}
+
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static int __init
+extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
+{
+       int i, nodes_used = 0;
+       unsigned long start, end;
+       unsigned long bitfield = 0, memtop = 0;
+
+       for (i = 0; i < numnodes; i++) {
+               start = nodes[i].start;
+               end = nodes[i].end;
+               if (start >= end)
+                       continue;
+               bitfield |= start;
+               nodes_used++;
+               if (end > memtop)
+                       memtop = end;
+       }
+       if (nodes_used <= 1)
+               i = 63;
+       else
+               i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+       memnodemapsize = (memtop >> i)+1;
+       return i;
+}
+
+int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+{
+       int shift;
+
+       shift = extract_lsb_from_nodes(nodes, numnodes);
+       if (allocate_cachealigned_memnodemap())
+               return -1;
+       printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
+               shift);
+
+       if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+               printk(KERN_INFO
+       "Your memory is not aligned you need to rebuild your kernel "
+       "with a bigger NODEMAPSIZE shift=%d\n",
+                       shift);
+               return -1;
+       }
+       return shift;
+}
+
+#ifdef CONFIG_SPARSEMEM
+int early_pfn_to_nid(unsigned long pfn)
+{
+       return phys_to_nid(pfn << PAGE_SHIFT);
+}
+#endif
+
+static void * __init
+early_node_mem(int nodeid, unsigned long start, unsigned long end,
+             unsigned long size)
+{
+       unsigned long mem = find_e820_area(start, end, size);
+       void *ptr;
+       if (mem != -1L)
+               return __va(mem);
+       ptr = __alloc_bootmem_nopanic(size,
+                               SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
+       if (ptr == 0) {
+               printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
+                       size, nodeid);
+               return NULL;
+       }
+       return ptr;
+}
+
+/* Initialize bootmem allocator for a node */
+void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
+{ 
+       unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 
+       unsigned long nodedata_phys;
+       void *bootmap;
+       const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
+
+       start = round_up(start, ZONE_ALIGN); 
+
+       printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
+
+       start_pfn = start >> PAGE_SHIFT;
+       end_pfn = end >> PAGE_SHIFT;
+
+       node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
+       if (node_data[nodeid] == NULL)
+               return;
+       nodedata_phys = __pa(node_data[nodeid]);
+
+       memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
+       NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
+       NODE_DATA(nodeid)->node_start_pfn = start_pfn;
+       NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
+
+       /* Find a place for the bootmem map */
+       bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 
+       bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+       bootmap = early_node_mem(nodeid, bootmap_start, end,
+                                       bootmap_pages<<PAGE_SHIFT);
+       if (bootmap == NULL)  {
+               if (nodedata_phys < start || nodedata_phys >= end)
+                       free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
+               node_data[nodeid] = NULL;
+               return;
+       }
+       bootmap_start = __pa(bootmap);
+       Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 
+       
+       bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+                                        bootmap_start >> PAGE_SHIFT, 
+                                        start_pfn, end_pfn); 
+
+       free_bootmem_with_active_regions(nodeid, end);
+
+       reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
+       reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+#ifdef CONFIG_ACPI_NUMA
+       srat_reserve_add_area(nodeid);
+#endif
+       node_set_online(nodeid);
+} 
+
+/* Initialize final allocator for a zone */
+void __init setup_node_zones(int nodeid)
+{ 
+       unsigned long start_pfn, end_pfn, memmapsize, limit;
+
+       start_pfn = node_start_pfn(nodeid);
+       end_pfn = node_end_pfn(nodeid);
+
+       Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
+               nodeid, start_pfn, end_pfn);
+
+       /* Try to allocate mem_map at end to not fill up precious <4GB
+          memory. */
+       memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
+       limit = end_pfn << PAGE_SHIFT;
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+       NODE_DATA(nodeid)->node_mem_map = 
+               __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
+                               memmapsize, SMP_CACHE_BYTES, 
+                               round_down(limit - memmapsize, PAGE_SIZE), 
+                               limit);
+#endif
+} 
+
+void __init numa_init_array(void)
+{
+       int rr, i;
+       /* There are unfortunately some poorly designed mainboards around
+          that only connect memory to a single CPU. This breaks the 1:1 cpu->node
+          mapping. To avoid this fill in the mapping for all possible
+          CPUs, as the number of CPUs is not known yet. 
+          We round robin the existing nodes. */
+       rr = first_node(node_online_map);
+       for (i = 0; i < NR_CPUS; i++) {
+               if (cpu_to_node[i] != NUMA_NO_NODE)
+                       continue;
+               numa_set_node(i, rr);
+               rr = next_node(rr, node_online_map);
+               if (rr == MAX_NUMNODES)
+                       rr = first_node(node_online_map);
+       }
+
+}
+
+#ifdef CONFIG_NUMA_EMU
+/* Numa emulation */
+char *cmdline __initdata;
+
+/*
+ * Setups up nid to range from addr to addr + size.  If the end boundary is
+ * greater than max_addr, then max_addr is used instead.  The return value is 0
+ * if there is additional memory left for allocation past addr and -1 otherwise.
+ * addr is adjusted to be at the end of the node.
+ */
+static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
+                                  u64 size, u64 max_addr)
+{
+       int ret = 0;
+       nodes[nid].start = *addr;
+       *addr += size;
+       if (*addr >= max_addr) {
+               *addr = max_addr;
+               ret = -1;
+       }
+       nodes[nid].end = *addr;
+       node_set(nid, node_possible_map);
+       printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
+              nodes[nid].start, nodes[nid].end,
+              (nodes[nid].end - nodes[nid].start) >> 20);
+       return ret;
+}
+
+/*
+ * Splits num_nodes nodes up equally starting at node_start.  The return value
+ * is the number of nodes split up and addr is adjusted to be at the end of the
+ * last node allocated.
+ */
+static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
+                                     u64 max_addr, int node_start,
+                                     int num_nodes)
+{
+       unsigned int big;
+       u64 size;
+       int i;
+
+       if (num_nodes <= 0)
+               return -1;
+       if (num_nodes > MAX_NUMNODES)
+               num_nodes = MAX_NUMNODES;
+       size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
+              num_nodes;
+       /*
+        * Calculate the number of big nodes that can be allocated as a result
+        * of consolidating the leftovers.
+        */
+       big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
+             FAKE_NODE_MIN_SIZE;
+
+       /* Round down to nearest FAKE_NODE_MIN_SIZE. */
+       size &= FAKE_NODE_MIN_HASH_MASK;
+       if (!size) {
+               printk(KERN_ERR "Not enough memory for each node.  "
+                      "NUMA emulation disabled.\n");
+               return -1;
+       }
+
+       for (i = node_start; i < num_nodes + node_start; i++) {
+               u64 end = *addr + size;
+               if (i < big)
+                       end += FAKE_NODE_MIN_SIZE;
+               /*
+                * The final node can have the remaining system RAM.  Other
+                * nodes receive roughly the same amount of available pages.
+                */
+               if (i == num_nodes + node_start - 1)
+                       end = max_addr;
+               else
+                       while (end - *addr - e820_hole_size(*addr, end) <
+                              size) {
+                               end += FAKE_NODE_MIN_SIZE;
+                               if (end > max_addr) {
+                                       end = max_addr;
+                                       break;
+                               }
+                       }
+               if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
+                       break;
+       }
+       return i - node_start + 1;
+}
+
+/*
+ * Splits the remaining system RAM into chunks of size.  The remaining memory is
+ * always assigned to a final node and can be asymmetric.  Returns the number of
+ * nodes split.
+ */
+static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
+                                     u64 max_addr, int node_start, u64 size)
+{
+       int i = node_start;
+       size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
+       while (!setup_node_range(i++, nodes, addr, size, max_addr))
+               ;
+       return i - node_start;
+}
+
+/*
+ * Sets up the system RAM area from start_pfn to end_pfn according to the
+ * numa=fake command-line option.
+ */
+static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+{
+       struct bootnode nodes[MAX_NUMNODES];
+       u64 addr = start_pfn << PAGE_SHIFT;
+       u64 max_addr = end_pfn << PAGE_SHIFT;
+       int num_nodes = 0;
+       int coeff_flag;
+       int coeff = -1;
+       int num = 0;
+       u64 size;
+       int i;
+
+       memset(&nodes, 0, sizeof(nodes));
+       /*
+        * If the numa=fake command-line is just a single number N, split the
+        * system RAM into N fake nodes.
+        */
+       if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
+               num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
+                                               simple_strtol(cmdline, NULL, 0));
+               if (num_nodes < 0)
+                       return num_nodes;
+               goto out;
+       }
+
+       /* Parse the command line. */
+       for (coeff_flag = 0; ; cmdline++) {
+               if (*cmdline && isdigit(*cmdline)) {
+                       num = num * 10 + *cmdline - '0';
+                       continue;
+               }
+               if (*cmdline == '*') {
+                       if (num > 0)
+                               coeff = num;
+                       coeff_flag = 1;
+               }
+               if (!*cmdline || *cmdline == ',') {
+                       if (!coeff_flag)
+                               coeff = 1;
+                       /*
+                        * Round down to the nearest FAKE_NODE_MIN_SIZE.
+                        * Command-line coefficients are in megabytes.
+                        */
+                       size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
+                       if (size)
+                               for (i = 0; i < coeff; i++, num_nodes++)
+                                       if (setup_node_range(num_nodes, nodes,
+                                               &addr, size, max_addr) < 0)
+                                               goto done;
+                       if (!*cmdline)
+                               break;
+                       coeff_flag = 0;
+                       coeff = -1;
+               }
+               num = 0;
+       }
+done:
+       if (!num_nodes)
+               return -1;
+       /* Fill remainder of system RAM, if appropriate. */
+       if (addr < max_addr) {
+               if (coeff_flag && coeff < 0) {
+                       /* Split remaining nodes into num-sized chunks */
+                       num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
+                                                        num_nodes, num);
+                       goto out;
+               }
+               switch (*(cmdline - 1)) {
+               case '*':
+                       /* Split remaining nodes into coeff chunks */
+                       if (coeff <= 0)
+                               break;
+                       num_nodes += split_nodes_equally(nodes, &addr, max_addr,
+                                                        num_nodes, coeff);
+                       break;
+               case ',':
+                       /* Do not allocate remaining system RAM */
+                       break;
+               default:
+                       /* Give one final node */
+                       setup_node_range(num_nodes, nodes, &addr,
+                                        max_addr - addr, max_addr);
+                       num_nodes++;
+               }
+       }
+out:
+       memnode_shift = compute_hash_shift(nodes, num_nodes);
+       if (memnode_shift < 0) {
+               memnode_shift = 0;
+               printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
+                      "disabled.\n");
+               return -1;
+       }
+
+       /*
+        * We need to vacate all active ranges that may have been registered by
+        * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
+        * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
+        */
+       remove_all_active_ranges();
+#ifdef CONFIG_ACPI_NUMA
+       acpi_numa = -1;
+#endif
+       for_each_node_mask(i, node_possible_map) {
+               e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+                                               nodes[i].end >> PAGE_SHIFT);
+               setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+       }
+       acpi_fake_nodes(nodes, num_nodes);
+       numa_init_array();
+       return 0;
+}
+#endif /* CONFIG_NUMA_EMU */
+
+void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{ 
+       int i;
+
+       nodes_clear(node_possible_map);
+
+#ifdef CONFIG_NUMA_EMU
+       if (cmdline && !numa_emulation(start_pfn, end_pfn))
+               return;
+       nodes_clear(node_possible_map);
+#endif
+
+#ifdef CONFIG_ACPI_NUMA
+       if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
+                                         end_pfn << PAGE_SHIFT))
+               return;
+       nodes_clear(node_possible_map);
+#endif
+
+#ifdef CONFIG_K8_NUMA
+       if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
+               return;
+       nodes_clear(node_possible_map);
+#endif
+       printk(KERN_INFO "%s\n",
+              numa_off ? "NUMA turned off" : "No NUMA configuration found");
+
+       printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 
+              start_pfn << PAGE_SHIFT,
+              end_pfn << PAGE_SHIFT); 
+               /* setup dummy node covering all memory */ 
+       memnode_shift = 63; 
+       memnodemap = memnode.embedded_map;
+       memnodemap[0] = 0;
+       nodes_clear(node_online_map);
+       node_set_online(0);
+       node_set(0, node_possible_map);
+       for (i = 0; i < NR_CPUS; i++)
+               numa_set_node(i, 0);
+       node_to_cpumask[0] = cpumask_of_cpu(0);
+       e820_register_active_regions(0, start_pfn, end_pfn);
+       setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
+}
+
+__cpuinit void numa_add_cpu(int cpu)
+{
+       set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+} 
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+       cpu_pda(cpu)->nodenumber = node;
+       cpu_to_node[cpu] = node;
+}
+
+unsigned long __init numa_free_all_bootmem(void) 
+{ 
+       int i;
+       unsigned long pages = 0;
+       for_each_online_node(i) {
+               pages += free_all_bootmem_node(NODE_DATA(i));
+       }
+       return pages;
+} 
+
+void __init paging_init(void)
+{ 
+       int i;
+       unsigned long max_zone_pfns[MAX_NR_ZONES];
+       memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+       max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+       max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+       max_zone_pfns[ZONE_NORMAL] = end_pfn;
+
+       sparse_memory_present_with_active_regions(MAX_NUMNODES);
+       sparse_init();
+
+       for_each_online_node(i) {
+               setup_node_zones(i); 
+       }
+
+       free_area_init_nodes(max_zone_pfns);
+} 
+
+static __init int numa_setup(char *opt)
+{ 
+       if (!opt)
+               return -EINVAL;
+       if (!strncmp(opt,"off",3))
+               numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+       if (!strncmp(opt, "fake=", 5))
+               cmdline = opt + 5;
+#endif
+#ifdef CONFIG_ACPI_NUMA
+       if (!strncmp(opt,"noacpi",6))
+               acpi_numa = -1;
+       if (!strncmp(opt,"hotadd=", 7))
+               hotadd_percent = simple_strtoul(opt+7, NULL, 10);
+#endif
+       return 0;
+} 
+
+early_param("numa", numa_setup);
+
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ */
+void __init init_cpu_to_node(void)
+{
+       int i;
+       for (i = 0; i < NR_CPUS; i++) {
+               u8 apicid = x86_cpu_to_apicid[i];
+               if (apicid == BAD_APICID)
+                       continue;
+               if (apicid_to_node[apicid] == NUMA_NO_NODE)
+                       continue;
+               numa_set_node(i,apicid_to_node[apicid]);
+       }
+}
+
+EXPORT_SYMBOL(cpu_to_node);
+EXPORT_SYMBOL(node_to_cpumask);
+EXPORT_SYMBOL(memnode);
+EXPORT_SYMBOL(node_data);
+
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * Functions to convert PFNs from/to per node page addresses.
+ * These are out of line because they are quite big.
+ * They could be all tuned by pre caching more state.
+ * Should do that.
+ */
+
+int pfn_valid(unsigned long pfn)
+{
+       unsigned nid;
+       if (pfn >= num_physpages)
+               return 0;
+       nid = pfn_to_nid(pfn);
+       if (nid == 0xff)
+               return 0;
+       return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
+}
+EXPORT_SYMBOL(pfn_valid);
+#endif
diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c

new file mode 100644 (file)

index 0000000..10b9809
--- /dev/null
+++ b/arch/x86/mm/pageattr_64.c
@@ -0,0 +1,249 @@
+/* 
+ * Copyright 2002 Andi Kleen, SuSE Labs. 
+ * Thanks to Ben LaHaise for precious feedback.
+ */ 
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+pte_t *lookup_address(unsigned long address)
+{ 
+       pgd_t *pgd = pgd_offset_k(address);
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       if (pgd_none(*pgd))
+               return NULL;
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return NULL; 
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return NULL; 
+       if (pmd_large(*pmd))
+               return (pte_t *)pmd;
+       pte = pte_offset_kernel(pmd, address);
+       if (pte && !pte_present(*pte))
+               pte = NULL; 
+       return pte;
+} 
+
+static struct page *split_large_page(unsigned long address, pgprot_t prot,
+                                    pgprot_t ref_prot)
+{ 
+       int i; 
+       unsigned long addr;
+       struct page *base = alloc_pages(GFP_KERNEL, 0);
+       pte_t *pbase;
+       if (!base) 
+               return NULL;
+       /*
+        * page_private is used to track the number of entries in
+        * the page table page have non standard attributes.
+        */
+       SetPagePrivate(base);
+       page_private(base) = 0;
+
+       address = __pa(address);
+       addr = address & LARGE_PAGE_MASK; 
+       pbase = (pte_t *)page_address(base);
+       for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
+               pbase[i] = pfn_pte(addr >> PAGE_SHIFT, 
+                                  addr == address ? prot : ref_prot);
+       }
+       return base;
+} 
+
+static void cache_flush_page(void *adr)
+{
+       int i;
+       for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
+               asm volatile("clflush (%0)" :: "r" (adr + i));
+}
+
+static void flush_kernel_map(void *arg)
+{
+       struct list_head *l = (struct list_head *)arg;
+       struct page *pg;
+
+       /* When clflush is available always use it because it is
+          much cheaper than WBINVD. */
+       /* clflush is still broken. Disable for now. */
+       if (1 || !cpu_has_clflush)
+               asm volatile("wbinvd" ::: "memory");
+       else list_for_each_entry(pg, l, lru) {
+               void *adr = page_address(pg);
+               cache_flush_page(adr);
+       }
+       __flush_tlb_all();
+}
+
+static inline void flush_map(struct list_head *l)
+{      
+       on_each_cpu(flush_kernel_map, l, 1, 1);
+}
+
+static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
+
+static inline void save_page(struct page *fpage)
+{
+       if (!test_and_set_bit(PG_arch_1, &fpage->flags))
+               list_add(&fpage->lru, &deferred_pages);
+}
+
+/* 
+ * No more special protections in this 2/4MB area - revert to a
+ * large page again. 
+ */
+static void revert_page(unsigned long address, pgprot_t ref_prot)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t large_pte;
+       unsigned long pfn;
+
+       pgd = pgd_offset_k(address);
+       BUG_ON(pgd_none(*pgd));
+       pud = pud_offset(pgd,address);
+       BUG_ON(pud_none(*pud));
+       pmd = pmd_offset(pud, address);
+       BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
+       pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
+       large_pte = pfn_pte(pfn, ref_prot);
+       large_pte = pte_mkhuge(large_pte);
+       set_pte((pte_t *)pmd, large_pte);
+}      
+
+static int
+__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
+                                  pgprot_t ref_prot)
+{ 
+       pte_t *kpte; 
+       struct page *kpte_page;
+       pgprot_t ref_prot2;
+
+       kpte = lookup_address(address);
+       if (!kpte) return 0;
+       kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
+       BUG_ON(PageLRU(kpte_page));
+       BUG_ON(PageCompound(kpte_page));
+       if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
+               if (!pte_huge(*kpte)) {
+                       set_pte(kpte, pfn_pte(pfn, prot));
+               } else {
+                       /*
+                        * split_large_page will take the reference for this
+                        * change_page_attr on the split page.
+                        */
+                       struct page *split;
+                       ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
+                       split = split_large_page(address, prot, ref_prot2);
+                       if (!split)
+                               return -ENOMEM;
+                       set_pte(kpte, mk_pte(split, ref_prot2));
+                       kpte_page = split;
+               }
+               page_private(kpte_page)++;
+       } else if (!pte_huge(*kpte)) {
+               set_pte(kpte, pfn_pte(pfn, ref_prot));
+               BUG_ON(page_private(kpte_page) == 0);
+               page_private(kpte_page)--;
+       } else
+               BUG();
+
+       /* on x86-64 the direct mapping set at boot is not using 4k pages */
+       BUG_ON(PageReserved(kpte_page));
+
+       save_page(kpte_page);
+       if (page_private(kpte_page) == 0)
+               revert_page(address, ref_prot);
+       return 0;
+} 
+
+/*
+ * Change the page attributes of an page in the linear mapping.
+ *
+ * This should be used when a page is mapped with a different caching policy
+ * than write-back somewhere - some CPUs do not like it when mappings with
+ * different caching policies exist. This changes the page attributes of the
+ * in kernel linear mapping too.
+ * 
+ * The caller needs to ensure that there are no conflicting mappings elsewhere.
+ * This function only deals with the kernel linear map.
+ * 
+ * Caller must call global_flush_tlb() after this.
+ */
+int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
+{
+       int err = 0, kernel_map = 0;
+       int i; 
+
+       if (address >= __START_KERNEL_map
+           && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
+               address = (unsigned long)__va(__pa(address));
+               kernel_map = 1;
+       }
+
+       down_write(&init_mm.mmap_sem);
+       for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
+               unsigned long pfn = __pa(address) >> PAGE_SHIFT;
+
+               if (!kernel_map || pte_present(pfn_pte(0, prot))) {
+                       err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
+                       if (err)
+                               break;
+               }
+               /* Handle kernel mapping too which aliases part of the
+                * lowmem */
+               if (__pa(address) < KERNEL_TEXT_SIZE) {
+                       unsigned long addr2;
+                       pgprot_t prot2;
+                       addr2 = __START_KERNEL_map + __pa(address);
+                       /* Make sure the kernel mappings stay executable */
+                       prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
+                       err = __change_page_attr(addr2, pfn, prot2,
+                                                PAGE_KERNEL_EXEC);
+               } 
+       }       
+       up_write(&init_mm.mmap_sem); 
+       return err;
+}
+
+/* Don't call this for MMIO areas that may not have a mem_map entry */
+int change_page_attr(struct page *page, int numpages, pgprot_t prot)
+{
+       unsigned long addr = (unsigned long)page_address(page);
+       return change_page_attr_addr(addr, numpages, prot);
+}
+
+void global_flush_tlb(void)
+{ 
+       struct page *pg, *next;
+       struct list_head l;
+
+       down_read(&init_mm.mmap_sem);
+       list_replace_init(&deferred_pages, &l);
+       up_read(&init_mm.mmap_sem);
+
+       flush_map(&l);
+
+       list_for_each_entry_safe(pg, next, &l, lru) {
+               list_del(&pg->lru);
+               clear_bit(PG_arch_1, &pg->flags);
+               if (page_private(pg) != 0)
+                       continue;
+               ClearPagePrivate(pg);
+               __free_page(pg);
+       } 
+} 
+
+EXPORT_SYMBOL(change_page_attr);
+EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c

new file mode 100644 (file)

index 0000000..acdf03e
--- /dev/null
+++ b/arch/x86/mm/srat_64.c
@@ -0,0 +1,566 @@
+/*
+ * ACPI 3.0 based NUMA setup
+ * Copyright 2004 Andi Kleen, SuSE Labs.
+ *
+ * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
+ *
+ * Called from acpi_numa_init while reading the SRAT and SLIT tables.
+ * Assumes all memory regions belonging to a single proximity domain
+ * are in one chunk. Holes between them will be included in the node.
+ */
+
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/mmzone.h>
+#include <linux/bitmap.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <asm/proto.h>
+#include <asm/numa.h>
+#include <asm/e820.h>
+
+int acpi_numa __initdata;
+
+static struct acpi_table_slit *acpi_slit;
+
+static nodemask_t nodes_parsed __initdata;
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode nodes_add[MAX_NUMNODES];
+static int found_add_area __initdata;
+int hotadd_percent __initdata = 0;
+
+/* Too small nodes confuse the VM badly. Usually they result
+   from BIOS bugs. */
+#define NODE_MIN_SIZE (4*1024*1024)
+
+static __init int setup_node(int pxm)
+{
+       return acpi_map_pxm_to_node(pxm);
+}
+
+static __init int conflicting_nodes(unsigned long start, unsigned long end)
+{
+       int i;
+       for_each_node_mask(i, nodes_parsed) {
+               struct bootnode *nd = &nodes[i];
+               if (nd->start == nd->end)
+                       continue;
+               if (nd->end > start && nd->start < end)
+                       return i;
+               if (nd->end == end && nd->start == start)
+                       return i;
+       }
+       return -1;
+}
+
+static __init void cutoff_node(int i, unsigned long start, unsigned long end)
+{
+       struct bootnode *nd = &nodes[i];
+
+       if (found_add_area)
+               return;
+
+       if (nd->start < start) {
+               nd->start = start;
+               if (nd->end < nd->start)
+                       nd->start = nd->end;
+       }
+       if (nd->end > end) {
+               nd->end = end;
+               if (nd->start > nd->end)
+                       nd->start = nd->end;
+       }
+}
+
+static __init void bad_srat(void)
+{
+       int i;
+       printk(KERN_ERR "SRAT: SRAT not used.\n");
+       acpi_numa = -1;
+       found_add_area = 0;
+       for (i = 0; i < MAX_LOCAL_APIC; i++)
+               apicid_to_node[i] = NUMA_NO_NODE;
+       for (i = 0; i < MAX_NUMNODES; i++)
+               nodes_add[i].start = nodes[i].end = 0;
+       remove_all_active_ranges();
+}
+
+static __init inline int srat_disabled(void)
+{
+       return numa_off || acpi_numa < 0;
+}
+
+/*
+ * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
+ * up the NUMA heuristics which wants the local node to have a smaller
+ * distance than the others.
+ * Do some quick checks here and only use the SLIT if it passes.
+ */
+static __init int slit_valid(struct acpi_table_slit *slit)
+{
+       int i, j;
+       int d = slit->locality_count;
+       for (i = 0; i < d; i++) {
+               for (j = 0; j < d; j++)  {
+                       u8 val = slit->entry[d*i + j];
+                       if (i == j) {
+                               if (val != LOCAL_DISTANCE)
+                                       return 0;
+                       } else if (val <= LOCAL_DISTANCE)
+                               return 0;
+               }
+       }
+       return 1;
+}
+
+/* Callback for SLIT parsing */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+       if (!slit_valid(slit)) {
+               printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
+               return;
+       }
+       acpi_slit = slit;
+}
+
+/* Callback for Proximity Domain -> LAPIC mapping */
+void __init
+acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
+{
+       int pxm, node;
+       if (srat_disabled())
+               return;
+       if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
+               bad_srat();
+               return;
+       }
+       if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+               return;
+       pxm = pa->proximity_domain_lo;
+       node = setup_node(pxm);
+       if (node < 0) {
+               printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+               bad_srat();
+               return;
+       }
+       apicid_to_node[pa->apic_id] = node;
+       acpi_numa = 1;
+       printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+              pxm, pa->apic_id, node);
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+/*
+ * Protect against too large hotadd areas that would fill up memory.
+ */
+static int hotadd_enough_memory(struct bootnode *nd)
+{
+       static unsigned long allocated;
+       static unsigned long last_area_end;
+       unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
+       long mem = pages * sizeof(struct page);
+       unsigned long addr;
+       unsigned long allowed;
+       unsigned long oldpages = pages;
+
+       if (mem < 0)
+               return 0;
+       allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
+       allowed = (allowed / 100) * hotadd_percent;
+       if (allocated + mem > allowed) {
+               unsigned long range;
+               /* Give them at least part of their hotadd memory upto hotadd_percent
+                  It would be better to spread the limit out
+                  over multiple hotplug areas, but that is too complicated
+                  right now */
+               if (allocated >= allowed)
+                       return 0;
+               range = allowed - allocated;
+               pages = (range / PAGE_SIZE);
+               mem = pages * sizeof(struct page);
+               nd->end = nd->start + range;
+       }
+       /* Not completely fool proof, but a good sanity check */
+       addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
+       if (addr == -1UL)
+               return 0;
+       if (pages != oldpages)
+               printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
+                       pages << PAGE_SHIFT);
+       last_area_end = addr + mem;
+       allocated += mem;
+       return 1;
+}
+
+static int update_end_of_memory(unsigned long end)
+{
+       found_add_area = 1;
+       if ((end >> PAGE_SHIFT) > end_pfn)
+               end_pfn = end >> PAGE_SHIFT;
+       return 1;
+}
+
+static inline int save_add_info(void)
+{
+       return hotadd_percent > 0;
+}
+#else
+int update_end_of_memory(unsigned long end) {return -1;}
+static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+static inline int save_add_info(void) {return 1;}
+#else
+static inline int save_add_info(void) {return 0;}
+#endif
+#endif
+/*
+ * Update nodes_add and decide if to include add are in the zone.
+ * Both SPARSE and RESERVE need nodes_add infomation.
+ * This code supports one contigious hot add area per node.
+ */
+static int reserve_hotadd(int node, unsigned long start, unsigned long end)
+{
+       unsigned long s_pfn = start >> PAGE_SHIFT;
+       unsigned long e_pfn = end >> PAGE_SHIFT;
+       int ret = 0, changed = 0;
+       struct bootnode *nd = &nodes_add[node];
+
+       /* I had some trouble with strange memory hotadd regions breaking
+          the boot. Be very strict here and reject anything unexpected.
+          If you want working memory hotadd write correct SRATs.
+
+          The node size check is a basic sanity check to guard against
+          mistakes */
+       if ((signed long)(end - start) < NODE_MIN_SIZE) {
+               printk(KERN_ERR "SRAT: Hotplug area too small\n");
+               return -1;
+       }
+
+       /* This check might be a bit too strict, but I'm keeping it for now. */
+       if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
+               printk(KERN_ERR
+                       "SRAT: Hotplug area %lu -> %lu has existing memory\n",
+                       s_pfn, e_pfn);
+               return -1;
+       }
+
+       if (!hotadd_enough_memory(&nodes_add[node]))  {
+               printk(KERN_ERR "SRAT: Hotplug area too large\n");
+               return -1;
+       }
+
+       /* Looks good */
+
+       if (nd->start == nd->end) {
+               nd->start = start;
+               nd->end = end;
+               changed = 1;
+       } else {
+               if (nd->start == end) {
+                       nd->start = start;
+                       changed = 1;
+               }
+               if (nd->end == start) {
+                       nd->end = end;
+                       changed = 1;
+               }
+               if (!changed)
+                       printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
+       }
+
+       ret = update_end_of_memory(nd->end);
+
+       if (changed)
+               printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
+       return ret;
+}
+
+/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
+void __init
+acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
+{
+       struct bootnode *nd, oldnode;
+       unsigned long start, end;
+       int node, pxm;
+       int i;
+
+       if (srat_disabled())
+               return;
+       if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
+               bad_srat();
+               return;
+       }
+       if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
+               return;
+
+       if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
+               return;
+       start = ma->base_address;
+       end = start + ma->length;
+       pxm = ma->proximity_domain;
+       node = setup_node(pxm);
+       if (node < 0) {
+               printk(KERN_ERR "SRAT: Too many proximity domains.\n");
+               bad_srat();
+               return;
+       }
+       i = conflicting_nodes(start, end);
+       if (i == node) {
+               printk(KERN_WARNING
+               "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
+                       pxm, start, end, nodes[i].start, nodes[i].end);
+       } else if (i >= 0) {
+               printk(KERN_ERR
+                      "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
+                      pxm, start, end, node_to_pxm(i),
+                       nodes[i].start, nodes[i].end);
+               bad_srat();
+               return;
+       }
+       nd = &nodes[node];
+       oldnode = *nd;
+       if (!node_test_and_set(node, nodes_parsed)) {
+               nd->start = start;
+               nd->end = end;
+       } else {
+               if (start < nd->start)
+                       nd->start = start;
+               if (nd->end < end)
+                       nd->end = end;
+       }
+
+       printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
+              nd->start, nd->end);
+       e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
+                                               nd->end >> PAGE_SHIFT);
+       push_node_boundaries(node, nd->start >> PAGE_SHIFT,
+                                               nd->end >> PAGE_SHIFT);
+
+       if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
+           (reserve_hotadd(node, start, end) < 0)) {
+               /* Ignore hotadd region. Undo damage */
+               printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
+               *nd = oldnode;
+               if ((nd->start | nd->end) == 0)
+                       node_clear(node, nodes_parsed);
+       }
+}
+
+/* Sanity check to catch more bad SRATs (they are amazingly common).
+   Make sure the PXMs cover all memory. */
+static int __init nodes_cover_memory(const struct bootnode *nodes)
+{
+       int i;
+       unsigned long pxmram, e820ram;
+
+       pxmram = 0;
+       for_each_node_mask(i, nodes_parsed) {
+               unsigned long s = nodes[i].start >> PAGE_SHIFT;
+               unsigned long e = nodes[i].end >> PAGE_SHIFT;
+               pxmram += e - s;
+               pxmram -= absent_pages_in_range(s, e);
+               if ((long)pxmram < 0)
+                       pxmram = 0;
+       }
+
+       e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
+       /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
+       if ((long)(e820ram - pxmram) >= 1*1024*1024) {
+               printk(KERN_ERR
+       "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
+                       (pxmram << PAGE_SHIFT) >> 20,
+                       (e820ram << PAGE_SHIFT) >> 20);
+               return 0;
+       }
+       return 1;
+}
+
+static void unparse_node(int node)
+{
+       int i;
+       node_clear(node, nodes_parsed);
+       for (i = 0; i < MAX_LOCAL_APIC; i++) {
+               if (apicid_to_node[i] == node)
+                       apicid_to_node[i] = NUMA_NO_NODE;
+       }
+}
+
+void __init acpi_numa_arch_fixup(void) {}
+
+/* Use the information discovered above to actually set up the nodes. */
+int __init acpi_scan_nodes(unsigned long start, unsigned long end)
+{
+       int i;
+
+       if (acpi_numa <= 0)
+               return -1;
+
+       /* First clean up the node list */
+       for (i = 0; i < MAX_NUMNODES; i++) {
+               cutoff_node(i, start, end);
+               if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
+                       unparse_node(i);
+                       node_set_offline(i);
+               }
+       }
+
+       if (!nodes_cover_memory(nodes)) {
+               bad_srat();
+               return -1;
+       }
+
+       memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
+       if (memnode_shift < 0) {
+               printk(KERN_ERR
+                    "SRAT: No NUMA node hash function found. Contact maintainer\n");
+               bad_srat();
+               return -1;
+       }
+
+       node_possible_map = nodes_parsed;
+
+       /* Finally register nodes */
+       for_each_node_mask(i, node_possible_map)
+               setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+       /* Try again in case setup_node_bootmem missed one due
+          to missing bootmem */
+       for_each_node_mask(i, node_possible_map)
+               if (!node_online(i))
+                       setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+
+       for (i = 0; i < NR_CPUS; i++) {
+               if (cpu_to_node[i] == NUMA_NO_NODE)
+                       continue;
+               if (!node_isset(cpu_to_node[i], node_possible_map))
+                       numa_set_node(i, NUMA_NO_NODE);
+       }
+       numa_init_array();
+       return 0;
+}
+
+#ifdef CONFIG_NUMA_EMU
+static int __init find_node_by_addr(unsigned long addr)
+{
+       int ret = NUMA_NO_NODE;
+       int i;
+
+       for_each_node_mask(i, nodes_parsed) {
+               /*
+                * Find the real node that this emulated node appears on.  For
+                * the sake of simplicity, we only use a real node's starting
+                * address to determine which emulated node it appears on.
+                */
+               if (addr >= nodes[i].start && addr < nodes[i].end) {
+                       ret = i;
+                       break;
+               }
+       }
+       return i;
+}
+
+/*
+ * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
+ * mappings that respect the real ACPI topology but reflect our emulated
+ * environment.  For each emulated node, we find which real node it appears on
+ * and create PXM to NID mappings for those fake nodes which mirror that
+ * locality.  SLIT will now represent the correct distances between emulated
+ * nodes as a result of the real topology.
+ */
+void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
+{
+       int i, j;
+       int fake_node_to_pxm_map[MAX_NUMNODES] = {
+               [0 ... MAX_NUMNODES-1] = PXM_INVAL
+       };
+       unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
+               [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+       };
+
+       printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
+                        "topology.\n");
+       for (i = 0; i < num_nodes; i++) {
+               int nid, pxm;
+
+               nid = find_node_by_addr(fake_nodes[i].start);
+               if (nid == NUMA_NO_NODE)
+                       continue;
+               pxm = node_to_pxm(nid);
+               if (pxm == PXM_INVAL)
+                       continue;
+               fake_node_to_pxm_map[i] = pxm;
+               /*
+                * For each apicid_to_node mapping that exists for this real
+                * node, it must now point to the fake node ID.
+                */
+               for (j = 0; j < MAX_LOCAL_APIC; j++)
+                       if (apicid_to_node[j] == nid)
+                               fake_apicid_to_node[j] = i;
+       }
+       for (i = 0; i < num_nodes; i++)
+               __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
+       memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+
+       nodes_clear(nodes_parsed);
+       for (i = 0; i < num_nodes; i++)
+               if (fake_nodes[i].start != fake_nodes[i].end)
+                       node_set(i, nodes_parsed);
+       WARN_ON(!nodes_cover_memory(fake_nodes));
+}
+
+static int null_slit_node_compare(int a, int b)
+{
+       return node_to_pxm(a) == node_to_pxm(b);
+}
+#else
+static int null_slit_node_compare(int a, int b)
+{
+       return a == b;
+}
+#endif /* CONFIG_NUMA_EMU */
+
+void __init srat_reserve_add_area(int nodeid)
+{
+       if (found_add_area && nodes_add[nodeid].end) {
+               u64 total_mb;
+
+               printk(KERN_INFO "SRAT: Reserving hot-add memory space "
+                               "for node %d at %Lx-%Lx\n",
+                       nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
+               total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
+                                       >> PAGE_SHIFT;
+               total_mb *= sizeof(struct page);
+               total_mb >>= 20;
+               printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
+                               "pre-allocated memory.\n", (unsigned long long)total_mb);
+               reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
+                              nodes_add[nodeid].end - nodes_add[nodeid].start);
+       }
+}
+
+int __node_distance(int a, int b)
+{
+       int index;
+
+       if (!acpi_slit)
+               return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
+                                                     REMOTE_DISTANCE;
+       index = acpi_slit->locality_count * node_to_pxm(a);
+       return acpi_slit->entry[index + node_to_pxm(b)];
+}
+
+EXPORT_SYMBOL(__node_distance);
+
+int memory_add_physaddr_to_nid(u64 start)
+{
+       int i, ret = 0;
+
+       for_each_node(i)
+               if (nodes_add[i].start <= start && nodes_add[i].end > start)
+                       ret = i;
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile

index 11ef2c37ba579dc21a57263b500734f50f088d77..4208a0ded52943b33c2b9ee337380513ecaeea32 100644 (file)
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -75,7 +75,7 @@ head-y := arch/x86_64/kernel/head_64.o arch/x86_64/kernel/head64.o arch/x86_64/k
  
  libs-y                                         += arch/x86/lib/
  core-y                                 += arch/x86_64/kernel/ \
-                                          arch/x86_64/mm/ \
+                                          arch/x86/mm/ \
                                            arch/x86/crypto/ \
                                            arch/x86/vdso/
  core-$(CONFIG_IA32_EMULATION)          += arch/x86_64/ia32/
diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile

deleted file mode 100644 (file)

index 7317648..0000000
--- a/arch/x86_64/mm/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/x86/mm/Makefile_32
-else
-include ${srctree}/arch/x86_64/mm/Makefile_64
-endif
diff --git a/arch/x86_64/mm/Makefile_64 b/arch/x86_64/mm/Makefile_64

deleted file mode 100644 (file)

index 5c2883c..0000000
--- a/arch/x86_64/mm/Makefile_64
+++ /dev/null
@@ -1,11 +0,0 @@
-#
-# Makefile for the linux x86_64-specific parts of the memory manager.
-#
-
-obj-y   := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_NUMA) += numa_64.o
-obj-$(CONFIG_K8_NUMA) += k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA) += srat_64.o
-
-hugetlbpage-y = ../../x86/mm/hugetlbpage.o
diff --git a/arch/x86_64/mm/extable_64.c b/arch/x86_64/mm/extable_64.c

deleted file mode 100644 (file)

index 79ac6e7..0000000
--- a/arch/x86_64/mm/extable_64.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * linux/arch/x86_64/mm/extable.c
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/init.h>
-#include <asm/uaccess.h>
-
-/* Simple binary search */
-const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
-              const struct exception_table_entry *last,
-              unsigned long value)
-{
-       /* Work around a B stepping K8 bug */
-       if ((value >> 32) == 0)
-               value |= 0xffffffffUL << 32; 
-
-        while (first <= last) {
-               const struct exception_table_entry *mid;
-               long diff;
-
-               mid = (last - first) / 2 + first;
-               diff = mid->insn - value;
-                if (diff == 0)
-                        return mid;
-                else if (diff < 0)
-                        first = mid+1;
-                else
-                        last = mid-1;
-        }
-        return NULL;
-}
diff --git a/arch/x86_64/mm/fault_64.c b/arch/x86_64/mm/fault_64.c

deleted file mode 100644 (file)

index 54816ad..0000000
--- a/arch/x86_64/mm/fault_64.c
+++ /dev/null
@@ -1,636 +0,0 @@
-/*
- *  linux/arch/x86-64/mm/fault.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>             /* For unblank_screen() */
-#include <linux/compiler.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-
-#include <asm/system.h>
-#include <asm/pgalloc.h>
-#include <asm/smp.h>
-#include <asm/tlbflush.h>
-#include <asm/proto.h>
-#include <asm-generic/sections.h>
-
-/* Page fault error code bits */
-#define PF_PROT        (1<<0)          /* or no page found */
-#define PF_WRITE       (1<<1)
-#define PF_USER        (1<<2)
-#define PF_RSVD        (1<<3)
-#define PF_INSTR       (1<<4)
-
-static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
-
-/* Hook to register for page fault notifications */
-int register_page_fault_notifier(struct notifier_block *nb)
-{
-       vmalloc_sync_all();
-       return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
-}
-EXPORT_SYMBOL_GPL(register_page_fault_notifier);
-
-int unregister_page_fault_notifier(struct notifier_block *nb)
-{
-       return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
-}
-EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
-
-static inline int notify_page_fault(struct pt_regs *regs, long err)
-{
-       struct die_args args = {
-               .regs = regs,
-               .str = "page fault",
-               .err = err,
-               .trapnr = 14,
-               .signr = SIGSEGV
-       };
-       return atomic_notifier_call_chain(&notify_page_fault_chain,
-                                         DIE_PAGE_FAULT, &args);
-}
-
-/* Sometimes the CPU reports invalid exceptions on prefetch.
-   Check that here and ignore.
-   Opcode checker based on code by Richard Brunner */
-static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
-                               unsigned long error_code)
-{ 
-       unsigned char *instr;
-       int scan_more = 1;
-       int prefetch = 0; 
-       unsigned char *max_instr;
-
-       /* If it was a exec fault ignore */
-       if (error_code & PF_INSTR)
-               return 0;
-       
-       instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
-       max_instr = instr + 15;
-
-       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
-               return 0;
-
-       while (scan_more && instr < max_instr) { 
-               unsigned char opcode;
-               unsigned char instr_hi;
-               unsigned char instr_lo;
-
-               if (probe_kernel_address(instr, opcode))
-                       break; 
-
-               instr_hi = opcode & 0xf0; 
-               instr_lo = opcode & 0x0f; 
-               instr++;
-
-               switch (instr_hi) { 
-               case 0x20:
-               case 0x30:
-                       /* Values 0x26,0x2E,0x36,0x3E are valid x86
-                          prefixes.  In long mode, the CPU will signal
-                          invalid opcode if some of these prefixes are
-                          present so we will never get here anyway */
-                       scan_more = ((instr_lo & 7) == 0x6);
-                       break;
-                       
-               case 0x40:
-                       /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
-                          Need to figure out under what instruction mode the
-                          instruction was issued ... */
-                       /* Could check the LDT for lm, but for now it's good
-                          enough to assume that long mode only uses well known
-                          segments or kernel. */
-                       scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
-                       break;
-                       
-               case 0x60:
-                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
-                       scan_more = (instr_lo & 0xC) == 0x4;
-                       break;          
-               case 0xF0:
-                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
-                       scan_more = !instr_lo || (instr_lo>>1) == 1;
-                       break;                  
-               case 0x00:
-                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
-                       scan_more = 0;
-                       if (probe_kernel_address(instr, opcode))
-                               break;
-                       prefetch = (instr_lo == 0xF) &&
-                               (opcode == 0x0D || opcode == 0x18);
-                       break;                  
-               default:
-                       scan_more = 0;
-                       break;
-               } 
-       }
-       return prefetch;
-}
-
-static int bad_address(void *p) 
-{ 
-       unsigned long dummy;
-       return probe_kernel_address((unsigned long *)p, dummy);
-} 
-
-void dump_pagetable(unsigned long address)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-
-       pgd = (pgd_t *)read_cr3();
-
-       pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 
-       pgd += pgd_index(address);
-       if (bad_address(pgd)) goto bad;
-       printk("PGD %lx ", pgd_val(*pgd));
-       if (!pgd_present(*pgd)) goto ret; 
-
-       pud = pud_offset(pgd, address);
-       if (bad_address(pud)) goto bad;
-       printk("PUD %lx ", pud_val(*pud));
-       if (!pud_present(*pud)) goto ret;
-
-       pmd = pmd_offset(pud, address);
-       if (bad_address(pmd)) goto bad;
-       printk("PMD %lx ", pmd_val(*pmd));
-       if (!pmd_present(*pmd)) goto ret;        
-
-       pte = pte_offset_kernel(pmd, address);
-       if (bad_address(pte)) goto bad;
-       printk("PTE %lx", pte_val(*pte)); 
-ret:
-       printk("\n");
-       return;
-bad:
-       printk("BAD\n");
-}
-
-static const char errata93_warning[] = 
-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
-KERN_ERR "******* Please consider a BIOS update.\n"
-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
-
-/* Workaround for K8 erratum #93 & buggy BIOS.
-   BIOS SMM functions are required to use a specific workaround
-   to avoid corruption of the 64bit RIP register on C stepping K8. 
-   A lot of BIOS that didn't get tested properly miss this. 
-   The OS sees this as a page fault with the upper 32bits of RIP cleared.
-   Try to work around it here.
-   Note we only handle faults in kernel here. */
-
-static int is_errata93(struct pt_regs *regs, unsigned long address) 
-{
-       static int warned;
-       if (address != regs->rip)
-               return 0;
-       if ((address >> 32) != 0) 
-               return 0;
-       address |= 0xffffffffUL << 32;
-       if ((address >= (u64)_stext && address <= (u64)_etext) || 
-           (address >= MODULES_VADDR && address <= MODULES_END)) { 
-               if (!warned) {
-                       printk(errata93_warning);               
-                       warned = 1;
-               }
-               regs->rip = address;
-               return 1;
-       }
-       return 0;
-} 
-
-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
-                                unsigned long error_code)
-{
-       unsigned long flags = oops_begin();
-       struct task_struct *tsk;
-
-       printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
-              current->comm, address);
-       dump_pagetable(address);
-       tsk = current;
-       tsk->thread.cr2 = address;
-       tsk->thread.trap_no = 14;
-       tsk->thread.error_code = error_code;
-       __die("Bad pagetable", regs, error_code);
-       oops_end(flags);
-       do_exit(SIGKILL);
-}
-
-/*
- * Handle a fault on the vmalloc area
- *
- * This assumes no large pages in there.
- */
-static int vmalloc_fault(unsigned long address)
-{
-       pgd_t *pgd, *pgd_ref;
-       pud_t *pud, *pud_ref;
-       pmd_t *pmd, *pmd_ref;
-       pte_t *pte, *pte_ref;
-
-       /* Copy kernel mappings over when needed. This can also
-          happen within a race in page table update. In the later
-          case just flush. */
-
-       pgd = pgd_offset(current->mm ?: &init_mm, address);
-       pgd_ref = pgd_offset_k(address);
-       if (pgd_none(*pgd_ref))
-               return -1;
-       if (pgd_none(*pgd))
-               set_pgd(pgd, *pgd_ref);
-       else
-               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-
-       /* Below here mismatches are bugs because these lower tables
-          are shared */
-
-       pud = pud_offset(pgd, address);
-       pud_ref = pud_offset(pgd_ref, address);
-       if (pud_none(*pud_ref))
-               return -1;
-       if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
-               BUG();
-       pmd = pmd_offset(pud, address);
-       pmd_ref = pmd_offset(pud_ref, address);
-       if (pmd_none(*pmd_ref))
-               return -1;
-       if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
-               BUG();
-       pte_ref = pte_offset_kernel(pmd_ref, address);
-       if (!pte_present(*pte_ref))
-               return -1;
-       pte = pte_offset_kernel(pmd, address);
-       /* Don't use pte_page here, because the mappings can point
-          outside mem_map, and the NUMA hash lookup cannot handle
-          that. */
-       if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
-               BUG();
-       return 0;
-}
-
-static int page_fault_trace;
-int show_unhandled_signals = 1;
-
-/*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- */
-asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
-                                       unsigned long error_code)
-{
-       struct task_struct *tsk;
-       struct mm_struct *mm;
-       struct vm_area_struct * vma;
-       unsigned long address;
-       const struct exception_table_entry *fixup;
-       int write, fault;
-       unsigned long flags;
-       siginfo_t info;
-
-       tsk = current;
-       mm = tsk->mm;
-       prefetchw(&mm->mmap_sem);
-
-       /* get the address */
-       address = read_cr2();
-
-       info.si_code = SEGV_MAPERR;
-
-
-       /*
-        * We fault-in kernel-space virtual memory on-demand. The
-        * 'reference' page table is init_mm.pgd.
-        *
-        * NOTE! We MUST NOT take any locks for this case. We may
-        * be in an interrupt or a critical region, and should
-        * only copy the information from the master page table,
-        * nothing more.
-        *
-        * This verifies that the fault happens in kernel space
-        * (error_code & 4) == 0, and that the fault was not a
-        * protection error (error_code & 9) == 0.
-        */
-       if (unlikely(address >= TASK_SIZE64)) {
-               /*
-                * Don't check for the module range here: its PML4
-                * is always initialized because it's shared with the main
-                * kernel text. Only vmalloc may need PML4 syncups.
-                */
-               if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
-                     ((address >= VMALLOC_START && address < VMALLOC_END))) {
-                       if (vmalloc_fault(address) >= 0)
-                               return;
-               }
-               if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
-                       return;
-               /*
-                * Don't take the mm semaphore here. If we fixup a prefetch
-                * fault we could otherwise deadlock.
-                */
-               goto bad_area_nosemaphore;
-       }
-
-       if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
-               return;
-
-       if (likely(regs->eflags & X86_EFLAGS_IF))
-               local_irq_enable();
-
-       if (unlikely(page_fault_trace))
-               printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
-                      regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); 
-
-       if (unlikely(error_code & PF_RSVD))
-               pgtable_bad(address, regs, error_code);
-
-       /*
-        * If we're in an interrupt or have no user
-        * context, we must not take the fault..
-        */
-       if (unlikely(in_atomic() || !mm))
-               goto bad_area_nosemaphore;
-
-       /*
-        * User-mode registers count as a user access even for any
-        * potential system fault or CPU buglet.
-        */
-       if (user_mode_vm(regs))
-               error_code |= PF_USER;
-
- again:
-       /* When running in the kernel we expect faults to occur only to
-        * addresses in user space.  All other faults represent errors in the
-        * kernel and should generate an OOPS.  Unfortunatly, in the case of an
-        * erroneous fault occurring in a code path which already holds mmap_sem
-        * we will deadlock attempting to validate the fault against the
-        * address space.  Luckily the kernel only validly references user
-        * space from well defined areas of code, which are listed in the
-        * exceptions table.
-        *
-        * As the vast majority of faults will be valid we will only perform
-        * the source reference check when there is a possibilty of a deadlock.
-        * Attempt to lock the address space, if we cannot we then validate the
-        * source.  If this is invalid we can skip the address space check,
-        * thus avoiding the deadlock.
-        */
-       if (!down_read_trylock(&mm->mmap_sem)) {
-               if ((error_code & PF_USER) == 0 &&
-                   !search_exception_tables(regs->rip))
-                       goto bad_area_nosemaphore;
-               down_read(&mm->mmap_sem);
-       }
-
-       vma = find_vma(mm, address);
-       if (!vma)
-               goto bad_area;
-       if (likely(vma->vm_start <= address))
-               goto good_area;
-       if (!(vma->vm_flags & VM_GROWSDOWN))
-               goto bad_area;
-       if (error_code & 4) {
-               /* Allow userspace just enough access below the stack pointer
-                * to let the 'enter' instruction work.
-                */
-               if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
-                       goto bad_area;
-       }
-       if (expand_stack(vma, address))
-               goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
-       info.si_code = SEGV_ACCERR;
-       write = 0;
-       switch (error_code & (PF_PROT|PF_WRITE)) {
-               default:        /* 3: write, present */
-                       /* fall through */
-               case PF_WRITE:          /* write, not present */
-                       if (!(vma->vm_flags & VM_WRITE))
-                               goto bad_area;
-                       write++;
-                       break;
-               case PF_PROT:           /* read, present */
-                       goto bad_area;
-               case 0:                 /* read, not present */
-                       if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-                               goto bad_area;
-       }
-
-       /*
-        * If for any reason at all we couldn't handle the fault,
-        * make sure we exit gracefully rather than endlessly redo
-        * the fault.
-        */
-       fault = handle_mm_fault(mm, vma, address, write);
-       if (unlikely(fault & VM_FAULT_ERROR)) {
-               if (fault & VM_FAULT_OOM)
-                       goto out_of_memory;
-               else if (fault & VM_FAULT_SIGBUS)
-                       goto do_sigbus;
-               BUG();
-       }
-       if (fault & VM_FAULT_MAJOR)
-               tsk->maj_flt++;
-       else
-               tsk->min_flt++;
-       up_read(&mm->mmap_sem);
-       return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-       up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-       /* User mode accesses just cause a SIGSEGV */
-       if (error_code & PF_USER) {
-
-               /*
-                * It's possible to have interrupts off here.
-                */
-               local_irq_enable();
-
-               if (is_prefetch(regs, address, error_code))
-                       return;
-
-               /* Work around K8 erratum #100 K8 in compat mode
-                  occasionally jumps to illegal addresses >4GB.  We
-                  catch this here in the page fault handler because
-                  these addresses are not reachable. Just detect this
-                  case and return.  Any code segment in LDT is
-                  compatibility mode. */
-               if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
-                   (address >> 32))
-                       return;
-
-               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-                   printk_ratelimit()) {
-                       printk(
-                      "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
-                                       tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
-                                       tsk->comm, tsk->pid, address, regs->rip,
-                                       regs->rsp, error_code);
-               }
-       
-               tsk->thread.cr2 = address;
-               /* Kernel addresses are always protection faults */
-               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
-               tsk->thread.trap_no = 14;
-               info.si_signo = SIGSEGV;
-               info.si_errno = 0;
-               /* info.si_code has been set above */
-               info.si_addr = (void __user *)address;
-               force_sig_info(SIGSEGV, &info, tsk);
-               return;
-       }
-
-no_context:
-       
-       /* Are we prepared to handle this kernel fault?  */
-       fixup = search_exception_tables(regs->rip);
-       if (fixup) {
-               regs->rip = fixup->fixup;
-               return;
-       }
-
-       /* 
-        * Hall of shame of CPU/BIOS bugs.
-        */
-
-       if (is_prefetch(regs, address, error_code))
-               return;
-
-       if (is_errata93(regs, address))
-               return; 
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-
-       flags = oops_begin();
-
-       if (address < PAGE_SIZE)
-               printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
-       else
-               printk(KERN_ALERT "Unable to handle kernel paging request");
-       printk(" at %016lx RIP: \n" KERN_ALERT,address);
-       printk_address(regs->rip);
-       dump_pagetable(address);
-       tsk->thread.cr2 = address;
-       tsk->thread.trap_no = 14;
-       tsk->thread.error_code = error_code;
-       __die("Oops", regs, error_code);
-       /* Executive summary in case the body of the oops scrolled away */
-       printk(KERN_EMERG "CR2: %016lx\n", address);
-       oops_end(flags);
-       do_exit(SIGKILL);
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
-       up_read(&mm->mmap_sem);
-       if (is_init(current)) {
-               yield();
-               goto again;
-       }
-       printk("VM: killing process %s\n", tsk->comm);
-       if (error_code & 4)
-               do_group_exit(SIGKILL);
-       goto no_context;
-
-do_sigbus:
-       up_read(&mm->mmap_sem);
-
-       /* Kernel mode? Handle exceptions or die */
-       if (!(error_code & PF_USER))
-               goto no_context;
-
-       tsk->thread.cr2 = address;
-       tsk->thread.error_code = error_code;
-       tsk->thread.trap_no = 14;
-       info.si_signo = SIGBUS;
-       info.si_errno = 0;
-       info.si_code = BUS_ADRERR;
-       info.si_addr = (void __user *)address;
-       force_sig_info(SIGBUS, &info, tsk);
-       return;
-}
-
-DEFINE_SPINLOCK(pgd_lock);
-LIST_HEAD(pgd_list);
-
-void vmalloc_sync_all(void)
-{
-       /* Note that races in the updates of insync and start aren't 
-          problematic:
-          insync can only get set bits added, and updates to start are only
-          improving performance (without affecting correctness if undone). */
-       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-       static unsigned long start = VMALLOC_START & PGDIR_MASK;
-       unsigned long address;
-
-       for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
-               if (!test_bit(pgd_index(address), insync)) {
-                       const pgd_t *pgd_ref = pgd_offset_k(address);
-                       struct page *page;
-
-                       if (pgd_none(*pgd_ref))
-                               continue;
-                       spin_lock(&pgd_lock);
-                       list_for_each_entry(page, &pgd_list, lru) {
-                               pgd_t *pgd;
-                               pgd = (pgd_t *)page_address(page) + pgd_index(address);
-                               if (pgd_none(*pgd))
-                                       set_pgd(pgd, *pgd_ref);
-                               else
-                                       BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-                       }
-                       spin_unlock(&pgd_lock);
-                       set_bit(pgd_index(address), insync);
-               }
-               if (address == start)
-                       start = address + PGDIR_SIZE;
-       }
-       /* Check that there is no need to do the same for the modules area. */
-       BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
-       BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 
-                               (__START_KERNEL & PGDIR_MASK)));
-}
-
-static int __init enable_pagefaulttrace(char *str)
-{
-       page_fault_trace = 1;
-       return 1;
-}
-__setup("pagefaulttrace", enable_pagefaulttrace);
diff --git a/arch/x86_64/mm/init_64.c b/arch/x86_64/mm/init_64.c

deleted file mode 100644 (file)

index 458893b..0000000
--- a/arch/x86_64/mm/init_64.c
+++ /dev/null
@@ -1,750 +0,0 @@
-/*
- *  linux/arch/x86_64/mm/init.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
- *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/pagemap.h>
-#include <linux/bootmem.h>
-#include <linux/proc_fs.h>
-#include <linux/pci.h>
-#include <linux/pfn.h>
-#include <linux/poison.h>
-#include <linux/dma-mapping.h>
-#include <linux/module.h>
-#include <linux/memory_hotplug.h>
-#include <linux/nmi.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/dma.h>
-#include <asm/fixmap.h>
-#include <asm/e820.h>
-#include <asm/apic.h>
-#include <asm/tlb.h>
-#include <asm/mmu_context.h>
-#include <asm/proto.h>
-#include <asm/smp.h>
-#include <asm/sections.h>
-
-#ifndef Dprintk
-#define Dprintk(x...)
-#endif
-
-const struct dma_mapping_ops* dma_ops;
-EXPORT_SYMBOL(dma_ops);
-
-static unsigned long dma_reserve __initdata;
-
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
-/*
- * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
- * physical space so we can cache the place of the first one and move
- * around without checking the pgd every time.
- */
-
-void show_mem(void)
-{
-       long i, total = 0, reserved = 0;
-       long shared = 0, cached = 0;
-       pg_data_t *pgdat;
-       struct page *page;
-
-       printk(KERN_INFO "Mem-info:\n");
-       show_free_areas();
-       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
-
-       for_each_online_pgdat(pgdat) {
-               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
-                       /* this loop can take a while with 256 GB and 4k pages
-                          so update the NMI watchdog */
-                       if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
-                               touch_nmi_watchdog();
-                       }
-                       if (!pfn_valid(pgdat->node_start_pfn + i))
-                               continue;
-                       page = pfn_to_page(pgdat->node_start_pfn + i);
-                       total++;
-                       if (PageReserved(page))
-                               reserved++;
-                       else if (PageSwapCache(page))
-                               cached++;
-                       else if (page_count(page))
-                               shared += page_count(page) - 1;
-               }
-       }
-       printk(KERN_INFO "%lu pages of RAM\n", total);
-       printk(KERN_INFO "%lu reserved pages\n",reserved);
-       printk(KERN_INFO "%lu pages shared\n",shared);
-       printk(KERN_INFO "%lu pages swap cached\n",cached);
-}
-
-int after_bootmem;
-
-static __init void *spp_getpage(void)
-{ 
-       void *ptr;
-       if (after_bootmem)
-               ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
-       else
-               ptr = alloc_bootmem_pages(PAGE_SIZE);
-       if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
-               panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
-
-       Dprintk("spp_getpage %p\n", ptr);
-       return ptr;
-} 
-
-static __init void set_pte_phys(unsigned long vaddr,
-                        unsigned long phys, pgprot_t prot)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte, new_pte;
-
-       Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
-
-       pgd = pgd_offset_k(vaddr);
-       if (pgd_none(*pgd)) {
-               printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
-               return;
-       }
-       pud = pud_offset(pgd, vaddr);
-       if (pud_none(*pud)) {
-               pmd = (pmd_t *) spp_getpage(); 
-               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
-               if (pmd != pmd_offset(pud, 0)) {
-                       printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
-                       return;
-               }
-       }
-       pmd = pmd_offset(pud, vaddr);
-       if (pmd_none(*pmd)) {
-               pte = (pte_t *) spp_getpage();
-               set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
-               if (pte != pte_offset_kernel(pmd, 0)) {
-                       printk("PAGETABLE BUG #02!\n");
-                       return;
-               }
-       }
-       new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
-
-       pte = pte_offset_kernel(pmd, vaddr);
-       if (!pte_none(*pte) &&
-           pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
-               pte_ERROR(*pte);
-       set_pte(pte, new_pte);
-
-       /*
-        * It's enough to flush this one mapping.
-        * (PGE mappings get flushed as well)
-        */
-       __flush_tlb_one(vaddr);
-}
-
-/* NOTE: this is meant to be run only at boot */
-void __init 
-__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
-{
-       unsigned long address = __fix_to_virt(idx);
-
-       if (idx >= __end_of_fixed_addresses) {
-               printk("Invalid __set_fixmap\n");
-               return;
-       }
-       set_pte_phys(address, phys, prot);
-}
-
-unsigned long __meminitdata table_start, table_end;
-
-static __meminit void *alloc_low_page(unsigned long *phys)
-{ 
-       unsigned long pfn = table_end++;
-       void *adr;
-
-       if (after_bootmem) {
-               adr = (void *)get_zeroed_page(GFP_ATOMIC);
-               *phys = __pa(adr);
-               return adr;
-       }
-
-       if (pfn >= end_pfn) 
-               panic("alloc_low_page: ran out of memory"); 
-
-       adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
-       memset(adr, 0, PAGE_SIZE);
-       *phys  = pfn * PAGE_SIZE;
-       return adr;
-}
-
-static __meminit void unmap_low_page(void *adr)
-{ 
-
-       if (after_bootmem)
-               return;
-
-       early_iounmap(adr, PAGE_SIZE);
-} 
-
-/* Must run before zap_low_mappings */
-__meminit void *early_ioremap(unsigned long addr, unsigned long size)
-{
-       unsigned long vaddr;
-       pmd_t *pmd, *last_pmd;
-       int i, pmds;
-
-       pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
-       vaddr = __START_KERNEL_map;
-       pmd = level2_kernel_pgt;
-       last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
-       for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
-               for (i = 0; i < pmds; i++) {
-                       if (pmd_present(pmd[i]))
-                               goto next;
-               }
-               vaddr += addr & ~PMD_MASK;
-               addr &= PMD_MASK;
-               for (i = 0; i < pmds; i++, addr += PMD_SIZE)
-                       set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
-               __flush_tlb();
-               return (void *)vaddr;
-       next:
-               ;
-       }
-       printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
-       return NULL;
-}
-
-/* To avoid virtual aliases later */
-__meminit void early_iounmap(void *addr, unsigned long size)
-{
-       unsigned long vaddr;
-       pmd_t *pmd;
-       int i, pmds;
-
-       vaddr = (unsigned long)addr;
-       pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
-       pmd = level2_kernel_pgt + pmd_index(vaddr);
-       for (i = 0; i < pmds; i++)
-               pmd_clear(pmd + i);
-       __flush_tlb();
-}
-
-static void __meminit
-phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
-{
-       int i = pmd_index(address);
-
-       for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
-               unsigned long entry;
-               pmd_t *pmd = pmd_page + pmd_index(address);
-
-               if (address >= end) {
-                       if (!after_bootmem)
-                               for (; i < PTRS_PER_PMD; i++, pmd++)
-                                       set_pmd(pmd, __pmd(0));
-                       break;
-               }
-
-               if (pmd_val(*pmd))
-                       continue;
-
-               entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
-               entry &= __supported_pte_mask;
-               set_pmd(pmd, __pmd(entry));
-       }
-}
-
-static void __meminit
-phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
-{
-       pmd_t *pmd = pmd_offset(pud,0);
-       spin_lock(&init_mm.page_table_lock);
-       phys_pmd_init(pmd, address, end);
-       spin_unlock(&init_mm.page_table_lock);
-       __flush_tlb_all();
-}
-
-static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
-{ 
-       int i = pud_index(addr);
-
-
-       for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
-               unsigned long pmd_phys;
-               pud_t *pud = pud_page + pud_index(addr);
-               pmd_t *pmd;
-
-               if (addr >= end)
-                       break;
-
-               if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
-                       set_pud(pud, __pud(0)); 
-                       continue;
-               } 
-
-               if (pud_val(*pud)) {
-                       phys_pmd_update(pud, addr, end);
-                       continue;
-               }
-
-               pmd = alloc_low_page(&pmd_phys);
-               spin_lock(&init_mm.page_table_lock);
-               set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
-               phys_pmd_init(pmd, addr, end);
-               spin_unlock(&init_mm.page_table_lock);
-               unmap_low_page(pmd);
-       }
-       __flush_tlb();
-} 
-
-static void __init find_early_table_space(unsigned long end)
-{
-       unsigned long puds, pmds, tables, start;
-
-       puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-       pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-       tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
-                round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
-
-       /* RED-PEN putting page tables only on node 0 could
-          cause a hotspot and fill up ZONE_DMA. The page tables
-          need roughly 0.5KB per GB. */
-       start = 0x8000;
-       table_start = find_e820_area(start, end, tables);
-       if (table_start == -1UL)
-               panic("Cannot find space for the kernel page tables");
-
-       table_start >>= PAGE_SHIFT;
-       table_end = table_start;
-
-       early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
-               end, table_start << PAGE_SHIFT,
-               (table_start << PAGE_SHIFT) + tables);
-}
-
-/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
-   This runs before bootmem is initialized and gets pages directly from the 
-   physical memory. To access them they are temporarily mapped. */
-void __meminit init_memory_mapping(unsigned long start, unsigned long end)
-{ 
-       unsigned long next; 
-
-       Dprintk("init_memory_mapping\n");
-
-       /* 
-        * Find space for the kernel direct mapping tables.
-        * Later we should allocate these tables in the local node of the memory
-        * mapped.  Unfortunately this is done currently before the nodes are 
-        * discovered.
-        */
-       if (!after_bootmem)
-               find_early_table_space(end);
-
-       start = (unsigned long)__va(start);
-       end = (unsigned long)__va(end);
-
-       for (; start < end; start = next) {
-               unsigned long pud_phys; 
-               pgd_t *pgd = pgd_offset_k(start);
-               pud_t *pud;
-
-               if (after_bootmem)
-                       pud = pud_offset(pgd, start & PGDIR_MASK);
-               else
-                       pud = alloc_low_page(&pud_phys);
-
-               next = start + PGDIR_SIZE;
-               if (next > end) 
-                       next = end; 
-               phys_pud_init(pud, __pa(start), __pa(next));
-               if (!after_bootmem)
-                       set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
-               unmap_low_page(pud);
-       } 
-
-       if (!after_bootmem)
-               mmu_cr4_features = read_cr4();
-       __flush_tlb_all();
-}
-
-#ifndef CONFIG_NUMA
-void __init paging_init(void)
-{
-       unsigned long max_zone_pfns[MAX_NR_ZONES];
-       memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-       max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-       max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-       max_zone_pfns[ZONE_NORMAL] = end_pfn;
-
-       memory_present(0, 0, end_pfn);
-       sparse_init();
-       free_area_init_nodes(max_zone_pfns);
-}
-#endif
-
-/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
-   from the CPU leading to inconsistent cache lines. address and size
-   must be aligned to 2MB boundaries. 
-   Does nothing when the mapping doesn't exist. */
-void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
-{
-       unsigned long end = address + size;
-
-       BUG_ON(address & ~LARGE_PAGE_MASK);
-       BUG_ON(size & ~LARGE_PAGE_MASK); 
-       
-       for (; address < end; address += LARGE_PAGE_SIZE) { 
-               pgd_t *pgd = pgd_offset_k(address);
-               pud_t *pud;
-               pmd_t *pmd;
-               if (pgd_none(*pgd))
-                       continue;
-               pud = pud_offset(pgd, address);
-               if (pud_none(*pud))
-                       continue; 
-               pmd = pmd_offset(pud, address);
-               if (!pmd || pmd_none(*pmd))
-                       continue; 
-               if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
-                       /* Could handle this, but it should not happen currently. */
-                       printk(KERN_ERR 
-              "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
-                       pmd_ERROR(*pmd); 
-               }
-               set_pmd(pmd, __pmd(0));                 
-       }
-       __flush_tlb_all();
-} 
-
-/*
- * Memory hotplug specific functions
- */
-void online_page(struct page *page)
-{
-       ClearPageReserved(page);
-       init_page_count(page);
-       __free_page(page);
-       totalram_pages++;
-       num_physpages++;
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-/*
- * Memory is added always to NORMAL zone. This means you will never get
- * additional DMA/DMA32 memory.
- */
-int arch_add_memory(int nid, u64 start, u64 size)
-{
-       struct pglist_data *pgdat = NODE_DATA(nid);
-       struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
-       unsigned long start_pfn = start >> PAGE_SHIFT;
-       unsigned long nr_pages = size >> PAGE_SHIFT;
-       int ret;
-
-       init_memory_mapping(start, (start + size -1));
-
-       ret = __add_pages(zone, start_pfn, nr_pages);
-       if (ret)
-               goto error;
-
-       return ret;
-error:
-       printk("%s: Problem encountered in __add_pages!\n", __func__);
-       return ret;
-}
-EXPORT_SYMBOL_GPL(arch_add_memory);
-
-int remove_memory(u64 start, u64 size)
-{
-       return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(remove_memory);
-
-#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
-int memory_add_physaddr_to_nid(u64 start)
-{
-       return 0;
-}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
-
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-/*
- * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
- * just online the pages.
- */
-int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
-{
-       int err = -EIO;
-       unsigned long pfn;
-       unsigned long total = 0, mem = 0;
-       for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
-               if (pfn_valid(pfn)) {
-                       online_page(pfn_to_page(pfn));
-                       err = 0;
-                       mem++;
-               }
-               total++;
-       }
-       if (!err) {
-               z->spanned_pages += total;
-               z->present_pages += mem;
-               z->zone_pgdat->node_spanned_pages += total;
-               z->zone_pgdat->node_present_pages += mem;
-       }
-       return err;
-}
-#endif
-
-static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
-                        kcore_vsyscall;
-
-void __init mem_init(void)
-{
-       long codesize, reservedpages, datasize, initsize;
-
-       pci_iommu_alloc();
-
-       /* clear the zero-page */
-       memset(empty_zero_page, 0, PAGE_SIZE);
-
-       reservedpages = 0;
-
-       /* this will put all low memory onto the freelists */
-#ifdef CONFIG_NUMA
-       totalram_pages = numa_free_all_bootmem();
-#else
-       totalram_pages = free_all_bootmem();
-#endif
-       reservedpages = end_pfn - totalram_pages -
-                                       absent_pages_in_range(0, end_pfn);
-
-       after_bootmem = 1;
-
-       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
-       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
-       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
-
-       /* Register memory areas for /proc/kcore */
-       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
-       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
-                  VMALLOC_END-VMALLOC_START);
-       kclist_add(&kcore_kernel, &_stext, _end - _stext);
-       kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
-       kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
-                                VSYSCALL_END - VSYSCALL_START);
-
-       printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
-               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
-               end_pfn << (PAGE_SHIFT-10),
-               codesize >> 10,
-               reservedpages << (PAGE_SHIFT-10),
-               datasize >> 10,
-               initsize >> 10);
-}
-
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
-{
-       unsigned long addr;
-
-       if (begin >= end)
-               return;
-
-       printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
-       for (addr = begin; addr < end; addr += PAGE_SIZE) {
-               ClearPageReserved(virt_to_page(addr));
-               init_page_count(virt_to_page(addr));
-               memset((void *)(addr & ~(PAGE_SIZE-1)),
-                       POISON_FREE_INITMEM, PAGE_SIZE);
-               if (addr >= __START_KERNEL_map)
-                       change_page_attr_addr(addr, 1, __pgprot(0));
-               free_page(addr);
-               totalram_pages++;
-       }
-       if (addr > __START_KERNEL_map)
-               global_flush_tlb();
-}
-
-void free_initmem(void)
-{
-       free_init_pages("unused kernel memory",
-                       (unsigned long)(&__init_begin),
-                       (unsigned long)(&__init_end));
-}
-
-#ifdef CONFIG_DEBUG_RODATA
-
-void mark_rodata_ro(void)
-{
-       unsigned long start = (unsigned long)_stext, end;
-
-#ifdef CONFIG_HOTPLUG_CPU
-       /* It must still be possible to apply SMP alternatives. */
-       if (num_possible_cpus() > 1)
-               start = (unsigned long)_etext;
-#endif
-
-#ifdef CONFIG_KPROBES
-       start = (unsigned long)__start_rodata;
-#endif
-       
-       end = (unsigned long)__end_rodata;
-       start = (start + PAGE_SIZE - 1) & PAGE_MASK;
-       end &= PAGE_MASK;
-       if (end <= start)
-               return;
-
-       change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
-
-       printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
-              (end - start) >> 10);
-
-       /*
-        * change_page_attr_addr() requires a global_flush_tlb() call after it.
-        * We do this after the printk so that if something went wrong in the
-        * change, the printk gets out at least to give a better debug hint
-        * of who is the culprit.
-        */
-       global_flush_tlb();
-}
-#endif
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_init_pages("initrd memory", start, end);
-}
-#endif
-
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
-{ 
-#ifdef CONFIG_NUMA
-       int nid = phys_to_nid(phys);
-#endif
-       unsigned long pfn = phys >> PAGE_SHIFT;
-       if (pfn >= end_pfn) {
-               /* This can happen with kdump kernels when accessing firmware
-                  tables. */
-               if (pfn < end_pfn_map)
-                       return;
-               printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
-                               phys, len);
-               return;
-       }
-
-       /* Should check here against the e820 map to avoid double free */
-#ifdef CONFIG_NUMA
-       reserve_bootmem_node(NODE_DATA(nid), phys, len);
-#else                  
-       reserve_bootmem(phys, len);    
-#endif
-       if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
-               dma_reserve += len / PAGE_SIZE;
-               set_dma_reserve(dma_reserve);
-       }
-}
-
-int kern_addr_valid(unsigned long addr) 
-{ 
-       unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-
-       if (above != 0 && above != -1UL)
-               return 0; 
-       
-       pgd = pgd_offset_k(addr);
-       if (pgd_none(*pgd))
-               return 0;
-
-       pud = pud_offset(pgd, addr);
-       if (pud_none(*pud))
-               return 0; 
-
-       pmd = pmd_offset(pud, addr);
-       if (pmd_none(*pmd))
-               return 0;
-       if (pmd_large(*pmd))
-               return pfn_valid(pmd_pfn(*pmd));
-
-       pte = pte_offset_kernel(pmd, addr);
-       if (pte_none(*pte))
-               return 0;
-       return pfn_valid(pte_pfn(*pte));
-}
-
-/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
-   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
-   not need special handling anymore. */
-
-static struct vm_area_struct gate_vma = {
-       .vm_start = VSYSCALL_START,
-       .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
-       .vm_page_prot = PAGE_READONLY_EXEC,
-       .vm_flags = VM_READ | VM_EXEC
-};
-
-struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
-{
-#ifdef CONFIG_IA32_EMULATION
-       if (test_tsk_thread_flag(tsk, TIF_IA32))
-               return NULL;
-#endif
-       return &gate_vma;
-}
-
-int in_gate_area(struct task_struct *task, unsigned long addr)
-{
-       struct vm_area_struct *vma = get_gate_vma(task);
-       if (!vma)
-               return 0;
-       return (addr >= vma->vm_start) && (addr < vma->vm_end);
-}
-
-/* Use this when you have no reliable task/vma, typically from interrupt
- * context.  It is less reliable than using the task's vma and may give
- * false positives.
- */
-int in_gate_area_no_task(unsigned long addr)
-{
-       return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
-}
-
-void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
-{
-       return __alloc_bootmem_core(pgdat->bdata, size,
-                       SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
-}
-
-const char *arch_vma_name(struct vm_area_struct *vma)
-{
-       if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
-               return "[vdso]";
-       if (vma == &gate_vma)
-               return "[vsyscall]";
-       return NULL;
-}
diff --git a/arch/x86_64/mm/ioremap_64.c b/arch/x86_64/mm/ioremap_64.c

deleted file mode 100644 (file)

index 6cac90a..0000000
--- a/arch/x86_64/mm/ioremap_64.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * arch/x86_64/mm/ioremap.c
- *
- * Re-map IO memory to kernel address space so that we can access it.
- * This is needed for high PCI addresses that aren't mapped in the
- * 640k-1MB IO memory area on PC's
- *
- * (C) Copyright 1995 1996 Linus Torvalds
- */
-
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/io.h>
-
-#include <asm/pgalloc.h>
-#include <asm/fixmap.h>
-#include <asm/tlbflush.h>
-#include <asm/cacheflush.h>
-#include <asm/proto.h>
-
-unsigned long __phys_addr(unsigned long x)
-{
-       if (x >= __START_KERNEL_map)
-               return x - __START_KERNEL_map + phys_base;
-       return x - PAGE_OFFSET;
-}
-EXPORT_SYMBOL(__phys_addr);
-
-#define ISA_START_ADDRESS      0xa0000
-#define ISA_END_ADDRESS                0x100000
-
-/*
- * Fix up the linear direct mapping of the kernel to avoid cache attribute
- * conflicts.
- */
-static int
-ioremap_change_attr(unsigned long phys_addr, unsigned long size,
-                                       unsigned long flags)
-{
-       int err = 0;
-       if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
-               unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-               unsigned long vaddr = (unsigned long) __va(phys_addr);
-
-               /*
-                * Must use a address here and not struct page because the phys addr
-                * can be a in hole between nodes and not have an memmap entry.
-                */
-               err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
-               if (!err)
-                       global_flush_tlb();
-       }
-       return err;
-}
-
-/*
- * Generic mapping function
- */
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
-{
-       void * addr;
-       struct vm_struct * area;
-       unsigned long offset, last_addr;
-       pgprot_t pgprot;
-
-       /* Don't allow wraparound or zero size */
-       last_addr = phys_addr + size - 1;
-       if (!size || last_addr < phys_addr)
-               return NULL;
-
-       /*
-        * Don't remap the low PCI/ISA area, it's always mapped..
-        */
-       if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
-               return (__force void __iomem *)phys_to_virt(phys_addr);
-
-#ifdef CONFIG_FLATMEM
-       /*
-        * Don't allow anybody to remap normal RAM that we're using..
-        */
-       if (last_addr < virt_to_phys(high_memory)) {
-               char *t_addr, *t_end;
-               struct page *page;
-
-               t_addr = __va(phys_addr);
-               t_end = t_addr + (size - 1);
-          
-               for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
-                       if(!PageReserved(page))
-                               return NULL;
-       }
-#endif
-
-       pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL
-                         | _PAGE_DIRTY | _PAGE_ACCESSED | flags);
-       /*
-        * Mappings have to be page-aligned
-        */
-       offset = phys_addr & ~PAGE_MASK;
-       phys_addr &= PAGE_MASK;
-       size = PAGE_ALIGN(last_addr+1) - phys_addr;
-
-       /*
-        * Ok, go for it..
-        */
-       area = get_vm_area(size, VM_IOREMAP | (flags << 20));
-       if (!area)
-               return NULL;
-       area->phys_addr = phys_addr;
-       addr = area->addr;
-       if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
-                              phys_addr, pgprot)) {
-               remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
-               return NULL;
-       }
-       if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) {
-               area->flags &= 0xffffff;
-               vunmap(addr);
-               return NULL;
-       }
-       return (__force void __iomem *) (offset + (char *)addr);
-}
-EXPORT_SYMBOL(__ioremap);
-
-/**
- * ioremap_nocache     -   map bus memory into CPU space
- * @offset:    bus address of the memory
- * @size:      size of the resource to map
- *
- * ioremap_nocache performs a platform specific sequence of operations to
- * make bus memory CPU accessible via the readb/readw/readl/writeb/
- * writew/writel functions and the other mmio helpers. The returned
- * address is not guaranteed to be usable directly as a virtual
- * address. 
- *
- * This version of ioremap ensures that the memory is marked uncachable
- * on the CPU as well as honouring existing caching rules from things like
- * the PCI bus. Note that there are other caches and buffers on many 
- * busses. In particular driver authors should read up on PCI writes
- *
- * It's useful if some control registers are in such an area and
- * write combining or read caching is not desirable:
- * 
- * Must be freed with iounmap.
- */
-
-void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
-{
-       return __ioremap(phys_addr, size, _PAGE_PCD);
-}
-EXPORT_SYMBOL(ioremap_nocache);
-
-/**
- * iounmap - Free a IO remapping
- * @addr: virtual address from ioremap_*
- *
- * Caller must ensure there is only one unmapping for the same pointer.
- */
-void iounmap(volatile void __iomem *addr)
-{
-       struct vm_struct *p, *o;
-
-       if (addr <= high_memory) 
-               return; 
-       if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
-               addr < phys_to_virt(ISA_END_ADDRESS))
-               return;
-
-       addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
-       /* Use the vm area unlocked, assuming the caller
-          ensures there isn't another iounmap for the same address
-          in parallel. Reuse of the virtual address is prevented by
-          leaving it in the global lists until we're done with it.
-          cpa takes care of the direct mappings. */
-       read_lock(&vmlist_lock);
-       for (p = vmlist; p; p = p->next) {
-               if (p->addr == addr)
-                       break;
-       }
-       read_unlock(&vmlist_lock);
-
-       if (!p) {
-               printk("iounmap: bad address %p\n", addr);
-               dump_stack();
-               return;
-       }
-
-       /* Reset the direct mapping. Can block */
-       if (p->flags >> 20)
-               ioremap_change_attr(p->phys_addr, p->size, 0);
-
-       /* Finally remove it */
-       o = remove_vm_area((void *)addr);
-       BUG_ON(p != o || o == NULL);
-       kfree(p); 
-}
-EXPORT_SYMBOL(iounmap);
-
diff --git a/arch/x86_64/mm/k8topology_64.c b/arch/x86_64/mm/k8topology_64.c

deleted file mode 100644 (file)

index a96006f..0000000
--- a/arch/x86_64/mm/k8topology_64.c
+++ /dev/null
@@ -1,182 +0,0 @@
-/* 
- * AMD K8 NUMA support.
- * Discover the memory map and associated nodes.
- * 
- * This version reads it directly from the K8 northbridge.
- * 
- * Copyright 2002,2003 Andi Kleen, SuSE Labs.
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/nodemask.h>
-#include <asm/io.h>
-#include <linux/pci_ids.h>
-#include <asm/types.h>
-#include <asm/mmzone.h>
-#include <asm/proto.h>
-#include <asm/e820.h>
-#include <asm/pci-direct.h>
-#include <asm/numa.h>
-
-static __init int find_northbridge(void)
-{
-       int num; 
-
-       for (num = 0; num < 32; num++) { 
-               u32 header;
-               
-               header = read_pci_config(0, num, 0, 0x00);  
-               if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)))
-                       continue;       
-
-               header = read_pci_config(0, num, 1, 0x00); 
-               if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)))
-                       continue;       
-               return num; 
-       } 
-
-       return -1;      
-}
-
-int __init k8_scan_nodes(unsigned long start, unsigned long end)
-{ 
-       unsigned long prevbase;
-       struct bootnode nodes[8];
-       int nodeid, i, j, nb;
-       unsigned char nodeids[8];
-       int found = 0;
-       u32 reg;
-       unsigned numnodes;
-       unsigned num_cores;
-
-       if (!early_pci_allowed())
-               return -1;
-
-       nb = find_northbridge(); 
-       if (nb < 0) 
-               return nb;
-
-       printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 
-
-       num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
-       printk(KERN_INFO "CPU has %d num_cores\n", num_cores);
-
-       reg = read_pci_config(0, nb, 0, 0x60); 
-       numnodes = ((reg >> 4) & 0xF) + 1;
-       if (numnodes <= 1)
-               return -1;
-
-       printk(KERN_INFO "Number of nodes %d\n", numnodes);
-
-       memset(&nodes,0,sizeof(nodes)); 
-       prevbase = 0;
-       for (i = 0; i < 8; i++) { 
-               unsigned long base,limit; 
-               u32 nodeid;
-               
-               base = read_pci_config(0, nb, 1, 0x40 + i*8);
-               limit = read_pci_config(0, nb, 1, 0x44 + i*8);
-
-               nodeid = limit & 7; 
-               nodeids[i] = nodeid;
-               if ((base & 3) == 0) { 
-                       if (i < numnodes)
-                               printk("Skipping disabled node %d\n", i); 
-                       continue;
-               } 
-               if (nodeid >= numnodes) {
-                       printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
-                              base, limit); 
-                       continue;
-               } 
-
-               if (!limit) { 
-                       printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i,
-                              base);
-                       continue;
-               }
-               if ((base >> 8) & 3 || (limit >> 8) & 3) {
-                       printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 
-                              nodeid, (base>>8)&3, (limit>>8) & 3); 
-                       return -1; 
-               }       
-               if (node_isset(nodeid, node_possible_map)) {
-                       printk(KERN_INFO "Node %d already present. Skipping\n", 
-                              nodeid);
-                       continue;
-               }
-
-               limit >>= 16; 
-               limit <<= 24; 
-               limit |= (1<<24)-1;
-               limit++;
-
-               if (limit > end_pfn << PAGE_SHIFT)
-                       limit = end_pfn << PAGE_SHIFT;
-               if (limit <= base)
-                       continue; 
-                       
-               base >>= 16;
-               base <<= 24; 
-
-               if (base < start) 
-                       base = start; 
-               if (limit > end) 
-                       limit = end; 
-               if (limit == base) { 
-                       printk(KERN_ERR "Empty node %d\n", nodeid); 
-                       continue; 
-               }
-               if (limit < base) { 
-                       printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
-                              nodeid, base, limit);                           
-                       continue;
-               } 
-               
-               /* Could sort here, but pun for now. Should not happen anyroads. */
-               if (prevbase > base) { 
-                       printk(KERN_ERR "Node map not sorted %lx,%lx\n",
-                              prevbase,base);
-                       return -1;
-               }
-                       
-               printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 
-                      nodeid, base, limit); 
-               
-               found++;
-               
-               nodes[nodeid].start = base; 
-               nodes[nodeid].end = limit;
-               e820_register_active_regions(nodeid,
-                               nodes[nodeid].start >> PAGE_SHIFT,
-                               nodes[nodeid].end >> PAGE_SHIFT);
-
-               prevbase = base;
-
-               node_set(nodeid, node_possible_map);
-       } 
-
-       if (!found)
-               return -1; 
-
-       memnode_shift = compute_hash_shift(nodes, 8);
-       if (memnode_shift < 0) { 
-               printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 
-               return -1; 
-       } 
-       printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 
-
-       for (i = 0; i < 8; i++) {
-               if (nodes[i].start != nodes[i].end) { 
-                       nodeid = nodeids[i];
-                       for (j = 0; j < num_cores; j++)
-                               apicid_to_node[(nodeid * num_cores) + j] = i;
-                       setup_node_bootmem(i, nodes[i].start, nodes[i].end); 
-               } 
-       }
-
-       numa_init_array();
-       return 0;
-} 
diff --git a/arch/x86_64/mm/mmap_64.c b/arch/x86_64/mm/mmap_64.c

deleted file mode 100644 (file)

index 80bba0d..0000000
--- a/arch/x86_64/mm/mmap_64.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2005 Andi Kleen, SuSE Labs.
- * Licensed under GPL, v.2
- */
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/random.h>
-#include <asm/ia32.h>
-
-/* Notebook: move the mmap code from sys_x86_64.c over here. */
-
-void arch_pick_mmap_layout(struct mm_struct *mm)
-{
-#ifdef CONFIG_IA32_EMULATION
-       if (current_thread_info()->flags & _TIF_IA32)
-               return ia32_pick_mmap_layout(mm);
-#endif
-       mm->mmap_base = TASK_UNMAPPED_BASE;
-       if (current->flags & PF_RANDOMIZE) {
-               /* Add 28bit randomness which is about 40bits of address space
-                  because mmap base has to be page aligned.
-                  or ~1/128 of the total user VM
-                  (total user address space is 47bits) */
-               unsigned rnd = get_random_int() & 0xfffffff;
-               mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
-       }
-       mm->get_unmapped_area = arch_get_unmapped_area;
-       mm->unmap_area = arch_unmap_area;
-}
-
diff --git a/arch/x86_64/mm/numa_64.c b/arch/x86_64/mm/numa_64.c

deleted file mode 100644 (file)

index 6da2355..0000000
--- a/arch/x86_64/mm/numa_64.c
+++ /dev/null
@@ -1,648 +0,0 @@
-/* 
- * Generic VM initialization for x86-64 NUMA setups.
- * Copyright 2002,2003 Andi Kleen, SuSE Labs.
- */ 
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/mmzone.h>
-#include <linux/ctype.h>
-#include <linux/module.h>
-#include <linux/nodemask.h>
-
-#include <asm/e820.h>
-#include <asm/proto.h>
-#include <asm/dma.h>
-#include <asm/numa.h>
-#include <asm/acpi.h>
-
-#ifndef Dprintk
-#define Dprintk(x...)
-#endif
-
-struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
-bootmem_data_t plat_node_bdata[MAX_NUMNODES];
-
-struct memnode memnode;
-
-unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
-       [0 ... NR_CPUS-1] = NUMA_NO_NODE
-};
-unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
-       [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-};
-cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
-
-int numa_off __initdata;
-unsigned long __initdata nodemap_addr;
-unsigned long __initdata nodemap_size;
-
-
-/*
- * Given a shift value, try to populate memnodemap[]
- * Returns :
- * 1 if OK
- * 0 if memnodmap[] too small (of shift too small)
- * -1 if node overlap or lost ram (shift too big)
- */
-static int __init
-populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
-{
-       int i; 
-       int res = -1;
-       unsigned long addr, end;
-
-       memset(memnodemap, 0xff, memnodemapsize);
-       for (i = 0; i < numnodes; i++) {
-               addr = nodes[i].start;
-               end = nodes[i].end;
-               if (addr >= end)
-                       continue;
-               if ((end >> shift) >= memnodemapsize)
-                       return 0;
-               do {
-                       if (memnodemap[addr >> shift] != 0xff)
-                               return -1;
-                       memnodemap[addr >> shift] = i;
-                       addr += (1UL << shift);
-               } while (addr < end);
-               res = 1;
-       } 
-       return res;
-}
-
-static int __init allocate_cachealigned_memnodemap(void)
-{
-       unsigned long pad, pad_addr;
-
-       memnodemap = memnode.embedded_map;
-       if (memnodemapsize <= 48)
-               return 0;
-
-       pad = L1_CACHE_BYTES - 1;
-       pad_addr = 0x8000;
-       nodemap_size = pad + memnodemapsize;
-       nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
-                                     nodemap_size);
-       if (nodemap_addr == -1UL) {
-               printk(KERN_ERR
-                      "NUMA: Unable to allocate Memory to Node hash map\n");
-               nodemap_addr = nodemap_size = 0;
-               return -1;
-       }
-       pad_addr = (nodemap_addr + pad) & ~pad;
-       memnodemap = phys_to_virt(pad_addr);
-
-       printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
-              nodemap_addr, nodemap_addr + nodemap_size);
-       return 0;
-}
-
-/*
- * The LSB of all start and end addresses in the node map is the value of the
- * maximum possible shift.
- */
-static int __init
-extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
-{
-       int i, nodes_used = 0;
-       unsigned long start, end;
-       unsigned long bitfield = 0, memtop = 0;
-
-       for (i = 0; i < numnodes; i++) {
-               start = nodes[i].start;
-               end = nodes[i].end;
-               if (start >= end)
-                       continue;
-               bitfield |= start;
-               nodes_used++;
-               if (end > memtop)
-                       memtop = end;
-       }
-       if (nodes_used <= 1)
-               i = 63;
-       else
-               i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
-       memnodemapsize = (memtop >> i)+1;
-       return i;
-}
-
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
-{
-       int shift;
-
-       shift = extract_lsb_from_nodes(nodes, numnodes);
-       if (allocate_cachealigned_memnodemap())
-               return -1;
-       printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
-               shift);
-
-       if (populate_memnodemap(nodes, numnodes, shift) != 1) {
-               printk(KERN_INFO
-       "Your memory is not aligned you need to rebuild your kernel "
-       "with a bigger NODEMAPSIZE shift=%d\n",
-                       shift);
-               return -1;
-       }
-       return shift;
-}
-
-#ifdef CONFIG_SPARSEMEM
-int early_pfn_to_nid(unsigned long pfn)
-{
-       return phys_to_nid(pfn << PAGE_SHIFT);
-}
-#endif
-
-static void * __init
-early_node_mem(int nodeid, unsigned long start, unsigned long end,
-             unsigned long size)
-{
-       unsigned long mem = find_e820_area(start, end, size);
-       void *ptr;
-       if (mem != -1L)
-               return __va(mem);
-       ptr = __alloc_bootmem_nopanic(size,
-                               SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
-       if (ptr == 0) {
-               printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
-                       size, nodeid);
-               return NULL;
-       }
-       return ptr;
-}
-
-/* Initialize bootmem allocator for a node */
-void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
-{ 
-       unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 
-       unsigned long nodedata_phys;
-       void *bootmap;
-       const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
-
-       start = round_up(start, ZONE_ALIGN); 
-
-       printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
-
-       start_pfn = start >> PAGE_SHIFT;
-       end_pfn = end >> PAGE_SHIFT;
-
-       node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
-       if (node_data[nodeid] == NULL)
-               return;
-       nodedata_phys = __pa(node_data[nodeid]);
-
-       memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
-       NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
-       NODE_DATA(nodeid)->node_start_pfn = start_pfn;
-       NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
-
-       /* Find a place for the bootmem map */
-       bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 
-       bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
-       bootmap = early_node_mem(nodeid, bootmap_start, end,
-                                       bootmap_pages<<PAGE_SHIFT);
-       if (bootmap == NULL)  {
-               if (nodedata_phys < start || nodedata_phys >= end)
-                       free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
-               node_data[nodeid] = NULL;
-               return;
-       }
-       bootmap_start = __pa(bootmap);
-       Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 
-       
-       bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
-                                        bootmap_start >> PAGE_SHIFT, 
-                                        start_pfn, end_pfn); 
-
-       free_bootmem_with_active_regions(nodeid, end);
-
-       reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
-       reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
-#ifdef CONFIG_ACPI_NUMA
-       srat_reserve_add_area(nodeid);
-#endif
-       node_set_online(nodeid);
-} 
-
-/* Initialize final allocator for a zone */
-void __init setup_node_zones(int nodeid)
-{ 
-       unsigned long start_pfn, end_pfn, memmapsize, limit;
-
-       start_pfn = node_start_pfn(nodeid);
-       end_pfn = node_end_pfn(nodeid);
-
-       Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
-               nodeid, start_pfn, end_pfn);
-
-       /* Try to allocate mem_map at end to not fill up precious <4GB
-          memory. */
-       memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
-       limit = end_pfn << PAGE_SHIFT;
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
-       NODE_DATA(nodeid)->node_mem_map = 
-               __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
-                               memmapsize, SMP_CACHE_BYTES, 
-                               round_down(limit - memmapsize, PAGE_SIZE), 
-                               limit);
-#endif
-} 
-
-void __init numa_init_array(void)
-{
-       int rr, i;
-       /* There are unfortunately some poorly designed mainboards around
-          that only connect memory to a single CPU. This breaks the 1:1 cpu->node
-          mapping. To avoid this fill in the mapping for all possible
-          CPUs, as the number of CPUs is not known yet. 
-          We round robin the existing nodes. */
-       rr = first_node(node_online_map);
-       for (i = 0; i < NR_CPUS; i++) {
-               if (cpu_to_node[i] != NUMA_NO_NODE)
-                       continue;
-               numa_set_node(i, rr);
-               rr = next_node(rr, node_online_map);
-               if (rr == MAX_NUMNODES)
-                       rr = first_node(node_online_map);
-       }
-
-}
-
-#ifdef CONFIG_NUMA_EMU
-/* Numa emulation */
-char *cmdline __initdata;
-
-/*
- * Setups up nid to range from addr to addr + size.  If the end boundary is
- * greater than max_addr, then max_addr is used instead.  The return value is 0
- * if there is additional memory left for allocation past addr and -1 otherwise.
- * addr is adjusted to be at the end of the node.
- */
-static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
-                                  u64 size, u64 max_addr)
-{
-       int ret = 0;
-       nodes[nid].start = *addr;
-       *addr += size;
-       if (*addr >= max_addr) {
-               *addr = max_addr;
-               ret = -1;
-       }
-       nodes[nid].end = *addr;
-       node_set(nid, node_possible_map);
-       printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
-              nodes[nid].start, nodes[nid].end,
-              (nodes[nid].end - nodes[nid].start) >> 20);
-       return ret;
-}
-
-/*
- * Splits num_nodes nodes up equally starting at node_start.  The return value
- * is the number of nodes split up and addr is adjusted to be at the end of the
- * last node allocated.
- */
-static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
-                                     u64 max_addr, int node_start,
-                                     int num_nodes)
-{
-       unsigned int big;
-       u64 size;
-       int i;
-
-       if (num_nodes <= 0)
-               return -1;
-       if (num_nodes > MAX_NUMNODES)
-               num_nodes = MAX_NUMNODES;
-       size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
-              num_nodes;
-       /*
-        * Calculate the number of big nodes that can be allocated as a result
-        * of consolidating the leftovers.
-        */
-       big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
-             FAKE_NODE_MIN_SIZE;
-
-       /* Round down to nearest FAKE_NODE_MIN_SIZE. */
-       size &= FAKE_NODE_MIN_HASH_MASK;
-       if (!size) {
-               printk(KERN_ERR "Not enough memory for each node.  "
-                      "NUMA emulation disabled.\n");
-               return -1;
-       }
-
-       for (i = node_start; i < num_nodes + node_start; i++) {
-               u64 end = *addr + size;
-               if (i < big)
-                       end += FAKE_NODE_MIN_SIZE;
-               /*
-                * The final node can have the remaining system RAM.  Other
-                * nodes receive roughly the same amount of available pages.
-                */
-               if (i == num_nodes + node_start - 1)
-                       end = max_addr;
-               else
-                       while (end - *addr - e820_hole_size(*addr, end) <
-                              size) {
-                               end += FAKE_NODE_MIN_SIZE;
-                               if (end > max_addr) {
-                                       end = max_addr;
-                                       break;
-                               }
-                       }
-               if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
-                       break;
-       }
-       return i - node_start + 1;
-}
-
-/*
- * Splits the remaining system RAM into chunks of size.  The remaining memory is
- * always assigned to a final node and can be asymmetric.  Returns the number of
- * nodes split.
- */
-static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
-                                     u64 max_addr, int node_start, u64 size)
-{
-       int i = node_start;
-       size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-       while (!setup_node_range(i++, nodes, addr, size, max_addr))
-               ;
-       return i - node_start;
-}
-
-/*
- * Sets up the system RAM area from start_pfn to end_pfn according to the
- * numa=fake command-line option.
- */
-static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
-{
-       struct bootnode nodes[MAX_NUMNODES];
-       u64 addr = start_pfn << PAGE_SHIFT;
-       u64 max_addr = end_pfn << PAGE_SHIFT;
-       int num_nodes = 0;
-       int coeff_flag;
-       int coeff = -1;
-       int num = 0;
-       u64 size;
-       int i;
-
-       memset(&nodes, 0, sizeof(nodes));
-       /*
-        * If the numa=fake command-line is just a single number N, split the
-        * system RAM into N fake nodes.
-        */
-       if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
-               num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
-                                               simple_strtol(cmdline, NULL, 0));
-               if (num_nodes < 0)
-                       return num_nodes;
-               goto out;
-       }
-
-       /* Parse the command line. */
-       for (coeff_flag = 0; ; cmdline++) {
-               if (*cmdline && isdigit(*cmdline)) {
-                       num = num * 10 + *cmdline - '0';
-                       continue;
-               }
-               if (*cmdline == '*') {
-                       if (num > 0)
-                               coeff = num;
-                       coeff_flag = 1;
-               }
-               if (!*cmdline || *cmdline == ',') {
-                       if (!coeff_flag)
-                               coeff = 1;
-                       /*
-                        * Round down to the nearest FAKE_NODE_MIN_SIZE.
-                        * Command-line coefficients are in megabytes.
-                        */
-                       size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
-                       if (size)
-                               for (i = 0; i < coeff; i++, num_nodes++)
-                                       if (setup_node_range(num_nodes, nodes,
-                                               &addr, size, max_addr) < 0)
-                                               goto done;
-                       if (!*cmdline)
-                               break;
-                       coeff_flag = 0;
-                       coeff = -1;
-               }
-               num = 0;
-       }
-done:
-       if (!num_nodes)
-               return -1;
-       /* Fill remainder of system RAM, if appropriate. */
-       if (addr < max_addr) {
-               if (coeff_flag && coeff < 0) {
-                       /* Split remaining nodes into num-sized chunks */
-                       num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
-                                                        num_nodes, num);
-                       goto out;
-               }
-               switch (*(cmdline - 1)) {
-               case '*':
-                       /* Split remaining nodes into coeff chunks */
-                       if (coeff <= 0)
-                               break;
-                       num_nodes += split_nodes_equally(nodes, &addr, max_addr,
-                                                        num_nodes, coeff);
-                       break;
-               case ',':
-                       /* Do not allocate remaining system RAM */
-                       break;
-               default:
-                       /* Give one final node */
-                       setup_node_range(num_nodes, nodes, &addr,
-                                        max_addr - addr, max_addr);
-                       num_nodes++;
-               }
-       }
-out:
-       memnode_shift = compute_hash_shift(nodes, num_nodes);
-       if (memnode_shift < 0) {
-               memnode_shift = 0;
-               printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
-                      "disabled.\n");
-               return -1;
-       }
-
-       /*
-        * We need to vacate all active ranges that may have been registered by
-        * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
-        * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
-        */
-       remove_all_active_ranges();
-#ifdef CONFIG_ACPI_NUMA
-       acpi_numa = -1;
-#endif
-       for_each_node_mask(i, node_possible_map) {
-               e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
-                                               nodes[i].end >> PAGE_SHIFT);
-               setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-       }
-       acpi_fake_nodes(nodes, num_nodes);
-       numa_init_array();
-       return 0;
-}
-#endif /* CONFIG_NUMA_EMU */
-
-void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
-{ 
-       int i;
-
-       nodes_clear(node_possible_map);
-
-#ifdef CONFIG_NUMA_EMU
-       if (cmdline && !numa_emulation(start_pfn, end_pfn))
-               return;
-       nodes_clear(node_possible_map);
-#endif
-
-#ifdef CONFIG_ACPI_NUMA
-       if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
-                                         end_pfn << PAGE_SHIFT))
-               return;
-       nodes_clear(node_possible_map);
-#endif
-
-#ifdef CONFIG_K8_NUMA
-       if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
-               return;
-       nodes_clear(node_possible_map);
-#endif
-       printk(KERN_INFO "%s\n",
-              numa_off ? "NUMA turned off" : "No NUMA configuration found");
-
-       printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 
-              start_pfn << PAGE_SHIFT,
-              end_pfn << PAGE_SHIFT); 
-               /* setup dummy node covering all memory */ 
-       memnode_shift = 63; 
-       memnodemap = memnode.embedded_map;
-       memnodemap[0] = 0;
-       nodes_clear(node_online_map);
-       node_set_online(0);
-       node_set(0, node_possible_map);
-       for (i = 0; i < NR_CPUS; i++)
-               numa_set_node(i, 0);
-       node_to_cpumask[0] = cpumask_of_cpu(0);
-       e820_register_active_regions(0, start_pfn, end_pfn);
-       setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
-}
-
-__cpuinit void numa_add_cpu(int cpu)
-{
-       set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
-} 
-
-void __cpuinit numa_set_node(int cpu, int node)
-{
-       cpu_pda(cpu)->nodenumber = node;
-       cpu_to_node[cpu] = node;
-}
-
-unsigned long __init numa_free_all_bootmem(void) 
-{ 
-       int i;
-       unsigned long pages = 0;
-       for_each_online_node(i) {
-               pages += free_all_bootmem_node(NODE_DATA(i));
-       }
-       return pages;
-} 
-
-void __init paging_init(void)
-{ 
-       int i;
-       unsigned long max_zone_pfns[MAX_NR_ZONES];
-       memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-       max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-       max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-       max_zone_pfns[ZONE_NORMAL] = end_pfn;
-
-       sparse_memory_present_with_active_regions(MAX_NUMNODES);
-       sparse_init();
-
-       for_each_online_node(i) {
-               setup_node_zones(i); 
-       }
-
-       free_area_init_nodes(max_zone_pfns);
-} 
-
-static __init int numa_setup(char *opt)
-{ 
-       if (!opt)
-               return -EINVAL;
-       if (!strncmp(opt,"off",3))
-               numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
-       if (!strncmp(opt, "fake=", 5))
-               cmdline = opt + 5;
-#endif
-#ifdef CONFIG_ACPI_NUMA
-       if (!strncmp(opt,"noacpi",6))
-               acpi_numa = -1;
-       if (!strncmp(opt,"hotadd=", 7))
-               hotadd_percent = simple_strtoul(opt+7, NULL, 10);
-#endif
-       return 0;
-} 
-
-early_param("numa", numa_setup);
-
-/*
- * Setup early cpu_to_node.
- *
- * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
- * and apicid_to_node[] tables have valid entries for a CPU.
- * This means we skip cpu_to_node[] initialisation for NUMA
- * emulation and faking node case (when running a kernel compiled
- * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
- * is already initialized in a round robin manner at numa_init_array,
- * prior to this call, and this initialization is good enough
- * for the fake NUMA cases.
- */
-void __init init_cpu_to_node(void)
-{
-       int i;
-       for (i = 0; i < NR_CPUS; i++) {
-               u8 apicid = x86_cpu_to_apicid[i];
-               if (apicid == BAD_APICID)
-                       continue;
-               if (apicid_to_node[apicid] == NUMA_NO_NODE)
-                       continue;
-               numa_set_node(i,apicid_to_node[apicid]);
-       }
-}
-
-EXPORT_SYMBOL(cpu_to_node);
-EXPORT_SYMBOL(node_to_cpumask);
-EXPORT_SYMBOL(memnode);
-EXPORT_SYMBOL(node_data);
-
-#ifdef CONFIG_DISCONTIGMEM
-/*
- * Functions to convert PFNs from/to per node page addresses.
- * These are out of line because they are quite big.
- * They could be all tuned by pre caching more state.
- * Should do that.
- */
-
-int pfn_valid(unsigned long pfn)
-{
-       unsigned nid;
-       if (pfn >= num_physpages)
-               return 0;
-       nid = pfn_to_nid(pfn);
-       if (nid == 0xff)
-               return 0;
-       return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
-}
-EXPORT_SYMBOL(pfn_valid);
-#endif
diff --git a/arch/x86_64/mm/pageattr_64.c b/arch/x86_64/mm/pageattr_64.c

deleted file mode 100644 (file)

index 10b9809..0000000
--- a/arch/x86_64/mm/pageattr_64.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/* 
- * Copyright 2002 Andi Kleen, SuSE Labs. 
- * Thanks to Ben LaHaise for precious feedback.
- */ 
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-
-pte_t *lookup_address(unsigned long address)
-{ 
-       pgd_t *pgd = pgd_offset_k(address);
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-       if (pgd_none(*pgd))
-               return NULL;
-       pud = pud_offset(pgd, address);
-       if (!pud_present(*pud))
-               return NULL; 
-       pmd = pmd_offset(pud, address);
-       if (!pmd_present(*pmd))
-               return NULL; 
-       if (pmd_large(*pmd))
-               return (pte_t *)pmd;
-       pte = pte_offset_kernel(pmd, address);
-       if (pte && !pte_present(*pte))
-               pte = NULL; 
-       return pte;
-} 
-
-static struct page *split_large_page(unsigned long address, pgprot_t prot,
-                                    pgprot_t ref_prot)
-{ 
-       int i; 
-       unsigned long addr;
-       struct page *base = alloc_pages(GFP_KERNEL, 0);
-       pte_t *pbase;
-       if (!base) 
-               return NULL;
-       /*
-        * page_private is used to track the number of entries in
-        * the page table page have non standard attributes.
-        */
-       SetPagePrivate(base);
-       page_private(base) = 0;
-
-       address = __pa(address);
-       addr = address & LARGE_PAGE_MASK; 
-       pbase = (pte_t *)page_address(base);
-       for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
-               pbase[i] = pfn_pte(addr >> PAGE_SHIFT, 
-                                  addr == address ? prot : ref_prot);
-       }
-       return base;
-} 
-
-static void cache_flush_page(void *adr)
-{
-       int i;
-       for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
-               asm volatile("clflush (%0)" :: "r" (adr + i));
-}
-
-static void flush_kernel_map(void *arg)
-{
-       struct list_head *l = (struct list_head *)arg;
-       struct page *pg;
-
-       /* When clflush is available always use it because it is
-          much cheaper than WBINVD. */
-       /* clflush is still broken. Disable for now. */
-       if (1 || !cpu_has_clflush)
-               asm volatile("wbinvd" ::: "memory");
-       else list_for_each_entry(pg, l, lru) {
-               void *adr = page_address(pg);
-               cache_flush_page(adr);
-       }
-       __flush_tlb_all();
-}
-
-static inline void flush_map(struct list_head *l)
-{      
-       on_each_cpu(flush_kernel_map, l, 1, 1);
-}
-
-static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
-
-static inline void save_page(struct page *fpage)
-{
-       if (!test_and_set_bit(PG_arch_1, &fpage->flags))
-               list_add(&fpage->lru, &deferred_pages);
-}
-
-/* 
- * No more special protections in this 2/4MB area - revert to a
- * large page again. 
- */
-static void revert_page(unsigned long address, pgprot_t ref_prot)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t large_pte;
-       unsigned long pfn;
-
-       pgd = pgd_offset_k(address);
-       BUG_ON(pgd_none(*pgd));
-       pud = pud_offset(pgd,address);
-       BUG_ON(pud_none(*pud));
-       pmd = pmd_offset(pud, address);
-       BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
-       pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
-       large_pte = pfn_pte(pfn, ref_prot);
-       large_pte = pte_mkhuge(large_pte);
-       set_pte((pte_t *)pmd, large_pte);
-}      
-
-static int
-__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
-                                  pgprot_t ref_prot)
-{ 
-       pte_t *kpte; 
-       struct page *kpte_page;
-       pgprot_t ref_prot2;
-
-       kpte = lookup_address(address);
-       if (!kpte) return 0;
-       kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
-       BUG_ON(PageLRU(kpte_page));
-       BUG_ON(PageCompound(kpte_page));
-       if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
-               if (!pte_huge(*kpte)) {
-                       set_pte(kpte, pfn_pte(pfn, prot));
-               } else {
-                       /*
-                        * split_large_page will take the reference for this
-                        * change_page_attr on the split page.
-                        */
-                       struct page *split;
-                       ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
-                       split = split_large_page(address, prot, ref_prot2);
-                       if (!split)
-                               return -ENOMEM;
-                       set_pte(kpte, mk_pte(split, ref_prot2));
-                       kpte_page = split;
-               }
-               page_private(kpte_page)++;
-       } else if (!pte_huge(*kpte)) {
-               set_pte(kpte, pfn_pte(pfn, ref_prot));
-               BUG_ON(page_private(kpte_page) == 0);
-               page_private(kpte_page)--;
-       } else
-               BUG();
-
-       /* on x86-64 the direct mapping set at boot is not using 4k pages */
-       BUG_ON(PageReserved(kpte_page));
-
-       save_page(kpte_page);
-       if (page_private(kpte_page) == 0)
-               revert_page(address, ref_prot);
-       return 0;
-} 
-
-/*
- * Change the page attributes of an page in the linear mapping.
- *
- * This should be used when a page is mapped with a different caching policy
- * than write-back somewhere - some CPUs do not like it when mappings with
- * different caching policies exist. This changes the page attributes of the
- * in kernel linear mapping too.
- * 
- * The caller needs to ensure that there are no conflicting mappings elsewhere.
- * This function only deals with the kernel linear map.
- * 
- * Caller must call global_flush_tlb() after this.
- */
-int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
-{
-       int err = 0, kernel_map = 0;
-       int i; 
-
-       if (address >= __START_KERNEL_map
-           && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
-               address = (unsigned long)__va(__pa(address));
-               kernel_map = 1;
-       }
-
-       down_write(&init_mm.mmap_sem);
-       for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
-               unsigned long pfn = __pa(address) >> PAGE_SHIFT;
-
-               if (!kernel_map || pte_present(pfn_pte(0, prot))) {
-                       err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
-                       if (err)
-                               break;
-               }
-               /* Handle kernel mapping too which aliases part of the
-                * lowmem */
-               if (__pa(address) < KERNEL_TEXT_SIZE) {
-                       unsigned long addr2;
-                       pgprot_t prot2;
-                       addr2 = __START_KERNEL_map + __pa(address);
-                       /* Make sure the kernel mappings stay executable */
-                       prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
-                       err = __change_page_attr(addr2, pfn, prot2,
-                                                PAGE_KERNEL_EXEC);
-               } 
-       }       
-       up_write(&init_mm.mmap_sem); 
-       return err;
-}
-
-/* Don't call this for MMIO areas that may not have a mem_map entry */
-int change_page_attr(struct page *page, int numpages, pgprot_t prot)
-{
-       unsigned long addr = (unsigned long)page_address(page);
-       return change_page_attr_addr(addr, numpages, prot);
-}
-
-void global_flush_tlb(void)
-{ 
-       struct page *pg, *next;
-       struct list_head l;
-
-       down_read(&init_mm.mmap_sem);
-       list_replace_init(&deferred_pages, &l);
-       up_read(&init_mm.mmap_sem);
-
-       flush_map(&l);
-
-       list_for_each_entry_safe(pg, next, &l, lru) {
-               list_del(&pg->lru);
-               clear_bit(PG_arch_1, &pg->flags);
-               if (page_private(pg) != 0)
-                       continue;
-               ClearPagePrivate(pg);
-               __free_page(pg);
-       } 
-} 
-
-EXPORT_SYMBOL(change_page_attr);
-EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86_64/mm/srat_64.c b/arch/x86_64/mm/srat_64.c

deleted file mode 100644 (file)

index acdf03e..0000000
--- a/arch/x86_64/mm/srat_64.c
+++ /dev/null
@@ -1,566 +0,0 @@
-/*
- * ACPI 3.0 based NUMA setup
- * Copyright 2004 Andi Kleen, SuSE Labs.
- *
- * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
- *
- * Called from acpi_numa_init while reading the SRAT and SLIT tables.
- * Assumes all memory regions belonging to a single proximity domain
- * are in one chunk. Holes between them will be included in the node.
- */
-
-#include <linux/kernel.h>
-#include <linux/acpi.h>
-#include <linux/mmzone.h>
-#include <linux/bitmap.h>
-#include <linux/module.h>
-#include <linux/topology.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <asm/proto.h>
-#include <asm/numa.h>
-#include <asm/e820.h>
-
-int acpi_numa __initdata;
-
-static struct acpi_table_slit *acpi_slit;
-
-static nodemask_t nodes_parsed __initdata;
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode nodes_add[MAX_NUMNODES];
-static int found_add_area __initdata;
-int hotadd_percent __initdata = 0;
-
-/* Too small nodes confuse the VM badly. Usually they result
-   from BIOS bugs. */
-#define NODE_MIN_SIZE (4*1024*1024)
-
-static __init int setup_node(int pxm)
-{
-       return acpi_map_pxm_to_node(pxm);
-}
-
-static __init int conflicting_nodes(unsigned long start, unsigned long end)
-{
-       int i;
-       for_each_node_mask(i, nodes_parsed) {
-               struct bootnode *nd = &nodes[i];
-               if (nd->start == nd->end)
-                       continue;
-               if (nd->end > start && nd->start < end)
-                       return i;
-               if (nd->end == end && nd->start == start)
-                       return i;
-       }
-       return -1;
-}
-
-static __init void cutoff_node(int i, unsigned long start, unsigned long end)
-{
-       struct bootnode *nd = &nodes[i];
-
-       if (found_add_area)
-               return;
-
-       if (nd->start < start) {
-               nd->start = start;
-               if (nd->end < nd->start)
-                       nd->start = nd->end;
-       }
-       if (nd->end > end) {
-               nd->end = end;
-               if (nd->start > nd->end)
-                       nd->start = nd->end;
-       }
-}
-
-static __init void bad_srat(void)
-{
-       int i;
-       printk(KERN_ERR "SRAT: SRAT not used.\n");
-       acpi_numa = -1;
-       found_add_area = 0;
-       for (i = 0; i < MAX_LOCAL_APIC; i++)
-               apicid_to_node[i] = NUMA_NO_NODE;
-       for (i = 0; i < MAX_NUMNODES; i++)
-               nodes_add[i].start = nodes[i].end = 0;
-       remove_all_active_ranges();
-}
-
-static __init inline int srat_disabled(void)
-{
-       return numa_off || acpi_numa < 0;
-}
-
-/*
- * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
- * up the NUMA heuristics which wants the local node to have a smaller
- * distance than the others.
- * Do some quick checks here and only use the SLIT if it passes.
- */
-static __init int slit_valid(struct acpi_table_slit *slit)
-{
-       int i, j;
-       int d = slit->locality_count;
-       for (i = 0; i < d; i++) {
-               for (j = 0; j < d; j++)  {
-                       u8 val = slit->entry[d*i + j];
-                       if (i == j) {
-                               if (val != LOCAL_DISTANCE)
-                                       return 0;
-                       } else if (val <= LOCAL_DISTANCE)
-                               return 0;
-               }
-       }
-       return 1;
-}
-
-/* Callback for SLIT parsing */
-void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
-{
-       if (!slit_valid(slit)) {
-               printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
-               return;
-       }
-       acpi_slit = slit;
-}
-
-/* Callback for Proximity Domain -> LAPIC mapping */
-void __init
-acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
-{
-       int pxm, node;
-       if (srat_disabled())
-               return;
-       if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
-               bad_srat();
-               return;
-       }
-       if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
-               return;
-       pxm = pa->proximity_domain_lo;
-       node = setup_node(pxm);
-       if (node < 0) {
-               printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
-               bad_srat();
-               return;
-       }
-       apicid_to_node[pa->apic_id] = node;
-       acpi_numa = 1;
-       printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
-              pxm, pa->apic_id, node);
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-/*
- * Protect against too large hotadd areas that would fill up memory.
- */
-static int hotadd_enough_memory(struct bootnode *nd)
-{
-       static unsigned long allocated;
-       static unsigned long last_area_end;
-       unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
-       long mem = pages * sizeof(struct page);
-       unsigned long addr;
-       unsigned long allowed;
-       unsigned long oldpages = pages;
-
-       if (mem < 0)
-               return 0;
-       allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
-       allowed = (allowed / 100) * hotadd_percent;
-       if (allocated + mem > allowed) {
-               unsigned long range;
-               /* Give them at least part of their hotadd memory upto hotadd_percent
-                  It would be better to spread the limit out
-                  over multiple hotplug areas, but that is too complicated
-                  right now */
-               if (allocated >= allowed)
-                       return 0;
-               range = allowed - allocated;
-               pages = (range / PAGE_SIZE);
-               mem = pages * sizeof(struct page);
-               nd->end = nd->start + range;
-       }
-       /* Not completely fool proof, but a good sanity check */
-       addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
-       if (addr == -1UL)
-               return 0;
-       if (pages != oldpages)
-               printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
-                       pages << PAGE_SHIFT);
-       last_area_end = addr + mem;
-       allocated += mem;
-       return 1;
-}
-
-static int update_end_of_memory(unsigned long end)
-{
-       found_add_area = 1;
-       if ((end >> PAGE_SHIFT) > end_pfn)
-               end_pfn = end >> PAGE_SHIFT;
-       return 1;
-}
-
-static inline int save_add_info(void)
-{
-       return hotadd_percent > 0;
-}
-#else
-int update_end_of_memory(unsigned long end) {return -1;}
-static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-static inline int save_add_info(void) {return 1;}
-#else
-static inline int save_add_info(void) {return 0;}
-#endif
-#endif
-/*
- * Update nodes_add and decide if to include add are in the zone.
- * Both SPARSE and RESERVE need nodes_add infomation.
- * This code supports one contigious hot add area per node.
- */
-static int reserve_hotadd(int node, unsigned long start, unsigned long end)
-{
-       unsigned long s_pfn = start >> PAGE_SHIFT;
-       unsigned long e_pfn = end >> PAGE_SHIFT;
-       int ret = 0, changed = 0;
-       struct bootnode *nd = &nodes_add[node];
-
-       /* I had some trouble with strange memory hotadd regions breaking
-          the boot. Be very strict here and reject anything unexpected.
-          If you want working memory hotadd write correct SRATs.
-
-          The node size check is a basic sanity check to guard against
-          mistakes */
-       if ((signed long)(end - start) < NODE_MIN_SIZE) {
-               printk(KERN_ERR "SRAT: Hotplug area too small\n");
-               return -1;
-       }
-
-       /* This check might be a bit too strict, but I'm keeping it for now. */
-       if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
-               printk(KERN_ERR
-                       "SRAT: Hotplug area %lu -> %lu has existing memory\n",
-                       s_pfn, e_pfn);
-               return -1;
-       }
-
-       if (!hotadd_enough_memory(&nodes_add[node]))  {
-               printk(KERN_ERR "SRAT: Hotplug area too large\n");
-               return -1;
-       }
-
-       /* Looks good */
-
-       if (nd->start == nd->end) {
-               nd->start = start;
-               nd->end = end;
-               changed = 1;
-       } else {
-               if (nd->start == end) {
-                       nd->start = start;
-                       changed = 1;
-               }
-               if (nd->end == start) {
-                       nd->end = end;
-                       changed = 1;
-               }
-               if (!changed)
-                       printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
-       }
-
-       ret = update_end_of_memory(nd->end);
-
-       if (changed)
-               printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
-       return ret;
-}
-
-/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
-void __init
-acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
-{
-       struct bootnode *nd, oldnode;
-       unsigned long start, end;
-       int node, pxm;
-       int i;
-
-       if (srat_disabled())
-               return;
-       if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
-               bad_srat();
-               return;
-       }
-       if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
-               return;
-
-       if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
-               return;
-       start = ma->base_address;
-       end = start + ma->length;
-       pxm = ma->proximity_domain;
-       node = setup_node(pxm);
-       if (node < 0) {
-               printk(KERN_ERR "SRAT: Too many proximity domains.\n");
-               bad_srat();
-               return;
-       }
-       i = conflicting_nodes(start, end);
-       if (i == node) {
-               printk(KERN_WARNING
-               "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
-                       pxm, start, end, nodes[i].start, nodes[i].end);
-       } else if (i >= 0) {
-               printk(KERN_ERR
-                      "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
-                      pxm, start, end, node_to_pxm(i),
-                       nodes[i].start, nodes[i].end);
-               bad_srat();
-               return;
-       }
-       nd = &nodes[node];
-       oldnode = *nd;
-       if (!node_test_and_set(node, nodes_parsed)) {
-               nd->start = start;
-               nd->end = end;
-       } else {
-               if (start < nd->start)
-                       nd->start = start;
-               if (nd->end < end)
-                       nd->end = end;
-       }
-
-       printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
-              nd->start, nd->end);
-       e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
-                                               nd->end >> PAGE_SHIFT);
-       push_node_boundaries(node, nd->start >> PAGE_SHIFT,
-                                               nd->end >> PAGE_SHIFT);
-
-       if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
-           (reserve_hotadd(node, start, end) < 0)) {
-               /* Ignore hotadd region. Undo damage */
-               printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
-               *nd = oldnode;
-               if ((nd->start | nd->end) == 0)
-                       node_clear(node, nodes_parsed);
-       }
-}
-
-/* Sanity check to catch more bad SRATs (they are amazingly common).
-   Make sure the PXMs cover all memory. */
-static int __init nodes_cover_memory(const struct bootnode *nodes)
-{
-       int i;
-       unsigned long pxmram, e820ram;
-
-       pxmram = 0;
-       for_each_node_mask(i, nodes_parsed) {
-               unsigned long s = nodes[i].start >> PAGE_SHIFT;
-               unsigned long e = nodes[i].end >> PAGE_SHIFT;
-               pxmram += e - s;
-               pxmram -= absent_pages_in_range(s, e);
-               if ((long)pxmram < 0)
-                       pxmram = 0;
-       }
-
-       e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
-       /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
-       if ((long)(e820ram - pxmram) >= 1*1024*1024) {
-               printk(KERN_ERR
-       "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
-                       (pxmram << PAGE_SHIFT) >> 20,
-                       (e820ram << PAGE_SHIFT) >> 20);
-               return 0;
-       }
-       return 1;
-}
-
-static void unparse_node(int node)
-{
-       int i;
-       node_clear(node, nodes_parsed);
-       for (i = 0; i < MAX_LOCAL_APIC; i++) {
-               if (apicid_to_node[i] == node)
-                       apicid_to_node[i] = NUMA_NO_NODE;
-       }
-}
-
-void __init acpi_numa_arch_fixup(void) {}
-
-/* Use the information discovered above to actually set up the nodes. */
-int __init acpi_scan_nodes(unsigned long start, unsigned long end)
-{
-       int i;
-
-       if (acpi_numa <= 0)
-               return -1;
-
-       /* First clean up the node list */
-       for (i = 0; i < MAX_NUMNODES; i++) {
-               cutoff_node(i, start, end);
-               if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
-                       unparse_node(i);
-                       node_set_offline(i);
-               }
-       }
-
-       if (!nodes_cover_memory(nodes)) {
-               bad_srat();
-               return -1;
-       }
-
-       memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
-       if (memnode_shift < 0) {
-               printk(KERN_ERR
-                    "SRAT: No NUMA node hash function found. Contact maintainer\n");
-               bad_srat();
-               return -1;
-       }
-
-       node_possible_map = nodes_parsed;
-
-       /* Finally register nodes */
-       for_each_node_mask(i, node_possible_map)
-               setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-       /* Try again in case setup_node_bootmem missed one due
-          to missing bootmem */
-       for_each_node_mask(i, node_possible_map)
-               if (!node_online(i))
-                       setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-
-       for (i = 0; i < NR_CPUS; i++) {
-               if (cpu_to_node[i] == NUMA_NO_NODE)
-                       continue;
-               if (!node_isset(cpu_to_node[i], node_possible_map))
-                       numa_set_node(i, NUMA_NO_NODE);
-       }
-       numa_init_array();
-       return 0;
-}
-
-#ifdef CONFIG_NUMA_EMU
-static int __init find_node_by_addr(unsigned long addr)
-{
-       int ret = NUMA_NO_NODE;
-       int i;
-
-       for_each_node_mask(i, nodes_parsed) {
-               /*
-                * Find the real node that this emulated node appears on.  For
-                * the sake of simplicity, we only use a real node's starting
-                * address to determine which emulated node it appears on.
-                */
-               if (addr >= nodes[i].start && addr < nodes[i].end) {
-                       ret = i;
-                       break;
-               }
-       }
-       return i;
-}
-
-/*
- * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
- * mappings that respect the real ACPI topology but reflect our emulated
- * environment.  For each emulated node, we find which real node it appears on
- * and create PXM to NID mappings for those fake nodes which mirror that
- * locality.  SLIT will now represent the correct distances between emulated
- * nodes as a result of the real topology.
- */
-void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
-{
-       int i, j;
-       int fake_node_to_pxm_map[MAX_NUMNODES] = {
-               [0 ... MAX_NUMNODES-1] = PXM_INVAL
-       };
-       unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
-               [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-       };
-
-       printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
-                        "topology.\n");
-       for (i = 0; i < num_nodes; i++) {
-               int nid, pxm;
-
-               nid = find_node_by_addr(fake_nodes[i].start);
-               if (nid == NUMA_NO_NODE)
-                       continue;
-               pxm = node_to_pxm(nid);
-               if (pxm == PXM_INVAL)
-                       continue;
-               fake_node_to_pxm_map[i] = pxm;
-               /*
-                * For each apicid_to_node mapping that exists for this real
-                * node, it must now point to the fake node ID.
-                */
-               for (j = 0; j < MAX_LOCAL_APIC; j++)
-                       if (apicid_to_node[j] == nid)
-                               fake_apicid_to_node[j] = i;
-       }
-       for (i = 0; i < num_nodes; i++)
-               __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
-       memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
-
-       nodes_clear(nodes_parsed);
-       for (i = 0; i < num_nodes; i++)
-               if (fake_nodes[i].start != fake_nodes[i].end)
-                       node_set(i, nodes_parsed);
-       WARN_ON(!nodes_cover_memory(fake_nodes));
-}
-
-static int null_slit_node_compare(int a, int b)
-{
-       return node_to_pxm(a) == node_to_pxm(b);
-}
-#else
-static int null_slit_node_compare(int a, int b)
-{
-       return a == b;
-}
-#endif /* CONFIG_NUMA_EMU */
-
-void __init srat_reserve_add_area(int nodeid)
-{
-       if (found_add_area && nodes_add[nodeid].end) {
-               u64 total_mb;
-
-               printk(KERN_INFO "SRAT: Reserving hot-add memory space "
-                               "for node %d at %Lx-%Lx\n",
-                       nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
-               total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
-                                       >> PAGE_SHIFT;
-               total_mb *= sizeof(struct page);
-               total_mb >>= 20;
-               printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
-                               "pre-allocated memory.\n", (unsigned long long)total_mb);
-               reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
-                              nodes_add[nodeid].end - nodes_add[nodeid].start);
-       }
-}
-
-int __node_distance(int a, int b)
-{
-       int index;
-
-       if (!acpi_slit)
-               return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
-                                                     REMOTE_DISTANCE;
-       index = acpi_slit->locality_count * node_to_pxm(a);
-       return acpi_slit->entry[index + node_to_pxm(b)];
-}
-
-EXPORT_SYMBOL(__node_distance);
-
-int memory_add_physaddr_to_nid(u64 start)
-{
-       int i, ret = 0;
-
-       for_each_node(i)
-               if (nodes_add[i].start <= start && nodes_add[i].end > start)
-                       ret = i;
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-
author	Thomas Gleixner <tglx@linutronix.de>
	Thu, 11 Oct 2007 09:17:18 +0000 (11:17 +0200)
committer	Thomas Gleixner <tglx@linutronix.de>
	Thu, 11 Oct 2007 09:17:18 +0000 (11:17 +0200)
arch/x86/mm/Makefile		patch \| blob \| blame \| history
arch/x86/mm/Makefile_64	[new file with mode: 0644]	patch \| blob
arch/x86/mm/extable_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/mm/fault_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/mm/init_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/mm/ioremap_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/mm/k8topology_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/mm/mmap_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/mm/numa_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/mm/pageattr_64.c	[new file with mode: 0644]	patch \| blob
arch/x86/mm/srat_64.c	[new file with mode: 0644]	patch \| blob
arch/x86_64/Makefile		patch \| blob \| blame \| history
arch/x86_64/mm/Makefile	[deleted file]	patch \| blob \| blame \| history
arch/x86_64/mm/Makefile_64	[deleted file]	patch \| blob \| blame \| history
arch/x86_64/mm/extable_64.c	[deleted file]	patch \| blob \| blame \| history
arch/x86_64/mm/fault_64.c	[deleted file]	patch \| blob \| blame \| history
arch/x86_64/mm/init_64.c	[deleted file]	patch \| blob \| blame \| history
arch/x86_64/mm/ioremap_64.c	[deleted file]	patch \| blob \| blame \| history
arch/x86_64/mm/k8topology_64.c	[deleted file]	patch \| blob \| blame \| history
arch/x86_64/mm/mmap_64.c	[deleted file]	patch \| blob \| blame \| history
arch/x86_64/mm/numa_64.c	[deleted file]	patch \| blob \| blame \| history
arch/x86_64/mm/pageattr_64.c	[deleted file]	patch \| blob \| blame \| history
arch/x86_64/mm/srat_64.c	[deleted file]	patch \| blob \| blame \| history