.quad 1b,ia32_badarg
.previous
GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,threadinfo_status(%r10)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ orl $TS_COMPAT,TI_status(%r10)
---------- ---- ---- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
---------- ---- ---- TI_flags(%r10)
+++++++++++++++ ++++ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
CFI_REMEMBER_STATE
jnz sysenter_tracesys
--------------- ----sysenter_do_call:
cmpl $(IA32_NR_syscalls-1),%eax
ja ia32_badsys
+++++++++++++++ ++++sysenter_do_call:
IA32_ARG_FIXUP 1
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
.quad 1b,ia32_badarg
.previous
GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,threadinfo_status(%r10)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ orl $TS_COMPAT,TI_status(%r10)
---------- ---- ---- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
---------- ---- ---- TI_flags(%r10)
+++++++++++++++ ++++ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
CFI_REMEMBER_STATE
jnz cstar_tracesys
cstar_do_call:
/*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
/*CFI_REL_OFFSET cs,CS-RIP*/
CFI_REL_OFFSET rip,RIP-RIP
- swapgs
+++++++++++++++++++ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ SWAPGS
/*
* No need to follow this irqs on/off section: the syscall
* disabled irqs and here we enable it straight after entry:
 * this could be a problem. */
SAVE_ARGS 0,0,1
GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,threadinfo_status(%r10)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ orl $TS_COMPAT,TI_status(%r10)
---------- ---- ---- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
---------- ---- ---- TI_flags(%r10)
+++++++++++++++ ++++ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
jnz ia32_tracesys
ia32_do_syscall:
cmpl $(IA32_NR_syscalls-1),%eax
--- /dev/null
------- -- ---------#include <asm/gart.h>
+ /*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ * Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+ #include <linux/pci.h>
+ #include <linux/gfp.h>
+ #include <linux/bitops.h>
+ #include <linux/scatterlist.h>
+ #include <linux/iommu-helper.h>
+ #include <asm/proto.h>
- -------- ---------struct command {
+++++++ ++++++++++++#include <asm/iommu.h>
+ #include <asm/amd_iommu_types.h>
+ #include <asm/amd_iommu.h>
+
+ #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
+
+ #define to_pages(addr, size) \
+ (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
+
+ ++++++++++++++++++#define EXIT_LOOP_COUNT 10000000
+ ++++++++++++++++++
+ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
+
- -------- ---------static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * general struct to manage commands send to an IOMMU
+ ++++++++++++++++++ */
+ ++++++++++++++++++struct iommu_cmd {
+ u32 data[4];
+ };
+
+ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
+ struct unity_map_entry *e);
+
+ ++++++++++++++++++/* returns !0 if the IOMMU is caching non-present entries in its TLB */
+ static int iommu_has_npcache(struct amd_iommu *iommu)
+ {
+ return iommu->cap & IOMMU_CAP_NPCACHE;
+ }
+
- -------- ---------static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+ ++++++++++++++++++/****************************************************************************
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * IOMMU command queuing functions
+ ++++++++++++++++++ *
+ ++++++++++++++++++ ****************************************************************************/
+ ++++++++++++++++++
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Writes the command to the IOMMUs command buffer and informs the
+ ++++++++++++++++++ * hardware about the new command. Must be called with iommu->lock held.
+ ++++++++++++++++++ */
+ ++++++++++++++++++static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
+ {
+ u32 tail, head;
+ u8 *target;
+
+ tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+ target = (iommu->cmd_buf + tail);
+ memcpy_toio(target, cmd, sizeof(*cmd));
+ tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
+ head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ if (tail == head)
+ return -ENOMEM;
+ writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+ return 0;
+ }
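
The head/tail handling in __iommu_queue_command() above is ordinary ring-buffer arithmetic over the MMIO-visible command buffer. A minimal user-space sketch of the same wrap-around logic (buffer geometry and names are illustrative, not the hardware's):

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    #define ENTRY_SIZE 16                    /* one command is 4 x u32 */
    #define BUF_SIZE   (512 * ENTRY_SIZE)    /* assumed 512-entry ring */

    static uint8_t buf[BUF_SIZE];
    static uint32_t head, tail;              /* byte offsets, like the MMIO registers */

    /* returns -1 when the ring would become full, 0 on success */
    static int queue_entry(const void *cmd)
    {
            uint32_t new_tail = (tail + ENTRY_SIZE) % BUF_SIZE;

            if (new_tail == head)            /* producer would catch up with consumer */
                    return -1;
            memcpy(buf + tail, cmd, ENTRY_SIZE);
            tail = new_tail;                 /* the driver writes this to the tail register */
            return 0;
    }

    int main(void)
    {
            uint32_t cmd[4] = { 0, 0, 0, 0 };
            int i, queued = 0;

            for (i = 0; i < 600; ++i)        /* a 512-entry ring holds at most 511 commands */
                    if (queue_entry(cmd) == 0)
                            ++queued;
            printf("queued %d commands before the ring filled up\n", queued);
            return 0;
    }
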
+
- -------- --------- struct command cmd;
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * General queuing function for commands. Takes iommu->lock and calls
+ ++++++++++++++++++ * __iommu_queue_command().
+ ++++++++++++++++++ */
+ ++++++++++++++++++static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
+ {
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ ret = __iommu_queue_command(iommu, cmd);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ return ret;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * This function is called whenever we need to ensure that the IOMMU has
+ ++++++++++++++++++ * completed execution of all commands we sent. It sends a
+ ++++++++++++++++++ * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
+ ++++++++++++++++++ * us about that by writing a value to a physical address we pass with
+ ++++++++++++++++++ * the command.
+ ++++++++++++++++++ */
+ static int iommu_completion_wait(struct amd_iommu *iommu)
+ {
+ int ret;
- -------- --------- cmd.data[1] = HIGH_U32(ready_phys);
+ ++++++++++++++++++ struct iommu_cmd cmd;
+ volatile u64 ready = 0;
+ unsigned long ready_phys = virt_to_phys(&ready);
+ ++++++++++++++++++ unsigned long i = 0;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
- -------- --------- while (!ready)
+ ++++++++++++++++++ cmd.data[1] = upper_32_bits(ready_phys);
+ cmd.data[2] = 1; /* value written to 'ready' */
+ CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
+
+ iommu->need_sync = 0;
+
+ ret = iommu_queue_command(iommu, &cmd);
+
+ if (ret)
+ return ret;
+
- -------- --------- struct command cmd;
+ ++++++++++++++++++ while (!ready && (i < EXIT_LOOP_COUNT)) {
+ ++++++++++++++++++ ++i;
+ cpu_relax();
+ ++++++++++++++++++ }
+ ++++++++++++++++++
+ ++++++++++++++++++ if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
+ ++++++++++++++++++ printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
+
+ return 0;
+ }
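
The interesting detail of the completion-wait command above is how the 64 bit semaphore address and the command type are packed into the four 32 bit command words. A stand-alone sketch of that packing; the flag and type values below are placeholders, the real masks live in amd_iommu_types.h:

    #include <stdint.h>
    #include <stdio.h>

    #define STORE_FLAG 0x01u   /* placeholder for CMD_COMPL_WAIT_STORE_MASK */
    #define CMD_TYPE   0x01u   /* placeholder for CMD_COMPL_WAIT */

    int main(void)
    {
            uint64_t ready_phys = 0x123456789abcULL;      /* made-up physical address */
            uint32_t data[4] = { 0, 0, 0, 0 };

            data[0]  = (uint32_t)ready_phys | STORE_FLAG; /* LOW_U32() plus the store flag */
            data[1]  = (uint32_t)(ready_phys >> 32);      /* upper_32_bits() */
            data[1] |= CMD_TYPE << 28;                    /* CMD_SET_TYPE(): type in bits 28..31 */
            data[2]  = 1;                                 /* value the IOMMU writes back */

            printf("data[0]=%08x data[1]=%08x data[2]=%08x\n", data[0], data[1], data[2]);
            return 0;
    }
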
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Command send function for invalidating a device table entry
+ ++++++++++++++++++ */
+ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
+ {
- -------- --------- struct command cmd;
+ ++++++++++++++++++ struct iommu_cmd cmd;
+
+ BUG_ON(iommu == NULL);
+
+ memset(&cmd, 0, sizeof(cmd));
+ CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
+ cmd.data[0] = devid;
+
+ iommu->need_sync = 1;
+
+ return iommu_queue_command(iommu, &cmd);
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Generic command send function for invalidating TLB entries
+ ++++++++++++++++++ */
+ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
+ u64 address, u16 domid, int pde, int s)
+ {
- -------- --------- cmd.data[3] = HIGH_U32(address);
- -------- --------- if (s)
+ ++++++++++++++++++ struct iommu_cmd cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+ address &= PAGE_MASK;
+ CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
+ cmd.data[1] |= domid;
+ cmd.data[2] = LOW_U32(address);
- -------- --------- if (pde)
+ ++++++++++++++++++ cmd.data[3] = upper_32_bits(address);
+ ++++++++++++++++++ if (s) /* size bit - we flush more than one 4kb page */
+ cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
- -------- --------- _bdf = (pcidev->bus->number << 8) | pcidev->devfn;
+ ++++++++++++++++++ if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
+ cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+
+ iommu->need_sync = 1;
+
+ return iommu_queue_command(iommu, &cmd);
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * TLB invalidation function which is called from the mapping functions.
+ ++++++++++++++++++ * It invalidates a single PTE if the range to flush is within a single
+ ++++++++++++++++++ * page. Otherwise it flushes the whole TLB of the IOMMU.
+ ++++++++++++++++++ */
+ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
+ u64 address, size_t size)
+ {
+ int s = 0;
+ unsigned pages = to_pages(address, size);
+
+ address &= PAGE_MASK;
+
+ if (pages > 1) {
+ /*
+ * If we have to flush more than one page, flush all
+ * TLB entries for this domain
+ */
+ address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+ s = 1;
+ }
+
+ iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
+
+ return 0;
+ }
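
A quick stand-alone illustration of the to_pages() arithmetic and the single-page versus whole-domain flush decision used above, assuming 4 KiB pages:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    /* same arithmetic as the to_pages() macro above */
    static unsigned long to_pages(unsigned long addr, unsigned long size)
    {
            unsigned long offset = addr & ~PAGE_MASK;               /* offset in the first page */

            return (offset + size + PAGE_SIZE - 1) >> PAGE_SHIFT;   /* round up to whole pages */
    }

    int main(void)
    {
            /* 0x100 bytes into a page, 0x2000 bytes long -> touches 3 pages */
            unsigned long pages = to_pages(0x1000100UL, 0x2000UL);

            if (pages > 1)
                    printf("%lu pages: flush all TLB entries of the domain\n", pages);
            else
                    printf("one page: flush a single PTE\n");
            return 0;
    }
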
+
+ ++++++++++++++++++/****************************************************************************
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * The functions below are used to create the page table mappings for
+ ++++++++++++++++++ * unity mapped regions.
+ ++++++++++++++++++ *
+ ++++++++++++++++++ ****************************************************************************/
+ ++++++++++++++++++
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Generic mapping functions. It maps a physical address into a DMA
+ ++++++++++++++++++ * address space. It allocates the page table pages if necessary.
+ ++++++++++++++++++ * In the future it can be extended to a generic mapping function
+ ++++++++++++++++++ * supporting all features of AMD IOMMU page tables like level skipping
+ ++++++++++++++++++ * and full 64 bit address spaces.
+ ++++++++++++++++++ */
+ static int iommu_map(struct protection_domain *dom,
+ unsigned long bus_addr,
+ unsigned long phys_addr,
+ int prot)
+ {
+ u64 __pte, *pte, *page;
+
+ bus_addr = PAGE_ALIGN(bus_addr);
+                            phys_addr = PAGE_ALIGN(phys_addr);
+
+ /* only support 512GB address spaces for now */
+ if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
+ return -EINVAL;
+
+ pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
+
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ *pte = IOMMU_L2_PDE(virt_to_phys(page));
+ }
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ *pte = IOMMU_L1_PDE(virt_to_phys(page));
+ }
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+
+ if (IOMMU_PTE_PRESENT(*pte))
+ return -EBUSY;
+
+ __pte = phys_addr | IOMMU_PTE_P;
+ if (prot & IOMMU_PROT_IR)
+ __pte |= IOMMU_PTE_IR;
+ if (prot & IOMMU_PROT_IW)
+ __pte |= IOMMU_PTE_IW;
+
+ *pte = __pte;
+
+ return 0;
+ }
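
The IOMMU_PTE_Lx_INDEX() macros used in iommu_map() select one 9 bit slice of the IO virtual address per level, which is what gives the 512 GB limit checked above (3 levels x 9 bits + 12 bits page offset = 39 bits). A sketch with assumed shift values:

    #include <stdint.h>
    #include <stdio.h>

    /* assumed layout: 4 KiB pages and 512-entry tables, i.e. 9 bits per level */
    #define L2_INDEX(a) (((a) >> 30) & 0x1ffULL)
    #define L1_INDEX(a) (((a) >> 21) & 0x1ffULL)
    #define L0_INDEX(a) (((a) >> 12) & 0x1ffULL)

    int main(void)
    {
            uint64_t bus_addr = 0x12345678000ULL;   /* example IO virtual address */

            printf("L2=%llu L1=%llu L0=%llu\n",
                   (unsigned long long)L2_INDEX(bus_addr),
                   (unsigned long long)L1_INDEX(bus_addr),
                   (unsigned long long)L0_INDEX(bus_addr));
            return 0;
    }
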
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * This function checks if a specific unity mapping entry is needed for
+ ++++++++++++++++++ * this specific IOMMU.
+ ++++++++++++++++++ */
+ static int iommu_for_unity_map(struct amd_iommu *iommu,
+ struct unity_map_entry *entry)
+ {
+ u16 bdf, i;
+
+ for (i = entry->devid_start; i <= entry->devid_end; ++i) {
+ bdf = amd_iommu_alias_table[i];
+ if (amd_iommu_rlookup_table[bdf] == iommu)
+ return 1;
+ }
+
+ return 0;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Init the unity mappings for a specific IOMMU in the system
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * Basically iterates over all unity mapping entries and applies them to
+ ++++++++++++++++++ * the default domain DMA of that IOMMU if necessary.
+ ++++++++++++++++++ */
+ static int iommu_init_unity_mappings(struct amd_iommu *iommu)
+ {
+ struct unity_map_entry *entry;
+ int ret;
+
+ list_for_each_entry(entry, &amd_iommu_unity_map, list) {
+ if (!iommu_for_unity_map(iommu, entry))
+ continue;
+ ret = dma_ops_unity_map(iommu->default_dom, entry);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * This function actually applies the mapping to the page table of the
+ ++++++++++++++++++ * dma_ops domain.
+ ++++++++++++++++++ */
+ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
+ struct unity_map_entry *e)
+ {
+ u64 addr;
+ int ret;
+
+ for (addr = e->address_start; addr < e->address_end;
+ addr += PAGE_SIZE) {
+ ret = iommu_map(&dma_dom->domain, addr, addr, e->prot);
+ if (ret)
+ return ret;
+ /*
+ * if unity mapping is in aperture range mark the page
+ * as allocated in the aperture
+ */
+ if (addr < dma_dom->aperture_size)
+ __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
+ }
+
+ return 0;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Inits the unity mappings required for a specific device
+ ++++++++++++++++++ */
+ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
+ u16 devid)
+ {
+ struct unity_map_entry *e;
+ int ret;
+
+ list_for_each_entry(e, &amd_iommu_unity_map, list) {
+ if (!(devid >= e->devid_start && devid <= e->devid_end))
+ continue;
+ ret = dma_ops_unity_map(dma_dom, e);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+ }
+
+ ++++++++++++++++++/****************************************************************************
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * The next functions belong to the address allocator for the dma_ops
+ ++++++++++++++++++ * interface functions. They work like the allocators in the other IOMMU
+ ++++++++++++++++++ * drivers. It's basically a bitmap which marks the allocated pages in
+ ++++++++++++++++++ * the aperture. Maybe it could be enhanced in the future to a more
+ ++++++++++++++++++ * efficient allocator.
+ ++++++++++++++++++ *
+ ++++++++++++++++++ ****************************************************************************/
+ static unsigned long dma_mask_to_pages(unsigned long mask)
+ {
+ return (mask >> PAGE_SHIFT) +
+ (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * The address allocator core function.
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * called with domain->lock held
+ ++++++++++++++++++ */
+ static unsigned long dma_ops_alloc_addresses(struct device *dev,
+ struct dma_ops_domain *dom,
+ unsigned int pages)
+ {
+ unsigned long limit = dma_mask_to_pages(*dev->dma_mask);
+ unsigned long address;
+ unsigned long size = dom->aperture_size >> PAGE_SHIFT;
+ unsigned long boundary_size;
+
+ boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+ PAGE_SIZE) >> PAGE_SHIFT;
+ limit = limit < size ? limit : size;
+
+ if (dom->next_bit >= limit)
+ dom->next_bit = 0;
+
+ address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
+ 0 , boundary_size, 0);
+ if (address == -1)
+ address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
+ 0, boundary_size, 0);
+
+ if (likely(address != -1)) {
+ dom->next_bit = address + pages;
+ address <<= PAGE_SHIFT;
+ } else
+ address = bad_dma_address;
+
+ WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
+
+ return address;
+ }
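
iommu_area_alloc() from the iommu-helper library does the actual bitmap search; conceptually it is a first-fit allocator that starts at a remembered position and wraps around once, roughly like this simplified sketch (one byte per page instead of one bit, no segment boundary handling):

    #include <stdio.h>
    #include <string.h>

    #define APERTURE_PAGES 64                       /* tiny aperture for the sketch */

    static unsigned char bitmap[APERTURE_PAGES];    /* 0 = page free, 1 = allocated */
    static unsigned long next_bit;

    /* first-fit search for a run of 'pages' free pages, starting at 'start' */
    static long alloc_range(unsigned long start, unsigned long pages)
    {
            unsigned long i, run = 0;

            for (i = start; i < APERTURE_PAGES; ++i) {
                    run = bitmap[i] ? 0 : run + 1;
                    if (run == pages) {
                            unsigned long first = i - pages + 1;

                            memset(bitmap + first, 1, pages);
                            return (long)first;
                    }
            }
            return -1;
    }

    static long alloc_addresses(unsigned long pages)
    {
            long page = alloc_range(next_bit, pages);

            if (page == -1)                 /* wrap around and retry from the start */
                    page = alloc_range(0, pages);
            if (page != -1)
                    next_bit = page + pages;
            return page;                    /* page number, or -1 for "bad address" */
    }

    int main(void)
    {
            printf("first allocation at page %ld\n", alloc_addresses(4));
            printf("second allocation at page %ld\n", alloc_addresses(8));
            return 0;
    }
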
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * The address free function.
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * called with domain->lock held
+ ++++++++++++++++++ */
+ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
+ unsigned long address,
+ unsigned int pages)
+ {
+ address >>= PAGE_SHIFT;
+ iommu_area_free(dom->bitmap, address, pages);
+ }
+
+ ++++++++++++++++++/****************************************************************************
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * The next functions belong to the domain allocation. A domain is
+ ++++++++++++++++++ * allocated for every IOMMU as the default domain. If device isolation
+ ++++++++++++++++++ * is enabled, every device gets its own domain. The most important thing
+ ++++++++++++++++++ * about domains is the page table mapping the DMA address space they
+ ++++++++++++++++++ * contain.
+ ++++++++++++++++++ *
+ ++++++++++++++++++ ****************************************************************************/
+ ++++++++++++++++++
+ static u16 domain_id_alloc(void)
+ {
+ unsigned long flags;
+ int id;
+
+ write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
+ BUG_ON(id == 0);
+ if (id > 0 && id < MAX_DOMAIN_ID)
+ __set_bit(id, amd_iommu_pd_alloc_bitmap);
+ else
+ id = 0;
+ write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+ return id;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Used to reserve address ranges in the aperture (e.g. for exclusion
+ ++++++++++++++++++ * ranges).
+ ++++++++++++++++++ */
+ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+ unsigned long start_page,
+ unsigned int pages)
+ {
+ unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
+
+ if (start_page + pages > last_page)
+ pages = last_page - start_page;
+
+ set_bit_string(dom->bitmap, start_page, pages);
+ }
+
+ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
+ {
+ int i, j;
+ u64 *p1, *p2, *p3;
+
+ p1 = dma_dom->domain.pt_root;
+
+ if (!p1)
+ return;
+
+ for (i = 0; i < 512; ++i) {
+ if (!IOMMU_PTE_PRESENT(p1[i]))
+ continue;
+
+ p2 = IOMMU_PTE_PAGE(p1[i]);
+                                    for (j = 0; j < 512; ++j) {
+ if (!IOMMU_PTE_PRESENT(p2[j]))
+ continue;
+ p3 = IOMMU_PTE_PAGE(p2[j]);
+ free_page((unsigned long)p3);
+ }
+
+ free_page((unsigned long)p2);
+ }
+
+ free_page((unsigned long)p1);
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Free a domain, only used if something went wrong in the
+ ++++++++++++++++++ * allocation path and we need to free an already allocated page table
+ ++++++++++++++++++ */
+ static void dma_ops_domain_free(struct dma_ops_domain *dom)
+ {
+ if (!dom)
+ return;
+
+ dma_ops_free_pagetable(dom);
+
+ kfree(dom->pte_pages);
+
+ kfree(dom->bitmap);
+
+ kfree(dom);
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Allocates a new protection domain usable for the dma_ops functions.
+ ++++++++++++++++++ * It also initializes the page table and the address allocator data
+ ++++++++++++++++++ * structures required for the dma_ops interface
+ ++++++++++++++++++ */
+ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
+ unsigned order)
+ {
+ struct dma_ops_domain *dma_dom;
+ unsigned i, num_pte_pages;
+ u64 *l2_pde;
+ u64 address;
+
+ /*
+ * Currently the DMA aperture must be between 32 MB and 1GB in size
+ */
+ if ((order < 25) || (order > 30))
+ return NULL;
+
+ dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
+ if (!dma_dom)
+ return NULL;
+
+ spin_lock_init(&dma_dom->domain.lock);
+
+ dma_dom->domain.id = domain_id_alloc();
+ if (dma_dom->domain.id == 0)
+ goto free_dma_dom;
+ dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
+ dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+ dma_dom->domain.priv = dma_dom;
+ if (!dma_dom->domain.pt_root)
+ goto free_dma_dom;
+ dma_dom->aperture_size = (1ULL << order);
+ dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
+ GFP_KERNEL);
+ if (!dma_dom->bitmap)
+ goto free_dma_dom;
+ /*
+ * mark the first page as allocated so we never return 0 as
+                             * a valid dma-address and can use 0 as an error value
+ */
+ dma_dom->bitmap[0] = 1;
+ dma_dom->next_bit = 0;
+
+ ++++++++++++++++++ /* Initialize the exclusion range if necessary */
+ if (iommu->exclusion_start &&
+ iommu->exclusion_start < dma_dom->aperture_size) {
+ unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
+ int pages = to_pages(iommu->exclusion_start,
+ iommu->exclusion_length);
+ dma_ops_reserve_addresses(dma_dom, startpage, pages);
+ }
+
+ ++++++++++++++++++ /*
+ ++++++++++++++++++ * At the last step, build the page tables so we don't need to
+ ++++++++++++++++++ * allocate page table pages in the dma_ops mapping/unmapping
+ ++++++++++++++++++ * path.
+ ++++++++++++++++++ */
+ num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
+ dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
+ GFP_KERNEL);
+ if (!dma_dom->pte_pages)
+ goto free_dma_dom;
+
+ l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (l2_pde == NULL)
+ goto free_dma_dom;
+
+ dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
+
+ for (i = 0; i < num_pte_pages; ++i) {
+ dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (!dma_dom->pte_pages[i])
+ goto free_dma_dom;
+ address = virt_to_phys(dma_dom->pte_pages[i]);
+ l2_pde[i] = IOMMU_L1_PDE(address);
+ }
+
+ return dma_dom;
+
+ free_dma_dom:
+ dma_ops_domain_free(dma_dom);
+
+ return NULL;
+ }
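
For a feel of the sizes involved in dma_ops_domain_alloc(), the arithmetic with the default aperture order of 26 used elsewhere in this patch:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
            unsigned order = 26;                        /* default amd_iommu_aperture_order */
            unsigned long aperture = 1UL << order;      /* 64 MiB aperture */
            unsigned long bitmap_bytes = aperture / (PAGE_SIZE * 8);
            unsigned long pte_pages = aperture / (PAGE_SIZE * 512);

            printf("aperture %lu MiB, allocator bitmap %lu bytes, %lu pre-allocated PTE pages\n",
                   aperture >> 20, bitmap_bytes, pte_pages);
            return 0;
    }
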
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Find out the protection domain structure for a given PCI device. This
+ ++++++++++++++++++ * will give us the pointer to the page table root for example.
+ ++++++++++++++++++ */
+ static struct protection_domain *domain_for_device(u16 devid)
+ {
+ struct protection_domain *dom;
+ unsigned long flags;
+
+ read_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ dom = amd_iommu_pd_table[devid];
+ read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+ return dom;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * If a device is not yet associated with a domain, this function
+ ++++++++++++++++++ * assigns it to one and makes the association visible to the hardware
+ ++++++++++++++++++ */
+ static void set_device_domain(struct amd_iommu *iommu,
+ struct protection_domain *domain,
+ u16 devid)
+ {
+ unsigned long flags;
+
+ u64 pte_root = virt_to_phys(domain->pt_root);
+
+ pte_root |= (domain->mode & 0x07) << 9;
+ pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2;
+
+ write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ amd_iommu_dev_table[devid].data[0] = pte_root;
+ amd_iommu_dev_table[devid].data[1] = pte_root >> 32;
+ amd_iommu_dev_table[devid].data[2] = domain->id;
+
+ amd_iommu_pd_table[devid] = domain;
+ write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+ iommu_queue_inv_dev_entry(iommu, devid);
+
+ iommu->need_sync = 1;
+ }
+
+ ++++++++++++++++++/*****************************************************************************
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * The next functions belong to the dma_ops mapping/unmapping code.
+ ++++++++++++++++++ *
+ ++++++++++++++++++ *****************************************************************************/
+ ++++++++++++++++++
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * In the dma_ops path we only have the struct device. This function
+ ++++++++++++++++++ * finds the corresponding IOMMU, the protection domain and the
+ ++++++++++++++++++ * requestor id for a given device.
+ ++++++++++++++++++ * If the device is not yet associated with a domain this is also done
+ ++++++++++++++++++ * in this function.
+ ++++++++++++++++++ */
+ static int get_device_resources(struct device *dev,
+ struct amd_iommu **iommu,
+ struct protection_domain **domain,
+ u16 *bdf)
+ {
+ struct dma_ops_domain *dma_dom;
+ struct pci_dev *pcidev;
+ u16 _bdf;
+
+ BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
+
+ pcidev = to_pci_dev(dev);
+ ++++++++++++++++++ _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
+
+ ++++++++++++++++++ /* device not translated by any IOMMU in the system? */
+ if (_bdf >= amd_iommu_last_bdf) {
+ *iommu = NULL;
+ *domain = NULL;
+ *bdf = 0xffff;
+ return 0;
+ }
+
+ *bdf = amd_iommu_alias_table[_bdf];
+
+ *iommu = amd_iommu_rlookup_table[*bdf];
+ if (*iommu == NULL)
+ return 0;
+ dma_dom = (*iommu)->default_dom;
+ *domain = domain_for_device(*bdf);
+ if (*domain == NULL) {
+ *domain = &dma_dom->domain;
+ set_device_domain(*iommu, *domain, *bdf);
+ printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
+ "device ", (*domain)->id);
+ print_devid(_bdf, 1);
+ }
+
+ return 1;
+ }
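
The requestor id used to index the alias, rlookup and device tables is the usual 16 bit PCI encoding of bus and devfn; calc_devid() and the PCI_BUS()/PCI_SLOT()/PCI_FUNC() uses later in this patch reduce to the following bit arithmetic (standard PCI encoding assumed):

    #include <stdio.h>

    /* 16 bit requestor id: 8 bit bus, 5 bit device, 3 bit function */
    static unsigned calc_devid(unsigned bus, unsigned devfn)
    {
            return (bus << 8) | devfn;
    }

    int main(void)
    {
            unsigned devid = calc_devid(0x02, (0x1f << 3) | 0x1);   /* device 02:1f.1 */

            printf("devid 0x%04x -> bus %02x slot %02x func %x\n",
                   devid, devid >> 8, (devid >> 3) & 0x1f, devid & 0x7);
            return 0;
    }
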
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * This is the generic map function. It maps one 4kb page at paddr to
+ ++++++++++++++++++ * the given address in the DMA address space for the domain.
+ ++++++++++++++++++ */
+ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
+ struct dma_ops_domain *dom,
+ unsigned long address,
+ phys_addr_t paddr,
+ int direction)
+ {
+ u64 *pte, __pte;
+
+ WARN_ON(address > dom->aperture_size);
+
+ paddr &= PAGE_MASK;
+
+ pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+ pte += IOMMU_PTE_L0_INDEX(address);
+
+ __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
+
+ if (direction == DMA_TO_DEVICE)
+ __pte |= IOMMU_PTE_IR;
+ else if (direction == DMA_FROM_DEVICE)
+ __pte |= IOMMU_PTE_IW;
+ else if (direction == DMA_BIDIRECTIONAL)
+ __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
+
+ WARN_ON(*pte);
+
+ *pte = __pte;
+
+ return (dma_addr_t)address;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * The generic unmapping function for one page in the DMA address space.
+ ++++++++++++++++++ */
+ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
+ struct dma_ops_domain *dom,
+ unsigned long address)
+ {
+ u64 *pte;
+
+ if (address >= dom->aperture_size)
+ return;
+
+ WARN_ON(address & 0xfffULL || address > dom->aperture_size);
+
+ pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+ pte += IOMMU_PTE_L0_INDEX(address);
+
+ WARN_ON(!*pte);
+
+ *pte = 0ULL;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * This function contains common code for mapping of a physically
+ ++++++++++++++++++ * contiguous memory region into DMA address space. It is used by all
+ ++++++++++++++++++ * mapping functions provided by this IOMMU driver.
+ ++++++++++++++++++ * Must be called with the domain lock held.
+ ++++++++++++++++++ */
+ static dma_addr_t __map_single(struct device *dev,
+ struct amd_iommu *iommu,
+ struct dma_ops_domain *dma_dom,
+ phys_addr_t paddr,
+ size_t size,
+ int dir)
+ {
+ dma_addr_t offset = paddr & ~PAGE_MASK;
+ dma_addr_t address, start;
+ unsigned int pages;
+ int i;
+
+ pages = to_pages(paddr, size);
+ paddr &= PAGE_MASK;
+
+ address = dma_ops_alloc_addresses(dev, dma_dom, pages);
+ if (unlikely(address == bad_dma_address))
+ goto out;
+
+ start = address;
+ for (i = 0; i < pages; ++i) {
+ dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+ paddr += PAGE_SIZE;
+ start += PAGE_SIZE;
+ }
+ address += offset;
+
+ out:
+ return address;
+ }
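
__map_single() keeps the byte offset of the buffer inside its first page and adds it back to the returned handle, so an unaligned buffer gets an equally unaligned DMA address while whole pages are mapped underneath. The arithmetic in isolation, assuming 4 KiB pages and a made-up allocator result:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    int main(void)
    {
            unsigned long paddr  = 0x1234567UL;         /* unaligned physical address */
            unsigned long size   = 0x1800UL;
            unsigned long offset = paddr & ~PAGE_MASK;  /* 0x567 */
            unsigned long pages  = (offset + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
            unsigned long iova   = 0x40000UL;           /* pretend address from the allocator */

            /* the page-aligned region is mapped page by page ... */
            printf("map %lu pages starting at bus address 0x%lx\n", pages, iova);
            /* ... and the intra-page offset is added back to the returned handle */
            printf("dma handle = 0x%lx\n", iova + offset);
            return 0;
    }
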
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Does the reverse of the __map_single function. Must be called with
+ ++++++++++++++++++ * the domain lock held too
+ ++++++++++++++++++ */
+ static void __unmap_single(struct amd_iommu *iommu,
+ struct dma_ops_domain *dma_dom,
+ dma_addr_t dma_addr,
+ size_t size,
+ int dir)
+ {
+ dma_addr_t i, start;
+ unsigned int pages;
+
+ if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
+ return;
+
+ pages = to_pages(dma_addr, size);
+ dma_addr &= PAGE_MASK;
+ start = dma_addr;
+
+ for (i = 0; i < pages; ++i) {
+ dma_ops_domain_unmap(iommu, dma_dom, start);
+ start += PAGE_SIZE;
+ }
+
+ dma_ops_free_addresses(dma_dom, dma_addr, pages);
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * The exported map_single function for dma_ops.
+ ++++++++++++++++++ */
+ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
+ size_t size, int dir)
+ {
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+ dma_addr_t addr;
+
+ get_device_resources(dev, &iommu, &domain, &devid);
+
+ if (iommu == NULL || domain == NULL)
+ ++++++++++++++++++ /* device not handled by any AMD IOMMU */
+ return (dma_addr_t)paddr;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ addr = __map_single(dev, iommu, domain->priv, paddr, size, dir);
+ if (addr == bad_dma_address)
+ goto out;
+
+ if (iommu_has_npcache(iommu))
+ iommu_flush_pages(iommu, domain->id, addr, size);
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+ out:
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return addr;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * The exported unmap_single function for dma_ops.
+ ++++++++++++++++++ */
+ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
+ size_t size, int dir)
+ {
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+
+ if (!get_device_resources(dev, &iommu, &domain, &devid))
+ ++++++++++++++++++ /* device not handled by any AMD IOMMU */
+ return;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ __unmap_single(iommu, domain->priv, dma_addr, size, dir);
+
+ iommu_flush_pages(iommu, domain->id, dma_addr, size);
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * This is a special map_sg function which is used if we have to map a
+ ++++++++++++++++++ * device which is not handled by any AMD IOMMU in the system.
+ ++++++++++++++++++ */
+ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir)
+ {
+ struct scatterlist *s;
+ int i;
+
+ for_each_sg(sglist, s, nelems, i) {
+ s->dma_address = (dma_addr_t)sg_phys(s);
+ s->dma_length = s->length;
+ }
+
+ return nelems;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * The exported map_sg function for dma_ops (handles scatter-gather
+ ++++++++++++++++++ * lists).
+ ++++++++++++++++++ */
+ static int map_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir)
+ {
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+ int i;
+ struct scatterlist *s;
+ phys_addr_t paddr;
+ int mapped_elems = 0;
+
+ get_device_resources(dev, &iommu, &domain, &devid);
+
+ if (!iommu || !domain)
+ return map_sg_no_iommu(dev, sglist, nelems, dir);
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ for_each_sg(sglist, s, nelems, i) {
+ paddr = sg_phys(s);
+
+ s->dma_address = __map_single(dev, iommu, domain->priv,
+ paddr, s->length, dir);
+
+ if (s->dma_address) {
+ s->dma_length = s->length;
+ mapped_elems++;
+ } else
+ goto unmap;
+ if (iommu_has_npcache(iommu))
+ iommu_flush_pages(iommu, domain->id, s->dma_address,
+ s->dma_length);
+ }
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+ out:
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return mapped_elems;
+ unmap:
+ for_each_sg(sglist, s, mapped_elems, i) {
+ if (s->dma_address)
+ __unmap_single(iommu, domain->priv, s->dma_address,
+ s->dma_length, dir);
+ s->dma_address = s->dma_length = 0;
+ }
+
+ mapped_elems = 0;
+
+ goto out;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * The exported unmap_sg function for dma_ops (handles scatter-gather
+ ++++++++++++++++++ * lists).
+ ++++++++++++++++++ */
+ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir)
+ {
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ struct scatterlist *s;
+ u16 devid;
+ int i;
+
+ if (!get_device_resources(dev, &iommu, &domain, &devid))
+ return;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ for_each_sg(sglist, s, nelems, i) {
+ __unmap_single(iommu, domain->priv, s->dma_address,
+ s->dma_length, dir);
+ iommu_flush_pages(iommu, domain->id, s->dma_address,
+ s->dma_length);
+ s->dma_address = s->dma_length = 0;
+ }
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * The exported alloc_coherent function for dma_ops.
+ ++++++++++++++++++ */
+ static void *alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_addr, gfp_t flag)
+ {
+ unsigned long flags;
+ void *virt_addr;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+ phys_addr_t paddr;
+
+ virt_addr = (void *)__get_free_pages(flag, get_order(size));
+ if (!virt_addr)
+ return 0;
+
+ memset(virt_addr, 0, size);
+ paddr = virt_to_phys(virt_addr);
+
+ get_device_resources(dev, &iommu, &domain, &devid);
+
+ if (!iommu || !domain) {
+ *dma_addr = (dma_addr_t)paddr;
+ return virt_addr;
+ }
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
+ size, DMA_BIDIRECTIONAL);
+
+ if (*dma_addr == bad_dma_address) {
+ free_pages((unsigned long)virt_addr, get_order(size));
+ virt_addr = NULL;
+ goto out;
+ }
+
+ if (iommu_has_npcache(iommu))
+ iommu_flush_pages(iommu, domain->id, *dma_addr, size);
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+ out:
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return virt_addr;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * The exported free_coherent function for dma_ops.
+ ++++++++++++++++++ * FIXME: fix the generic x86 DMA layer so that it actually calls that
+ ++++++++++++++++++ * function.
+ ++++++++++++++++++ */
+ static void free_coherent(struct device *dev, size_t size,
+ void *virt_addr, dma_addr_t dma_addr)
+ {
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+
+ get_device_resources(dev, &iommu, &domain, &devid);
+
+ if (!iommu || !domain)
+ goto free_mem;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
+ iommu_flush_pages(iommu, domain->id, dma_addr, size);
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ free_mem:
+ free_pages((unsigned long)virt_addr, get_order(size));
+ }
+
+ /*
+ ++++++++++++++++++ * The function for pre-allocating protection domains.
+ ++++++++++++++++++ *
+                     * If the driver core informed the DMA layer whenever a driver grabbed a
+                     * device, we would not need to preallocate the protection domains anymore.
+ * For now we have to.
+ */
+ void prealloc_protection_domains(void)
+ {
+ struct pci_dev *dev = NULL;
+ struct dma_ops_domain *dma_dom;
+ struct amd_iommu *iommu;
+ int order = amd_iommu_aperture_order;
+ u16 devid;
+
+ while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ devid = (dev->bus->number << 8) | dev->devfn;
+ if (devid >= amd_iommu_last_bdf)
+ continue;
+ devid = amd_iommu_alias_table[devid];
+ if (domain_for_device(devid))
+ continue;
+ iommu = amd_iommu_rlookup_table[devid];
+ if (!iommu)
+ continue;
+ dma_dom = dma_ops_domain_alloc(iommu, order);
+ if (!dma_dom)
+ continue;
+ init_unity_mappings_for_device(dma_dom, devid);
+ set_device_domain(iommu, &dma_dom->domain, devid);
+ printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ",
+ dma_dom->domain.id);
+ print_devid(devid, 1);
+ }
+ }
+
+ static struct dma_mapping_ops amd_iommu_dma_ops = {
+ .alloc_coherent = alloc_coherent,
+ .free_coherent = free_coherent,
+ .map_single = map_single,
+ .unmap_single = unmap_single,
+ .map_sg = map_sg,
+ .unmap_sg = unmap_sg,
+ };
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * The function which glues the AMD IOMMU driver into dma_ops.
+ ++++++++++++++++++ */
+ int __init amd_iommu_init_dma_ops(void)
+ {
+ struct amd_iommu *iommu;
+ int order = amd_iommu_aperture_order;
+ int ret;
+
+ ++++++++++++++++++ /*
+ ++++++++++++++++++ * first allocate a default protection domain for every IOMMU we
+ ++++++++++++++++++ * found in the system. Devices not assigned to any other
+ ++++++++++++++++++ * protection domain will be assigned to the default one.
+ ++++++++++++++++++ */
+ list_for_each_entry(iommu, &amd_iommu_list, list) {
+ iommu->default_dom = dma_ops_domain_alloc(iommu, order);
+ if (iommu->default_dom == NULL)
+ return -ENOMEM;
+ ret = iommu_init_unity_mappings(iommu);
+ if (ret)
+ goto free_domains;
+ }
+
+ ++++++++++++++++++ /*
+ ++++++++++++++++++ * If device isolation is enabled, pre-allocate the protection
+ ++++++++++++++++++ * domains for each device.
+ ++++++++++++++++++ */
+ if (amd_iommu_isolate)
+ prealloc_protection_domains();
+
+ iommu_detected = 1;
+ force_iommu = 1;
+ bad_dma_address = 0;
+ #ifdef CONFIG_GART_IOMMU
+ gart_iommu_aperture_disabled = 1;
+ gart_iommu_aperture = 0;
+ #endif
+
+ ++++++++++++++++++ /* Finally make our dma_ops visible to the device drivers */
+ dma_ops = &amd_iommu_dma_ops;
+
+ return 0;
+
+ free_domains:
+
+ list_for_each_entry(iommu, &amd_iommu_list, list) {
+ if (iommu->default_dom)
+ dma_ops_domain_free(iommu->default_dom);
+ }
+
+ return ret;
+ }
--- /dev/null
------- -- ---------#include <asm/gart.h>
+ /*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ * Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+ #include <linux/pci.h>
+ #include <linux/acpi.h>
+ #include <linux/gfp.h>
+ #include <linux/list.h>
+ #include <linux/sysdev.h>
+ #include <asm/pci-direct.h>
+ #include <asm/amd_iommu_types.h>
+ #include <asm/amd_iommu.h>
- -------- ---------#define UPDATE_LAST_BDF(x) do {\
- -------- --------- if ((x) > amd_iommu_last_bdf) \
- -------- --------- amd_iommu_last_bdf = (x); \
- -------- --------- } while (0);
- -------- ---------
- -------- ---------#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
+++++++ ++++++++++++#include <asm/iommu.h>
+
+ /*
+ * definitions for the ACPI scanning code
+ */
- -------- ---------#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
+ #define PCI_BUS(x) (((x) >> 8) & 0xff)
+ #define IVRS_HEADER_LENGTH 48
- -------- ---------u16 amd_iommu_last_bdf;
- -------- ---------struct list_head amd_iommu_unity_map;
- -------- ---------unsigned amd_iommu_aperture_order = 26;
- -------- ---------int amd_iommu_isolate;
+
+ #define ACPI_IVHD_TYPE 0x10
+ #define ACPI_IVMD_TYPE_ALL 0x20
+ #define ACPI_IVMD_TYPE 0x21
+ #define ACPI_IVMD_TYPE_RANGE 0x22
+
+ #define IVHD_DEV_ALL 0x01
+ #define IVHD_DEV_SELECT 0x02
+ #define IVHD_DEV_SELECT_RANGE_START 0x03
+ #define IVHD_DEV_RANGE_END 0x04
+ #define IVHD_DEV_ALIAS 0x42
+ #define IVHD_DEV_ALIAS_RANGE 0x43
+ #define IVHD_DEV_EXT_SELECT 0x46
+ #define IVHD_DEV_EXT_SELECT_RANGE 0x47
+
+ #define IVHD_FLAG_HT_TUN_EN 0x00
+ #define IVHD_FLAG_PASSPW_EN 0x01
+ #define IVHD_FLAG_RESPASSPW_EN 0x02
+ #define IVHD_FLAG_ISOC_EN 0x03
+
+ #define IVMD_FLAG_EXCL_RANGE 0x08
+ #define IVMD_FLAG_UNITY_MAP 0x01
+
+ #define ACPI_DEVFLAG_INITPASS 0x01
+ #define ACPI_DEVFLAG_EXTINT 0x02
+ #define ACPI_DEVFLAG_NMI 0x04
+ #define ACPI_DEVFLAG_SYSMGT1 0x10
+ #define ACPI_DEVFLAG_SYSMGT2 0x20
+ #define ACPI_DEVFLAG_LINT0 0x40
+ #define ACPI_DEVFLAG_LINT1 0x80
+ #define ACPI_DEVFLAG_ATSDIS 0x10000000
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * ACPI table definitions
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * These data structures are laid over the table to parse the important values
+ ++++++++++++++++++ * out of it.
+ ++++++++++++++++++ */
+ ++++++++++++++++++
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * structure describing one IOMMU in the ACPI table. Typically followed by one
+ ++++++++++++++++++ * or more ivhd_entries.
+ ++++++++++++++++++ */
+ struct ivhd_header {
+ u8 type;
+ u8 flags;
+ u16 length;
+ u16 devid;
+ u16 cap_ptr;
+ u64 mmio_phys;
+ u16 pci_seg;
+ u16 info;
+ u32 reserved;
+ } __attribute__((packed));
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * A device entry describing which devices a specific IOMMU translates and
+ ++++++++++++++++++ * which requestor ids they use.
+ ++++++++++++++++++ */
+ struct ivhd_entry {
+ u8 type;
+ u16 devid;
+ u8 flags;
+ u32 ext;
+ } __attribute__((packed));
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * An AMD IOMMU memory definition structure. It defines things like exclusion
+ ++++++++++++++++++ * ranges for devices and regions that should be unity mapped.
+ ++++++++++++++++++ */
+ struct ivmd_header {
+ u8 type;
+ u8 flags;
+ u16 length;
+ u16 devid;
+ u16 aux;
+ u64 resv;
+ u64 range_start;
+ u64 range_length;
+ } __attribute__((packed));
+
+ static int __initdata amd_iommu_detected;
+
- -------- ---------struct list_head amd_iommu_list;
+ ++++++++++++++++++u16 amd_iommu_last_bdf; /* largest PCI device id we have
+ ++++++++++++++++++ to handle */
+ ++++++++++++++++++LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
+ ++++++++++++++++++ we find in ACPI */
+ ++++++++++++++++++unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
+ ++++++++++++++++++int amd_iommu_isolate; /* if 1, device isolation is enabled */
+ ++++++++++++++++++
+ ++++++++++++++++++LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
+ ++++++++++++++++++ system */
+
- -------- ---------static u32 dev_table_size;
- -------- ---------static u32 alias_table_size;
- -------- ---------static u32 rlookup_table_size;
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Pointer to the device table which is shared by all AMD IOMMUs
+ ++++++++++++++++++ * it is indexed by the PCI device id or the HT unit id and contains
+ ++++++++++++++++++ * information about the domain the device belongs to as well as the
+ ++++++++++++++++++ * page table root pointer.
+ ++++++++++++++++++ */
+ struct dev_table_entry *amd_iommu_dev_table;
+ ++++++++++++++++++
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * The alias table is a driver specific data structure which contains the
+ ++++++++++++++++++ * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
+ ++++++++++++++++++ * More than one device can share the same requestor id.
+ ++++++++++++++++++ */
+ u16 *amd_iommu_alias_table;
+ ++++++++++++++++++
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * The rlookup table is used to find the IOMMU which is responsible
+ ++++++++++++++++++ * for a specific device. It is also indexed by the PCI device id.
+ ++++++++++++++++++ */
+ struct amd_iommu **amd_iommu_rlookup_table;
+ ++++++++++++++++++
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * The pd table (protection domain table) is used to find the protection domain
+ ++++++++++++++++++ * data structure a device belongs to. Indexed with the PCI device id too.
+ ++++++++++++++++++ */
+ struct protection_domain **amd_iommu_pd_table;
+ ++++++++++++++++++
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
+ ++++++++++++++++++ * to know which ones are already in use.
+ ++++++++++++++++++ */
+ unsigned long *amd_iommu_pd_alloc_bitmap;
+
- -------- --------- UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
+ ++++++++++++++++++static u32 dev_table_size; /* size of the device table */
+ ++++++++++++++++++static u32 alias_table_size; /* size of the alias table */
+ ++++++++++++++++++static u32 rlookup_table_size; /* size of the rlookup table */
+
+ ++++++++++++++++++static inline void update_last_devid(u16 devid)
+ ++++++++++++++++++{
+ ++++++++++++++++++ if (devid > amd_iommu_last_bdf)
+ ++++++++++++++++++ amd_iommu_last_bdf = devid;
+ ++++++++++++++++++}
+ ++++++++++++++++++
+ ++++++++++++++++++static inline unsigned long tbl_size(int entry_size)
+ ++++++++++++++++++{
+ ++++++++++++++++++ unsigned shift = PAGE_SHIFT +
+ ++++++++++++++++++ get_order(amd_iommu_last_bdf * entry_size);
+ ++++++++++++++++++
+ ++++++++++++++++++ return 1UL << shift;
+ ++++++++++++++++++}
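
tbl_size() rounds a table up to a power-of-two number of pages. A user-space sketch with a stand-in for the kernel's get_order(); the device id and entry size below are made up:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* stand-in for the kernel's get_order(): smallest order with PAGE_SIZE << order >= size */
    static unsigned get_order(unsigned long size)
    {
            unsigned order = 0;

            size = (size - 1) >> PAGE_SHIFT;
            while (size) {
                    ++order;
                    size >>= 1;
            }
            return order;
    }

    int main(void)
    {
            unsigned long last_bdf   = 0x00ff;  /* made-up highest device id */
            unsigned long entry_size = 32;      /* made-up bytes per table entry */
            unsigned long tbl = 1UL << (PAGE_SHIFT + get_order(last_bdf * entry_size));

            printf("table rounded up to %lu bytes\n", tbl);
            return 0;
    }
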
+ ++++++++++++++++++
+ ++++++++++++++++++/****************************************************************************
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * AMD IOMMU MMIO register space handling functions
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * These functions are used to program the IOMMU device registers in
+ ++++++++++++++++++ * MMIO space required by the driver.
+ ++++++++++++++++++ *
+ ++++++++++++++++++ ****************************************************************************/
+ ++++++++++++++++++
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * This function sets the exclusion range in the IOMMU. DMA accesses to the
+ ++++++++++++++++++ * exclusion range are passed through untranslated
+ ++++++++++++++++++ */
+ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
+ {
+ u64 start = iommu->exclusion_start & PAGE_MASK;
+ u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
+ u64 entry;
+
+ if (!iommu->exclusion_start)
+ return;
+
+ entry = start | MMIO_EXCL_ENABLE_MASK;
+ memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
+ &entry, sizeof(entry));
+
+ entry = limit;
+ memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
+ &entry, sizeof(entry));
+ }
+
+ ++++++++++++++++++/* Programs the physical address of the device table into the IOMMU hardware */
+ static void __init iommu_set_device_table(struct amd_iommu *iommu)
+ {
+ u32 entry;
+
+ BUG_ON(iommu->mmio_base == NULL);
+
+ entry = virt_to_phys(amd_iommu_dev_table);
+ entry |= (dev_table_size >> 12) - 1;
+ memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
+ &entry, sizeof(entry));
+ }
+
+ ++++++++++++++++++/* Generic functions to enable/disable certain features of the IOMMU. */
+ static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+ {
+ u32 ctrl;
+
+ ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ ctrl |= (1 << bit);
+ writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ }
+
+ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
+ {
+ u32 ctrl;
+
+ ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ ctrl &= ~(1 << bit);
+ writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ }
+
+ ++++++++++++++++++/* Function to enable the hardware */
+ void __init iommu_enable(struct amd_iommu *iommu)
+ {
+ printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
+ print_devid(iommu->devid, 0);
+ printk(" cap 0x%hx\n", iommu->cap_ptr);
+
+ iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
+ ++++++++++++++++++ * the system has one.
+ ++++++++++++++++++ */
+ static u8 * __init iommu_map_mmio_space(u64 address)
+ {
+ u8 *ret;
+
+ if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu"))
+ return NULL;
+
+ ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
+ if (ret != NULL)
+ return ret;
+
+ release_mem_region(address, MMIO_REGION_LENGTH);
+
+ return NULL;
+ }
+
+ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
+ {
+ if (iommu->mmio_base)
+ iounmap(iommu->mmio_base);
+ release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
+ }
+
+ ++++++++++++++++++/****************************************************************************
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * The functions below belong to the first pass of AMD IOMMU ACPI table
+ ++++++++++++++++++ * parsing. In this pass we try to find out the highest device id this
+ ++++++++++++++++++ * code has to handle. Based on this information the size of the shared data
+ ++++++++++++++++++ * structures is determined later.
+ ++++++++++++++++++ *
+ ++++++++++++++++++ ****************************************************************************/
+ ++++++++++++++++++
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * This function reads the last device id the IOMMU has to handle from the PCI
+ ++++++++++++++++++ * capability header for this IOMMU
+ ++++++++++++++++++ */
+ static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
+ {
+ u32 cap;
+
+ cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
- -------- --------- UPDATE_LAST_BDF(dev->devid);
+ ++++++++++++++++++ update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
+
+ return 0;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * After reading the highest device id from the IOMMU PCI capability header
+ ++++++++++++++++++ * this function checks whether a higher device id is defined in the ACPI table
+ ++++++++++++++++++ */
+ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
+ {
+ u8 *p = (void *)h, *end = (void *)h;
+ struct ivhd_entry *dev;
+
+ p += sizeof(*h);
+ end += h->length;
+
+ find_last_devid_on_pci(PCI_BUS(h->devid),
+ PCI_SLOT(h->devid),
+ PCI_FUNC(h->devid),
+ h->cap_ptr);
+
+ while (p < end) {
+ dev = (struct ivhd_entry *)p;
+ switch (dev->type) {
+ case IVHD_DEV_SELECT:
+ case IVHD_DEV_RANGE_END:
+ case IVHD_DEV_ALIAS:
+ case IVHD_DEV_EXT_SELECT:
- -------- --------- u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL,
+ ++++++++++++++++++ /* all the above subfield types refer to device ids */
+ ++++++++++++++++++ update_last_devid(dev->devid);
+ break;
+ default:
+ break;
+ }
+ p += 0x04 << (*p >> 6);
+ }
+
+ WARN_ON(p != end);
+
+ return 0;
+ }
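
The parser above advances with p += 0x04 << (*p >> 6), i.e. the top two bits of an IVHD entry type encode the entry length as 4, 8, 16 or 32 bytes. A tiny demonstration of that stepping:

    #include <stdio.h>

    int main(void)
    {
            /* example entry type bytes; the top two bits select the length */
            unsigned types[] = { 0x02, 0x42, 0x46, 0xc0 };
            unsigned i;

            for (i = 0; i < 4; ++i)
                    printf("type 0x%02x -> entry length %u bytes\n",
                           types[i], 0x04u << (types[i] >> 6));
            return 0;
    }
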
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Iterate over all IVHD entries in the ACPI table and find the highest device
+ ++++++++++++++++++ * id which we need to handle. This is the first of three functions which parse
+ ++++++++++++++++++ * the ACPI table. So we check the checksum here.
+ ++++++++++++++++++ */
+ static int __init find_last_devid_acpi(struct acpi_table_header *table)
+ {
+ int i;
+ u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
+ struct ivhd_header *h;
+
+ /*
+ * Validate checksum here so we don't need to do it when
+ * we actually parse the table
+ */
+ for (i = 0; i < table->length; ++i)
+ checksum += p[i];
+ if (checksum != 0)
+ /* ACPI table corrupt */
+ return -ENODEV;
+
+ p += IVRS_HEADER_LENGTH;
+
+ end += table->length;
+ while (p < end) {
+ h = (struct ivhd_header *)p;
+ switch (h->type) {
+ case ACPI_IVHD_TYPE:
+ find_last_devid_from_ivhd(h);
+ break;
+ default:
+ break;
+ }
+ p += h->length;
+ }
+ WARN_ON(p != end);
+
+ return 0;
+ }
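
The checksum rule applied here is the usual ACPI one: all bytes of the table, including the checksum field itself, must sum to zero modulo 256. A stand-alone sketch with a fabricated table:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* a table is valid when the byte-wise sum over its whole length is 0 (mod 256) */
    static int table_checksum_ok(const uint8_t *table, unsigned length)
    {
            uint8_t sum = 0;
            unsigned i;

            for (i = 0; i < length; ++i)
                    sum += table[i];
            return sum == 0;
    }

    int main(void)
    {
            uint8_t fake_table[16];
            uint8_t sum = 0;
            unsigned i;

            memset(fake_table, 0xab, sizeof(fake_table));
            fake_table[9] = 0;                       /* byte 9 plays the checksum field */
            for (i = 0; i < sizeof(fake_table); ++i)
                    sum += fake_table[i];
            fake_table[9] = (uint8_t)(0x100 - sum);  /* fix up the checksum byte */

            printf("checksum %s\n",
                   table_checksum_ok(fake_table, sizeof(fake_table)) ? "ok" : "bad");
            return 0;
    }
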
+
+ ++++++++++++++++++/****************************************************************************
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * The following functions belong to the code path which parses the ACPI table
+ ++++++++++++++++++ * the second time. In this ACPI parsing iteration we allocate IOMMU specific
+ ++++++++++++++++++ * data structures, initialize the device/alias/rlookup table and also
+ ++++++++++++++++++ * basically initialize the hardware.
+ ++++++++++++++++++ *
+ ++++++++++++++++++ ****************************************************************************/
+ ++++++++++++++++++
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Allocates the command buffer. This buffer is per AMD IOMMU. We can
+ ++++++++++++++++++ * write commands to that buffer later and the IOMMU will execute them
+ ++++++++++++++++++ * asynchronously
+ ++++++++++++++++++ */
+ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
+ {
- -------- --------- u64 entry = 0;
+ ++++++++++++++++++ u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(CMD_BUFFER_SIZE));
- -------- --------- memset(cmd_buf, 0, CMD_BUFFER_SIZE);
- -------- ---------
+ ++++++++++++++++++ u64 entry;
+
+ if (cmd_buf == NULL)
+ return NULL;
+
+ iommu->cmd_buf_size = CMD_BUFFER_SIZE;
+
- -------- --------- if (iommu->cmd_buf)
- -------- --------- free_pages((unsigned long)iommu->cmd_buf,
- -------- --------- get_order(CMD_BUFFER_SIZE));
+ entry = (u64)virt_to_phys(cmd_buf);
+ entry |= MMIO_CMD_SIZE_512;
+ memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
+ &entry, sizeof(entry));
+
+ iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+
+ return cmd_buf;
+ }
+
+ static void __init free_command_buffer(struct amd_iommu *iommu)
+ {
- -------- ---------static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
+ ++++++++++++++++++ free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
+ }
+
+ ++++++++++++++++++/* sets a specific bit in the device table entry. */
+ static void set_dev_entry_bit(u16 devid, u8 bit)
+ {
+ int i = (bit >> 5) & 0x07;
+ int _bit = bit & 0x1f;
+
+ amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
+ }
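
set_dev_entry_bit() treats the device table entry as an array of 32 bit words and splits the flat bit number into a word index and a bit position. The same arithmetic in isolation:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t data[8] = { 0 };           /* a 256 bit entry as 8 x u32 */
            unsigned bit  = 97;                 /* example flat bit number */
            unsigned word = (bit >> 5) & 0x07;  /* which 32 bit word */
            unsigned pos  = bit & 0x1f;         /* which bit inside that word */

            data[word] |= 1u << pos;
            printf("bit %u lives in data[%u], mask 0x%08x\n", bit, word, data[word]);
            return 0;
    }
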
+
- -------- ---------}
+ ++++++++++++++++++/* Writes the specific IOMMU for a device into the rlookup table */
+ ++++++++++++++++++static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
+ ++++++++++++++++++{
+ ++++++++++++++++++ amd_iommu_rlookup_table[devid] = iommu;
+ ++++++++++++++++++}
+ ++++++++++++++++++
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * This function takes the device specific flags read from the ACPI
+ ++++++++++++++++++ * table and sets up the device table entry with that information
+ ++++++++++++++++++ */
+ ++++++++++++++++++static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
+ ++++++++++++++++++ u16 devid, u32 flags, u32 ext_flags)
+ {
+ if (flags & ACPI_DEVFLAG_INITPASS)
+ set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
+ if (flags & ACPI_DEVFLAG_EXTINT)
+ set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
+ if (flags & ACPI_DEVFLAG_NMI)
+ set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
+ if (flags & ACPI_DEVFLAG_SYSMGT1)
+ set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
+ if (flags & ACPI_DEVFLAG_SYSMGT2)
+ set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
+ if (flags & ACPI_DEVFLAG_LINT0)
+ set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
+ if (flags & ACPI_DEVFLAG_LINT1)
+ set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
- -------- ---------static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
- -------- ---------{
- -------- --------- amd_iommu_rlookup_table[devid] = iommu;
+
- -------- --------- iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range));
- -------- --------- iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range));
+ ++++++++++++++++++ set_iommu_for_device(iommu, devid);
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Reads the device exclusion range from ACPI and initializes the IOMMU
+ ++++++++++++++++++ * with it
+ ++++++++++++++++++ */
+ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
+ {
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+ if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
+ return;
+
+ if (iommu) {
+ ++++++++++++++++++ /*
+ ++++++++++++++++++ * We only can configure exclusion ranges per IOMMU, not
+ ++++++++++++++++++ * We can only configure exclusion ranges per IOMMU, not
+ ++++++++++++++++++ * device. This is done here
+ ++++++++++++++++++ */
+ set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
+ iommu->exclusion_start = m->range_start;
+ iommu->exclusion_length = m->range_length;
+ }
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * This function reads some important data from the IOMMU PCI space and
+ ++++++++++++++++++ * initializes the driver data structure with it. It reads the hardware
+ ++++++++++++++++++ * capabilities and the first/last device entries
+ ++++++++++++++++++ */
+ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
+ {
+ int bus = PCI_BUS(iommu->devid);
+ int dev = PCI_SLOT(iommu->devid);
+ int fn = PCI_FUNC(iommu->devid);
+ int cap_ptr = iommu->cap_ptr;
+ u32 range;
+
+ iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
+
+ range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
- -------- --------- bool alias = 0;
+ ++++++++++++++++++ iommu->first_device = calc_devid(MMIO_GET_BUS(range),
+ ++++++++++++++++++ MMIO_GET_FD(range));
+ ++++++++++++++++++ iommu->last_device = calc_devid(MMIO_GET_BUS(range),
+ ++++++++++++++++++ MMIO_GET_LD(range));
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Takes a pointer to an AMD IOMMU entry in the ACPI table and
+ ++++++++++++++++++ * initializes the hardware and our data structures with it.
+ ++++++++++++++++++ */
+ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
+ struct ivhd_header *h)
+ {
+ u8 *p = (u8 *)h;
+ u8 *end = p, flags = 0;
+ u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
+ u32 ext_flags = 0;
- -------- --------- set_dev_entry_from_acpi(dev_i, e->flags, 0);
+ ++++++++++++++++++ bool alias = false;
+ struct ivhd_entry *e;
+
+ /*
+ * First set the recommended feature enable bits from ACPI
+ * into the IOMMU control registers
+ */
+ h->flags & IVHD_FLAG_HT_TUN_EN ?
+ iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
+ iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
+
+ h->flags & IVHD_FLAG_PASSPW_EN ?
+ iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
+ iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
+
+ h->flags & IVHD_FLAG_RESPASSPW_EN ?
+ iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
+ iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
+
+ h->flags & IVHD_FLAG_ISOC_EN ?
+ iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
+ iommu_feature_disable(iommu, CONTROL_ISOC_EN);
+
+ /*
+ * make IOMMU memory accesses cache coherent
+ */
+ iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+
+ /*
+ * Done. Now parse the device entries
+ */
+ p += sizeof(struct ivhd_header);
+ end += h->length;
+
+ while (p < end) {
+ e = (struct ivhd_entry *)p;
+ switch (e->type) {
+ case IVHD_DEV_ALL:
+ for (dev_i = iommu->first_device;
+ dev_i <= iommu->last_device; ++dev_i)
- -------- --------- set_dev_entry_from_acpi(devid, e->flags, 0);
+ ++++++++++++++++++ set_dev_entry_from_acpi(iommu, dev_i,
+ ++++++++++++++++++ e->flags, 0);
+ break;
+ case IVHD_DEV_SELECT:
+ devid = e->devid;
- -------- --------- alias = 0;
+ ++++++++++++++++++ set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+ break;
+ case IVHD_DEV_SELECT_RANGE_START:
+ devid_start = e->devid;
+ flags = e->flags;
+ ext_flags = 0;
- -------- --------- set_dev_entry_from_acpi(devid, e->flags, 0);
+ ++++++++++++++++++ alias = false;
+ break;
+ case IVHD_DEV_ALIAS:
+ devid = e->devid;
+ devid_to = e->ext >> 8;
- -------- --------- alias = 1;
+ ++++++++++++++++++ set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+ amd_iommu_alias_table[devid] = devid_to;
+ break;
+ case IVHD_DEV_ALIAS_RANGE:
+ devid_start = e->devid;
+ flags = e->flags;
+ devid_to = e->ext >> 8;
+ ext_flags = 0;
- -------- --------- set_dev_entry_from_acpi(devid, e->flags, e->ext);
+ ++++++++++++++++++ alias = true;
+ break;
+ case IVHD_DEV_EXT_SELECT:
+ devid = e->devid;
- -------- --------- alias = 0;
+ ++++++++++++++++++ set_dev_entry_from_acpi(iommu, devid, e->flags,
+ ++++++++++++++++++ e->ext);
+ break;
+ case IVHD_DEV_EXT_SELECT_RANGE:
+ devid_start = e->devid;
+ flags = e->flags;
+ ext_flags = e->ext;
- -------- --------- set_dev_entry_from_acpi(
+ ++++++++++++++++++ alias = false;
+ break;
+ case IVHD_DEV_RANGE_END:
+ devid = e->devid;
+ for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
+ if (alias)
+ amd_iommu_alias_table[dev_i] = devid_to;
- -------- --------- INIT_LIST_HEAD(&amd_iommu_list);
- -------- ---------
+ ++++++++++++++++++ set_dev_entry_from_acpi(iommu,
+ amd_iommu_alias_table[dev_i],
+ flags, ext_flags);
+ }
+ break;
+ default:
+ break;
+ }
+
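+ /* device entries are 4 << (type >> 6) bytes long; advance to the next one */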
+ p += 0x04 << (e->type >> 6);
+ }
+ }
+
+ ++++++++++++++++++/* Initializes the device->iommu mapping for the driver */
+ static int __init init_iommu_devices(struct amd_iommu *iommu)
+ {
+ u16 i;
+
+ for (i = iommu->first_device; i <= iommu->last_device; ++i)
+ set_iommu_for_device(iommu, i);
+
+ return 0;
+ }
+
+ static void __init free_iommu_one(struct amd_iommu *iommu)
+ {
+ free_command_buffer(iommu);
+ iommu_unmap_mmio_space(iommu);
+ }
+
+ static void __init free_iommu_all(void)
+ {
+ struct amd_iommu *iommu, *next;
+
+ list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
+ list_del(&iommu->list);
+ free_iommu_one(iommu);
+ kfree(iommu);
+ }
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * This function glues the initialization functions for one IOMMU
+ ++++++++++++++++++ * together and also allocates the command buffer and programs the
+ ++++++++++++++++++ * hardware. It does NOT enable the IOMMU. This is done afterwards.
+ ++++++++++++++++++ */
+ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
+ {
+ spin_lock_init(&iommu->lock);
+ list_add_tail(&iommu->list, &amd_iommu_list);
+
+ /*
+ * Copy data from ACPI table entry to the iommu struct
+ */
+ iommu->devid = h->devid;
+ iommu->cap_ptr = h->cap_ptr;
+ iommu->mmio_phys = h->mmio_phys;
+ iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
+ if (!iommu->mmio_base)
+ return -ENOMEM;
+
+ iommu_set_device_table(iommu);
+ iommu->cmd_buf = alloc_command_buffer(iommu);
+ if (!iommu->cmd_buf)
+ return -ENOMEM;
+
+ init_iommu_from_pci(iommu);
+ init_iommu_from_acpi(iommu, h);
+ init_iommu_devices(iommu);
+
+ return 0;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * Iterates over all IOMMU entries in the ACPI table, allocates the
+ ++++++++++++++++++ * IOMMU structure and initializes it with init_iommu_one()
+ ++++++++++++++++++ */
+ static int __init init_iommu_all(struct acpi_table_header *table)
+ {
+ u8 *p = (u8 *)table, *end = (u8 *)table;
+ struct ivhd_header *h;
+ struct amd_iommu *iommu;
+ int ret;
+
- -------- --------- INIT_LIST_HEAD(&amd_iommu_unity_map);
- -------- ---------
+ end += table->length;
+ p += IVRS_HEADER_LENGTH;
+
+ while (p < end) {
+ h = (struct ivhd_header *)p;
+ switch (*p) {
+ case ACPI_IVHD_TYPE:
+ iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
+ if (iommu == NULL)
+ return -ENOMEM;
+ ret = init_iommu_one(iommu, h);
+ if (ret)
+ return ret;
+ break;
+ default:
+ break;
+ }
+ p += h->length;
+
+ }
+ WARN_ON(p != end);
+
+ return 0;
+ }
+
+ ++++++++++++++++++/****************************************************************************
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * The next functions belong to the third pass of parsing the ACPI
+ ++++++++++++++++++ * table. In this last pass the memory mapping requirements are
+ ++++++++++++++++++ * gathered (like exclusion and unity mapping ranges).
+ ++++++++++++++++++ *
+ ++++++++++++++++++ ****************************************************************************/
+ ++++++++++++++++++
+ static void __init free_unity_maps(void)
+ {
+ struct unity_map_entry *entry, *next;
+
+ list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+ }
+
+ ++++++++++++++++++/* called when we find an exclusion range definition in ACPI */
+ static int __init init_exclusion_range(struct ivmd_header *m)
+ {
+ int i;
+
+ switch (m->type) {
+ case ACPI_IVMD_TYPE:
+ set_device_exclusion_range(m->devid, m);
+ break;
+ case ACPI_IVMD_TYPE_ALL:
+ for (i = 0; i < amd_iommu_last_bdf; ++i)
+ set_device_exclusion_range(i, m);
+ break;
+ case ACPI_IVMD_TYPE_RANGE:
+ for (i = m->devid; i <= m->aux; ++i)
+ set_device_exclusion_range(i, m);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+ }
+
+ ++++++++++++++++++/* called for unity map ACPI definition */
+ static int __init init_unity_map_range(struct ivmd_header *m)
+ {
+ struct unity_map_entry *e = 0;
+
+ e = kzalloc(sizeof(*e), GFP_KERNEL);
+ if (e == NULL)
+ return -ENOMEM;
+
+ switch (m->type) {
+ default:
+ case ACPI_IVMD_TYPE:
+ e->devid_start = e->devid_end = m->devid;
+ break;
+ case ACPI_IVMD_TYPE_ALL:
+ e->devid_start = 0;
+ e->devid_end = amd_iommu_last_bdf;
+ break;
+ case ACPI_IVMD_TYPE_RANGE:
+ e->devid_start = m->devid;
+ e->devid_end = m->aux;
+ break;
+ }
+ e->address_start = PAGE_ALIGN(m->range_start);
+ e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
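+ /* the flag bits above the unity-map bit carry the protection for this range */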
+ e->prot = m->flags >> 1;
+
+ list_add_tail(&e->list, &amd_iommu_unity_map);
+
+ return 0;
+ }
+
+ ++++++++++++++++++/* iterates over all memory definitions we find in the ACPI table */
+ static int __init init_memory_definitions(struct acpi_table_header *table)
+ {
+ u8 *p = (u8 *)table, *end = (u8 *)table;
+ struct ivmd_header *m;
+
- -------- --------- dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE);
- -------- --------- alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE);
- -------- --------- rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE);
+ end += table->length;
+ p += IVRS_HEADER_LENGTH;
+
+ while (p < end) {
+ m = (struct ivmd_header *)p;
+ if (m->flags & IVMD_FLAG_EXCL_RANGE)
+ init_exclusion_range(m);
+ else if (m->flags & IVMD_FLAG_UNITY_MAP)
+ init_unity_map_range(m);
+
+ p += m->length;
+ }
+
+ return 0;
+ }
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * This function finally enables all IOMMUs found in the system after
+ ++++++++++++++++++ * they have been initialized
+ ++++++++++++++++++ */
+ static void __init enable_iommus(void)
+ {
+ struct amd_iommu *iommu;
+
+ list_for_each_entry(iommu, &amd_iommu_list, list) {
+ iommu_set_exclusion_range(iommu);
+ iommu_enable(iommu);
+ }
+ }
+
+ /*
+ * Suspend/Resume support
+ * disable suspend until real resume is implemented
+ */
+
+ static int amd_iommu_resume(struct sys_device *dev)
+ {
+ return 0;
+ }
+
+ static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
+ {
+ return -EINVAL;
+ }
+
+ static struct sysdev_class amd_iommu_sysdev_class = {
+ .name = "amd_iommu",
+ .suspend = amd_iommu_suspend,
+ .resume = amd_iommu_resume,
+ };
+
+ static struct sys_device device_amd_iommu = {
+ .id = 0,
+ .cls = &amd_iommu_sysdev_class,
+ };
+
+ ++++++++++++++++++/*
+ ++++++++++++++++++ * This is the core init function for AMD IOMMU hardware in the system.
+ ++++++++++++++++++ * This function is called from the generic x86 DMA layer initialization
+ ++++++++++++++++++ * code.
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * This function basically parses the ACPI table for AMD IOMMU (IVRS)
+ ++++++++++++++++++ * three times:
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * 1 pass) Find the highest PCI device id the driver has to handle.
+ ++++++++++++++++++ * Based on this information the sizes of the data structures
+ ++++++++++++++++++ * that need to be allocated are determined.
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * 2 pass) Initialize the data structures just allocated with the
+ ++++++++++++++++++ * information in the ACPI table about available AMD IOMMUs
+ ++++++++++++++++++ * in the system. It also maps the PCI devices in the
+ ++++++++++++++++++ * system to specific IOMMUs
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * 3 pass) After the basic data structures are allocated and
+ ++++++++++++++++++ * initialized we update them with information about memory
+ ++++++++++++++++++ * remapping requirements parsed out of the ACPI table in
+ ++++++++++++++++++ * this last pass.
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * After that the hardware is initialized and ready to go. In the last
+ ++++++++++++++++++ * step we do some Linux specific things like registering the driver in
+ ++++++++++++++++++ * the dma_ops interface and initializing the suspend/resume support
+ ++++++++++++++++++ * functions. Finally it prints some information about AMD IOMMUs and
+ ++++++++++++++++++ * the driver state and enables the hardware.
+ ++++++++++++++++++ */
+ int __init amd_iommu_init(void)
+ {
+ int i, ret = 0;
+
+ if (no_iommu) {
+ printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n");
+ return 0;
+ }
+
+ if (!amd_iommu_detected)
+ return -ENODEV;
+
+ /*
+ * First parse ACPI tables to find the largest Bus/Dev/Func
+ * we need to handle. Upon this information the shared data
+ * structures for the IOMMUs in the system will be allocated
+ */
+ if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
+ return -ENODEV;
+
- -------- --------- amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL,
+ ++++++++++++++++++ dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
+ ++++++++++++++++++ alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
+ ++++++++++++++++++ rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
+
+ ret = -ENOMEM;
+
+ /* Device table - directly used by all IOMMUs */
- -------- --------- amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL,
+ ++++++++++++++++++ amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(dev_table_size));
+ if (amd_iommu_dev_table == NULL)
+ goto out;
+
+ /*
+ * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
+ * IOMMU sees for that device
+ */
+ amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(alias_table_size));
+ if (amd_iommu_alias_table == NULL)
+ goto free;
+
+ /* IOMMU rlookup table - find the IOMMU for a specific device */
+ amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(rlookup_table_size));
+ if (amd_iommu_rlookup_table == NULL)
+ goto free;
+
+ /*
+ * Protection Domain table - maps devices to protection domains
+ * This table has the same size as the rlookup_table
+ */
- -------- --------- amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL,
+ ++++++++++++++++++ amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(rlookup_table_size));
+ if (amd_iommu_pd_table == NULL)
+ goto free;
+
- -------- --------- * memory is allocated now; initialize the device table with all zeroes
- -------- --------- * and let all alias entries point to itself
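+ /* bitmap used to allocate protection domain ids - one bit per possible domain */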
+ ++++++++++++++++++ amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
+ ++++++++++++++++++ GFP_KERNEL | __GFP_ZERO,
+ get_order(MAX_DOMAIN_ID/8));
+ if (amd_iommu_pd_alloc_bitmap == NULL)
+ goto free;
+
+ /*
- -------- --------- memset(amd_iommu_dev_table, 0, dev_table_size);
+ ++++++++++++++++++ * let all alias entries point to themselves
+ */
- -------- --------- memset(amd_iommu_pd_table, 0, rlookup_table_size);
- -------- --------- memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
- -------- ---------
+ for (i = 0; i < amd_iommu_last_bdf; ++i)
+ amd_iommu_alias_table[i] = i;
+
- -------- --------- if (amd_iommu_pd_alloc_bitmap)
- -------- --------- free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
+ /*
+ * never allocate domain 0 because it is used as the non-allocated and
+ * error value placeholder
+ */
+ amd_iommu_pd_alloc_bitmap[0] = 1;
+
+ /*
+ * now that the data structures are allocated and basically
+ * initialized, start the real acpi table scan
+ */
+ ret = -ENODEV;
+ if (acpi_table_parse("IVRS", init_iommu_all) != 0)
+ goto free;
+
+ if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
+ goto free;
+
+ ret = amd_iommu_init_dma_ops();
+ if (ret)
+ goto free;
+
+ ret = sysdev_class_register(&amd_iommu_sysdev_class);
+ if (ret)
+ goto free;
+
+ ret = sysdev_register(&device_amd_iommu);
+ if (ret)
+ goto free;
+
+ enable_iommus();
+
+ printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
+ (1 << (amd_iommu_aperture_order-20)));
+
+ printk(KERN_INFO "AMD IOMMU: device isolation ");
+ if (amd_iommu_isolate)
+ printk("enabled\n");
+ else
+ printk("disabled\n");
+
+ out:
+ return ret;
+
+ free:
- -------- --------- if (amd_iommu_pd_table)
- -------- --------- free_pages((unsigned long)amd_iommu_pd_table,
- -------- --------- get_order(rlookup_table_size));
+ ++++++++++++++++++ free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
+
- -------- --------- if (amd_iommu_rlookup_table)
- -------- --------- free_pages((unsigned long)amd_iommu_rlookup_table,
- -------- --------- get_order(rlookup_table_size));
+ ++++++++++++++++++ free_pages((unsigned long)amd_iommu_pd_table,
+ ++++++++++++++++++ get_order(rlookup_table_size));
+
- -------- --------- if (amd_iommu_alias_table)
- -------- --------- free_pages((unsigned long)amd_iommu_alias_table,
- -------- --------- get_order(alias_table_size));
+ ++++++++++++++++++ free_pages((unsigned long)amd_iommu_rlookup_table,
+ ++++++++++++++++++ get_order(rlookup_table_size));
+
- -------- --------- if (amd_iommu_dev_table)
- -------- --------- free_pages((unsigned long)amd_iommu_dev_table,
- -------- --------- get_order(dev_table_size));
+ ++++++++++++++++++ free_pages((unsigned long)amd_iommu_alias_table,
+ ++++++++++++++++++ get_order(alias_table_size));
+
- -------- --------- if (swiotlb || no_iommu || iommu_detected)
+ ++++++++++++++++++ free_pages((unsigned long)amd_iommu_dev_table,
+ ++++++++++++++++++ get_order(dev_table_size));
+
+ free_iommu_all();
+
+ free_unity_maps();
+
+ goto out;
+ }
+
+ ++++++++++++++++++/****************************************************************************
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * Early detect code. This code runs at IOMMU detection time in the DMA
+ ++++++++++++++++++ * layer. It just checks whether an IVRS ACPI table is present
+ ++++++++++++++++++ * to detect AMD IOMMUs
+ ++++++++++++++++++ *
+ ++++++++++++++++++ ****************************************************************************/
+ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
+ {
+ return 0;
+ }
+
+ void __init amd_iommu_detect(void)
+ {
- -------- --------- for (; *str; ++str) {
- -------- --------- if (strcmp(str, "32M") == 0)
- -------- --------- amd_iommu_aperture_order = 25;
- -------- --------- if (strcmp(str, "64M") == 0)
- -------- --------- amd_iommu_aperture_order = 26;
- -------- --------- if (strcmp(str, "128M") == 0)
- -------- --------- amd_iommu_aperture_order = 27;
- -------- --------- if (strcmp(str, "256M") == 0)
- -------- --------- amd_iommu_aperture_order = 28;
- -------- --------- if (strcmp(str, "512M") == 0)
- -------- --------- amd_iommu_aperture_order = 29;
- -------- --------- if (strcmp(str, "1G") == 0)
- -------- --------- amd_iommu_aperture_order = 30;
- -------- --------- }
+ ++++++++++++++++++ if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
+ return;
+
+ if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
+ iommu_detected = 1;
+ amd_iommu_detected = 1;
+ #ifdef CONFIG_GART_IOMMU
+ gart_iommu_aperture_disabled = 1;
+ gart_iommu_aperture = 0;
+ #endif
+ }
+ }
+
+ ++++++++++++++++++/****************************************************************************
+ ++++++++++++++++++ *
+ ++++++++++++++++++ * Parsing functions for the AMD IOMMU specific kernel command line
+ ++++++++++++++++++ * options.
+ ++++++++++++++++++ *
+ ++++++++++++++++++ ****************************************************************************/
+ ++++++++++++++++++
+ static int __init parse_amd_iommu_options(char *str)
+ {
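+ /* currently "isolate" is the only recognized option */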
+ for (; *str; ++str) {
+ if (strcmp(str, "isolate") == 0)
+ amd_iommu_isolate = 1;
+ }
+
+ return 1;
+ }
+
+ static int __init parse_amd_iommu_size_options(char *str)
+ {
+ ++++++++++++++++++ unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
+ ++++++++++++++++++
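+ ++++++++++++++++++ /* only accept aperture sizes from 32MB (order 25) to 1GB (order 30) */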
+ ++++++++++++++++++ if ((order > 24) && (order < 31))
+ ++++++++++++++++++ amd_iommu_aperture_order = order;
+
+ return 1;
+ }
+
+ __setup("amd_iommu=", parse_amd_iommu_options);
+ __setup("amd_iommu_size=", parse_amd_iommu_size_options);
/*
* Debug level, exported for io_apic.c
*/
-- -----------------int apic_verbosity;
++ +++++++++++++++++unsigned int apic_verbosity;
+
+ int pic_mode;
+
+ /* Have we found an MP table */
+ int smp_found_config;
+
+ static struct resource lapic_resource = {
+ .name = "Local APIC",
+ .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+ };
static unsigned int calibration_result;
if (!local_apic_timer_verify_ok) {
printk(KERN_WARNING
"APIC timer disabled due to verification failure.\n");
++ +++++++++++++++++ return -1;
++ +++++++++++++++++ }
++ +++++++++++++++++
++ +++++++++++++++++ return 0;
++ +++++++++++++++++}
++ +++++++++++++++++
++ +++++++++++++++++/*
++ +++++++++++++++++ * Set up the boot APIC
++ +++++++++++++++++ *
++ +++++++++++++++++ * Calibrate and verify the result.
++ +++++++++++++++++ */
++ +++++++++++++++++void __init setup_boot_APIC_clock(void)
++ +++++++++++++++++{
++ +++++++++++++++++ /*
++ +++++++++++++++++ * The local apic timer can be disabled via the kernel
++ +++++++++++++++++ * commandline or from the CPU detection code. Register the lapic
++ +++++++++++++++++ * timer as a dummy clock event source on SMP systems, so the
++ +++++++++++++++++ * broadcast mechanism is used. On UP systems simply ignore it.
++ +++++++++++++++++ */
++ +++++++++++++++++ if (local_apic_timer_disabled) {
/* No broadcast on UP ! */
-- ----------------- if (num_possible_cpus() == 1)
-- ----------------- return;
-- ----------------- } else {
-- ----------------- /*
-- ----------------- * If nmi_watchdog is set to IO_APIC, we need the
-- ----------------- * PIT/HPET going. Otherwise register lapic as a dummy
-- ----------------- * device.
-- ----------------- */
-- ----------------- if (nmi_watchdog != NMI_IO_APIC)
-- ----------------- lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-- ----------------- else
-- ----------------- printk(KERN_WARNING "APIC timer registered as dummy,"
-- ------- --------- " due to nmi_watchdog=%d!\n", nmi_watchdog);
- " due to nmi_watchdog=1!\n");
++ +++++++++++++++++ if (num_possible_cpus() > 1) {
++ +++++++++++++++++ lapic_clockevent.mult = 1;
++ +++++++++++++++++ setup_APIC_timer();
++ +++++++++++++++++ }
++ +++++++++++++++++ return;
++ + + + }
++ + + +
++ +++++++++++++++++ apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
++ +++++++++++++++++ "calibrating APIC timer ...\n");
++ +++++++++++++++++
++ +++++++++++++++++ if (calibrate_APIC_clock()) {
++ +++++++++++++++++ /* No broadcast on UP ! */
++ +++++++++++++++++ if (num_possible_cpus() > 1)
++ +++++++++++++++++ setup_APIC_timer();
++ +++++++++++++++++ return;
++ +++ ++ + +++ +++ }
++ +++ ++ + +++ +++
++ +++++++++++++++++ /*
++ +++++++++++++++++ * If nmi_watchdog is set to IO_APIC, we need the
++ +++++++++++++++++ * PIT/HPET going. Otherwise register lapic as a dummy
++ +++++++++++++++++ * device.
++ +++++++++++++++++ */
++ +++++++++++++++++ if (nmi_watchdog != NMI_IO_APIC)
++ +++++++++++++++++ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
++ +++++++++++++++++ else
++ +++++++++++++++++ printk(KERN_WARNING "APIC timer registered as dummy,"
++ +++++++++++++++++ " due to nmi_watchdog=%d!\n", nmi_watchdog);
++ +++++++++++++++++
/* Setup the lapic or request the broadcast */
setup_APIC_timer();
}
int __init APIC_init_uniprocessor(void)
{
---- ----- --------- if (disable_apic)
- if (enable_local_apic < 0)
---- --------------- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
---- ---------------
if (!smp_found_config && !cpu_has_apic)
return -1;
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
* IPI, driven by wakeup.
*/
- set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+ alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
/* IPI for invalidation */
- set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
/* IPI for generic function call */
- set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
++ + + +
++ + + + /* IPI for single call function */
++ + + + set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
++ + + + call_function_single_interrupt);
}
#endif
static int __init parse_nolapic(char *arg)
{
- enable_local_apic = -1;
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+ disable_apic = 1;
---- ----- --------- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
++++ +++++++++++++++ setup_clear_cpu_cap(X86_FEATURE_APIC);
return 0;
}
early_param("nolapic", parse_nolapic);
/*
* Debug level, exported for io_apic.c
*/
-- -----------------int apic_verbosity;
++ +++++++++++++++++unsigned int apic_verbosity;
+
+ /* Have we found an MP table */
+ int smp_found_config;
static struct resource lapic_resource = {
.name = "Local APIC",
--- /dev/null
--- ------ ---------#include <linux/string.h>
+ #include <linux/init.h>
+ #include <linux/kernel.h>
+ #include <linux/sched.h>
+ #include <linux/string.h>
+ #include <linux/bootmem.h>
+ #include <linux/bitops.h>
+ #include <linux/module.h>
+ #include <linux/kgdb.h>
+ #include <linux/topology.h>
--- ------ ---------#include <linux/module.h>
+ #include <linux/delay.h>
+ #include <linux/smp.h>
--- ------ ---------#include <asm/processor.h>
+ #include <linux/percpu.h>
- - unsigned int n, dummy, eax, ebx, ecx, edx;
+ #include <asm/i387.h>
+ #include <asm/msr.h>
+ #include <asm/io.h>
+++++++++++++++++++ #include <asm/linkage.h>
+ #include <asm/mmu_context.h>
+ #include <asm/mtrr.h>
+ #include <asm/mce.h>
+ #include <asm/pat.h>
+ #include <asm/numa.h>
+ #ifdef CONFIG_X86_LOCAL_APIC
+ #include <asm/mpspec.h>
+ #include <asm/apic.h>
+ #include <mach_apic.h>
+ #endif
+ #include <asm/pda.h>
+ #include <asm/pgtable.h>
+ #include <asm/processor.h>
+ #include <asm/desc.h>
+ #include <asm/atomic.h>
+ #include <asm/proto.h>
+ #include <asm/sections.h>
+ #include <asm/setup.h>
+ #include <asm/genapic.h>
+
+ #include "cpu.h"
+
+ /* We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ */
+ /* The TLS descriptors are currently at a different place compared to i386.
+ Hopefully nobody expects them at a fixed place (Wine?) */
+ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+ } };
+ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
+ __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
+ /* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+ void switch_to_new_gdt(void)
+ {
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+ }
+
+ struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+
+ static void __cpuinit default_init(struct cpuinfo_x86 *c)
+ {
+ display_cacheinfo(c);
+ }
+
+ static struct cpu_dev __cpuinitdata default_cpu = {
+ .c_init = default_init,
+ .c_vendor = "Unknown",
+ };
+ static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
+ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+ {
+ unsigned int *v;
+
+ if (c->extended_cpuid_level < 0x80000004)
+ return 0;
+
+ v = (unsigned int *) c->x86_model_id;
+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
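+ /* these three leaves return a 48 character brand string; terminate it */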
+ c->x86_model_id[48] = 0;
+ return 1;
+ }
+
+
+ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+ {
- - if (n >= 0x80000008) {
- - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
- - c->x86_virt_bits = (eax >> 8) & 0xff;
- - c->x86_phys_bits = eax & 0xff;
- - }
+ + + unsigned int n, dummy, ebx, ecx, edx;
+
+ n = c->extended_cpuid_level;
+
+ if (n >= 0x80000005) {
+ cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
+ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
+ "D cache %dK (%d bytes/line)\n",
+ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+ c->x86_cache_size = (ecx>>24) + (edx>>24);
+ /* On K8 L1 TLB is inclusive, so don't count it */
+ c->x86_tlbsize = 0;
+ }
+
+ if (n >= 0x80000006) {
+ cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
+ ecx = cpuid_ecx(0x80000006);
+ c->x86_cache_size = ecx >> 16;
+ c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+
+ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+ c->x86_cache_size, ecx & 0xFF);
+ }
---- ----- --------- c->extended_cpuid_level = cpuid_eax(0x80000000);
+ }
+
+ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+ {
+ #ifdef CONFIG_SMP
+ u32 eax, ebx, ecx, edx;
+ int index_msb, core_bits;
+
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+
+ if (!cpu_has(c, X86_FEATURE_HT))
+ return;
+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+ goto out;
+
+ smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+ if (smp_num_siblings == 1) {
+ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
+ } else if (smp_num_siblings > 1) {
+
+ if (smp_num_siblings > NR_CPUS) {
+ printk(KERN_WARNING "CPU: Unsupported number of "
+ "siblings %d", smp_num_siblings);
+ smp_num_siblings = 1;
+ return;
+ }
+
+ index_msb = get_count_order(smp_num_siblings);
+ c->phys_proc_id = phys_pkg_id(index_msb);
+
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+ index_msb = get_count_order(smp_num_siblings);
+
+ core_bits = get_count_order(c->x86_max_cores);
+
+ c->cpu_core_id = phys_pkg_id(index_msb) &
+ ((1 << core_bits) - 1);
+ }
+ out:
+ if ((c->x86_max_cores * smp_num_siblings) > 1) {
+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
+ c->phys_proc_id);
+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
+ c->cpu_core_id);
+ }
+
+ #endif
+ }
+
+ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+ {
+ char *v = c->x86_vendor_id;
+ int i;
+ static int printed;
+
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ if (cpu_devs[i]) {
+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+ (cpu_devs[i]->c_ident[1] &&
+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+ c->x86_vendor = i;
+ this_cpu = cpu_devs[i];
+ return;
+ }
+ }
+ }
+ if (!printed) {
+ printed++;
+ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
+ printk(KERN_ERR "CPU: Your system may be unstable.\n");
+ }
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+ }
+
+ static void __init early_cpu_support_print(void)
+ {
+ int i,j;
+ struct cpu_dev *cpu_devx;
+
+ printk("KERNEL supported cpus:\n");
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ cpu_devx = cpu_devs[i];
+ if (!cpu_devx)
+ continue;
+ for (j = 0; j < 2; j++) {
+ if (!cpu_devx->c_ident[j])
+ continue;
+ printk(" %s %s\n", cpu_devx->c_vendor,
+ cpu_devx->c_ident[j]);
+ }
+ }
+ }
+
+ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
+
+ void __init early_cpu_init(void)
+ {
+ struct cpu_vendor_dev *cvdev;
+
+ for (cvdev = __x86cpuvendor_start ;
+ cvdev < __x86cpuvendor_end ;
+ cvdev++)
+ cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+ early_cpu_support_print();
+ early_identify_cpu(&boot_cpu_data);
+ }
+
+ /* Do some early cpuid on the boot CPU to get some parameters that are
+ needed before check_bugs. Everything advanced is in identify_cpu
+ below. */
+ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
+ {
+ u32 tfms, xlvl;
+
+ c->loops_per_jiffy = loops_per_jiffy;
+ c->x86_cache_size = -1;
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+ c->x86_model = c->x86_mask = 0; /* So far unknown... */
+ c->x86_vendor_id[0] = '\0'; /* Unset */
+ c->x86_model_id[0] = '\0'; /* Unset */
+ c->x86_clflush_size = 64;
+ c->x86_cache_alignment = c->x86_clflush_size;
+ c->x86_max_cores = 1;
+ c->x86_coreid_bits = 0;
+ c->extended_cpuid_level = 0;
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+ /* Get vendor name */
+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+ (unsigned int *)&c->x86_vendor_id[0],
+ (unsigned int *)&c->x86_vendor_id[8],
+ (unsigned int *)&c->x86_vendor_id[4]);
+
+ get_cpu_vendor(c);
+
+ /* Initialize the standard set of capabilities */
+ /* Note that the vendor-specific code below might override */
+
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ __u32 misc;
+ cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
+ &c->x86_capability[0]);
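+ /* decode family, model and stepping from cpuid leaf 1, folding in the extended fields */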
+ c->x86 = (tfms >> 8) & 0xf;
+ c->x86_model = (tfms >> 4) & 0xf;
+ c->x86_mask = tfms & 0xf;
+ if (c->x86 == 0xf)
+ c->x86 += (tfms >> 20) & 0xff;
+ if (c->x86 >= 0x6)
+ c->x86_model += ((tfms >> 16) & 0xF) << 4;
+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+ } else {
+ /* Have CPUID level 0 only - unheard of */
+ c->x86 = 4;
+ }
+
+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
+ #ifdef CONFIG_SMP
+ c->phys_proc_id = c->initial_apicid;
+ #endif
+ /* AMD-defined flags: level 0x80000001 */
+ xlvl = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = xlvl;
+ if ((xlvl & 0xffff0000) == 0x80000000) {
+ if (xlvl >= 0x80000001) {
+ c->x86_capability[1] = cpuid_edx(0x80000001);
+ c->x86_capability[6] = cpuid_ecx(0x80000001);
+ }
+ if (xlvl >= 0x80000004)
+ get_model_name(c); /* Default name */
+ }
+
+ /* Transmeta-defined flags: level 0x80860001 */
+ xlvl = cpuid_eax(0x80860000);
+ if ((xlvl & 0xffff0000) == 0x80860000) {
+ /* Don't set x86_cpuid_level here for now to avoid confusion. */
+ if (xlvl >= 0x80860001)
+ c->x86_capability[2] = cpuid_edx(0x80860001);
+ }
+
- /* Assume all 64-bit CPUs support 32-bit syscall */
- set_cpu_cap(c, X86_FEATURE_SYSCALL32);
+ if (c->extended_cpuid_level >= 0x80000007)
+ c->x86_power = cpuid_edx(0x80000007);
+
------- -- ----- -- /* Assume all 64-bit CPUs support 32-bit syscall */
------- -- ----- -- set_cpu_cap(c, X86_FEATURE_SYSCALL32);
------- -- ----- --
+ + + if (c->extended_cpuid_level >= 0x80000008) {
+ + + u32 eax = cpuid_eax(0x80000008);
+ + +
+ + + c->x86_virt_bits = (eax >> 8) & 0xff;
+ + + c->x86_phys_bits = eax & 0xff;
+ + + }
+ +
---- ----- ---------
---- ----- --------- /* early_param could clear that, but recall get it set again */
---- ----- --------- if (disable_apic)
---- ----- --------- clear_cpu_cap(c, X86_FEATURE_APIC);
+ if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
+ cpu_devs[c->x86_vendor]->c_early_init)
+ cpu_devs[c->x86_vendor]->c_early_init(c);
+
+ validate_pat_support(c);
---------- -------- DEBUG_STKSZ]
---------- -------- __attribute__((section(".bss.page_aligned")));
+ }
+
+ /*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+ {
+ int i;
+
+ early_identify_cpu(c);
+
+ init_scattered_cpuid_features(c);
+
+ c->apicid = phys_pkg_id(0);
+
+ /*
+ * Vendor-specific initialization. In this section we
+ * canonicalize the feature flags, meaning if there are
+ * features a certain CPU supports which CPUID doesn't
+ * tell us, CPUID claiming incorrect flags, or other bugs,
+ * we handle them here.
+ *
+ * At the end of this section, c->x86_capability better
+ * indicate the features this CPU genuinely supports!
+ */
+ if (this_cpu->c_init)
+ this_cpu->c_init(c);
+
+ detect_ht(c);
+
+ /*
+ * On SMP, boot_cpu_data holds the common feature set between
+ * all CPUs; so make sure that we indicate which features are
+ * common between the CPUs. The first time this routine gets
+ * executed, c == &boot_cpu_data.
+ */
+ if (c != &boot_cpu_data) {
+ /* AND the already accumulated flags with these */
+ for (i = 0; i < NCAPINTS; i++)
+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+ }
+
+ /* Clear all flags overridden by options */
+ for (i = 0; i < NCAPINTS; i++)
+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
+
+ #ifdef CONFIG_X86_MCE
+ mcheck_init(c);
+ #endif
+ select_idle_routine(c);
+
+ #ifdef CONFIG_NUMA
+ numa_add_cpu(smp_processor_id());
+ #endif
+
+ }
+
+ void __cpuinit identify_boot_cpu(void)
+ {
+ identify_cpu(&boot_cpu_data);
+ }
+
+ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+ {
+ BUG_ON(c == &boot_cpu_data);
+ identify_cpu(c);
+ mtrr_ap_init();
+ }
+
+ static __init int setup_noclflush(char *arg)
+ {
+ setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+ return 1;
+ }
+ __setup("noclflush", setup_noclflush);
+
+ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+ {
+ if (c->x86_model_id[0])
+ printk(KERN_CONT "%s", c->x86_model_id);
+
+ if (c->x86_mask || c->cpuid_level >= 0)
+ printk(KERN_CONT " stepping %02x\n", c->x86_mask);
+ else
+ printk(KERN_CONT "\n");
+ }
+
+ static __init int setup_disablecpuid(char *arg)
+ {
+ int bit;
+ if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+ setup_clear_cpu_cap(bit);
+ else
+ return 0;
+ return 1;
+ }
+ __setup("clearcpuid=", setup_disablecpuid);
+
+ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
+
+ struct x8664_pda **_cpu_pda __read_mostly;
+ EXPORT_SYMBOL(_cpu_pda);
+
+ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
+
+ char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+
+ unsigned long __supported_pte_mask __read_mostly = ~0UL;
+ EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+ static int do_not_nx __cpuinitdata;
+
+ /* noexec=on|off
+ Control non executable mappings for 64bit processes.
+
+ on Enable (default)
+ off Disable
+ */
+ static int __init nonx_setup(char *str)
+ {
+ if (!str)
+ return -EINVAL;
+ if (!strncmp(str, "on", 2)) {
+ __supported_pte_mask |= _PAGE_NX;
+ do_not_nx = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ do_not_nx = 1;
+ __supported_pte_mask &= ~_PAGE_NX;
+ }
+ return 0;
+ }
+ early_param("noexec", nonx_setup);
+
+ int force_personality32;
+
+ /* noexec32=on|off
+ Control non executable heap for 32bit processes.
+ To control the stack too use noexec=off
+
+ on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
+ off PROT_READ implies PROT_EXEC
+ */
+ static int __init nonx32_setup(char *str)
+ {
+ if (!strcmp(str, "on"))
+ force_personality32 &= ~READ_IMPLIES_EXEC;
+ else if (!strcmp(str, "off"))
+ force_personality32 |= READ_IMPLIES_EXEC;
+ return 1;
+ }
+ __setup("noexec32=", nonx32_setup);
+
+ void pda_init(int cpu)
+ {
+ struct x8664_pda *pda = cpu_pda(cpu);
+
+ /* Set up data that may be needed in __get_free_pages early */
+ loadsegment(fs, 0);
+ loadsegment(gs, 0);
+ /* Memory clobbers used to order PDA accesses */
+ mb();
+ wrmsrl(MSR_GS_BASE, pda);
+ mb();
+
+ pda->cpunumber = cpu;
+ pda->irqcount = -1;
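+ /* top of the current task's kernel stack, adjusted by PDA_STACKOFFSET */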
+ pda->kernelstack = (unsigned long)stack_thread_info() -
+ PDA_STACKOFFSET + THREAD_SIZE;
+ pda->active_mm = &init_mm;
+ pda->mmu_state = 0;
+
+ if (cpu == 0) {
+ /* others are initialized in smpboot.c */
+ pda->pcurrent = &init_task;
+ pda->irqstackptr = boot_cpu_stack;
+ } else {
+ pda->irqstackptr = (char *)
+ __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
+ if (!pda->irqstackptr)
+ panic("cannot allocate irqstack for cpu %d", cpu);
+
+ if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
+ pda->nodenumber = cpu_to_node(cpu);
+ }
+
+ pda->irqstackptr += IRQSTACKSIZE-64;
+ }
+
+ char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
+++++++++++++++++++ DEBUG_STKSZ] __page_aligned_bss;
+
+ extern asmlinkage void ignore_sysret(void);
+
+ /* May not be marked __init: used by software suspend */
+ void syscall_init(void)
+ {
+ /*
+ * LSTAR and STAR live in a somewhat strange symbiosis.
+ * They both write to the same internal register. STAR allows
+ * setting CS/DS, but only a 32bit target. LSTAR sets the 64bit rip.
+ */
+ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
+ wrmsrl(MSR_LSTAR, system_call);
+ wrmsrl(MSR_CSTAR, ignore_sysret);
+
+ #ifdef CONFIG_IA32_EMULATION
+ syscall32_cpu_init();
+ #endif
+
+ /* Flags to clear on syscall */
+ wrmsrl(MSR_SYSCALL_MASK,
+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+ }
+
+ void __cpuinit check_efer(void)
+ {
+ unsigned long efer;
+
+ rdmsrl(MSR_EFER, efer);
+ if (!(efer & EFER_NX) || do_not_nx)
+ __supported_pte_mask &= ~_PAGE_NX;
+ }
+
+ unsigned long kernel_eflags;
+
+ /*
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+ DEFINE_PER_CPU(struct orig_ist, orig_ist);
+
+ /*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init.
+ */
+ void __cpuinit cpu_init(void)
+ {
+ int cpu = stack_smp_processor_id();
+ struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
+ unsigned long v;
+ char *estacks = NULL;
+ struct task_struct *me;
+ int i;
+
+ /* CPU 0 is initialised in head64.c */
+ if (cpu != 0)
+ pda_init(cpu);
+ else
+ estacks = boot_exception_stacks;
+
+ me = current;
+
+ if (cpu_test_and_set(cpu, cpu_initialized))
+ panic("CPU#%d already initialized!\n", cpu);
+
+ printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+ /*
+ * Initialize the per-CPU GDT with the boot GDT,
+ * and set up the GDT descriptor:
+ */
+
+ switch_to_new_gdt();
+ load_idt((const struct desc_ptr *)&idt_descr);
+
+ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
+
+ wrmsrl(MSR_FS_BASE, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ barrier();
+
+ check_efer();
+
+ /*
+ * set up and load the per-CPU TSS
+ */
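+ /* populate the IST entries in the TSS with per-CPU exception stacks */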
+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+ static const unsigned int order[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+ };
+ if (cpu) {
+ estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
+ if (!estacks)
+ panic("Cannot allocate exception stack %ld %d\n",
+ v, cpu);
+ }
+ estacks += PAGE_SIZE << order[v];
+ orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
+ }
+
+ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ /*
+ * <= is required because the CPU will access up to
+ * 8 bits beyond the end of the IO permission bitmap.
+ */
+ for (i = 0; i <= IO_BITMAP_LONGS; i++)
+ t->io_bitmap[i] = ~0UL;
+
+ atomic_inc(&init_mm.mm_count);
+ me->active_mm = &init_mm;
+ if (me->mm)
+ BUG();
+ enter_lazy_tlb(&init_mm, me);
+
+ load_sp0(t, &current->thread);
+ set_tss_desc(cpu, t);
+ load_TR_desc();
+ load_LDT(&init_mm.context);
+
+ #ifdef CONFIG_KGDB
+ /*
+ * If the kgdb is connected no debug regs should be altered. This
+ * is only applicable when KGDB and a KGDB I/O module are built
+ * into the kernel and you are using early debugging with
+ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
+ */
+ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ else {
+ #endif
+ /*
+ * Clear all 6 debug registers:
+ */
+
+ set_debugreg(0UL, 0);
+ set_debugreg(0UL, 1);
+ set_debugreg(0UL, 2);
+ set_debugreg(0UL, 3);
+ set_debugreg(0UL, 6);
+ set_debugreg(0UL, 7);
+ #ifdef CONFIG_KGDB
+ /* If the kgdb is connected no debug regs should be altered. */
+ }
+ #endif
+
+ fpu_init();
+
+ raw_local_save_flags(kernel_eflags);
+
+ if (is_uv_system())
+ uv_cpu_init();
+ }
#include <asm/percpu.h>
#include <asm/dwarf2.h>
#include <asm/processor-flags.h>
- #include "irq_vectors.h"
+ + + #include <asm/ftrace.h>
+ #include <asm/irq_vectors.h>
/*
* We use macros for low-level operations which need to be overridden
ENTRY(xen_sysenter_target)
RING0_INT_FRAME
addl $5*4, %esp /* remove xen-provided frame */
++++++++++++++++++ + CFI_ADJUST_CFA_OFFSET -5*4
jmp sysenter_past_esp
+ + CFI_ENDPROC
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
GET_THREAD_INFO(%rcx)
---------- ---- ---- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
---------- ---- ---- TI_flags(%rcx)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
+++++++++++++++ ++++ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
jnz tracesys
cmpq $__NR_syscall_max,%rax
ja badsys
--- /dev/null
-- - - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
+ /*
+ * NMI watchdog support on APIC systems
+ *
+ * Started by Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes:
+ * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
+ * Mikael Pettersson : Power Management for local APIC NMI watchdog.
+ * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
+ * Pavel Machek and
+ * Mikael Pettersson : PM converted to driver model. Disable/enable API.
+ */
+
+ #include <asm/apic.h>
+
+ #include <linux/nmi.h>
+ #include <linux/mm.h>
+ #include <linux/delay.h>
+ #include <linux/interrupt.h>
+ #include <linux/module.h>
+ #include <linux/sysdev.h>
+ #include <linux/sysctl.h>
+ #include <linux/percpu.h>
+ #include <linux/kprobes.h>
+ #include <linux/cpumask.h>
+ #include <linux/kernel_stat.h>
+ #include <linux/kdebug.h>
+ #include <linux/smp.h>
+
+ #include <asm/i8259.h>
+ #include <asm/io_apic.h>
+ #include <asm/smp.h>
+ #include <asm/nmi.h>
+ #include <asm/proto.h>
+ #include <asm/timer.h>
+
+ #include <asm/mce.h>
+
+ #include <mach_traps.h>
+
+ int unknown_nmi_panic;
+ int nmi_watchdog_enabled;
+
+ static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
+ /* nmi_active:
+ * >0: the lapic NMI watchdog is active, but can be disabled
+ * <0: the lapic NMI watchdog has not been set up, and cannot
+ * be enabled
+ * 0: the lapic NMI watchdog is disabled, but can be enabled
+ */
+ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
+ EXPORT_SYMBOL(nmi_active);
+
+ unsigned int nmi_watchdog = NMI_NONE;
+ EXPORT_SYMBOL(nmi_watchdog);
+
+ static int panic_on_timeout;
+
+ static unsigned int nmi_hz = HZ;
+ static DEFINE_PER_CPU(short, wd_enabled);
+ static int endflag __initdata;
+
+ static inline unsigned int get_nmi_count(int cpu)
+ {
+ #ifdef CONFIG_X86_64
+ return cpu_pda(cpu)->__nmi_count;
+ #else
+ return nmi_count(cpu);
+ #endif
+ }
+
+ static inline int mce_in_progress(void)
+ {
+ #if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+ return atomic_read(&mce_entry) > 0;
+ #endif
+ return 0;
+ }
+
+ /*
+ * Take the local apic timer and PIT/HPET into account. We don't
+ * know which one is active, when we have highres/dyntick on
+ */
+ static inline unsigned int get_timer_irqs(int cpu)
+ {
+ #ifdef CONFIG_X86_64
+ return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
+ #else
+ return per_cpu(irq_stat, cpu).apic_timer_irqs +
+ per_cpu(irq_stat, cpu).irq0_irqs;
+ #endif
+ }
+
+ #ifdef CONFIG_SMP
+ /*
+ * The performance counters used by NMI_LOCAL_APIC don't trigger when
+ * the CPU is idle. To make sure the NMI watchdog really ticks on all
+ * CPUs during the test, make them busy.
+ */
+ static __init void nmi_cpu_busy(void *data)
+ {
+ local_irq_enable_in_hardirq();
+ /*
+ * Intentionally don't use cpu_relax here. This is
+ * to make sure that the performance counter really ticks,
+ * even if there is a simulator or similar that catches the
+ * pause instruction. On a real HT machine this is fine because
+ * all other CPUs are busy with "useless" delay loops and don't
+ * care if they get somewhat fewer cycles.
+ */
+ while (endflag == 0)
+ mb();
+ }
+ #endif
+
+ int __init check_nmi_watchdog(void)
+ {
+ unsigned int *prev_nmi_count;
+ int cpu;
+
+ if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
+ return 0;
+
+ prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
+ if (!prev_nmi_count)
+ goto error;
+
+ printk(KERN_INFO "Testing NMI watchdog ... ");
+
+ #ifdef CONFIG_SMP
+ if (nmi_watchdog == NMI_LOCAL_APIC)
-- ------- --------- apic_write_around(APIC_LVT0, APIC_DM_NMI);
++ + + + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
+ #endif
+
+ for_each_possible_cpu(cpu)
+ prev_nmi_count[cpu] = get_nmi_count(cpu);
+ local_irq_enable();
+ mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
+
+ for_each_online_cpu(cpu) {
+ if (!per_cpu(wd_enabled, cpu))
+ continue;
+ if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
+ printk(KERN_WARNING "WARNING: CPU#%d: NMI "
+ "appears to be stuck (%d->%d)!\n",
+ cpu,
+ prev_nmi_count[cpu],
+ get_nmi_count(cpu));
+ per_cpu(wd_enabled, cpu) = 0;
+ atomic_dec(&nmi_active);
+ }
+ }
+ endflag = 1;
+ if (!atomic_read(&nmi_active)) {
+ kfree(prev_nmi_count);
+ atomic_set(&nmi_active, -1);
+ goto error;
+ }
+ printk("OK.\n");
+
+ /*
+ * now that we know it works we can reduce NMI frequency to
+ * something more reasonable; makes a difference in some configs
+ */
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ nmi_hz = lapic_adjust_nmi_hz(1);
+
+ kfree(prev_nmi_count);
+ return 0;
+ error:
+ if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
+ disable_8259A_irq(0);
+ + #ifdef CONFIG_X86_32
+ + timer_ack = 0;
+ + #endif
+ return -1;
+ }
+
+ static int __init setup_nmi_watchdog(char *str)
+ {
+ unsigned int nmi;
+
+ if (!strncmp(str, "panic", 5)) {
+ panic_on_timeout = 1;
+ str = strchr(str, ',');
+ if (!str)
+ return 1;
+ ++str;
+ }
+
+ get_option(&str, &nmi);
+
+ if (nmi >= NMI_INVALID)
+ return 0;
+
+ nmi_watchdog = nmi;
+ return 1;
+ }
+ __setup("nmi_watchdog=", setup_nmi_watchdog);
+
+ /*
+ * Suspend/resume support
+ */
+ #ifdef CONFIG_PM
+
+ static int nmi_pm_active; /* nmi_active before suspend */
+
+ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
+ {
+ /* only CPU0 goes here, other CPUs should be offline */
+ nmi_pm_active = atomic_read(&nmi_active);
+ stop_apic_nmi_watchdog(NULL);
+ BUG_ON(atomic_read(&nmi_active) != 0);
+ return 0;
+ }
+
+ static int lapic_nmi_resume(struct sys_device *dev)
+ {
+ /* only CPU0 goes here, other CPUs should be offline */
+ if (nmi_pm_active > 0) {
+ setup_apic_nmi_watchdog(NULL);
+ touch_nmi_watchdog();
+ }
+ return 0;
+ }
+
+ static struct sysdev_class nmi_sysclass = {
+ .name = "lapic_nmi",
+ .resume = lapic_nmi_resume,
+ .suspend = lapic_nmi_suspend,
+ };
+
+ static struct sys_device device_lapic_nmi = {
+ .id = 0,
+ .cls = &nmi_sysclass,
+ };
+
+ static int __init init_lapic_nmi_sysfs(void)
+ {
+ int error;
+
+ /*
+ * should really be a BUG_ON but b/c this is an
+ * init call, it just doesn't work. -dcz
+ */
+ if (nmi_watchdog != NMI_LOCAL_APIC)
+ return 0;
+
+ if (atomic_read(&nmi_active) < 0)
+ return 0;
+
+ error = sysdev_class_register(&nmi_sysclass);
+ if (!error)
+ error = sysdev_register(&device_lapic_nmi);
+ return error;
+ }
+
+ /* must come after the local APIC's device_initcall() */
+ late_initcall(init_lapic_nmi_sysfs);
+
+ #endif /* CONFIG_PM */
+
+ static void __acpi_nmi_enable(void *__unused)
+ {
-- - - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
++ +++++++++++++++++ apic_write(APIC_LVT0, APIC_DM_NMI);
+ }
+
+ /*
+ * Enable timer based NMIs on all CPUs:
+ */
+ void acpi_nmi_enable(void)
+ {
+ if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-- ------- --------- apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
++ + + + on_each_cpu(__acpi_nmi_enable, NULL, 1);
+ }
+
+ static void __acpi_nmi_disable(void *__unused)
+ {
-- - - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
++ +++++++++++++++++ apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+ }
+
+ /*
+ * Disable timer based NMIs on all CPUs:
+ */
+ void acpi_nmi_disable(void)
+ {
+ if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
++ + + + on_each_cpu(__acpi_nmi_disable, NULL, 1);
+ }
+
+ void setup_apic_nmi_watchdog(void *unused)
+ {
+ if (__get_cpu_var(wd_enabled))
+ return;
+
+ /* cheap hack to support suspend/resume */
+ /* if cpu0 is not active, the other cpus should not be either */
+ if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
+ return;
+
+ switch (nmi_watchdog) {
+ case NMI_LOCAL_APIC:
+ /* enable it before to avoid race with handler */
+ __get_cpu_var(wd_enabled) = 1;
+ if (lapic_watchdog_init(nmi_hz) < 0) {
+ __get_cpu_var(wd_enabled) = 0;
+ return;
+ }
+ /* FALL THROUGH */
+ case NMI_IO_APIC:
+ __get_cpu_var(wd_enabled) = 1;
+ atomic_inc(&nmi_active);
+ }
+ }
+
+ void stop_apic_nmi_watchdog(void *unused)
+ {
+ /* only support LOCAL and IO APICs for now */
+ if (!nmi_watchdog_active())
+ return;
+ if (__get_cpu_var(wd_enabled) == 0)
+ return;
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ lapic_watchdog_stop();
+ __get_cpu_var(wd_enabled) = 0;
+ atomic_dec(&nmi_active);
+ }
+
+ /*
+ * the best way to detect whether a CPU has a 'hard lockup' problem
+ * is to check its local APIC timer IRQ counts. If they are not
+ * changing then that CPU has some problem.
+ *
+ * as these watchdog NMI IRQs are generated on every CPU, we only
+ * have to check the current processor.
+ *
+ * since NMIs don't listen to _any_ locks, we have to be extremely
+ * careful not to rely on unsafe variables. The printk might lock
+ * up though, so we have to break up any console locks first ...
+ * [when there will be more tty-related locks, break them up here too!]
+ */
+
+ static DEFINE_PER_CPU(unsigned, last_irq_sum);
+ static DEFINE_PER_CPU(local_t, alert_counter);
+ static DEFINE_PER_CPU(int, nmi_touch);
+
+ void touch_nmi_watchdog(void)
+ {
+ if (nmi_watchdog_active()) {
+ unsigned cpu;
+
+ /*
+ * Tell other CPUs to reset their alert counters. We cannot
+ * do it ourselves because the alert count increase is not
+ * atomic.
+ */
+ for_each_present_cpu(cpu) {
+ if (per_cpu(nmi_touch, cpu) != 1)
+ per_cpu(nmi_touch, cpu) = 1;
+ }
+ }
+
+ /*
+ * Tickle the softlockup detector too:
+ */
+ touch_softlockup_watchdog();
+ }
+ EXPORT_SYMBOL(touch_nmi_watchdog);
+
+ notrace __kprobes int
+ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
+ {
+ /*
+ * Since current_thread_info()-> is always on the stack, and we
+ * always switch the stack NMI-atomically, it's safe to use
+ * smp_processor_id().
+ */
+ unsigned int sum;
+ int touched = 0;
+ int cpu = smp_processor_id();
+ int rc = 0;
+
+ /* check for other users first */
+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP) {
+ rc = 1;
+ touched = 1;
+ }
+
+ sum = get_timer_irqs(cpu);
+
+ if (__get_cpu_var(nmi_touch)) {
+ __get_cpu_var(nmi_touch) = 0;
+ touched = 1;
+ }
+
+ if (cpu_isset(cpu, backtrace_mask)) {
+ static DEFINE_SPINLOCK(lock); /* Serialise the printks */
+
+ spin_lock(&lock);
+ printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+ dump_stack();
+ spin_unlock(&lock);
+ cpu_clear(cpu, backtrace_mask);
+ }
+
+ /* Could check oops_in_progress here too, but it's safer not to */
+ if (mce_in_progress())
+ touched = 1;
+
+ /* if none of the timers is firing, this cpu isn't doing much */
+ if (!touched && __get_cpu_var(last_irq_sum) == sum) {
+ /*
+ * Ayiee, looks like this CPU is stuck ...
+ * wait a few IRQs (5 seconds) before doing the oops ...
+ */
+ local_inc(&__get_cpu_var(alert_counter));
+ if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz)
+ /*
+ * die_nmi will return ONLY if NOTIFY_STOP happens..
+ */
+ die_nmi("BUG: NMI Watchdog detected LOCKUP",
+ regs, panic_on_timeout);
+ } else {
+ __get_cpu_var(last_irq_sum) = sum;
+ local_set(&__get_cpu_var(alert_counter), 0);
+ }
+
+ /* see if the nmi watchdog went off */
+ if (!__get_cpu_var(wd_enabled))
+ return rc;
+ switch (nmi_watchdog) {
+ case NMI_LOCAL_APIC:
+ rc |= lapic_wd_event(nmi_hz);
+ break;
+ case NMI_IO_APIC:
+ /*
+ * don't know how to accurately check for this.
+ * just assume it was a watchdog timer interrupt
+ * This matches the old behaviour.
+ */
+ rc = 1;
+ break;
+ }
+ return rc;
+ }
+
+ #ifdef CONFIG_SYSCTL
+
+++++++++++ ++++++++static int __init setup_unknown_nmi_panic(char *str)
+++++++++++ ++++++++{
+++++++++++ ++++++++ unknown_nmi_panic = 1;
+++++++++++ ++++++++ return 1;
+++++++++++ ++++++++}
+++++++++++ ++++++++__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+++++++++++ ++++++++
+ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
+ {
+ unsigned char reason = get_nmi_reason();
+ char buf[64];
+
+ sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+ die_nmi(buf, regs, 1); /* Always panic here */
+ return 0;
+ }
+
+ /*
+ * proc handler for /proc/sys/kernel/nmi
+ */
+ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
+ void __user *buffer, size_t *length, loff_t *ppos)
+ {
+ int old_state;
+
+ nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
+ old_state = nmi_watchdog_enabled;
+ proc_dointvec(table, write, file, buffer, length, ppos);
+ if (!!old_state == !!nmi_watchdog_enabled)
+ return 0;
+
+ if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
+ printk(KERN_WARNING
+ "NMI watchdog is permanently disabled\n");
+ return -EIO;
+ }
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ if (nmi_watchdog_enabled)
+ enable_lapic_nmi_watchdog();
+ else
+ disable_lapic_nmi_watchdog();
+ } else {
+ printk(KERN_WARNING
+ "NMI watchdog doesn't know what hardware to touch\n");
+ return -EIO;
+ }
+ return 0;
+ }
+
+ #endif /* CONFIG_SYSCTL */
+
+ int do_nmi_callback(struct pt_regs *regs, int cpu)
+ {
+ #ifdef CONFIG_SYSCTL
+ if (unknown_nmi_panic)
+ return unknown_nmi_panic_callback(regs, cpu);
+ #endif
+ return 0;
+ }
+
+ void __trigger_all_cpu_backtrace(void)
+ {
+ int i;
+
+ backtrace_mask = cpu_online_map;
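+ /* each cpu clears its bit from nmi_watchdog_tick() once it has dumped its stack */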
+ /* Wait for up to 10 seconds for all CPUs to do the backtrace */
+ for (i = 0; i < 10 * 1000; i++) {
+ if (cpus_empty(backtrace_mask))
+ break;
+ mdelay(1);
+ }
+ }
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/arch_hooks.h>
+++++++++++++++++++ #include <asm/pgtable.h>
#include <asm/time.h>
+ #include <asm/pgalloc.h>
#include <asm/irq.h>
#include <asm/delay.h>
#include <asm/fixmap.h>
#include <asm/proto.h>
#include <asm/dma.h>
------- ------------#include <asm/gart.h>
+++++++ ++++++++++++#include <asm/iommu.h>
#include <asm/calgary.h>
+ #include <asm/amd_iommu.h>
--- ----------------int forbid_dac __read_mostly;
--- ----------------EXPORT_SYMBOL(forbid_dac);
+++ ++++++++++++++++static int forbid_dac __read_mostly;
const struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
detect_intel_iommu();
- #ifdef CONFIG_SWIOTLB
+ amd_iommu_detect();
+
------- -- ---------#ifdef CONFIG_SWIOTLB
pci_swiotlb_init();
------- ------------#endif
}
#endif
intel_iommu_init();
- #ifdef CONFIG_GART_IOMMU
+ amd_iommu_init();
+
------- -- ---------#ifdef CONFIG_GART_IOMMU
gart_iommu_init();
------- ------------#endif
no_iommu_init();
return 0;
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
+ #include <linux/clockchips.h>
+++ + ++ + #include <asm/system.h>
+++ + ++ +
+++ + ++ + unsigned long idle_halt;
+++ + ++ + EXPORT_SYMBOL(idle_halt);
+++ + ++ + unsigned long idle_nomwait;
+++ + ++ + EXPORT_SYMBOL(idle_nomwait);
struct kmem_cache *task_xstate_cachep;
+++++ ++++++++++++++static int force_mwait __cpuinitdata;
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
*
* idle=mwait overrides this decision and forces the usage of mwait.
*/
+++ ++++++++++++++++static int __cpuinitdata force_mwait;
+
+ #define MWAIT_INFO 0x05
+ #define MWAIT_ECX_EXTENDED_INFO 0x01
+ #define MWAIT_EDX_C1 0xf0
+
static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
{
+ u32 eax, ebx, ecx, edx;
+
if (force_mwait)
return 1;
- #include <linux/kernel.h>
+ /*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ * Memory region support
+ * David Parsons <orc@pell.chi.il.us>, July-August 1999
+ *
+ * Added E820 sanitization routine (removes overlapping memory regions);
+ * Brian Moyle <bmoyle@mvista.com>, February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ * Patrick Mochel <mochel@osdl.org>, March 2002
+ *
+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach <xela@slit.de>, December 2002.
+ *
+ */
+
+ /*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+ #include <linux/mmzone.h>
+ #include <linux/screen_info.h>
+ #include <linux/ioport.h>
+ #include <linux/acpi.h>
+ #include <linux/apm_bios.h>
+ #include <linux/initrd.h>
+ #include <linux/bootmem.h>
+ #include <linux/seq_file.h>
+ #include <linux/console.h>
+ #include <linux/mca.h>
+ #include <linux/root_dev.h>
+ #include <linux/highmem.h>
#include <linux/module.h>
+ #include <linux/efi.h>
#include <linux/init.h>
- #include <linux/bootmem.h>
+ #include <linux/edd.h>
+ #include <linux/iscsi_ibft.h>
+ #include <linux/nodemask.h>
+ #include <linux/kexec.h>
+ #include <linux/dmi.h>
+ #include <linux/pfn.h>
+ #include <linux/pci.h>
+ #include <asm/pci-direct.h>
+ #include <linux/init_ohci1394_dma.h>
+ #include <linux/kvm_para.h>
+
+ #include <linux/errno.h>
+ #include <linux/kernel.h>
+ #include <linux/stddef.h>
+ #include <linux/unistd.h>
+ #include <linux/ptrace.h>
+ #include <linux/slab.h>
+ #include <linux/user.h>
+ #include <linux/delay.h>
--- ------ ---------#include <linux/highmem.h>
+
+ #include <linux/kallsyms.h>
--- ------ ---------#include <linux/edd.h>
--- ------ ---------#include <linux/iscsi_ibft.h>
--- ------ ---------#include <linux/kexec.h>
+ #include <linux/cpufreq.h>
+ #include <linux/dma-mapping.h>
+ #include <linux/ctype.h>
+ #include <linux/uaccess.h>
+
#include <linux/percpu.h>
- #include <asm/smp.h>
- #include <asm/percpu.h>
+ #include <linux/crash_dump.h>
+
+ #include <video/edid.h>
+
+ #include <asm/mtrr.h>
+ #include <asm/apic.h>
+ #include <asm/e820.h>
+ #include <asm/mpspec.h>
+ #include <asm/setup.h>
+ #include <asm/arch_hooks.h>
+ #include <asm/efi.h>
#include <asm/sections.h>
+ #include <asm/dmi.h>
+ #include <asm/io_apic.h>
+ #include <asm/ist.h>
+ #include <asm/vmi.h>
+ #include <setup_arch.h>
+ #include <asm/bios_ebda.h>
+ #include <asm/cacheflush.h>
#include <asm/processor.h>
- #include <asm/setup.h>
+ #include <asm/bugs.h>
+
+ #include <asm/system.h>
+ #include <asm/vsyscall.h>
+ #include <asm/smp.h>
+ #include <asm/desc.h>
+ #include <asm/dma.h>
------- -- ---------#include <asm/gart.h>
+++++++ ++++++++++++#include <asm/iommu.h>
+ #include <asm/mmu_context.h>
+ #include <asm/proto.h>
+
+ #include <mach_apic.h>
+ #include <asm/paravirt.h>
+
+ #include <asm/percpu.h>
--- ------ ---------#include <asm/sections.h>
#include <asm/topology.h>
- #include <asm/mpspec.h>
#include <asm/apicdef.h>
+ #ifdef CONFIG_X86_64
+ #include <asm/numa_64.h>
+ #endif
- #ifdef CONFIG_X86_LOCAL_APIC
- unsigned int num_processors;
- unsigned disabled_cpus __cpuinitdata;
- /* Processor that is doing the boot up */
- unsigned int boot_cpu_physical_apicid = -1U;
- EXPORT_SYMBOL(boot_cpu_physical_apicid);
+ #ifndef ARCH_SETUP
+ #define ARCH_SETUP
+ #endif
- DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
- EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+ #ifndef CONFIG_DEBUG_BOOT_PARAMS
+ struct boot_params __initdata boot_params;
+ #else
+ struct boot_params boot_params;
+ #endif
- /* Bitmask of physically existing CPUs */
- physid_mask_t phys_cpu_present_map;
+ /*
+ * Machine setup..
+ */
+ static struct resource data_resource = {
+ .name = "Kernel data",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ };
+
+ static struct resource code_resource = {
+ .name = "Kernel code",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ };
+
+ static struct resource bss_resource = {
+ .name = "Kernel bss",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ };
+
+
+ #ifdef CONFIG_X86_32
+ /* This value is set up by the early boot code to point to the value
+ immediately after the boot time page tables. It contains a *physical*
+ address, and must not be in the .bss segment! */
+ unsigned long init_pg_tables_start __initdata = ~0UL;
+ unsigned long init_pg_tables_end __initdata = ~0UL;
+
+ static struct resource video_ram_resource = {
+ .name = "Video RAM area",
+ .start = 0xa0000,
+ .end = 0xbffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ };
+
+ /* cpu data as detected by the assembly code in head.S */
+ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
+ /* common cpu data for all cpus */
+ struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
+ EXPORT_SYMBOL(boot_cpu_data);
+ static void set_mca_bus(int x)
+ {
+ #ifdef CONFIG_MCA
+ MCA_bus = x;
+ #endif
+ }
+
+ unsigned int def_to_bigsmp;
+
+ /* for MCA, but anyone else can use it if they want */
+ unsigned int machine_id;
+ unsigned int machine_submodel_id;
+ unsigned int BIOS_revision;
+
+ struct apm_info apm_info;
+ EXPORT_SYMBOL(apm_info);
+
+ #if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
+ defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+ struct ist_info ist_info;
+ EXPORT_SYMBOL(ist_info);
+ #else
+ struct ist_info ist_info;
#endif
- #if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
+ #else
+ struct cpuinfo_x86 boot_cpu_data __read_mostly;
+ EXPORT_SYMBOL(boot_cpu_data);
+ #endif
+
+
+ #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+ unsigned long mmu_cr4_features;
+ #else
+ unsigned long mmu_cr4_features = X86_CR4_PAE;
+ #endif
+
+ /* Boot loader ID as an integer, for the benefit of proc_dointvec */
+ int bootloader_type;
+
/*
- * Copy data used in early init routines from the initial arrays to the
- * per cpu data areas. These arrays then become expendable and the
- * *_early_ptr's are zeroed indicating that the static arrays are gone.
+ * Early DMI memory
*/
- static void __init setup_per_cpu_maps(void)
+ int dmi_alloc_index;
+ char dmi_alloc_data[DMI_MAX_DATA];
+
+ /*
+ * Setup options
+ */
+ struct screen_info screen_info;
+ EXPORT_SYMBOL(screen_info);
+ struct edid_info edid_info;
+ EXPORT_SYMBOL_GPL(edid_info);
+
+ extern int root_mountflags;
+
+ unsigned long saved_video_mode;
+
+ #define RAMDISK_IMAGE_START_MASK 0x07FF
+ #define RAMDISK_PROMPT_FLAG 0x8000
+ #define RAMDISK_LOAD_FLAG 0x4000
+
+ static char __initdata command_line[COMMAND_LINE_SIZE];
+
+ #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+ struct edd edd;
+ #ifdef CONFIG_EDD_MODULE
+ EXPORT_SYMBOL(edd);
+ #endif
+ /**
+ * copy_edd() - Copy the BIOS EDD information
+ * from boot_params into a safe place.
+ *
+ */
+ static inline void copy_edd(void)
+ {
+ memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
+ sizeof(edd.mbr_signature));
+ memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
+ edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
+ edd.edd_info_nr = boot_params.eddbuf_entries;
+ }
+ #else
+ static inline void copy_edd(void)
+ {
+ }
+ #endif
+
+ #ifdef CONFIG_BLK_DEV_INITRD
+
+ #ifdef CONFIG_X86_32
+
+ #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
+ static void __init relocate_initrd(void)
{
- int cpu;
- for_each_possible_cpu(cpu) {
- per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
- per_cpu(x86_bios_cpu_apicid, cpu) =
- x86_bios_cpu_apicid_init[cpu];
- #ifdef CONFIG_NUMA
- per_cpu(x86_cpu_to_node_map, cpu) =
- x86_cpu_to_node_map_init[cpu];
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ u64 ramdisk_here;
+ unsigned long slop, clen, mapaddr;
+ char *p, *q;
+
+ /* We need to move the initrd down into lowmem */
+ ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
+ PAGE_SIZE);
+
+ if (ramdisk_here == -1ULL)
+ panic("Cannot find place for new RAMDISK of size %lld\n",
+ ramdisk_size);
+
+ /* Note: this includes all the lowmem currently occupied by
+ the initrd; we rely on that fact to keep the data intact. */
+ reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
+ "NEW RAMDISK");
+ initrd_start = ramdisk_here + PAGE_OFFSET;
+ initrd_end = initrd_start + ramdisk_size;
+ printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
+ ramdisk_here, ramdisk_here + ramdisk_size);
+
+ q = (char *)initrd_start;
+
+ /* Copy any lowmem portion of the initrd */
+ if (ramdisk_image < end_of_lowmem) {
+ clen = end_of_lowmem - ramdisk_image;
+ p = (char *)__va(ramdisk_image);
+ memcpy(q, p, clen);
+ q += clen;
+ ramdisk_image += clen;
+ ramdisk_size -= clen;
+ }
+
+ /* Copy the highmem portion of the initrd */
+ while (ramdisk_size) {
+ slop = ramdisk_image & ~PAGE_MASK;
+ clen = ramdisk_size;
+ if (clen > MAX_MAP_CHUNK-slop)
+ clen = MAX_MAP_CHUNK-slop;
+ mapaddr = ramdisk_image & PAGE_MASK;
+ p = early_ioremap(mapaddr, clen+slop);
+ memcpy(q, p+slop, clen);
+ early_iounmap(p, clen+slop);
+ q += clen;
+ ramdisk_image += clen;
+ ramdisk_size -= clen;
+ }
+ /* high pages are not converted by early_res_to_bootmem */
+ ramdisk_image = boot_params.hdr.ramdisk_image;
+ ramdisk_size = boot_params.hdr.ramdisk_size;
+ printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
+ " %08llx - %08llx\n",
+ ramdisk_image, ramdisk_image + ramdisk_size - 1,
+ ramdisk_here, ramdisk_here + ramdisk_size - 1);
+ }
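
The highmem loop above copies the image in bounded chunks because only MAX_MAP_CHUNK bytes can be mapped at a time, each mapping must start on a page boundary, and the offset within the first page ("slop") has to be subtracted from the usable chunk. A userspace sketch of that chunking arithmetic, assuming a plain memcpy as a stand-in for early_ioremap():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define MAX_MAP_CHUNK	(16 * PAGE_SIZE)	/* arbitrary mapping window */

/* Copy 'size' bytes starting at offset 'src' of 'pool' into 'dst',
 * never "mapping" more than MAX_MAP_CHUNK at once and always starting
 * a window on a page boundary, like the highmem loop above. */
static void chunked_copy(char *dst, const char *pool, unsigned long src,
			 unsigned long size)
{
	while (size) {
		unsigned long slop = src & ~PAGE_MASK;	/* offset in first page */
		unsigned long clen = size;
		const char *window;

		if (clen > MAX_MAP_CHUNK - slop)
			clen = MAX_MAP_CHUNK - slop;
		window = pool + (src & PAGE_MASK);	/* stand-in for early_ioremap() */
		memcpy(dst, window + slop, clen);
		dst  += clen;
		src  += clen;
		size -= clen;
	}
}

int main(void)
{
	unsigned long n = 200000, start = 12345;	/* deliberately unaligned */
	char *pool = malloc(start + n), *dst = malloc(n);
	unsigned long i;

	for (i = 0; i < start + n; i++)
		pool[i] = (char)i;
	chunked_copy(dst, pool, start, n);
	printf("copy %s\n", memcmp(dst, pool + start, n) ? "differs" : "matches");
	free(pool);
	free(dst);
	return 0;
}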
#endif
+
+ static void __init reserve_initrd(void)
+ {
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_end = ramdisk_image + ramdisk_size;
+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+
+ if (!boot_params.hdr.type_of_loader ||
+ !ramdisk_image || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
+
+ initrd_start = 0;
+
+ if (ramdisk_size >= (end_of_lowmem>>1)) {
+ free_early(ramdisk_image, ramdisk_end);
+ printk(KERN_ERR "initrd too large to handle, "
+ "disabling initrd\n");
+ return;
+ }
+
+ printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
+ ramdisk_end);
+
+
+ if (ramdisk_end <= end_of_lowmem) {
+ /* All in lowmem, easy case */
+ /*
+ * don't need to reserve again, already reserved early
+ * in i386_start_kernel
+ */
+ initrd_start = ramdisk_image + PAGE_OFFSET;
+ initrd_end = initrd_start + ramdisk_size;
+ return;
}
- /* indicate the early static arrays will soon be gone */
- x86_cpu_to_apicid_early_ptr = NULL;
- x86_bios_cpu_apicid_early_ptr = NULL;
- #ifdef CONFIG_NUMA
- x86_cpu_to_node_map_early_ptr = NULL;
+ #ifdef CONFIG_X86_32
+ relocate_initrd();
+ #else
+ printk(KERN_ERR "initrd extends beyond end of memory "
+ "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
+ ramdisk_end, end_of_lowmem);
+ initrd_start = 0;
#endif
+ free_early(ramdisk_image, ramdisk_end);
}
+ #else
+ static void __init reserve_initrd(void)
+ {
+ }
+ #endif /* CONFIG_BLK_DEV_INITRD */
+
+ static void __init parse_setup_data(void)
+ {
+ struct setup_data *data;
+ u64 pa_data;
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, PAGE_SIZE);
+ switch (data->type) {
+ case SETUP_E820_EXT:
+ parse_e820_ext(data, pa_data);
+ break;
+ default:
+ break;
+ }
+ pa_data = data->next;
+ early_iounmap(data, PAGE_SIZE);
+ }
+ }
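
parse_setup_data() walks a list whose nodes are chained by physical address in data->next, temporarily mapping each node before reading it. A sketch of the same traversal against a flat buffer, where a "physical address" is just an offset into that buffer (the struct layout below is illustrative, not the real struct setup_data):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Illustrative node layout: offset-chained records in one flat buffer. */
struct node {
	uint32_t type;
	uint32_t len;
	uint64_t next;		/* "physical address" = offset into the buffer */
};

static unsigned char buf[4096];

static uint64_t add_node(uint64_t at, uint32_t type, uint64_t next)
{
	struct node n = { .type = type, .len = 0, .next = next };

	memcpy(buf + at, &n, sizeof(n));
	return at;
}

int main(void)
{
	/* Build a three-node chain: 100 -> 400 -> 900 -> end. */
	uint64_t head = add_node(100, 1, add_node(400, 2, add_node(900, 3, 0)));
	uint64_t pa = head;

	while (pa) {
		struct node n;

		memcpy(&n, buf + pa, sizeof(n));	/* stand-in for early_ioremap() */
		printf("node at %llu, type %u\n",
		       (unsigned long long)pa, (unsigned)n.type);
		pa = n.next;				/* follow the chain */
	}
	return 0;
}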
+
+ static void __init e820_reserve_setup_data(void)
+ {
+ struct setup_data *data;
+ u64 pa_data;
+ int found = 0;
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, sizeof(*data));
+ e820_update_range(pa_data, sizeof(*data)+data->len,
+ E820_RAM, E820_RESERVED_KERN);
+ found = 1;
+ pa_data = data->next;
+ early_iounmap(data, sizeof(*data));
+ }
+ if (!found)
+ return;
- #ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
- cpumask_t *cpumask_of_cpu_map __read_mostly;
- EXPORT_SYMBOL(cpumask_of_cpu_map);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ memcpy(&e820_saved, &e820, sizeof(struct e820map));
+ printk(KERN_INFO "extended physical RAM map:\n");
+ e820_print_map("reserve setup_data");
+ }
- /* requires nr_cpu_ids to be initialized */
- static void __init setup_cpumask_of_cpu(void)
+ static void __init reserve_early_setup_data(void)
{
- int i;
+ struct setup_data *data;
+ u64 pa_data;
+ char buf[32];
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, sizeof(*data));
+ sprintf(buf, "setup data %x", data->type);
+ reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ pa_data = data->next;
+ early_iounmap(data, sizeof(*data));
+ }
+ }
+
+ /*
+ * --------- Crashkernel reservation ------------------------------
+ */
+
+ #ifdef CONFIG_KEXEC
+
+ /**
+ * Reserve @size bytes of crashkernel memory at any suitable offset.
+ *
+ * @size: Size of the crashkernel memory to reserve.
+ * Returns the base address on success, and -1ULL on failure.
+ */
+ unsigned long long find_and_reserve_crashkernel(unsigned long long size)
+ {
+ const unsigned long long alignment = 16<<20; /* 16M */
+ unsigned long long start = 0LL;
+
+ while (1) {
+ int ret;
+
+ start = find_e820_area(start, ULONG_MAX, size, alignment);
+ if (start == -1ULL)
+ return start;
+
+ /* try to reserve it */
+ ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
+ if (ret >= 0)
+ return start;
- /* alloc_bootmem zeroes memory */
- cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
- for (i = 0; i < nr_cpu_ids; i++)
- cpu_set(i, cpumask_of_cpu_map[i]);
+ start += alignment;
+ }
+ }
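
find_and_reserve_crashkernel() keeps probing: look for a 16 MB-aligned hole of the requested size, try to reserve it, and if that fails advance past the candidate and search again. A userspace sketch of the same retry loop; region_is_free() is a toy stand-in that merges the "find" and "reserve" steps, not kernel API:

#include <stdio.h>

#define ALIGNMENT	(16ULL << 20)		/* 16M, as in the loop above */

/* Toy model: the first three aligned slots are already taken. */
static int region_is_free(unsigned long long start, unsigned long long size)
{
	(void)size;
	return start >= 3 * ALIGNMENT;
}

static unsigned long long find_and_reserve(unsigned long long size)
{
	unsigned long long start = 0;

	while (1) {
		/* round up to the next aligned candidate */
		start = (start + ALIGNMENT - 1) & ~(ALIGNMENT - 1);
		if (start + size < start)	/* wrapped: nothing found */
			return -1ULL;
		if (region_is_free(start, size))
			return start;		/* "reservation" succeeded */
		start += ALIGNMENT;		/* skip this slot, try the next */
	}
}

int main(void)
{
	unsigned long long base = find_and_reserve(64ULL << 20);

	printf("reserved 64M at %lluM\n", base >> 20);
	return 0;
}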
+
+ static inline unsigned long long get_total_mem(void)
+ {
+ unsigned long long total;
+
+ total = max_low_pfn - min_low_pfn;
+ #ifdef CONFIG_HIGHMEM
+ total += highend_pfn - highstart_pfn;
+ #endif
+
+ return total << PAGE_SHIFT;
+ }
+
+ static void __init reserve_crashkernel(void)
+ {
+ unsigned long long total_mem;
+ unsigned long long crash_size, crash_base;
+ int ret;
+
+ total_mem = get_total_mem();
+
+ ret = parse_crashkernel(boot_command_line, total_mem,
+ &crash_size, &crash_base);
+ if (ret != 0 || crash_size <= 0)
+ return;
+
+ /* 0 means: find the address automatically */
+ if (crash_base <= 0) {
+ crash_base = find_and_reserve_crashkernel(crash_size);
+ if (crash_base == -1ULL) {
+ pr_info("crashkernel reservation failed. "
+ "No suitable area found.\n");
+ return;
+ }
+ } else {
+ ret = reserve_bootmem_generic(crash_base, crash_size,
+ BOOTMEM_EXCLUSIVE);
+ if (ret < 0) {
+ pr_info("crashkernel reservation failed - "
+ "memory is in use\n");
+ return;
+ }
+ }
+
+ printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+ "for crashkernel (System RAM: %ldMB)\n",
+ (unsigned long)(crash_size >> 20),
+ (unsigned long)(crash_base >> 20),
+ (unsigned long)(total_mem >> 20));
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+ insert_resource(&iomem_resource, &crashk_res);
}
#else
- static inline void setup_cpumask_of_cpu(void) { }
+ static void __init reserve_crashkernel(void)
+ {
+ }
#endif
- #ifdef CONFIG_X86_32
- /*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
+ static struct resource standard_io_resources[] = {
+ { .name = "dma1", .start = 0x00, .end = 0x1f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic1", .start = 0x20, .end = 0x21,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer0", .start = 0x40, .end = 0x43,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer1", .start = 0x50, .end = 0x53,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x60, .end = 0x60,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x64, .end = 0x64,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma page reg", .start = 0x80, .end = 0x8f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic2", .start = 0xa0, .end = 0xa1,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma2", .start = 0xc0, .end = 0xdf,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "fpu", .start = 0xf0, .end = 0xff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO }
+ };
+
+ static void __init reserve_standard_io_resources(void)
+ {
+ int i;
+
+ /* request I/O space for devices used on all i[345]86 PCs */
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+ request_resource(&ioport_resource, &standard_io_resources[i]);
+
+ }
+
+ #ifdef CONFIG_PROC_VMCORE
+ /* elfcorehdr= specifies the location of the ELF core header
+ * stored by the crashed kernel. This option will be passed
+ * by the kexec loader to the capture kernel.
*/
- unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
- EXPORT_SYMBOL(__per_cpu_offset);
+ static int __init setup_elfcorehdr(char *arg)
+ {
+ char *end;
+ if (!arg)
+ return -EINVAL;
+ elfcorehdr_addr = memparse(arg, &end);
+ return end > arg ? 0 : -EINVAL;
+ }
+ early_param("elfcorehdr", setup_elfcorehdr);
#endif
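
setup_elfcorehdr() leans on memparse() to accept values such as "16384K" or "768M": parse the number, scale it by an optional K/M/G suffix, and report how far parsing got so the caller can distinguish an empty argument from a valid one. A standalone sketch of that kind of parser; parse_size() is a hypothetical analog, not the kernel's memparse():

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical analog of memparse(): number with optional K/M/G suffix. */
static unsigned long long parse_size(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'G': case 'g': v <<= 10;	/* fall through */
	case 'M': case 'm': v <<= 10;	/* fall through */
	case 'K': case 'k': v <<= 10;
		(*end)++;
		break;
	}
	return v;
}

int main(void)
{
	char arg[] = "768M";
	char *end;
	unsigned long long addr = parse_size(arg, &end);

	/* Same validity test as setup_elfcorehdr(): did we consume anything? */
	printf("%s -> %llu bytes (%s)\n", arg, addr,
	       end > arg ? "ok" : "invalid");
	return 0;
}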
+++ ++++++++++++++++static struct x86_quirks default_x86_quirks __initdata;
+++ ++++++++++++++++
+++ ++++++++++++++++struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
+++ ++++++++++++++++
+ /*
+ * Determine if we were loaded by an EFI loader. If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+ * for initialization. Note, the efi init code path is determined by the
+ * global efi_enabled. This allows the same kernel image to be used on existing
+ * systems (with a traditional BIOS) as well as on EFI systems.
+ */
/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
+ * setup_arch - architecture-specific boot-time initializations
+ *
+ * Note: On x86_64, fixmaps are ready for use even before this is called.
*/
- void __init setup_per_cpu_areas(void)
+
+ void __init setup_arch(char **cmdline_p)
{
- int i, highest_cpu = 0;
- unsigned long size;
+ #ifdef CONFIG_X86_32
+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+ + visws_early_detect();
+ pre_setup_arch_hook();
+ early_cpu_init();
+ #else
+ printk(KERN_INFO "Command line: %s\n", boot_command_line);
+ #endif
- #ifdef CONFIG_HOTPLUG_CPU
- prefill_possible_map();
+ early_ioremap_init();
+
+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+ screen_info = boot_params.screen_info;
+ edid_info = boot_params.edid_info;
+ #ifdef CONFIG_X86_32
+ apm_info.bios = boot_params.apm_bios_info;
+ ist_info = boot_params.ist_info;
+ if (boot_params.sys_desc_table.length != 0) {
+ set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
+ machine_id = boot_params.sys_desc_table.table[0];
+ machine_submodel_id = boot_params.sys_desc_table.table[1];
+ BIOS_revision = boot_params.sys_desc_table.table[2];
+ }
+ #endif
+ saved_video_mode = boot_params.hdr.vid_mode;
+ bootloader_type = boot_params.hdr.type_of_loader;
+
+ #ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+ rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
+ rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
+ #endif
+ #ifdef CONFIG_EFI
+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ #ifdef CONFIG_X86_32
+ "EL32",
+ #else
+ "EL64",
#endif
+ 4)) {
+ efi_enabled = 1;
+ efi_reserve_early();
+ }
+ #endif
+
+ ARCH_SETUP
+
+ setup_memory_map();
+ parse_setup_data();
+ /* update the e820_saved too */
+ e820_reserve_setup_data();
- /* Copy section for each CPU (we discard the original) */
- size = PERCPU_ENOUGH_ROOM;
- printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
- size);
+ copy_edd();
- for_each_possible_cpu(i) {
- char *ptr;
- #ifndef CONFIG_NEED_MULTIPLE_NODES
- ptr = alloc_bootmem_pages(size);
+ if (!boot_params.hdr.root_flags)
+ root_mountflags &= ~MS_RDONLY;
+ init_mm.start_code = (unsigned long) _text;
+ init_mm.end_code = (unsigned long) _etext;
+ init_mm.end_data = (unsigned long) _edata;
+ #ifdef CONFIG_X86_32
+ init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
#else
- int node = early_cpu_to_node(i);
- if (!node_online(node) || !NODE_DATA(node)) {
- ptr = alloc_bootmem_pages(size);
- printk(KERN_INFO
- "cpu %d has no node or node-local memory\n", i);
- }
- else
- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+ init_mm.brk = (unsigned long) &_end;
#endif
- if (!ptr)
- panic("Cannot allocate cpu data for CPU %d\n", i);
+
+ code_resource.start = virt_to_phys(_text);
+ code_resource.end = virt_to_phys(_etext)-1;
+ data_resource.start = virt_to_phys(_etext);
+ data_resource.end = virt_to_phys(_edata)-1;
+ bss_resource.start = virt_to_phys(&__bss_start);
+ bss_resource.end = virt_to_phys(&__bss_stop)-1;
+
#ifdef CONFIG_X86_64
- cpu_pda(i)->data_offset = ptr - __per_cpu_start;
+ early_cpu_init();
+ #endif
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ parse_early_param();
+
+ /* after early param, so could get panic from serial */
+ reserve_early_setup_data();
+
+ if (acpi_mps_check()) {
+ #ifdef CONFIG_X86_LOCAL_APIC
+ disable_apic = 1;
+ #endif
+ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+ }
+
+++ + ++ + #ifdef CONFIG_PCI
+++ + ++ + if (pci_early_dump_regs)
+++ + ++ + early_dump_pci_devices();
+++ + ++ + #endif
+++ + ++ +
+ finish_e820_parsing();
+
+ #ifdef CONFIG_X86_32
+ probe_roms();
+ #endif
+
+ /* after parse_early_param, so could debug it */
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
+
+ if (efi_enabled)
+ efi_init();
+
+ #ifdef CONFIG_X86_32
+ if (ppro_with_ram_bug()) {
+ e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
+ E820_RESERVED);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ printk(KERN_INFO "fixed physical RAM map:\n");
+ e820_print_map("bad_ppro");
+ }
+ #else
+ early_gart_iommu_check();
+ #endif
+
+ /*
+ * partially used pages are not usable - thus
+ * we are rounding upwards:
+ */
+ max_pfn = e820_end_of_ram_pfn();
+
+ /* preallocate 4k for mptable mpc */
+ early_reserve_e820_mpc_new();
+ /* update e820 for memory not covered by WB MTRRs */
+ mtrr_bp_init();
+ if (mtrr_trim_uncached_memory(max_pfn))
+ max_pfn = e820_end_of_ram_pfn();
+
+ #ifdef CONFIG_X86_32
+ /* max_low_pfn gets updated here */
+ find_low_pfn_range();
#else
- __per_cpu_offset[i] = ptr - __per_cpu_start;
+ num_physpages = max_pfn;
+
+ check_efer();
+
+ /* How many end-of-memory variables you have, grandma! */
+ /* need this before calling reserve_initrd */
+ if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
+ max_low_pfn = e820_end_of_low_ram_pfn();
+ else
+ max_low_pfn = max_pfn;
+
+ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
#endif
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
- highest_cpu = i;
+ /* max_pfn_mapped is updated here */
+ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
+ max_pfn_mapped = max_low_pfn_mapped;
+
+ #ifdef CONFIG_X86_64
+ if (max_pfn > max_low_pfn) {
+ max_pfn_mapped = init_memory_mapping(1UL<<32,
+ max_pfn<<PAGE_SHIFT);
+ /* can we preserve max_low_pfn? */
+ max_low_pfn = max_pfn;
}
+ #endif
- nr_cpu_ids = highest_cpu + 1;
- printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
+ /*
+ * NOTE: On x86-32, fixmaps are ready for use only from this point on.
+ */
- /* Setup percpu data maps */
- setup_per_cpu_maps();
+ #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ if (init_ohci1394_dma_early)
+ init_ohci1394_dma_on_all_controllers();
+ #endif
- /* Setup cpumask_of_cpu map */
- setup_cpumask_of_cpu();
- }
+ reserve_initrd();
+
+ #ifdef CONFIG_X86_64
+ vsmp_init();
+ #endif
+
+ dmi_scan_machine();
+
+ io_delay_init();
+
+ /*
+ * Parse the ACPI tables for possible boot-time SMP configuration.
+ */
+ acpi_boot_table_init();
+
+ #ifdef CONFIG_ACPI_NUMA
+ /*
+ * Parse SRAT to discover nodes.
+ */
+ acpi_numa_init();
+ #endif
+
+ initmem_init(0, max_pfn);
+
+ #ifdef CONFIG_X86_64
+ dma32_reserve_bootmem();
+ #endif
+ #ifdef CONFIG_ACPI_SLEEP
+ /*
+ * Reserve low memory region for sleep support.
+ */
+ acpi_reserve_bootmem();
#endif
--- --- -- ----- ---#ifdef CONFIG_X86_NUMAQ
--- --- -- ----- --- /*
--- --- -- ----- --- * need to check online nodes num, call it
--- --- -- ----- --- * here before time_init/tsc_init
--- --- -- ----- --- */
--- --- -- ----- --- numaq_tsc_disable();
--- --- -- ----- ---#endif
--- --- -- ----- ---
+ #ifdef CONFIG_X86_FIND_SMP_CONFIG
+ /*
+ * Find and reserve possible boot-time SMP configuration:
+ */
+ find_smp_config();
+ #endif
+ reserve_crashkernel();
+
+ reserve_ibft_region();
+
+ #ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+ #endif
+
+ #if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
+ /*
+ * Must be after max_low_pfn is determined, and before kernel
+ * pagetables are set up.
+ */
+ vmi_init();
+ #endif
+
+++++++++++++++++++ paravirt_pagetable_setup_start(swapper_pg_dir);
+ paging_init();
+++++++++++++++++++ paravirt_pagetable_setup_done(swapper_pg_dir);
+++++++++++++++++++ paravirt_post_allocator_init();
+
+ #ifdef CONFIG_X86_64
+ map_vsyscall();
+ #endif
+
+ #ifdef CONFIG_X86_GENERICARCH
+ generic_apic_probe();
+ #endif
+
+ early_quirks();
+
+ /*
+ * Read APIC and some other early information from ACPI tables.
+ */
+ acpi_boot_init();
+
+ #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
+ /*
+ * get boot-time SMP configuration:
+ */
+ if (smp_found_config)
+ get_smp_config();
+ #endif
+
+ prefill_possible_map();
+ #ifdef CONFIG_X86_64
+ init_cpu_to_node();
+ #endif
+
+ init_apic_mappings();
+ ioapic_init_mappings();
+
+ #if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
+ if (def_to_bigsmp)
+ printk(KERN_WARNING "More than 8 CPUs detected and "
+ "CONFIG_X86_PC cannot handle it.\nUse "
+ "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
+ #endif
+ kvm_guest_init();
+
+ e820_reserve_resources();
+ e820_mark_nosave_regions(max_low_pfn);
+
+ #ifdef CONFIG_X86_32
+ request_resource(&iomem_resource, &video_ram_resource);
+ #endif
+ reserve_standard_io_resources();
+
+ e820_setup_gap();
+
+ #ifdef CONFIG_VT
+ #if defined(CONFIG_VGA_CONSOLE)
+ if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ conswitchp = &vga_con;
+ #elif defined(CONFIG_DUMMY_CONSOLE)
+ conswitchp = &dummy_con;
+ #endif
+ #endif
+ }
* lock helps us to not include this cpu in a currently in progress
* smp_call_function().
*/
-- - - - lock_ipi_call_lock();
- #ifdef CONFIG_X86_64
- spin_lock(&vector_lock);
-
- /* Setup the per cpu irq handling data structures */
- __setup_vector_irq(smp_processor_id());
- /*
- * Allow the master to continue.
- */
- spin_unlock(&vector_lock);
++ + + + ipi_call_lock_irq();
+ #ifdef CONFIG_X86_IO_APIC
+ setup_vector_irq(smp_processor_id());
#endif
cpu_set(smp_processor_id(), cpu_online_map);
-- - - - unlock_ipi_call_lock();
++ + + + ipi_call_unlock_irq();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
setup_secondary_clock();
complete(&c_idle->done);
}
---------- -------- static int __cpuinit get_local_pda(int cpu)
+ #ifdef CONFIG_X86_64
+ /*
+ * Allocate node local memory for the AP pda.
+ *
+ * Must be called after the _cpu_pda pointer table is initialized.
+ */
+++++++++++++++++++ int __cpuinit get_local_pda(int cpu)
+ {
+ struct x8664_pda *oldpda, *newpda;
+ unsigned long size = sizeof(struct x8664_pda);
+ int node = cpu_to_node(cpu);
+
+ if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
+ return 0;
+
+ oldpda = cpu_pda(cpu);
+ newpda = kmalloc_node(size, GFP_ATOMIC, node);
+ if (!newpda) {
+ printk(KERN_ERR "Could not allocate node local PDA "
+ "for CPU %d on node %d\n", cpu, node);
+
+ if (oldpda)
+ return 0; /* have a usable pda */
+ else
+ return -1;
+ }
+
+ if (oldpda) {
+ memcpy(newpda, oldpda, size);
+ if (!after_bootmem)
+ free_bootmem((unsigned long)oldpda, size);
+ }
+
+ newpda->in_bootmem = 0;
+ cpu_pda(cpu) = newpda;
+ return 0;
+ }
+ #endif /* CONFIG_X86_64 */
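
get_local_pda() follows a best-effort upgrade pattern: try to allocate a node-local replacement, copy the old contents across if there were any, and on allocation failure keep whatever was already there instead of failing the CPU bring-up. A userspace sketch of that pattern, with malloc standing in for kmalloc_node and an illustrative struct in place of the PDA:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct per_cpu_blob {		/* illustrative stand-in for the PDA */
	int cpu;
	char scratch[64];
};

static struct per_cpu_blob *blob[4];	/* one slot per "CPU" */

/* Try to replace blob[cpu] with a freshly allocated copy; keep the old
 * one on failure, mirroring the fallback logic in get_local_pda(). */
static int upgrade_blob(int cpu)
{
	struct per_cpu_blob *oldp = blob[cpu];
	struct per_cpu_blob *newp = malloc(sizeof(*newp));

	if (!newp) {
		fprintf(stderr, "no memory for cpu %d\n", cpu);
		return oldp ? 0 : -1;	/* usable old copy? then still fine */
	}
	if (oldp) {
		memcpy(newp, oldp, sizeof(*newp));
		free(oldp);		/* analog of freeing the bootmem copy */
	} else {
		memset(newp, 0, sizeof(*newp));
		newp->cpu = cpu;
	}
	blob[cpu] = newp;
	return 0;
}

int main(void)
{
	if (upgrade_blob(1) == 0)
		printf("cpu %d blob at %p\n", blob[1]->cpu, (void *)blob[1]);
	free(blob[1]);
	return 0;
}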
+
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
cpu_clear(cpu, cpu_callout_map);
cpu_clear(cpu, cpu_callin_map);
/* was set by cpu_init() */
------------------- clear_bit(cpu, (unsigned long *)&cpu_initialized);
- clear_node_cpumask(cpu);
- #endif
+++++++++++++++++++ cpu_clear(cpu, cpu_initialized);
+ numa_remove_cpu(cpu);
}
int __cpu_disable(void)
__flush_tlb_all();
+ if (!after_init_bootmem)
+ reserve_early(table_start << PAGE_SHIFT,
+ table_end << PAGE_SHIFT, "PGTABLE");
+
+++++++++ ++++++++++ if (!after_init_bootmem)
+++++++++ ++++++++++ early_memtest(start, end);
+++++++++ ++++++++++
+ return end >> PAGE_SHIFT;
+ }
+
+
+ /*
+ * paging_init() sets up the page tables - note that the first 8MB are
+ * already mapped by head.S.
+ *
+ * This routine also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ */
+ void __init paging_init(void)
+ {
+ pagetable_init();
+
+ __flush_tlb_all();
+
kmap_init();
---------- --------
---------- -------- paravirt_post_allocator_init();
+
+ /*
+ * NOTE: at this point the bootmem allocator is fully available.
+ */
+ sparse_init();
+ zone_sizes_init();
}
/*
if (retval < 0)
return 0;
- - if (((pfn <= max_low_pfn_mapped) ||
- - (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn <= max_pfn_mapped)) &&
- if (pfn <= max_pfn_mapped &&
- ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
+ + + if (((pfn < max_low_pfn_mapped) ||
+ + + (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
+ ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
free_memtype(offset, offset + size);
printk(KERN_INFO
"%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
free_memtype(addr, addr + size);
}
++++++++++ + +++++++
++++++++++++ +++++++#if defined(CONFIG_DEBUG_FS)
++++++++++++ +++++++
++++++++++++ +++++++/* get Nth element of the linked list */
++++++++++++ +++++++static struct memtype *memtype_get_idx(loff_t pos)
++++++++++++ +++++++{
++++++++++++ +++++++ struct memtype *list_node, *print_entry;
++++++++++++ +++++++ int i = 1;
++++++++++++ +++++++
++++++++++++ +++++++ print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
++++++++++++ +++++++ if (!print_entry)
++++++++++++ +++++++ return NULL;
++++++++++++ +++++++
++++++++++++ +++++++ spin_lock(&memtype_lock);
++++++++++++ +++++++ list_for_each_entry(list_node, &memtype_list, nd) {
++++++++++++ +++++++ if (pos == i) {
++++++++++++ +++++++ *print_entry = *list_node;
++++++++++++ +++++++ spin_unlock(&memtype_lock);
++++++++++++ +++++++ return print_entry;
++++++++++++ +++++++ }
++++++++++++ +++++++ ++i;
++++++++++++ +++++++ }
++++++++++++ +++++++ spin_unlock(&memtype_lock);
++++++++++++ +++++++ kfree(print_entry);
++++++++++++ +++++++ return NULL;
++++++++++++ +++++++}
++++++++++++ +++++++
++++++++++++ +++++++static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
++++++++++++ +++++++{
++++++++++++ +++++++ if (*pos == 0) {
++++++++++++ +++++++ ++*pos;
++++++++++++ +++++++ seq_printf(seq, "PAT memtype list:\n");
++++++++++++ +++++++ }
++++++++++++ +++++++
++++++++++++ +++++++ return memtype_get_idx(*pos);
++++++++++++ +++++++}
++++++++++++ +++++++
++++++++++++ +++++++static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++++++++++++ +++++++{
++++++++++++ +++++++ ++*pos;
++++++++++++ +++++++ return memtype_get_idx(*pos);
++++++++++++ +++++++}
++++++++++++ +++++++
++++++++++++ +++++++static void memtype_seq_stop(struct seq_file *seq, void *v)
++++++++++++ +++++++{
++++++++++++ +++++++}
++++++++++++ +++++++
++++++++++++ +++++++static int memtype_seq_show(struct seq_file *seq, void *v)
++++++++++++ +++++++{
++++++++++++ +++++++ struct memtype *print_entry = (struct memtype *)v;
++++++++++++ +++++++
++++++++++++ +++++++ seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
++++++++++++ +++++++ print_entry->start, print_entry->end);
++++++++++++ +++++++ kfree(print_entry);
++++++++++++ +++++++ return 0;
++++++++++++ +++++++}
++++++++++++ +++++++
++++++++++++ +++++++static struct seq_operations memtype_seq_ops = {
++++++++++++ +++++++ .start = memtype_seq_start,
++++++++++++ +++++++ .next = memtype_seq_next,
++++++++++++ +++++++ .stop = memtype_seq_stop,
++++++++++++ +++++++ .show = memtype_seq_show,
++++++++++++ +++++++};
++++++++++++ +++++++
++++++++++++ +++++++static int memtype_seq_open(struct inode *inode, struct file *file)
++++++++++++ +++++++{
++++++++++++ +++++++ return seq_open(file, &memtype_seq_ops);
++++++++++++ +++++++}
++++++++++++ +++++++
++++++++++++ +++++++static const struct file_operations memtype_fops = {
++++++++++++ +++++++ .open = memtype_seq_open,
++++++++++++ +++++++ .read = seq_read,
++++++++++++ +++++++ .llseek = seq_lseek,
++++++++++++ +++++++ .release = seq_release,
++++++++++++ +++++++};
++++++++++++ +++++++
++++++++++++ +++++++static int __init pat_memtype_list_init(void)
++++++++++++ +++++++{
++++++++++++ +++++++ debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
++++++++++++ +++++++ NULL, &memtype_fops);
++++++++++++ +++++++ return 0;
++++++++++++ +++++++}
++++++++++++ +++++++
++++++++++++ +++++++late_initcall(pat_memtype_list_init);
++++++++++++ +++++++
++++++++++++ +++++++#endif /* CONFIG_DEBUG_FS */
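
The debugfs dump above uses the seq_file protocol: start() positions at *pos, next() advances it, show() prints one record, and stop() cleans up, with the core calling them in that order until start()/next() return NULL. A compact userspace analog that drives the same four callbacks over a fixed table; the driver loop below imitates, rather than reuses, the kernel's seq_file core:

#include <stdio.h>

static const char *table[] = { "write-back", "write-combining", "uncached" };
#define NENTRIES ((long long)(sizeof(table) / sizeof(table[0])))

/* The four seq_file-style callbacks, simplified to index-based cursors. */
static void *demo_start(long long *pos)
{
	if (*pos == 0)
		printf("PAT memtype list (demo):\n");	/* header, like the real one */
	return (*pos < NENTRIES) ? (void *)&table[*pos] : NULL;
}

static void *demo_next(void *v, long long *pos)
{
	(void)v;
	++*pos;
	return (*pos < NENTRIES) ? (void *)&table[*pos] : NULL;
}

static void demo_stop(void *v) { (void)v; }

static int demo_show(void *v)
{
	printf("  %s\n", *(const char **)v);
	return 0;
}

int main(void)
{
	long long pos = 0;
	void *v = demo_start(&pos);

	/* This loop plays the role of the seq_file core. */
	while (v) {
		demo_show(v);
		v = demo_next(v, &pos);
	}
	demo_stop(v);
	return 0;
}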
#define PCI_CAN_SKIP_ISA_ALIGN 0x8000
#define PCI_USE__CRS 0x10000
#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000
+ #define PCI_HAS_IO_ECS 0x40000
+++ + ++ + #define PCI_NOASSIGN_ROMS 0x80000
extern unsigned int pci_probe;
extern unsigned long pirq_table_addr;
extern void pci_direct_init(int type);
extern void pci_pcbios_init(void);
extern int pci_olpc_init(void);
------- -- ----- ---extern int __init pci_numa_init(void);
+ extern void __init dmi_check_pciprobe(void);
+ extern void __init dmi_check_skip_isa_align(void);
+
+ /* some common used subsys_initcalls */
+ extern int __init pci_acpi_init(void);
+ extern int __init pcibios_irq_init(void);
++++++++++++++++ +++extern int __init pci_visws_init(void);
++++++++++++++++ +++extern int __init pci_numaq_init(void);
+ extern int __init pcibios_init(void);
/* pci-mmconfig.c */
}
}
+ /*
+ * On restore, set the vcpu placement up again.
+ * If it fails, then we're in a bad state, since
+ * we can't back out from using it...
+ */
+ void xen_vcpu_restore(void)
+ {
+ if (have_vcpu_info_placement) {
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ bool other_cpu = (cpu != smp_processor_id());
+
+ if (other_cpu &&
+ HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
+ BUG();
+
+ xen_vcpu_setup(cpu);
+
+ if (other_cpu &&
+ HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
+ BUG();
+ }
+
+ BUG_ON(!have_vcpu_info_placement);
+ }
+ }
+
static void __init xen_banner(void)
{
++++++++++++++++++ + unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
++++++++++++++++++ + struct xen_extraversion extra;
++++++++++++++++++ + HYPERVISOR_xen_version(XENVER_extraversion, &extra);
++++++++++++++++++ +
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
pv_info.name);
---------- ------- - printk(KERN_INFO "Hypervisor signature: %s%s\n",
---------- ------- - xen_start_info->magic,
- printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
++++++++++++++++++ + printk(KERN_INFO "Xen version: %d.%d%s%s\n",
++++++++++++++++++ + version >> 16, version & 0xffff, extra.extraversion,
+ xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
}
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
static __init void xen_pagetable_setup_start(pgd_t *base)
{
------------------- pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
------------------- int i;
-------------------
------------------- /* special set_pte for pagetable initialization */
------------------- pv_mmu_ops.set_pte = xen_set_pte_init;
-------------------
------------------- init_mm.pgd = base;
------------------- /*
------------------- * copy top-level of Xen-supplied pagetable into place. This
------------------- * is a stand-in while we copy the pmd pages.
------------------- */
------------------- memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
-------------------
------------------- /*
------------------- * For PAE, need to allocate new pmds, rather than
------------------- * share Xen's, since Xen doesn't like pmd's being
------------------- * shared between address spaces.
------------------- */
------------------- for (i = 0; i < PTRS_PER_PGD; i++) {
------------------- if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
------------------- pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
-------------------
------------------- memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
------------------- PAGE_SIZE);
-------------------
------------------- make_lowmem_page_readonly(pmd);
-------------------
------------------- set_pgd(&base[i], __pgd(1 + __pa(pmd)));
------------------- } else
------------------- pgd_clear(&base[i]);
------------------- }
-------------------
------------------- /* make sure zero_page is mapped RO so we can use it in pagetables */
------------------- make_lowmem_page_readonly(empty_zero_page);
------------------- make_lowmem_page_readonly(base);
------------------- /*
------------------- * Switch to new pagetable. This is done before
------------------- * pagetable_init has done anything so that the new pages
------------------- * added to the table can be prepared properly for Xen.
------------------- */
------------------- xen_write_cr3(__pa(base));
-------------------
------------------- /* Unpin initial Xen pagetable */
------------------- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
------------------- PFN_DOWN(__pa(xen_start_info->pt_base)));
}
- static __init void setup_shared_info(void)
+ void xen_setup_shared_info(void)
{
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
------------------- unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP);
-------------------
------------------- /*
------------------- * Create a mapping for the shared info page.
------------------- * Should be set_fixmap(), but shared_info is a machine
------------------- * address with no corresponding pseudo-phys address.
------------------- */
------------------- set_pte_mfn(addr,
------------------- PFN_DOWN(xen_start_info->shared_info),
------------------- PAGE_KERNEL);
-------------------
------------------- HYPERVISOR_shared_info = (struct shared_info *)addr;
+++++++++++++++++++ set_fixmap(FIX_PARAVIRT_BOOTMAP,
+++++++++++++++++++ xen_start_info->shared_info);
+++++++++++++++++++
+++++++++++++++++++ HYPERVISOR_shared_info =
+++++++++++++++++++ (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
} else
HYPERVISOR_shared_info =
(struct shared_info *)__va(xen_start_info->shared_info);
static __init void xen_pagetable_setup_done(pgd_t *base)
{
---------- -------- /* This will work as long as patching hasn't happened yet
---------- -------- (which it hasn't) */
---------- -------- pv_mmu_ops.alloc_pte = xen_alloc_pte;
---------- -------- pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
---------- -------- pv_mmu_ops.release_pte = xen_release_pte;
---------- -------- pv_mmu_ops.release_pmd = xen_release_pmd;
---------- -------- pv_mmu_ops.set_pte = xen_set_pte;
---------- --------
+ xen_setup_shared_info();
---------- --------
---------- -------- /* Actually pin the pagetable down, but we can't set PG_pinned
---------- -------- yet because the page structures don't exist yet. */
---------- -------- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
+ }
+
+ static __init void xen_post_allocator_init(void)
+ {
+++++++++++++++++++ pv_mmu_ops.set_pte = xen_set_pte;
+ pv_mmu_ops.set_pmd = xen_set_pmd;
+ pv_mmu_ops.set_pud = xen_set_pud;
+++++++++++++++++++ #if PAGETABLE_LEVELS == 4
+++++++++++++++++++ pv_mmu_ops.set_pgd = xen_set_pgd;
+++++++++++++++++++ #endif
+++++++++++++++++++
++++++++++ ++++++++ /* This will work as long as patching hasn't happened yet
++++++++++ ++++++++ (which it hasn't) */
++++++++++ ++++++++ pv_mmu_ops.alloc_pte = xen_alloc_pte;
++++++++++ ++++++++ pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
++++++++++ ++++++++ pv_mmu_ops.release_pte = xen_release_pte;
++++++++++ ++++++++ pv_mmu_ops.release_pmd = xen_release_pmd;
- pv_mmu_ops.set_pte = xen_set_pte;
-
- setup_shared_info();
+++++++++++++++++++ #if PAGETABLE_LEVELS == 4
+++++++++++++++++++ pv_mmu_ops.alloc_pud = xen_alloc_pud;
+++++++++++++++++++ pv_mmu_ops.release_pud = xen_release_pud;
+++++++++++++++++++ #endif
- /* Actually pin the pagetable down, but we can't set PG_pinned
- yet because the page structures don't exist yet. */
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
+++++++++++++++++++ #ifdef CONFIG_X86_64
+++++++++++++++++++ SetPagePinned(virt_to_page(level3_user_vsyscall));
+++++++++++++++++++ #endif
+ xen_mark_init_mm_pinned();
}
/* This is called once we have the cpu_possible_map */
return ret;
}
+ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
+ {
+ pte_t pte;
+
+ phys >>= PAGE_SHIFT;
+
+ switch (idx) {
+ case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
+ #ifdef CONFIG_X86_F00F_BUG
+ case FIX_F00F_IDT:
+ #endif
+++++++++++++++++++ #ifdef CONFIG_X86_32
+ case FIX_WP_TEST:
+ case FIX_VDSO:
+++++++++++++++++++ # ifdef CONFIG_HIGHMEM
+++++++++++++++++++ case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+++++++++++++++++++ # endif
+++++++++++++++++++ #else
+++++++++++++++++++ case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+++++++++++++++++++ #endif
+ #ifdef CONFIG_X86_LOCAL_APIC
+ case FIX_APIC_BASE: /* maps dummy local APIC */
+ #endif
+ pte = pfn_pte(phys, prot);
+ break;
+
+ default:
+ pte = mfn_pte(phys, prot);
+ break;
+ }
+
+ __native_set_fixmap(idx, pte);
+++++++++++++++++++
+++++++++++++++++++ #ifdef CONFIG_X86_64
+++++++++++++++++++ /* Replicate changes to map the vsyscall page into the user
+++++++++++++++++++ pagetable vsyscall mapping. */
+++++++++++++++++++ if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+++++++++++++++++++ unsigned long vaddr = __fix_to_virt(idx);
+++++++++++++++++++ set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+++++++++++++++++++ }
+++++++++++++++++++ #endif
+ }
+
static const struct pv_info xen_info __initdata = {
.paravirt_enabled = 1,
.shared_kernel_pmd = 0,
.read_pmc = native_read_pmc,
.iret = xen_iret,
- .irq_enable_syscall_ret = xen_sysexit,
+ .irq_enable_sysexit = xen_sysexit,
+++++++++++++++++++ #ifdef CONFIG_X86_64
+++++++++++++++++++ .usergs_sysret32 = xen_sysret32,
+++++++++++++++++++ .usergs_sysret64 = xen_sysret64,
+++++++++++++++++++ #endif
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
.irq_enable = xen_irq_enable,
.safe_halt = xen_safe_halt,
.halt = xen_halt,
---------- -------- .adjust_exception_frame = paravirt_nop,
+ #ifdef CONFIG_X86_64
+++++++++++++++++++ .adjust_exception_frame = xen_adjust_exception_frame,
+ #endif
};
static const struct pv_apic_ops xen_apic_ops __initdata = {
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
---------- -------- .pgd_alloc = __paravirt_pgd_alloc,
---------- -------- .pgd_free = paravirt_nop,
+++++++++++++++++++ .pgd_alloc = xen_pgd_alloc,
+++++++++++++++++++ .pgd_free = xen_pgd_free,
+
.alloc_pte = xen_alloc_pte_init,
.release_pte = xen_release_pte_init,
.alloc_pmd = xen_alloc_pte_init,
.kmap_atomic_pte = xen_kmap_atomic_pte,
#endif
------------------- .set_pte = NULL, /* see xen_pagetable_setup_* */
+++++++++++++++++++ #ifdef CONFIG_X86_64
+++++++++++++++++++ .set_pte = xen_set_pte,
+++++++++++++++++++ #else
+++++++++++++++++++ .set_pte = xen_set_pte_init,
+++++++++++++++++++ #endif
.set_pte_at = xen_set_pte_at,
- .set_pmd = xen_set_pmd,
+ .set_pmd = xen_set_pmd_hyper,
+
+ .ptep_modify_prot_start = __ptep_modify_prot_start,
+ .ptep_modify_prot_commit = __ptep_modify_prot_commit,
.pte_val = xen_pte_val,
+ .pte_flags = native_pte_val,
.pgd_val = xen_pgd_val,
.make_pte = xen_make_pte,
.make_pgd = xen_make_pgd,
+++++++++++++++++++ #ifdef CONFIG_X86_PAE
.set_pte_atomic = xen_set_pte_atomic,
.set_pte_present = xen_set_pte_at,
---------- -------- .set_pud = xen_set_pud_hyper,
- .set_pud = xen_set_pud,
.pte_clear = xen_pte_clear,
.pmd_clear = xen_pmd_clear,
+++++++++++++++++++ #endif /* CONFIG_X86_PAE */
+++++++++++++++++++ .set_pud = xen_set_pud_hyper,
.make_pmd = xen_make_pmd,
.pmd_val = xen_pmd_val,
.enter = paravirt_enter_lazy_mmu,
.leave = xen_leave_lazy,
},
- };
- #ifdef CONFIG_SMP
- static const struct smp_ops xen_smp_ops __initdata = {
- .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
- .smp_prepare_cpus = xen_smp_prepare_cpus,
- .cpu_up = xen_cpu_up,
- .smp_cpus_done = xen_smp_cpus_done,
-
- .smp_send_stop = xen_smp_send_stop,
- .smp_send_reschedule = xen_smp_send_reschedule,
- .smp_call_function_mask = xen_smp_call_function_mask,
+ .set_fixmap = xen_set_fixmap,
};
- #endif /* CONFIG_SMP */
---------- -------- #ifdef CONFIG_SMP
---------- -------- static const struct smp_ops xen_smp_ops __initdata = {
---------- -------- .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
---------- -------- .smp_prepare_cpus = xen_smp_prepare_cpus,
---------- -------- .cpu_up = xen_cpu_up,
---------- -------- .smp_cpus_done = xen_smp_cpus_done,
---------- --------
---------- -------- .smp_send_stop = xen_smp_send_stop,
---------- -------- .smp_send_reschedule = xen_smp_send_reschedule,
------ -- - --- --
------ -- - --- -- .send_call_func_ipi = xen_smp_send_call_function_ipi,
------ -- - --- -- .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
-- - - .smp_call_function_mask = xen_smp_call_function_mask,
---------- -------- };
---------- -------- #endif /* CONFIG_SMP */
---------- --------
static void xen_reboot(int reason)
{
+ struct sched_shutdown r = { .reason = reason };
+
#ifdef CONFIG_SMP
smp_send_stop();
#endif
top = pp.virt_start;
reserve_top_address(-top + 2 * PAGE_SIZE);
+++++++++++++++++++ #endif /* CONFIG_X86_32 */
+++++++++++++++++++ }
+++++++++++++++++++
+++++++++++++++++++ /*
+++++++++++++++++++ * Like __va(), but returns the address in the kernel mapping (which is
+++++++++++++++++++ * all we have until the physical memory mapping has been set up).
+++++++++++++++++++ */
+++++++++++++++++++ static void *__ka(phys_addr_t paddr)
+++++++++++++++++++ {
+++++++++++++++++++ #ifdef CONFIG_X86_64
+++++++++++++++++++ return (void *)(paddr + __START_KERNEL_map);
+++++++++++++++++++ #else
+++++++++++++++++++ return __va(paddr);
+++++++++++++++++++ #endif
++ + + + + }
++ + + + +
+++++++++++++++++++ /* Convert a machine address to physical address */
+++++++++++++++++++ static unsigned long m2p(phys_addr_t maddr)
+++++++++++++++++++ {
+++++++++++++++++++ phys_addr_t paddr;
+++++++++++++++++++
+++++++++++++++++++ maddr &= PTE_MASK;
+++++++++++++++++++ paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+++++++++++++++++++
+++++++++++++++++++ return paddr;
++++++++++ ++++++++ }
++++++++++ ++++++++
+++++++++++++++++++ /* Convert a machine address to kernel virtual */
+++++++++++++++++++ static void *m2v(phys_addr_t maddr)
+++++++++++++++++++ {
+++++++++++++++++++ return __ka(m2p(maddr));
+++++++++++++++++++ }
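
m2p() treats its argument as a pagetable entry: mask off the low flag bits, translate the machine frame number through the machine-to-physical table, and rebuild a page-aligned physical address; m2v() then turns that into a kernel virtual address. The same arithmetic in a standalone sketch, with a tiny lookup table standing in for mfn_to_pfn() (the table and entry value are demo assumptions):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Toy machine-frame -> physical-frame table (assumption for the demo). */
static const uint64_t mfn_to_pfn_demo[] = { 7, 3, 9, 1 };

static uint64_t m2p_demo(uint64_t entry)
{
	/* Drop the low flag bits, keep only the machine frame number. */
	uint64_t mfn = (entry & PAGE_MASK) >> PAGE_SHIFT;

	/* Translate the frame, rebuild a page-aligned physical address. */
	return mfn_to_pfn_demo[mfn] << PAGE_SHIFT;
}

int main(void)
{
	uint64_t entry = (2UL << PAGE_SHIFT) | 0x63;	/* mfn 2 plus flag bits */

	printf("entry %#llx -> physical %#llx\n",
	       (unsigned long long)entry,
	       (unsigned long long)m2p_demo(entry));
	return 0;
}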
+++++++++++++++++++
+++++++++++++++++++ #ifdef CONFIG_X86_64
+++++++++++++++++++ static void walk(pgd_t *pgd, unsigned long addr)
+++++++++++++++++++ {
+++++++++++++++++++ unsigned l4idx = pgd_index(addr);
+++++++++++++++++++ unsigned l3idx = pud_index(addr);
+++++++++++++++++++ unsigned l2idx = pmd_index(addr);
+++++++++++++++++++ unsigned l1idx = pte_index(addr);
+++++++++++++++++++ pgd_t l4;
+++++++++++++++++++ pud_t l3;
+++++++++++++++++++ pmd_t l2;
+++++++++++++++++++ pte_t l1;
+++++++++++++++++++
+++++++++++++++++++ xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
+++++++++++++++++++ pgd, addr, l4idx, l3idx, l2idx, l1idx);
+++++++++++++++++++
+++++++++++++++++++ l4 = pgd[l4idx];
+++++++++++++++++++ xen_raw_printk(" l4: %016lx\n", l4.pgd);
+++++++++++++++++++ xen_raw_printk(" %016lx\n", pgd_val(l4));
+++++++++++++++++++
+++++++++++++++++++ l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
+++++++++++++++++++ xen_raw_printk(" l3: %016lx\n", l3.pud);
+++++++++++++++++++ xen_raw_printk(" %016lx\n", pud_val(l3));
+++++++++++++++++++
+++++++++++++++++++ l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
+++++++++++++++++++ xen_raw_printk(" l2: %016lx\n", l2.pmd);
+++++++++++++++++++ xen_raw_printk(" %016lx\n", pmd_val(l2));
+++++++++++++++++++
+++++++++++++++++++ l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
+++++++++++++++++++ xen_raw_printk(" l1: %016lx\n", l1.pte);
+++++++++++++++++++ xen_raw_printk(" %016lx\n", pte_val(l1));
+++++++++++++++++++ }
+++++++++++++++++++ #endif
+++++++++++++++++++
+++++++++++++++++++ static void set_page_prot(void *addr, pgprot_t prot)
+++++++++++++++++++ {
+++++++++++++++++++ unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+++++++++++++++++++ pte_t pte = pfn_pte(pfn, prot);
+++++++++++++++++++
+++++++++++++++++++ xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
+++++++++++++++++++ addr, pfn, get_phys_to_machine(pfn),
+++++++++++++++++++ pgprot_val(prot), pte.pte);
+++++++++++++++++++
+++++++++++++++++++ if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
+++++++++++++++++++ BUG();
+++++++++++++++++++ }
+++++++++++++++++++
+++++++++++++++++++ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+++++++++++++++++++ {
+++++++++++++++++++ unsigned pmdidx, pteidx;
+++++++++++++++++++ unsigned ident_pte;
+++++++++++++++++++ unsigned long pfn;
+++++++++++++++++++
+++++++++++++++++++ ident_pte = 0;
+++++++++++++++++++ pfn = 0;
+++++++++++++++++++ for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+++++++++++++++++++ pte_t *pte_page;
+++++++++++++++++++
+++++++++++++++++++ /* Reuse or allocate a page of ptes */
+++++++++++++++++++ if (pmd_present(pmd[pmdidx]))
+++++++++++++++++++ pte_page = m2v(pmd[pmdidx].pmd);
+++++++++++++++++++ else {
+++++++++++++++++++ /* Check for free pte pages */
+++++++++++++++++++ if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+++++++++++++++++++ break;
+++++++++++++++++++
+++++++++++++++++++ pte_page = &level1_ident_pgt[ident_pte];
+++++++++++++++++++ ident_pte += PTRS_PER_PTE;
+++++++++++++++++++
+++++++++++++++++++ pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+++++++++++++++++++ }
+++++++++++++++++++
+++++++++++++++++++ /* Install mappings */
+++++++++++++++++++ for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+++++++++++++++++++ pte_t pte;
+++++++++++++++++++
+++++++++++++++++++ if (pfn > max_pfn_mapped)
+++++++++++++++++++ max_pfn_mapped = pfn;
+++++++++++++++++++
+++++++++++++++++++ if (!pte_none(pte_page[pteidx]))
+++++++++++++++++++ continue;
+++++++++++++++++++
+++++++++++++++++++ pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+++++++++++++++++++ pte_page[pteidx] = pte;
+++++++++++++++++++ }
+++++++++++++++++++ }
+++++++++++++++++++
+++++++++++++++++++ for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+++++++++++++++++++ set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+++++++++++++++++++
+++++++++++++++++++ set_page_prot(pmd, PAGE_KERNEL_RO);
+++++++++++++++++++ }
+++++++++++++++++++
+++++++++++++++++++ #ifdef CONFIG_X86_64
+++++++++++++++++++ static void convert_pfn_mfn(void *v)
+++++++++++++++++++ {
+++++++++++++++++++ pte_t *pte = v;
+++++++++++++++++++ int i;
+++++++++++++++++++
+++++++++++++++++++ /* All levels are converted the same way, so just treat them
+++++++++++++++++++ as ptes. */
+++++++++++++++++++ for(i = 0; i < PTRS_PER_PTE; i++)
+++++++++++++++++++ pte[i] = xen_make_pte(pte[i].pte);
+++++++++++++++++++ }
+++++++++++++++++++
+++++++++++++++++++ /*
+++++++++++++++++++ * Set up the initial kernel pagetable.
+++++++++++++++++++ *
+++++++++++++++++++ * We can construct this by grafting the Xen provided pagetable into
+++++++++++++++++++ * head_64.S's preconstructed pagetables. We copy the Xen L2's into
+++++++++++++++++++ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
+++++++++++++++++++ * means that only the kernel has a physical mapping to start with -
+++++++++++++++++++ * but that's enough to get __va working. We need to fill in the rest
+++++++++++++++++++ * of the physical mapping once some sort of allocator has been set
+++++++++++++++++++ * up.
+++++++++++++++++++ */
+++++++++++++++++++ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+++++++++++++++++++ {
+++++++++++++++++++ pud_t *l3;
+++++++++++++++++++ pmd_t *l2;
+++++++++++++++++++
+++++++++++++++++++ /* Zap identity mapping */
+++++++++++++++++++ init_level4_pgt[0] = __pgd(0);
+++++++++++++++++++
+++++++++++++++++++ /* Pre-constructed entries are in pfn, so convert to mfn */
+++++++++++++++++++ convert_pfn_mfn(init_level4_pgt);
+++++++++++++++++++ convert_pfn_mfn(level3_ident_pgt);
+++++++++++++++++++ convert_pfn_mfn(level3_kernel_pgt);
+++++++++++++++++++
+++++++++++++++++++ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+++++++++++++++++++ l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+++++++++++++++++++
+++++++++++++++++++ memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+++++++++++++++++++ memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+++++++++++++++++++
+++++++++++++++++++ l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+++++++++++++++++++ l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+++++++++++++++++++ memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+++++++++++++++++++
+++++++++++++++++++ /* Set up identity map */
+++++++++++++++++++ xen_map_identity_early(level2_ident_pgt, max_pfn);
+++++++++++++++++++
+++++++++++++++++++ /* Make pagetable pieces RO */
+++++++++++++++++++ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+++++++++++++++++++ set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+++++++++++++++++++ set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+++++++++++++++++++ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+++++++++++++++++++ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+++++++++++++++++++ set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+++++++++++++++++++
+++++++++++++++++++ /* Pin down new L4 */
+++++++++++++++++++ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+++++++++++++++++++ PFN_DOWN(__pa_symbol(init_level4_pgt)));
+++++++++++++++++++
+++++++++++++++++++ /* Unpin Xen-provided one */
+++++++++++++++++++ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+++++++++++++++++++
+++++++++++++++++++ /* Switch over */
+++++++++++++++++++ pgd = init_level4_pgt;
+++++++++++++++++++
+++++++++++++++++++ /*
+++++++++++++++++++ * At this stage there can be no user pgd, and no page
+++++++++++++++++++ * structure to attach it to, so make sure we just set the kernel
+++++++++++++++++++ * pgd.
+++++++++++++++++++ */
+++++++++++++++++++ xen_mc_batch();
+++++++++++++++++++ __xen_write_cr3(true, __pa(pgd));
+++++++++++++++++++ xen_mc_issue(PARAVIRT_LAZY_CPU);
+++++++++++++++++++
+++++++++++++++++++ reserve_early(__pa(xen_start_info->pt_base),
+++++++++++++++++++ __pa(xen_start_info->pt_base +
+++++++++++++++++++ xen_start_info->nr_pt_frames * PAGE_SIZE),
+++++++++++++++++++ "XEN PAGETABLES");
+++++++++++++++++++
+++++++++++++++++++ return pgd;
+++++++++++++++++++ }
+++++++++++++++++++ #else /* !CONFIG_X86_64 */
+++++++++++++++++++ static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+++++++++++++++++++
+++++++++++++++++++ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+++++++++++++++++++ {
+++++++++++++++++++ pmd_t *kernel_pmd;
+++++++++++++++++++
+++++++++++++++++++ init_pg_tables_start = __pa(pgd);
+++++++++++++++++++ init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+++++++++++++++++++ max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+++++++++++++++++++
+++++++++++++++++++ kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+++++++++++++++++++ memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+++++++++++++++++++
+++++++++++++++++++ xen_map_identity_early(level2_kernel_pgt, max_pfn);
+++++++++++++++++++
+++++++++++++++++++ memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+++++++++++++++++++ set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+++++++++++++++++++ __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+++++++++++++++++++
+++++++++++++++++++ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+++++++++++++++++++ set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+++++++++++++++++++ set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+++++++++++++++++++
+++++++++++++++++++ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+++++++++++++++++++
+++++++++++++++++++ xen_write_cr3(__pa(swapper_pg_dir));
+++++++++++++++++++
+++++++++++++++++++ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+++++++++++++++++++
+++++++++++++++++++ return swapper_pg_dir;
++++++ ++++ +++ + }
+++++++++++++++++++ #endif /* CONFIG_X86_64 */
++++++ ++++ +++ +
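The pagetable bring-up above leans on two helpers that this hunk does not show, set_page_prot() and pin_pagetable_pfn(). As a rough sketch (not the verbatim patch text), assuming the usual HYPERVISOR_update_va_mapping and HYPERVISOR_mmuext_op hypercall wrappers, they boil down to:

/* Sketch: remap one kernel virtual page with the given protection via a
 * hypercall, since a PV guest may not write live pagetables directly. */
static __init void set_page_prot(void *addr, pgprot_t prot)
{
        unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
        pte_t pte = pfn_pte(pfn, prot);

        if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
                BUG();
}

/* Sketch: ask the hypervisor to (un)pin a pagetable page, e.g. with
 * MMUEXT_PIN_L4_TABLE or MMUEXT_UNPIN_TABLE as used above. */
static __init void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
        struct mmuext_op op;

        op.cmd = cmd;
        op.arg1.mfn = pfn_to_mfn(pfn);
        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
                BUG();
}

Both are __init: once the kernel's own top-level pagetable is pinned and cr3 switched over, later updates go through the normal pv_mmu_ops paths.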
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
pv_apic_ops = xen_apic_ops;
pv_mmu_ops = xen_mmu_ops;
+ if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
+ pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
+ pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
+ }
+
machine_ops = xen_machine_ops;
------------------- #ifdef CONFIG_SMP
------------------- smp_ops = xen_smp_ops;
+++++++++++++++++++ #ifdef CONFIG_X86_64
+++++++++++++++++++ /* Disable until we have direct per-cpu data access. */
+++++++++++++++++++ have_vcpu_info_placement = 0;
+++++++++++++++++++ x86_64_init_pda();
#endif
- xen_setup_features();
+++++++++++++++++++ xen_smp_init();
++++++++++ ++++++++
/* Get mfn list */
if (!xen_feature(XENFEAT_auto_translated_physmap))
- phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
+ xen_build_dynamic_phys_to_machine();
pgd = (pgd_t *)xen_start_info->pt_base;
---------- -------- init_pg_tables_start = __pa(pgd);
------------------- init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
---------- -------- max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT;
-------------------
------------------- init_mm.pgd = pgd; /* use the Xen pagetables to start */
-------------------
------------------- /* keep using Xen gdt for now; no urgent need to change it */
-------------------
------------------- x86_write_percpu(xen_cr3, __pa(pgd));
------------------- x86_write_percpu(xen_current_cr3, __pa(pgd));
+++++++++++++++++++ /* Prevent unwanted bits from being set in PTEs. */
+++++++++++++++++++ __supported_pte_mask &= ~_PAGE_GLOBAL;
+++++++++++++++++++ if (!is_initial_xendomain())
+++++++++++++++++++ __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
/* Don't do the full vcpu_info placement stuff until we have a
possible map and a non-dummy shared_info. */
boot_params.hdr.ramdisk_image = xen_start_info->mod_start
? __pa(xen_start_info->mod_start) : 0;
boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
+++++++++++++++++++ boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
- if (!is_initial_xendomain())
+ if (!is_initial_xendomain()) {
+ add_preferred_console("xenboot", 0, NULL);
+ add_preferred_console("tty", 0, NULL);
add_preferred_console("hvc", 0, NULL);
+ }
+
+++++++++++++++++++ xen_raw_console_write("about to get started...\n");
+++++++++++++++++++
+++++++++++++++++++ #if 0
+++++++++++++++++++ xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
+++++++++++++++++++ &boot_params, __pa_symbol(&boot_params),
+++++++++++++++++++ __va(__pa_symbol(&boot_params)));
+++++++++++++++++++
+++++++++++++++++++ walk(pgd, &boot_params);
+++++++++++++++++++ walk(pgd, __va(__pa(&boot_params)));
+++++++++++++++++++ #endif
++++++++++ ++++++++
/* Start the world */
- start_kernel();
+++++++++++++++++++ #ifdef CONFIG_X86_32
+ i386_start_kernel();
+++++++++++++++++++ #else
+++++++++++++++++++ x86_64_start_reservations((char *)__pa_symbol(&boot_params));
+++++++++++++++++++ #endif
}
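On 64-bit the function above hands over a physical address, __pa_symbol(&boot_params), because x86_64_start_reservations() treats its argument the way the native entry path does: as real-mode data to be mapped back with __va() before the early memory reservations are made. A rough, from-memory sketch of that native counterpart (the exact set of reservations may differ slightly):

void __init x86_64_start_reservations(char *real_mode_data)
{
        /* real_mode_data is a physical address, hence __pa_symbol() above */
        copy_bootdata(__va(real_mode_data));

        /* keep the kernel image itself away from early allocations */
        reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");

        reserve_ebda_region();

        /* (initrd and other early reservations omitted in this sketch) */

        start_kernel();
}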
#ifdef CONFIG_X86_64
------------------- #define PV_SAVE_REGS pushq %rax; pushq %rdi; pushq %rcx; pushq %rdx
------------------- #define PV_RESTORE_REGS popq %rdx; popq %rcx; popq %rdi; popq %rax
+++++++++++++++++++ #define PV_SAVE_REGS \
+++++++++++++++++++ push %rax; \
+++++++++++++++++++ push %rcx; \
+++++++++++++++++++ push %rdx; \
+++++++++++++++++++ push %rsi; \
+++++++++++++++++++ push %rdi; \
+++++++++++++++++++ push %r8; \
+++++++++++++++++++ push %r9; \
+++++++++++++++++++ push %r10; \
+++++++++++++++++++ push %r11
+++++++++++++++++++ #define PV_RESTORE_REGS \
+++++++++++++++++++ pop %r11; \
+++++++++++++++++++ pop %r10; \
+++++++++++++++++++ pop %r9; \
+++++++++++++++++++ pop %r8; \
+++++++++++++++++++ pop %rdi; \
+++++++++++++++++++ pop %rsi; \
+++++++++++++++++++ pop %rdx; \
+++++++++++++++++++ pop %rcx; \
+++++++++++++++++++ pop %rax
#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
+ #define PARA_INDIRECT(addr) *addr(%rip)
#else
#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx
#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax
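The reason the 64-bit save/restore set above grew to the full caller-saved register list, and the reason PARA_INDIRECT was added, is that these macros expand inside asm paravirt call sites that call through the ops table; on 64-bit that indirect call has to be RIP-relative. A typical expansion site, in the style of the irq-disable hook in paravirt.h (shown as an illustration, not as part of this hunk):

#define DISABLE_INTERRUPTS(clobbers)                                    \
        PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
                  PV_SAVE_REGS;                                         \
                  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable);    \
                  PV_RESTORE_REGS;)

Because the target is an ordinary C function, everything the x86-64 ABI treats as caller-saved has to be preserved around the call, hence the longer register list.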
/* Interrupt control for vSMPowered x86_64 systems */
void vsmp_init(void);
- char *machine_specific_memory_setup(void);
+ + #ifdef CONFIG_X86_VISWS
+ + extern void visws_early_detect(void);
+ + extern int is_visws_box(void);
+ + #else
+ + static inline void visws_early_detect(void) { }
+ + static inline int is_visws_box(void) { return 0; }
+ + #endif
+ +
+ + /*
+ + * Any setup quirks to be performed?
+ + */
--- --- -- ---------extern int (*arch_time_init_quirk)(void);
--- --- -- ---------extern int (*arch_pre_intr_init_quirk)(void);
--- --- -- ---------extern int (*arch_intr_init_quirk)(void);
--- --- -- ---------extern int (*arch_trap_init_quirk)(void);
--- --- -- ---------extern char * (*arch_memory_setup_quirk)(void);
--- --- -- ---------extern int (*mach_get_smp_config_quirk)(unsigned int early);
--- --- -- ---------extern int (*mach_find_smp_config_quirk)(unsigned int reserve);
+++ ++++++++++++++++struct mpc_config_processor;
+++ ++++++++++++++++struct mpc_config_bus;
+++ ++++++++++++++++struct mp_config_oemtable;
+++ ++++++++++++++++struct x86_quirks {
+++ ++++++++++++++++ int (*arch_pre_time_init)(void);
+++ ++++++++++++++++ int (*arch_time_init)(void);
+++ ++++++++++++++++ int (*arch_pre_intr_init)(void);
+++ ++++++++++++++++ int (*arch_intr_init)(void);
+++ ++++++++++++++++ int (*arch_trap_init)(void);
+++ ++++++++++++++++ char * (*arch_memory_setup)(void);
+++ ++++++++++++++++ int (*mach_get_smp_config)(unsigned int early);
+++ ++++++++++++++++ int (*mach_find_smp_config)(unsigned int reserve);
+++ ++++++++++++++++
+++ ++++++++++++++++ int *mpc_record;
+++ ++++++++++++++++ int (*mpc_apic_id)(struct mpc_config_processor *m);
+++ ++++++++++++++++ void (*mpc_oem_bus_info)(struct mpc_config_bus *m, char *name);
+++ ++++++++++++++++ void (*mpc_oem_pci_bus)(struct mpc_config_bus *m);
+++ ++++++++++++++++ void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,
+++ ++++++++++++++++ unsigned short oemsize);
+++ ++++++++++++++++};
+++ ++++++++++++++++
+++ ++++++++++++++++extern struct x86_quirks *x86_quirks;
+ +
#ifndef CONFIG_PARAVIRT
#define paravirt_post_allocator_init() do {} while (0)
#endif
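The x86_quirks table above replaces the earlier pile of individual arch_*_quirk function pointers: a sub-architecture fills in only the hooks it needs and installs the table before setup_arch() consults it. A hypothetical registration (every name here except struct x86_quirks and x86_quirks itself is made up for illustration):

/* Hypothetical platform support code: override only the memory-map
 * setup and leave all other quirks at their default (NULL) behaviour. */
static char * __init example_memory_setup(void)
{
        /* build the platform's fixed e820 map here (details omitted) */
        return "Example fixed memory map";
}

static struct x86_quirks example_quirks __initdata = {
        .arch_memory_setup      = example_memory_setup,
};

void __init example_early_detect(void)
{
        /* must run before setup_arch() looks at x86_quirks */
        x86_quirks = &example_quirks;
}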
*/
#define LOWMEMSIZE() (0x9f000)
- struct e820entry;
-
- char * __init machine_specific_memory_setup(void);
- char *memory_setup(void);
+ #ifdef __i386__
- int __init copy_e820_map(struct e820entry *biosmap, int nr_map);
- int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map);
- void __init add_memory_region(unsigned long long start,
- unsigned long long size, int type);
+ void __init i386_start_kernel(void);
+ extern void probe_roms(void);
+ extern unsigned long init_pg_tables_start;
extern unsigned long init_pg_tables_end;
-
+ #else
+++++++++++++++++++ void __init x86_64_init_pda(void);
+ void __init x86_64_start_kernel(char *real_mode);
+ void __init x86_64_start_reservations(char *real_mode_data);
#endif /* __i386__ */
#endif /* _SETUP */
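On the 32-bit side, init_pg_tables_start and init_pg_tables_end are exported here because the new early entry point i386_start_kernel(), which xen_start_kernel() now jumps to instead of calling start_kernel() directly, uses them to early-reserve the boot pagetables. A simplified sketch of its shape (initrd handling omitted):

void __init i386_start_kernel(void)
{
        reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");

        /* keep early allocations off the bootloader-built pagetables */
        reserve_early(init_pg_tables_start, init_pg_tables_end,
                      "INIT_PG_TABLE");

        reserve_ebda_region();

        start_kernel();
}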