However, when run without a hypervisor the kernel is
theoretically slower. If in doubt, say N.
-source "arch/i386/xen/Kconfig"
+source "arch/x86/xen/Kconfig"
config VMI
bool "VMI Paravirt-ops support"
core-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/
# Xen paravirtualization support
-core-$(CONFIG_XEN) += arch/i386/xen/
+core-$(CONFIG_XEN) += arch/x86/xen/
# default subarch .h files
mflags-y += -Iinclude/asm-i386/mach-default
.ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
.asciz "Stack: %p %p %p %p %p %p %p %p\n"
-#include "../xen/xen-head.S"
+#include "../../x86/xen/xen-head.S"
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
* at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
*/
-#include "../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
+#include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
.globl VDSO_NOTE_MASK
ELFNOTE_START(GNU, 2, "a")
+++ /dev/null
-#
-# This Kconfig describes xen options
-#
-
-config XEN
- bool "Enable support for Xen hypervisor"
- depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
- help
- This is the Linux Xen port. Enabling this will allow the
- kernel to boot in a paravirtualized environment under the
- Xen hypervisor.
+++ /dev/null
-obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \
- events.o time.o manage.o xen-asm.o
-
-obj-$(CONFIG_SMP) += smp.o
+++ /dev/null
-/*
- * Core of Xen paravirt_ops implementation.
- *
- * This file contains the xen_paravirt_ops structure itself, and the
- * implementations for:
- * - privileged instructions
- * - interrupt flags
- * - segment operations
- * - booting and setup
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/preempt.h>
-#include <linux/hardirq.h>
-#include <linux/percpu.h>
-#include <linux/delay.h>
-#include <linux/start_kernel.h>
-#include <linux/sched.h>
-#include <linux/bootmem.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/page-flags.h>
-#include <linux/highmem.h>
-#include <linux/smp.h>
-
-#include <xen/interface/xen.h>
-#include <xen/interface/physdev.h>
-#include <xen/interface/vcpu.h>
-#include <xen/interface/sched.h>
-#include <xen/features.h>
-#include <xen/page.h>
-
-#include <asm/paravirt.h>
-#include <asm/page.h>
-#include <asm/xen/hypercall.h>
-#include <asm/xen/hypervisor.h>
-#include <asm/fixmap.h>
-#include <asm/processor.h>
-#include <asm/setup.h>
-#include <asm/desc.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/reboot.h>
-
-#include "xen-ops.h"
-#include "mmu.h"
-#include "multicalls.h"
-
-EXPORT_SYMBOL_GPL(hypercall_page);
-
-DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
-
-DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
-DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
-DEFINE_PER_CPU(unsigned long, xen_cr3);
-
-struct start_info *xen_start_info;
-EXPORT_SYMBOL_GPL(xen_start_info);
-
-static /* __initdata */ struct shared_info dummy_shared_info;
-
-/*
- * Point at some empty memory to start with. We map the real shared_info
- * page as soon as fixmap is up and running.
- */
-struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
-
-/*
- * Flag to determine whether vcpu info placement is available on all
- * VCPUs. We assume it is to start with, and then set it to zero on
- * the first failure. This is because it can succeed on some VCPUs
- * and not others, since it can involve hypervisor memory allocation,
- * or because the guest failed to guarantee all the appropriate
- * constraints on all VCPUs (ie buffer can't cross a page boundary).
- *
- * Note that any particular CPU may be using a placed vcpu structure,
- * but we can only optimise if the all are.
- *
- * 0: not available, 1: available
- */
-static int have_vcpu_info_placement = 1;
-
-static void __init xen_vcpu_setup(int cpu)
-{
- struct vcpu_register_vcpu_info info;
- int err;
- struct vcpu_info *vcpup;
-
- per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
-
- if (!have_vcpu_info_placement)
- return; /* already tested, not available */
-
- vcpup = &per_cpu(xen_vcpu_info, cpu);
-
- info.mfn = virt_to_mfn(vcpup);
- info.offset = offset_in_page(vcpup);
-
- printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
- cpu, vcpup, info.mfn, info.offset);
-
- /* Check to see if the hypervisor will put the vcpu_info
- structure where we want it, which allows direct access via
- a percpu-variable. */
- err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
-
- if (err) {
- printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
- have_vcpu_info_placement = 0;
- } else {
- /* This cpu is using the registered vcpu info, even if
- later ones fail to. */
- per_cpu(xen_vcpu, cpu) = vcpup;
-
- printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
- cpu, vcpup);
- }
-}
-
-static void __init xen_banner(void)
-{
- printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
- paravirt_ops.name);
- printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
-}
-
-static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- unsigned maskedx = ~0;
-
- /*
- * Mask out inconvenient features, to try and disable as many
- * unsupported kernel subsystems as possible.
- */
- if (*eax == 1)
- maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
- (1 << X86_FEATURE_ACPI) | /* disable ACPI */
- (1 << X86_FEATURE_ACC)); /* thermal monitoring */
-
- asm(XEN_EMULATE_PREFIX "cpuid"
- : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
- : "0" (*eax), "2" (*ecx));
- *edx &= maskedx;
-}
-
-static void xen_set_debugreg(int reg, unsigned long val)
-{
- HYPERVISOR_set_debugreg(reg, val);
-}
-
-static unsigned long xen_get_debugreg(int reg)
-{
- return HYPERVISOR_get_debugreg(reg);
-}
-
-static unsigned long xen_save_fl(void)
-{
- struct vcpu_info *vcpu;
- unsigned long flags;
-
- vcpu = x86_read_percpu(xen_vcpu);
-
- /* flag has opposite sense of mask */
- flags = !vcpu->evtchn_upcall_mask;
-
- /* convert to IF type flag
- -0 -> 0x00000000
- -1 -> 0xffffffff
- */
- return (-flags) & X86_EFLAGS_IF;
-}
-
-static void xen_restore_fl(unsigned long flags)
-{
- struct vcpu_info *vcpu;
-
- /* convert from IF type flag */
- flags = !(flags & X86_EFLAGS_IF);
-
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
- preempt_disable();
- vcpu = x86_read_percpu(xen_vcpu);
- vcpu->evtchn_upcall_mask = flags;
- preempt_enable_no_resched();
-
- /* Doesn't matter if we get preempted here, because any
- pending event will get dealt with anyway. */
-
- if (flags == 0) {
- preempt_check_resched();
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- force_evtchn_callback();
- }
-}
-
-static void xen_irq_disable(void)
-{
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
- preempt_disable();
- x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
- preempt_enable_no_resched();
-}
-
-static void xen_irq_enable(void)
-{
- struct vcpu_info *vcpu;
-
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
- preempt_disable();
- vcpu = x86_read_percpu(xen_vcpu);
- vcpu->evtchn_upcall_mask = 0;
- preempt_enable_no_resched();
-
- /* Doesn't matter if we get preempted here, because any
- pending event will get dealt with anyway. */
-
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- force_evtchn_callback();
-}
-
-static void xen_safe_halt(void)
-{
- /* Blocking includes an implicit local_irq_enable(). */
- if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
- BUG();
-}
-
-static void xen_halt(void)
-{
- if (irqs_disabled())
- HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
- else
- xen_safe_halt();
-}
-
-static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
-{
- BUG_ON(preemptible());
-
- switch (mode) {
- case PARAVIRT_LAZY_NONE:
- BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
- break;
-
- case PARAVIRT_LAZY_MMU:
- case PARAVIRT_LAZY_CPU:
- BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
- break;
-
- case PARAVIRT_LAZY_FLUSH:
- /* flush if necessary, but don't change state */
- if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
- xen_mc_flush();
- return;
- }
-
- xen_mc_flush();
- x86_write_percpu(xen_lazy_mode, mode);
-}
-
-static unsigned long xen_store_tr(void)
-{
- return 0;
-}
-
-static void xen_set_ldt(const void *addr, unsigned entries)
-{
- unsigned long linear_addr = (unsigned long)addr;
- struct mmuext_op *op;
- struct multicall_space mcs = xen_mc_entry(sizeof(*op));
-
- op = mcs.args;
- op->cmd = MMUEXT_SET_LDT;
- if (linear_addr) {
- /* ldt my be vmalloced, use arbitrary_virt_to_machine */
- xmaddr_t maddr;
- maddr = arbitrary_virt_to_machine((unsigned long)addr);
- linear_addr = (unsigned long)maddr.maddr;
- }
- op->arg1.linear_addr = linear_addr;
- op->arg2.nr_ents = entries;
-
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_CPU);
-}
-
-static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
-{
- unsigned long *frames;
- unsigned long va = dtr->address;
- unsigned int size = dtr->size + 1;
- unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
- int f;
- struct multicall_space mcs;
-
- /* A GDT can be up to 64k in size, which corresponds to 8192
- 8-byte entries, or 16 4k pages.. */
-
- BUG_ON(size > 65536);
- BUG_ON(va & ~PAGE_MASK);
-
- mcs = xen_mc_entry(sizeof(*frames) * pages);
- frames = mcs.args;
-
- for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
- frames[f] = virt_to_mfn(va);
- make_lowmem_page_readonly((void *)va);
- }
-
- MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
-
- xen_mc_issue(PARAVIRT_LAZY_CPU);
-}
-
-static void load_TLS_descriptor(struct thread_struct *t,
- unsigned int cpu, unsigned int i)
-{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
- xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
- struct multicall_space mc = __xen_mc_entry(0);
-
- MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
-}
-
-static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- xen_mc_batch();
-
- load_TLS_descriptor(t, cpu, 0);
- load_TLS_descriptor(t, cpu, 1);
- load_TLS_descriptor(t, cpu, 2);
-
- xen_mc_issue(PARAVIRT_LAZY_CPU);
-
- /*
- * XXX sleazy hack: If we're being called in a lazy-cpu zone,
- * it means we're in a context switch, and %gs has just been
- * saved. This means we can zero it out to prevent faults on
- * exit from the hypervisor if the next process has no %gs.
- * Either way, it has been saved, and the new value will get
- * loaded properly. This will go away as soon as Xen has been
- * modified to not save/restore %gs for normal hypercalls.
- */
- if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
- loadsegment(gs, 0);
-}
-
-static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
- u32 low, u32 high)
-{
- unsigned long lp = (unsigned long)&dt[entrynum];
- xmaddr_t mach_lp = virt_to_machine(lp);
- u64 entry = (u64)high << 32 | low;
-
- preempt_disable();
-
- xen_mc_flush();
- if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
- BUG();
-
- preempt_enable();
-}
-
-static int cvt_gate_to_trap(int vector, u32 low, u32 high,
- struct trap_info *info)
-{
- u8 type, dpl;
-
- type = (high >> 8) & 0x1f;
- dpl = (high >> 13) & 3;
-
- if (type != 0xf && type != 0xe)
- return 0;
-
- info->vector = vector;
- info->address = (high & 0xffff0000) | (low & 0x0000ffff);
- info->cs = low >> 16;
- info->flags = dpl;
- /* interrupt gates clear IF */
- if (type == 0xe)
- info->flags |= 4;
-
- return 1;
-}
-
-/* Locations of each CPU's IDT */
-static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
-
-/* Set an IDT entry. If the entry is part of the current IDT, then
- also update Xen. */
-static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
- u32 low, u32 high)
-{
- unsigned long p = (unsigned long)&dt[entrynum];
- unsigned long start, end;
-
- preempt_disable();
-
- start = __get_cpu_var(idt_desc).address;
- end = start + __get_cpu_var(idt_desc).size + 1;
-
- xen_mc_flush();
-
- write_dt_entry(dt, entrynum, low, high);
-
- if (p >= start && (p + 8) <= end) {
- struct trap_info info[2];
-
- info[1].address = 0;
-
- if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
- if (HYPERVISOR_set_trap_table(info))
- BUG();
- }
-
- preempt_enable();
-}
-
-static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
- struct trap_info *traps)
-{
- unsigned in, out, count;
-
- count = (desc->size+1) / 8;
- BUG_ON(count > 256);
-
- for (in = out = 0; in < count; in++) {
- const u32 *entry = (u32 *)(desc->address + in * 8);
-
- if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
- out++;
- }
- traps[out].address = 0;
-}
-
-void xen_copy_trap_info(struct trap_info *traps)
-{
- const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
-
- xen_convert_trap_info(desc, traps);
-}
-
-/* Load a new IDT into Xen. In principle this can be per-CPU, so we
- hold a spinlock to protect the static traps[] array (static because
- it avoids allocation, and saves stack space). */
-static void xen_load_idt(const struct Xgt_desc_struct *desc)
-{
- static DEFINE_SPINLOCK(lock);
- static struct trap_info traps[257];
-
- spin_lock(&lock);
-
- __get_cpu_var(idt_desc) = *desc;
-
- xen_convert_trap_info(desc, traps);
-
- xen_mc_flush();
- if (HYPERVISOR_set_trap_table(traps))
- BUG();
-
- spin_unlock(&lock);
-}
-
-/* Write a GDT descriptor entry. Ignore LDT descriptors, since
- they're handled differently. */
-static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
- u32 low, u32 high)
-{
- preempt_disable();
-
- switch ((high >> 8) & 0xff) {
- case DESCTYPE_LDT:
- case DESCTYPE_TSS:
- /* ignore */
- break;
-
- default: {
- xmaddr_t maddr = virt_to_machine(&dt[entry]);
- u64 desc = (u64)high << 32 | low;
-
- xen_mc_flush();
- if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
- BUG();
- }
-
- }
-
- preempt_enable();
-}
-
-static void xen_load_esp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- struct multicall_space mcs = xen_mc_entry(0);
- MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
-}
-
-static void xen_set_iopl_mask(unsigned mask)
-{
- struct physdev_set_iopl set_iopl;
-
- /* Force the change at ring 0. */
- set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
- HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
-}
-
-static void xen_io_delay(void)
-{
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned long xen_apic_read(unsigned long reg)
-{
- return 0;
-}
-
-static void xen_apic_write(unsigned long reg, unsigned long val)
-{
- /* Warn to see if there's any stray references */
- WARN_ON(1);
-}
-#endif
-
-static void xen_flush_tlb(void)
-{
- struct mmuext_op *op;
- struct multicall_space mcs = xen_mc_entry(sizeof(*op));
-
- op = mcs.args;
- op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-}
-
-static void xen_flush_tlb_single(unsigned long addr)
-{
- struct mmuext_op *op;
- struct multicall_space mcs = xen_mc_entry(sizeof(*op));
-
- op = mcs.args;
- op->cmd = MMUEXT_INVLPG_LOCAL;
- op->arg1.linear_addr = addr & PAGE_MASK;
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-}
-
-static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
- unsigned long va)
-{
- struct {
- struct mmuext_op op;
- cpumask_t mask;
- } *args;
- cpumask_t cpumask = *cpus;
- struct multicall_space mcs;
-
- /*
- * A couple of (to be removed) sanity checks:
- *
- * - current CPU must not be in mask
- * - mask must exist :)
- */
- BUG_ON(cpus_empty(cpumask));
- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
- BUG_ON(!mm);
-
- /* If a CPU which we ran on has gone down, OK. */
- cpus_and(cpumask, cpumask, cpu_online_map);
- if (cpus_empty(cpumask))
- return;
-
- mcs = xen_mc_entry(sizeof(*args));
- args = mcs.args;
- args->mask = cpumask;
- args->op.arg2.vcpumask = &args->mask;
-
- if (va == TLB_FLUSH_ALL) {
- args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
- } else {
- args->op.cmd = MMUEXT_INVLPG_MULTI;
- args->op.arg1.linear_addr = va;
- }
-
- MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-}
-
-static void xen_write_cr2(unsigned long cr2)
-{
- x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
-}
-
-static unsigned long xen_read_cr2(void)
-{
- return x86_read_percpu(xen_vcpu)->arch.cr2;
-}
-
-static unsigned long xen_read_cr2_direct(void)
-{
- return x86_read_percpu(xen_vcpu_info.arch.cr2);
-}
-
-static void xen_write_cr4(unsigned long cr4)
-{
- /* Just ignore cr4 changes; Xen doesn't allow us to do
- anything anyway. */
-}
-
-static unsigned long xen_read_cr3(void)
-{
- return x86_read_percpu(xen_cr3);
-}
-
-static void xen_write_cr3(unsigned long cr3)
-{
- BUG_ON(preemptible());
-
- if (cr3 == x86_read_percpu(xen_cr3)) {
- /* just a simple tlb flush */
- xen_flush_tlb();
- return;
- }
-
- x86_write_percpu(xen_cr3, cr3);
-
-
- {
- struct mmuext_op *op;
- struct multicall_space mcs = xen_mc_entry(sizeof(*op));
- unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
-
- op = mcs.args;
- op->cmd = MMUEXT_NEW_BASEPTR;
- op->arg1.mfn = mfn;
-
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_CPU);
- }
-}
-
-/* Early in boot, while setting up the initial pagetable, assume
- everything is pinned. */
-static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
-{
- BUG_ON(mem_map); /* should only be used early */
- make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-}
-
-/* This needs to make sure the new pte page is pinned iff its being
- attached to a pinned pagetable. */
-static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
-{
- struct page *page = pfn_to_page(pfn);
-
- if (PagePinned(virt_to_page(mm->pgd))) {
- SetPagePinned(page);
-
- if (!PageHighMem(page))
- make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
- else
- /* make sure there are no stray mappings of
- this page */
- kmap_flush_unused();
- }
-}
-
-/* This should never happen until we're OK to use struct page */
-static void xen_release_pt(u32 pfn)
-{
- struct page *page = pfn_to_page(pfn);
-
- if (PagePinned(page)) {
- if (!PageHighMem(page))
- make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
- }
-}
-
-#ifdef CONFIG_HIGHPTE
-static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
-{
- pgprot_t prot = PAGE_KERNEL;
-
- if (PagePinned(page))
- prot = PAGE_KERNEL_RO;
-
- if (0 && PageHighMem(page))
- printk("mapping highpte %lx type %d prot %s\n",
- page_to_pfn(page), type,
- (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
-
- return kmap_atomic_prot(page, type, prot);
-}
-#endif
-
-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
-{
- /* If there's an existing pte, then don't allow _PAGE_RW to be set */
- if (pte_val_ma(*ptep) & _PAGE_PRESENT)
- pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
- pte_val_ma(pte));
-
- return pte;
-}
-
-/* Init-time set_pte while constructing initial pagetables, which
- doesn't allow RO pagetable pages to be remapped RW */
-static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
-{
- pte = mask_rw_pte(ptep, pte);
-
- xen_set_pte(ptep, pte);
-}
-
-static __init void xen_pagetable_setup_start(pgd_t *base)
-{
- pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
-
- /* special set_pte for pagetable initialization */
- paravirt_ops.set_pte = xen_set_pte_init;
-
- init_mm.pgd = base;
- /*
- * copy top-level of Xen-supplied pagetable into place. For
- * !PAE we can use this as-is, but for PAE it is a stand-in
- * while we copy the pmd pages.
- */
- memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
-
- if (PTRS_PER_PMD > 1) {
- int i;
- /*
- * For PAE, need to allocate new pmds, rather than
- * share Xen's, since Xen doesn't like pmd's being
- * shared between address spaces.
- */
- for (i = 0; i < PTRS_PER_PGD; i++) {
- if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
- pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
-
- memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
- PAGE_SIZE);
-
- make_lowmem_page_readonly(pmd);
-
- set_pgd(&base[i], __pgd(1 + __pa(pmd)));
- } else
- pgd_clear(&base[i]);
- }
- }
-
- /* make sure zero_page is mapped RO so we can use it in pagetables */
- make_lowmem_page_readonly(empty_zero_page);
- make_lowmem_page_readonly(base);
- /*
- * Switch to new pagetable. This is done before
- * pagetable_init has done anything so that the new pages
- * added to the table can be prepared properly for Xen.
- */
- xen_write_cr3(__pa(base));
-}
-
-static __init void xen_pagetable_setup_done(pgd_t *base)
-{
- /* This will work as long as patching hasn't happened yet
- (which it hasn't) */
- paravirt_ops.alloc_pt = xen_alloc_pt;
- paravirt_ops.set_pte = xen_set_pte;
-
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- /*
- * Create a mapping for the shared info page.
- * Should be set_fixmap(), but shared_info is a machine
- * address with no corresponding pseudo-phys address.
- */
- set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
- PFN_DOWN(xen_start_info->shared_info),
- PAGE_KERNEL);
-
- HYPERVISOR_shared_info =
- (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
-
- } else
- HYPERVISOR_shared_info =
- (struct shared_info *)__va(xen_start_info->shared_info);
-
- /* Actually pin the pagetable down, but we can't set PG_pinned
- yet because the page structures don't exist yet. */
- {
- struct mmuext_op op;
-#ifdef CONFIG_X86_PAE
- op.cmd = MMUEXT_PIN_L3_TABLE;
-#else
- op.cmd = MMUEXT_PIN_L3_TABLE;
-#endif
- op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
- if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
- BUG();
- }
-}
-
-/* This is called once we have the cpu_possible_map */
-void __init xen_setup_vcpu_info_placement(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- xen_vcpu_setup(cpu);
-
- /* xen_vcpu_setup managed to place the vcpu_info within the
- percpu area for all cpus, so make use of it */
- if (have_vcpu_info_placement) {
- printk(KERN_INFO "Xen: using vcpu_info placement\n");
-
- paravirt_ops.save_fl = xen_save_fl_direct;
- paravirt_ops.restore_fl = xen_restore_fl_direct;
- paravirt_ops.irq_disable = xen_irq_disable_direct;
- paravirt_ops.irq_enable = xen_irq_enable_direct;
- paravirt_ops.read_cr2 = xen_read_cr2_direct;
- paravirt_ops.iret = xen_iret_direct;
- }
-}
-
-static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
- unsigned long addr, unsigned len)
-{
- char *start, *end, *reloc;
- unsigned ret;
-
- start = end = reloc = NULL;
-
-#define SITE(x) \
- case PARAVIRT_PATCH(x): \
- if (have_vcpu_info_placement) { \
- start = (char *)xen_##x##_direct; \
- end = xen_##x##_direct_end; \
- reloc = xen_##x##_direct_reloc; \
- } \
- goto patch_site
-
- switch (type) {
- SITE(irq_enable);
- SITE(irq_disable);
- SITE(save_fl);
- SITE(restore_fl);
-#undef SITE
-
- patch_site:
- if (start == NULL || (end-start) > len)
- goto default_patch;
-
- ret = paravirt_patch_insns(insnbuf, len, start, end);
-
- /* Note: because reloc is assigned from something that
- appears to be an array, gcc assumes it's non-null,
- but doesn't know its relationship with start and
- end. */
- if (reloc > start && reloc < end) {
- int reloc_off = reloc - start;
- long *relocp = (long *)(insnbuf + reloc_off);
- long delta = start - (char *)addr;
-
- *relocp += delta;
- }
- break;
-
- default_patch:
- default:
- ret = paravirt_patch_default(type, clobbers, insnbuf,
- addr, len);
- break;
- }
-
- return ret;
-}
-
-static const struct paravirt_ops xen_paravirt_ops __initdata = {
- .paravirt_enabled = 1,
- .shared_kernel_pmd = 0,
-
- .name = "Xen",
- .banner = xen_banner,
-
- .patch = xen_patch,
-
- .memory_setup = xen_memory_setup,
- .arch_setup = xen_arch_setup,
- .init_IRQ = xen_init_IRQ,
- .post_allocator_init = xen_mark_init_mm_pinned,
-
- .time_init = xen_time_init,
- .set_wallclock = xen_set_wallclock,
- .get_wallclock = xen_get_wallclock,
- .get_cpu_khz = xen_cpu_khz,
- .sched_clock = xen_sched_clock,
-
- .cpuid = xen_cpuid,
-
- .set_debugreg = xen_set_debugreg,
- .get_debugreg = xen_get_debugreg,
-
- .clts = native_clts,
-
- .read_cr0 = native_read_cr0,
- .write_cr0 = native_write_cr0,
-
- .read_cr2 = xen_read_cr2,
- .write_cr2 = xen_write_cr2,
-
- .read_cr3 = xen_read_cr3,
- .write_cr3 = xen_write_cr3,
-
- .read_cr4 = native_read_cr4,
- .read_cr4_safe = native_read_cr4_safe,
- .write_cr4 = xen_write_cr4,
-
- .save_fl = xen_save_fl,
- .restore_fl = xen_restore_fl,
- .irq_disable = xen_irq_disable,
- .irq_enable = xen_irq_enable,
- .safe_halt = xen_safe_halt,
- .halt = xen_halt,
- .wbinvd = native_wbinvd,
-
- .read_msr = native_read_msr_safe,
- .write_msr = native_write_msr_safe,
- .read_tsc = native_read_tsc,
- .read_pmc = native_read_pmc,
-
- .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
- .irq_enable_sysexit = NULL, /* never called */
-
- .load_tr_desc = paravirt_nop,
- .set_ldt = xen_set_ldt,
- .load_gdt = xen_load_gdt,
- .load_idt = xen_load_idt,
- .load_tls = xen_load_tls,
-
- .store_gdt = native_store_gdt,
- .store_idt = native_store_idt,
- .store_tr = xen_store_tr,
-
- .write_ldt_entry = xen_write_ldt_entry,
- .write_gdt_entry = xen_write_gdt_entry,
- .write_idt_entry = xen_write_idt_entry,
- .load_esp0 = xen_load_esp0,
-
- .set_iopl_mask = xen_set_iopl_mask,
- .io_delay = xen_io_delay,
-
-#ifdef CONFIG_X86_LOCAL_APIC
- .apic_write = xen_apic_write,
- .apic_write_atomic = xen_apic_write,
- .apic_read = xen_apic_read,
- .setup_boot_clock = paravirt_nop,
- .setup_secondary_clock = paravirt_nop,
- .startup_ipi_hook = paravirt_nop,
-#endif
-
- .flush_tlb_user = xen_flush_tlb,
- .flush_tlb_kernel = xen_flush_tlb,
- .flush_tlb_single = xen_flush_tlb_single,
- .flush_tlb_others = xen_flush_tlb_others,
-
- .pte_update = paravirt_nop,
- .pte_update_defer = paravirt_nop,
-
- .pagetable_setup_start = xen_pagetable_setup_start,
- .pagetable_setup_done = xen_pagetable_setup_done,
-
- .alloc_pt = xen_alloc_pt_init,
- .release_pt = xen_release_pt,
- .alloc_pd = paravirt_nop,
- .alloc_pd_clone = paravirt_nop,
- .release_pd = paravirt_nop,
-
-#ifdef CONFIG_HIGHPTE
- .kmap_atomic_pte = xen_kmap_atomic_pte,
-#endif
-
- .set_pte = NULL, /* see xen_pagetable_setup_* */
- .set_pte_at = xen_set_pte_at,
- .set_pmd = xen_set_pmd,
-
- .pte_val = xen_pte_val,
- .pgd_val = xen_pgd_val,
-
- .make_pte = xen_make_pte,
- .make_pgd = xen_make_pgd,
-
-#ifdef CONFIG_X86_PAE
- .set_pte_atomic = xen_set_pte_atomic,
- .set_pte_present = xen_set_pte_at,
- .set_pud = xen_set_pud,
- .pte_clear = xen_pte_clear,
- .pmd_clear = xen_pmd_clear,
-
- .make_pmd = xen_make_pmd,
- .pmd_val = xen_pmd_val,
-#endif /* PAE */
-
- .activate_mm = xen_activate_mm,
- .dup_mmap = xen_dup_mmap,
- .exit_mmap = xen_exit_mmap,
-
- .set_lazy_mode = xen_set_lazy_mode,
-};
-
-#ifdef CONFIG_SMP
-static const struct smp_ops xen_smp_ops __initdata = {
- .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
- .smp_prepare_cpus = xen_smp_prepare_cpus,
- .cpu_up = xen_cpu_up,
- .smp_cpus_done = xen_smp_cpus_done,
-
- .smp_send_stop = xen_smp_send_stop,
- .smp_send_reschedule = xen_smp_send_reschedule,
- .smp_call_function_mask = xen_smp_call_function_mask,
-};
-#endif /* CONFIG_SMP */
-
-static void xen_reboot(int reason)
-{
-#ifdef CONFIG_SMP
- smp_send_stop();
-#endif
-
- if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
- BUG();
-}
-
-static void xen_restart(char *msg)
-{
- xen_reboot(SHUTDOWN_reboot);
-}
-
-static void xen_emergency_restart(void)
-{
- xen_reboot(SHUTDOWN_reboot);
-}
-
-static void xen_machine_halt(void)
-{
- xen_reboot(SHUTDOWN_poweroff);
-}
-
-static void xen_crash_shutdown(struct pt_regs *regs)
-{
- xen_reboot(SHUTDOWN_crash);
-}
-
-static const struct machine_ops __initdata xen_machine_ops = {
- .restart = xen_restart,
- .halt = xen_machine_halt,
- .power_off = xen_machine_halt,
- .shutdown = xen_machine_halt,
- .crash_shutdown = xen_crash_shutdown,
- .emergency_restart = xen_emergency_restart,
-};
-
-
-/* First C function to be called on Xen boot */
-asmlinkage void __init xen_start_kernel(void)
-{
- pgd_t *pgd;
-
- if (!xen_start_info)
- return;
-
- BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
-
- /* Install Xen paravirt ops */
- paravirt_ops = xen_paravirt_ops;
- machine_ops = xen_machine_ops;
-
-#ifdef CONFIG_SMP
- smp_ops = xen_smp_ops;
-#endif
-
- xen_setup_features();
-
- /* Get mfn list */
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
-
- pgd = (pgd_t *)xen_start_info->pt_base;
-
- init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-
- init_mm.pgd = pgd; /* use the Xen pagetables to start */
-
- /* keep using Xen gdt for now; no urgent need to change it */
-
- x86_write_percpu(xen_cr3, __pa(pgd));
-
-#ifdef CONFIG_SMP
- /* Don't do the full vcpu_info placement stuff until we have a
- possible map. */
- per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
-#else
- /* May as well do it now, since there's no good time to call
- it later on UP. */
- xen_setup_vcpu_info_placement();
-#endif
-
- paravirt_ops.kernel_rpl = 1;
- if (xen_feature(XENFEAT_supervisor_mode_kernel))
- paravirt_ops.kernel_rpl = 0;
-
- /* set the limit of our address space */
- reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
-
- /* set up basic CPUID stuff */
- cpu_detect(&new_cpu_data);
- new_cpu_data.hard_math = 1;
- new_cpu_data.x86_capability[0] = cpuid_edx(1);
-
- /* Poke various useful things into boot_params */
- LOADER_TYPE = (9 << 4) | 0;
- INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
- INITRD_SIZE = xen_start_info->mod_len;
-
- /* Start the world */
- start_kernel();
-}
+++ /dev/null
-/*
- * Xen event channels
- *
- * Xen models interrupts with abstract event channels. Because each
- * domain gets 1024 event channels, but NR_IRQ is not that large, we
- * must dynamically map irqs<->event channels. The event channels
- * interface with the rest of the kernel by defining a xen interrupt
- * chip. When an event is recieved, it is mapped to an irq and sent
- * through the normal interrupt processing path.
- *
- * There are four kinds of events which can be mapped to an event
- * channel:
- *
- * 1. Inter-domain notifications. This includes all the virtual
- * device events, since they're driven by front-ends in another domain
- * (typically dom0).
- * 2. VIRQs, typically used for timers. These are per-cpu events.
- * 3. IPIs.
- * 4. Hardware interrupts. Not supported at present.
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-
-#include <linux/linkage.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/ptrace.h>
-#include <asm/irq.h>
-#include <asm/sync_bitops.h>
-#include <asm/xen/hypercall.h>
-#include <asm/xen/hypervisor.h>
-
-#include <xen/events.h>
-#include <xen/interface/xen.h>
-#include <xen/interface/event_channel.h>
-
-#include "xen-ops.h"
-
-/*
- * This lock protects updates to the following mapping and reference-count
- * arrays. The lock does not need to be acquired to read the mapping tables.
- */
-static DEFINE_SPINLOCK(irq_mapping_update_lock);
-
-/* IRQ <-> VIRQ mapping. */
-static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
-
-/* IRQ <-> IPI mapping */
-static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
-
-/* Packed IRQ information: binding type, sub-type index, and event channel. */
-struct packed_irq
-{
- unsigned short evtchn;
- unsigned char index;
- unsigned char type;
-};
-
-static struct packed_irq irq_info[NR_IRQS];
-
-/* Binding types. */
-enum {
- IRQT_UNBOUND,
- IRQT_PIRQ,
- IRQT_VIRQ,
- IRQT_IPI,
- IRQT_EVTCHN
-};
-
-/* Convenient shorthand for packed representation of an unbound IRQ. */
-#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
-
-static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
- [0 ... NR_EVENT_CHANNELS-1] = -1
-};
-static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
-static u8 cpu_evtchn[NR_EVENT_CHANNELS];
-
-/* Reference counts for bindings to IRQs. */
-static int irq_bindcount[NR_IRQS];
-
-/* Xen will never allocate port zero for any purpose. */
-#define VALID_EVTCHN(chn) ((chn) != 0)
-
-/*
- * Force a proper event-channel callback from Xen after clearing the
- * callback mask. We do this in a very simple manner, by making a call
- * down into Xen. The pending flag will be checked by Xen on return.
- */
-void force_evtchn_callback(void)
-{
- (void)HYPERVISOR_xen_version(0, NULL);
-}
-EXPORT_SYMBOL_GPL(force_evtchn_callback);
-
-static struct irq_chip xen_dynamic_chip;
-
-/* Constructor for packed IRQ information. */
-static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
-{
- return (struct packed_irq) { evtchn, index, type };
-}
-
-/*
- * Accessors for packed IRQ information.
- */
-static inline unsigned int evtchn_from_irq(int irq)
-{
- return irq_info[irq].evtchn;
-}
-
-static inline unsigned int index_from_irq(int irq)
-{
- return irq_info[irq].index;
-}
-
-static inline unsigned int type_from_irq(int irq)
-{
- return irq_info[irq].type;
-}
-
-static inline unsigned long active_evtchns(unsigned int cpu,
- struct shared_info *sh,
- unsigned int idx)
-{
- return (sh->evtchn_pending[idx] &
- cpu_evtchn_mask[cpu][idx] &
- ~sh->evtchn_mask[idx]);
-}
-
-static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
-{
- int irq = evtchn_to_irq[chn];
-
- BUG_ON(irq == -1);
-#ifdef CONFIG_SMP
- irq_desc[irq].affinity = cpumask_of_cpu(cpu);
-#endif
-
- __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
- __set_bit(chn, cpu_evtchn_mask[cpu]);
-
- cpu_evtchn[chn] = cpu;
-}
-
-static void init_evtchn_cpu_bindings(void)
-{
-#ifdef CONFIG_SMP
- int i;
- /* By default all event channels notify CPU#0. */
- for (i = 0; i < NR_IRQS; i++)
- irq_desc[i].affinity = cpumask_of_cpu(0);
-#endif
-
- memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
- memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
-}
-
-static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
-{
- return cpu_evtchn[evtchn];
-}
-
-static inline void clear_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- sync_clear_bit(port, &s->evtchn_pending[0]);
-}
-
-static inline void set_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- sync_set_bit(port, &s->evtchn_pending[0]);
-}
-
-
-/**
- * notify_remote_via_irq - send event to remote end of event channel via irq
- * @irq: irq of event channel to send event to
- *
- * Unlike notify_remote_via_evtchn(), this is safe to use across
- * save/restore. Notifications on a broken connection are silently
- * dropped.
- */
-void notify_remote_via_irq(int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- notify_remote_via_evtchn(evtchn);
-}
-EXPORT_SYMBOL_GPL(notify_remote_via_irq);
-
-static void mask_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- sync_set_bit(port, &s->evtchn_mask[0]);
-}
-
-static void unmask_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- unsigned int cpu = get_cpu();
-
- BUG_ON(!irqs_disabled());
-
- /* Slow path (hypercall) if this is a non-local port. */
- if (unlikely(cpu != cpu_from_evtchn(port))) {
- struct evtchn_unmask unmask = { .port = port };
- (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
- } else {
- struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
-
- sync_clear_bit(port, &s->evtchn_mask[0]);
-
- /*
- * The following is basically the equivalent of
- * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
- * the interrupt edge' if the channel is masked.
- */
- if (sync_test_bit(port, &s->evtchn_pending[0]) &&
- !sync_test_and_set_bit(port / BITS_PER_LONG,
- &vcpu_info->evtchn_pending_sel))
- vcpu_info->evtchn_upcall_pending = 1;
- }
-
- put_cpu();
-}
-
-static int find_unbound_irq(void)
-{
- int irq;
-
- /* Only allocate from dynirq range */
- for (irq = 0; irq < NR_IRQS; irq++)
- if (irq_bindcount[irq] == 0)
- break;
-
- if (irq == NR_IRQS)
- panic("No available IRQ to bind to: increase NR_IRQS!\n");
-
- return irq;
-}
-
-int bind_evtchn_to_irq(unsigned int evtchn)
-{
- int irq;
-
- spin_lock(&irq_mapping_update_lock);
-
- irq = evtchn_to_irq[evtchn];
-
- if (irq == -1) {
- irq = find_unbound_irq();
-
- dynamic_irq_init(irq);
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "event");
-
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
- }
-
- irq_bindcount[irq]++;
-
- spin_unlock(&irq_mapping_update_lock);
-
- return irq;
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
-
-static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
-{
- struct evtchn_bind_ipi bind_ipi;
- int evtchn, irq;
-
- spin_lock(&irq_mapping_update_lock);
-
- irq = per_cpu(ipi_to_irq, cpu)[ipi];
- if (irq == -1) {
- irq = find_unbound_irq();
- if (irq < 0)
- goto out;
-
- dynamic_irq_init(irq);
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "ipi");
-
- bind_ipi.vcpu = cpu;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
- &bind_ipi) != 0)
- BUG();
- evtchn = bind_ipi.port;
-
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
-
- per_cpu(ipi_to_irq, cpu)[ipi] = irq;
-
- bind_evtchn_to_cpu(evtchn, cpu);
- }
-
- irq_bindcount[irq]++;
-
- out:
- spin_unlock(&irq_mapping_update_lock);
- return irq;
-}
-
-
-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
-{
- struct evtchn_bind_virq bind_virq;
- int evtchn, irq;
-
- spin_lock(&irq_mapping_update_lock);
-
- irq = per_cpu(virq_to_irq, cpu)[virq];
-
- if (irq == -1) {
- bind_virq.virq = virq;
- bind_virq.vcpu = cpu;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
- &bind_virq) != 0)
- BUG();
- evtchn = bind_virq.port;
-
- irq = find_unbound_irq();
-
- dynamic_irq_init(irq);
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "virq");
-
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
-
- per_cpu(virq_to_irq, cpu)[virq] = irq;
-
- bind_evtchn_to_cpu(evtchn, cpu);
- }
-
- irq_bindcount[irq]++;
-
- spin_unlock(&irq_mapping_update_lock);
-
- return irq;
-}
-
-static void unbind_from_irq(unsigned int irq)
-{
- struct evtchn_close close;
- int evtchn = evtchn_from_irq(irq);
-
- spin_lock(&irq_mapping_update_lock);
-
- if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
- close.port = evtchn;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
- BUG();
-
- switch (type_from_irq(irq)) {
- case IRQT_VIRQ:
- per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
- [index_from_irq(irq)] = -1;
- break;
- default:
- break;
- }
-
- /* Closed ports are implicitly re-bound to VCPU0. */
- bind_evtchn_to_cpu(evtchn, 0);
-
- evtchn_to_irq[evtchn] = -1;
- irq_info[irq] = IRQ_UNBOUND;
-
- dynamic_irq_init(irq);
- }
-
- spin_unlock(&irq_mapping_update_lock);
-}
-
-int bind_evtchn_to_irqhandler(unsigned int evtchn,
- irqreturn_t (*handler)(int, void *),
- unsigned long irqflags,
- const char *devname, void *dev_id)
-{
- unsigned int irq;
- int retval;
-
- irq = bind_evtchn_to_irq(evtchn);
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
- if (retval != 0) {
- unbind_from_irq(irq);
- return retval;
- }
-
- return irq;
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
-
-int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
- irqreturn_t (*handler)(int, void *),
- unsigned long irqflags, const char *devname, void *dev_id)
-{
- unsigned int irq;
- int retval;
-
- irq = bind_virq_to_irq(virq, cpu);
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
- if (retval != 0) {
- unbind_from_irq(irq);
- return retval;
- }
-
- return irq;
-}
-EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
-
-int bind_ipi_to_irqhandler(enum ipi_vector ipi,
- unsigned int cpu,
- irq_handler_t handler,
- unsigned long irqflags,
- const char *devname,
- void *dev_id)
-{
- int irq, retval;
-
- irq = bind_ipi_to_irq(ipi, cpu);
- if (irq < 0)
- return irq;
-
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
- if (retval != 0) {
- unbind_from_irq(irq);
- return retval;
- }
-
- return irq;
-}
-
-void unbind_from_irqhandler(unsigned int irq, void *dev_id)
-{
- free_irq(irq, dev_id);
- unbind_from_irq(irq);
-}
-EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
-
-void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
-{
- int irq = per_cpu(ipi_to_irq, cpu)[vector];
- BUG_ON(irq < 0);
- notify_remote_via_irq(irq);
-}
-
-
-/*
- * Search the CPUs pending events bitmasks. For each one found, map
- * the event number to an irq, and feed it into do_IRQ() for
- * handling.
- *
- * Xen uses a two-level bitmap to speed searching. The first level is
- * a bitset of words which contain pending event bits. The second
- * level is a bitset of pending events themselves.
- */
-fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
-{
- int cpu = get_cpu();
- struct shared_info *s = HYPERVISOR_shared_info;
- struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
- unsigned long pending_words;
-
- vcpu_info->evtchn_upcall_pending = 0;
-
- /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
- pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
- while (pending_words != 0) {
- unsigned long pending_bits;
- int word_idx = __ffs(pending_words);
- pending_words &= ~(1UL << word_idx);
-
- while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
- int bit_idx = __ffs(pending_bits);
- int port = (word_idx * BITS_PER_LONG) + bit_idx;
- int irq = evtchn_to_irq[port];
-
- if (irq != -1) {
- regs->orig_eax = ~irq;
- do_IRQ(regs);
- }
- }
- }
-
- put_cpu();
-}
-
-/* Rebind an evtchn so that it gets delivered to a specific cpu */
-static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
-{
- struct evtchn_bind_vcpu bind_vcpu;
- int evtchn = evtchn_from_irq(irq);
-
- if (!VALID_EVTCHN(evtchn))
- return;
-
- /* Send future instances of this interrupt to other vcpu. */
- bind_vcpu.port = evtchn;
- bind_vcpu.vcpu = tcpu;
-
- /*
- * If this fails, it usually just indicates that we're dealing with a
- * virq or IPI channel, which don't actually need to be rebound. Ignore
- * it, but don't do the xenlinux-level rebind in that case.
- */
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
- bind_evtchn_to_cpu(evtchn, tcpu);
-}
-
-
-static void set_affinity_irq(unsigned irq, cpumask_t dest)
-{
- unsigned tcpu = first_cpu(dest);
- rebind_irq_to_cpu(irq, tcpu);
-}
-
-static void enable_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- unmask_evtchn(evtchn);
-}
-
-static void disable_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- mask_evtchn(evtchn);
-}
-
-static void ack_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- move_native_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- clear_evtchn(evtchn);
-}
-
-static int retrigger_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
- int ret = 0;
-
- if (VALID_EVTCHN(evtchn)) {
- set_evtchn(evtchn);
- ret = 1;
- }
-
- return ret;
-}
-
-static struct irq_chip xen_dynamic_chip __read_mostly = {
- .name = "xen-dyn",
- .mask = disable_dynirq,
- .unmask = enable_dynirq,
- .ack = ack_dynirq,
- .set_affinity = set_affinity_irq,
- .retrigger = retrigger_dynirq,
-};
-
-void __init xen_init_IRQ(void)
-{
- int i;
-
- init_evtchn_cpu_bindings();
-
- /* No event channels are 'live' right now. */
- for (i = 0; i < NR_EVENT_CHANNELS; i++)
- mask_evtchn(i);
-
- /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
- for (i = 0; i < NR_IRQS; i++)
- irq_bindcount[i] = 0;
-
- irq_ctx_init(smp_processor_id());
-}
+++ /dev/null
-/******************************************************************************
- * features.c
- *
- * Xen feature flags.
- *
- * Copyright (c) 2006, Ian Campbell, XenSource Inc.
- */
-#include <linux/types.h>
-#include <linux/cache.h>
-#include <linux/module.h>
-#include <asm/xen/hypervisor.h>
-#include <xen/features.h>
-
-u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
-EXPORT_SYMBOL_GPL(xen_features);
-
-void xen_setup_features(void)
-{
- struct xen_feature_info fi;
- int i, j;
-
- for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
- fi.submap_idx = i;
- if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
- break;
- for (j = 0; j < 32; j++)
- xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
- }
-}
+++ /dev/null
-/*
- * Handle extern requests for shutdown, reboot and sysrq
- */
-#include <linux/kernel.h>
-#include <linux/err.h>
-#include <linux/reboot.h>
-#include <linux/sysrq.h>
-
-#include <xen/xenbus.h>
-
-#define SHUTDOWN_INVALID -1
-#define SHUTDOWN_POWEROFF 0
-#define SHUTDOWN_SUSPEND 2
-/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
- * report a crash, not be instructed to crash!
- * HALT is the same as POWEROFF, as far as we're concerned. The tools use
- * the distinction when we return the reason code to them.
- */
-#define SHUTDOWN_HALT 4
-
-/* Ignore multiple shutdown requests. */
-static int shutting_down = SHUTDOWN_INVALID;
-
-static void shutdown_handler(struct xenbus_watch *watch,
- const char **vec, unsigned int len)
-{
- char *str;
- struct xenbus_transaction xbt;
- int err;
-
- if (shutting_down != SHUTDOWN_INVALID)
- return;
-
- again:
- err = xenbus_transaction_start(&xbt);
- if (err)
- return;
-
- str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
- /* Ignore read errors and empty reads. */
- if (XENBUS_IS_ERR_READ(str)) {
- xenbus_transaction_end(xbt, 1);
- return;
- }
-
- xenbus_write(xbt, "control", "shutdown", "");
-
- err = xenbus_transaction_end(xbt, 0);
- if (err == -EAGAIN) {
- kfree(str);
- goto again;
- }
-
- if (strcmp(str, "poweroff") == 0 ||
- strcmp(str, "halt") == 0)
- orderly_poweroff(false);
- else if (strcmp(str, "reboot") == 0)
- ctrl_alt_del();
- else {
- printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
- shutting_down = SHUTDOWN_INVALID;
- }
-
- kfree(str);
-}
-
-static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
- unsigned int len)
-{
- char sysrq_key = '\0';
- struct xenbus_transaction xbt;
- int err;
-
- again:
- err = xenbus_transaction_start(&xbt);
- if (err)
- return;
- if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
- printk(KERN_ERR "Unable to read sysrq code in "
- "control/sysrq\n");
- xenbus_transaction_end(xbt, 1);
- return;
- }
-
- if (sysrq_key != '\0')
- xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
-
- err = xenbus_transaction_end(xbt, 0);
- if (err == -EAGAIN)
- goto again;
-
- if (sysrq_key != '\0')
- handle_sysrq(sysrq_key, NULL);
-}
-
-static struct xenbus_watch shutdown_watch = {
- .node = "control/shutdown",
- .callback = shutdown_handler
-};
-
-static struct xenbus_watch sysrq_watch = {
- .node = "control/sysrq",
- .callback = sysrq_handler
-};
-
-static int setup_shutdown_watcher(void)
-{
- int err;
-
- err = register_xenbus_watch(&shutdown_watch);
- if (err) {
- printk(KERN_ERR "Failed to set shutdown watcher\n");
- return err;
- }
-
- err = register_xenbus_watch(&sysrq_watch);
- if (err) {
- printk(KERN_ERR "Failed to set sysrq watcher\n");
- return err;
- }
-
- return 0;
-}
-
-static int shutdown_event(struct notifier_block *notifier,
- unsigned long event,
- void *data)
-{
- setup_shutdown_watcher();
- return NOTIFY_DONE;
-}
-
-static int __init setup_shutdown_event(void)
-{
- static struct notifier_block xenstore_notifier = {
- .notifier_call = shutdown_event
- };
- register_xenstore_notifier(&xenstore_notifier);
-
- return 0;
-}
-
-subsys_initcall(setup_shutdown_event);
+++ /dev/null
-/*
- * Xen mmu operations
- *
- * This file contains the various mmu fetch and update operations.
- * The most important job they must perform is the mapping between the
- * domain's pfn and the overall machine mfns.
- *
- * Xen allows guests to directly update the pagetable, in a controlled
- * fashion. In other words, the guest modifies the same pagetable
- * that the CPU actually uses, which eliminates the overhead of having
- * a separate shadow pagetable.
- *
- * In order to allow this, it falls on the guest domain to map its
- * notion of a "physical" pfn - which is just a domain-local linear
- * address - into a real "machine address" which the CPU's MMU can
- * use.
- *
- * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
- * inserted directly into the pagetable. When creating a new
- * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
- * when reading the content back with __(pgd|pmd|pte)_val, it converts
- * the mfn back into a pfn.
- *
- * The other constraint is that all pages which make up a pagetable
- * must be mapped read-only in the guest. This prevents uncontrolled
- * guest updates to the pagetable. Xen strictly enforces this, and
- * will disallow any pagetable update which will end up mapping a
- * pagetable page RW, and will disallow using any writable page as a
- * pagetable.
- *
- * Naively, when loading %cr3 with the base of a new pagetable, Xen
- * would need to validate the whole pagetable before going on.
- * Naturally, this is quite slow. The solution is to "pin" a
- * pagetable, which enforces all the constraints on the pagetable even
- * when it is not actively in use. This menas that Xen can be assured
- * that it is still valid when you do load it into %cr3, and doesn't
- * need to revalidate it.
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-#include <linux/sched.h>
-#include <linux/highmem.h>
-#include <linux/bug.h>
-#include <linux/sched.h>
-
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/paravirt.h>
-
-#include <asm/xen/hypercall.h>
-#include <asm/xen/hypervisor.h>
-
-#include <xen/page.h>
-#include <xen/interface/xen.h>
-
-#include "multicalls.h"
-#include "mmu.h"
-
-xmaddr_t arbitrary_virt_to_machine(unsigned long address)
-{
- pte_t *pte = lookup_address(address);
- unsigned offset = address & PAGE_MASK;
-
- BUG_ON(pte == NULL);
-
- return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
-}
-
-void make_lowmem_page_readonly(void *vaddr)
-{
- pte_t *pte, ptev;
- unsigned long address = (unsigned long)vaddr;
-
- pte = lookup_address(address);
- BUG_ON(pte == NULL);
-
- ptev = pte_wrprotect(*pte);
-
- if (HYPERVISOR_update_va_mapping(address, ptev, 0))
- BUG();
-}
-
-void make_lowmem_page_readwrite(void *vaddr)
-{
- pte_t *pte, ptev;
- unsigned long address = (unsigned long)vaddr;
-
- pte = lookup_address(address);
- BUG_ON(pte == NULL);
-
- ptev = pte_mkwrite(*pte);
-
- if (HYPERVISOR_update_va_mapping(address, ptev, 0))
- BUG();
-}
-
-
-void xen_set_pmd(pmd_t *ptr, pmd_t val)
-{
- struct multicall_space mcs;
- struct mmu_update *u;
-
- preempt_disable();
-
- mcs = xen_mc_entry(sizeof(*u));
- u = mcs.args;
- u->ptr = virt_to_machine(ptr).maddr;
- u->val = pmd_val_ma(val);
- MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-
- preempt_enable();
-}
-
-/*
- * Associate a virtual page frame with a given physical page frame
- * and protection flags for that frame.
- */
-void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = swapper_pg_dir + pgd_index(vaddr);
- if (pgd_none(*pgd)) {
- BUG();
- return;
- }
- pud = pud_offset(pgd, vaddr);
- if (pud_none(*pud)) {
- BUG();
- return;
- }
- pmd = pmd_offset(pud, vaddr);
- if (pmd_none(*pmd)) {
- BUG();
- return;
- }
- pte = pte_offset_kernel(pmd, vaddr);
- /* <mfn,flags> stored as-is, to permit clearing entries */
- xen_set_pte(pte, mfn_pte(mfn, flags));
-
- /*
- * It's enough to flush this one mapping.
- * (PGE mappings get flushed as well)
- */
- __flush_tlb_one(vaddr);
-}
-
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
-{
- if (mm == current->mm || mm == &init_mm) {
- if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- struct multicall_space mcs;
- mcs = xen_mc_entry(0);
-
- MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
- return;
- } else
- if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
- return;
- }
- xen_set_pte(ptep, pteval);
-}
-
-#ifdef CONFIG_X86_PAE
-void xen_set_pud(pud_t *ptr, pud_t val)
-{
- struct multicall_space mcs;
- struct mmu_update *u;
-
- preempt_disable();
-
- mcs = xen_mc_entry(sizeof(*u));
- u = mcs.args;
- u->ptr = virt_to_machine(ptr).maddr;
- u->val = pud_val_ma(val);
- MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-
- preempt_enable();
-}
-
-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
- ptep->pte_high = pte.pte_high;
- smp_wmb();
- ptep->pte_low = pte.pte_low;
-}
-
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
- set_64bit((u64 *)ptep, pte_val_ma(pte));
-}
-
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- ptep->pte_low = 0;
- smp_wmb(); /* make sure low gets written first */
- ptep->pte_high = 0;
-}
-
-void xen_pmd_clear(pmd_t *pmdp)
-{
- xen_set_pmd(pmdp, __pmd(0));
-}
-
-unsigned long long xen_pte_val(pte_t pte)
-{
- unsigned long long ret = 0;
-
- if (pte.pte_low) {
- ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
- }
-
- return ret;
-}
-
-unsigned long long xen_pmd_val(pmd_t pmd)
-{
- unsigned long long ret = pmd.pmd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
- return ret;
-}
-
-unsigned long long xen_pgd_val(pgd_t pgd)
-{
- unsigned long long ret = pgd.pgd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
- return ret;
-}
-
-pte_t xen_make_pte(unsigned long long pte)
-{
- if (pte & 1)
- pte = phys_to_machine(XPADDR(pte)).maddr;
-
- return (pte_t){ pte, pte >> 32 };
-}
-
-pmd_t xen_make_pmd(unsigned long long pmd)
-{
- if (pmd & 1)
- pmd = phys_to_machine(XPADDR(pmd)).maddr;
-
- return (pmd_t){ pmd };
-}
-
-pgd_t xen_make_pgd(unsigned long long pgd)
-{
- if (pgd & _PAGE_PRESENT)
- pgd = phys_to_machine(XPADDR(pgd)).maddr;
-
- return (pgd_t){ pgd };
-}
-#else /* !PAE */
-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
- *ptep = pte;
-}
-
-unsigned long xen_pte_val(pte_t pte)
-{
- unsigned long ret = pte.pte_low;
-
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr;
-
- return ret;
-}
-
-unsigned long xen_pgd_val(pgd_t pgd)
-{
- unsigned long ret = pgd.pgd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
- return ret;
-}
-
-pte_t xen_make_pte(unsigned long pte)
-{
- if (pte & _PAGE_PRESENT)
- pte = phys_to_machine(XPADDR(pte)).maddr;
-
- return (pte_t){ pte };
-}
-
-pgd_t xen_make_pgd(unsigned long pgd)
-{
- if (pgd & _PAGE_PRESENT)
- pgd = phys_to_machine(XPADDR(pgd)).maddr;
-
- return (pgd_t){ pgd };
-}
-#endif /* CONFIG_X86_PAE */
-
-
-
-/*
- (Yet another) pagetable walker. This one is intended for pinning a
- pagetable. This means that it walks a pagetable and calls the
- callback function on each page it finds making up the page table,
- at every level. It walks the entire pagetable, but it only bothers
- pinning pte pages which are below pte_limit. In the normal case
- this will be TASK_SIZE, but at boot we need to pin up to
- FIXADDR_TOP. But the important bit is that we don't pin beyond
- there, because then we start getting into Xen's ptes.
-*/
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
- unsigned long limit)
-{
- pgd_t *pgd = pgd_base;
- int flush = 0;
- unsigned long addr = 0;
- unsigned long pgd_next;
-
- BUG_ON(limit > FIXADDR_TOP);
-
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return 0;
-
- for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
- pud_t *pud;
- unsigned long pud_limit, pud_next;
-
- pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
-
- if (!pgd_val(*pgd))
- continue;
-
- pud = pud_offset(pgd, 0);
-
- if (PTRS_PER_PUD > 1) /* not folded */
- flush |= (*func)(virt_to_page(pud), 0);
-
- for (; addr != pud_limit; pud++, addr = pud_next) {
- pmd_t *pmd;
- unsigned long pmd_limit;
-
- pud_next = pud_addr_end(addr, pud_limit);
-
- if (pud_next < limit)
- pmd_limit = pud_next;
- else
- pmd_limit = limit;
-
- if (pud_none(*pud))
- continue;
-
- pmd = pmd_offset(pud, 0);
-
- if (PTRS_PER_PMD > 1) /* not folded */
- flush |= (*func)(virt_to_page(pmd), 0);
-
- for (; addr != pmd_limit; pmd++) {
- addr += (PAGE_SIZE * PTRS_PER_PTE);
- if ((pmd_limit-1) < (addr-1)) {
- addr = pmd_limit;
- break;
- }
-
- if (pmd_none(*pmd))
- continue;
-
- flush |= (*func)(pmd_page(*pmd), 0);
- }
- }
- }
-
- flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
-
- return flush;
-}
-
-static int pin_page(struct page *page, unsigned flags)
-{
- unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
- int flush;
-
- if (pgfl)
- flush = 0; /* already pinned */
- else if (PageHighMem(page))
- /* kmaps need flushing if we found an unpinned
- highpage */
- flush = 1;
- else {
- void *pt = lowmem_page_address(page);
- unsigned long pfn = page_to_pfn(page);
- struct multicall_space mcs = __xen_mc_entry(0);
-
- flush = 0;
-
- MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
- pfn_pte(pfn, PAGE_KERNEL_RO),
- flags);
- }
-
- return flush;
-}
-
-/* This is called just after a mm has been created, but it has not
- been used yet. We need to make sure that its pagetable is all
- read-only, and can be pinned. */
-void xen_pgd_pin(pgd_t *pgd)
-{
- struct multicall_space mcs;
- struct mmuext_op *op;
-
- xen_mc_batch();
-
- if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
- /* re-enable interrupts for kmap_flush_unused */
- xen_mc_issue(0);
- kmap_flush_unused();
- xen_mc_batch();
- }
-
- mcs = __xen_mc_entry(sizeof(*op));
- op = mcs.args;
-
-#ifdef CONFIG_X86_PAE
- op->cmd = MMUEXT_PIN_L3_TABLE;
-#else
- op->cmd = MMUEXT_PIN_L2_TABLE;
-#endif
- op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- xen_mc_issue(0);
-}
-
-/* The init_mm pagetable is really pinned as soon as its created, but
- that's before we have page structures to store the bits. So do all
- the book-keeping now. */
-static __init int mark_pinned(struct page *page, unsigned flags)
-{
- SetPagePinned(page);
- return 0;
-}
-
-void __init xen_mark_init_mm_pinned(void)
-{
- pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
-}
-
-static int unpin_page(struct page *page, unsigned flags)
-{
- unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
-
- if (pgfl && !PageHighMem(page)) {
- void *pt = lowmem_page_address(page);
- unsigned long pfn = page_to_pfn(page);
- struct multicall_space mcs = __xen_mc_entry(0);
-
- MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
- pfn_pte(pfn, PAGE_KERNEL),
- flags);
- }
-
- return 0; /* never need to flush on unpin */
-}
-
-/* Release a pagetables pages back as normal RW */
-static void xen_pgd_unpin(pgd_t *pgd)
-{
- struct mmuext_op *op;
- struct multicall_space mcs;
-
- xen_mc_batch();
-
- mcs = __xen_mc_entry(sizeof(*op));
-
- op = mcs.args;
- op->cmd = MMUEXT_UNPIN_TABLE;
- op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- pgd_walk(pgd, unpin_page, TASK_SIZE);
-
- xen_mc_issue(0);
-}
-
-void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
-{
- spin_lock(&next->page_table_lock);
- xen_pgd_pin(next->pgd);
- spin_unlock(&next->page_table_lock);
-}
-
-void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
-{
- spin_lock(&mm->page_table_lock);
- xen_pgd_pin(mm->pgd);
- spin_unlock(&mm->page_table_lock);
-}
-
-
-#ifdef CONFIG_SMP
-/* Another cpu may still have their %cr3 pointing at the pagetable, so
- we need to repoint it somewhere else before we can unpin it. */
-static void drop_other_mm_ref(void *info)
-{
- struct mm_struct *mm = info;
-
- if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
- leave_mm(smp_processor_id());
-}
-
-static void drop_mm_ref(struct mm_struct *mm)
-{
- if (current->active_mm == mm) {
- if (current->mm == mm)
- load_cr3(swapper_pg_dir);
- else
- leave_mm(smp_processor_id());
- }
-
- if (!cpus_empty(mm->cpu_vm_mask))
- xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
- mm, 1);
-}
-#else
-static void drop_mm_ref(struct mm_struct *mm)
-{
- if (current->active_mm == mm)
- load_cr3(swapper_pg_dir);
-}
-#endif
-
-/*
- * While a process runs, Xen pins its pagetables, which means that the
- * hypervisor forces it to be read-only, and it controls all updates
- * to it. This means that all pagetable updates have to go via the
- * hypervisor, which is moderately expensive.
- *
- * Since we're pulling the pagetable down, we switch to use init_mm,
- * unpin old process pagetable and mark it all read-write, which
- * allows further operations on it to be simple memory accesses.
- *
- * The only subtle point is that another CPU may be still using the
- * pagetable because of lazy tlb flushing. This means we need need to
- * switch all CPUs off this pagetable before we can unpin it.
- */
-void xen_exit_mmap(struct mm_struct *mm)
-{
- get_cpu(); /* make sure we don't move around */
- drop_mm_ref(mm);
- put_cpu();
-
- spin_lock(&mm->page_table_lock);
-
- /* pgd may not be pinned in the error exit path of execve */
- if (PagePinned(virt_to_page(mm->pgd)))
- xen_pgd_unpin(mm->pgd);
- spin_unlock(&mm->page_table_lock);
-}
+++ /dev/null
-#ifndef _XEN_MMU_H
-
-#include <linux/linkage.h>
-#include <asm/page.h>
-
-/*
- * Page-directory addresses above 4GB do not fit into architectural %cr3.
- * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
- * must use the following accessor macros to pack/unpack valid MFNs.
- *
- * Note that Xen is using the fact that the pagetable base is always
- * page-aligned, and putting the 12 MSB of the address into the 12 LSB
- * of cr3.
- */
-#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
-#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
-
-
-void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-void xen_set_pte(pte_t *ptep, pte_t pteval);
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval);
-void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
-
-void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
-void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
-void xen_exit_mmap(struct mm_struct *mm);
-
-void xen_pgd_pin(pgd_t *pgd);
-//void xen_pgd_unpin(pgd_t *pgd);
-
-#ifdef CONFIG_X86_PAE
-unsigned long long xen_pte_val(pte_t);
-unsigned long long xen_pmd_val(pmd_t);
-unsigned long long xen_pgd_val(pgd_t);
-
-pte_t xen_make_pte(unsigned long long);
-pmd_t xen_make_pmd(unsigned long long);
-pgd_t xen_make_pgd(unsigned long long);
-
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval);
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
-void xen_set_pud(pud_t *ptr, pud_t val);
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_pmd_clear(pmd_t *pmdp);
-
-
-#else
-unsigned long xen_pte_val(pte_t);
-unsigned long xen_pmd_val(pmd_t);
-unsigned long xen_pgd_val(pgd_t);
-
-pte_t xen_make_pte(unsigned long);
-pmd_t xen_make_pmd(unsigned long);
-pgd_t xen_make_pgd(unsigned long);
-#endif
-
-#endif /* _XEN_MMU_H */
+++ /dev/null
-/*
- * Xen hypercall batching.
- *
- * Xen allows multiple hypercalls to be issued at once, using the
- * multicall interface. This allows the cost of trapping into the
- * hypervisor to be amortized over several calls.
- *
- * This file implements a simple interface for multicalls. There's a
- * per-cpu buffer of outstanding multicalls. When you want to queue a
- * multicall for issuing, you can allocate a multicall slot for the
- * call and its arguments, along with storage for space which is
- * pointed to by the arguments (for passing pointers to structures,
- * etc). When the multicall is actually issued, all the space for the
- * commands and allocated memory is freed for reuse.
- *
- * Multicalls are flushed whenever any of the buffers get full, or
- * when explicitly requested. There's no way to get per-multicall
- * return results back. It will BUG if any of the multicalls fail.
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-#include <linux/percpu.h>
-#include <linux/hardirq.h>
-
-#include <asm/xen/hypercall.h>
-
-#include "multicalls.h"
-
-#define MC_BATCH 32
-#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
-
-struct mc_buffer {
- struct multicall_entry entries[MC_BATCH];
- u64 args[MC_ARGS];
- unsigned mcidx, argidx;
-};
-
-static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
-DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
-
-void xen_mc_flush(void)
-{
- struct mc_buffer *b = &__get_cpu_var(mc_buffer);
- int ret = 0;
- unsigned long flags;
-
- BUG_ON(preemptible());
-
- /* Disable interrupts in case someone comes in and queues
- something in the middle */
- local_irq_save(flags);
-
- if (b->mcidx) {
- int i;
-
- if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
- BUG();
- for (i = 0; i < b->mcidx; i++)
- if (b->entries[i].result < 0)
- ret++;
- b->mcidx = 0;
- b->argidx = 0;
- } else
- BUG_ON(b->argidx != 0);
-
- local_irq_restore(flags);
-
- BUG_ON(ret);
-}
-
-struct multicall_space __xen_mc_entry(size_t args)
-{
- struct mc_buffer *b = &__get_cpu_var(mc_buffer);
- struct multicall_space ret;
- unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
-
- BUG_ON(preemptible());
- BUG_ON(argspace > MC_ARGS);
-
- if (b->mcidx == MC_BATCH ||
- (b->argidx + argspace) > MC_ARGS)
- xen_mc_flush();
-
- ret.mc = &b->entries[b->mcidx];
- b->mcidx++;
- ret.args = &b->args[b->argidx];
- b->argidx += argspace;
-
- return ret;
-}
+++ /dev/null
-#ifndef _XEN_MULTICALLS_H
-#define _XEN_MULTICALLS_H
-
-#include "xen-ops.h"
-
-/* Multicalls */
-struct multicall_space
-{
- struct multicall_entry *mc;
- void *args;
-};
-
-/* Allocate room for a multicall and its args */
-struct multicall_space __xen_mc_entry(size_t args);
-
-DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
-
-/* Call to start a batch of multiple __xen_mc_entry()s. Must be
- paired with xen_mc_issue() */
-static inline void xen_mc_batch(void)
-{
- /* need to disable interrupts until this entry is complete */
- local_irq_save(__get_cpu_var(xen_mc_irq_flags));
-}
-
-static inline struct multicall_space xen_mc_entry(size_t args)
-{
- xen_mc_batch();
- return __xen_mc_entry(args);
-}
-
-/* Flush all pending multicalls */
-void xen_mc_flush(void);
-
-/* Issue a multicall if we're not in a lazy mode */
-static inline void xen_mc_issue(unsigned mode)
-{
- if ((xen_get_lazy_mode() & mode) == 0)
- xen_mc_flush();
-
- /* restore flags saved in xen_mc_batch */
- local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
-}
-
-#endif /* _XEN_MULTICALLS_H */
+++ /dev/null
-/*
- * Machine specific setup for xen
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/pm.h>
-
-#include <asm/elf.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-#include <asm/xen/hypervisor.h>
-#include <asm/xen/hypercall.h>
-
-#include <xen/interface/physdev.h>
-#include <xen/features.h>
-
-#include "xen-ops.h"
-#include "vdso.h"
-
-/* These are code, but not functions. Defined in entry.S */
-extern const char xen_hypervisor_callback[];
-extern const char xen_failsafe_callback[];
-
-unsigned long *phys_to_machine_mapping;
-EXPORT_SYMBOL(phys_to_machine_mapping);
-
-/**
- * machine_specific_memory_setup - Hook for machine specific memory setup.
- **/
-
-char * __init xen_memory_setup(void)
-{
- unsigned long max_pfn = xen_start_info->nr_pages;
-
- e820.nr_map = 0;
- add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
-
- return "Xen";
-}
-
-static void xen_idle(void)
-{
- local_irq_disable();
-
- if (need_resched())
- local_irq_enable();
- else {
- current_thread_info()->status &= ~TS_POLLING;
- smp_mb__after_clear_bit();
- safe_halt();
- current_thread_info()->status |= TS_POLLING;
- }
-}
-
-/*
- * Set the bit indicating "nosegneg" library variants should be used.
- */
-static void fiddle_vdso(void)
-{
- extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */
- extern char vsyscall_int80_start;
- u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK +
- &vsyscall_int80_start);
- *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
-}
-
-void __init xen_arch_setup(void)
-{
- struct physdev_set_iopl set_iopl;
- int rc;
-
- HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
- HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
-
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
-
- HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
- __KERNEL_CS, (unsigned long)xen_failsafe_callback);
-
- set_iopl.iopl = 1;
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
- if (rc != 0)
- printk(KERN_INFO "physdev_op failed %d\n", rc);
-
-#ifdef CONFIG_ACPI
- if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
- printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
- disable_acpi();
- }
-#endif
-
- memcpy(boot_command_line, xen_start_info->cmd_line,
- MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
- COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
-
- pm_idle = xen_idle;
-
-#ifdef CONFIG_SMP
- /* fill cpus_possible with all available cpus */
- xen_fill_possible_map();
-#endif
-
- paravirt_disable_iospace();
-
- fiddle_vdso();
-}
+++ /dev/null
-/*
- * Xen SMP support
- *
- * This file implements the Xen versions of smp_ops. SMP under Xen is
- * very straightforward. Bringing a CPU up is simply a matter of
- * loading its initial context and setting it running.
- *
- * IPIs are handled through the Xen event mechanism.
- *
- * Because virtual CPUs can be scheduled onto any real CPU, there's no
- * useful topology information for the kernel to make use of. As a
- * result, all CPUs are treated as if they're single-core and
- * single-threaded.
- *
- * This does not handle HOTPLUG_CPU yet.
- */
-#include <linux/sched.h>
-#include <linux/err.h>
-#include <linux/smp.h>
-
-#include <asm/paravirt.h>
-#include <asm/desc.h>
-#include <asm/pgtable.h>
-#include <asm/cpu.h>
-
-#include <xen/interface/xen.h>
-#include <xen/interface/vcpu.h>
-
-#include <asm/xen/interface.h>
-#include <asm/xen/hypercall.h>
-
-#include <xen/page.h>
-#include <xen/events.h>
-
-#include "xen-ops.h"
-#include "mmu.h"
-
-static cpumask_t cpu_initialized_map;
-static DEFINE_PER_CPU(int, resched_irq);
-static DEFINE_PER_CPU(int, callfunc_irq);
-
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);
-
-struct call_data_struct {
- void (*func) (void *info);
- void *info;
- atomic_t started;
- atomic_t finished;
- int wait;
-};
-
-static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
-
-static struct call_data_struct *call_data;
-
-/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
- */
-static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
-{
- return IRQ_HANDLED;
-}
-
-static __cpuinit void cpu_bringup_and_idle(void)
-{
- int cpu = smp_processor_id();
-
- cpu_init();
-
- preempt_disable();
- per_cpu(cpu_state, cpu) = CPU_ONLINE;
-
- xen_setup_cpu_clockevents();
-
- /* We can take interrupts now: we're officially "up". */
- local_irq_enable();
-
- wmb(); /* make sure everything is out */
- cpu_idle();
-}
-
-static int xen_smp_intr_init(unsigned int cpu)
-{
- int rc;
- const char *resched_name, *callfunc_name;
-
- per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
-
- resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
- rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
- cpu,
- xen_reschedule_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
- resched_name,
- NULL);
- if (rc < 0)
- goto fail;
- per_cpu(resched_irq, cpu) = rc;
-
- callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
- rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
- cpu,
- xen_call_function_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
- callfunc_name,
- NULL);
- if (rc < 0)
- goto fail;
- per_cpu(callfunc_irq, cpu) = rc;
-
- return 0;
-
- fail:
- if (per_cpu(resched_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
- if (per_cpu(callfunc_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
- return rc;
-}
-
-void __init xen_fill_possible_map(void)
-{
- int i, rc;
-
- for (i = 0; i < NR_CPUS; i++) {
- rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
- if (rc >= 0)
- cpu_set(i, cpu_possible_map);
- }
-}
-
-void __init xen_smp_prepare_boot_cpu(void)
-{
- int cpu;
-
- BUG_ON(smp_processor_id() != 0);
- native_smp_prepare_boot_cpu();
-
- /* We've switched to the "real" per-cpu gdt, so make sure the
- old memory can be recycled */
- make_lowmem_page_readwrite(&per_cpu__gdt_page);
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- cpus_clear(cpu_sibling_map[cpu]);
- cpus_clear(cpu_core_map[cpu]);
- }
-
- xen_setup_vcpu_info_placement();
-}
-
-void __init xen_smp_prepare_cpus(unsigned int max_cpus)
-{
- unsigned cpu;
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- cpus_clear(cpu_sibling_map[cpu]);
- cpus_clear(cpu_core_map[cpu]);
- }
-
- smp_store_cpu_info(0);
- set_cpu_sibling_map(0);
-
- if (xen_smp_intr_init(0))
- BUG();
-
- cpu_initialized_map = cpumask_of_cpu(0);
-
- /* Restrict the possible_map according to max_cpus. */
- while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
- for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
- continue;
- cpu_clear(cpu, cpu_possible_map);
- }
-
- for_each_possible_cpu (cpu) {
- struct task_struct *idle;
-
- if (cpu == 0)
- continue;
-
- idle = fork_idle(cpu);
- if (IS_ERR(idle))
- panic("failed fork for CPU %d", cpu);
-
- cpu_set(cpu, cpu_present_map);
- }
-
- //init_xenbus_allowed_cpumask();
-}
-
-static __cpuinit int
-cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
-{
- struct vcpu_guest_context *ctxt;
- struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
-
- if (cpu_test_and_set(cpu, cpu_initialized_map))
- return 0;
-
- ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
- if (ctxt == NULL)
- return -ENOMEM;
-
- ctxt->flags = VGCF_IN_KERNEL;
- ctxt->user_regs.ds = __USER_DS;
- ctxt->user_regs.es = __USER_DS;
- ctxt->user_regs.fs = __KERNEL_PERCPU;
- ctxt->user_regs.gs = 0;
- ctxt->user_regs.ss = __KERNEL_DS;
- ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
- ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
-
- memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
-
- xen_copy_trap_info(ctxt->trap_ctxt);
-
- ctxt->ldt_ents = 0;
-
- BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
- make_lowmem_page_readonly(gdt->gdt);
-
- ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
- ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
-
- ctxt->user_regs.cs = __KERNEL_CS;
- ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
-
- ctxt->kernel_ss = __KERNEL_DS;
- ctxt->kernel_sp = idle->thread.esp0;
-
- ctxt->event_callback_cs = __KERNEL_CS;
- ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
- ctxt->failsafe_callback_cs = __KERNEL_CS;
- ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
-
- per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
- ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
-
- if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
- BUG();
-
- kfree(ctxt);
- return 0;
-}
-
-int __cpuinit xen_cpu_up(unsigned int cpu)
-{
- struct task_struct *idle = idle_task(cpu);
- int rc;
-
-#if 0
- rc = cpu_up_check(cpu);
- if (rc)
- return rc;
-#endif
-
- init_gdt(cpu);
- per_cpu(current_task, cpu) = idle;
- irq_ctx_init(cpu);
- xen_setup_timer(cpu);
-
- /* make sure interrupts start blocked */
- per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
-
- rc = cpu_initialize_context(cpu, idle);
- if (rc)
- return rc;
-
- if (num_online_cpus() == 1)
- alternatives_smp_switch(1);
-
- rc = xen_smp_intr_init(cpu);
- if (rc)
- return rc;
-
- smp_store_cpu_info(cpu);
- set_cpu_sibling_map(cpu);
- /* This must be done before setting cpu_online_map */
- wmb();
-
- cpu_set(cpu, cpu_online_map);
-
- rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
- BUG_ON(rc);
-
- return 0;
-}
-
-void xen_smp_cpus_done(unsigned int max_cpus)
-{
-}
-
-static void stop_self(void *v)
-{
- int cpu = smp_processor_id();
-
- /* make sure we're not pinning something down */
- load_cr3(swapper_pg_dir);
- /* should set up a minimal gdt */
-
- HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
- BUG();
-}
-
-void xen_smp_send_stop(void)
-{
- smp_call_function(stop_self, NULL, 0, 0);
-}
-
-void xen_smp_send_reschedule(int cpu)
-{
- xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
-}
-
-
-static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
-{
- unsigned cpu;
-
- cpus_and(mask, mask, cpu_online_map);
-
- for_each_cpu_mask(cpu, mask)
- xen_send_IPI_one(cpu, vector);
-}
-
-static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
-{
- void (*func) (void *info) = call_data->func;
- void *info = call_data->info;
- int wait = call_data->wait;
-
- /*
- * Notify initiating CPU that I've grabbed the data and am
- * about to execute the function
- */
- mb();
- atomic_inc(&call_data->started);
- /*
- * At this point the info structure may be out of scope unless wait==1
- */
- irq_enter();
- (*func)(info);
- irq_exit();
-
- if (wait) {
- mb(); /* commit everything before setting finished */
- atomic_inc(&call_data->finished);
- }
-
- return IRQ_HANDLED;
-}
-
-int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
- void *info, int wait)
-{
- struct call_data_struct data;
- int cpus;
-
- /* Holding any lock stops cpus from going down. */
- spin_lock(&call_lock);
-
- cpu_clear(smp_processor_id(), mask);
-
- cpus = cpus_weight(mask);
- if (!cpus) {
- spin_unlock(&call_lock);
- return 0;
- }
-
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- mb(); /* write everything before IPI */
-
- /* Send a message to other CPUs and wait for them to respond */
- xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
-
- /* Make sure other vcpus get a chance to run.
- XXX too severe? Maybe we should check the other CPU's states? */
- HYPERVISOR_sched_op(SCHEDOP_yield, 0);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus ||
- (wait && atomic_read(&data.finished) != cpus))
- cpu_relax();
-
- spin_unlock(&call_lock);
-
- return 0;
-}
+++ /dev/null
-/*
- * Xen time implementation.
- *
- * This is implemented in terms of a clocksource driver which uses
- * the hypervisor clock as a nanosecond timebase, and a clockevent
- * driver which uses the hypervisor's timer mechanism.
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/kernel_stat.h>
-
-#include <asm/xen/hypervisor.h>
-#include <asm/xen/hypercall.h>
-
-#include <xen/events.h>
-#include <xen/interface/xen.h>
-#include <xen/interface/vcpu.h>
-
-#include "xen-ops.h"
-
-#define XEN_SHIFT 22
-
-/* Xen may fire a timer up to this many ns early */
-#define TIMER_SLOP 100000
-#define NS_PER_TICK (1000000000LL / HZ)
-
-static cycle_t xen_clocksource_read(void);
-
-/* These are perodically updated in shared_info, and then copied here. */
-struct shadow_time_info {
- u64 tsc_timestamp; /* TSC at last update of time vals. */
- u64 system_timestamp; /* Time, in nanosecs, since boot. */
- u32 tsc_to_nsec_mul;
- int tsc_shift;
- u32 version;
-};
-
-static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
-
-/* runstate info updated by Xen */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
-
-/* snapshots of runstate info */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
-
-/* unused ns of stolen and blocked time */
-static DEFINE_PER_CPU(u64, residual_stolen);
-static DEFINE_PER_CPU(u64, residual_blocked);
-
-/* return an consistent snapshot of 64-bit time/counter value */
-static u64 get64(const u64 *p)
-{
- u64 ret;
-
- if (BITS_PER_LONG < 64) {
- u32 *p32 = (u32 *)p;
- u32 h, l;
-
- /*
- * Read high then low, and then make sure high is
- * still the same; this will only loop if low wraps
- * and carries into high.
- * XXX some clean way to make this endian-proof?
- */
- do {
- h = p32[1];
- barrier();
- l = p32[0];
- barrier();
- } while (p32[1] != h);
-
- ret = (((u64)h) << 32) | l;
- } else
- ret = *p;
-
- return ret;
-}
-
-/*
- * Runstate accounting
- */
-static void get_runstate_snapshot(struct vcpu_runstate_info *res)
-{
- u64 state_time;
- struct vcpu_runstate_info *state;
-
- BUG_ON(preemptible());
-
- state = &__get_cpu_var(runstate);
-
- /*
- * The runstate info is always updated by the hypervisor on
- * the current CPU, so there's no need to use anything
- * stronger than a compiler barrier when fetching it.
- */
- do {
- state_time = get64(&state->state_entry_time);
- barrier();
- *res = *state;
- barrier();
- } while (get64(&state->state_entry_time) != state_time);
-}
-
-static void setup_runstate_info(int cpu)
-{
- struct vcpu_register_runstate_memory_area area;
-
- area.addr.v = &per_cpu(runstate, cpu);
-
- if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
- cpu, &area))
- BUG();
-}
-
-static void do_stolen_accounting(void)
-{
- struct vcpu_runstate_info state;
- struct vcpu_runstate_info *snap;
- s64 blocked, runnable, offline, stolen;
- cputime_t ticks;
-
- get_runstate_snapshot(&state);
-
- WARN_ON(state.state != RUNSTATE_running);
-
- snap = &__get_cpu_var(runstate_snapshot);
-
- /* work out how much time the VCPU has not been runn*ing* */
- blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
- runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
- offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
-
- *snap = state;
-
- /* Add the appropriate number of ticks of stolen time,
- including any left-overs from last time. Passing NULL to
- account_steal_time accounts the time as stolen. */
- stolen = runnable + offline + __get_cpu_var(residual_stolen);
-
- if (stolen < 0)
- stolen = 0;
-
- ticks = 0;
- while (stolen >= NS_PER_TICK) {
- ticks++;
- stolen -= NS_PER_TICK;
- }
- __get_cpu_var(residual_stolen) = stolen;
- account_steal_time(NULL, ticks);
-
- /* Add the appropriate number of ticks of blocked time,
- including any left-overs from last time. Passing idle to
- account_steal_time accounts the time as idle/wait. */
- blocked += __get_cpu_var(residual_blocked);
-
- if (blocked < 0)
- blocked = 0;
-
- ticks = 0;
- while (blocked >= NS_PER_TICK) {
- ticks++;
- blocked -= NS_PER_TICK;
- }
- __get_cpu_var(residual_blocked) = blocked;
- account_steal_time(idle_task(smp_processor_id()), ticks);
-}
-
-/*
- * Xen sched_clock implementation. Returns the number of unstolen
- * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
- * states.
- */
-unsigned long long xen_sched_clock(void)
-{
- struct vcpu_runstate_info state;
- cycle_t now;
- u64 ret;
- s64 offset;
-
- /*
- * Ideally sched_clock should be called on a per-cpu basis
- * anyway, so preempt should already be disabled, but that's
- * not current practice at the moment.
- */
- preempt_disable();
-
- now = xen_clocksource_read();
-
- get_runstate_snapshot(&state);
-
- WARN_ON(state.state != RUNSTATE_running);
-
- offset = now - state.state_entry_time;
- if (offset < 0)
- offset = 0;
-
- ret = state.time[RUNSTATE_blocked] +
- state.time[RUNSTATE_running] +
- offset;
-
- preempt_enable();
-
- return ret;
-}
-
-
-/* Get the CPU speed from Xen */
-unsigned long xen_cpu_khz(void)
-{
- u64 cpu_khz = 1000000ULL << 32;
- const struct vcpu_time_info *info =
- &HYPERVISOR_shared_info->vcpu_info[0].time;
-
- do_div(cpu_khz, info->tsc_to_system_mul);
- if (info->tsc_shift < 0)
- cpu_khz <<= -info->tsc_shift;
- else
- cpu_khz >>= info->tsc_shift;
-
- return cpu_khz;
-}
-
-/*
- * Reads a consistent set of time-base values from Xen, into a shadow data
- * area.
- */
-static unsigned get_time_values_from_xen(void)
-{
- struct vcpu_time_info *src;
- struct shadow_time_info *dst;
-
- /* src is shared memory with the hypervisor, so we need to
- make sure we get a consistent snapshot, even in the face of
- being preempted. */
- src = &__get_cpu_var(xen_vcpu)->time;
- dst = &__get_cpu_var(shadow_time);
-
- do {
- dst->version = src->version;
- rmb(); /* fetch version before data */
- dst->tsc_timestamp = src->tsc_timestamp;
- dst->system_timestamp = src->system_time;
- dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
- dst->tsc_shift = src->tsc_shift;
- rmb(); /* test version after fetching data */
- } while ((src->version & 1) | (dst->version ^ src->version));
-
- return dst->version;
-}
-
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
-{
- u64 product;
-#ifdef __i386__
- u32 tmp1, tmp2;
-#endif
-
- if (shift < 0)
- delta >>= -shift;
- else
- delta <<= shift;
-
-#ifdef __i386__
- __asm__ (
- "mul %5 ; "
- "mov %4,%%eax ; "
- "mov %%edx,%4 ; "
- "mul %5 ; "
- "xor %5,%5 ; "
- "add %4,%%eax ; "
- "adc %5,%%edx ; "
- : "=A" (product), "=r" (tmp1), "=r" (tmp2)
- : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif __x86_64__
- __asm__ (
- "mul %%rdx ; shrd $32,%%rdx,%%rax"
- : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
-#else
-#error implement me!
-#endif
-
- return product;
-}
-
-static u64 get_nsec_offset(struct shadow_time_info *shadow)
-{
- u64 now, delta;
- now = native_read_tsc();
- delta = now - shadow->tsc_timestamp;
- return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
-}
-
-static cycle_t xen_clocksource_read(void)
-{
- struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
- cycle_t ret;
- unsigned version;
-
- do {
- version = get_time_values_from_xen();
- barrier();
- ret = shadow->system_timestamp + get_nsec_offset(shadow);
- barrier();
- } while (version != __get_cpu_var(xen_vcpu)->time.version);
-
- put_cpu_var(shadow_time);
-
- return ret;
-}
-
-static void xen_read_wallclock(struct timespec *ts)
-{
- const struct shared_info *s = HYPERVISOR_shared_info;
- u32 version;
- u64 delta;
- struct timespec now;
-
- /* get wallclock at system boot */
- do {
- version = s->wc_version;
- rmb(); /* fetch version before time */
- now.tv_sec = s->wc_sec;
- now.tv_nsec = s->wc_nsec;
- rmb(); /* fetch time before checking version */
- } while ((s->wc_version & 1) | (version ^ s->wc_version));
-
- delta = xen_clocksource_read(); /* time since system boot */
- delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
-
- now.tv_nsec = do_div(delta, NSEC_PER_SEC);
- now.tv_sec = delta;
-
- set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
-}
-
-unsigned long xen_get_wallclock(void)
-{
- struct timespec ts;
-
- xen_read_wallclock(&ts);
-
- return ts.tv_sec;
-}
-
-int xen_set_wallclock(unsigned long now)
-{
- /* do nothing for domU */
- return -1;
-}
-
-static struct clocksource xen_clocksource __read_mostly = {
- .name = "xen",
- .rating = 400,
- .read = xen_clocksource_read,
- .mask = ~0,
- .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
- .shift = XEN_SHIFT,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-/*
- Xen clockevent implementation
-
- Xen has two clockevent implementations:
-
- The old timer_op one works with all released versions of Xen prior
- to version 3.0.4. This version of the hypervisor provides a
- single-shot timer with nanosecond resolution. However, sharing the
- same event channel is a 100Hz tick which is delivered while the
- vcpu is running. We don't care about or use this tick, but it will
- cause the core time code to think the timer fired too soon, and
- will end up resetting it each time. It could be filtered, but
- doing so has complications when the ktime clocksource is not yet
- the xen clocksource (ie, at boot time).
-
- The new vcpu_op-based timer interface allows the tick timer period
- to be changed or turned off. The tick timer is not useful as a
- periodic timer because events are only delivered to running vcpus.
- The one-shot timer can report when a timeout is in the past, so
- set_next_event is capable of returning -ETIME when appropriate.
- This interface is used when available.
-*/
-
-
-/*
- Get a hypervisor absolute time. In theory we could maintain an
- offset between the kernel's time and the hypervisor's time, and
- apply that to a kernel's absolute timeout. Unfortunately the
- hypervisor and kernel times can drift even if the kernel is using
- the Xen clocksource, because ntp can warp the kernel's clocksource.
-*/
-static s64 get_abs_timeout(unsigned long delta)
-{
- return xen_clocksource_read() + delta;
-}
-
-static void xen_timerop_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- /* unsupported */
- WARN_ON(1);
- break;
-
- case CLOCK_EVT_MODE_ONESHOT:
- case CLOCK_EVT_MODE_RESUME:
- break;
-
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- HYPERVISOR_set_timer_op(0); /* cancel timeout */
- break;
- }
-}
-
-static int xen_timerop_set_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
-
- if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
- BUG();
-
- /* We may have missed the deadline, but there's no real way of
- knowing for sure. If the event was in the past, then we'll
- get an immediate interrupt. */
-
- return 0;
-}
-
-static const struct clock_event_device xen_timerop_clockevent = {
- .name = "xen",
- .features = CLOCK_EVT_FEAT_ONESHOT,
-
- .max_delta_ns = 0xffffffff,
- .min_delta_ns = TIMER_SLOP,
-
- .mult = 1,
- .shift = 0,
- .rating = 500,
-
- .set_mode = xen_timerop_set_mode,
- .set_next_event = xen_timerop_set_next_event,
-};
-
-
-
-static void xen_vcpuop_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- int cpu = smp_processor_id();
-
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- WARN_ON(1); /* unsupported */
- break;
-
- case CLOCK_EVT_MODE_ONESHOT:
- if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
- BUG();
- break;
-
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
- HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
- BUG();
- break;
- case CLOCK_EVT_MODE_RESUME:
- break;
- }
-}
-
-static int xen_vcpuop_set_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- int cpu = smp_processor_id();
- struct vcpu_set_singleshot_timer single;
- int ret;
-
- WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
-
- single.timeout_abs_ns = get_abs_timeout(delta);
- single.flags = VCPU_SSHOTTMR_future;
-
- ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
-
- BUG_ON(ret != 0 && ret != -ETIME);
-
- return ret;
-}
-
-static const struct clock_event_device xen_vcpuop_clockevent = {
- .name = "xen",
- .features = CLOCK_EVT_FEAT_ONESHOT,
-
- .max_delta_ns = 0xffffffff,
- .min_delta_ns = TIMER_SLOP,
-
- .mult = 1,
- .shift = 0,
- .rating = 500,
-
- .set_mode = xen_vcpuop_set_mode,
- .set_next_event = xen_vcpuop_set_next_event,
-};
-
-static const struct clock_event_device *xen_clockevent =
- &xen_timerop_clockevent;
-static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
-
-static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
-{
- struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
- irqreturn_t ret;
-
- ret = IRQ_NONE;
- if (evt->event_handler) {
- evt->event_handler(evt);
- ret = IRQ_HANDLED;
- }
-
- do_stolen_accounting();
-
- return ret;
-}
-
-void xen_setup_timer(int cpu)
-{
- const char *name;
- struct clock_event_device *evt;
- int irq;
-
- printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
-
- name = kasprintf(GFP_KERNEL, "timer%d", cpu);
- if (!name)
- name = "<timer kasprintf failed>";
-
- irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
- name, NULL);
-
- evt = &per_cpu(xen_clock_events, cpu);
- memcpy(evt, xen_clockevent, sizeof(*evt));
-
- evt->cpumask = cpumask_of_cpu(cpu);
- evt->irq = irq;
-
- setup_runstate_info(cpu);
-}
-
-void xen_setup_cpu_clockevents(void)
-{
- BUG_ON(preemptible());
-
- clockevents_register_device(&__get_cpu_var(xen_clock_events));
-}
-
-__init void xen_time_init(void)
-{
- int cpu = smp_processor_id();
-
- get_time_values_from_xen();
-
- clocksource_register(&xen_clocksource);
-
- if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
- /* Successfully turned off 100Hz tick, so we have the
- vcpuop-based timer interface */
- printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
- xen_clockevent = &xen_vcpuop_clockevent;
- }
-
- /* Set initial system time with full resolution */
- xen_read_wallclock(&xtime);
- set_normalized_timespec(&wall_to_monotonic,
- -xtime.tv_sec, -xtime.tv_nsec);
-
- tsc_disable = 0;
-
- xen_setup_timer(cpu);
- xen_setup_cpu_clockevents();
-}
+++ /dev/null
-/* Bit used for the pseudo-hwcap for non-negative segments. We use
- bit 1 to avoid bugs in some versions of glibc when bit 0 is
- used; the choice is otherwise arbitrary. */
-#define VDSO_NOTE_NONEGSEG_BIT 1
+++ /dev/null
-/*
- Asm versions of Xen pv-ops, suitable for either direct use or inlining.
- The inline versions are the same as the direct-use versions, with the
- pre- and post-amble chopped off.
-
- This code is encoded for size rather than absolute efficiency,
- with a view to being able to inline as much as possible.
-
- We only bother with direct forms (ie, vcpu in pda) of the operations
- here; the indirect forms are better handled in C, since they're
- generally too large to inline anyway.
- */
-
-#include <linux/linkage.h>
-
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/percpu.h>
-#include <asm/processor-flags.h>
-#include <asm/segment.h>
-
-#include <xen/interface/xen.h>
-
-#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x) .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-/*
- Enable events. This clears the event mask and tests the pending
- event status with one and operation. If there are pending
- events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
- /* Clear mask and test pending */
- andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
- jz 1f
-2: call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
- ret
- ENDPROC(xen_irq_enable_direct)
- RELOC(xen_irq_enable_direct, 2b+1)
-
-
-/*
- Disabling events is simply a matter of making the event mask
- non-zero.
- */
-ENTRY(xen_irq_disable_direct)
- movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
- ret
- ENDPROC(xen_irq_disable_direct)
- RELOC(xen_irq_disable_direct, 0)
-
-/*
- (xen_)save_fl is used to get the current interrupt enable status.
- Callers expect the status to be in X86_EFLAGS_IF, and other bits
- may be set in the return value. We take advantage of this by
- making sure that X86_EFLAGS_IF has the right value (and other bits
- in that byte are 0), but other bits in the return value are
- undefined. We need to toggle the state of the bit, because
- Xen and x86 use opposite senses (mask vs enable).
- */
-ENTRY(xen_save_fl_direct)
- testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
- setz %ah
- addb %ah,%ah
-ENDPATCH(xen_save_fl_direct)
- ret
- ENDPROC(xen_save_fl_direct)
- RELOC(xen_save_fl_direct, 0)
-
-
-/*
- In principle the caller should be passing us a value return
- from xen_save_fl_direct, but for robustness sake we test only
- the X86_EFLAGS_IF flag rather than the whole byte. After
- setting the interrupt mask state, it checks for unmasked
- pending events and enters the hypervisor to get them delivered
- if so.
- */
-ENTRY(xen_restore_fl_direct)
- testb $X86_EFLAGS_IF>>8, %ah
- setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
-
- /* check for unmasked and pending */
- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
- jz 1f
-2: call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
- ret
- ENDPROC(xen_restore_fl_direct)
- RELOC(xen_restore_fl_direct, 2b+1)
-
-/*
- This is run where a normal iret would be run, with the same stack setup:
- 8: eflags
- 4: cs
- esp-> 0: eip
-
- This attempts to make sure that any pending events are dealt
- with on return to usermode, but there is a small window in
- which an event can happen just before entering usermode. If
- the nested interrupt ends up setting one of the TIF_WORK_MASK
- pending work flags, they will not be tested again before
- returning to usermode. This means that a process can end up
- with pending work, which will be unprocessed until the process
- enters and leaves the kernel again, which could be an
- unbounded amount of time. This means that a pending signal or
- reschedule event could be indefinitely delayed.
-
- The fix is to notice a nested interrupt in the critical
- window, and if one occurs, then fold the nested interrupt into
- the current interrupt stack frame, and re-process it
- iteratively rather than recursively. This means that it will
- exit via the normal path, and all pending work will be dealt
- with appropriately.
-
- Because the nested interrupt handler needs to deal with the
- current stack state in whatever form its in, we keep things
- simple by only using a single register which is pushed/popped
- on the stack.
-
- Non-direct iret could be done in the same way, but it would
- require an annoying amount of code duplication. We'll assume
- that direct mode will be the common case once the hypervisor
- support becomes commonplace.
- */
-ENTRY(xen_iret_direct)
- /* test eflags for special cases */
- testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
- jnz hyper_iret
-
- push %eax
- ESP_OFFSET=4 # bytes pushed onto stack
-
- /* Store vcpu_info pointer for easy access. Do it this
- way to avoid having to reload %fs */
-#ifdef CONFIG_SMP
- GET_THREAD_INFO(%eax)
- movl TI_cpu(%eax),%eax
- movl __per_cpu_offset(,%eax,4),%eax
- lea per_cpu__xen_vcpu_info(%eax),%eax
-#else
- movl $per_cpu__xen_vcpu_info, %eax
-#endif
-
- /* check IF state we're restoring */
- testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
-
- /* Maybe enable events. Once this happens we could get a
- recursive event, so the critical region starts immediately
- afterwards. However, if that happens we don't end up
- resuming the code, so we don't have to be worried about
- being preempted to another CPU. */
- setz XEN_vcpu_info_mask(%eax)
-xen_iret_start_crit:
-
- /* check for unmasked and pending */
- cmpw $0x0001, XEN_vcpu_info_pending(%eax)
-
- /* If there's something pending, mask events again so we
- can jump back into xen_hypervisor_callback */
- sete XEN_vcpu_info_mask(%eax)
-
- popl %eax
-
- /* From this point on the registers are restored and the stack
- updated, so we don't need to worry about it if we're preempted */
-iret_restore_end:
-
- /* Jump to hypervisor_callback after fixing up the stack.
- Events are masked, so jumping out of the critical
- region is OK. */
- je xen_hypervisor_callback
-
- iret
-xen_iret_end_crit:
-
-hyper_iret:
- /* put this out of line since its very rarely used */
- jmp hypercall_page + __HYPERVISOR_iret * 32
-
- .globl xen_iret_start_crit, xen_iret_end_crit
-
-/*
- This is called by xen_hypervisor_callback in entry.S when it sees
- that the EIP at the time of interrupt was between xen_iret_start_crit
- and xen_iret_end_crit. We're passed the EIP in %eax so we can do
- a more refined determination of what to do.
-
- The stack format at this point is:
- ----------------
- ss : (ss/esp may be present if we came from usermode)
- esp :
- eflags } outer exception info
- cs }
- eip }
- ---------------- <- edi (copy dest)
- eax : outer eax if it hasn't been restored
- ----------------
- eflags } nested exception info
- cs } (no ss/esp because we're nested
- eip } from the same ring)
- orig_eax }<- esi (copy src)
- - - - - - - - -
- fs }
- es }
- ds } SAVE_ALL state
- eax }
- : :
- ebx }
- ----------------
- return addr <- esp
- ----------------
-
- In order to deliver the nested exception properly, we need to shift
- everything from the return addr up to the error code so it
- sits just under the outer exception info. This means that when we
- handle the exception, we do it in the context of the outer exception
- rather than starting a new one.
-
- The only caveat is that if the outer eax hasn't been
- restored yet (ie, it's still on stack), we need to insert
- its value into the SAVE_ALL state before going on, since
- it's usermode state which we eventually need to restore.
- */
-ENTRY(xen_iret_crit_fixup)
- /* offsets +4 for return address */
-
- /*
- Paranoia: Make sure we're really coming from userspace.
- One could imagine a case where userspace jumps into the
- critical range address, but just before the CPU delivers a GP,
- it decides to deliver an interrupt instead. Unlikely?
- Definitely. Easy to avoid? Yes. The Intel documents
- explicitly say that the reported EIP for a bad jump is the
- jump instruction itself, not the destination, but some virtual
- environments get this wrong.
- */
- movl PT_CS+4(%esp), %ecx
- andl $SEGMENT_RPL_MASK, %ecx
- cmpl $USER_RPL, %ecx
- je 2f
-
- lea PT_ORIG_EAX+4(%esp), %esi
- lea PT_EFLAGS+4(%esp), %edi
-
- /* If eip is before iret_restore_end then stack
- hasn't been restored yet. */
- cmp $iret_restore_end, %eax
- jae 1f
-
- movl 0+4(%edi),%eax /* copy EAX */
- movl %eax, PT_EAX+4(%esp)
-
- lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
-
- /* set up the copy */
-1: std
- mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
- rep movsl
- cld
-
- lea 4(%edi),%esp /* point esp to new frame */
-2: ret
-
-
-/*
- Force an event check by making a hypercall,
- but preserve regs before making the call.
- */
-check_events:
- push %eax
- push %ecx
- push %edx
- call force_evtchn_callback
- pop %edx
- pop %ecx
- pop %eax
- ret
+++ /dev/null
-/* Xen-specific pieces of head.S, intended to be included in the right
- place in head.S */
-
-#ifdef CONFIG_XEN
-
-#include <linux/elfnote.h>
-#include <asm/boot.h>
-#include <xen/interface/elfnote.h>
-
-.pushsection .init.text
-ENTRY(startup_xen)
- movl %esi,xen_start_info
- cld
- movl $(init_thread_union+THREAD_SIZE),%esp
- jmp xen_start_kernel
-.popsection
-
-.pushsection .bss.page_aligned
- .align PAGE_SIZE_asm
-ENTRY(hypercall_page)
- .skip 0x1000
-.popsection
-
- ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
- ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
- ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
- ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
- ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
- ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
- ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
-#ifdef CONFIG_X86_PAE
- ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
-#else
- ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
-#endif
- ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
-
-#endif /*CONFIG_XEN */
+++ /dev/null
-#ifndef XEN_OPS_H
-#define XEN_OPS_H
-
-#include <linux/init.h>
-
-/* These are code, but not functions. Defined in entry.S */
-extern const char xen_hypervisor_callback[];
-extern const char xen_failsafe_callback[];
-
-void xen_copy_trap_info(struct trap_info *traps);
-
-DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
-DECLARE_PER_CPU(unsigned long, xen_cr3);
-
-extern struct start_info *xen_start_info;
-extern struct shared_info *HYPERVISOR_shared_info;
-
-char * __init xen_memory_setup(void);
-void __init xen_arch_setup(void);
-void __init xen_init_IRQ(void);
-
-void xen_setup_timer(int cpu);
-void xen_setup_cpu_clockevents(void);
-unsigned long xen_cpu_khz(void);
-void __init xen_time_init(void);
-unsigned long xen_get_wallclock(void);
-int xen_set_wallclock(unsigned long time);
-unsigned long long xen_sched_clock(void);
-
-void xen_mark_init_mm_pinned(void);
-
-DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
-
-static inline unsigned xen_get_lazy_mode(void)
-{
- return x86_read_percpu(xen_lazy_mode);
-}
-
-void __init xen_fill_possible_map(void);
-
-void __init xen_setup_vcpu_info_placement(void);
-void xen_smp_prepare_boot_cpu(void);
-void xen_smp_prepare_cpus(unsigned int max_cpus);
-int xen_cpu_up(unsigned int cpu);
-void xen_smp_cpus_done(unsigned int max_cpus);
-
-void xen_smp_send_stop(void);
-void xen_smp_send_reschedule(int cpu);
-int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
- int wait);
-int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
- int nonatomic, int wait);
-
-int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
- void *info, int wait);
-
-
-/* Declare an asm function, along with symbols needed to make it
- inlineable */
-#define DECL_ASM(ret, name, ...) \
- ret name(__VA_ARGS__); \
- extern char name##_end[]; \
- extern char name##_reloc[] \
-
-DECL_ASM(void, xen_irq_enable_direct, void);
-DECL_ASM(void, xen_irq_disable_direct, void);
-DECL_ASM(unsigned long, xen_save_fl_direct, void);
-DECL_ASM(void, xen_restore_fl_direct, unsigned long);
-
-void xen_iret_direct(void);
-#endif /* XEN_OPS_H */
--- /dev/null
+#
+# This Kconfig describes xen options
+#
+
+config XEN
+ bool "Enable support for Xen hypervisor"
+ depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
+ help
+ This is the Linux Xen port. Enabling this will allow the
+ kernel to boot in a paravirtualized environment under the
+ Xen hypervisor.
--- /dev/null
+obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \
+ events.o time.o manage.o xen-asm.o
+
+obj-$(CONFIG_SMP) += smp.o
--- /dev/null
+/*
+ * Core of Xen paravirt_ops implementation.
+ *
+ * This file contains the xen_paravirt_ops structure itself, and the
+ * implementations for:
+ * - privileged instructions
+ * - interrupt flags
+ * - segment operations
+ * - booting and setup
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/preempt.h>
+#include <linux/hardirq.h>
+#include <linux/percpu.h>
+#include <linux/delay.h>
+#include <linux/start_kernel.h>
+#include <linux/sched.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/sched.h>
+#include <xen/features.h>
+#include <xen/page.h>
+
+#include <asm/paravirt.h>
+#include <asm/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/reboot.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+#include "multicalls.h"
+
+EXPORT_SYMBOL_GPL(hypercall_page);
+
+DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+
+DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
+DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DEFINE_PER_CPU(unsigned long, xen_cr3);
+
+struct start_info *xen_start_info;
+EXPORT_SYMBOL_GPL(xen_start_info);
+
+static /* __initdata */ struct shared_info dummy_shared_info;
+
+/*
+ * Point at some empty memory to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
+
+/*
+ * Flag to determine whether vcpu info placement is available on all
+ * VCPUs. We assume it is to start with, and then set it to zero on
+ * the first failure. This is because it can succeed on some VCPUs
+ * and not others, since it can involve hypervisor memory allocation,
+ * or because the guest failed to guarantee all the appropriate
+ * constraints on all VCPUs (ie buffer can't cross a page boundary).
+ *
+ * Note that any particular CPU may be using a placed vcpu structure,
+ * but we can only optimise if the all are.
+ *
+ * 0: not available, 1: available
+ */
+static int have_vcpu_info_placement = 1;
+
+static void __init xen_vcpu_setup(int cpu)
+{
+ struct vcpu_register_vcpu_info info;
+ int err;
+ struct vcpu_info *vcpup;
+
+ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+
+ if (!have_vcpu_info_placement)
+ return; /* already tested, not available */
+
+ vcpup = &per_cpu(xen_vcpu_info, cpu);
+
+ info.mfn = virt_to_mfn(vcpup);
+ info.offset = offset_in_page(vcpup);
+
+ printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
+ cpu, vcpup, info.mfn, info.offset);
+
+ /* Check to see if the hypervisor will put the vcpu_info
+ structure where we want it, which allows direct access via
+ a percpu-variable. */
+ err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
+
+ if (err) {
+ printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
+ have_vcpu_info_placement = 0;
+ } else {
+ /* This cpu is using the registered vcpu info, even if
+ later ones fail to. */
+ per_cpu(xen_vcpu, cpu) = vcpup;
+
+ printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
+ cpu, vcpup);
+ }
+}
+
+static void __init xen_banner(void)
+{
+ printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+ paravirt_ops.name);
+ printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
+}
+
+static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ unsigned maskedx = ~0;
+
+ /*
+ * Mask out inconvenient features, to try and disable as many
+ * unsupported kernel subsystems as possible.
+ */
+ if (*eax == 1)
+ maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
+ (1 << X86_FEATURE_ACPI) | /* disable ACPI */
+ (1 << X86_FEATURE_ACC)); /* thermal monitoring */
+
+ asm(XEN_EMULATE_PREFIX "cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+ *edx &= maskedx;
+}
+
+static void xen_set_debugreg(int reg, unsigned long val)
+{
+ HYPERVISOR_set_debugreg(reg, val);
+}
+
+static unsigned long xen_get_debugreg(int reg)
+{
+ return HYPERVISOR_get_debugreg(reg);
+}
+
+static unsigned long xen_save_fl(void)
+{
+ struct vcpu_info *vcpu;
+ unsigned long flags;
+
+ vcpu = x86_read_percpu(xen_vcpu);
+
+ /* flag has opposite sense of mask */
+ flags = !vcpu->evtchn_upcall_mask;
+
+ /* convert to IF type flag
+ -0 -> 0x00000000
+ -1 -> 0xffffffff
+ */
+ return (-flags) & X86_EFLAGS_IF;
+}
+
+static void xen_restore_fl(unsigned long flags)
+{
+ struct vcpu_info *vcpu;
+
+ /* convert from IF type flag */
+ flags = !(flags & X86_EFLAGS_IF);
+
+ /* There's a one instruction preempt window here. We need to
+ make sure we're don't switch CPUs between getting the vcpu
+ pointer and updating the mask. */
+ preempt_disable();
+ vcpu = x86_read_percpu(xen_vcpu);
+ vcpu->evtchn_upcall_mask = flags;
+ preempt_enable_no_resched();
+
+ /* Doesn't matter if we get preempted here, because any
+ pending event will get dealt with anyway. */
+
+ if (flags == 0) {
+ preempt_check_resched();
+ barrier(); /* unmask then check (avoid races) */
+ if (unlikely(vcpu->evtchn_upcall_pending))
+ force_evtchn_callback();
+ }
+}
+
+static void xen_irq_disable(void)
+{
+ /* There's a one instruction preempt window here. We need to
+ make sure we're don't switch CPUs between getting the vcpu
+ pointer and updating the mask. */
+ preempt_disable();
+ x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+ preempt_enable_no_resched();
+}
+
+static void xen_irq_enable(void)
+{
+ struct vcpu_info *vcpu;
+
+ /* There's a one instruction preempt window here. We need to
+ make sure we're don't switch CPUs between getting the vcpu
+ pointer and updating the mask. */
+ preempt_disable();
+ vcpu = x86_read_percpu(xen_vcpu);
+ vcpu->evtchn_upcall_mask = 0;
+ preempt_enable_no_resched();
+
+ /* Doesn't matter if we get preempted here, because any
+ pending event will get dealt with anyway. */
+
+ barrier(); /* unmask then check (avoid races) */
+ if (unlikely(vcpu->evtchn_upcall_pending))
+ force_evtchn_callback();
+}
+
+static void xen_safe_halt(void)
+{
+ /* Blocking includes an implicit local_irq_enable(). */
+ if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
+ BUG();
+}
+
+static void xen_halt(void)
+{
+ if (irqs_disabled())
+ HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+ else
+ xen_safe_halt();
+}
+
+static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
+{
+ BUG_ON(preemptible());
+
+ switch (mode) {
+ case PARAVIRT_LAZY_NONE:
+ BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
+ break;
+
+ case PARAVIRT_LAZY_MMU:
+ case PARAVIRT_LAZY_CPU:
+ BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
+ break;
+
+ case PARAVIRT_LAZY_FLUSH:
+ /* flush if necessary, but don't change state */
+ if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
+ xen_mc_flush();
+ return;
+ }
+
+ xen_mc_flush();
+ x86_write_percpu(xen_lazy_mode, mode);
+}
+
+static unsigned long xen_store_tr(void)
+{
+ return 0;
+}
+
+static void xen_set_ldt(const void *addr, unsigned entries)
+{
+ unsigned long linear_addr = (unsigned long)addr;
+ struct mmuext_op *op;
+ struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_SET_LDT;
+ if (linear_addr) {
+ /* ldt my be vmalloced, use arbitrary_virt_to_machine */
+ xmaddr_t maddr;
+ maddr = arbitrary_virt_to_machine((unsigned long)addr);
+ linear_addr = (unsigned long)maddr.maddr;
+ }
+ op->arg1.linear_addr = linear_addr;
+ op->arg2.nr_ents = entries;
+
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+ unsigned long *frames;
+ unsigned long va = dtr->address;
+ unsigned int size = dtr->size + 1;
+ unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ int f;
+ struct multicall_space mcs;
+
+ /* A GDT can be up to 64k in size, which corresponds to 8192
+ 8-byte entries, or 16 4k pages.. */
+
+ BUG_ON(size > 65536);
+ BUG_ON(va & ~PAGE_MASK);
+
+ mcs = xen_mc_entry(sizeof(*frames) * pages);
+ frames = mcs.args;
+
+ for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
+ frames[f] = virt_to_mfn(va);
+ make_lowmem_page_readonly((void *)va);
+ }
+
+ MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void load_TLS_descriptor(struct thread_struct *t,
+ unsigned int cpu, unsigned int i)
+{
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+ xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+ struct multicall_space mc = __xen_mc_entry(0);
+
+ MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
+}
+
+static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ xen_mc_batch();
+
+ load_TLS_descriptor(t, cpu, 0);
+ load_TLS_descriptor(t, cpu, 1);
+ load_TLS_descriptor(t, cpu, 2);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+ /*
+ * XXX sleazy hack: If we're being called in a lazy-cpu zone,
+ * it means we're in a context switch, and %gs has just been
+ * saved. This means we can zero it out to prevent faults on
+ * exit from the hypervisor if the next process has no %gs.
+ * Either way, it has been saved, and the new value will get
+ * loaded properly. This will go away as soon as Xen has been
+ * modified to not save/restore %gs for normal hypercalls.
+ */
+ if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+ loadsegment(gs, 0);
+}
+
+static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
+ u32 low, u32 high)
+{
+ unsigned long lp = (unsigned long)&dt[entrynum];
+ xmaddr_t mach_lp = virt_to_machine(lp);
+ u64 entry = (u64)high << 32 | low;
+
+ preempt_disable();
+
+ xen_mc_flush();
+ if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
+ BUG();
+
+ preempt_enable();
+}
+
+static int cvt_gate_to_trap(int vector, u32 low, u32 high,
+ struct trap_info *info)
+{
+ u8 type, dpl;
+
+ type = (high >> 8) & 0x1f;
+ dpl = (high >> 13) & 3;
+
+ if (type != 0xf && type != 0xe)
+ return 0;
+
+ info->vector = vector;
+ info->address = (high & 0xffff0000) | (low & 0x0000ffff);
+ info->cs = low >> 16;
+ info->flags = dpl;
+ /* interrupt gates clear IF */
+ if (type == 0xe)
+ info->flags |= 4;
+
+ return 1;
+}
+
+/* Locations of each CPU's IDT */
+static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
+
+/* Set an IDT entry. If the entry is part of the current IDT, then
+ also update Xen. */
+static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
+ u32 low, u32 high)
+{
+ unsigned long p = (unsigned long)&dt[entrynum];
+ unsigned long start, end;
+
+ preempt_disable();
+
+ start = __get_cpu_var(idt_desc).address;
+ end = start + __get_cpu_var(idt_desc).size + 1;
+
+ xen_mc_flush();
+
+ write_dt_entry(dt, entrynum, low, high);
+
+ if (p >= start && (p + 8) <= end) {
+ struct trap_info info[2];
+
+ info[1].address = 0;
+
+ if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
+ if (HYPERVISOR_set_trap_table(info))
+ BUG();
+ }
+
+ preempt_enable();
+}
+
+static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
+ struct trap_info *traps)
+{
+ unsigned in, out, count;
+
+ count = (desc->size+1) / 8;
+ BUG_ON(count > 256);
+
+ for (in = out = 0; in < count; in++) {
+ const u32 *entry = (u32 *)(desc->address + in * 8);
+
+ if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+ out++;
+ }
+ traps[out].address = 0;
+}
+
+void xen_copy_trap_info(struct trap_info *traps)
+{
+ const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
+
+ xen_convert_trap_info(desc, traps);
+}
+
+/* Load a new IDT into Xen. In principle this can be per-CPU, so we
+ hold a spinlock to protect the static traps[] array (static because
+ it avoids allocation, and saves stack space). */
+static void xen_load_idt(const struct Xgt_desc_struct *desc)
+{
+ static DEFINE_SPINLOCK(lock);
+ static struct trap_info traps[257];
+
+ spin_lock(&lock);
+
+ __get_cpu_var(idt_desc) = *desc;
+
+ xen_convert_trap_info(desc, traps);
+
+ xen_mc_flush();
+ if (HYPERVISOR_set_trap_table(traps))
+ BUG();
+
+ spin_unlock(&lock);
+}
+
+/* Write a GDT descriptor entry. Ignore LDT descriptors, since
+ they're handled differently. */
+static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
+ u32 low, u32 high)
+{
+ preempt_disable();
+
+ switch ((high >> 8) & 0xff) {
+ case DESCTYPE_LDT:
+ case DESCTYPE_TSS:
+ /* ignore */
+ break;
+
+ default: {
+ xmaddr_t maddr = virt_to_machine(&dt[entry]);
+ u64 desc = (u64)high << 32 | low;
+
+ xen_mc_flush();
+ if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
+ BUG();
+ }
+
+ }
+
+ preempt_enable();
+}
+
+static void xen_load_esp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ struct multicall_space mcs = xen_mc_entry(0);
+ MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_set_iopl_mask(unsigned mask)
+{
+ struct physdev_set_iopl set_iopl;
+
+ /* Force the change at ring 0. */
+ set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+}
+
+static void xen_io_delay(void)
+{
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned long xen_apic_read(unsigned long reg)
+{
+ return 0;
+}
+
+static void xen_apic_write(unsigned long reg, unsigned long val)
+{
+ /* Warn to see if there's any stray references */
+ WARN_ON(1);
+}
+#endif
+
+static void xen_flush_tlb(void)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_flush_tlb_single(unsigned long addr)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_INVLPG_LOCAL;
+ op->arg1.linear_addr = addr & PAGE_MASK;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
+ unsigned long va)
+{
+ struct {
+ struct mmuext_op op;
+ cpumask_t mask;
+ } *args;
+ cpumask_t cpumask = *cpus;
+ struct multicall_space mcs;
+
+ /*
+ * A couple of (to be removed) sanity checks:
+ *
+ * - current CPU must not be in mask
+ * - mask must exist :)
+ */
+ BUG_ON(cpus_empty(cpumask));
+ BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+ BUG_ON(!mm);
+
+ /* If a CPU which we ran on has gone down, OK. */
+ cpus_and(cpumask, cpumask, cpu_online_map);
+ if (cpus_empty(cpumask))
+ return;
+
+ mcs = xen_mc_entry(sizeof(*args));
+ args = mcs.args;
+ args->mask = cpumask;
+ args->op.arg2.vcpumask = &args->mask;
+
+ if (va == TLB_FLUSH_ALL) {
+ args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+ } else {
+ args->op.cmd = MMUEXT_INVLPG_MULTI;
+ args->op.arg1.linear_addr = va;
+ }
+
+ MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_write_cr2(unsigned long cr2)
+{
+ x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
+}
+
+static unsigned long xen_read_cr2(void)
+{
+ return x86_read_percpu(xen_vcpu)->arch.cr2;
+}
+
+static unsigned long xen_read_cr2_direct(void)
+{
+ return x86_read_percpu(xen_vcpu_info.arch.cr2);
+}
+
+static void xen_write_cr4(unsigned long cr4)
+{
+ /* Just ignore cr4 changes; Xen doesn't allow us to do
+ anything anyway. */
+}
+
+static unsigned long xen_read_cr3(void)
+{
+ return x86_read_percpu(xen_cr3);
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ if (cr3 == x86_read_percpu(xen_cr3)) {
+ /* just a simple tlb flush */
+ xen_flush_tlb();
+ return;
+ }
+
+ x86_write_percpu(xen_cr3, cr3);
+
+
+ {
+ struct mmuext_op *op;
+ struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+ unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_NEW_BASEPTR;
+ op->arg1.mfn = mfn;
+
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+ }
+}
+
+/* Early in boot, while setting up the initial pagetable, assume
+ everything is pinned. */
+static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
+{
+ BUG_ON(mem_map); /* should only be used early */
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* This needs to make sure the new pte page is pinned iff its being
+ attached to a pinned pagetable. */
+static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ if (PagePinned(virt_to_page(mm->pgd))) {
+ SetPagePinned(page);
+
+ if (!PageHighMem(page))
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+ else
+ /* make sure there are no stray mappings of
+ this page */
+ kmap_flush_unused();
+ }
+}
+
+/* This should never happen until we're OK to use struct page */
+static void xen_release_pt(u32 pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ if (PagePinned(page)) {
+ if (!PageHighMem(page))
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+ }
+}
+
+#ifdef CONFIG_HIGHPTE
+static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+{
+ pgprot_t prot = PAGE_KERNEL;
+
+ if (PagePinned(page))
+ prot = PAGE_KERNEL_RO;
+
+ if (0 && PageHighMem(page))
+ printk("mapping highpte %lx type %d prot %s\n",
+ page_to_pfn(page), type,
+ (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
+
+ return kmap_atomic_prot(page, type, prot);
+}
+#endif
+
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+ /* If there's an existing pte, then don't allow _PAGE_RW to be set */
+ if (pte_val_ma(*ptep) & _PAGE_PRESENT)
+ pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+ pte_val_ma(pte));
+
+ return pte;
+}
+
+/* Init-time set_pte while constructing initial pagetables, which
+ doesn't allow RO pagetable pages to be remapped RW */
+static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+ pte = mask_rw_pte(ptep, pte);
+
+ xen_set_pte(ptep, pte);
+}
+
+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+ pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
+
+ /* special set_pte for pagetable initialization */
+ paravirt_ops.set_pte = xen_set_pte_init;
+
+ init_mm.pgd = base;
+ /*
+ * copy top-level of Xen-supplied pagetable into place. For
+ * !PAE we can use this as-is, but for PAE it is a stand-in
+ * while we copy the pmd pages.
+ */
+ memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
+
+ if (PTRS_PER_PMD > 1) {
+ int i;
+ /*
+ * For PAE, need to allocate new pmds, rather than
+ * share Xen's, since Xen doesn't like pmd's being
+ * shared between address spaces.
+ */
+ for (i = 0; i < PTRS_PER_PGD; i++) {
+ if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
+ pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+
+ memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
+ PAGE_SIZE);
+
+ make_lowmem_page_readonly(pmd);
+
+ set_pgd(&base[i], __pgd(1 + __pa(pmd)));
+ } else
+ pgd_clear(&base[i]);
+ }
+ }
+
+ /* make sure zero_page is mapped RO so we can use it in pagetables */
+ make_lowmem_page_readonly(empty_zero_page);
+ make_lowmem_page_readonly(base);
+ /*
+ * Switch to new pagetable. This is done before
+ * pagetable_init has done anything so that the new pages
+ * added to the table can be prepared properly for Xen.
+ */
+ xen_write_cr3(__pa(base));
+}
+
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+ /* This will work as long as patching hasn't happened yet
+ (which it hasn't) */
+ paravirt_ops.alloc_pt = xen_alloc_pt;
+ paravirt_ops.set_pte = xen_set_pte;
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ /*
+ * Create a mapping for the shared info page.
+ * Should be set_fixmap(), but shared_info is a machine
+ * address with no corresponding pseudo-phys address.
+ */
+ set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
+ PFN_DOWN(xen_start_info->shared_info),
+ PAGE_KERNEL);
+
+ HYPERVISOR_shared_info =
+ (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+
+ } else
+ HYPERVISOR_shared_info =
+ (struct shared_info *)__va(xen_start_info->shared_info);
+
+ /* Actually pin the pagetable down, but we can't set PG_pinned
+ yet because the page structures don't exist yet. */
+ {
+ struct mmuext_op op;
+#ifdef CONFIG_X86_PAE
+ op.cmd = MMUEXT_PIN_L3_TABLE;
+#else
+ op.cmd = MMUEXT_PIN_L3_TABLE;
+#endif
+ op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+ }
+}
+
+/* This is called once we have the cpu_possible_map */
+void __init xen_setup_vcpu_info_placement(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ xen_vcpu_setup(cpu);
+
+ /* xen_vcpu_setup managed to place the vcpu_info within the
+ percpu area for all cpus, so make use of it */
+ if (have_vcpu_info_placement) {
+ printk(KERN_INFO "Xen: using vcpu_info placement\n");
+
+ paravirt_ops.save_fl = xen_save_fl_direct;
+ paravirt_ops.restore_fl = xen_restore_fl_direct;
+ paravirt_ops.irq_disable = xen_irq_disable_direct;
+ paravirt_ops.irq_enable = xen_irq_enable_direct;
+ paravirt_ops.read_cr2 = xen_read_cr2_direct;
+ paravirt_ops.iret = xen_iret_direct;
+ }
+}
+
+static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
+ unsigned long addr, unsigned len)
+{
+ char *start, *end, *reloc;
+ unsigned ret;
+
+ start = end = reloc = NULL;
+
+#define SITE(x) \
+ case PARAVIRT_PATCH(x): \
+ if (have_vcpu_info_placement) { \
+ start = (char *)xen_##x##_direct; \
+ end = xen_##x##_direct_end; \
+ reloc = xen_##x##_direct_reloc; \
+ } \
+ goto patch_site
+
+ switch (type) {
+ SITE(irq_enable);
+ SITE(irq_disable);
+ SITE(save_fl);
+ SITE(restore_fl);
+#undef SITE
+
+ patch_site:
+ if (start == NULL || (end-start) > len)
+ goto default_patch;
+
+ ret = paravirt_patch_insns(insnbuf, len, start, end);
+
+ /* Note: because reloc is assigned from something that
+ appears to be an array, gcc assumes it's non-null,
+ but doesn't know its relationship with start and
+ end. */
+ if (reloc > start && reloc < end) {
+ int reloc_off = reloc - start;
+ long *relocp = (long *)(insnbuf + reloc_off);
+ long delta = start - (char *)addr;
+
+ *relocp += delta;
+ }
+ break;
+
+ default_patch:
+ default:
+ ret = paravirt_patch_default(type, clobbers, insnbuf,
+ addr, len);
+ break;
+ }
+
+ return ret;
+}
+
+static const struct paravirt_ops xen_paravirt_ops __initdata = {
+ .paravirt_enabled = 1,
+ .shared_kernel_pmd = 0,
+
+ .name = "Xen",
+ .banner = xen_banner,
+
+ .patch = xen_patch,
+
+ .memory_setup = xen_memory_setup,
+ .arch_setup = xen_arch_setup,
+ .init_IRQ = xen_init_IRQ,
+ .post_allocator_init = xen_mark_init_mm_pinned,
+
+ .time_init = xen_time_init,
+ .set_wallclock = xen_set_wallclock,
+ .get_wallclock = xen_get_wallclock,
+ .get_cpu_khz = xen_cpu_khz,
+ .sched_clock = xen_sched_clock,
+
+ .cpuid = xen_cpuid,
+
+ .set_debugreg = xen_set_debugreg,
+ .get_debugreg = xen_get_debugreg,
+
+ .clts = native_clts,
+
+ .read_cr0 = native_read_cr0,
+ .write_cr0 = native_write_cr0,
+
+ .read_cr2 = xen_read_cr2,
+ .write_cr2 = xen_write_cr2,
+
+ .read_cr3 = xen_read_cr3,
+ .write_cr3 = xen_write_cr3,
+
+ .read_cr4 = native_read_cr4,
+ .read_cr4_safe = native_read_cr4_safe,
+ .write_cr4 = xen_write_cr4,
+
+ .save_fl = xen_save_fl,
+ .restore_fl = xen_restore_fl,
+ .irq_disable = xen_irq_disable,
+ .irq_enable = xen_irq_enable,
+ .safe_halt = xen_safe_halt,
+ .halt = xen_halt,
+ .wbinvd = native_wbinvd,
+
+ .read_msr = native_read_msr_safe,
+ .write_msr = native_write_msr_safe,
+ .read_tsc = native_read_tsc,
+ .read_pmc = native_read_pmc,
+
+ .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
+ .irq_enable_sysexit = NULL, /* never called */
+
+ .load_tr_desc = paravirt_nop,
+ .set_ldt = xen_set_ldt,
+ .load_gdt = xen_load_gdt,
+ .load_idt = xen_load_idt,
+ .load_tls = xen_load_tls,
+
+ .store_gdt = native_store_gdt,
+ .store_idt = native_store_idt,
+ .store_tr = xen_store_tr,
+
+ .write_ldt_entry = xen_write_ldt_entry,
+ .write_gdt_entry = xen_write_gdt_entry,
+ .write_idt_entry = xen_write_idt_entry,
+ .load_esp0 = xen_load_esp0,
+
+ .set_iopl_mask = xen_set_iopl_mask,
+ .io_delay = xen_io_delay,
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ .apic_write = xen_apic_write,
+ .apic_write_atomic = xen_apic_write,
+ .apic_read = xen_apic_read,
+ .setup_boot_clock = paravirt_nop,
+ .setup_secondary_clock = paravirt_nop,
+ .startup_ipi_hook = paravirt_nop,
+#endif
+
+ .flush_tlb_user = xen_flush_tlb,
+ .flush_tlb_kernel = xen_flush_tlb,
+ .flush_tlb_single = xen_flush_tlb_single,
+ .flush_tlb_others = xen_flush_tlb_others,
+
+ .pte_update = paravirt_nop,
+ .pte_update_defer = paravirt_nop,
+
+ .pagetable_setup_start = xen_pagetable_setup_start,
+ .pagetable_setup_done = xen_pagetable_setup_done,
+
+ .alloc_pt = xen_alloc_pt_init,
+ .release_pt = xen_release_pt,
+ .alloc_pd = paravirt_nop,
+ .alloc_pd_clone = paravirt_nop,
+ .release_pd = paravirt_nop,
+
+#ifdef CONFIG_HIGHPTE
+ .kmap_atomic_pte = xen_kmap_atomic_pte,
+#endif
+
+ .set_pte = NULL, /* see xen_pagetable_setup_* */
+ .set_pte_at = xen_set_pte_at,
+ .set_pmd = xen_set_pmd,
+
+ .pte_val = xen_pte_val,
+ .pgd_val = xen_pgd_val,
+
+ .make_pte = xen_make_pte,
+ .make_pgd = xen_make_pgd,
+
+#ifdef CONFIG_X86_PAE
+ .set_pte_atomic = xen_set_pte_atomic,
+ .set_pte_present = xen_set_pte_at,
+ .set_pud = xen_set_pud,
+ .pte_clear = xen_pte_clear,
+ .pmd_clear = xen_pmd_clear,
+
+ .make_pmd = xen_make_pmd,
+ .pmd_val = xen_pmd_val,
+#endif /* PAE */
+
+ .activate_mm = xen_activate_mm,
+ .dup_mmap = xen_dup_mmap,
+ .exit_mmap = xen_exit_mmap,
+
+ .set_lazy_mode = xen_set_lazy_mode,
+};
+
+#ifdef CONFIG_SMP
+static const struct smp_ops xen_smp_ops __initdata = {
+ .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = xen_smp_prepare_cpus,
+ .cpu_up = xen_cpu_up,
+ .smp_cpus_done = xen_smp_cpus_done,
+
+ .smp_send_stop = xen_smp_send_stop,
+ .smp_send_reschedule = xen_smp_send_reschedule,
+ .smp_call_function_mask = xen_smp_call_function_mask,
+};
+#endif /* CONFIG_SMP */
+
+static void xen_reboot(int reason)
+{
+#ifdef CONFIG_SMP
+ smp_send_stop();
+#endif
+
+ if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
+ BUG();
+}
+
+static void xen_restart(char *msg)
+{
+ xen_reboot(SHUTDOWN_reboot);
+}
+
+static void xen_emergency_restart(void)
+{
+ xen_reboot(SHUTDOWN_reboot);
+}
+
+static void xen_machine_halt(void)
+{
+ xen_reboot(SHUTDOWN_poweroff);
+}
+
+static void xen_crash_shutdown(struct pt_regs *regs)
+{
+ xen_reboot(SHUTDOWN_crash);
+}
+
+static const struct machine_ops __initdata xen_machine_ops = {
+ .restart = xen_restart,
+ .halt = xen_machine_halt,
+ .power_off = xen_machine_halt,
+ .shutdown = xen_machine_halt,
+ .crash_shutdown = xen_crash_shutdown,
+ .emergency_restart = xen_emergency_restart,
+};
+
+
+/* First C function to be called on Xen boot */
+asmlinkage void __init xen_start_kernel(void)
+{
+ pgd_t *pgd;
+
+ if (!xen_start_info)
+ return;
+
+ BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
+
+ /* Install Xen paravirt ops */
+ paravirt_ops = xen_paravirt_ops;
+ machine_ops = xen_machine_ops;
+
+#ifdef CONFIG_SMP
+ smp_ops = xen_smp_ops;
+#endif
+
+ xen_setup_features();
+
+ /* Get mfn list */
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
+
+ pgd = (pgd_t *)xen_start_info->pt_base;
+
+ init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+
+ init_mm.pgd = pgd; /* use the Xen pagetables to start */
+
+ /* keep using Xen gdt for now; no urgent need to change it */
+
+ x86_write_percpu(xen_cr3, __pa(pgd));
+
+#ifdef CONFIG_SMP
+ /* Don't do the full vcpu_info placement stuff until we have a
+ possible map. */
+ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+#else
+ /* May as well do it now, since there's no good time to call
+ it later on UP. */
+ xen_setup_vcpu_info_placement();
+#endif
+
+ paravirt_ops.kernel_rpl = 1;
+ if (xen_feature(XENFEAT_supervisor_mode_kernel))
+ paravirt_ops.kernel_rpl = 0;
+
+ /* set the limit of our address space */
+ reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
+
+ /* set up basic CPUID stuff */
+ cpu_detect(&new_cpu_data);
+ new_cpu_data.hard_math = 1;
+ new_cpu_data.x86_capability[0] = cpuid_edx(1);
+
+ /* Poke various useful things into boot_params */
+ LOADER_TYPE = (9 << 4) | 0;
+ INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
+ INITRD_SIZE = xen_start_info->mod_len;
+
+ /* Start the world */
+ start_kernel();
+}
--- /dev/null
+/*
+ * Xen event channels
+ *
+ * Xen models interrupts with abstract event channels. Because each
+ * domain gets 1024 event channels, but NR_IRQ is not that large, we
+ * must dynamically map irqs<->event channels. The event channels
+ * interface with the rest of the kernel by defining a xen interrupt
+ * chip. When an event is recieved, it is mapped to an irq and sent
+ * through the normal interrupt processing path.
+ *
+ * There are four kinds of events which can be mapped to an event
+ * channel:
+ *
+ * 1. Inter-domain notifications. This includes all the virtual
+ * device events, since they're driven by front-ends in another domain
+ * (typically dom0).
+ * 2. VIRQs, typically used for timers. These are per-cpu events.
+ * 3. IPIs.
+ * 4. Hardware interrupts. Not supported at present.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <asm/ptrace.h>
+#include <asm/irq.h>
+#include <asm/sync_bitops.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+
+#include "xen-ops.h"
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_SPINLOCK(irq_mapping_update_lock);
+
+/* IRQ <-> VIRQ mapping. */
+static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
+
+/* IRQ <-> IPI mapping */
+static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
+
+/* Packed IRQ information: binding type, sub-type index, and event channel. */
+struct packed_irq
+{
+ unsigned short evtchn;
+ unsigned char index;
+ unsigned char type;
+};
+
+static struct packed_irq irq_info[NR_IRQS];
+
+/* Binding types. */
+enum {
+ IRQT_UNBOUND,
+ IRQT_PIRQ,
+ IRQT_VIRQ,
+ IRQT_IPI,
+ IRQT_EVTCHN
+};
+
+/* Convenient shorthand for packed representation of an unbound IRQ. */
+#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
+
+static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+ [0 ... NR_EVENT_CHANNELS-1] = -1
+};
+static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
+static u8 cpu_evtchn[NR_EVENT_CHANNELS];
+
+/* Reference counts for bindings to IRQs. */
+static int irq_bindcount[NR_IRQS];
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn) ((chn) != 0)
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void force_evtchn_callback(void)
+{
+ (void)HYPERVISOR_xen_version(0, NULL);
+}
+EXPORT_SYMBOL_GPL(force_evtchn_callback);
+
+static struct irq_chip xen_dynamic_chip;
+
+/* Constructor for packed IRQ information. */
+static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
+{
+ return (struct packed_irq) { evtchn, index, type };
+}
+
+/*
+ * Accessors for packed IRQ information.
+ */
+static inline unsigned int evtchn_from_irq(int irq)
+{
+ return irq_info[irq].evtchn;
+}
+
+static inline unsigned int index_from_irq(int irq)
+{
+ return irq_info[irq].index;
+}
+
+static inline unsigned int type_from_irq(int irq)
+{
+ return irq_info[irq].type;
+}
+
+static inline unsigned long active_evtchns(unsigned int cpu,
+ struct shared_info *sh,
+ unsigned int idx)
+{
+ return (sh->evtchn_pending[idx] &
+ cpu_evtchn_mask[cpu][idx] &
+ ~sh->evtchn_mask[idx]);
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+ int irq = evtchn_to_irq[chn];
+
+ BUG_ON(irq == -1);
+#ifdef CONFIG_SMP
+ irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+#endif
+
+ __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
+ __set_bit(chn, cpu_evtchn_mask[cpu]);
+
+ cpu_evtchn[chn] = cpu;
+}
+
+static void init_evtchn_cpu_bindings(void)
+{
+#ifdef CONFIG_SMP
+ int i;
+ /* By default all event channels notify CPU#0. */
+ for (i = 0; i < NR_IRQS; i++)
+ irq_desc[i].affinity = cpumask_of_cpu(0);
+#endif
+
+ memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
+ memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
+}
+
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+ return cpu_evtchn[evtchn];
+}
+
+static inline void clear_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_clear_bit(port, &s->evtchn_pending[0]);
+}
+
+static inline void set_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_set_bit(port, &s->evtchn_pending[0]);
+}
+
+
+/**
+ * notify_remote_via_irq - send event to remote end of event channel via irq
+ * @irq: irq of event channel to send event to
+ *
+ * Unlike notify_remote_via_evtchn(), this is safe to use across
+ * save/restore. Notifications on a broken connection are silently
+ * dropped.
+ */
+void notify_remote_via_irq(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ notify_remote_via_evtchn(evtchn);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+static void mask_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ sync_set_bit(port, &s->evtchn_mask[0]);
+}
+
+static void unmask_evtchn(int port)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ unsigned int cpu = get_cpu();
+
+ BUG_ON(!irqs_disabled());
+
+ /* Slow path (hypercall) if this is a non-local port. */
+ if (unlikely(cpu != cpu_from_evtchn(port))) {
+ struct evtchn_unmask unmask = { .port = port };
+ (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+ } else {
+ struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+
+ sync_clear_bit(port, &s->evtchn_mask[0]);
+
+ /*
+ * The following is basically the equivalent of
+ * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
+ * the interrupt edge' if the channel is masked.
+ */
+ if (sync_test_bit(port, &s->evtchn_pending[0]) &&
+ !sync_test_and_set_bit(port / BITS_PER_LONG,
+ &vcpu_info->evtchn_pending_sel))
+ vcpu_info->evtchn_upcall_pending = 1;
+ }
+
+ put_cpu();
+}
+
+static int find_unbound_irq(void)
+{
+ int irq;
+
+ /* Only allocate from dynirq range */
+ for (irq = 0; irq < NR_IRQS; irq++)
+ if (irq_bindcount[irq] == 0)
+ break;
+
+ if (irq == NR_IRQS)
+ panic("No available IRQ to bind to: increase NR_IRQS!\n");
+
+ return irq;
+}
+
+int bind_evtchn_to_irq(unsigned int evtchn)
+{
+ int irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = evtchn_to_irq[evtchn];
+
+ if (irq == -1) {
+ irq = find_unbound_irq();
+
+ dynamic_irq_init(irq);
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "event");
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
+ }
+
+ irq_bindcount[irq]++;
+
+ spin_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
+
+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+{
+ struct evtchn_bind_ipi bind_ipi;
+ int evtchn, irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = per_cpu(ipi_to_irq, cpu)[ipi];
+ if (irq == -1) {
+ irq = find_unbound_irq();
+ if (irq < 0)
+ goto out;
+
+ dynamic_irq_init(irq);
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "ipi");
+
+ bind_ipi.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+ &bind_ipi) != 0)
+ BUG();
+ evtchn = bind_ipi.port;
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
+
+ per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+
+ irq_bindcount[irq]++;
+
+ out:
+ spin_unlock(&irq_mapping_update_lock);
+ return irq;
+}
+
+
+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+{
+ struct evtchn_bind_virq bind_virq;
+ int evtchn, irq;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = per_cpu(virq_to_irq, cpu)[virq];
+
+ if (irq == -1) {
+ bind_virq.virq = virq;
+ bind_virq.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+ &bind_virq) != 0)
+ BUG();
+ evtchn = bind_virq.port;
+
+ irq = find_unbound_irq();
+
+ dynamic_irq_init(irq);
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+ handle_level_irq, "virq");
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+
+ per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+ bind_evtchn_to_cpu(evtchn, cpu);
+ }
+
+ irq_bindcount[irq]++;
+
+ spin_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+
+static void unbind_from_irq(unsigned int irq)
+{
+ struct evtchn_close close;
+ int evtchn = evtchn_from_irq(irq);
+
+ spin_lock(&irq_mapping_update_lock);
+
+ if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
+ close.port = evtchn;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+ BUG();
+
+ switch (type_from_irq(irq)) {
+ case IRQT_VIRQ:
+ per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
+ [index_from_irq(irq)] = -1;
+ break;
+ default:
+ break;
+ }
+
+ /* Closed ports are implicitly re-bound to VCPU0. */
+ bind_evtchn_to_cpu(evtchn, 0);
+
+ evtchn_to_irq[evtchn] = -1;
+ irq_info[irq] = IRQ_UNBOUND;
+
+ dynamic_irq_init(irq);
+ }
+
+ spin_unlock(&irq_mapping_update_lock);
+}
+
+int bind_evtchn_to_irqhandler(unsigned int evtchn,
+ irqreturn_t (*handler)(int, void *),
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ unsigned int irq;
+ int retval;
+
+ irq = bind_evtchn_to_irq(evtchn);
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ irqreturn_t (*handler)(int, void *),
+ unsigned long irqflags, const char *devname, void *dev_id)
+{
+ unsigned int irq;
+ int retval;
+
+ irq = bind_virq_to_irq(virq, cpu);
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+ unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ int irq, retval;
+
+ irq = bind_ipi_to_irq(ipi, cpu);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+
+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
+{
+ free_irq(irq, dev_id);
+ unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
+{
+ int irq = per_cpu(ipi_to_irq, cpu)[vector];
+ BUG_ON(irq < 0);
+ notify_remote_via_irq(irq);
+}
+
+
+/*
+ * Search the CPUs pending events bitmasks. For each one found, map
+ * the event number to an irq, and feed it into do_IRQ() for
+ * handling.
+ *
+ * Xen uses a two-level bitmap to speed searching. The first level is
+ * a bitset of words which contain pending event bits. The second
+ * level is a bitset of pending events themselves.
+ */
+fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+ int cpu = get_cpu();
+ struct shared_info *s = HYPERVISOR_shared_info;
+ struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+ unsigned long pending_words;
+
+ vcpu_info->evtchn_upcall_pending = 0;
+
+ /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
+ pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
+ while (pending_words != 0) {
+ unsigned long pending_bits;
+ int word_idx = __ffs(pending_words);
+ pending_words &= ~(1UL << word_idx);
+
+ while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
+ int bit_idx = __ffs(pending_bits);
+ int port = (word_idx * BITS_PER_LONG) + bit_idx;
+ int irq = evtchn_to_irq[port];
+
+ if (irq != -1) {
+ regs->orig_eax = ~irq;
+ do_IRQ(regs);
+ }
+ }
+ }
+
+ put_cpu();
+}
+
+/* Rebind an evtchn so that it gets delivered to a specific cpu */
+static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+{
+ struct evtchn_bind_vcpu bind_vcpu;
+ int evtchn = evtchn_from_irq(irq);
+
+ if (!VALID_EVTCHN(evtchn))
+ return;
+
+ /* Send future instances of this interrupt to other vcpu. */
+ bind_vcpu.port = evtchn;
+ bind_vcpu.vcpu = tcpu;
+
+ /*
+ * If this fails, it usually just indicates that we're dealing with a
+ * virq or IPI channel, which don't actually need to be rebound. Ignore
+ * it, but don't do the xenlinux-level rebind in that case.
+ */
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
+ bind_evtchn_to_cpu(evtchn, tcpu);
+}
+
+
+static void set_affinity_irq(unsigned irq, cpumask_t dest)
+{
+ unsigned tcpu = first_cpu(dest);
+ rebind_irq_to_cpu(irq, tcpu);
+}
+
+static void enable_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ unmask_evtchn(evtchn);
+}
+
+static void disable_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ mask_evtchn(evtchn);
+}
+
+static void ack_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ move_native_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ clear_evtchn(evtchn);
+}
+
+static int retrigger_dynirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+ int ret = 0;
+
+ if (VALID_EVTCHN(evtchn)) {
+ set_evtchn(evtchn);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static struct irq_chip xen_dynamic_chip __read_mostly = {
+ .name = "xen-dyn",
+ .mask = disable_dynirq,
+ .unmask = enable_dynirq,
+ .ack = ack_dynirq,
+ .set_affinity = set_affinity_irq,
+ .retrigger = retrigger_dynirq,
+};
+
+void __init xen_init_IRQ(void)
+{
+ int i;
+
+ init_evtchn_cpu_bindings();
+
+ /* No event channels are 'live' right now. */
+ for (i = 0; i < NR_EVENT_CHANNELS; i++)
+ mask_evtchn(i);
+
+ /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
+ for (i = 0; i < NR_IRQS; i++)
+ irq_bindcount[i] = 0;
+
+ irq_ctx_init(smp_processor_id());
+}
--- /dev/null
+/******************************************************************************
+ * features.c
+ *
+ * Xen feature flags.
+ *
+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/features.h>
+
+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+EXPORT_SYMBOL_GPL(xen_features);
+
+void xen_setup_features(void)
+{
+ struct xen_feature_info fi;
+ int i, j;
+
+ for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
+ fi.submap_idx = i;
+ if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
+ break;
+ for (j = 0; j < 32; j++)
+ xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
+ }
+}
--- /dev/null
+/*
+ * Handle extern requests for shutdown, reboot and sysrq
+ */
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/reboot.h>
+#include <linux/sysrq.h>
+
+#include <xen/xenbus.h>
+
+#define SHUTDOWN_INVALID -1
+#define SHUTDOWN_POWEROFF 0
+#define SHUTDOWN_SUSPEND 2
+/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
+ * report a crash, not be instructed to crash!
+ * HALT is the same as POWEROFF, as far as we're concerned. The tools use
+ * the distinction when we return the reason code to them.
+ */
+#define SHUTDOWN_HALT 4
+
+/* Ignore multiple shutdown requests. */
+static int shutting_down = SHUTDOWN_INVALID;
+
+static void shutdown_handler(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ char *str;
+ struct xenbus_transaction xbt;
+ int err;
+
+ if (shutting_down != SHUTDOWN_INVALID)
+ return;
+
+ again:
+ err = xenbus_transaction_start(&xbt);
+ if (err)
+ return;
+
+ str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
+ /* Ignore read errors and empty reads. */
+ if (XENBUS_IS_ERR_READ(str)) {
+ xenbus_transaction_end(xbt, 1);
+ return;
+ }
+
+ xenbus_write(xbt, "control", "shutdown", "");
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN) {
+ kfree(str);
+ goto again;
+ }
+
+ if (strcmp(str, "poweroff") == 0 ||
+ strcmp(str, "halt") == 0)
+ orderly_poweroff(false);
+ else if (strcmp(str, "reboot") == 0)
+ ctrl_alt_del();
+ else {
+ printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
+ shutting_down = SHUTDOWN_INVALID;
+ }
+
+ kfree(str);
+}
+
+static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
+ unsigned int len)
+{
+ char sysrq_key = '\0';
+ struct xenbus_transaction xbt;
+ int err;
+
+ again:
+ err = xenbus_transaction_start(&xbt);
+ if (err)
+ return;
+ if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
+ printk(KERN_ERR "Unable to read sysrq code in "
+ "control/sysrq\n");
+ xenbus_transaction_end(xbt, 1);
+ return;
+ }
+
+ if (sysrq_key != '\0')
+ xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+
+ if (sysrq_key != '\0')
+ handle_sysrq(sysrq_key, NULL);
+}
+
+static struct xenbus_watch shutdown_watch = {
+ .node = "control/shutdown",
+ .callback = shutdown_handler
+};
+
+static struct xenbus_watch sysrq_watch = {
+ .node = "control/sysrq",
+ .callback = sysrq_handler
+};
+
+static int setup_shutdown_watcher(void)
+{
+ int err;
+
+ err = register_xenbus_watch(&shutdown_watch);
+ if (err) {
+ printk(KERN_ERR "Failed to set shutdown watcher\n");
+ return err;
+ }
+
+ err = register_xenbus_watch(&sysrq_watch);
+ if (err) {
+ printk(KERN_ERR "Failed to set sysrq watcher\n");
+ return err;
+ }
+
+ return 0;
+}
+
+static int shutdown_event(struct notifier_block *notifier,
+ unsigned long event,
+ void *data)
+{
+ setup_shutdown_watcher();
+ return NOTIFY_DONE;
+}
+
+static int __init setup_shutdown_event(void)
+{
+ static struct notifier_block xenstore_notifier = {
+ .notifier_call = shutdown_event
+ };
+ register_xenstore_notifier(&xenstore_notifier);
+
+ return 0;
+}
+
+subsys_initcall(setup_shutdown_event);
--- /dev/null
+/*
+ * Xen mmu operations
+ *
+ * This file contains the various mmu fetch and update operations.
+ * The most important job they must perform is the mapping between the
+ * domain's pfn and the overall machine mfns.
+ *
+ * Xen allows guests to directly update the pagetable, in a controlled
+ * fashion. In other words, the guest modifies the same pagetable
+ * that the CPU actually uses, which eliminates the overhead of having
+ * a separate shadow pagetable.
+ *
+ * In order to allow this, it falls on the guest domain to map its
+ * notion of a "physical" pfn - which is just a domain-local linear
+ * address - into a real "machine address" which the CPU's MMU can
+ * use.
+ *
+ * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
+ * inserted directly into the pagetable. When creating a new
+ * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
+ * when reading the content back with __(pgd|pmd|pte)_val, it converts
+ * the mfn back into a pfn.
+ *
+ * The other constraint is that all pages which make up a pagetable
+ * must be mapped read-only in the guest. This prevents uncontrolled
+ * guest updates to the pagetable. Xen strictly enforces this, and
+ * will disallow any pagetable update which will end up mapping a
+ * pagetable page RW, and will disallow using any writable page as a
+ * pagetable.
+ *
+ * Naively, when loading %cr3 with the base of a new pagetable, Xen
+ * would need to validate the whole pagetable before going on.
+ * Naturally, this is quite slow. The solution is to "pin" a
+ * pagetable, which enforces all the constraints on the pagetable even
+ * when it is not actively in use. This menas that Xen can be assured
+ * that it is still valid when you do load it into %cr3, and doesn't
+ * need to revalidate it.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/bug.h>
+#include <linux/sched.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/paravirt.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+
+#include "multicalls.h"
+#include "mmu.h"
+
+xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+{
+ pte_t *pte = lookup_address(address);
+ unsigned offset = address & PAGE_MASK;
+
+ BUG_ON(pte == NULL);
+
+ return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+}
+
+void make_lowmem_page_readonly(void *vaddr)
+{
+ pte_t *pte, ptev;
+ unsigned long address = (unsigned long)vaddr;
+
+ pte = lookup_address(address);
+ BUG_ON(pte == NULL);
+
+ ptev = pte_wrprotect(*pte);
+
+ if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+ BUG();
+}
+
+void make_lowmem_page_readwrite(void *vaddr)
+{
+ pte_t *pte, ptev;
+ unsigned long address = (unsigned long)vaddr;
+
+ pte = lookup_address(address);
+ BUG_ON(pte == NULL);
+
+ ptev = pte_mkwrite(*pte);
+
+ if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+ BUG();
+}
+
+
+void xen_set_pmd(pmd_t *ptr, pmd_t val)
+{
+ struct multicall_space mcs;
+ struct mmu_update *u;
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*u));
+ u = mcs.args;
+ u->ptr = virt_to_machine(ptr).maddr;
+ u->val = pmd_val_ma(val);
+ MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ */
+void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ BUG();
+ return;
+ }
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ BUG();
+ return;
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ BUG();
+ return;
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+ /* <mfn,flags> stored as-is, to permit clearing entries */
+ xen_set_pte(pte, mfn_pte(mfn, flags));
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
+
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval)
+{
+ if (mm == current->mm || mm == &init_mm) {
+ if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+ struct multicall_space mcs;
+ mcs = xen_mc_entry(0);
+
+ MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+ return;
+ } else
+ if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
+ return;
+ }
+ xen_set_pte(ptep, pteval);
+}
+
+#ifdef CONFIG_X86_PAE
+void xen_set_pud(pud_t *ptr, pud_t val)
+{
+ struct multicall_space mcs;
+ struct mmu_update *u;
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*u));
+ u = mcs.args;
+ u->ptr = virt_to_machine(ptr).maddr;
+ u->val = pud_val_ma(val);
+ MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ set_64bit((u64 *)ptep, pte_val_ma(pte));
+}
+
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ ptep->pte_low = 0;
+ smp_wmb(); /* make sure low gets written first */
+ ptep->pte_high = 0;
+}
+
+void xen_pmd_clear(pmd_t *pmdp)
+{
+ xen_set_pmd(pmdp, __pmd(0));
+}
+
+unsigned long long xen_pte_val(pte_t pte)
+{
+ unsigned long long ret = 0;
+
+ if (pte.pte_low) {
+ ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
+ ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ }
+
+ return ret;
+}
+
+unsigned long long xen_pmd_val(pmd_t pmd)
+{
+ unsigned long long ret = pmd.pmd;
+ if (ret)
+ ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ return ret;
+}
+
+unsigned long long xen_pgd_val(pgd_t pgd)
+{
+ unsigned long long ret = pgd.pgd;
+ if (ret)
+ ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ return ret;
+}
+
+pte_t xen_make_pte(unsigned long long pte)
+{
+ if (pte & 1)
+ pte = phys_to_machine(XPADDR(pte)).maddr;
+
+ return (pte_t){ pte, pte >> 32 };
+}
+
+pmd_t xen_make_pmd(unsigned long long pmd)
+{
+ if (pmd & 1)
+ pmd = phys_to_machine(XPADDR(pmd)).maddr;
+
+ return (pmd_t){ pmd };
+}
+
+pgd_t xen_make_pgd(unsigned long long pgd)
+{
+ if (pgd & _PAGE_PRESENT)
+ pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+ return (pgd_t){ pgd };
+}
+#else /* !PAE */
+void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+ *ptep = pte;
+}
+
+unsigned long xen_pte_val(pte_t pte)
+{
+ unsigned long ret = pte.pte_low;
+
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr;
+
+ return ret;
+}
+
+unsigned long xen_pgd_val(pgd_t pgd)
+{
+ unsigned long ret = pgd.pgd;
+ if (ret)
+ ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ return ret;
+}
+
+pte_t xen_make_pte(unsigned long pte)
+{
+ if (pte & _PAGE_PRESENT)
+ pte = phys_to_machine(XPADDR(pte)).maddr;
+
+ return (pte_t){ pte };
+}
+
+pgd_t xen_make_pgd(unsigned long pgd)
+{
+ if (pgd & _PAGE_PRESENT)
+ pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+ return (pgd_t){ pgd };
+}
+#endif /* CONFIG_X86_PAE */
+
+
+
+/*
+ (Yet another) pagetable walker. This one is intended for pinning a
+ pagetable. This means that it walks a pagetable and calls the
+ callback function on each page it finds making up the page table,
+ at every level. It walks the entire pagetable, but it only bothers
+ pinning pte pages which are below pte_limit. In the normal case
+ this will be TASK_SIZE, but at boot we need to pin up to
+ FIXADDR_TOP. But the important bit is that we don't pin beyond
+ there, because then we start getting into Xen's ptes.
+*/
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+ unsigned long limit)
+{
+ pgd_t *pgd = pgd_base;
+ int flush = 0;
+ unsigned long addr = 0;
+ unsigned long pgd_next;
+
+ BUG_ON(limit > FIXADDR_TOP);
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+ pud_t *pud;
+ unsigned long pud_limit, pud_next;
+
+ pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+
+ if (!pgd_val(*pgd))
+ continue;
+
+ pud = pud_offset(pgd, 0);
+
+ if (PTRS_PER_PUD > 1) /* not folded */
+ flush |= (*func)(virt_to_page(pud), 0);
+
+ for (; addr != pud_limit; pud++, addr = pud_next) {
+ pmd_t *pmd;
+ unsigned long pmd_limit;
+
+ pud_next = pud_addr_end(addr, pud_limit);
+
+ if (pud_next < limit)
+ pmd_limit = pud_next;
+ else
+ pmd_limit = limit;
+
+ if (pud_none(*pud))
+ continue;
+
+ pmd = pmd_offset(pud, 0);
+
+ if (PTRS_PER_PMD > 1) /* not folded */
+ flush |= (*func)(virt_to_page(pmd), 0);
+
+ for (; addr != pmd_limit; pmd++) {
+ addr += (PAGE_SIZE * PTRS_PER_PTE);
+ if ((pmd_limit-1) < (addr-1)) {
+ addr = pmd_limit;
+ break;
+ }
+
+ if (pmd_none(*pmd))
+ continue;
+
+ flush |= (*func)(pmd_page(*pmd), 0);
+ }
+ }
+ }
+
+ flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+
+ return flush;
+}
+
+static int pin_page(struct page *page, unsigned flags)
+{
+ unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
+ int flush;
+
+ if (pgfl)
+ flush = 0; /* already pinned */
+ else if (PageHighMem(page))
+ /* kmaps need flushing if we found an unpinned
+ highpage */
+ flush = 1;
+ else {
+ void *pt = lowmem_page_address(page);
+ unsigned long pfn = page_to_pfn(page);
+ struct multicall_space mcs = __xen_mc_entry(0);
+
+ flush = 0;
+
+ MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+ pfn_pte(pfn, PAGE_KERNEL_RO),
+ flags);
+ }
+
+ return flush;
+}
+
+/* This is called just after a mm has been created, but it has not
+ been used yet. We need to make sure that its pagetable is all
+ read-only, and can be pinned. */
+void xen_pgd_pin(pgd_t *pgd)
+{
+ struct multicall_space mcs;
+ struct mmuext_op *op;
+
+ xen_mc_batch();
+
+ if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
+ /* re-enable interrupts for kmap_flush_unused */
+ xen_mc_issue(0);
+ kmap_flush_unused();
+ xen_mc_batch();
+ }
+
+ mcs = __xen_mc_entry(sizeof(*op));
+ op = mcs.args;
+
+#ifdef CONFIG_X86_PAE
+ op->cmd = MMUEXT_PIN_L3_TABLE;
+#else
+ op->cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+ op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(0);
+}
+
+/* The init_mm pagetable is really pinned as soon as its created, but
+ that's before we have page structures to store the bits. So do all
+ the book-keeping now. */
+static __init int mark_pinned(struct page *page, unsigned flags)
+{
+ SetPagePinned(page);
+ return 0;
+}
+
+void __init xen_mark_init_mm_pinned(void)
+{
+ pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
+}
+
+static int unpin_page(struct page *page, unsigned flags)
+{
+ unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
+
+ if (pgfl && !PageHighMem(page)) {
+ void *pt = lowmem_page_address(page);
+ unsigned long pfn = page_to_pfn(page);
+ struct multicall_space mcs = __xen_mc_entry(0);
+
+ MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+ pfn_pte(pfn, PAGE_KERNEL),
+ flags);
+ }
+
+ return 0; /* never need to flush on unpin */
+}
+
+/* Release a pagetables pages back as normal RW */
+static void xen_pgd_unpin(pgd_t *pgd)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ xen_mc_batch();
+
+ mcs = __xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_UNPIN_TABLE;
+ op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ pgd_walk(pgd, unpin_page, TASK_SIZE);
+
+ xen_mc_issue(0);
+}
+
+void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+{
+ spin_lock(&next->page_table_lock);
+ xen_pgd_pin(next->pgd);
+ spin_unlock(&next->page_table_lock);
+}
+
+void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ spin_lock(&mm->page_table_lock);
+ xen_pgd_pin(mm->pgd);
+ spin_unlock(&mm->page_table_lock);
+}
+
+
+#ifdef CONFIG_SMP
+/* Another cpu may still have their %cr3 pointing at the pagetable, so
+ we need to repoint it somewhere else before we can unpin it. */
+static void drop_other_mm_ref(void *info)
+{
+ struct mm_struct *mm = info;
+
+ if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
+ leave_mm(smp_processor_id());
+}
+
+static void drop_mm_ref(struct mm_struct *mm)
+{
+ if (current->active_mm == mm) {
+ if (current->mm == mm)
+ load_cr3(swapper_pg_dir);
+ else
+ leave_mm(smp_processor_id());
+ }
+
+ if (!cpus_empty(mm->cpu_vm_mask))
+ xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
+ mm, 1);
+}
+#else
+static void drop_mm_ref(struct mm_struct *mm)
+{
+ if (current->active_mm == mm)
+ load_cr3(swapper_pg_dir);
+}
+#endif
+
+/*
+ * While a process runs, Xen pins its pagetables, which means that the
+ * hypervisor forces it to be read-only, and it controls all updates
+ * to it. This means that all pagetable updates have to go via the
+ * hypervisor, which is moderately expensive.
+ *
+ * Since we're pulling the pagetable down, we switch to use init_mm,
+ * unpin old process pagetable and mark it all read-write, which
+ * allows further operations on it to be simple memory accesses.
+ *
+ * The only subtle point is that another CPU may be still using the
+ * pagetable because of lazy tlb flushing. This means we need need to
+ * switch all CPUs off this pagetable before we can unpin it.
+ */
+void xen_exit_mmap(struct mm_struct *mm)
+{
+ get_cpu(); /* make sure we don't move around */
+ drop_mm_ref(mm);
+ put_cpu();
+
+ spin_lock(&mm->page_table_lock);
+
+ /* pgd may not be pinned in the error exit path of execve */
+ if (PagePinned(virt_to_page(mm->pgd)))
+ xen_pgd_unpin(mm->pgd);
+ spin_unlock(&mm->page_table_lock);
+}
--- /dev/null
+#ifndef _XEN_MMU_H
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ *
+ * Note that Xen is using the fact that the pagetable base is always
+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
+ * of cr3.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+
+void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+
+void xen_set_pte(pte_t *ptep, pte_t pteval);
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval);
+void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
+
+void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
+void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+void xen_exit_mmap(struct mm_struct *mm);
+
+void xen_pgd_pin(pgd_t *pgd);
+//void xen_pgd_unpin(pgd_t *pgd);
+
+#ifdef CONFIG_X86_PAE
+unsigned long long xen_pte_val(pte_t);
+unsigned long long xen_pmd_val(pmd_t);
+unsigned long long xen_pgd_val(pgd_t);
+
+pte_t xen_make_pte(unsigned long long);
+pmd_t xen_make_pmd(unsigned long long);
+pgd_t xen_make_pgd(unsigned long long);
+
+void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval);
+void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
+void xen_set_pud(pud_t *ptr, pud_t val);
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void xen_pmd_clear(pmd_t *pmdp);
+
+
+#else
+unsigned long xen_pte_val(pte_t);
+unsigned long xen_pmd_val(pmd_t);
+unsigned long xen_pgd_val(pgd_t);
+
+pte_t xen_make_pte(unsigned long);
+pmd_t xen_make_pmd(unsigned long);
+pgd_t xen_make_pgd(unsigned long);
+#endif
+
+#endif /* _XEN_MMU_H */
--- /dev/null
+/*
+ * Xen hypercall batching.
+ *
+ * Xen allows multiple hypercalls to be issued at once, using the
+ * multicall interface. This allows the cost of trapping into the
+ * hypervisor to be amortized over several calls.
+ *
+ * This file implements a simple interface for multicalls. There's a
+ * per-cpu buffer of outstanding multicalls. When you want to queue a
+ * multicall for issuing, you can allocate a multicall slot for the
+ * call and its arguments, along with storage for space which is
+ * pointed to by the arguments (for passing pointers to structures,
+ * etc). When the multicall is actually issued, all the space for the
+ * commands and allocated memory is freed for reuse.
+ *
+ * Multicalls are flushed whenever any of the buffers get full, or
+ * when explicitly requested. There's no way to get per-multicall
+ * return results back. It will BUG if any of the multicalls fail.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+
+#include <asm/xen/hypercall.h>
+
+#include "multicalls.h"
+
+#define MC_BATCH 32
+#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
+
+struct mc_buffer {
+ struct multicall_entry entries[MC_BATCH];
+ u64 args[MC_ARGS];
+ unsigned mcidx, argidx;
+};
+
+static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
+DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+void xen_mc_flush(void)
+{
+ struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ int ret = 0;
+ unsigned long flags;
+
+ BUG_ON(preemptible());
+
+ /* Disable interrupts in case someone comes in and queues
+ something in the middle */
+ local_irq_save(flags);
+
+ if (b->mcidx) {
+ int i;
+
+ if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
+ BUG();
+ for (i = 0; i < b->mcidx; i++)
+ if (b->entries[i].result < 0)
+ ret++;
+ b->mcidx = 0;
+ b->argidx = 0;
+ } else
+ BUG_ON(b->argidx != 0);
+
+ local_irq_restore(flags);
+
+ BUG_ON(ret);
+}
+
+struct multicall_space __xen_mc_entry(size_t args)
+{
+ struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ struct multicall_space ret;
+ unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
+
+ BUG_ON(preemptible());
+ BUG_ON(argspace > MC_ARGS);
+
+ if (b->mcidx == MC_BATCH ||
+ (b->argidx + argspace) > MC_ARGS)
+ xen_mc_flush();
+
+ ret.mc = &b->entries[b->mcidx];
+ b->mcidx++;
+ ret.args = &b->args[b->argidx];
+ b->argidx += argspace;
+
+ return ret;
+}
--- /dev/null
+#ifndef _XEN_MULTICALLS_H
+#define _XEN_MULTICALLS_H
+
+#include "xen-ops.h"
+
+/* Multicalls */
+struct multicall_space
+{
+ struct multicall_entry *mc;
+ void *args;
+};
+
+/* Allocate room for a multicall and its args */
+struct multicall_space __xen_mc_entry(size_t args);
+
+DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+/* Call to start a batch of multiple __xen_mc_entry()s. Must be
+ paired with xen_mc_issue() */
+static inline void xen_mc_batch(void)
+{
+ /* need to disable interrupts until this entry is complete */
+ local_irq_save(__get_cpu_var(xen_mc_irq_flags));
+}
+
+static inline struct multicall_space xen_mc_entry(size_t args)
+{
+ xen_mc_batch();
+ return __xen_mc_entry(args);
+}
+
+/* Flush all pending multicalls */
+void xen_mc_flush(void);
+
+/* Issue a multicall if we're not in a lazy mode */
+static inline void xen_mc_issue(unsigned mode)
+{
+ if ((xen_get_lazy_mode() & mode) == 0)
+ xen_mc_flush();
+
+ /* restore flags saved in xen_mc_batch */
+ local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+}
+
+#endif /* _XEN_MULTICALLS_H */
--- /dev/null
+/*
+ * Machine specific setup for xen
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/pm.h>
+
+#include <asm/elf.h>
+#include <asm/e820.h>
+#include <asm/setup.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/interface/physdev.h>
+#include <xen/features.h>
+
+#include "xen-ops.h"
+#include "vdso.h"
+
+/* These are code, but not functions. Defined in entry.S */
+extern const char xen_hypervisor_callback[];
+extern const char xen_failsafe_callback[];
+
+unsigned long *phys_to_machine_mapping;
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+/**
+ * machine_specific_memory_setup - Hook for machine specific memory setup.
+ **/
+
+char * __init xen_memory_setup(void)
+{
+ unsigned long max_pfn = xen_start_info->nr_pages;
+
+ e820.nr_map = 0;
+ add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
+
+ return "Xen";
+}
+
+static void xen_idle(void)
+{
+ local_irq_disable();
+
+ if (need_resched())
+ local_irq_enable();
+ else {
+ current_thread_info()->status &= ~TS_POLLING;
+ smp_mb__after_clear_bit();
+ safe_halt();
+ current_thread_info()->status |= TS_POLLING;
+ }
+}
+
+/*
+ * Set the bit indicating "nosegneg" library variants should be used.
+ */
+static void fiddle_vdso(void)
+{
+ extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */
+ extern char vsyscall_int80_start;
+ u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK +
+ &vsyscall_int80_start);
+ *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+}
+
+void __init xen_arch_setup(void)
+{
+ struct physdev_set_iopl set_iopl;
+ int rc;
+
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
+
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
+
+ HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
+ __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+
+ set_iopl.iopl = 1;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+ if (rc != 0)
+ printk(KERN_INFO "physdev_op failed %d\n", rc);
+
+#ifdef CONFIG_ACPI
+ if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
+ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+ disable_acpi();
+ }
+#endif
+
+ memcpy(boot_command_line, xen_start_info->cmd_line,
+ MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
+ COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
+
+ pm_idle = xen_idle;
+
+#ifdef CONFIG_SMP
+ /* fill cpus_possible with all available cpus */
+ xen_fill_possible_map();
+#endif
+
+ paravirt_disable_iospace();
+
+ fiddle_vdso();
+}
--- /dev/null
+/*
+ * Xen SMP support
+ *
+ * This file implements the Xen versions of smp_ops. SMP under Xen is
+ * very straightforward. Bringing a CPU up is simply a matter of
+ * loading its initial context and setting it running.
+ *
+ * IPIs are handled through the Xen event mechanism.
+ *
+ * Because virtual CPUs can be scheduled onto any real CPU, there's no
+ * useful topology information for the kernel to make use of. As a
+ * result, all CPUs are treated as if they're single-core and
+ * single-threaded.
+ *
+ * This does not handle HOTPLUG_CPU yet.
+ */
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/smp.h>
+
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/cpu.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/page.h>
+#include <xen/events.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+
+static cpumask_t cpu_initialized_map;
+static DEFINE_PER_CPU(int, resched_irq);
+static DEFINE_PER_CPU(int, callfunc_irq);
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ */
+static DEFINE_SPINLOCK(call_lock);
+
+struct call_data_struct {
+ void (*func) (void *info);
+ void *info;
+ atomic_t started;
+ atomic_t finished;
+ int wait;
+};
+
+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
+
+static struct call_data_struct *call_data;
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
+{
+ return IRQ_HANDLED;
+}
+
+static __cpuinit void cpu_bringup_and_idle(void)
+{
+ int cpu = smp_processor_id();
+
+ cpu_init();
+
+ preempt_disable();
+ per_cpu(cpu_state, cpu) = CPU_ONLINE;
+
+ xen_setup_cpu_clockevents();
+
+ /* We can take interrupts now: we're officially "up". */
+ local_irq_enable();
+
+ wmb(); /* make sure everything is out */
+ cpu_idle();
+}
+
+static int xen_smp_intr_init(unsigned int cpu)
+{
+ int rc;
+ const char *resched_name, *callfunc_name;
+
+ per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
+
+ resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
+ rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
+ cpu,
+ xen_reschedule_interrupt,
+ IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ resched_name,
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(resched_irq, cpu) = rc;
+
+ callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
+ rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
+ cpu,
+ xen_call_function_interrupt,
+ IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ callfunc_name,
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(callfunc_irq, cpu) = rc;
+
+ return 0;
+
+ fail:
+ if (per_cpu(resched_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+ if (per_cpu(callfunc_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+ return rc;
+}
+
+void __init xen_fill_possible_map(void)
+{
+ int i, rc;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+ if (rc >= 0)
+ cpu_set(i, cpu_possible_map);
+ }
+}
+
+void __init xen_smp_prepare_boot_cpu(void)
+{
+ int cpu;
+
+ BUG_ON(smp_processor_id() != 0);
+ native_smp_prepare_boot_cpu();
+
+ /* We've switched to the "real" per-cpu gdt, so make sure the
+ old memory can be recycled */
+ make_lowmem_page_readwrite(&per_cpu__gdt_page);
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ cpus_clear(cpu_sibling_map[cpu]);
+ cpus_clear(cpu_core_map[cpu]);
+ }
+
+ xen_setup_vcpu_info_placement();
+}
+
+void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+{
+ unsigned cpu;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ cpus_clear(cpu_sibling_map[cpu]);
+ cpus_clear(cpu_core_map[cpu]);
+ }
+
+ smp_store_cpu_info(0);
+ set_cpu_sibling_map(0);
+
+ if (xen_smp_intr_init(0))
+ BUG();
+
+ cpu_initialized_map = cpumask_of_cpu(0);
+
+ /* Restrict the possible_map according to max_cpus. */
+ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
+ for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
+ continue;
+ cpu_clear(cpu, cpu_possible_map);
+ }
+
+ for_each_possible_cpu (cpu) {
+ struct task_struct *idle;
+
+ if (cpu == 0)
+ continue;
+
+ idle = fork_idle(cpu);
+ if (IS_ERR(idle))
+ panic("failed fork for CPU %d", cpu);
+
+ cpu_set(cpu, cpu_present_map);
+ }
+
+ //init_xenbus_allowed_cpumask();
+}
+
+static __cpuinit int
+cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
+{
+ struct vcpu_guest_context *ctxt;
+ struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
+
+ if (cpu_test_and_set(cpu, cpu_initialized_map))
+ return 0;
+
+ ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+ if (ctxt == NULL)
+ return -ENOMEM;
+
+ ctxt->flags = VGCF_IN_KERNEL;
+ ctxt->user_regs.ds = __USER_DS;
+ ctxt->user_regs.es = __USER_DS;
+ ctxt->user_regs.fs = __KERNEL_PERCPU;
+ ctxt->user_regs.gs = 0;
+ ctxt->user_regs.ss = __KERNEL_DS;
+ ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+ ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+
+ memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
+
+ xen_copy_trap_info(ctxt->trap_ctxt);
+
+ ctxt->ldt_ents = 0;
+
+ BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
+ make_lowmem_page_readonly(gdt->gdt);
+
+ ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
+ ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
+
+ ctxt->user_regs.cs = __KERNEL_CS;
+ ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
+
+ ctxt->kernel_ss = __KERNEL_DS;
+ ctxt->kernel_sp = idle->thread.esp0;
+
+ ctxt->event_callback_cs = __KERNEL_CS;
+ ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
+ ctxt->failsafe_callback_cs = __KERNEL_CS;
+ ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
+
+ per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+ ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
+ BUG();
+
+ kfree(ctxt);
+ return 0;
+}
+
+int __cpuinit xen_cpu_up(unsigned int cpu)
+{
+ struct task_struct *idle = idle_task(cpu);
+ int rc;
+
+#if 0
+ rc = cpu_up_check(cpu);
+ if (rc)
+ return rc;
+#endif
+
+ init_gdt(cpu);
+ per_cpu(current_task, cpu) = idle;
+ irq_ctx_init(cpu);
+ xen_setup_timer(cpu);
+
+ /* make sure interrupts start blocked */
+ per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
+
+ rc = cpu_initialize_context(cpu, idle);
+ if (rc)
+ return rc;
+
+ if (num_online_cpus() == 1)
+ alternatives_smp_switch(1);
+
+ rc = xen_smp_intr_init(cpu);
+ if (rc)
+ return rc;
+
+ smp_store_cpu_info(cpu);
+ set_cpu_sibling_map(cpu);
+ /* This must be done before setting cpu_online_map */
+ wmb();
+
+ cpu_set(cpu, cpu_online_map);
+
+ rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
+ BUG_ON(rc);
+
+ return 0;
+}
+
+void xen_smp_cpus_done(unsigned int max_cpus)
+{
+}
+
+static void stop_self(void *v)
+{
+ int cpu = smp_processor_id();
+
+ /* make sure we're not pinning something down */
+ load_cr3(swapper_pg_dir);
+ /* should set up a minimal gdt */
+
+ HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
+ BUG();
+}
+
+void xen_smp_send_stop(void)
+{
+ smp_call_function(stop_self, NULL, 0, 0);
+}
+
+void xen_smp_send_reschedule(int cpu)
+{
+ xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
+}
+
+
+static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
+{
+ unsigned cpu;
+
+ cpus_and(mask, mask, cpu_online_map);
+
+ for_each_cpu_mask(cpu, mask)
+ xen_send_IPI_one(cpu, vector);
+}
+
+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
+{
+ void (*func) (void *info) = call_data->func;
+ void *info = call_data->info;
+ int wait = call_data->wait;
+
+ /*
+ * Notify initiating CPU that I've grabbed the data and am
+ * about to execute the function
+ */
+ mb();
+ atomic_inc(&call_data->started);
+ /*
+ * At this point the info structure may be out of scope unless wait==1
+ */
+ irq_enter();
+ (*func)(info);
+ irq_exit();
+
+ if (wait) {
+ mb(); /* commit everything before setting finished */
+ atomic_inc(&call_data->finished);
+ }
+
+ return IRQ_HANDLED;
+}
+
+int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
+ void *info, int wait)
+{
+ struct call_data_struct data;
+ int cpus;
+
+ /* Holding any lock stops cpus from going down. */
+ spin_lock(&call_lock);
+
+ cpu_clear(smp_processor_id(), mask);
+
+ cpus = cpus_weight(mask);
+ if (!cpus) {
+ spin_unlock(&call_lock);
+ return 0;
+ }
+
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ mb(); /* write everything before IPI */
+
+ /* Send a message to other CPUs and wait for them to respond */
+ xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
+
+ /* Make sure other vcpus get a chance to run.
+ XXX too severe? Maybe we should check the other CPU's states? */
+ HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus ||
+ (wait && atomic_read(&data.finished) != cpus))
+ cpu_relax();
+
+ spin_unlock(&call_lock);
+
+ return 0;
+}
--- /dev/null
+/*
+ * Xen time implementation.
+ *
+ * This is implemented in terms of a clocksource driver which uses
+ * the hypervisor clock as a nanosecond timebase, and a clockevent
+ * driver which uses the hypervisor's timer mechanism.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/kernel_stat.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include "xen-ops.h"
+
+#define XEN_SHIFT 22
+
+/* Xen may fire a timer up to this many ns early */
+#define TIMER_SLOP 100000
+#define NS_PER_TICK (1000000000LL / HZ)
+
+static cycle_t xen_clocksource_read(void);
+
+/* These are perodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+ u64 tsc_timestamp; /* TSC at last update of time vals. */
+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
+ u32 tsc_to_nsec_mul;
+ int tsc_shift;
+ u32 version;
+};
+
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+
+/* runstate info updated by Xen */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
+/* snapshots of runstate info */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
+
+/* unused ns of stolen and blocked time */
+static DEFINE_PER_CPU(u64, residual_stolen);
+static DEFINE_PER_CPU(u64, residual_blocked);
+
+/* return an consistent snapshot of 64-bit time/counter value */
+static u64 get64(const u64 *p)
+{
+ u64 ret;
+
+ if (BITS_PER_LONG < 64) {
+ u32 *p32 = (u32 *)p;
+ u32 h, l;
+
+ /*
+ * Read high then low, and then make sure high is
+ * still the same; this will only loop if low wraps
+ * and carries into high.
+ * XXX some clean way to make this endian-proof?
+ */
+ do {
+ h = p32[1];
+ barrier();
+ l = p32[0];
+ barrier();
+ } while (p32[1] != h);
+
+ ret = (((u64)h) << 32) | l;
+ } else
+ ret = *p;
+
+ return ret;
+}
+
+/*
+ * Runstate accounting
+ */
+static void get_runstate_snapshot(struct vcpu_runstate_info *res)
+{
+ u64 state_time;
+ struct vcpu_runstate_info *state;
+
+ BUG_ON(preemptible());
+
+ state = &__get_cpu_var(runstate);
+
+ /*
+ * The runstate info is always updated by the hypervisor on
+ * the current CPU, so there's no need to use anything
+ * stronger than a compiler barrier when fetching it.
+ */
+ do {
+ state_time = get64(&state->state_entry_time);
+ barrier();
+ *res = *state;
+ barrier();
+ } while (get64(&state->state_entry_time) != state_time);
+}
+
+static void setup_runstate_info(int cpu)
+{
+ struct vcpu_register_runstate_memory_area area;
+
+ area.addr.v = &per_cpu(runstate, cpu);
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
+ cpu, &area))
+ BUG();
+}
+
+static void do_stolen_accounting(void)
+{
+ struct vcpu_runstate_info state;
+ struct vcpu_runstate_info *snap;
+ s64 blocked, runnable, offline, stolen;
+ cputime_t ticks;
+
+ get_runstate_snapshot(&state);
+
+ WARN_ON(state.state != RUNSTATE_running);
+
+ snap = &__get_cpu_var(runstate_snapshot);
+
+ /* work out how much time the VCPU has not been runn*ing* */
+ blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
+ runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
+ offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
+
+ *snap = state;
+
+ /* Add the appropriate number of ticks of stolen time,
+ including any left-overs from last time. Passing NULL to
+ account_steal_time accounts the time as stolen. */
+ stolen = runnable + offline + __get_cpu_var(residual_stolen);
+
+ if (stolen < 0)
+ stolen = 0;
+
+ ticks = 0;
+ while (stolen >= NS_PER_TICK) {
+ ticks++;
+ stolen -= NS_PER_TICK;
+ }
+ __get_cpu_var(residual_stolen) = stolen;
+ account_steal_time(NULL, ticks);
+
+ /* Add the appropriate number of ticks of blocked time,
+ including any left-overs from last time. Passing idle to
+ account_steal_time accounts the time as idle/wait. */
+ blocked += __get_cpu_var(residual_blocked);
+
+ if (blocked < 0)
+ blocked = 0;
+
+ ticks = 0;
+ while (blocked >= NS_PER_TICK) {
+ ticks++;
+ blocked -= NS_PER_TICK;
+ }
+ __get_cpu_var(residual_blocked) = blocked;
+ account_steal_time(idle_task(smp_processor_id()), ticks);
+}
+
+/*
+ * Xen sched_clock implementation. Returns the number of unstolen
+ * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
+ * states.
+ */
+unsigned long long xen_sched_clock(void)
+{
+ struct vcpu_runstate_info state;
+ cycle_t now;
+ u64 ret;
+ s64 offset;
+
+ /*
+ * Ideally sched_clock should be called on a per-cpu basis
+ * anyway, so preempt should already be disabled, but that's
+ * not current practice at the moment.
+ */
+ preempt_disable();
+
+ now = xen_clocksource_read();
+
+ get_runstate_snapshot(&state);
+
+ WARN_ON(state.state != RUNSTATE_running);
+
+ offset = now - state.state_entry_time;
+ if (offset < 0)
+ offset = 0;
+
+ ret = state.time[RUNSTATE_blocked] +
+ state.time[RUNSTATE_running] +
+ offset;
+
+ preempt_enable();
+
+ return ret;
+}
+
+
+/* Get the CPU speed from Xen */
+unsigned long xen_cpu_khz(void)
+{
+ u64 cpu_khz = 1000000ULL << 32;
+ const struct vcpu_time_info *info =
+ &HYPERVISOR_shared_info->vcpu_info[0].time;
+
+ do_div(cpu_khz, info->tsc_to_system_mul);
+ if (info->tsc_shift < 0)
+ cpu_khz <<= -info->tsc_shift;
+ else
+ cpu_khz >>= info->tsc_shift;
+
+ return cpu_khz;
+}
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area.
+ */
+static unsigned get_time_values_from_xen(void)
+{
+ struct vcpu_time_info *src;
+ struct shadow_time_info *dst;
+
+ /* src is shared memory with the hypervisor, so we need to
+ make sure we get a consistent snapshot, even in the face of
+ being preempted. */
+ src = &__get_cpu_var(xen_vcpu)->time;
+ dst = &__get_cpu_var(shadow_time);
+
+ do {
+ dst->version = src->version;
+ rmb(); /* fetch version before data */
+ dst->tsc_timestamp = src->tsc_timestamp;
+ dst->system_timestamp = src->system_time;
+ dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
+ dst->tsc_shift = src->tsc_shift;
+ rmb(); /* test version after fetching data */
+ } while ((src->version & 1) | (dst->version ^ src->version));
+
+ return dst->version;
+}
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2;
+#endif
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif __x86_64__
+ __asm__ (
+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+ return product;
+}
+
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+ u64 now, delta;
+ now = native_read_tsc();
+ delta = now - shadow->tsc_timestamp;
+ return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+static cycle_t xen_clocksource_read(void)
+{
+ struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
+ cycle_t ret;
+ unsigned version;
+
+ do {
+ version = get_time_values_from_xen();
+ barrier();
+ ret = shadow->system_timestamp + get_nsec_offset(shadow);
+ barrier();
+ } while (version != __get_cpu_var(xen_vcpu)->time.version);
+
+ put_cpu_var(shadow_time);
+
+ return ret;
+}
+
+static void xen_read_wallclock(struct timespec *ts)
+{
+ const struct shared_info *s = HYPERVISOR_shared_info;
+ u32 version;
+ u64 delta;
+ struct timespec now;
+
+ /* get wallclock at system boot */
+ do {
+ version = s->wc_version;
+ rmb(); /* fetch version before time */
+ now.tv_sec = s->wc_sec;
+ now.tv_nsec = s->wc_nsec;
+ rmb(); /* fetch time before checking version */
+ } while ((s->wc_version & 1) | (version ^ s->wc_version));
+
+ delta = xen_clocksource_read(); /* time since system boot */
+ delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+ now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+ now.tv_sec = delta;
+
+ set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+}
+
+unsigned long xen_get_wallclock(void)
+{
+ struct timespec ts;
+
+ xen_read_wallclock(&ts);
+
+ return ts.tv_sec;
+}
+
+int xen_set_wallclock(unsigned long now)
+{
+ /* do nothing for domU */
+ return -1;
+}
+
+static struct clocksource xen_clocksource __read_mostly = {
+ .name = "xen",
+ .rating = 400,
+ .read = xen_clocksource_read,
+ .mask = ~0,
+ .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
+ .shift = XEN_SHIFT,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+/*
+ Xen clockevent implementation
+
+ Xen has two clockevent implementations:
+
+ The old timer_op one works with all released versions of Xen prior
+ to version 3.0.4. This version of the hypervisor provides a
+ single-shot timer with nanosecond resolution. However, sharing the
+ same event channel is a 100Hz tick which is delivered while the
+ vcpu is running. We don't care about or use this tick, but it will
+ cause the core time code to think the timer fired too soon, and
+ will end up resetting it each time. It could be filtered, but
+ doing so has complications when the ktime clocksource is not yet
+ the xen clocksource (ie, at boot time).
+
+ The new vcpu_op-based timer interface allows the tick timer period
+ to be changed or turned off. The tick timer is not useful as a
+ periodic timer because events are only delivered to running vcpus.
+ The one-shot timer can report when a timeout is in the past, so
+ set_next_event is capable of returning -ETIME when appropriate.
+ This interface is used when available.
+*/
+
+
+/*
+ Get a hypervisor absolute time. In theory we could maintain an
+ offset between the kernel's time and the hypervisor's time, and
+ apply that to a kernel's absolute timeout. Unfortunately the
+ hypervisor and kernel times can drift even if the kernel is using
+ the Xen clocksource, because ntp can warp the kernel's clocksource.
+*/
+static s64 get_abs_timeout(unsigned long delta)
+{
+ return xen_clocksource_read() + delta;
+}
+
+static void xen_timerop_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ /* unsupported */
+ WARN_ON(1);
+ break;
+
+ case CLOCK_EVT_MODE_ONESHOT:
+ case CLOCK_EVT_MODE_RESUME:
+ break;
+
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ HYPERVISOR_set_timer_op(0); /* cancel timeout */
+ break;
+ }
+}
+
+static int xen_timerop_set_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+ if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
+ BUG();
+
+ /* We may have missed the deadline, but there's no real way of
+ knowing for sure. If the event was in the past, then we'll
+ get an immediate interrupt. */
+
+ return 0;
+}
+
+static const struct clock_event_device xen_timerop_clockevent = {
+ .name = "xen",
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+
+ .max_delta_ns = 0xffffffff,
+ .min_delta_ns = TIMER_SLOP,
+
+ .mult = 1,
+ .shift = 0,
+ .rating = 500,
+
+ .set_mode = xen_timerop_set_mode,
+ .set_next_event = xen_timerop_set_next_event,
+};
+
+
+
+static void xen_vcpuop_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ int cpu = smp_processor_id();
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ WARN_ON(1); /* unsupported */
+ break;
+
+ case CLOCK_EVT_MODE_ONESHOT:
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+ BUG();
+ break;
+
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
+ HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+ BUG();
+ break;
+ case CLOCK_EVT_MODE_RESUME:
+ break;
+ }
+}
+
+static int xen_vcpuop_set_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ int cpu = smp_processor_id();
+ struct vcpu_set_singleshot_timer single;
+ int ret;
+
+ WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+ single.timeout_abs_ns = get_abs_timeout(delta);
+ single.flags = VCPU_SSHOTTMR_future;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
+
+ BUG_ON(ret != 0 && ret != -ETIME);
+
+ return ret;
+}
+
+static const struct clock_event_device xen_vcpuop_clockevent = {
+ .name = "xen",
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+
+ .max_delta_ns = 0xffffffff,
+ .min_delta_ns = TIMER_SLOP,
+
+ .mult = 1,
+ .shift = 0,
+ .rating = 500,
+
+ .set_mode = xen_vcpuop_set_mode,
+ .set_next_event = xen_vcpuop_set_next_event,
+};
+
+static const struct clock_event_device *xen_clockevent =
+ &xen_timerop_clockevent;
+static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
+
+static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
+{
+ struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
+ irqreturn_t ret;
+
+ ret = IRQ_NONE;
+ if (evt->event_handler) {
+ evt->event_handler(evt);
+ ret = IRQ_HANDLED;
+ }
+
+ do_stolen_accounting();
+
+ return ret;
+}
+
+void xen_setup_timer(int cpu)
+{
+ const char *name;
+ struct clock_event_device *evt;
+ int irq;
+
+ printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
+
+ name = kasprintf(GFP_KERNEL, "timer%d", cpu);
+ if (!name)
+ name = "<timer kasprintf failed>";
+
+ irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
+ IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ name, NULL);
+
+ evt = &per_cpu(xen_clock_events, cpu);
+ memcpy(evt, xen_clockevent, sizeof(*evt));
+
+ evt->cpumask = cpumask_of_cpu(cpu);
+ evt->irq = irq;
+
+ setup_runstate_info(cpu);
+}
+
+void xen_setup_cpu_clockevents(void)
+{
+ BUG_ON(preemptible());
+
+ clockevents_register_device(&__get_cpu_var(xen_clock_events));
+}
+
+__init void xen_time_init(void)
+{
+ int cpu = smp_processor_id();
+
+ get_time_values_from_xen();
+
+ clocksource_register(&xen_clocksource);
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
+ /* Successfully turned off 100Hz tick, so we have the
+ vcpuop-based timer interface */
+ printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
+ xen_clockevent = &xen_vcpuop_clockevent;
+ }
+
+ /* Set initial system time with full resolution */
+ xen_read_wallclock(&xtime);
+ set_normalized_timespec(&wall_to_monotonic,
+ -xtime.tv_sec, -xtime.tv_nsec);
+
+ tsc_disable = 0;
+
+ xen_setup_timer(cpu);
+ xen_setup_cpu_clockevents();
+}
--- /dev/null
+/* Bit used for the pseudo-hwcap for non-negative segments. We use
+ bit 1 to avoid bugs in some versions of glibc when bit 0 is
+ used; the choice is otherwise arbitrary. */
+#define VDSO_NOTE_NONEGSEG_BIT 1
--- /dev/null
+/*
+ Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+ The inline versions are the same as the direct-use versions, with the
+ pre- and post-amble chopped off.
+
+ This code is encoded for size rather than absolute efficiency,
+ with a view to being able to inline as much as possible.
+
+ We only bother with direct forms (ie, vcpu in pda) of the operations
+ here; the indirect forms are better handled in C, since they're
+ generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
+
+#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x) .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+/*
+ Enable events. This clears the event mask and tests the pending
+ event status with one and operation. If there are pending
+ events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+ /* Clear mask and test pending */
+ andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+ /* Preempt here doesn't matter because that will deal with
+ any pending interrupts. The pending check may end up being
+ run on the wrong CPU, but that doesn't hurt. */
+ jz 1f
+2: call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+ ret
+ ENDPROC(xen_irq_enable_direct)
+ RELOC(xen_irq_enable_direct, 2b+1)
+
+
+/*
+ Disabling events is simply a matter of making the event mask
+ non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+ movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ENDPATCH(xen_irq_disable_direct)
+ ret
+ ENDPROC(xen_irq_disable_direct)
+ RELOC(xen_irq_disable_direct, 0)
+
+/*
+ (xen_)save_fl is used to get the current interrupt enable status.
+ Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ may be set in the return value. We take advantage of this by
+ making sure that X86_EFLAGS_IF has the right value (and other bits
+ in that byte are 0), but other bits in the return value are
+ undefined. We need to toggle the state of the bit, because
+ Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ setz %ah
+ addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+ ret
+ ENDPROC(xen_save_fl_direct)
+ RELOC(xen_save_fl_direct, 0)
+
+
+/*
+ In principle the caller should be passing us a value return
+ from xen_save_fl_direct, but for robustness sake we test only
+ the X86_EFLAGS_IF flag rather than the whole byte. After
+ setting the interrupt mask state, it checks for unmasked
+ pending events and enters the hypervisor to get them delivered
+ if so.
+ */
+ENTRY(xen_restore_fl_direct)
+ testb $X86_EFLAGS_IF>>8, %ah
+ setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ /* Preempt here doesn't matter because that will deal with
+ any pending interrupts. The pending check may end up being
+ run on the wrong CPU, but that doesn't hurt. */
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+ jz 1f
+2: call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+ ret
+ ENDPROC(xen_restore_fl_direct)
+ RELOC(xen_restore_fl_direct, 2b+1)
+
+/*
+ This is run where a normal iret would be run, with the same stack setup:
+ 8: eflags
+ 4: cs
+ esp-> 0: eip
+
+ This attempts to make sure that any pending events are dealt
+ with on return to usermode, but there is a small window in
+ which an event can happen just before entering usermode. If
+ the nested interrupt ends up setting one of the TIF_WORK_MASK
+ pending work flags, they will not be tested again before
+ returning to usermode. This means that a process can end up
+ with pending work, which will be unprocessed until the process
+ enters and leaves the kernel again, which could be an
+ unbounded amount of time. This means that a pending signal or
+ reschedule event could be indefinitely delayed.
+
+ The fix is to notice a nested interrupt in the critical
+ window, and if one occurs, then fold the nested interrupt into
+ the current interrupt stack frame, and re-process it
+ iteratively rather than recursively. This means that it will
+ exit via the normal path, and all pending work will be dealt
+ with appropriately.
+
+ Because the nested interrupt handler needs to deal with the
+ current stack state in whatever form its in, we keep things
+ simple by only using a single register which is pushed/popped
+ on the stack.
+
+ Non-direct iret could be done in the same way, but it would
+ require an annoying amount of code duplication. We'll assume
+ that direct mode will be the common case once the hypervisor
+ support becomes commonplace.
+ */
+ENTRY(xen_iret_direct)
+ /* test eflags for special cases */
+ testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
+ jnz hyper_iret
+
+ push %eax
+ ESP_OFFSET=4 # bytes pushed onto stack
+
+ /* Store vcpu_info pointer for easy access. Do it this
+ way to avoid having to reload %fs */
+#ifdef CONFIG_SMP
+ GET_THREAD_INFO(%eax)
+ movl TI_cpu(%eax),%eax
+ movl __per_cpu_offset(,%eax,4),%eax
+ lea per_cpu__xen_vcpu_info(%eax),%eax
+#else
+ movl $per_cpu__xen_vcpu_info, %eax
+#endif
+
+ /* check IF state we're restoring */
+ testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
+
+ /* Maybe enable events. Once this happens we could get a
+ recursive event, so the critical region starts immediately
+ afterwards. However, if that happens we don't end up
+ resuming the code, so we don't have to be worried about
+ being preempted to another CPU. */
+ setz XEN_vcpu_info_mask(%eax)
+xen_iret_start_crit:
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, XEN_vcpu_info_pending(%eax)
+
+ /* If there's something pending, mask events again so we
+ can jump back into xen_hypervisor_callback */
+ sete XEN_vcpu_info_mask(%eax)
+
+ popl %eax
+
+ /* From this point on the registers are restored and the stack
+ updated, so we don't need to worry about it if we're preempted */
+iret_restore_end:
+
+ /* Jump to hypervisor_callback after fixing up the stack.
+ Events are masked, so jumping out of the critical
+ region is OK. */
+ je xen_hypervisor_callback
+
+ iret
+xen_iret_end_crit:
+
+hyper_iret:
+ /* put this out of line since its very rarely used */
+ jmp hypercall_page + __HYPERVISOR_iret * 32
+
+ .globl xen_iret_start_crit, xen_iret_end_crit
+
+/*
+ This is called by xen_hypervisor_callback in entry.S when it sees
+ that the EIP at the time of interrupt was between xen_iret_start_crit
+ and xen_iret_end_crit. We're passed the EIP in %eax so we can do
+ a more refined determination of what to do.
+
+ The stack format at this point is:
+ ----------------
+ ss : (ss/esp may be present if we came from usermode)
+ esp :
+ eflags } outer exception info
+ cs }
+ eip }
+ ---------------- <- edi (copy dest)
+ eax : outer eax if it hasn't been restored
+ ----------------
+ eflags } nested exception info
+ cs } (no ss/esp because we're nested
+ eip } from the same ring)
+ orig_eax }<- esi (copy src)
+ - - - - - - - -
+ fs }
+ es }
+ ds } SAVE_ALL state
+ eax }
+ : :
+ ebx }
+ ----------------
+ return addr <- esp
+ ----------------
+
+ In order to deliver the nested exception properly, we need to shift
+ everything from the return addr up to the error code so it
+ sits just under the outer exception info. This means that when we
+ handle the exception, we do it in the context of the outer exception
+ rather than starting a new one.
+
+ The only caveat is that if the outer eax hasn't been
+ restored yet (ie, it's still on stack), we need to insert
+ its value into the SAVE_ALL state before going on, since
+ it's usermode state which we eventually need to restore.
+ */
+ENTRY(xen_iret_crit_fixup)
+ /* offsets +4 for return address */
+
+ /*
+ Paranoia: Make sure we're really coming from userspace.
+ One could imagine a case where userspace jumps into the
+ critical range address, but just before the CPU delivers a GP,
+ it decides to deliver an interrupt instead. Unlikely?
+ Definitely. Easy to avoid? Yes. The Intel documents
+ explicitly say that the reported EIP for a bad jump is the
+ jump instruction itself, not the destination, but some virtual
+ environments get this wrong.
+ */
+ movl PT_CS+4(%esp), %ecx
+ andl $SEGMENT_RPL_MASK, %ecx
+ cmpl $USER_RPL, %ecx
+ je 2f
+
+ lea PT_ORIG_EAX+4(%esp), %esi
+ lea PT_EFLAGS+4(%esp), %edi
+
+ /* If eip is before iret_restore_end then stack
+ hasn't been restored yet. */
+ cmp $iret_restore_end, %eax
+ jae 1f
+
+ movl 0+4(%edi),%eax /* copy EAX */
+ movl %eax, PT_EAX+4(%esp)
+
+ lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
+
+ /* set up the copy */
+1: std
+ mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
+ rep movsl
+ cld
+
+ lea 4(%edi),%esp /* point esp to new frame */
+2: ret
+
+
+/*
+ Force an event check by making a hypercall,
+ but preserve regs before making the call.
+ */
+check_events:
+ push %eax
+ push %ecx
+ push %edx
+ call force_evtchn_callback
+ pop %edx
+ pop %ecx
+ pop %eax
+ ret
--- /dev/null
+/* Xen-specific pieces of head.S, intended to be included in the right
+ place in head.S */
+
+#ifdef CONFIG_XEN
+
+#include <linux/elfnote.h>
+#include <asm/boot.h>
+#include <xen/interface/elfnote.h>
+
+.pushsection .init.text
+ENTRY(startup_xen)
+ movl %esi,xen_start_info
+ cld
+ movl $(init_thread_union+THREAD_SIZE),%esp
+ jmp xen_start_kernel
+.popsection
+
+.pushsection .bss.page_aligned
+ .align PAGE_SIZE_asm
+ENTRY(hypercall_page)
+ .skip 0x1000
+.popsection
+
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
+ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
+#ifdef CONFIG_X86_PAE
+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
+#else
+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
+#endif
+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
+
+#endif /*CONFIG_XEN */
--- /dev/null
+#ifndef XEN_OPS_H
+#define XEN_OPS_H
+
+#include <linux/init.h>
+
+/* These are code, but not functions. Defined in entry.S */
+extern const char xen_hypervisor_callback[];
+extern const char xen_failsafe_callback[];
+
+void xen_copy_trap_info(struct trap_info *traps);
+
+DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
+DECLARE_PER_CPU(unsigned long, xen_cr3);
+
+extern struct start_info *xen_start_info;
+extern struct shared_info *HYPERVISOR_shared_info;
+
+char * __init xen_memory_setup(void);
+void __init xen_arch_setup(void);
+void __init xen_init_IRQ(void);
+
+void xen_setup_timer(int cpu);
+void xen_setup_cpu_clockevents(void);
+unsigned long xen_cpu_khz(void);
+void __init xen_time_init(void);
+unsigned long xen_get_wallclock(void);
+int xen_set_wallclock(unsigned long time);
+unsigned long long xen_sched_clock(void);
+
+void xen_mark_init_mm_pinned(void);
+
+DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+
+static inline unsigned xen_get_lazy_mode(void)
+{
+ return x86_read_percpu(xen_lazy_mode);
+}
+
+void __init xen_fill_possible_map(void);
+
+void __init xen_setup_vcpu_info_placement(void);
+void xen_smp_prepare_boot_cpu(void);
+void xen_smp_prepare_cpus(unsigned int max_cpus);
+int xen_cpu_up(unsigned int cpu);
+void xen_smp_cpus_done(unsigned int max_cpus);
+
+void xen_smp_send_stop(void);
+void xen_smp_send_reschedule(int cpu);
+int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+ int wait);
+int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait);
+
+int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
+ void *info, int wait);
+
+
+/* Declare an asm function, along with symbols needed to make it
+ inlineable */
+#define DECL_ASM(ret, name, ...) \
+ ret name(__VA_ARGS__); \
+ extern char name##_end[]; \
+ extern char name##_reloc[] \
+
+DECL_ASM(void, xen_irq_enable_direct, void);
+DECL_ASM(void, xen_irq_disable_direct, void);
+DECL_ASM(unsigned long, xen_save_fl_direct, void);
+DECL_ASM(void, xen_restore_fl_direct, unsigned long);
+
+void xen_iret_direct(void);
+#endif /* XEN_OPS_H */