S: Maintained
F: drivers/media/dvb-frontends/lgdt3305.*
-LGUEST
-M: Rusty Russell <rusty@rustcorp.com.au>
-L: lguest@lists.ozlabs.org
-W: http://lguest.ozlabs.org/
-S: Odd Fixes
-F: arch/x86/include/asm/lguest*.h
-F: arch/x86/lguest/
-F: drivers/lguest/
-F: include/linux/lguest*.h
-F: tools/lguest/
-
LIBATA PATA ARASAN COMPACT FLASH CONTROLLER
M: Viresh Kumar <vireshk@kernel.org>
L: linux-ide@vger.kernel.org
# Hyper-V paravirtualization support
obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/
-# lguest paravirtualization support
-obj-$(CONFIG_LGUEST_GUEST) += lguest/
-
obj-y += realmode/
obj-y += kernel/
obj-y += mm/
Statistics are displayed in debugfs filesystem. Enabling this option
may incur significant overhead.
-source "arch/x86/lguest/Kconfig"
-
config PARAVIRT_TIME_ACCOUNTING
bool "Paravirtual steal time accounting"
depends on PARAVIRT
+++ /dev/null
-#ifndef _ASM_X86_LGUEST_H
-#define _ASM_X86_LGUEST_H
-
-#define GDT_ENTRY_LGUEST_CS 10
-#define GDT_ENTRY_LGUEST_DS 11
-#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
-#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
-
-#ifndef __ASSEMBLY__
-#include <asm/desc.h>
-
-#define GUEST_PL 1
-
-/* Page for Switcher text itself, then two pages per cpu */
-#define SWITCHER_TEXT_PAGES (1)
-#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
-#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
-
-/* Where we map the Switcher, in both Host and Guest. */
-extern unsigned long switcher_addr;
-
-/* Found in switcher.S */
-extern unsigned long default_idt_entries[];
-
-/* Declarations for definitions in arch/x86/lguest/head_32.S */
-extern char lguest_noirq_iret[];
-extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_pushf[], lgend_pushf[];
-
-extern void lguest_iret(void);
-extern void lguest_init(void);
-
-struct lguest_regs {
- /* Manually saved part. */
- unsigned long eax, ebx, ecx, edx;
- unsigned long esi, edi, ebp;
- unsigned long gs;
- unsigned long fs, ds, es;
- unsigned long trapnum, errcode;
- /* Trap pushed part */
- unsigned long eip;
- unsigned long cs;
- unsigned long eflags;
- unsigned long esp;
- unsigned long ss;
-};
-
-/* This is a guest-specific page (mapped ro) into the guest. */
-struct lguest_ro_state {
- /* Host information we need to restore when we switch back. */
- u32 host_cr3;
- struct desc_ptr host_idt_desc;
- struct desc_ptr host_gdt_desc;
- u32 host_sp;
-
- /* Fields which are used when guest is running. */
- struct desc_ptr guest_idt_desc;
- struct desc_ptr guest_gdt_desc;
- struct x86_hw_tss guest_tss;
- struct desc_struct guest_idt[IDT_ENTRIES];
- struct desc_struct guest_gdt[GDT_ENTRIES];
-};
-
-struct lg_cpu_arch {
- /* The GDT entries copied into lguest_ro_state when running. */
- struct desc_struct gdt[GDT_ENTRIES];
-
- /* The IDT entries: some copied into lguest_ro_state when running. */
- struct desc_struct idt[IDT_ENTRIES];
-
- /* The address of the last guest-visible pagefault (ie. cr2). */
- unsigned long last_pagefault;
-};
-
-static inline void lguest_set_ts(void)
-{
- u32 cr0;
-
- cr0 = read_cr0();
- if (!(cr0 & 8))
- write_cr0(cr0 | 8);
-}
-
-/* Full 4G segment descriptors, suitable for CS and DS. */
-#define FULL_EXEC_SEGMENT \
- ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff))
-#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff))
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_LGUEST_H */
+++ /dev/null
-/* Architecture specific portion of the lguest hypercalls */
-#ifndef _ASM_X86_LGUEST_HCALL_H
-#define _ASM_X86_LGUEST_HCALL_H
-
-#define LHCALL_FLUSH_ASYNC 0
-#define LHCALL_LGUEST_INIT 1
-#define LHCALL_SHUTDOWN 2
-#define LHCALL_NEW_PGTABLE 4
-#define LHCALL_FLUSH_TLB 5
-#define LHCALL_LOAD_IDT_ENTRY 6
-#define LHCALL_SET_STACK 7
-#define LHCALL_SET_CLOCKEVENT 9
-#define LHCALL_HALT 10
-#define LHCALL_SET_PMD 13
-#define LHCALL_SET_PTE 14
-#define LHCALL_SET_PGD 15
-#define LHCALL_LOAD_TLS 16
-#define LHCALL_LOAD_GDT_ENTRY 18
-#define LHCALL_SEND_INTERRUPTS 19
-
-#define LGUEST_TRAP_ENTRY 0x1F
-
-/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */
-#define LGUEST_SHUTDOWN_POWEROFF 1
-#define LGUEST_SHUTDOWN_RESTART 2
-
-#ifndef __ASSEMBLY__
-#include <asm/hw_irq.h>
-
-/*G:030
- * But first, how does our Guest contact the Host to ask for privileged
- * operations? There are two ways: the direct way is to make a "hypercall",
- * to make requests of the Host Itself.
- *
- * Our hypercall mechanism uses the highest unused trap code (traps 32 and
- * above are used by real hardware interrupts). Seventeen hypercalls are
- * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
- * If a return value makes sense, it's returned in %eax.
- *
- * Grossly invalid calls result in Sudden Death at the hands of the vengeful
- * Host, rather than returning failure. This reflects Winston Churchill's
- * definition of a gentleman: "someone who is only rude intentionally".
- */
-static inline unsigned long
-hcall(unsigned long call,
- unsigned long arg1, unsigned long arg2, unsigned long arg3,
- unsigned long arg4)
-{
- /* "int" is the Intel instruction to trigger a trap. */
- asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
- /* The call in %eax (aka "a") might be overwritten */
- : "=a"(call)
- /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
- : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
- /* "memory" means this might write somewhere in memory.
- * This isn't true for all calls, but it's safe to tell
- * gcc that it might happen so it doesn't get clever. */
- : "memory");
- return call;
-}
-/*:*/
-
-/* Can't use our min() macro here: needs to be a constant */
-#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
-
-#define LHCALL_RING_SIZE 64
-struct hcall_args {
- /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
- unsigned long arg0, arg1, arg2, arg3, arg4;
-};
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_X86_LGUEST_HCALL_H */
* In case NMI unmasking or performance ever becomes a problem,
* the next best option appears to be MOV-to-CR2 and an
* unconditional jump. That sequence also works on all CPUs,
- * but it will fault at CPL3 (i.e. Xen PV and lguest).
+ * but it will fault at CPL3 (i.e. Xen PV).
*
* CPUID is the conventional way, but it's nasty: it doesn't
* exist on some 486-like CPUs, and it usually exits to a
*
* @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
* PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
- * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest
+ * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated
* @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
* which start at asm startup_xen() entry point and later jump to the C
* xen_start_kernel() entry point. Both domU and dom0 type of guests are
#include <asm/ucontext.h>
-#include <linux/lguest.h>
-#include "../../../drivers/lguest/lg.h"
-
#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
static char syscalls[] = {
#include <asm/syscalls_32.h>
OFFSET(stack_canary_offset, stack_canary, canary);
#endif
-#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
- BLANK();
- OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
- OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-
- BLANK();
- OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
- OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
- OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
- OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
- OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
- OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
- OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
- OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
- OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
- OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
-#endif
BLANK();
DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
DEFINE(NR_syscalls, sizeof(syscalls));
jmp *%eax
.Lbad_subarch:
-WEAK(lguest_entry)
WEAK(xen_entry)
/* Unknown implementation; there's really
nothing we can do at this point. */
subarch_entries:
.long .Ldefault_entry /* normal x86/PC */
- .long lguest_entry /* lguest hypervisor */
.long xen_entry /* Xen hypervisor */
.long .Ldefault_entry /* Moorestown MID */
num_subarch_entries = (. - subarch_entries) / 4
x86_platform.legacy.reserve_bios_regions = 1;
break;
case X86_SUBARCH_XEN:
- case X86_SUBARCH_LGUEST:
x86_platform.legacy.devices.pnpbios = 0;
x86_platform.legacy.rtc = 0;
break;
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
-source drivers/lguest/Kconfig
endif # VIRTUALIZATION
+++ /dev/null
-config LGUEST_GUEST
- bool "Lguest guest support"
- depends on X86_32 && PARAVIRT && PCI
- select TTY
- select VIRTUALIZATION
- select VIRTIO
- select VIRTIO_CONSOLE
- help
- Lguest is a tiny in-kernel hypervisor. Selecting this will
- allow your kernel to boot under lguest. This option will increase
- your kernel size by about 10k. If in doubt, say N.
-
- If you say Y here, make sure you say Y (or M) to the virtio block
- and net drivers which lguest needs.
+++ /dev/null
-obj-y := head_32.o boot.o
-CFLAGS_boot.o := $(call cc-option, -fno-stack-protector)
+++ /dev/null
-/*P:010
- * A hypervisor allows multiple Operating Systems to run on a single machine.
- * To quote David Wheeler: "Any problem in computer science can be solved with
- * another layer of indirection."
- *
- * We keep things simple in two ways. First, we start with a normal Linux
- * kernel and insert a module (lg.ko) which allows us to run other Linux
- * kernels the same way we'd run processes. We call the first kernel the Host,
- * and the others the Guests. The program which sets up and configures Guests
- * (such as the example in tools/lguest/lguest.c) is called the Launcher.
- *
- * Secondly, we only run specially modified Guests, not normal kernels: setting
- * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
- * how to be a Guest at boot time. This means that you can use the same kernel
- * you boot normally (ie. as a Host) as a Guest.
- *
- * These Guests know that they cannot do privileged operations, such as disable
- * interrupts, and that they have to ask the Host to do such things explicitly.
- * This file consists of all the replacements for such low-level native
- * hardware operations: these special Guest versions call the Host.
- *
- * So how does the kernel know it's a Guest? We'll see that later, but let's
- * just say that we end up here where we replace the native functions various
- * "paravirt" structures with our Guest versions, then boot like normal.
-:*/
-
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/virtio_console.h>
-#include <linux/pm.h>
-#include <linux/export.h>
-#include <linux/pci.h>
-#include <linux/virtio_pci.h>
-#include <asm/acpi.h>
-#include <asm/apic.h>
-#include <asm/lguest.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/e820/api.h>
-#include <asm/mce.h>
-#include <asm/io.h>
-#include <asm/fpu/api.h>
-#include <asm/stackprotector.h>
-#include <asm/reboot.h> /* for struct machine_ops */
-#include <asm/kvm_para.h>
-#include <asm/pci_x86.h>
-#include <asm/pci-direct.h>
-
-/*G:010
- * Welcome to the Guest!
- *
- * The Guest in our tale is a simple creature: identical to the Host but
- * behaving in simplified but equivalent ways. In particular, the Guest is the
- * same kernel as the Host (or at least, built from the same source code).
-:*/
-
-struct lguest_data lguest_data = {
- .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
- .noirq_iret = (u32)lguest_noirq_iret,
- .kernel_address = PAGE_OFFSET,
- .blocked_interrupts = { 1 }, /* Block timer interrupts */
- .syscall_vec = IA32_SYSCALL_VECTOR,
-};
-
-/*G:037
- * async_hcall() is pretty simple: I'm quite proud of it really. We have a
- * ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
- * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
- * and 255 once the Host has finished with it.
- *
- * If we come around to a slot which hasn't been finished, then the table is
- * full and we just make the hypercall directly. This has the nice side
- * effect of causing the Host to run all the stored calls in the ring buffer
- * which empties it for next time!
- */
-static void async_hcall(unsigned long call, unsigned long arg1,
- unsigned long arg2, unsigned long arg3,
- unsigned long arg4)
-{
- /* Note: This code assumes we're uniprocessor. */
- static unsigned int next_call;
- unsigned long flags;
-
- /*
- * Disable interrupts if not already disabled: we don't want an
- * interrupt handler making a hypercall while we're already doing
- * one!
- */
- local_irq_save(flags);
- if (lguest_data.hcall_status[next_call] != 0xFF) {
- /* Table full, so do normal hcall which will flush table. */
- hcall(call, arg1, arg2, arg3, arg4);
- } else {
- lguest_data.hcalls[next_call].arg0 = call;
- lguest_data.hcalls[next_call].arg1 = arg1;
- lguest_data.hcalls[next_call].arg2 = arg2;
- lguest_data.hcalls[next_call].arg3 = arg3;
- lguest_data.hcalls[next_call].arg4 = arg4;
- /* Arguments must all be written before we mark it to go */
- wmb();
- lguest_data.hcall_status[next_call] = 0;
- if (++next_call == LHCALL_RING_SIZE)
- next_call = 0;
- }
- local_irq_restore(flags);
-}
-
-/*G:035
- * Notice the lazy_hcall() above, rather than hcall(). This is our first real
- * optimization trick!
- *
- * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
- * them as a batch when lazy_mode is eventually turned off. Because hypercalls
- * are reasonably expensive, batching them up makes sense. For example, a
- * large munmap might update dozens of page table entries: that code calls
- * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
- * lguest_leave_lazy_mode().
- *
- * So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing:
- */
-static void lazy_hcall1(unsigned long call, unsigned long arg1)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, 0, 0, 0);
- else
- async_hcall(call, arg1, 0, 0, 0);
-}
-
-/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
-static void lazy_hcall2(unsigned long call,
- unsigned long arg1,
- unsigned long arg2)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, arg2, 0, 0);
- else
- async_hcall(call, arg1, arg2, 0, 0);
-}
-
-static void lazy_hcall3(unsigned long call,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, arg2, arg3, 0);
- else
- async_hcall(call, arg1, arg2, arg3, 0);
-}
-
-#ifdef CONFIG_X86_PAE
-static void lazy_hcall4(unsigned long call,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3,
- unsigned long arg4)
-{
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- hcall(call, arg1, arg2, arg3, arg4);
- else
- async_hcall(call, arg1, arg2, arg3, arg4);
-}
-#endif
-
-/*G:036
- * When lazy mode is turned off, we issue the do-nothing hypercall to
- * flush any stored calls, and call the generic helper to reset the
- * per-cpu lazy mode variable.
- */
-static void lguest_leave_lazy_mmu_mode(void)
-{
- hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
- paravirt_leave_lazy_mmu();
-}
-
-/*
- * We also catch the end of context switch; we enter lazy mode for much of
- * that too, so again we need to flush here.
- *
- * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
- * mode, but unlike Xen, lguest doesn't care about the difference).
- */
-static void lguest_end_context_switch(struct task_struct *next)
-{
- hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
- paravirt_end_context_switch(next);
-}
-
-/*G:032
- * After that diversion we return to our first native-instruction
- * replacements: four functions for interrupt control.
- *
- * The simplest way of implementing these would be to have "turn interrupts
- * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow:
- * these are by far the most commonly called functions of those we override.
- *
- * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
- * which the Guest can update with a single instruction. The Host knows to
- * check there before it tries to deliver an interrupt.
- */
-
-/*
- * save_flags() is expected to return the processor state (ie. "flags"). The
- * flags word contains all kind of stuff, but in practice Linux only cares
- * about the interrupt flag. Our "save_flags()" just returns that.
- */
-asmlinkage __visible unsigned long lguest_save_fl(void)
-{
- return lguest_data.irq_enabled;
-}
-
-/* Interrupts go off... */
-asmlinkage __visible void lguest_irq_disable(void)
-{
- lguest_data.irq_enabled = 0;
-}
-
-/*
- * Let's pause a moment. Remember how I said these are called so often?
- * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
- * break some rules. In particular, these functions are assumed to save their
- * own registers if they need to: normal C functions assume they can trash the
- * eax register. To use normal C functions, we use
- * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
- * C function, then restores it.
- */
-PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl);
-PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable);
-/*:*/
-
-/* These are in head_32.S */
-extern void lg_irq_enable(void);
-extern void lg_restore_fl(unsigned long flags);
-
-/*M:003
- * We could be more efficient in our checking of outstanding interrupts, rather
- * than using a branch. One way would be to put the "irq_enabled" field in a
- * page by itself, and have the Host write-protect it when an interrupt comes
- * in when irqs are disabled. There will then be a page fault as soon as
- * interrupts are re-enabled.
- *
- * A better method is to implement soft interrupt disable generally for x86:
- * instead of disabling interrupts, we set a flag. If an interrupt does come
- * in, we then disable them for real. This is uncommon, so we could simply use
- * a hypercall for interrupt control and not worry about efficiency.
-:*/
-
-/*G:034
- * The Interrupt Descriptor Table (IDT).
- *
- * The IDT tells the processor what to do when an interrupt comes in. Each
- * entry in the table is a 64-bit descriptor: this holds the privilege level,
- * address of the handler, and... well, who cares? The Guest just asks the
- * Host to make the change anyway, because the Host controls the real IDT.
- */
-static void lguest_write_idt_entry(gate_desc *dt,
- int entrynum, const gate_desc *g)
-{
- /*
- * The gate_desc structure is 8 bytes long: we hand it to the Host in
- * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors
- * around like this; typesafety wasn't a big concern in Linux's early
- * years.
- */
- u32 *desc = (u32 *)g;
- /* Keep the local copy up to date. */
- native_write_idt_entry(dt, entrynum, g);
- /* Tell Host about this new entry. */
- hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0);
-}
-
-/*
- * Changing to a different IDT is very rare: we keep the IDT up-to-date every
- * time it is written, so we can simply loop through all entries and tell the
- * Host about them.
- */
-static void lguest_load_idt(const struct desc_ptr *desc)
-{
- unsigned int i;
- struct desc_struct *idt = (void *)desc->address;
-
- for (i = 0; i < (desc->size+1)/8; i++)
- hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0);
-}
-
-/*
- * The Global Descriptor Table.
- *
- * The Intel architecture defines another table, called the Global Descriptor
- * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt"
- * instruction, and then several other instructions refer to entries in the
- * table. There are three entries which the Switcher needs, so the Host simply
- * controls the entire thing and the Guest asks it to make changes using the
- * LOAD_GDT hypercall.
- *
- * This is the exactly like the IDT code.
- */
-static void lguest_load_gdt(const struct desc_ptr *desc)
-{
- unsigned int i;
- struct desc_struct *gdt = (void *)desc->address;
-
- for (i = 0; i < (desc->size+1)/8; i++)
- hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0);
-}
-
-/*
- * For a single GDT entry which changes, we simply change our copy and
- * then tell the host about it.
- */
-static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
- const void *desc, int type)
-{
- native_write_gdt_entry(dt, entrynum, desc, type);
- /* Tell Host about this new entry. */
- hcall(LHCALL_LOAD_GDT_ENTRY, entrynum,
- dt[entrynum].a, dt[entrynum].b, 0);
-}
-
-/*
- * There are three "thread local storage" GDT entries which change
- * on every context switch (these three entries are how glibc implements
- * __thread variables). As an optimization, we have a hypercall
- * specifically for this case.
- *
- * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall
- * which took a range of entries?
- */
-static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- /*
- * There's one problem which normal hardware doesn't have: the Host
- * can't handle us removing entries we're currently using. So we clear
- * the GS register here: if it's needed it'll be reloaded anyway.
- */
- lazy_load_gs(0);
- lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
-}
-
-/*G:038
- * That's enough excitement for now, back to ploughing through each of the
- * different pv_ops structures (we're about 1/3 of the way through).
- *
- * This is the Local Descriptor Table, another weird Intel thingy. Linux only
- * uses this for some strange applications like Wine. We don't do anything
- * here, so they'll get an informative and friendly Segmentation Fault.
- */
-static void lguest_set_ldt(const void *addr, unsigned entries)
-{
-}
-
-/*
- * This loads a GDT entry into the "Task Register": that entry points to a
- * structure called the Task State Segment. Some comments scattered though the
- * kernel code indicate that this used for task switching in ages past, along
- * with blood sacrifice and astrology.
- *
- * Now there's nothing interesting in here that we don't get told elsewhere.
- * But the native version uses the "ltr" instruction, which makes the Host
- * complain to the Guest about a Segmentation Fault and it'll oops. So we
- * override the native version with a do-nothing version.
- */
-static void lguest_load_tr_desc(void)
-{
-}
-
-/*
- * The "cpuid" instruction is a way of querying both the CPU identity
- * (manufacturer, model, etc) and its features. It was introduced before the
- * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
- * As you might imagine, after a decade and a half this treatment, it is now a
- * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
- *
- * This instruction even it has its own Wikipedia entry. The Wikipedia entry
- * has been translated into 6 languages. I am not making this up!
- *
- * We could get funky here and identify ourselves as "GenuineLguest", but
- * instead we just use the real "cpuid" instruction. Then I pretty much turned
- * off feature bits until the Guest booted. (Don't say that: you'll damage
- * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
- * hardly future proof.) No one's listening! They don't like you anyway,
- * parenthetic weirdo!
- *
- * Replacing the cpuid so we can turn features off is great for the kernel, but
- * anyone (including userspace) can just use the raw "cpuid" instruction and
- * the Host won't even notice since it isn't privileged. So we try not to get
- * too worked up about it.
- */
-static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
- unsigned int *cx, unsigned int *dx)
-{
- int function = *ax;
-
- native_cpuid(ax, bx, cx, dx);
- switch (function) {
- /*
- * CPUID 0 gives the highest legal CPUID number (and the ID string).
- * We futureproof our code a little by sticking to known CPUID values.
- */
- case 0:
- if (*ax > 5)
- *ax = 5;
- break;
-
- /*
- * CPUID 1 is a basic feature request.
- *
- * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
- * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
- */
- case 1:
- *cx &= 0x00002201;
- *dx &= 0x07808151;
- /*
- * The Host can do a nice optimization if it knows that the
- * kernel mappings (addresses above 0xC0000000 or whatever
- * PAGE_OFFSET is set to) haven't changed. But Linux calls
- * flush_tlb_user() for both user and kernel mappings unless
- * the Page Global Enable (PGE) feature bit is set.
- */
- *dx |= 0x00002000;
- /*
- * We also lie, and say we're family id 5. 6 or greater
- * leads to a rdmsr in early_init_intel which we can't handle.
- * Family ID is returned as bits 8-12 in ax.
- */
- *ax &= 0xFFFFF0FF;
- *ax |= 0x00000500;
- break;
-
- /*
- * This is used to detect if we're running under KVM. We might be,
- * but that's a Host matter, not us. So say we're not.
- */
- case KVM_CPUID_SIGNATURE:
- *bx = *cx = *dx = 0;
- break;
-
- /*
- * 0x80000000 returns the highest Extended Function, so we futureproof
- * like we do above by limiting it to known fields.
- */
- case 0x80000000:
- if (*ax > 0x80000008)
- *ax = 0x80000008;
- break;
-
- /*
- * PAE systems can mark pages as non-executable. Linux calls this the
- * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
- * Virus Protection). We just switch it off here, since we don't
- * support it.
- */
- case 0x80000001:
- *dx &= ~(1 << 20);
- break;
- }
-}
-
-/*
- * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
- * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
- * it. The Host needs to know when the Guest wants to change them, so we have
- * a whole series of functions like read_cr0() and write_cr0().
- *
- * We start with cr0. cr0 allows you to turn on and off all kinds of basic
- * features, but the only cr0 bit that Linux ever used at runtime was the
- * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8)
- *
- * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if
- * the floating point unit is used. Which allows us to restore FPU state
- * lazily after a task switch if we wanted to, but wouldn't a name like
- * "FPUTRAP bit" be a little less cryptic?
- *
- * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore
- * cr0.
- */
-static void lguest_write_cr0(unsigned long val)
-{
-}
-
-static unsigned long lguest_read_cr0(void)
-{
- return 0;
-}
-
-/*
- * cr2 is the virtual address of the last page fault, which the Guest only ever
- * reads. The Host kindly writes this into our "struct lguest_data", so we
- * just read it out of there.
- */
-static unsigned long lguest_read_cr2(void)
-{
- return lguest_data.cr2;
-}
-
-/* See lguest_set_pte() below. */
-static bool cr3_changed = false;
-static unsigned long current_cr3;
-
-/*
- * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0. Keep a local copy, and tell the Host when it changes.
- */
-static void lguest_write_cr3(unsigned long cr3)
-{
- lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
- current_cr3 = cr3;
-
- /* These two page tables are simple, linear, and used during boot */
- if (cr3 != __pa_symbol(swapper_pg_dir) &&
- cr3 != __pa_symbol(initial_page_table))
- cr3_changed = true;
-}
-
-static unsigned long lguest_read_cr3(void)
-{
- return current_cr3;
-}
-
-/* cr4 is used to enable and disable PGE, but we don't care. */
-static unsigned long lguest_read_cr4(void)
-{
- return 0;
-}
-
-static void lguest_write_cr4(unsigned long val)
-{
-}
-
-/*
- * Page Table Handling.
- *
- * Now would be a good time to take a rest and grab a coffee or similarly
- * relaxing stimulant. The easy parts are behind us, and the trek gradually
- * winds uphill from here.
- *
- * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU
- * maps virtual addresses to physical addresses using "page tables". We could
- * use one huge index of 1 million entries: each address is 4 bytes, so that's
- * 1024 pages just to hold the page tables. But since most virtual addresses
- * are unused, we use a two level index which saves space. The cr3 register
- * contains the physical address of the top level "page directory" page, which
- * contains physical addresses of up to 1024 second-level pages. Each of these
- * second level pages contains up to 1024 physical addresses of actual pages,
- * or Page Table Entries (PTEs).
- *
- * Here's a diagram, where arrows indicate physical addresses:
- *
- * cr3 ---> +---------+
- * | --------->+---------+
- * | | | PADDR1 |
- * Mid-level | | PADDR2 |
- * (PMD) page | | |
- * | | Lower-level |
- * | | (PTE) page |
- * | | | |
- * .... ....
- *
- * So to convert a virtual address to a physical address, we look up the top
- * level, which points us to the second level, which gives us the physical
- * address of that page. If the top level entry was not present, or the second
- * level entry was not present, then the virtual address is invalid (we
- * say "the page was not mapped").
- *
- * Put another way, a 32-bit virtual address is divided up like so:
- *
- * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
- * Index into top Index into second Offset within page
- * page directory page pagetable page
- *
- * Now, unfortunately, this isn't the whole story: Intel added Physical Address
- * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
- * These are held in 64-bit page table entries, so we can now only fit 512
- * entries in a page, and the neat three-level tree breaks down.
- *
- * The result is a four level page table:
- *
- * cr3 --> [ 4 Upper ]
- * [ Level ]
- * [ Entries ]
- * [(PUD Page)]---> +---------+
- * | --------->+---------+
- * | | | PADDR1 |
- * Mid-level | | PADDR2 |
- * (PMD) page | | |
- * | | Lower-level |
- * | | (PTE) page |
- * | | | |
- * .... ....
- *
- *
- * And the virtual address is decoded as:
- *
- * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
- * Index into Index into mid Index into lower Offset within page
- * top entries directory page pagetable page
- *
- * It's too hard to switch between these two formats at runtime, so Linux only
- * supports one or the other depending on whether CONFIG_X86_PAE is set. Many
- * distributions turn it on, and not just for people with silly amounts of
- * memory: the larger PTE entries allow room for the NX bit, which lets the
- * kernel disable execution of pages and increase security.
- *
- * This was a problem for lguest, which couldn't run on these distributions;
- * then Matias Zabaljauregui figured it all out and implemented it, and only a
- * handful of puppies were crushed in the process!
- *
- * Back to our point: the kernel spends a lot of time changing both the
- * top-level page directory and lower-level pagetable pages. The Guest doesn't
- * know physical addresses, so while it maintains these page tables exactly
- * like normal, it also needs to keep the Host informed whenever it makes a
- * change: the Host will create the real page tables based on the Guests'.
- */
-
-/*
- * The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space. We tell the Host the toplevel and
- * address this corresponds to. The Guest uses one pagetable per process, so
- * we need to tell the Host which one we're changing (mm->pgd).
- */
-static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
-#ifdef CONFIG_X86_PAE
- /* PAE needs to hand a 64 bit page table entry, so it uses two args. */
- lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
- ptep->pte_low, ptep->pte_high);
-#else
- lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
-#endif
-}
-
-/* This is the "set and update" combo-meal-deal version. */
-static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
-{
- native_set_pte(ptep, pteval);
- lguest_pte_update(mm, addr, ptep);
-}
-
-/*
- * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
- * to set a middle-level entry when PAE is activated.
- *
- * Again, we set the entry then tell the Host which page we changed,
- * and the index of the entry we changed.
- */
-#ifdef CONFIG_X86_PAE
-static void lguest_set_pud(pud_t *pudp, pud_t pudval)
-{
- native_set_pud(pudp, pudval);
-
- /* 32 bytes aligned pdpt address and the index. */
- lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
- (__pa(pudp) & 0x1F) / sizeof(pud_t));
-}
-
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
- native_set_pmd(pmdp, pmdval);
- lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
- (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#else
-
-/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
- native_set_pmd(pmdp, pmdval);
- lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
- (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#endif
-
-/*
- * There are a couple of legacy places where the kernel sets a PTE, but we
- * don't know the top level any more. This is useless for us, since we don't
- * know which pagetable is changing or what address, so we just tell the Host
- * to forget all of them. Fortunately, this is very rare.
- *
- * ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow: 48 seconds! So we don't even tell
- * the Host anything changed until we've done the first real page table switch,
- * which brings boot back to 4.3 seconds.
- */
-static void lguest_set_pte(pte_t *ptep, pte_t pteval)
-{
- native_set_pte(ptep, pteval);
- if (cr3_changed)
- lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * With 64-bit PTE values, we need to be careful setting them: if we set 32
- * bits at a time, the hardware could see a weird half-set entry. These
- * versions ensure we update all 64 bits at once.
- */
-static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
- native_set_pte_atomic(ptep, pte);
- if (cr3_changed)
- lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- native_pte_clear(mm, addr, ptep);
- lguest_pte_update(mm, addr, ptep);
-}
-
-static void lguest_pmd_clear(pmd_t *pmdp)
-{
- lguest_set_pmd(pmdp, __pmd(0));
-}
-#endif
-
-/*
- * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
- * native page table operations. On native hardware you can set a new page
- * table entry whenever you want, but if you want to remove one you have to do
- * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
- *
- * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only
- * called when a valid entry is written, not when it's removed (ie. marked not
- * present). Instead, this is where we come when the Guest wants to remove a
- * page table entry: we tell the Host to set that entry to 0 (ie. the present
- * bit is zero).
- */
-static void lguest_flush_tlb_single(unsigned long addr)
-{
- /* Simply set it to zero: if it was not, it will fault back in. */
- lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
-}
-
-/*
- * This is what happens after the Guest has removed a large number of entries.
- * This tells the Host that any of the page table entries for userspace might
- * have changed, ie. virtual addresses below PAGE_OFFSET.
- */
-static void lguest_flush_tlb_user(void)
-{
- lazy_hcall1(LHCALL_FLUSH_TLB, 0);
-}
-
-/*
- * This is called when the kernel page tables have changed. That's not very
- * common (unless the Guest is using highmem, which makes the Guest extremely
- * slow), so it's worth separating this from the user flushing above.
- */
-static void lguest_flush_tlb_kernel(void)
-{
- lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-/*
- * The Unadvanced Programmable Interrupt Controller.
- *
- * This is an attempt to implement the simplest possible interrupt controller.
- * I spent some time looking though routines like set_irq_chip_and_handler,
- * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and
- * I *think* this is as simple as it gets.
- *
- * We can tell the Host what interrupts we want blocked ready for using the
- * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as
- * simple as setting a bit. We don't actually "ack" interrupts as such, we
- * just mask and unmask them. I wonder if we should be cleverer?
- */
-static void disable_lguest_irq(struct irq_data *data)
-{
- set_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-static void enable_lguest_irq(struct irq_data *data)
-{
- clear_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-/* This structure describes the lguest IRQ controller. */
-static struct irq_chip lguest_irq_controller = {
- .name = "lguest",
- .irq_mask = disable_lguest_irq,
- .irq_mask_ack = disable_lguest_irq,
- .irq_unmask = enable_lguest_irq,
-};
-
-/*
- * Interrupt descriptors are allocated as-needed, but low-numbered ones are
- * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it
- * tells us the irq is already used: other errors (ie. ENOMEM) we take
- * seriously.
- */
-static int lguest_setup_irq(unsigned int irq)
-{
- struct irq_desc *desc;
- int err;
-
- /* Returns -ve error or vector number. */
- err = irq_alloc_desc_at(irq, 0);
- if (err < 0 && err != -EEXIST)
- return err;
-
- /*
- * Tell the Linux infrastructure that the interrupt is
- * controlled by our level-based lguest interrupt controller.
- */
- irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
- handle_level_irq, "level");
-
- /* Some systems map "vectors" to interrupts weirdly. Not us! */
- desc = irq_to_desc(irq);
- __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc);
- return 0;
-}
-
-static int lguest_enable_irq(struct pci_dev *dev)
-{
- int err;
- u8 line = 0;
-
- /* We literally use the PCI interrupt line as the irq number. */
- pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
- err = lguest_setup_irq(line);
- if (!err)
- dev->irq = line;
- return err;
-}
-
-/* We don't do hotplug PCI, so this shouldn't be called. */
-static void lguest_disable_irq(struct pci_dev *dev)
-{
- WARN_ON(1);
-}
-
-/*
- * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls).
- */
-static void __init lguest_init_IRQ(void)
-{
- unsigned int i;
-
- for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
- if (i != IA32_SYSCALL_VECTOR)
- set_intr_gate(i, irq_entries_start +
- 8 * (i - FIRST_EXTERNAL_VECTOR));
- }
-
- /*
- * This call is required to set up for 4k stacks, where we have
- * separate stacks for hard and soft interrupts.
- */
- irq_ctx_init(smp_processor_id());
-}
-
-/*
- * Time.
- *
- * It would be far better for everyone if the Guest had its own clock, but
- * until then the Host gives us the time on every interrupt.
- */
-static void lguest_get_wallclock(struct timespec *now)
-{
- *now = lguest_data.time;
-}
-
-/*
- * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us
- * what speed it runs at, or 0 if it's unusable as a reliable clock source.
- * This matches what we want here: if we return 0 from this function, the x86
- * TSC clock will give up and not register itself.
- */
-static unsigned long lguest_tsc_khz(void)
-{
- return lguest_data.tsc_khz;
-}
-
-/*
- * If we can't use the TSC, the kernel falls back to our lower-priority
- * "lguest_clock", where we read the time value given to us by the Host.
- */
-static u64 lguest_clock_read(struct clocksource *cs)
-{
- unsigned long sec, nsec;
-
- /*
- * Since the time is in two parts (seconds and nanoseconds), we risk
- * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
- * and getting 99 and 0. As Linux tends to come apart under the stress
- * of time travel, we must be careful:
- */
- do {
- /* First we read the seconds part. */
- sec = lguest_data.time.tv_sec;
- /*
- * This read memory barrier tells the compiler and the CPU that
- * this can't be reordered: we have to complete the above
- * before going on.
- */
- rmb();
- /* Now we read the nanoseconds part. */
- nsec = lguest_data.time.tv_nsec;
- /* Make sure we've done that. */
- rmb();
- /* Now if the seconds part has changed, try again. */
- } while (unlikely(lguest_data.time.tv_sec != sec));
-
- /* Our lguest clock is in real nanoseconds. */
- return sec*1000000000ULL + nsec;
-}
-
-/* This is the fallback clocksource: lower priority than the TSC clocksource. */
-static struct clocksource lguest_clock = {
- .name = "lguest",
- .rating = 200,
- .read = lguest_clock_read,
- .mask = CLOCKSOURCE_MASK(64),
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-/*
- * We also need a "struct clock_event_device": Linux asks us to set it to go
- * off some time in the future. Actually, James Morris figured all this out, I
- * just applied the patch.
- */
-static int lguest_clockevent_set_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- /* FIXME: I don't think this can ever happen, but James tells me he had
- * to put this code in. Maybe we should remove it now. Anyone? */
- if (delta < LG_CLOCK_MIN_DELTA) {
- if (printk_ratelimit())
- printk(KERN_DEBUG "%s: small delta %lu ns\n",
- __func__, delta);
- return -ETIME;
- }
-
- /* Please wake us this far in the future. */
- hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0);
- return 0;
-}
-
-static int lguest_clockevent_shutdown(struct clock_event_device *evt)
-{
- /* A 0 argument shuts the clock down. */
- hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
- return 0;
-}
-
-/* This describes our primitive timer chip. */
-static struct clock_event_device lguest_clockevent = {
- .name = "lguest",
- .features = CLOCK_EVT_FEAT_ONESHOT,
- .set_next_event = lguest_clockevent_set_next_event,
- .set_state_shutdown = lguest_clockevent_shutdown,
- .rating = INT_MAX,
- .mult = 1,
- .shift = 0,
- .min_delta_ns = LG_CLOCK_MIN_DELTA,
- .min_delta_ticks = LG_CLOCK_MIN_DELTA,
- .max_delta_ns = LG_CLOCK_MAX_DELTA,
- .max_delta_ticks = LG_CLOCK_MAX_DELTA,
-};
-
-/*
- * This is the Guest timer interrupt handler (hardware interrupt 0). We just
- * call the clockevent infrastructure and it does whatever needs doing.
- */
-static void lguest_time_irq(struct irq_desc *desc)
-{
- unsigned long flags;
-
- /* Don't interrupt us while this is running. */
- local_irq_save(flags);
- lguest_clockevent.event_handler(&lguest_clockevent);
- local_irq_restore(flags);
-}
-
-/*
- * At some point in the boot process, we get asked to set up our timing
- * infrastructure. The kernel doesn't expect timer interrupts before this, but
- * we cleverly initialized the "blocked_interrupts" field of "struct
- * lguest_data" so that timer interrupts were blocked until now.
- */
-static void lguest_time_init(void)
-{
- /* Set up the timer interrupt (0) to go to our simple timer routine */
- if (lguest_setup_irq(0) != 0)
- panic("Could not set up timer irq");
- irq_set_handler(0, lguest_time_irq);
-
- clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
-
- /* We can't set cpumask in the initializer: damn C limitations! Set it
- * here and register our timer device. */
- lguest_clockevent.cpumask = cpumask_of(0);
- clockevents_register_device(&lguest_clockevent);
-
- /* Finally, we unblock the timer interrupt. */
- clear_bit(0, lguest_data.blocked_interrupts);
-}
-
-/*
- * Miscellaneous bits and pieces.
- *
- * Here is an oddball collection of functions which the Guest needs for things
- * to work. They're pretty simple.
- */
-
-/*
- * The Guest needs to tell the Host what stack it expects traps to use. For
- * native hardware, this is part of the Task State Segment mentioned above in
- * lguest_load_tr_desc(), but to help hypervisors there's this special call.
- *
- * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
- * segment), the privilege level (we're privilege level 1, the Host is 0 and
- * will not tolerate us trying to use that), the stack pointer, and the number
- * of pages in the stack.
- */
-static void lguest_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
- THREAD_SIZE / PAGE_SIZE);
- tss->x86_tss.sp0 = thread->sp0;
-}
-
-/* Let's just say, I wouldn't do debugging under a Guest. */
-static unsigned long lguest_get_debugreg(int regno)
-{
- /* FIXME: Implement */
- return 0;
-}
-
-static void lguest_set_debugreg(int regno, unsigned long value)
-{
- /* FIXME: Implement */
-}
-
-/*
- * There are times when the kernel wants to make sure that no memory writes are
- * caught in the cache (that they've all reached real hardware devices). This
- * doesn't matter for the Guest which has virtual hardware.
- *
- * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush
- * (clflush) instruction is available and the kernel uses that. Otherwise, it
- * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction.
- * Unlike clflush, wbinvd can only be run at privilege level 0. So we can
- * ignore clflush, but replace wbinvd.
- */
-static void lguest_wbinvd(void)
-{
-}
-
-/*
- * If the Guest expects to have an Advanced Programmable Interrupt Controller,
- * we play dumb by ignoring writes and returning 0 for reads. So it's no
- * longer Programmable nor Controlling anything, and I don't think 8 lines of
- * code qualifies for Advanced. It will also never interrupt anything. It
- * does, however, allow us to get through the Linux boot code.
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(u32 reg, u32 v)
-{
-}
-
-static u32 lguest_apic_read(u32 reg)
-{
- return 0;
-}
-
-static u64 lguest_apic_icr_read(void)
-{
- return 0;
-}
-
-static void lguest_apic_icr_write(u32 low, u32 id)
-{
- /* Warn to see if there's any stray references */
- WARN_ON(1);
-}
-
-static void lguest_apic_wait_icr_idle(void)
-{
- return;
-}
-
-static u32 lguest_apic_safe_wait_icr_idle(void)
-{
- return 0;
-}
-
-static void set_lguest_basic_apic_ops(void)
-{
- apic->read = lguest_apic_read;
- apic->write = lguest_apic_write;
- apic->icr_read = lguest_apic_icr_read;
- apic->icr_write = lguest_apic_icr_write;
- apic->wait_icr_idle = lguest_apic_wait_icr_idle;
- apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle;
-};
-#endif
-
-/* STOP! Until an interrupt comes in. */
-static void lguest_safe_halt(void)
-{
- hcall(LHCALL_HALT, 0, 0, 0, 0);
-}
-
-/*
- * The SHUTDOWN hypercall takes a string to describe what's happening, and
- * an argument which says whether this to restart (reboot) the Guest or not.
- *
- * Note that the Host always prefers that the Guest speak in physical addresses
- * rather than virtual addresses, so we use __pa() here.
- */
-static void lguest_power_off(void)
-{
- hcall(LHCALL_SHUTDOWN, __pa("Power down"),
- LGUEST_SHUTDOWN_POWEROFF, 0, 0);
-}
-
-/*
- * Panicing.
- *
- * Don't. But if you did, this is what happens.
- */
-static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
-{
- hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0);
- /* The hcall won't return, but to keep gcc happy, we're "done". */
- return NOTIFY_DONE;
-}
-
-static struct notifier_block paniced = {
- .notifier_call = lguest_panic
-};
-
-/* Setting up memory is fairly easy. */
-static __init char *lguest_memory_setup(void)
-{
- /*
- * The Linux bootloader header contains an "e820" memory map: the
- * Launcher populated the first entry with our memory limit.
- */
- e820__range_add(boot_params.e820_table[0].addr,
- boot_params.e820_table[0].size,
- boot_params.e820_table[0].type);
-
- /* This string is for the boot messages. */
- return "LGUEST";
-}
-
-/* Offset within PCI config space of BAR access capability. */
-static int console_cfg_offset = 0;
-static int console_access_cap;
-
-/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */
-static void set_cfg_window(u32 cfg_offset, u32 off)
-{
- write_pci_config_byte(0, 1, 0,
- cfg_offset + offsetof(struct virtio_pci_cap, bar),
- 0);
- write_pci_config(0, 1, 0,
- cfg_offset + offsetof(struct virtio_pci_cap, length),
- 4);
- write_pci_config(0, 1, 0,
- cfg_offset + offsetof(struct virtio_pci_cap, offset),
- off);
-}
-
-static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val)
-{
- /*
- * We could set this up once, then leave it; nothing else in the *
- * kernel should touch these registers. But if it went wrong, that
- * would be a horrible bug to find.
- */
- set_cfg_window(cfg_offset, off);
- write_pci_config(0, 1, 0,
- cfg_offset + sizeof(struct virtio_pci_cap), val);
-}
-
-static void probe_pci_console(void)
-{
- u8 cap, common_cap = 0, device_cap = 0;
- u32 device_len;
-
- /* Avoid recursive printk into here. */
- console_cfg_offset = -1;
-
- if (!early_pci_allowed()) {
- printk(KERN_ERR "lguest: early PCI access not allowed!\n");
- return;
- }
-
- /* We expect a console PCI device at BUS0, slot 1. */
- if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) {
- printk(KERN_ERR "lguest: PCI device is %#x!\n",
- read_pci_config(0, 1, 0, 0));
- return;
- }
-
- /* Find the capabilities we need (must be in bar0) */
- cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST);
- while (cap) {
- u8 vndr = read_pci_config_byte(0, 1, 0, cap);
- if (vndr == PCI_CAP_ID_VNDR) {
- u8 type, bar;
-
- type = read_pci_config_byte(0, 1, 0,
- cap + offsetof(struct virtio_pci_cap, cfg_type));
- bar = read_pci_config_byte(0, 1, 0,
- cap + offsetof(struct virtio_pci_cap, bar));
-
- switch (type) {
- case VIRTIO_PCI_CAP_DEVICE_CFG:
- if (bar == 0)
- device_cap = cap;
- break;
- case VIRTIO_PCI_CAP_PCI_CFG:
- console_access_cap = cap;
- break;
- }
- }
- cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT);
- }
- if (!device_cap || !console_access_cap) {
- printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n",
- common_cap, device_cap, console_access_cap);
- return;
- }
-
- /*
- * Note that we can't check features, until we've set the DRIVER
- * status bit. We don't want to do that until we have a real driver,
- * so we just check that the device-specific config has room for
- * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE
- * it should ignore the access.
- */
- device_len = read_pci_config(0, 1, 0,
- device_cap + offsetof(struct virtio_pci_cap, length));
- if (device_len < (offsetof(struct virtio_console_config, emerg_wr)
- + sizeof(u32))) {
- printk(KERN_ERR "lguest: console missing emerg_wr field\n");
- return;
- }
-
- console_cfg_offset = read_pci_config(0, 1, 0,
- device_cap + offsetof(struct virtio_pci_cap, offset));
- printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n");
-}
-
-/*
- * We will eventually use the virtio console device to produce console output,
- * but before that is set up we use the virtio PCI console's backdoor mmio
- * access and the "emergency" write facility (which is legal even before the
- * device is configured).
- */
-static __init int early_put_chars(u32 vtermno, const char *buf, int count)
-{
- /* If we couldn't find PCI console, forget it. */
- if (console_cfg_offset < 0)
- return count;
-
- if (unlikely(!console_cfg_offset)) {
- probe_pci_console();
- if (console_cfg_offset < 0)
- return count;
- }
-
- write_bar_via_cfg(console_access_cap,
- console_cfg_offset
- + offsetof(struct virtio_console_config, emerg_wr),
- buf[0]);
- return 1;
-}
-
-/*
- * Rebooting also tells the Host we're finished, but the RESTART flag tells the
- * Launcher to reboot us.
- */
-static void lguest_restart(char *reason)
-{
- hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0);
-}
-
-/*G:050
- * Patching (Powerfully Placating Performance Pedants)
- *
- * We have already seen that pv_ops structures let us replace simple native
- * instructions with calls to the appropriate back end all throughout the
- * kernel. This allows the same kernel to run as a Guest and as a native
- * kernel, but it's slow because of all the indirect branches.
- *
- * Remember that David Wheeler quote about "Any problem in computer science can
- * be solved with another layer of indirection"? The rest of that quote is
- * "... But that usually will create another problem." This is the first of
- * those problems.
- *
- * Our current solution is to allow the paravirt back end to optionally patch
- * over the indirect calls to replace them with something more efficient. We
- * patch two of the simplest of the most commonly called functions: disable
- * interrupts and save interrupts. We usually have 6 or 10 bytes to patch
- * into: the Guest versions of these operations are small enough that we can
- * fit comfortably.
- *
- * First we need assembly templates of each of the patchable Guest operations,
- * and these are in head_32.S.
- */
-
-/*G:060 We construct a table from the assembler templates: */
-static const struct lguest_insns
-{
- const char *start, *end;
-} lguest_insns[] = {
- [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
- [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
-};
-
-/*
- * Now our patch routine is fairly simple (based on the native one in
- * paravirt.c). If we have a replacement, we copy it in and return how much of
- * the available space we used.
- */
-static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
- unsigned long addr, unsigned len)
-{
- unsigned int insn_len;
-
- /* Don't do anything special if we don't have a replacement */
- if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
- return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
- insn_len = lguest_insns[type].end - lguest_insns[type].start;
-
- /* Similarly if it can't fit (doesn't happen, but let's be thorough). */
- if (len < insn_len)
- return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
- /* Copy in our instructions. */
- memcpy(ibuf, lguest_insns[type].start, insn_len);
- return insn_len;
-}
-
-/*G:029
- * Once we get to lguest_init(), we know we're a Guest. The various
- * pv_ops structures in the kernel provide points for (almost) every routine we
- * have to override to avoid privileged instructions.
- */
-__init void lguest_init(void)
-{
- /* We're under lguest. */
- pv_info.name = "lguest";
- /* We're running at privilege level 1, not 0 as normal. */
- pv_info.kernel_rpl = 1;
- /* Everyone except Xen runs with this set. */
- pv_info.shared_kernel_pmd = 1;
-
- /*
- * We set up all the lguest overrides for sensitive operations. These
- * are detailed with the operations themselves.
- */
-
- /* Interrupt-related operations */
- pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl);
- pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
- pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable);
- pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
- pv_irq_ops.safe_halt = lguest_safe_halt;
-
- /* Setup operations */
- pv_init_ops.patch = lguest_patch;
-
- /* Intercepts of various CPU instructions */
- pv_cpu_ops.load_gdt = lguest_load_gdt;
- pv_cpu_ops.cpuid = lguest_cpuid;
- pv_cpu_ops.load_idt = lguest_load_idt;
- pv_cpu_ops.iret = lguest_iret;
- pv_cpu_ops.load_sp0 = lguest_load_sp0;
- pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
- pv_cpu_ops.set_ldt = lguest_set_ldt;
- pv_cpu_ops.load_tls = lguest_load_tls;
- pv_cpu_ops.get_debugreg = lguest_get_debugreg;
- pv_cpu_ops.set_debugreg = lguest_set_debugreg;
- pv_cpu_ops.read_cr0 = lguest_read_cr0;
- pv_cpu_ops.write_cr0 = lguest_write_cr0;
- pv_cpu_ops.read_cr4 = lguest_read_cr4;
- pv_cpu_ops.write_cr4 = lguest_write_cr4;
- pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
- pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
- pv_cpu_ops.wbinvd = lguest_wbinvd;
- pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
- pv_cpu_ops.end_context_switch = lguest_end_context_switch;
-
- /* Pagetable management */
- pv_mmu_ops.write_cr3 = lguest_write_cr3;
- pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
- pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
- pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
- pv_mmu_ops.set_pte = lguest_set_pte;
- pv_mmu_ops.set_pte_at = lguest_set_pte_at;
- pv_mmu_ops.set_pmd = lguest_set_pmd;
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
- pv_mmu_ops.pte_clear = lguest_pte_clear;
- pv_mmu_ops.pmd_clear = lguest_pmd_clear;
- pv_mmu_ops.set_pud = lguest_set_pud;
-#endif
- pv_mmu_ops.read_cr2 = lguest_read_cr2;
- pv_mmu_ops.read_cr3 = lguest_read_cr3;
- pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
- pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu;
- pv_mmu_ops.pte_update = lguest_pte_update;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /* APIC read/write intercepts */
- set_lguest_basic_apic_ops();
-#endif
-
- x86_init.resources.memory_setup = lguest_memory_setup;
- x86_init.irqs.intr_init = lguest_init_IRQ;
- x86_init.timers.timer_init = lguest_time_init;
- x86_platform.calibrate_tsc = lguest_tsc_khz;
- x86_platform.get_wallclock = lguest_get_wallclock;
-
- /*
- * Now is a good time to look at the implementations of these functions
- * before returning to the rest of lguest_init().
- */
-
- /*G:070
- * Now we've seen all the paravirt_ops, we return to
- * lguest_init() where the rest of the fairly chaotic boot setup
- * occurs.
- */
-
- /*
- * The stack protector is a weird thing where gcc places a canary
- * value on the stack and then checks it on return. This file is
- * compiled with -fno-stack-protector it, so we got this far without
- * problems. The value of the canary is kept at offset 20 from the
- * %gs register, so we need to set that up before calling C functions
- * in other files.
- */
- setup_stack_canary_segment(0);
-
- /*
- * We could just call load_stack_canary_segment(), but we might as well
- * call switch_to_new_gdt() which loads the whole table and sets up the
- * per-cpu segment descriptor register %fs as well.
- */
- switch_to_new_gdt(0);
-
- /*
- * The Host<->Guest Switcher lives at the top of our address space, and
- * the Host told us how big it is when we made LGUEST_INIT hypercall:
- * it put the answer in lguest_data.reserve_mem
- */
- reserve_top_address(lguest_data.reserve_mem);
-
- /* Hook in our special panic hypercall code. */
- atomic_notifier_chain_register(&panic_notifier_list, &paniced);
-
- /*
- * This is messy CPU setup stuff which the native boot code does before
- * start_kernel, so we have to do, too:
- */
- cpu_detect(&new_cpu_data);
- /* head.S usually sets up the first capability word, so do it here. */
- new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
-
- /* Math is always hard! */
- set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
-
- /* We don't have features. We have puppies! Puppies! */
-#ifdef CONFIG_X86_MCE
- mca_cfg.disabled = true;
-#endif
-#ifdef CONFIG_ACPI
- acpi_disabled = 1;
-#endif
-
- /*
- * We set the preferred console to "hvc". This is the "hypervisor
- * virtual console" driver written by the PowerPC people, which we also
- * adapted for lguest's use.
- */
- add_preferred_console("hvc", 0, NULL);
-
- /* Register our very early console. */
- virtio_cons_early_init(early_put_chars);
-
- /* Don't let ACPI try to control our PCI interrupts. */
- disable_acpi();
-
- /* We control them ourselves, by overriding these two hooks. */
- pcibios_enable_irq = lguest_enable_irq;
- pcibios_disable_irq = lguest_disable_irq;
-
- /*
- * Last of all, we set the power management poweroff hook to point to
- * the Guest routine to power off, and the reboot hook to our restart
- * routine.
- */
- pm_power_off = lguest_power_off;
- machine_ops.restart = lguest_restart;
-
- /*
- * Now we're set up, call i386_start_kernel() in head32.c and we proceed
- * to boot as normal. It never returns.
- */
- i386_start_kernel();
-}
-/*
- * This marks the end of stage II of our journey, The Guest.
- *
- * It is now time for us to explore the layer of virtual drivers and complete
- * our understanding of the Guest in "make Drivers".
- */
+++ /dev/null
-#include <linux/linkage.h>
-#include <linux/lguest.h>
-#include <asm/lguest_hcall.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/processor-flags.h>
-
-/*G:020
-
- * Our story starts with the bzImage: booting starts at startup_32 in
- * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real
- * kernel in place and then jumps into it: startup_32 in
- * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi
- * register, which is created by the bootloader (the Launcher in our case).
- *
- * The startup_32 function does very little: it clears the uninitialized global
- * C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe, and populates some initial
- * page tables. Finally it checks the 'hardware_subarch' field. This was
- * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
- * assigned number), then it calls us here.
- *
- * WARNING: be very careful here! We're running at addresses equal to physical
- * addresses (around 0), not above PAGE_OFFSET as most code expects
- * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
- * data without remembering to subtract __PAGE_OFFSET!
- *
- * The .section line puts this code in .init.text so it will be discarded after
- * boot.
- */
-.section .init.text, "ax", @progbits
-ENTRY(lguest_entry)
- /*
- * We make the "initialization" hypercall now to tell the Host where
- * our lguest_data struct is.
- */
- movl $LHCALL_LGUEST_INIT, %eax
- movl $lguest_data - __PAGE_OFFSET, %ebx
- int $LGUEST_TRAP_ENTRY
-
- /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
- movl $LHCALL_NEW_PGTABLE, %eax
- movl $(initial_page_table - __PAGE_OFFSET), %ebx
- int $LGUEST_TRAP_ENTRY
-
- /* Set up the initial stack so we can run C code. */
- movl $(init_thread_union+THREAD_SIZE),%esp
-
- /* Jumps are relative: we're running __PAGE_OFFSET too low. */
- jmp lguest_init+__PAGE_OFFSET
-
-/*G:055
- * We create a macro which puts the assembler code between lgstart_ and lgend_
- * markers. These templates are put in the .text section: they can't be
- * discarded after boot as we may need to patch modules, too.
- */
-.text
-#define LGUEST_PATCH(name, insns...) \
- lgstart_##name: insns; lgend_##name:; \
- .globl lgstart_##name; .globl lgend_##name
-
-LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-
-/*G:033
- * But using those wrappers is inefficient (we'll see why that doesn't matter
- * for save_fl and irq_disable later). If we write our routines carefully in
- * assembler, we can avoid clobbering any registers and avoid jumping through
- * the wrapper functions.
- *
- * I skipped over our first piece of assembler, but this one is worth studying
- * in a bit more detail so I'll describe in easy stages. First, the routine to
- * enable interrupts:
- */
-ENTRY(lg_irq_enable)
- /*
- * The reverse of irq_disable, this sets lguest_data.irq_enabled to
- * X86_EFLAGS_IF (ie. "Interrupts enabled").
- */
- movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
- /*
- * But now we need to check if the Host wants to know: there might have
- * been interrupts waiting to be delivered, in which case it will have
- * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
- * jump to send_interrupts, otherwise we're done.
- */
- cmpl $0, lguest_data+LGUEST_DATA_irq_pending
- jnz send_interrupts
- /*
- * One cool thing about x86 is that you can do many things without using
- * a register. In this case, the normal path hasn't needed to save or
- * restore any registers at all!
- */
- ret
-send_interrupts:
- /*
- * OK, now we need a register: eax is used for the hypercall number,
- * which is LHCALL_SEND_INTERRUPTS.
- *
- * We used not to bother with this pending detection at all, which was
- * much simpler. Sooner or later the Host would realize it had to
- * send us an interrupt. But that turns out to make performance 7
- * times worse on a simple tcp benchmark. So now we do this the hard
- * way.
- */
- pushl %eax
- movl $LHCALL_SEND_INTERRUPTS, %eax
- /* This is the actual hypercall trap. */
- int $LGUEST_TRAP_ENTRY
- /* Put eax back the way we found it. */
- popl %eax
- ret
-
-/*
- * Finally, the "popf" or "restore flags" routine. The %eax register holds the
- * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
- * enabling interrupts again, if it's 0 we're leaving them off.
- */
-ENTRY(lg_restore_fl)
- /* This is just "lguest_data.irq_enabled = flags;" */
- movl %eax, lguest_data+LGUEST_DATA_irq_enabled
- /*
- * Now, if the %eax value has enabled interrupts and
- * lguest_data.irq_pending is set, we want to tell the Host so it can
- * deliver any outstanding interrupts. Fortunately, both values will
- * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
- * instruction will AND them together for us. If both are set, we
- * jump to send_interrupts.
- */
- testl lguest_data+LGUEST_DATA_irq_pending, %eax
- jnz send_interrupts
- /* Again, the normal path has used no extra registers. Clever, huh? */
- ret
-/*:*/
-
-/* These demark the EIP where host should never deliver interrupts. */
-.global lguest_noirq_iret
-
-/*M:004
- * When the Host reflects a trap or injects an interrupt into the Guest, it
- * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
- * so the Guest iret logic does the right thing when restoring it. However,
- * when the Host sets the Guest up for direct traps, such as system calls, the
- * processor is the one to push eflags onto the stack, and the interrupt bit
- * will be 1 (in reality, interrupts are always enabled in the Guest).
- *
- * This turns out to be harmless: the only trap which should happen under Linux
- * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
- * regions), which has to be reflected through the Host anyway. If another
- * trap *does* go off when interrupts are disabled, the Guest will panic, and
- * we'll never get to this iret!
-:*/
-
-/*G:045
- * There is one final paravirt_op that the Guest implements, and glancing at it
- * you can see why I left it to last. It's *cool*! It's in *assembler*!
- *
- * The "iret" instruction is used to return from an interrupt or trap. The
- * stack looks like this:
- * old address
- * old code segment & privilege level
- * old processor flags ("eflags")
- *
- * The "iret" instruction pops those values off the stack and restores them all
- * at once. The only problem is that eflags includes the Interrupt Flag which
- * the Guest can't change: the CPU will simply ignore it when we do an "iret".
- * So we have to copy eflags from the stack to lguest_data.irq_enabled before
- * we do the "iret".
- *
- * There are two problems with this: firstly, we can't clobber any registers
- * and secondly, the whole thing needs to be atomic. The first problem
- * is solved by using "push memory"/"pop memory" instruction pair for copying.
- *
- * The second is harder: copying eflags to lguest_data.irq_enabled will turn
- * interrupts on before we're finished, so we could be interrupted before we
- * return to userspace or wherever. Our solution to this is to tell the
- * Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled. (It's not necessary to protect pop instruction, since
- * data gets updated only after it completes, so we only need to protect
- * one instruction, iret).
- */
-ENTRY(lguest_iret)
- pushl 2*4(%esp)
- /*
- * Note the %ss: segment prefix here. Normal data accesses use the
- * "ds" segment, but that will have already been restored for whatever
- * we're returning to (such as userspace): we can't trust it. The %ss:
- * prefix makes sure we use the stack segment, which is still valid.
- */
- popl %ss:lguest_data+LGUEST_DATA_irq_enabled
-lguest_noirq_iret:
- iret
obj-$(CONFIG_ISDN) += isdn/
obj-$(CONFIG_EDAC) += edac/
obj-$(CONFIG_EISA) += eisa/
-obj-y += lguest/
obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_CPU_IDLE) += cpuidle/
obj-y += mmc/
depends on VIRTIO
---help---
This is the virtual block driver for virtio. It can be used with
- lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
+ QEMU based VMMs (like KVM or Xen). Say Y or M.
config VIRTIO_BLK_SCSI
bool "SCSI passthrough request for the Virtio block driver"
depends on VIRTIO && TTY
select HVC_DRIVER
help
- Virtio console for use with lguest and other hypervisors.
+ Virtio console for use with hypervisors.
Also serves as a general-purpose serial device for data
transfer between the guest and host. Character devices at
* We turn the characters into a scatter-gather list, add it to the
* output queue and then kick the Host. Then we sit here waiting for
* it to finish: inefficient in theory, but in practice
- * implementations will do it immediately (lguest's Launcher does).
+ * implementations will do it immediately.
*/
static int put_chars(u32 vtermno, const char *buf, int count)
{
+++ /dev/null
-config LGUEST
- tristate "Linux hypervisor example code"
- depends on X86_32 && EVENTFD && TTY && PCI_DIRECT
- select HVC_DRIVER
- ---help---
- This is a very simple module which allows you to run
- multiple instances of the same Linux kernel, using the
- "lguest" command found in the tools/lguest directory.
-
- Note that "lguest" is pronounced to rhyme with "fell quest",
- not "rustyvisor". See tools/lguest/lguest.txt.
-
- If unsure, say N. If curious, say M. If masochistic, say Y.
+++ /dev/null
-# Host requires the other files, which can be a module.
-obj-$(CONFIG_LGUEST) += lg.o
-lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
- segments.o lguest_user.o
-
-lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o
-
-Preparation Preparation!: PREFIX=P
-Guest: PREFIX=G
-Drivers: PREFIX=D
-Launcher: PREFIX=L
-Host: PREFIX=H
-Switcher: PREFIX=S
-Mastery: PREFIX=M
-Beer:
- @for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}"
-Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery:
- @sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'`
-Puppy:
- @clear
- @printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n"
- @sleep 2; clear; printf "\n\n Sit!\n\n"; sleep 1; clear
- @printf " __ \n ()'\`; \n /\\|\` \n / | \n(/_)_|_ \n"
- @sleep 2; clear; printf "\n\n Stand!\n\n"; sleep 1; clear
- @printf " __ \n ()'\`; \n /\\|\` \n /._.= \n /| / \n(_\_)_ \n"
- @sleep 2; clear; printf "\n\n Good puppy!\n\n"; sleep 1; clear
+++ /dev/null
-Welcome, friend reader, to lguest.
-
-Lguest is an adventure, with you, the reader, as Hero. I can't think of many
-5000-line projects which offer both such capability and glimpses of future
-potential; it is an exciting time to be delving into the source!
-
-But be warned; this is an arduous journey of several hours or more! And as we
-know, all true Heroes are driven by a Noble Goal. Thus I offer a Beer (or
-equivalent) to anyone I meet who has completed this documentation.
-
-So get comfortable and keep your wits about you (both quick and humorous).
-Along your way to the Noble Goal, you will also gain masterly insight into
-lguest, and hypervisors and x86 virtualization in general.
-
-Our Quest is in seven parts: (best read with C highlighting turned on)
-
-I) Preparation
- - In which our potential hero is flown quickly over the landscape for a
- taste of its scope. Suitable for the armchair coders and other such
- persons of faint constitution.
-
-II) Guest
- - Where we encounter the first tantalising wisps of code, and come to
- understand the details of the life of a Guest kernel.
-
-III) Drivers
- - Whereby the Guest finds its voice and become useful, and our
- understanding of the Guest is completed.
-
-IV) Launcher
- - Where we trace back to the creation of the Guest, and thus begin our
- understanding of the Host.
-
-V) Host
- - Where we master the Host code, through a long and tortuous journey.
- Indeed, it is here that our hero is tested in the Bit of Despair.
-
-VI) Switcher
- - Where our understanding of the intertwined nature of Guests and Hosts
- is completed.
-
-VII) Mastery
- - Where our fully fledged hero grapples with the Great Question:
- "What next?"
-
-make Preparation!
-Rusty Russell.
+++ /dev/null
-/*P:400
- * This contains run_guest() which actually calls into the Host<->Guest
- * Switcher and analyzes the return, such as determining if the Guest wants the
- * Host to do something. This file also contains useful helper routines.
-:*/
-#include <linux/module.h>
-#include <linux/stringify.h>
-#include <linux/stddef.h>
-#include <linux/io.h>
-#include <linux/mm.h>
-#include <linux/sched/signal.h>
-#include <linux/vmalloc.h>
-#include <linux/cpu.h>
-#include <linux/freezer.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <asm/paravirt.h>
-#include <asm/pgtable.h>
-#include <linux/uaccess.h>
-#include <asm/poll.h>
-#include <asm/asm-offsets.h>
-#include "lg.h"
-
-unsigned long switcher_addr;
-struct page **lg_switcher_pages;
-static struct vm_struct *switcher_text_vma;
-static struct vm_struct *switcher_stacks_vma;
-
-/* This One Big lock protects all inter-guest data structures. */
-DEFINE_MUTEX(lguest_lock);
-
-/*H:010
- * We need to set up the Switcher at a high virtual address. Remember the
- * Switcher is a few hundred bytes of assembler code which actually changes the
- * CPU to run the Guest, and then changes back to the Host when a trap or
- * interrupt happens.
- *
- * The Switcher code must be at the same virtual address in the Guest as the
- * Host since it will be running as the switchover occurs.
- *
- * Trying to map memory at a particular address is an unusual thing to do, so
- * it's not a simple one-liner.
- */
-static __init int map_switcher(void)
-{
- int i, err;
-
- /*
- * Map the Switcher in to high memory.
- *
- * It turns out that if we choose the address 0xFFC00000 (4MB under the
- * top virtual address), it makes setting up the page tables really
- * easy.
- */
-
- /* We assume Switcher text fits into a single page. */
- if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
- printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
- end_switcher_text - start_switcher_text);
- return -EINVAL;
- }
-
- /*
- * We allocate an array of struct page pointers. map_vm_area() wants
- * this, rather than just an array of pages.
- */
- lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
- * TOTAL_SWITCHER_PAGES,
- GFP_KERNEL);
- if (!lg_switcher_pages) {
- err = -ENOMEM;
- goto out;
- }
-
- /*
- * Now we actually allocate the pages. The Guest will see these pages,
- * so we make sure they're zeroed.
- */
- for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
- lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
- if (!lg_switcher_pages[i]) {
- err = -ENOMEM;
- goto free_some_pages;
- }
- }
-
- /*
- * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
- * It goes in the first page, which we map in momentarily.
- */
- memcpy(kmap(lg_switcher_pages[0]), start_switcher_text,
- end_switcher_text - start_switcher_text);
- kunmap(lg_switcher_pages[0]);
-
- /*
- * We place the Switcher underneath the fixmap area, which is the
- * highest virtual address we can get. This is important, since we
- * tell the Guest it can't access this memory, so we want its ceiling
- * as high as possible.
- */
- switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE;
-
- /*
- * Now we reserve the "virtual memory area"s we want. We might
- * not get them in theory, but in practice it's worked so far.
- *
- * We want the switcher text to be read-only and executable, and
- * the stacks to be read-write and non-executable.
- */
- switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD,
- switcher_addr,
- switcher_addr + PAGE_SIZE);
-
- if (!switcher_text_vma) {
- err = -ENOMEM;
- printk("lguest: could not map switcher pages high\n");
- goto free_pages;
- }
-
- switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE,
- VM_ALLOC|VM_NO_GUARD,
- switcher_addr + PAGE_SIZE,
- switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE);
- if (!switcher_stacks_vma) {
- err = -ENOMEM;
- printk("lguest: could not map switcher pages high\n");
- goto free_text_vma;
- }
-
- /*
- * This code actually sets up the pages we've allocated to appear at
- * switcher_addr. map_vm_area() takes the vma we allocated above, the
- * kind of pages we're mapping (kernel text pages and kernel writable
- * pages respectively), and a pointer to our array of struct pages.
- */
- err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
- if (err) {
- printk("lguest: text map_vm_area failed: %i\n", err);
- goto free_vmas;
- }
-
- err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
- lg_switcher_pages + SWITCHER_TEXT_PAGES);
- if (err) {
- printk("lguest: stacks map_vm_area failed: %i\n", err);
- goto free_vmas;
- }
-
- /*
- * Now the Switcher is mapped at the right address, we can't fail!
- */
- printk(KERN_INFO "lguest: mapped switcher at %p\n",
- switcher_text_vma->addr);
- /* And we succeeded... */
- return 0;
-
-free_vmas:
- /* Undoes map_vm_area and __get_vm_area */
- vunmap(switcher_stacks_vma->addr);
-free_text_vma:
- vunmap(switcher_text_vma->addr);
-free_pages:
- i = TOTAL_SWITCHER_PAGES;
-free_some_pages:
- for (--i; i >= 0; i--)
- __free_pages(lg_switcher_pages[i], 0);
- kfree(lg_switcher_pages);
-out:
- return err;
-}
-/*:*/
-
-/* Cleaning up the mapping when the module is unloaded is almost... too easy. */
-static void unmap_switcher(void)
-{
- unsigned int i;
-
- /* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
- vunmap(switcher_text_vma->addr);
- vunmap(switcher_stacks_vma->addr);
- /* Now we just need to free the pages we copied the switcher into */
- for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
- __free_pages(lg_switcher_pages[i], 0);
- kfree(lg_switcher_pages);
-}
-
-/*H:032
- * Dealing With Guest Memory.
- *
- * Before we go too much further into the Host, we need to grok the routines
- * we use to deal with Guest memory.
- *
- * When the Guest gives us (what it thinks is) a physical address, we can use
- * the normal copy_from_user() & copy_to_user() on the corresponding place in
- * the memory region allocated by the Launcher.
- *
- * But we can't trust the Guest: it might be trying to access the Launcher
- * code. We have to check that the range is below the pfn_limit the Launcher
- * gave us. We have to make sure that addr + len doesn't give us a false
- * positive by overflowing, too.
- */
-bool lguest_address_ok(const struct lguest *lg,
- unsigned long addr, unsigned long len)
-{
- return addr+len <= lg->pfn_limit * PAGE_SIZE && (addr+len >= addr);
-}
-
-/*
- * This routine copies memory from the Guest. Here we can see how useful the
- * kill_lguest() routine we met in the Launcher can be: we return a random
- * value (all zeroes) instead of needing to return an error.
- */
-void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
-{
- if (!lguest_address_ok(cpu->lg, addr, bytes)
- || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
- /* copy_from_user should do this, but as we rely on it... */
- memset(b, 0, bytes);
- kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
- }
-}
-
-/* This is the write (copy into Guest) version. */
-void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
- unsigned bytes)
-{
- if (!lguest_address_ok(cpu->lg, addr, bytes)
- || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
- kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
-}
-/*:*/
-
-/*H:030
- * Let's jump straight to the the main loop which runs the Guest.
- * Remember, this is called by the Launcher reading /dev/lguest, and we keep
- * going around and around until something interesting happens.
- */
-int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
-{
- /* If the launcher asked for a register with LHREQ_GETREG */
- if (cpu->reg_read) {
- if (put_user(*cpu->reg_read, user))
- return -EFAULT;
- cpu->reg_read = NULL;
- return sizeof(*cpu->reg_read);
- }
-
- /* We stop running once the Guest is dead. */
- while (!cpu->lg->dead) {
- unsigned int irq;
- bool more;
-
- /* First we run any hypercalls the Guest wants done. */
- if (cpu->hcall)
- do_hypercalls(cpu);
-
- /* Do we have to tell the Launcher about a trap? */
- if (cpu->pending.trap) {
- if (copy_to_user(user, &cpu->pending,
- sizeof(cpu->pending)))
- return -EFAULT;
- return sizeof(cpu->pending);
- }
-
- /*
- * All long-lived kernel loops need to check with this horrible
- * thing called the freezer. If the Host is trying to suspend,
- * it stops us.
- */
- try_to_freeze();
-
- /* Check for signals */
- if (signal_pending(current))
- return -ERESTARTSYS;
-
- /*
- * Check if there are any interrupts which can be delivered now:
- * if so, this sets up the hander to be executed when we next
- * run the Guest.
- */
- irq = interrupt_pending(cpu, &more);
- if (irq < LGUEST_IRQS)
- try_deliver_interrupt(cpu, irq, more);
-
- /*
- * Just make absolutely sure the Guest is still alive. One of
- * those hypercalls could have been fatal, for example.
- */
- if (cpu->lg->dead)
- break;
-
- /*
- * If the Guest asked to be stopped, we sleep. The Guest's
- * clock timer will wake us.
- */
- if (cpu->halted) {
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Just before we sleep, make sure no interrupt snuck in
- * which we should be doing.
- */
- if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
- set_current_state(TASK_RUNNING);
- else
- schedule();
- continue;
- }
-
- /*
- * OK, now we're ready to jump into the Guest. First we put up
- * the "Do Not Disturb" sign:
- */
- local_irq_disable();
-
- /* Actually run the Guest until something happens. */
- lguest_arch_run_guest(cpu);
-
- /* Now we're ready to be interrupted or moved to other CPUs */
- local_irq_enable();
-
- /* Now we deal with whatever happened to the Guest. */
- lguest_arch_handle_trap(cpu);
- }
-
- /* Special case: Guest is 'dead' but wants a reboot. */
- if (cpu->lg->dead == ERR_PTR(-ERESTART))
- return -ERESTART;
-
- /* The Guest is dead => "No such file or directory" */
- return -ENOENT;
-}
-
-/*H:000
- * Welcome to the Host!
- *
- * By this point your brain has been tickled by the Guest code and numbed by
- * the Launcher code; prepare for it to be stretched by the Host code. This is
- * the heart. Let's begin at the initialization routine for the Host's lg
- * module.
- */
-static int __init init(void)
-{
- int err;
-
- /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
- if (get_kernel_rpl() != 0) {
- printk("lguest is afraid of being a guest\n");
- return -EPERM;
- }
-
- /* First we put the Switcher up in very high virtual memory. */
- err = map_switcher();
- if (err)
- goto out;
-
- /* We might need to reserve an interrupt vector. */
- err = init_interrupts();
- if (err)
- goto unmap;
-
- /* /dev/lguest needs to be registered. */
- err = lguest_device_init();
- if (err)
- goto free_interrupts;
-
- /* Finally we do some architecture-specific setup. */
- lguest_arch_host_init();
-
- /* All good! */
- return 0;
-
-free_interrupts:
- free_interrupts();
-unmap:
- unmap_switcher();
-out:
- return err;
-}
-
-/* Cleaning up is just the same code, backwards. With a little French. */
-static void __exit fini(void)
-{
- lguest_device_remove();
- free_interrupts();
- unmap_switcher();
-
- lguest_arch_host_fini();
-}
-/*:*/
-
-/*
- * The Host side of lguest can be a module. This is a nice way for people to
- * play with it.
- */
-module_init(init);
-module_exit(fini);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+++ /dev/null
-/*P:500
- * Just as userspace programs request kernel operations through a system
- * call, the Guest requests Host operations through a "hypercall". You might
- * notice this nomenclature doesn't really follow any logic, but the name has
- * been around for long enough that we're stuck with it. As you'd expect, this
- * code is basically a one big switch statement.
-:*/
-
-/* Copyright (C) 2006 Rusty Russell IBM Corporation
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-*/
-#include <linux/uaccess.h>
-#include <linux/syscalls.h>
-#include <linux/mm.h>
-#include <linux/ktime.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include "lg.h"
-
-/*H:120
- * This is the core hypercall routine: where the Guest gets what it wants.
- * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both.
- */
-static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
-{
- switch (args->arg0) {
- case LHCALL_FLUSH_ASYNC:
- /*
- * This call does nothing, except by breaking out of the Guest
- * it makes us process all the asynchronous hypercalls.
- */
- break;
- case LHCALL_SEND_INTERRUPTS:
- /*
- * This call does nothing too, but by breaking out of the Guest
- * it makes us process any pending interrupts.
- */
- break;
- case LHCALL_LGUEST_INIT:
- /*
- * You can't get here unless you're already initialized. Don't
- * do that.
- */
- kill_guest(cpu, "already have lguest_data");
- break;
- case LHCALL_SHUTDOWN: {
- char msg[128];
- /*
- * Shutdown is such a trivial hypercall that we do it in five
- * lines right here.
- *
- * If the lgread fails, it will call kill_guest() itself; the
- * kill_guest() with the message will be ignored.
- */
- __lgread(cpu, msg, args->arg1, sizeof(msg));
- msg[sizeof(msg)-1] = '\0';
- kill_guest(cpu, "CRASH: %s", msg);
- if (args->arg2 == LGUEST_SHUTDOWN_RESTART)
- cpu->lg->dead = ERR_PTR(-ERESTART);
- break;
- }
- case LHCALL_FLUSH_TLB:
- /* FLUSH_TLB comes in two flavors, depending on the argument: */
- if (args->arg1)
- guest_pagetable_clear_all(cpu);
- else
- guest_pagetable_flush_user(cpu);
- break;
-
- /*
- * All these calls simply pass the arguments through to the right
- * routines.
- */
- case LHCALL_NEW_PGTABLE:
- guest_new_pagetable(cpu, args->arg1);
- break;
- case LHCALL_SET_STACK:
- guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
- break;
- case LHCALL_SET_PTE:
-#ifdef CONFIG_X86_PAE
- guest_set_pte(cpu, args->arg1, args->arg2,
- __pte(args->arg3 | (u64)args->arg4 << 32));
-#else
- guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
-#endif
- break;
- case LHCALL_SET_PGD:
- guest_set_pgd(cpu->lg, args->arg1, args->arg2);
- break;
-#ifdef CONFIG_X86_PAE
- case LHCALL_SET_PMD:
- guest_set_pmd(cpu->lg, args->arg1, args->arg2);
- break;
-#endif
- case LHCALL_SET_CLOCKEVENT:
- guest_set_clockevent(cpu, args->arg1);
- break;
- case LHCALL_HALT:
- /* Similarly, this sets the halted flag for run_guest(). */
- cpu->halted = 1;
- break;
- default:
- /* It should be an architecture-specific hypercall. */
- if (lguest_arch_do_hcall(cpu, args))
- kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
- }
-}
-
-/*H:124
- * Asynchronous hypercalls are easy: we just look in the array in the
- * Guest's "struct lguest_data" to see if any new ones are marked "ready".
- *
- * We are careful to do these in order: obviously we respect the order the
- * Guest put them in the ring, but we also promise the Guest that they will
- * happen before any normal hypercall (which is why we check this before
- * checking for a normal hcall).
- */
-static void do_async_hcalls(struct lg_cpu *cpu)
-{
- unsigned int i;
- u8 st[LHCALL_RING_SIZE];
-
- /* For simplicity, we copy the entire call status array in at once. */
- if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st)))
- return;
-
- /* We process "struct lguest_data"s hcalls[] ring once. */
- for (i = 0; i < ARRAY_SIZE(st); i++) {
- struct hcall_args args;
- /*
- * We remember where we were up to from last time. This makes
- * sure that the hypercalls are done in the order the Guest
- * places them in the ring.
- */
- unsigned int n = cpu->next_hcall;
-
- /* 0xFF means there's no call here (yet). */
- if (st[n] == 0xFF)
- break;
-
- /*
- * OK, we have hypercall. Increment the "next_hcall" cursor,
- * and wrap back to 0 if we reach the end.
- */
- if (++cpu->next_hcall == LHCALL_RING_SIZE)
- cpu->next_hcall = 0;
-
- /*
- * Copy the hypercall arguments into a local copy of the
- * hcall_args struct.
- */
- if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
- sizeof(struct hcall_args))) {
- kill_guest(cpu, "Fetching async hypercalls");
- break;
- }
-
- /* Do the hypercall, same as a normal one. */
- do_hcall(cpu, &args);
-
- /* Mark the hypercall done. */
- if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) {
- kill_guest(cpu, "Writing result for async hypercall");
- break;
- }
-
- /*
- * Stop doing hypercalls if they want to notify the Launcher:
- * it needs to service this first.
- */
- if (cpu->pending.trap)
- break;
- }
-}
-
-/*
- * Last of all, we look at what happens first of all. The very first time the
- * Guest makes a hypercall, we end up here to set things up:
- */
-static void initialize(struct lg_cpu *cpu)
-{
- /*
- * You can't do anything until you're initialized. The Guest knows the
- * rules, so we're unforgiving here.
- */
- if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
- kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
- return;
- }
-
- if (lguest_arch_init_hypercalls(cpu))
- kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-
- /*
- * The Guest tells us where we're not to deliver interrupts by putting
- * the instruction address into "struct lguest_data".
- */
- if (get_user(cpu->lg->noirq_iret, &cpu->lg->lguest_data->noirq_iret))
- kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-
- /*
- * We write the current time into the Guest's data page once so it can
- * set its clock.
- */
- write_timestamp(cpu);
-
- /* page_tables.c will also do some setup. */
- page_table_guest_data_init(cpu);
-
- /*
- * This is the one case where the above accesses might have been the
- * first write to a Guest page. This may have caused a copy-on-write
- * fault, but the old page might be (read-only) in the Guest
- * pagetable.
- */
- guest_pagetable_clear_all(cpu);
-}
-/*:*/
-
-/*M:013
- * If a Guest reads from a page (so creates a mapping) that it has never
- * written to, and then the Launcher writes to it (ie. the output of a virtual
- * device), the Guest will still see the old page. In practice, this never
- * happens: why would the Guest read a page which it has never written to? But
- * a similar scenario might one day bite us, so it's worth mentioning.
- *
- * Note that if we used a shared anonymous mapping in the Launcher instead of
- * mapping /dev/zero private, we wouldn't worry about cop-on-write. And we
- * need that to switch the Launcher to processes (away from threads) anyway.
-:*/
-
-/*H:100
- * Hypercalls
- *
- * Remember from the Guest, hypercalls come in two flavors: normal and
- * asynchronous. This file handles both of types.
- */
-void do_hypercalls(struct lg_cpu *cpu)
-{
- /* Not initialized yet? This hypercall must do it. */
- if (unlikely(!cpu->lg->lguest_data)) {
- /* Set up the "struct lguest_data" */
- initialize(cpu);
- /* Hcall is done. */
- cpu->hcall = NULL;
- return;
- }
-
- /*
- * The Guest has initialized.
- *
- * Look in the hypercall ring for the async hypercalls:
- */
- do_async_hcalls(cpu);
-
- /*
- * If we stopped reading the hypercall ring because the Guest did a
- * NOTIFY to the Launcher, we want to return now. Otherwise we do
- * the hypercall.
- */
- if (!cpu->pending.trap) {
- do_hcall(cpu, cpu->hcall);
- /*
- * Tricky point: we reset the hcall pointer to mark the
- * hypercall as "done". We use the hcall pointer rather than
- * the trap number to indicate a hypercall is pending.
- * Normally it doesn't matter: the Guest will run again and
- * update the trap number before we come back here.
- *
- * However, if we are signalled or the Guest sends I/O to the
- * Launcher, the run_guest() loop will exit without running the
- * Guest. When it comes back it would try to re-run the
- * hypercall. Finding that bug sucked.
- */
- cpu->hcall = NULL;
- }
-}
-
-/*
- * This routine supplies the Guest with time: it's used for wallclock time at
- * initial boot and as a rough time source if the TSC isn't available.
- */
-void write_timestamp(struct lg_cpu *cpu)
-{
- struct timespec now;
- ktime_get_real_ts(&now);
- if (copy_to_user(&cpu->lg->lguest_data->time,
- &now, sizeof(struct timespec)))
- kill_guest(cpu, "Writing timestamp");
-}
+++ /dev/null
-/*P:800
- * Interrupts (traps) are complicated enough to earn their own file.
- * There are three classes of interrupts:
- *
- * 1) Real hardware interrupts which occur while we're running the Guest,
- * 2) Interrupts for virtual devices attached to the Guest, and
- * 3) Traps and faults from the Guest.
- *
- * Real hardware interrupts must be delivered to the Host, not the Guest.
- * Virtual interrupts must be delivered to the Guest, but we make them look
- * just like real hardware would deliver them. Traps from the Guest can be set
- * up to go directly back into the Guest, but sometimes the Host wants to see
- * them first, so we also have a way of "reflecting" them into the Guest as if
- * they had been delivered to it directly.
-:*/
-#include <linux/uaccess.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include "lg.h"
-
-/* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */
-static unsigned int syscall_vector = IA32_SYSCALL_VECTOR;
-module_param(syscall_vector, uint, 0444);
-
-/* The address of the interrupt handler is split into two bits: */
-static unsigned long idt_address(u32 lo, u32 hi)
-{
- return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
-}
-
-/*
- * The "type" of the interrupt handler is a 4 bit field: we only support a
- * couple of types.
- */
-static int idt_type(u32 lo, u32 hi)
-{
- return (hi >> 8) & 0xF;
-}
-
-/* An IDT entry can't be used unless the "present" bit is set. */
-static bool idt_present(u32 lo, u32 hi)
-{
- return (hi & 0x8000);
-}
-
-/*
- * We need a helper to "push" a value onto the Guest's stack, since that's a
- * big part of what delivering an interrupt does.
- */
-static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
-{
- /* Stack grows upwards: move stack then write value. */
- *gstack -= 4;
- lgwrite(cpu, *gstack, u32, val);
-}
-
-/*H:210
- * The push_guest_interrupt_stack() routine saves Guest state on the stack for
- * an interrupt or trap. The mechanics of delivering traps and interrupts to
- * the Guest are the same, except some traps have an "error code" which gets
- * pushed onto the stack as well: the caller tells us if this is one.
- *
- * We set up the stack just like the CPU does for a real interrupt, so it's
- * identical for the Guest (and the standard "iret" instruction will undo
- * it).
- */
-static void push_guest_interrupt_stack(struct lg_cpu *cpu, bool has_err)
-{
- unsigned long gstack, origstack;
- u32 eflags, ss, irq_enable;
- unsigned long virtstack;
-
- /*
- * There are two cases for interrupts: one where the Guest is already
- * in the kernel, and a more complex one where the Guest is in
- * userspace. We check the privilege level to find out.
- */
- if ((cpu->regs->ss&0x3) != GUEST_PL) {
- /*
- * The Guest told us their kernel stack with the SET_STACK
- * hypercall: both the virtual address and the segment.
- */
- virtstack = cpu->esp1;
- ss = cpu->ss1;
-
- origstack = gstack = guest_pa(cpu, virtstack);
- /*
- * We push the old stack segment and pointer onto the new
- * stack: when the Guest does an "iret" back from the interrupt
- * handler the CPU will notice they're dropping privilege
- * levels and expect these here.
- */
- push_guest_stack(cpu, &gstack, cpu->regs->ss);
- push_guest_stack(cpu, &gstack, cpu->regs->esp);
- } else {
- /* We're staying on the same Guest (kernel) stack. */
- virtstack = cpu->regs->esp;
- ss = cpu->regs->ss;
-
- origstack = gstack = guest_pa(cpu, virtstack);
- }
-
- /*
- * Remember that we never let the Guest actually disable interrupts, so
- * the "Interrupt Flag" bit is always set. We copy that bit from the
- * Guest's "irq_enabled" field into the eflags word: we saw the Guest
- * copy it back in "lguest_iret".
- */
- eflags = cpu->regs->eflags;
- if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0
- && !(irq_enable & X86_EFLAGS_IF))
- eflags &= ~X86_EFLAGS_IF;
-
- /*
- * An interrupt is expected to push three things on the stack: the old
- * "eflags" word, the old code segment, and the old instruction
- * pointer.
- */
- push_guest_stack(cpu, &gstack, eflags);
- push_guest_stack(cpu, &gstack, cpu->regs->cs);
- push_guest_stack(cpu, &gstack, cpu->regs->eip);
-
- /* For the six traps which supply an error code, we push that, too. */
- if (has_err)
- push_guest_stack(cpu, &gstack, cpu->regs->errcode);
-
- /* Adjust the stack pointer and stack segment. */
- cpu->regs->ss = ss;
- cpu->regs->esp = virtstack + (gstack - origstack);
-}
-
-/*
- * This actually makes the Guest start executing the given interrupt/trap
- * handler.
- *
- * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this
- * interrupt or trap. It's split into two parts for traditional reasons: gcc
- * on i386 used to be frightened by 64 bit numbers.
- */
-static void guest_run_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi)
-{
- /* If we're already in the kernel, we don't change stacks. */
- if ((cpu->regs->ss&0x3) != GUEST_PL)
- cpu->regs->ss = cpu->esp1;
-
- /*
- * Set the code segment and the address to execute.
- */
- cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
- cpu->regs->eip = idt_address(lo, hi);
-
- /*
- * Trapping always clears these flags:
- * TF: Trap flag
- * VM: Virtual 8086 mode
- * RF: Resume
- * NT: Nested task.
- */
- cpu->regs->eflags &=
- ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
-
- /*
- * There are two kinds of interrupt handlers: 0xE is an "interrupt
- * gate" which expects interrupts to be disabled on entry.
- */
- if (idt_type(lo, hi) == 0xE)
- if (put_user(0, &cpu->lg->lguest_data->irq_enabled))
- kill_guest(cpu, "Disabling interrupts");
-}
-
-/* This restores the eflags word which was pushed on the stack by a trap */
-static void restore_eflags(struct lg_cpu *cpu)
-{
- /* This is the physical address of the stack. */
- unsigned long stack_pa = guest_pa(cpu, cpu->regs->esp);
-
- /*
- * Stack looks like this:
- * Address Contents
- * esp EIP
- * esp + 4 CS
- * esp + 8 EFLAGS
- */
- cpu->regs->eflags = lgread(cpu, stack_pa + 8, u32);
- cpu->regs->eflags &=
- ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
-}
-
-/*H:205
- * Virtual Interrupts.
- *
- * interrupt_pending() returns the first pending interrupt which isn't blocked
- * by the Guest. It is called before every entry to the Guest, and just before
- * we go to sleep when the Guest has halted itself.
- */
-unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
-{
- unsigned int irq;
- DECLARE_BITMAP(blk, LGUEST_IRQS);
-
- /* If the Guest hasn't even initialized yet, we can do nothing. */
- if (!cpu->lg->lguest_data)
- return LGUEST_IRQS;
-
- /*
- * Take our "irqs_pending" array and remove any interrupts the Guest
- * wants blocked: the result ends up in "blk".
- */
- if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
- sizeof(blk)))
- return LGUEST_IRQS;
- bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
-
- /* Find the first interrupt. */
- irq = find_first_bit(blk, LGUEST_IRQS);
- *more = find_next_bit(blk, LGUEST_IRQS, irq+1);
-
- return irq;
-}
-
-/*
- * This actually diverts the Guest to running an interrupt handler, once an
- * interrupt has been identified by interrupt_pending().
- */
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
-{
- struct desc_struct *idt;
-
- BUG_ON(irq >= LGUEST_IRQS);
-
- /* If they're halted, interrupts restart them. */
- if (cpu->halted) {
- /* Re-enable interrupts. */
- if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled))
- kill_guest(cpu, "Re-enabling interrupts");
- cpu->halted = 0;
- } else {
- /* Otherwise we check if they have interrupts disabled. */
- u32 irq_enabled;
- if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
- irq_enabled = 0;
- if (!irq_enabled) {
- /* Make sure they know an IRQ is pending. */
- put_user(X86_EFLAGS_IF,
- &cpu->lg->lguest_data->irq_pending);
- return;
- }
- }
-
- /*
- * Look at the IDT entry the Guest gave us for this interrupt. The
- * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
- * over them.
- */
- idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
- /* If they don't have a handler (yet?), we just ignore it */
- if (idt_present(idt->a, idt->b)) {
- /* OK, mark it no longer pending and deliver it. */
- clear_bit(irq, cpu->irqs_pending);
-
- /*
- * They may be about to iret, where they asked us never to
- * deliver interrupts. In this case, we can emulate that iret
- * then immediately deliver the interrupt. This is basically
- * a noop: the iret would pop the interrupt frame and restore
- * eflags, and then we'd set it up again. So just restore the
- * eflags word and jump straight to the handler in this case.
- *
- * Denys Vlasenko points out that this isn't quite right: if
- * the iret was returning to userspace, then that interrupt
- * would reset the stack pointer (which the Guest told us
- * about via LHCALL_SET_STACK). But unless the Guest is being
- * *really* weird, that will be the same as the current stack
- * anyway.
- */
- if (cpu->regs->eip == cpu->lg->noirq_iret) {
- restore_eflags(cpu);
- } else {
- /*
- * set_guest_interrupt() takes a flag to say whether
- * this interrupt pushes an error code onto the stack
- * as well: virtual interrupts never do.
- */
- push_guest_interrupt_stack(cpu, false);
- }
- /* Actually make Guest cpu jump to handler. */
- guest_run_interrupt(cpu, idt->a, idt->b);
- }
-
- /*
- * Every time we deliver an interrupt, we update the timestamp in the
- * Guest's lguest_data struct. It would be better for the Guest if we
- * did this more often, but it can actually be quite slow: doing it
- * here is a compromise which means at least it gets updated every
- * timer interrupt.
- */
- write_timestamp(cpu);
-
- /*
- * If there are no other interrupts we want to deliver, clear
- * the pending flag.
- */
- if (!more)
- put_user(0, &cpu->lg->lguest_data->irq_pending);
-}
-
-/* And this is the routine when we want to set an interrupt for the Guest. */
-void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
-{
- /*
- * Next time the Guest runs, the core code will see if it can deliver
- * this interrupt.
- */
- set_bit(irq, cpu->irqs_pending);
-
- /*
- * Make sure it sees it; it might be asleep (eg. halted), or running
- * the Guest right now, in which case kick_process() will knock it out.
- */
- if (!wake_up_process(cpu->tsk))
- kick_process(cpu->tsk);
-}
-/*:*/
-
-/*
- * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent
- * me a patch, so we support that too. It'd be a big step for lguest if half
- * the Plan 9 user base were to start using it.
- *
- * Actually now I think of it, it's possible that Ron *is* half the Plan 9
- * userbase. Oh well.
- */
-bool could_be_syscall(unsigned int num)
-{
- /* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */
- return num == IA32_SYSCALL_VECTOR || num == syscall_vector;
-}
-
-/* The syscall vector it wants must be unused by Host. */
-bool check_syscall_vector(struct lguest *lg)
-{
- u32 vector;
-
- if (get_user(vector, &lg->lguest_data->syscall_vec))
- return false;
-
- return could_be_syscall(vector);
-}
-
-int init_interrupts(void)
-{
- /* If they want some strange system call vector, reserve it now */
- if (syscall_vector != IA32_SYSCALL_VECTOR) {
- if (test_bit(syscall_vector, used_vectors) ||
- vector_used_by_percpu_irq(syscall_vector)) {
- printk(KERN_ERR "lg: couldn't reserve syscall %u\n",
- syscall_vector);
- return -EBUSY;
- }
- set_bit(syscall_vector, used_vectors);
- }
-
- return 0;
-}
-
-void free_interrupts(void)
-{
- if (syscall_vector != IA32_SYSCALL_VECTOR)
- clear_bit(syscall_vector, used_vectors);
-}
-
-/*H:220
- * Now we've got the routines to deliver interrupts, delivering traps like
- * page fault is easy. The only trick is that Intel decided that some traps
- * should have error codes:
- */
-static bool has_err(unsigned int trap)
-{
- return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
-}
-
-/* deliver_trap() returns true if it could deliver the trap. */
-bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
-{
- /*
- * Trap numbers are always 8 bit, but we set an impossible trap number
- * for traps inside the Switcher, so check that here.
- */
- if (num >= ARRAY_SIZE(cpu->arch.idt))
- return false;
-
- /*
- * Early on the Guest hasn't set the IDT entries (or maybe it put a
- * bogus one in): if we fail here, the Guest will be killed.
- */
- if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
- return false;
- push_guest_interrupt_stack(cpu, has_err(num));
- guest_run_interrupt(cpu, cpu->arch.idt[num].a,
- cpu->arch.idt[num].b);
- return true;
-}
-
-/*H:250
- * Here's the hard part: returning to the Host every time a trap happens
- * and then calling deliver_trap() and re-entering the Guest is slow.
- * Particularly because Guest userspace system calls are traps (usually trap
- * 128).
- *
- * So we'd like to set up the IDT to tell the CPU to deliver traps directly
- * into the Guest. This is possible, but the complexities cause the size of
- * this file to double! However, 150 lines of code is worth writing for taking
- * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all
- * the other hypervisors would beat it up at lunchtime.
- *
- * This routine indicates if a particular trap number could be delivered
- * directly.
- *
- * Unfortunately, Linux 4.6 started using an interrupt gate instead of a
- * trap gate for syscalls, so this trick is ineffective. See Mastery for
- * how we could do this anyway...
- */
-static bool direct_trap(unsigned int num)
-{
- /*
- * Hardware interrupts don't go to the Guest at all (except system
- * call).
- */
- if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num))
- return false;
-
- /*
- * The Host needs to see page faults (for shadow paging and to save the
- * fault address), general protection faults (in/out emulation) and
- * device not available (TS handling) and of course, the hypercall trap.
- */
- return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
-}
-/*:*/
-
-/*M:005
- * The Guest has the ability to turn its interrupt gates into trap gates,
- * if it is careful. The Host will let trap gates can go directly to the
- * Guest, but the Guest needs the interrupts atomically disabled for an
- * interrupt gate. The Host could provide a mechanism to register more
- * "no-interrupt" regions, and the Guest could point the trap gate at
- * instructions within that region, where it can safely disable interrupts.
- */
-
-/*M:006
- * The Guests do not use the sysenter (fast system call) instruction,
- * because it's hardcoded to enter privilege level 0 and so can't go direct.
- * It's about twice as fast as the older "int 0x80" system call, so it might
- * still be worthwhile to handle it in the Switcher and lcall down to the
- * Guest. The sysenter semantics are hairy tho: search for that keyword in
- * entry.S
-:*/
-
-/*H:260
- * When we make traps go directly into the Guest, we need to make sure
- * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the
- * CPU trying to deliver the trap will fault while trying to push the interrupt
- * words on the stack: this is called a double fault, and it forces us to kill
- * the Guest.
- *
- * Which is deeply unfair, because (literally!) it wasn't the Guests' fault.
- */
-void pin_stack_pages(struct lg_cpu *cpu)
-{
- unsigned int i;
-
- /*
- * Depending on the CONFIG_4KSTACKS option, the Guest can have one or
- * two pages of stack space.
- */
- for (i = 0; i < cpu->lg->stack_pages; i++)
- /*
- * The stack grows *upwards*, so the address we're given is the
- * start of the page after the kernel stack. Subtract one to
- * get back onto the first stack page, and keep subtracting to
- * get to the rest of the stack pages.
- */
- pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE);
-}
-
-/*
- * Direct traps also mean that we need to know whenever the Guest wants to use
- * a different kernel stack, so we can change the guest TSS to use that
- * stack. The TSS entries expect a virtual address, so unlike most addresses
- * the Guest gives us, the "esp" (stack pointer) value here is virtual, not
- * physical.
- *
- * In Linux each process has its own kernel stack, so this happens a lot: we
- * change stacks on each context switch.
- */
-void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
-{
- /*
- * You're not allowed a stack segment with privilege level 0: bad Guest!
- */
- if ((seg & 0x3) != GUEST_PL)
- kill_guest(cpu, "bad stack segment %i", seg);
- /* We only expect one or two stack pages. */
- if (pages > 2)
- kill_guest(cpu, "bad stack pages %u", pages);
- /* Save where the stack is, and how many pages */
- cpu->ss1 = seg;
- cpu->esp1 = esp;
- cpu->lg->stack_pages = pages;
- /* Make sure the new stack pages are mapped */
- pin_stack_pages(cpu);
-}
-
-/*
- * All this reference to mapping stacks leads us neatly into the other complex
- * part of the Host: page table handling.
- */
-
-/*H:235
- * This is the routine which actually checks the Guest's IDT entry and
- * transfers it into the entry in "struct lguest":
- */
-static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
- unsigned int num, u32 lo, u32 hi)
-{
- u8 type = idt_type(lo, hi);
-
- /* We zero-out a not-present entry */
- if (!idt_present(lo, hi)) {
- trap->a = trap->b = 0;
- return;
- }
-
- /* We only support interrupt and trap gates. */
- if (type != 0xE && type != 0xF)
- kill_guest(cpu, "bad IDT type %i", type);
-
- /*
- * We only copy the handler address, present bit, privilege level and
- * type. The privilege level controls where the trap can be triggered
- * manually with an "int" instruction. This is usually GUEST_PL,
- * except for system calls which userspace can use.
- */
- trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
- trap->b = (hi&0xFFFFEF00);
-}
-
-/*H:230
- * While we're here, dealing with delivering traps and interrupts to the
- * Guest, we might as well complete the picture: how the Guest tells us where
- * it wants them to go. This would be simple, except making traps fast
- * requires some tricks.
- *
- * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
- * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here.
- */
-void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
-{
- /*
- * Guest never handles: NMI, doublefault, spurious interrupt or
- * hypercall. We ignore when it tries to set them.
- */
- if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
- return;
-
- /*
- * Mark the IDT as changed: next time the Guest runs we'll know we have
- * to copy this again.
- */
- cpu->changed |= CHANGED_IDT;
-
- /* Check that the Guest doesn't try to step outside the bounds. */
- if (num >= ARRAY_SIZE(cpu->arch.idt))
- kill_guest(cpu, "Setting idt entry %u", num);
- else
- set_trap(cpu, &cpu->arch.idt[num], num, lo, hi);
-}
-
-/*
- * The default entry for each interrupt points into the Switcher routines which
- * simply return to the Host. The run_guest() loop will then call
- * deliver_trap() to bounce it back into the Guest.
- */
-static void default_idt_entry(struct desc_struct *idt,
- int trap,
- const unsigned long handler,
- const struct desc_struct *base)
-{
- /* A present interrupt gate. */
- u32 flags = 0x8e00;
-
- /*
- * Set the privilege level on the entry for the hypercall: this allows
- * the Guest to use the "int" instruction to trigger it.
- */
- if (trap == LGUEST_TRAP_ENTRY)
- flags |= (GUEST_PL << 13);
- else if (base)
- /*
- * Copy privilege level from what Guest asked for. This allows
- * debug (int 3) traps from Guest userspace, for example.
- */
- flags |= (base->b & 0x6000);
-
- /* Now pack it into the IDT entry in its weird format. */
- idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
- idt->b = (handler&0xFFFF0000) | flags;
-}
-
-/* When the Guest first starts, we put default entries into the IDT. */
-void setup_default_idt_entries(struct lguest_ro_state *state,
- const unsigned long *def)
-{
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
- default_idt_entry(&state->guest_idt[i], i, def[i], NULL);
-}
-
-/*H:240
- * We don't use the IDT entries in the "struct lguest" directly, instead
- * we copy them into the IDT which we've set up for Guests on this CPU, just
- * before we run the Guest. This routine does that copy.
- */
-void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
- const unsigned long *def)
-{
- unsigned int i;
-
- /*
- * We can simply copy the direct traps, otherwise we use the default
- * ones in the Switcher: they will return to the Host.
- */
- for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) {
- const struct desc_struct *gidt = &cpu->arch.idt[i];
-
- /* If no Guest can ever override this trap, leave it alone. */
- if (!direct_trap(i))
- continue;
-
- /*
- * Only trap gates (type 15) can go direct to the Guest.
- * Interrupt gates (type 14) disable interrupts as they are
- * entered, which we never let the Guest do. Not present
- * entries (type 0x0) also can't go direct, of course.
- *
- * If it can't go direct, we still need to copy the priv. level:
- * they might want to give userspace access to a software
- * interrupt.
- */
- if (idt_type(gidt->a, gidt->b) == 0xF)
- idt[i] = *gidt;
- else
- default_idt_entry(&idt[i], i, def[i], gidt);
- }
-}
-
-/*H:200
- * The Guest Clock.
- *
- * There are two sources of virtual interrupts. We saw one in lguest_user.c:
- * the Launcher sending interrupts for virtual devices. The other is the Guest
- * timer interrupt.
- *
- * The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to
- * the next timer interrupt (in nanoseconds). We use the high-resolution timer
- * infrastructure to set a callback at that time.
- *
- * 0 means "turn off the clock".
- */
-void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
-{
- ktime_t expires;
-
- if (unlikely(delta == 0)) {
- /* Clock event device is shutting down. */
- hrtimer_cancel(&cpu->hrt);
- return;
- }
-
- /*
- * We use wallclock time here, so the Guest might not be running for
- * all the time between now and the timer interrupt it asked for. This
- * is almost always the right thing to do.
- */
- expires = ktime_add_ns(ktime_get_real(), delta);
- hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS);
-}
-
-/* This is the function called when the Guest's timer expires. */
-static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
-{
- struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
-
- /* Remember the first interrupt is the timer interrupt. */
- set_interrupt(cpu, 0);
- return HRTIMER_NORESTART;
-}
-
-/* This sets up the timer for this Guest. */
-void init_clockdev(struct lg_cpu *cpu)
-{
- hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
- cpu->hrt.function = clockdev_fn;
-}
+++ /dev/null
-#ifndef _LGUEST_H
-#define _LGUEST_H
-
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/stringify.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/wait.h>
-#include <linux/hrtimer.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-
-#include <asm/lguest.h>
-
-struct pgdir {
- unsigned long gpgdir;
- bool switcher_mapped;
- int last_host_cpu;
- pgd_t *pgdir;
-};
-
-/* We have two pages shared with guests, per cpu. */
-struct lguest_pages {
- /* This is the stack page mapped rw in guest */
- char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
- struct lguest_regs regs;
-
- /* This is the host state & guest descriptor page, ro in guest */
- struct lguest_ro_state state;
-} __attribute__((aligned(PAGE_SIZE)));
-
-#define CHANGED_IDT 1
-#define CHANGED_GDT 2
-#define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */
-#define CHANGED_ALL 3
-
-struct lg_cpu {
- unsigned int id;
- struct lguest *lg;
- struct task_struct *tsk;
- struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
-
- u32 cr2;
- u32 esp1;
- u16 ss1;
-
- /* Bitmap of what has changed: see CHANGED_* above. */
- int changed;
-
- /* Pending operation. */
- struct lguest_pending pending;
-
- unsigned long *reg_read; /* register from LHREQ_GETREG */
-
- /* At end of a page shared mapped over lguest_pages in guest. */
- unsigned long regs_page;
- struct lguest_regs *regs;
-
- struct lguest_pages *last_pages;
-
- /* Initialization mode: linear map everything. */
- bool linear_pages;
- int cpu_pgd; /* Which pgd this cpu is currently using */
-
- /* If a hypercall was asked for, this points to the arguments. */
- struct hcall_args *hcall;
- u32 next_hcall;
-
- /* Virtual clock device */
- struct hrtimer hrt;
-
- /* Did the Guest tell us to halt? */
- int halted;
-
- /* Pending virtual interrupts */
- DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
-
- struct lg_cpu_arch arch;
-};
-
-/* The private info the thread maintains about the guest. */
-struct lguest {
- struct lguest_data __user *lguest_data;
- struct lg_cpu cpus[NR_CPUS];
- unsigned int nr_cpus;
-
- /* Valid guest memory pages must be < this. */
- u32 pfn_limit;
-
- /* Device memory is >= pfn_limit and < device_limit. */
- u32 device_limit;
-
- /*
- * This provides the offset to the base of guest-physical memory in the
- * Launcher.
- */
- void __user *mem_base;
- unsigned long kernel_address;
-
- struct pgdir pgdirs[4];
-
- unsigned long noirq_iret;
-
- unsigned int stack_pages;
- u32 tsc_khz;
-
- /* Dead? */
- const char *dead;
-};
-
-extern struct mutex lguest_lock;
-
-/* core.c: */
-bool lguest_address_ok(const struct lguest *lg,
- unsigned long addr, unsigned long len);
-void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
-void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
-extern struct page **lg_switcher_pages;
-
-/*H:035
- * Using memory-copy operations like that is usually inconvient, so we
- * have the following helper macros which read and write a specific type (often
- * an unsigned long).
- *
- * This reads into a variable of the given type then returns that.
- */
-#define lgread(cpu, addr, type) \
- ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
-
-/* This checks that the variable is of the given type, then writes it out. */
-#define lgwrite(cpu, addr, type, val) \
- do { \
- typecheck(type, val); \
- __lgwrite((cpu), (addr), &(val), sizeof(val)); \
- } while(0)
-/* (end of memory access helper routines) :*/
-
-int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
-
-/*
- * Helper macros to obtain the first 12 or the last 20 bits, this is only the
- * first step in the migration to the kernel types. pte_pfn is already defined
- * in the kernel.
- */
-#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK)
-#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
-#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK)
-#define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT)
-
-/* interrupts_and_traps.c: */
-unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
-void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
-bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
-void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
- u32 low, u32 hi);
-void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
-void pin_stack_pages(struct lg_cpu *cpu);
-void setup_default_idt_entries(struct lguest_ro_state *state,
- const unsigned long *def);
-void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
- const unsigned long *def);
-void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
-bool send_notify_to_eventfd(struct lg_cpu *cpu);
-void init_clockdev(struct lg_cpu *cpu);
-bool check_syscall_vector(struct lguest *lg);
-bool could_be_syscall(unsigned int num);
-int init_interrupts(void);
-void free_interrupts(void);
-
-/* segments.c: */
-void setup_default_gdt_entries(struct lguest_ro_state *state);
-void setup_guest_gdt(struct lg_cpu *cpu);
-void load_guest_gdt_entry(struct lg_cpu *cpu, unsigned int i,
- u32 low, u32 hi);
-void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
-void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
-void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
-
-/* page_tables.c: */
-int init_guest_pagetable(struct lguest *lg);
-void free_guest_pagetable(struct lguest *lg);
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
-void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
-#ifdef CONFIG_X86_PAE
-void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
-#endif
-void guest_pagetable_clear_all(struct lg_cpu *cpu);
-void guest_pagetable_flush_user(struct lg_cpu *cpu);
-void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
- unsigned long vaddr, pte_t val);
-void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
-bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode,
- unsigned long *iomem);
-void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
-bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr);
-unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
-void page_table_guest_data_init(struct lg_cpu *cpu);
-
-/* <arch>/core.c: */
-void lguest_arch_host_init(void);
-void lguest_arch_host_fini(void);
-void lguest_arch_run_guest(struct lg_cpu *cpu);
-void lguest_arch_handle_trap(struct lg_cpu *cpu);
-int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
-int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
-void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
-unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any);
-
-/* <arch>/switcher.S: */
-extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
-
-/* lguest_user.c: */
-int lguest_device_init(void);
-void lguest_device_remove(void);
-
-/* hypercalls.c: */
-void do_hypercalls(struct lg_cpu *cpu);
-void write_timestamp(struct lg_cpu *cpu);
-
-/*L:035
- * Let's step aside for the moment, to study one important routine that's used
- * widely in the Host code.
- *
- * There are many cases where the Guest can do something invalid, like pass crap
- * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite
- * acceptable to simply terminate the Guest and give the Launcher a nicely
- * formatted reason. It's also simpler for the Guest itself, which doesn't
- * need to check most hypercalls for "success"; if you're still running, it
- * succeeded.
- *
- * Once this is called, the Guest will never run again, so most Host code can
- * call this then continue as if nothing had happened. This means many
- * functions don't have to explicitly return an error code, which keeps the
- * code simple.
- *
- * It also means that this can be called more than once: only the first one is
- * remembered. The only trick is that we still need to kill the Guest even if
- * we can't allocate memory to store the reason. Linux has a neat way of
- * packing error codes into invalid pointers, so we use that here.
- *
- * Like any macro which uses an "if", it is safely wrapped in a run-once "do {
- * } while(0)".
- */
-#define kill_guest(cpu, fmt...) \
-do { \
- if (!(cpu)->lg->dead) { \
- (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \
- if (!(cpu)->lg->dead) \
- (cpu)->lg->dead = ERR_PTR(-ENOMEM); \
- } \
-} while(0)
-/* (End of aside) :*/
-
-#endif /* __ASSEMBLY__ */
-#endif /* _LGUEST_H */
+++ /dev/null
-/*P:200 This contains all the /dev/lguest code, whereby the userspace
- * launcher controls and communicates with the Guest. For example,
- * the first write will tell us the Guest's memory layout and entry
- * point. A read will run the Guest until something happens, such as
- * a signal or the Guest accessing a device.
-:*/
-#include <linux/uaccess.h>
-#include <linux/miscdevice.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/sched/mm.h>
-#include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include "lg.h"
-
-/*L:052
- The Launcher can get the registers, and also set some of them.
-*/
-static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input)
-{
- unsigned long which;
-
- /* We re-use the ptrace structure to specify which register to read. */
- if (get_user(which, input) != 0)
- return -EFAULT;
-
- /*
- * We set up the cpu register pointer, and their next read will
- * actually get the value (instead of running the guest).
- *
- * The last argument 'true' says we can access any register.
- */
- cpu->reg_read = lguest_arch_regptr(cpu, which, true);
- if (!cpu->reg_read)
- return -ENOENT;
-
- /* And because this is a write() call, we return the length used. */
- return sizeof(unsigned long) * 2;
-}
-
-static int setreg(struct lg_cpu *cpu, const unsigned long __user *input)
-{
- unsigned long which, value, *reg;
-
- /* We re-use the ptrace structure to specify which register to read. */
- if (get_user(which, input) != 0)
- return -EFAULT;
- input++;
- if (get_user(value, input) != 0)
- return -EFAULT;
-
- /* The last argument 'false' means we can't access all registers. */
- reg = lguest_arch_regptr(cpu, which, false);
- if (!reg)
- return -ENOENT;
-
- *reg = value;
-
- /* And because this is a write() call, we return the length used. */
- return sizeof(unsigned long) * 3;
-}
-
-/*L:050
- * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
- * number to /dev/lguest.
- */
-static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
-{
- unsigned long irq;
-
- if (get_user(irq, input) != 0)
- return -EFAULT;
- if (irq >= LGUEST_IRQS)
- return -EINVAL;
-
- /*
- * Next time the Guest runs, the core code will see if it can deliver
- * this interrupt.
- */
- set_interrupt(cpu, irq);
- return 0;
-}
-
-/*L:053
- * Deliver a trap: this is used by the Launcher if it can't emulate
- * an instruction.
- */
-static int trap(struct lg_cpu *cpu, const unsigned long __user *input)
-{
- unsigned long trapnum;
-
- if (get_user(trapnum, input) != 0)
- return -EFAULT;
-
- if (!deliver_trap(cpu, trapnum))
- return -EINVAL;
-
- return 0;
-}
-
-/*L:040
- * Once our Guest is initialized, the Launcher makes it run by reading
- * from /dev/lguest.
- */
-static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
-{
- struct lguest *lg = file->private_data;
- struct lg_cpu *cpu;
- unsigned int cpu_id = *o;
-
- /* You must write LHREQ_INITIALIZE first! */
- if (!lg)
- return -EINVAL;
-
- /* Watch out for arbitrary vcpu indexes! */
- if (cpu_id >= lg->nr_cpus)
- return -EINVAL;
-
- cpu = &lg->cpus[cpu_id];
-
- /* If you're not the task which owns the Guest, go away. */
- if (current != cpu->tsk)
- return -EPERM;
-
- /* If the Guest is already dead, we indicate why */
- if (lg->dead) {
- size_t len;
-
- /* lg->dead either contains an error code, or a string. */
- if (IS_ERR(lg->dead))
- return PTR_ERR(lg->dead);
-
- /* We can only return as much as the buffer they read with. */
- len = min(size, strlen(lg->dead)+1);
- if (copy_to_user(user, lg->dead, len) != 0)
- return -EFAULT;
- return len;
- }
-
- /*
- * If we returned from read() last time because the Guest sent I/O,
- * clear the flag.
- */
- if (cpu->pending.trap)
- cpu->pending.trap = 0;
-
- /* Run the Guest until something interesting happens. */
- return run_guest(cpu, (unsigned long __user *)user);
-}
-
-/*L:025
- * This actually initializes a CPU. For the moment, a Guest is only
- * uniprocessor, so "id" is always 0.
- */
-static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
-{
- /* We have a limited number of CPUs in the lguest struct. */
- if (id >= ARRAY_SIZE(cpu->lg->cpus))
- return -EINVAL;
-
- /* Set up this CPU's id, and pointer back to the lguest struct. */
- cpu->id = id;
- cpu->lg = container_of(cpu, struct lguest, cpus[id]);
- cpu->lg->nr_cpus++;
-
- /* Each CPU has a timer it can set. */
- init_clockdev(cpu);
-
- /*
- * We need a complete page for the Guest registers: they are accessible
- * to the Guest and we can only grant it access to whole pages.
- */
- cpu->regs_page = get_zeroed_page(GFP_KERNEL);
- if (!cpu->regs_page)
- return -ENOMEM;
-
- /* We actually put the registers at the end of the page. */
- cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
-
- /*
- * Now we initialize the Guest's registers, handing it the start
- * address.
- */
- lguest_arch_setup_regs(cpu, start_ip);
-
- /*
- * We keep a pointer to the Launcher task (ie. current task) for when
- * other Guests want to wake this one (eg. console input).
- */
- cpu->tsk = current;
-
- /*
- * We need to keep a pointer to the Launcher's memory map, because if
- * the Launcher dies we need to clean it up. If we don't keep a
- * reference, it is destroyed before close() is called.
- */
- cpu->mm = get_task_mm(cpu->tsk);
-
- /*
- * We remember which CPU's pages this Guest used last, for optimization
- * when the same Guest runs on the same CPU twice.
- */
- cpu->last_pages = NULL;
-
- /* No error == success. */
- return 0;
-}
-
-/*L:020
- * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in
- * addition to the LHREQ_INITIALIZE value). These are:
- *
- * base: The start of the Guest-physical memory inside the Launcher memory.
- *
- * pfnlimit: The highest (Guest-physical) page number the Guest should be
- * allowed to access. The Guest memory lives inside the Launcher, so it sets
- * this to ensure the Guest can only reach its own memory.
- *
- * start: The first instruction to execute ("eip" in x86-speak).
- */
-static int initialize(struct file *file, const unsigned long __user *input)
-{
- /* "struct lguest" contains all we (the Host) know about a Guest. */
- struct lguest *lg;
- int err;
- unsigned long args[4];
-
- /*
- * We grab the Big Lguest lock, which protects against multiple
- * simultaneous initializations.
- */
- mutex_lock(&lguest_lock);
- /* You can't initialize twice! Close the device and start again... */
- if (file->private_data) {
- err = -EBUSY;
- goto unlock;
- }
-
- if (copy_from_user(args, input, sizeof(args)) != 0) {
- err = -EFAULT;
- goto unlock;
- }
-
- lg = kzalloc(sizeof(*lg), GFP_KERNEL);
- if (!lg) {
- err = -ENOMEM;
- goto unlock;
- }
-
- /* Populate the easy fields of our "struct lguest" */
- lg->mem_base = (void __user *)args[0];
- lg->pfn_limit = args[1];
- lg->device_limit = args[3];
-
- /* This is the first cpu (cpu 0) and it will start booting at args[2] */
- err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
- if (err)
- goto free_lg;
-
- /*
- * Initialize the Guest's shadow page tables. This allocates
- * memory, so can fail.
- */
- err = init_guest_pagetable(lg);
- if (err)
- goto free_regs;
-
- /* We keep our "struct lguest" in the file's private_data. */
- file->private_data = lg;
-
- mutex_unlock(&lguest_lock);
-
- /* And because this is a write() call, we return the length used. */
- return sizeof(args);
-
-free_regs:
- /* FIXME: This should be in free_vcpu */
- free_page(lg->cpus[0].regs_page);
-free_lg:
- kfree(lg);
-unlock:
- mutex_unlock(&lguest_lock);
- return err;
-}
-
-/*L:010
- * The first operation the Launcher does must be a write. All writes
- * start with an unsigned long number: for the first write this must be
- * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use
- * writes of other values to send interrupts or set up receipt of notifications.
- *
- * Note that we overload the "offset" in the /dev/lguest file to indicate what
- * CPU number we're dealing with. Currently this is always 0 since we only
- * support uniprocessor Guests, but you can see the beginnings of SMP support
- * here.
- */
-static ssize_t write(struct file *file, const char __user *in,
- size_t size, loff_t *off)
-{
- /*
- * Once the Guest is initialized, we hold the "struct lguest" in the
- * file private data.
- */
- struct lguest *lg = file->private_data;
- const unsigned long __user *input = (const unsigned long __user *)in;
- unsigned long req;
- struct lg_cpu *uninitialized_var(cpu);
- unsigned int cpu_id = *off;
-
- /* The first value tells us what this request is. */
- if (get_user(req, input) != 0)
- return -EFAULT;
- input++;
-
- /* If you haven't initialized, you must do that first. */
- if (req != LHREQ_INITIALIZE) {
- if (!lg || (cpu_id >= lg->nr_cpus))
- return -EINVAL;
- cpu = &lg->cpus[cpu_id];
-
- /* Once the Guest is dead, you can only read() why it died. */
- if (lg->dead)
- return -ENOENT;
- }
-
- switch (req) {
- case LHREQ_INITIALIZE:
- return initialize(file, input);
- case LHREQ_IRQ:
- return user_send_irq(cpu, input);
- case LHREQ_GETREG:
- return getreg_setup(cpu, input);
- case LHREQ_SETREG:
- return setreg(cpu, input);
- case LHREQ_TRAP:
- return trap(cpu, input);
- default:
- return -EINVAL;
- }
-}
-
-static int open(struct inode *inode, struct file *file)
-{
- file->private_data = NULL;
-
- return 0;
-}
-
-/*L:060
- * The final piece of interface code is the close() routine. It reverses
- * everything done in initialize(). This is usually called because the
- * Launcher exited.
- *
- * Note that the close routine returns 0 or a negative error number: it can't
- * really fail, but it can whine. I blame Sun for this wart, and K&R C for
- * letting them do it.
-:*/
-static int close(struct inode *inode, struct file *file)
-{
- struct lguest *lg = file->private_data;
- unsigned int i;
-
- /* If we never successfully initialized, there's nothing to clean up */
- if (!lg)
- return 0;
-
- /*
- * We need the big lock, to protect from inter-guest I/O and other
- * Launchers initializing guests.
- */
- mutex_lock(&lguest_lock);
-
- /* Free up the shadow page tables for the Guest. */
- free_guest_pagetable(lg);
-
- for (i = 0; i < lg->nr_cpus; i++) {
- /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
- hrtimer_cancel(&lg->cpus[i].hrt);
- /* We can free up the register page we allocated. */
- free_page(lg->cpus[i].regs_page);
- /*
- * Now all the memory cleanups are done, it's safe to release
- * the Launcher's memory management structure.
- */
- mmput(lg->cpus[i].mm);
- }
-
- /*
- * If lg->dead doesn't contain an error code it will be NULL or a
- * kmalloc()ed string, either of which is ok to hand to kfree().
- */
- if (!IS_ERR(lg->dead))
- kfree(lg->dead);
- /* Free the memory allocated to the lguest_struct */
- kfree(lg);
- /* Release lock and exit. */
- mutex_unlock(&lguest_lock);
-
- return 0;
-}
-
-/*L:000
- * Welcome to our journey through the Launcher!
- *
- * The Launcher is the Host userspace program which sets up, runs and services
- * the Guest. In fact, many comments in the Drivers which refer to "the Host"
- * doing things are inaccurate: the Launcher does all the device handling for
- * the Guest, but the Guest can't know that.
- *
- * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we
- * shall see more of that later.
- *
- * We begin our understanding with the Host kernel interface which the Launcher
- * uses: reading and writing a character device called /dev/lguest. All the
- * work happens in the read(), write() and close() routines:
- */
-static const struct file_operations lguest_fops = {
- .owner = THIS_MODULE,
- .open = open,
- .release = close,
- .write = write,
- .read = read,
- .llseek = default_llseek,
-};
-/*:*/
-
-/*
- * This is a textbook example of a "misc" character device. Populate a "struct
- * miscdevice" and register it with misc_register().
- */
-static struct miscdevice lguest_dev = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "lguest",
- .fops = &lguest_fops,
-};
-
-int __init lguest_device_init(void)
-{
- return misc_register(&lguest_dev);
-}
-
-void __exit lguest_device_remove(void)
-{
- misc_deregister(&lguest_dev);
-}
+++ /dev/null
-/*P:700
- * The pagetable code, on the other hand, still shows the scars of
- * previous encounters. It's functional, and as neat as it can be in the
- * circumstances, but be wary, for these things are subtle and break easily.
- * The Guest provides a virtual to physical mapping, but we can neither trust
- * it nor use it: we verify and convert it here then point the CPU to the
- * converted Guest pages when running the Guest.
-:*/
-
-/* Copyright (C) Rusty Russell IBM Corporation 2013.
- * GPL v2 and any later version */
-#include <linux/mm.h>
-#include <linux/gfp.h>
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/random.h>
-#include <linux/percpu.h>
-#include <asm/tlbflush.h>
-#include <linux/uaccess.h>
-#include "lg.h"
-
-/*M:008
- * We hold reference to pages, which prevents them from being swapped.
- * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
- * to swap out. If we had this, and a shrinker callback to trim PTE pages, we
- * could probably consider launching Guests as non-root.
-:*/
-
-/*H:300
- * The Page Table Code
- *
- * We use two-level page tables for the Guest, or three-level with PAE. If
- * you're not entirely comfortable with virtual addresses, physical addresses
- * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page
- * Table Handling" (with diagrams!).
- *
- * The Guest keeps page tables, but we maintain the actual ones here: these are
- * called "shadow" page tables. Which is a very Guest-centric name: these are
- * the real page tables the CPU uses, although we keep them up to date to
- * reflect the Guest's. (See what I mean about weird naming? Since when do
- * shadows reflect anything?)
- *
- * Anyway, this is the most complicated part of the Host code. There are seven
- * parts to this:
- * (i) Looking up a page table entry when the Guest faults,
- * (ii) Making sure the Guest stack is mapped,
- * (iii) Setting up a page table entry when the Guest tells us one has changed,
- * (iv) Switching page tables,
- * (v) Flushing (throwing away) page tables,
- * (vi) Mapping the Switcher when the Guest is about to run,
- * (vii) Setting up the page tables initially.
-:*/
-
-/*
- * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB)
- * or 512 PTE entries with PAE (2MB).
- */
-#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
-
-/*
- * For PAE we need the PMD index as well. We use the last 2MB, so we
- * will need the last pmd entry of the last pmd page.
- */
-#ifdef CONFIG_X86_PAE
-#define CHECK_GPGD_MASK _PAGE_PRESENT
-#else
-#define CHECK_GPGD_MASK _PAGE_TABLE
-#endif
-
-/*H:320
- * The page table code is curly enough to need helper functions to keep it
- * clear and clean. The kernel itself provides many of them; one advantage
- * of insisting that the Guest and Host use the same CONFIG_X86_PAE setting.
- *
- * There are two functions which return pointers to the shadow (aka "real")
- * page tables.
- *
- * spgd_addr() takes the virtual address and returns a pointer to the top-level
- * page directory entry (PGD) for that address. Since we keep track of several
- * page tables, the "i" argument tells us which one we're interested in (it's
- * usually the current one).
- */
-static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
-{
- unsigned int index = pgd_index(vaddr);
-
- /* Return a pointer index'th pgd entry for the i'th page table. */
- return &cpu->lg->pgdirs[i].pgdir[index];
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * This routine then takes the PGD entry given above, which contains the
- * address of the PMD page. It then returns a pointer to the PMD entry for the
- * given address.
- */
-static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
-{
- unsigned int index = pmd_index(vaddr);
- pmd_t *page;
-
- /* You should never call this if the PGD entry wasn't valid */
- BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
- page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
-
- return &page[index];
-}
-#endif
-
-/*
- * This routine then takes the page directory entry returned above, which
- * contains the address of the page table entry (PTE) page. It then returns a
- * pointer to the PTE entry for the given address.
- */
-static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
-{
-#ifdef CONFIG_X86_PAE
- pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
- pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);
-
- /* You should never call this if the PMD entry wasn't valid */
- BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
-#else
- pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
- /* You should never call this if the PGD entry wasn't valid */
- BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
-#endif
-
- return &page[pte_index(vaddr)];
-}
-
-/*
- * These functions are just like the above, except they access the Guest
- * page tables. Hence they return a Guest address.
- */
-static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
-{
- unsigned int index = vaddr >> (PGDIR_SHIFT);
- return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
-}
-
-#ifdef CONFIG_X86_PAE
-/* Follow the PGD to the PMD. */
-static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
-{
- unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
- BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
- return gpage + pmd_index(vaddr) * sizeof(pmd_t);
-}
-
-/* Follow the PMD to the PTE. */
-static unsigned long gpte_addr(struct lg_cpu *cpu,
- pmd_t gpmd, unsigned long vaddr)
-{
- unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
-
- BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
- return gpage + pte_index(vaddr) * sizeof(pte_t);
-}
-#else
-/* Follow the PGD to the PTE (no mid-level for !PAE). */
-static unsigned long gpte_addr(struct lg_cpu *cpu,
- pgd_t gpgd, unsigned long vaddr)
-{
- unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
-
- BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
- return gpage + pte_index(vaddr) * sizeof(pte_t);
-}
-#endif
-/*:*/
-
-/*M:007
- * get_pfn is slow: we could probably try to grab batches of pages here as
- * an optimization (ie. pre-faulting).
-:*/
-
-/*H:350
- * This routine takes a page number given by the Guest and converts it to
- * an actual, physical page number. It can fail for several reasons: the
- * virtual address might not be mapped by the Launcher, the write flag is set
- * and the page is read-only, or the write flag was set and the page was
- * shared so had to be copied, but we ran out of memory.
- *
- * This holds a reference to the page, so release_pte() is careful to put that
- * back.
- */
-static unsigned long get_pfn(unsigned long virtpfn, int write)
-{
- struct page *page;
-
- /* gup me one page at this address please! */
- if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1)
- return page_to_pfn(page);
-
- /* This value indicates failure. */
- return -1UL;
-}
-
-/*H:340
- * Converting a Guest page table entry to a shadow (ie. real) page table
- * entry can be a little tricky. The flags are (almost) the same, but the
- * Guest PTE contains a virtual page number: the CPU needs the real page
- * number.
- */
-static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
-{
- unsigned long pfn, base, flags;
-
- /*
- * The Guest sets the global flag, because it thinks that it is using
- * PGE. We only told it to use PGE so it would tell us whether it was
- * flushing a kernel mapping or a userspace mapping. We don't actually
- * use the global bit, so throw it away.
- */
- flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
-
- /* The Guest's pages are offset inside the Launcher. */
- base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
-
- /*
- * We need a temporary "unsigned long" variable to hold the answer from
- * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
- * fit in spte.pfn. get_pfn() finds the real physical number of the
- * page, given the virtual number.
- */
- pfn = get_pfn(base + pte_pfn(gpte), write);
- if (pfn == -1UL) {
- kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
- /*
- * When we destroy the Guest, we'll go through the shadow page
- * tables and release_pte() them. Make sure we don't think
- * this one is valid!
- */
- flags = 0;
- }
- /* Now we assemble our shadow PTE from the page number and flags. */
- return pfn_pte(pfn, __pgprot(flags));
-}
-
-/*H:460 And to complete the chain, release_pte() looks like this: */
-static void release_pte(pte_t pte)
-{
- /*
- * Remember that get_user_pages_fast() took a reference to the page, in
- * get_pfn()? We have to put it back now.
- */
- if (pte_flags(pte) & _PAGE_PRESENT)
- put_page(pte_page(pte));
-}
-/*:*/
-
-static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte)
-{
- /* We don't handle large pages. */
- if (pte_flags(gpte) & _PAGE_PSE)
- return false;
-
- return (pte_pfn(gpte) >= cpu->lg->pfn_limit
- && pte_pfn(gpte) < cpu->lg->device_limit);
-}
-
-static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
-{
- if ((pte_flags(gpte) & _PAGE_PSE) ||
- pte_pfn(gpte) >= cpu->lg->pfn_limit) {
- kill_guest(cpu, "bad page table entry");
- return false;
- }
- return true;
-}
-
-static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
-{
- if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
- (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
- kill_guest(cpu, "bad page directory entry");
- return false;
- }
- return true;
-}
-
-#ifdef CONFIG_X86_PAE
-static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
-{
- if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
- (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
- kill_guest(cpu, "bad page middle directory entry");
- return false;
- }
- return true;
-}
-#endif
-
-/*H:331
- * This is the core routine to walk the shadow page tables and find the page
- * table entry for a specific address.
- *
- * If allocate is set, then we allocate any missing levels, setting the flags
- * on the new page directory and mid-level directories using the arguments
- * (which are copied from the Guest's page table entries).
- */
-static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
- int pgd_flags, int pmd_flags)
-{
- pgd_t *spgd;
- /* Mid level for PAE. */
-#ifdef CONFIG_X86_PAE
- pmd_t *spmd;
-#endif
-
- /* Get top level entry. */
- spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
- if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
- /* No shadow entry: allocate a new shadow PTE page. */
- unsigned long ptepage;
-
- /* If they didn't want us to allocate anything, stop. */
- if (!allocate)
- return NULL;
-
- ptepage = get_zeroed_page(GFP_KERNEL);
- /*
- * This is not really the Guest's fault, but killing it is
- * simple for this corner case.
- */
- if (!ptepage) {
- kill_guest(cpu, "out of memory allocating pte page");
- return NULL;
- }
- /*
- * And we copy the flags to the shadow PGD entry. The page
- * number in the shadow PGD is the page we just allocated.
- */
- set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
- }
-
- /*
- * Intel's Physical Address Extension actually uses three levels of
- * page tables, so we need to look in the mid-level.
- */
-#ifdef CONFIG_X86_PAE
- /* Now look at the mid-level shadow entry. */
- spmd = spmd_addr(cpu, *spgd, vaddr);
-
- if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
- /* No shadow entry: allocate a new shadow PTE page. */
- unsigned long ptepage;
-
- /* If they didn't want us to allocate anything, stop. */
- if (!allocate)
- return NULL;
-
- ptepage = get_zeroed_page(GFP_KERNEL);
-
- /*
- * This is not really the Guest's fault, but killing it is
- * simple for this corner case.
- */
- if (!ptepage) {
- kill_guest(cpu, "out of memory allocating pmd page");
- return NULL;
- }
-
- /*
- * And we copy the flags to the shadow PMD entry. The page
- * number in the shadow PMD is the page we just allocated.
- */
- set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
- }
-#endif
-
- /* Get the pointer to the shadow PTE entry we're going to set. */
- return spte_addr(cpu, *spgd, vaddr);
-}
-
-/*H:330
- * (i) Looking up a page table entry when the Guest faults.
- *
- * We saw this call in run_guest(): when we see a page fault in the Guest, we
- * come here. That's because we only set up the shadow page tables lazily as
- * they're needed, so we get page faults all the time and quietly fix them up
- * and return to the Guest without it knowing.
- *
- * If we fixed up the fault (ie. we mapped the address), this routine returns
- * true. Otherwise, it was a real fault and we need to tell the Guest.
- *
- * There's a corner case: they're trying to access memory between
- * pfn_limit and device_limit, which is I/O memory. In this case, we
- * return false and set @iomem to the physical address, so the the
- * Launcher can handle the instruction manually.
- */
-bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode,
- unsigned long *iomem)
-{
- unsigned long gpte_ptr;
- pte_t gpte;
- pte_t *spte;
- pmd_t gpmd;
- pgd_t gpgd;
-
- *iomem = 0;
-
- /* We never demand page the Switcher, so trying is a mistake. */
- if (vaddr >= switcher_addr)
- return false;
-
- /* First step: get the top-level Guest page table entry. */
- if (unlikely(cpu->linear_pages)) {
- /* Faking up a linear mapping. */
- gpgd = __pgd(CHECK_GPGD_MASK);
- } else {
- gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
- /* Toplevel not present? We can't map it in. */
- if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
- return false;
-
- /*
- * This kills the Guest if it has weird flags or tries to
- * refer to a "physical" address outside the bounds.
- */
- if (!check_gpgd(cpu, gpgd))
- return false;
- }
-
- /* This "mid-level" entry is only used for non-linear, PAE mode. */
- gpmd = __pmd(_PAGE_TABLE);
-
-#ifdef CONFIG_X86_PAE
- if (likely(!cpu->linear_pages)) {
- gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
- /* Middle level not present? We can't map it in. */
- if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
- return false;
-
- /*
- * This kills the Guest if it has weird flags or tries to
- * refer to a "physical" address outside the bounds.
- */
- if (!check_gpmd(cpu, gpmd))
- return false;
- }
-
- /*
- * OK, now we look at the lower level in the Guest page table: keep its
- * address, because we might update it later.
- */
- gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
-#else
- /*
- * OK, now we look at the lower level in the Guest page table: keep its
- * address, because we might update it later.
- */
- gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
-#endif
-
- if (unlikely(cpu->linear_pages)) {
- /* Linear? Make up a PTE which points to same page. */
- gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
- } else {
- /* Read the actual PTE value. */
- gpte = lgread(cpu, gpte_ptr, pte_t);
- }
-
- /* If this page isn't in the Guest page tables, we can't page it in. */
- if (!(pte_flags(gpte) & _PAGE_PRESENT))
- return false;
-
- /*
- * Check they're not trying to write to a page the Guest wants
- * read-only (bit 2 of errcode == write).
- */
- if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
- return false;
-
- /* User access to a kernel-only page? (bit 3 == user access) */
- if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
- return false;
-
- /* If they're accessing io memory, we expect a fault. */
- if (gpte_in_iomem(cpu, gpte)) {
- *iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
- return false;
- }
-
- /*
- * Check that the Guest PTE flags are OK, and the page number is below
- * the pfn_limit (ie. not mapping the Launcher binary).
- */
- if (!check_gpte(cpu, gpte))
- return false;
-
- /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
- gpte = pte_mkyoung(gpte);
- if (errcode & 2)
- gpte = pte_mkdirty(gpte);
-
- /* Get the pointer to the shadow PTE entry we're going to set. */
- spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
- if (!spte)
- return false;
-
- /*
- * If there was a valid shadow PTE entry here before, we release it.
- * This can happen with a write to a previously read-only entry.
- */
- release_pte(*spte);
-
- /*
- * If this is a write, we insist that the Guest page is writable (the
- * final arg to gpte_to_spte()).
- */
- if (pte_dirty(gpte))
- *spte = gpte_to_spte(cpu, gpte, 1);
- else
- /*
- * If this is a read, don't set the "writable" bit in the page
- * table entry, even if the Guest says it's writable. That way
- * we will come back here when a write does actually occur, so
- * we can update the Guest's _PAGE_DIRTY flag.
- */
- set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
-
- /*
- * Finally, we write the Guest PTE entry back: we've set the
- * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
- */
- if (likely(!cpu->linear_pages))
- lgwrite(cpu, gpte_ptr, pte_t, gpte);
-
- /*
- * The fault is fixed, the page table is populated, the mapping
- * manipulated, the result returned and the code complete. A small
- * delay and a trace of alliteration are the only indications the Guest
- * has that a page fault occurred at all.
- */
- return true;
-}
-
-/*H:360
- * (ii) Making sure the Guest stack is mapped.
- *
- * Remember that direct traps into the Guest need a mapped Guest kernel stack.
- * pin_stack_pages() calls us here: we could simply call demand_page(), but as
- * we've seen that logic is quite long, and usually the stack pages are already
- * mapped, so it's overkill.
- *
- * This is a quick version which answers the question: is this virtual address
- * mapped by the shadow page tables, and is it writable?
- */
-static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
-{
- pte_t *spte;
- unsigned long flags;
-
- /* You can't put your stack in the Switcher! */
- if (vaddr >= switcher_addr)
- return false;
-
- /* If there's no shadow PTE, it's not writable. */
- spte = find_spte(cpu, vaddr, false, 0, 0);
- if (!spte)
- return false;
-
- /*
- * Check the flags on the pte entry itself: it must be present and
- * writable.
- */
- flags = pte_flags(*spte);
- return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
-}
-
-/*
- * So, when pin_stack_pages() asks us to pin a page, we check if it's already
- * in the page tables, and if not, we call demand_page() with error code 2
- * (meaning "write").
- */
-void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
-{
- unsigned long iomem;
-
- if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem))
- kill_guest(cpu, "bad stack page %#lx", vaddr);
-}
-/*:*/
-
-#ifdef CONFIG_X86_PAE
-static void release_pmd(pmd_t *spmd)
-{
- /* If the entry's not present, there's nothing to release. */
- if (pmd_flags(*spmd) & _PAGE_PRESENT) {
- unsigned int i;
- pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
- /* For each entry in the page, we might need to release it. */
- for (i = 0; i < PTRS_PER_PTE; i++)
- release_pte(ptepage[i]);
- /* Now we can free the page of PTEs */
- free_page((long)ptepage);
- /* And zero out the PMD entry so we never release it twice. */
- set_pmd(spmd, __pmd(0));
- }
-}
-
-static void release_pgd(pgd_t *spgd)
-{
- /* If the entry's not present, there's nothing to release. */
- if (pgd_flags(*spgd) & _PAGE_PRESENT) {
- unsigned int i;
- pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-
- for (i = 0; i < PTRS_PER_PMD; i++)
- release_pmd(&pmdpage[i]);
-
- /* Now we can free the page of PMDs */
- free_page((long)pmdpage);
- /* And zero out the PGD entry so we never release it twice. */
- set_pgd(spgd, __pgd(0));
- }
-}
-
-#else /* !CONFIG_X86_PAE */
-/*H:450
- * If we chase down the release_pgd() code, the non-PAE version looks like
- * this. The PAE version is almost identical, but instead of calling
- * release_pte it calls release_pmd(), which looks much like this.
- */
-static void release_pgd(pgd_t *spgd)
-{
- /* If the entry's not present, there's nothing to release. */
- if (pgd_flags(*spgd) & _PAGE_PRESENT) {
- unsigned int i;
- /*
- * Converting the pfn to find the actual PTE page is easy: turn
- * the page number into a physical address, then convert to a
- * virtual address (easy for kernel pages like this one).
- */
- pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
- /* For each entry in the page, we might need to release it. */
- for (i = 0; i < PTRS_PER_PTE; i++)
- release_pte(ptepage[i]);
- /* Now we can free the page of PTEs */
- free_page((long)ptepage);
- /* And zero out the PGD entry so we never release it twice. */
- *spgd = __pgd(0);
- }
-}
-#endif
-
-/*H:445
- * We saw flush_user_mappings() twice: once from the flush_user_mappings()
- * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
- * It simply releases every PTE page from 0 up to the Guest's kernel address.
- */
-static void flush_user_mappings(struct lguest *lg, int idx)
-{
- unsigned int i;
- /* Release every pgd entry up to the kernel's address. */
- for (i = 0; i < pgd_index(lg->kernel_address); i++)
- release_pgd(lg->pgdirs[idx].pgdir + i);
-}
-
-/*H:440
- * (v) Flushing (throwing away) page tables,
- *
- * The Guest has a hypercall to throw away the page tables: it's used when a
- * large number of mappings have been changed.
- */
-void guest_pagetable_flush_user(struct lg_cpu *cpu)
-{
- /* Drop the userspace part of the current page table. */
- flush_user_mappings(cpu->lg, cpu->cpu_pgd);
-}
-/*:*/
-
-/* We walk down the guest page tables to get a guest-physical address */
-bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr)
-{
- pgd_t gpgd;
- pte_t gpte;
-#ifdef CONFIG_X86_PAE
- pmd_t gpmd;
-#endif
-
- /* Still not set up? Just map 1:1. */
- if (unlikely(cpu->linear_pages)) {
- *paddr = vaddr;
- return true;
- }
-
- /* First step: get the top-level Guest page table entry. */
- gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
- /* Toplevel not present? We can't map it in. */
- if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
- goto fail;
-
-#ifdef CONFIG_X86_PAE
- gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
- if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
- goto fail;
- gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
-#else
- gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
-#endif
- if (!(pte_flags(gpte) & _PAGE_PRESENT))
- goto fail;
-
- *paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
- return true;
-
-fail:
- *paddr = -1UL;
- return false;
-}
-
-/*
- * This is the version we normally use: kills the Guest if it uses a
- * bad address
- */
-unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
-{
- unsigned long paddr;
-
- if (!__guest_pa(cpu, vaddr, &paddr))
- kill_guest(cpu, "Bad address %#lx", vaddr);
- return paddr;
-}
-
-/*
- * We keep several page tables. This is a simple routine to find the page
- * table (if any) corresponding to this top-level address the Guest has given
- * us.
- */
-static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
-{
- unsigned int i;
- for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
- if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable)
- break;
- return i;
-}
-
-/*H:435
- * And this is us, creating the new page directory. If we really do
- * allocate a new one (and so the kernel parts are not there), we set
- * blank_pgdir.
- */
-static unsigned int new_pgdir(struct lg_cpu *cpu,
- unsigned long gpgdir,
- int *blank_pgdir)
-{
- unsigned int next;
-
- /*
- * We pick one entry at random to throw out. Choosing the Least
- * Recently Used might be better, but this is easy.
- */
- next = prandom_u32() % ARRAY_SIZE(cpu->lg->pgdirs);
- /* If it's never been allocated at all before, try now. */
- if (!cpu->lg->pgdirs[next].pgdir) {
- cpu->lg->pgdirs[next].pgdir =
- (pgd_t *)get_zeroed_page(GFP_KERNEL);
- /* If the allocation fails, just keep using the one we have */
- if (!cpu->lg->pgdirs[next].pgdir)
- next = cpu->cpu_pgd;
- else {
- /*
- * This is a blank page, so there are no kernel
- * mappings: caller must map the stack!
- */
- *blank_pgdir = 1;
- }
- }
- /* Record which Guest toplevel this shadows. */
- cpu->lg->pgdirs[next].gpgdir = gpgdir;
- /* Release all the non-kernel mappings. */
- flush_user_mappings(cpu->lg, next);
-
- /* This hasn't run on any CPU at all. */
- cpu->lg->pgdirs[next].last_host_cpu = -1;
-
- return next;
-}
-
-/*H:501
- * We do need the Switcher code mapped at all times, so we allocate that
- * part of the Guest page table here. We map the Switcher code immediately,
- * but defer mapping of the guest register page and IDT/LDT etc page until
- * just before we run the guest in map_switcher_in_guest().
- *
- * We *could* do this setup in map_switcher_in_guest(), but at that point
- * we've interrupts disabled, and allocating pages like that is fraught: we
- * can't sleep if we need to free up some memory.
- */
-static bool allocate_switcher_mapping(struct lg_cpu *cpu)
-{
- int i;
-
- for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
- pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
- CHECK_GPGD_MASK, _PAGE_TABLE);
- if (!pte)
- return false;
-
- /*
- * Map the switcher page if not already there. It might
- * already be there because we call allocate_switcher_mapping()
- * in guest_set_pgd() just in case it did discard our Switcher
- * mapping, but it probably didn't.
- */
- if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
- /* Get a reference to the Switcher page. */
- get_page(lg_switcher_pages[0]);
- /* Create a read-only, exectuable, kernel-style PTE */
- set_pte(pte,
- mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
- }
- }
- cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
- return true;
-}
-
-/*H:470
- * Finally, a routine which throws away everything: all PGD entries in all
- * the shadow page tables, including the Guest's kernel mappings. This is used
- * when we destroy the Guest.
- */
-static void release_all_pagetables(struct lguest *lg)
-{
- unsigned int i, j;
-
- /* Every shadow pagetable this Guest has */
- for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
- if (!lg->pgdirs[i].pgdir)
- continue;
-
- /* Every PGD entry. */
- for (j = 0; j < PTRS_PER_PGD; j++)
- release_pgd(lg->pgdirs[i].pgdir + j);
- lg->pgdirs[i].switcher_mapped = false;
- lg->pgdirs[i].last_host_cpu = -1;
- }
-}
-
-/*
- * We also throw away everything when a Guest tells us it's changed a kernel
- * mapping. Since kernel mappings are in every page table, it's easiest to
- * throw them all away. This traps the Guest in amber for a while as
- * everything faults back in, but it's rare.
- */
-void guest_pagetable_clear_all(struct lg_cpu *cpu)
-{
- release_all_pagetables(cpu->lg);
- /* We need the Guest kernel stack mapped again. */
- pin_stack_pages(cpu);
- /* And we need Switcher allocated. */
- if (!allocate_switcher_mapping(cpu))
- kill_guest(cpu, "Cannot populate switcher mapping");
-}
-
-/*H:430
- * (iv) Switching page tables
- *
- * Now we've seen all the page table setting and manipulation, let's see
- * what happens when the Guest changes page tables (ie. changes the top-level
- * pgdir). This occurs on almost every context switch.
- */
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
-{
- int newpgdir, repin = 0;
-
- /*
- * The very first time they call this, we're actually running without
- * any page tables; we've been making it up. Throw them away now.
- */
- if (unlikely(cpu->linear_pages)) {
- release_all_pagetables(cpu->lg);
- cpu->linear_pages = false;
- /* Force allocation of a new pgdir. */
- newpgdir = ARRAY_SIZE(cpu->lg->pgdirs);
- } else {
- /* Look to see if we have this one already. */
- newpgdir = find_pgdir(cpu->lg, pgtable);
- }
-
- /*
- * If not, we allocate or mug an existing one: if it's a fresh one,
- * repin gets set to 1.
- */
- if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
- newpgdir = new_pgdir(cpu, pgtable, &repin);
- /* Change the current pgd index to the new one. */
- cpu->cpu_pgd = newpgdir;
- /*
- * If it was completely blank, we map in the Guest kernel stack and
- * the Switcher.
- */
- if (repin)
- pin_stack_pages(cpu);
-
- if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
- if (!allocate_switcher_mapping(cpu))
- kill_guest(cpu, "Cannot populate switcher mapping");
- }
-}
-/*:*/
-
-/*M:009
- * Since we throw away all mappings when a kernel mapping changes, our
- * performance sucks for guests using highmem. In fact, a guest with
- * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
- * usually slower than a Guest with less memory.
- *
- * This, of course, cannot be fixed. It would take some kind of... well, I
- * don't know, but the term "puissant code-fu" comes to mind.
-:*/
-
-/*H:420
- * This is the routine which actually sets the page table entry for then
- * "idx"'th shadow page table.
- *
- * Normally, we can just throw out the old entry and replace it with 0: if they
- * use it demand_page() will put the new entry in. We need to do this anyway:
- * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
- * is read from, and _PAGE_DIRTY when it's written to.
- *
- * But Avi Kivity pointed out that most Operating Systems (Linux included) set
- * these bits on PTEs immediately anyway. This is done to save the CPU from
- * having to update them, but it helps us the same way: if they set
- * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
- * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
- */
-static void __guest_set_pte(struct lg_cpu *cpu, int idx,
- unsigned long vaddr, pte_t gpte)
-{
- /* Look up the matching shadow page directory entry. */
- pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
-#ifdef CONFIG_X86_PAE
- pmd_t *spmd;
-#endif
-
- /* If the top level isn't present, there's no entry to update. */
- if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-#ifdef CONFIG_X86_PAE
- spmd = spmd_addr(cpu, *spgd, vaddr);
- if (pmd_flags(*spmd) & _PAGE_PRESENT) {
-#endif
- /* Otherwise, start by releasing the existing entry. */
- pte_t *spte = spte_addr(cpu, *spgd, vaddr);
- release_pte(*spte);
-
- /*
- * If they're setting this entry as dirty or accessed,
- * we might as well put that entry they've given us in
- * now. This shaves 10% off a copy-on-write
- * micro-benchmark.
- */
- if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED))
- && !gpte_in_iomem(cpu, gpte)) {
- if (!check_gpte(cpu, gpte))
- return;
- set_pte(spte,
- gpte_to_spte(cpu, gpte,
- pte_flags(gpte) & _PAGE_DIRTY));
- } else {
- /*
- * Otherwise kill it and we can demand_page()
- * it in later.
- */
- set_pte(spte, __pte(0));
- }
-#ifdef CONFIG_X86_PAE
- }
-#endif
- }
-}
-
-/*H:410
- * Updating a PTE entry is a little trickier.
- *
- * We keep track of several different page tables (the Guest uses one for each
- * process, so it makes sense to cache at least a few). Each of these have
- * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
- * all processes. So when the page table above that address changes, we update
- * all the page tables, not just the current one. This is rare.
- *
- * The benefit is that when we have to track a new page table, we can keep all
- * the kernel mappings. This speeds up context switch immensely.
- */
-void guest_set_pte(struct lg_cpu *cpu,
- unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
-{
- /* We don't let you remap the Switcher; we need it to get back! */
- if (vaddr >= switcher_addr) {
- kill_guest(cpu, "attempt to set pte into Switcher pages");
- return;
- }
-
- /*
- * Kernel mappings must be changed on all top levels. Slow, but doesn't
- * happen often.
- */
- if (vaddr >= cpu->lg->kernel_address) {
- unsigned int i;
- for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
- if (cpu->lg->pgdirs[i].pgdir)
- __guest_set_pte(cpu, i, vaddr, gpte);
- } else {
- /* Is this page table one we have a shadow for? */
- int pgdir = find_pgdir(cpu->lg, gpgdir);
- if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
- /* If so, do the update. */
- __guest_set_pte(cpu, pgdir, vaddr, gpte);
- }
-}
-
-/*H:400
- * (iii) Setting up a page table entry when the Guest tells us one has changed.
- *
- * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
- * with the other side of page tables while we're here: what happens when the
- * Guest asks for a page table to be updated?
- *
- * We already saw that demand_page() will fill in the shadow page tables when
- * needed, so we can simply remove shadow page table entries whenever the Guest
- * tells us they've changed. When the Guest tries to use the new entry it will
- * fault and demand_page() will fix it up.
- *
- * So with that in mind here's our code to update a (top-level) PGD entry:
- */
-void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
-{
- int pgdir;
-
- if (idx > PTRS_PER_PGD) {
- kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
- idx, PTRS_PER_PGD);
- return;
- }
-
- /* If they're talking about a page table we have a shadow for... */
- pgdir = find_pgdir(lg, gpgdir);
- if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
- /* ... throw it away. */
- release_pgd(lg->pgdirs[pgdir].pgdir + idx);
- /* That might have been the Switcher mapping, remap it. */
- if (!allocate_switcher_mapping(&lg->cpus[0])) {
- kill_guest(&lg->cpus[0],
- "Cannot populate switcher mapping");
- }
- lg->pgdirs[pgdir].last_host_cpu = -1;
- }
-}
-
-#ifdef CONFIG_X86_PAE
-/* For setting a mid-level, we just throw everything away. It's easy. */
-void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
-{
- guest_pagetable_clear_all(&lg->cpus[0]);
-}
-#endif
-
-/*H:500
- * (vii) Setting up the page tables initially.
- *
- * When a Guest is first created, set initialize a shadow page table which
- * we will populate on future faults. The Guest doesn't have any actual
- * pagetables yet, so we set linear_pages to tell demand_page() to fake it
- * for the moment.
- *
- * We do need the Switcher to be mapped at all times, so we allocate that
- * part of the Guest page table here.
- */
-int init_guest_pagetable(struct lguest *lg)
-{
- struct lg_cpu *cpu = &lg->cpus[0];
- int allocated = 0;
-
- /* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
- cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
- if (!allocated)
- return -ENOMEM;
-
- /* We start with a linear mapping until the initialize. */
- cpu->linear_pages = true;
-
- /* Allocate the page tables for the Switcher. */
- if (!allocate_switcher_mapping(cpu)) {
- release_all_pagetables(lg);
- return -ENOMEM;
- }
-
- return 0;
-}
-
-/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
-void page_table_guest_data_init(struct lg_cpu *cpu)
-{
- /*
- * We tell the Guest that it can't use the virtual addresses
- * used by the Switcher. This trick is equivalent to 4GB -
- * switcher_addr.
- */
- u32 top = ~switcher_addr + 1;
-
- /* We get the kernel address: above this is all kernel memory. */
- if (get_user(cpu->lg->kernel_address,
- &cpu->lg->lguest_data->kernel_address)
- /*
- * We tell the Guest that it can't use the top virtual
- * addresses (used by the Switcher).
- */
- || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
- kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
- return;
- }
-
- /*
- * In flush_user_mappings() we loop from 0 to
- * "pgd_index(lg->kernel_address)". This assumes it won't hit the
- * Switcher mappings, so check that now.
- */
- if (cpu->lg->kernel_address >= switcher_addr)
- kill_guest(cpu, "bad kernel address %#lx",
- cpu->lg->kernel_address);
-}
-
-/* When a Guest dies, our cleanup is fairly simple. */
-void free_guest_pagetable(struct lguest *lg)
-{
- unsigned int i;
-
- /* Throw away all page table pages. */
- release_all_pagetables(lg);
- /* Now free the top levels: free_page() can handle 0 just fine. */
- for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
- free_page((long)lg->pgdirs[i].pgdir);
-}
-
-/*H:481
- * This clears the Switcher mappings for cpu #i.
- */
-static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
-{
- unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
- pte_t *pte;
-
- /* Clear the mappings for both pages. */
- pte = find_spte(cpu, base, false, 0, 0);
- release_pte(*pte);
- set_pte(pte, __pte(0));
-
- pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
- release_pte(*pte);
- set_pte(pte, __pte(0));
-}
-
-/*H:480
- * (vi) Mapping the Switcher when the Guest is about to run.
- *
- * The Switcher and the two pages for this CPU need to be visible in the Guest
- * (and not the pages for other CPUs).
- *
- * The pages for the pagetables have all been allocated before: we just need
- * to make sure the actual PTEs are up-to-date for the CPU we're about to run
- * on.
- */
-void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
- unsigned long base;
- struct page *percpu_switcher_page, *regs_page;
- pte_t *pte;
- struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
-
- /* Switcher page should always be mapped by now! */
- BUG_ON(!pgdir->switcher_mapped);
-
- /*
- * Remember that we have two pages for each Host CPU, so we can run a
- * Guest on each CPU without them interfering. We need to make sure
- * those pages are mapped correctly in the Guest, but since we usually
- * run on the same CPU, we cache that, and only update the mappings
- * when we move.
- */
- if (pgdir->last_host_cpu == raw_smp_processor_id())
- return;
-
- /* -1 means unknown so we remove everything. */
- if (pgdir->last_host_cpu == -1) {
- unsigned int i;
- for_each_possible_cpu(i)
- remove_switcher_percpu_map(cpu, i);
- } else {
- /* We know exactly what CPU mapping to remove. */
- remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
- }
-
- /*
- * When we're running the Guest, we want the Guest's "regs" page to
- * appear where the first Switcher page for this CPU is. This is an
- * optimization: when the Switcher saves the Guest registers, it saves
- * them into the first page of this CPU's "struct lguest_pages": if we
- * make sure the Guest's register page is already mapped there, we
- * don't have to copy them out again.
- */
- /* Find the shadow PTE for this regs page. */
- base = switcher_addr + PAGE_SIZE
- + raw_smp_processor_id() * sizeof(struct lguest_pages);
- pte = find_spte(cpu, base, false, 0, 0);
- regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
- get_page(regs_page);
- set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
-
- /*
- * We map the second page of the struct lguest_pages read-only in
- * the Guest: the IDT, GDT and other things it's not supposed to
- * change.
- */
- pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
- percpu_switcher_page
- = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
- get_page(percpu_switcher_page);
- set_pte(pte, mk_pte(percpu_switcher_page,
- __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
-
- pgdir->last_host_cpu = raw_smp_processor_id();
-}
-
-/*H:490
- * We've made it through the page table code. Perhaps our tired brains are
- * still processing the details, or perhaps we're simply glad it's over.
- *
- * If nothing else, note that all this complexity in juggling shadow page tables
- * in sync with the Guest's page tables is for one reason: for most Guests this
- * page table dance determines how bad performance will be. This is why Xen
- * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD
- * have implemented shadow page table support directly into hardware.
- *
- * There is just one file remaining in the Host.
- */
+++ /dev/null
-/*P:600
- * The x86 architecture has segments, which involve a table of descriptors
- * which can be used to do funky things with virtual address interpretation.
- * We originally used to use segments so the Guest couldn't alter the
- * Guest<->Host Switcher, and then we had to trim Guest segments, and restore
- * for userspace per-thread segments, but trim again for on userspace->kernel
- * transitions... This nightmarish creation was contained within this file,
- * where we knew not to tread without heavy armament and a change of underwear.
- *
- * In these modern times, the segment handling code consists of simple sanity
- * checks, and the worst you'll experience reading this code is butterfly-rash
- * from frolicking through its parklike serenity.
-:*/
-#include "lg.h"
-
-/*H:600
- * Segments & The Global Descriptor Table
- *
- * (That title sounds like a bad Nerdcore group. Not to suggest that there are
- * any good Nerdcore groups, but in high school a friend of mine had a band
- * called Joe Fish and the Chips, so there are definitely worse band names).
- *
- * To refresh: the GDT is a table of 8-byte values describing segments. Once
- * set up, these segments can be loaded into one of the 6 "segment registers".
- *
- * GDT entries are passed around as "struct desc_struct"s, which like IDT
- * entries are split into two 32-bit members, "a" and "b". One day, someone
- * will clean that up, and be declared a Hero. (No pressure, I'm just saying).
- *
- * Anyway, the GDT entry contains a base (the start address of the segment), a
- * limit (the size of the segment - 1), and some flags. Sounds simple, and it
- * would be, except those zany Intel engineers decided that it was too boring
- * to put the base at one end, the limit at the other, and the flags in
- * between. They decided to shotgun the bits at random throughout the 8 bytes,
- * like so:
- *
- * 0 16 40 48 52 56 63
- * [ limit part 1 ][ base part 1 ][ flags ][li][fl][base ]
- * mit ags part 2
- * part 2
- *
- * As a result, this file contains a certain amount of magic numeracy. Let's
- * begin.
- */
-
-/*
- * There are several entries we don't let the Guest set. The TSS entry is the
- * "Task State Segment" which controls all kinds of delicate things. The
- * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the
- * the Guest can't be trusted to deal with double faults.
- */
-static bool ignored_gdt(unsigned int num)
-{
- return (num == GDT_ENTRY_TSS
- || num == GDT_ENTRY_LGUEST_CS
- || num == GDT_ENTRY_LGUEST_DS
- || num == GDT_ENTRY_DOUBLEFAULT_TSS);
-}
-
-/*H:630
- * Once the Guest gave us new GDT entries, we fix them up a little. We
- * don't care if they're invalid: the worst that can happen is a General
- * Protection Fault in the Switcher when it restores a Guest segment register
- * which tries to use that entry. Then we kill the Guest for causing such a
- * mess: the message will be "unhandled trap 256".
- */
-static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
-{
- unsigned int i;
-
- for (i = start; i < end; i++) {
- /*
- * We never copy these ones to real GDT, so we don't care what
- * they say
- */
- if (ignored_gdt(i))
- continue;
-
- /*
- * Segment descriptors contain a privilege level: the Guest is
- * sometimes careless and leaves this as 0, even though it's
- * running at privilege level 1. If so, we fix it here.
- */
- if (cpu->arch.gdt[i].dpl == 0)
- cpu->arch.gdt[i].dpl |= GUEST_PL;
-
- /*
- * Each descriptor has an "accessed" bit. If we don't set it
- * now, the CPU will try to set it when the Guest first loads
- * that entry into a segment register. But the GDT isn't
- * writable by the Guest, so bad things can happen.
- */
- cpu->arch.gdt[i].type |= 0x1;
- }
-}
-
-/*H:610
- * Like the IDT, we never simply use the GDT the Guest gives us. We keep
- * a GDT for each CPU, and copy across the Guest's entries each time we want to
- * run the Guest on that CPU.
- *
- * This routine is called at boot or modprobe time for each CPU to set up the
- * constant GDT entries: the ones which are the same no matter what Guest we're
- * running.
- */
-void setup_default_gdt_entries(struct lguest_ro_state *state)
-{
- struct desc_struct *gdt = state->guest_gdt;
- unsigned long tss = (unsigned long)&state->guest_tss;
-
- /* The Switcher segments are full 0-4G segments, privilege level 0 */
- gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
- gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
-
- /*
- * The TSS segment refers to the TSS entry for this particular CPU.
- */
- gdt[GDT_ENTRY_TSS].a = 0;
- gdt[GDT_ENTRY_TSS].b = 0;
-
- gdt[GDT_ENTRY_TSS].limit0 = 0x67;
- gdt[GDT_ENTRY_TSS].base0 = tss & 0xFFFF;
- gdt[GDT_ENTRY_TSS].base1 = (tss >> 16) & 0xFF;
- gdt[GDT_ENTRY_TSS].base2 = tss >> 24;
- gdt[GDT_ENTRY_TSS].type = 0x9; /* 32-bit TSS (available) */
- gdt[GDT_ENTRY_TSS].p = 0x1; /* Entry is present */
- gdt[GDT_ENTRY_TSS].dpl = 0x0; /* Privilege level 0 */
- gdt[GDT_ENTRY_TSS].s = 0x0; /* system segment */
-
-}
-
-/*
- * This routine sets up the initial Guest GDT for booting. All entries start
- * as 0 (unusable).
- */
-void setup_guest_gdt(struct lg_cpu *cpu)
-{
- /*
- * Start with full 0-4G segments...except the Guest is allowed to use
- * them, so set the privilege level appropriately in the flags.
- */
- cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
- cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
- cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL;
- cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL;
-}
-
-/*H:650
- * An optimization of copy_gdt(), for just the three "thead-local storage"
- * entries.
- */
-void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
-{
- unsigned int i;
-
- for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
- gdt[i] = cpu->arch.gdt[i];
-}
-
-/*H:640
- * When the Guest is run on a different CPU, or the GDT entries have changed,
- * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's
- * GDT.
- */
-void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
-{
- unsigned int i;
-
- /*
- * The default entries from setup_default_gdt_entries() are not
- * replaced. See ignored_gdt() above.
- */
- for (i = 0; i < GDT_ENTRIES; i++)
- if (!ignored_gdt(i))
- gdt[i] = cpu->arch.gdt[i];
-}
-
-/*H:620
- * This is where the Guest asks us to load a new GDT entry
- * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in.
- */
-void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
-{
- /*
- * We assume the Guest has the same number of GDT entries as the
- * Host, otherwise we'd have to dynamically allocate the Guest GDT.
- */
- if (num >= ARRAY_SIZE(cpu->arch.gdt)) {
- kill_guest(cpu, "too many gdt entries %i", num);
- return;
- }
-
- /* Set it up, then fix it. */
- cpu->arch.gdt[num].a = lo;
- cpu->arch.gdt[num].b = hi;
- fixup_gdt_table(cpu, num, num+1);
- /*
- * Mark that the GDT changed so the core knows it has to copy it again,
- * even if the Guest is run on the same CPU.
- */
- cpu->changed |= CHANGED_GDT;
-}
-
-/*
- * This is the fast-track version for just changing the three TLS entries.
- * Remember that this happens on every context switch, so it's worth
- * optimizing. But wouldn't it be neater to have a single hypercall to cover
- * both cases?
- */
-void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
-{
- struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
-
- __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
- fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
- /* Note that just the TLS entries have changed. */
- cpu->changed |= CHANGED_GDT_TLS;
-}
-
-/*H:660
- * With this, we have finished the Host.
- *
- * Five of the seven parts of our task are complete. You have made it through
- * the Bit of Despair (I think that's somewhere in the page table code,
- * myself).
- *
- * Next, we examine "make Switcher". It's short, but intense.
- */
+++ /dev/null
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-/*P:450
- * This file contains the x86-specific lguest code. It used to be all
- * mixed in with drivers/lguest/core.c but several foolhardy code slashers
- * wrestled most of the dependencies out to here in preparation for porting
- * lguest to other architectures (see what I mean by foolhardy?).
- *
- * This also contains a couple of non-obvious setup and teardown pieces which
- * were implemented after days of debugging pain.
-:*/
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/cpu.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/lguest.h>
-#include <linux/uaccess.h>
-#include <asm/fpu/internal.h>
-#include <asm/tlbflush.h>
-#include "../lg.h"
-
-static int cpu_had_pge;
-
-static struct {
- unsigned long offset;
- unsigned short segment;
-} lguest_entry;
-
-/* Offset from where switcher.S was compiled to where we've copied it */
-static unsigned long switcher_offset(void)
-{
- return switcher_addr - (unsigned long)start_switcher_text;
-}
-
-/* This cpu's struct lguest_pages (after the Switcher text page) */
-static struct lguest_pages *lguest_pages(unsigned int cpu)
-{
- return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
-}
-
-static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
-
-/*S:010
- * We approach the Switcher.
- *
- * Remember that each CPU has two pages which are visible to the Guest when it
- * runs on that CPU. This has to contain the state for that Guest: we copy the
- * state in just before we run the Guest.
- *
- * Each Guest has "changed" flags which indicate what has changed in the Guest
- * since it last ran. We saw this set in interrupts_and_traps.c and
- * segments.c.
- */
-static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
- /*
- * Copying all this data can be quite expensive. We usually run the
- * same Guest we ran last time (and that Guest hasn't run anywhere else
- * meanwhile). If that's not the case, we pretend everything in the
- * Guest has changed.
- */
- if (__this_cpu_read(lg_last_cpu) != cpu || cpu->last_pages != pages) {
- __this_cpu_write(lg_last_cpu, cpu);
- cpu->last_pages = pages;
- cpu->changed = CHANGED_ALL;
- }
-
- /*
- * These copies are pretty cheap, so we do them unconditionally: */
- /* Save the current Host top-level page directory.
- */
- pages->state.host_cr3 = __pa(current->mm->pgd);
- /*
- * Set up the Guest's page tables to see this CPU's pages (and no
- * other CPU's pages).
- */
- map_switcher_in_guest(cpu, pages);
- /*
- * Set up the two "TSS" members which tell the CPU what stack to use
- * for traps which do directly into the Guest (ie. traps at privilege
- * level 1).
- */
- pages->state.guest_tss.sp1 = cpu->esp1;
- pages->state.guest_tss.ss1 = cpu->ss1;
-
- /* Copy direct-to-Guest trap entries. */
- if (cpu->changed & CHANGED_IDT)
- copy_traps(cpu, pages->state.guest_idt, default_idt_entries);
-
- /* Copy all GDT entries which the Guest can change. */
- if (cpu->changed & CHANGED_GDT)
- copy_gdt(cpu, pages->state.guest_gdt);
- /* If only the TLS entries have changed, copy them. */
- else if (cpu->changed & CHANGED_GDT_TLS)
- copy_gdt_tls(cpu, pages->state.guest_gdt);
-
- /* Mark the Guest as unchanged for next time. */
- cpu->changed = 0;
-}
-
-/* Finally: the code to actually call into the Switcher to run the Guest. */
-static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
- /* This is a dummy value we need for GCC's sake. */
- unsigned int clobber;
-
- /*
- * Copy the guest-specific information into this CPU's "struct
- * lguest_pages".
- */
- copy_in_guest_info(cpu, pages);
-
- /*
- * Set the trap number to 256 (impossible value). If we fault while
- * switching to the Guest (bad segment registers or bug), this will
- * cause us to abort the Guest.
- */
- cpu->regs->trapnum = 256;
-
- /*
- * Now: we push the "eflags" register on the stack, then do an "lcall".
- * This is how we change from using the kernel code segment to using
- * the dedicated lguest code segment, as well as jumping into the
- * Switcher.
- *
- * The lcall also pushes the old code segment (KERNEL_CS) onto the
- * stack, then the address of this call. This stack layout happens to
- * exactly match the stack layout created by an interrupt...
- */
- asm volatile("pushf; lcall *%4"
- /*
- * This is how we tell GCC that %eax ("a") and %ebx ("b")
- * are changed by this routine. The "=" means output.
- */
- : "=a"(clobber), "=b"(clobber)
- /*
- * %eax contains the pages pointer. ("0" refers to the
- * 0-th argument above, ie "a"). %ebx contains the
- * physical address of the Guest's top-level page
- * directory.
- */
- : "0"(pages),
- "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)),
- "m"(lguest_entry)
- /*
- * We tell gcc that all these registers could change,
- * which means we don't have to save and restore them in
- * the Switcher.
- */
- : "memory", "%edx", "%ecx", "%edi", "%esi");
-}
-/*:*/
-
-unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any)
-{
- switch (reg_off) {
- case offsetof(struct pt_regs, bx):
- return &cpu->regs->ebx;
- case offsetof(struct pt_regs, cx):
- return &cpu->regs->ecx;
- case offsetof(struct pt_regs, dx):
- return &cpu->regs->edx;
- case offsetof(struct pt_regs, si):
- return &cpu->regs->esi;
- case offsetof(struct pt_regs, di):
- return &cpu->regs->edi;
- case offsetof(struct pt_regs, bp):
- return &cpu->regs->ebp;
- case offsetof(struct pt_regs, ax):
- return &cpu->regs->eax;
- case offsetof(struct pt_regs, ip):
- return &cpu->regs->eip;
- case offsetof(struct pt_regs, sp):
- return &cpu->regs->esp;
- }
-
- /* Launcher can read these, but we don't allow any setting. */
- if (any) {
- switch (reg_off) {
- case offsetof(struct pt_regs, ds):
- return &cpu->regs->ds;
- case offsetof(struct pt_regs, es):
- return &cpu->regs->es;
- case offsetof(struct pt_regs, fs):
- return &cpu->regs->fs;
- case offsetof(struct pt_regs, gs):
- return &cpu->regs->gs;
- case offsetof(struct pt_regs, cs):
- return &cpu->regs->cs;
- case offsetof(struct pt_regs, flags):
- return &cpu->regs->eflags;
- case offsetof(struct pt_regs, ss):
- return &cpu->regs->ss;
- }
- }
-
- return NULL;
-}
-
-/*M:002
- * There are hooks in the scheduler which we can register to tell when we
- * get kicked off the CPU (preempt_notifier_register()). This would allow us
- * to lazily disable SYSENTER which would regain some performance, and should
- * also simplify copy_in_guest_info(). Note that we'd still need to restore
- * things when we exit to Launcher userspace, but that's fairly easy.
- *
- * We could also try using these hooks for PGE, but that might be too expensive.
- *
- * The hooks were designed for KVM, but we can also put them to good use.
-:*/
-
-/*H:040
- * This is the i386-specific code to setup and run the Guest. Interrupts
- * are disabled: we own the CPU.
- */
-void lguest_arch_run_guest(struct lg_cpu *cpu)
-{
- /*
- * SYSENTER is an optimized way of doing system calls. We can't allow
- * it because it always jumps to privilege level 0. A normal Guest
- * won't try it because we don't advertise it in CPUID, but a malicious
- * Guest (or malicious Guest userspace program) could, so we tell the
- * CPU to disable it before running the Guest.
- */
- if (boot_cpu_has(X86_FEATURE_SEP))
- wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
-
- /*
- * Now we actually run the Guest. It will return when something
- * interesting happens, and we can examine its registers to see what it
- * was doing.
- */
- run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
-
- /*
- * Note that the "regs" structure contains two extra entries which are
- * not really registers: a trap number which says what interrupt or
- * trap made the switcher code come back, and an error code which some
- * traps set.
- */
-
- /* Restore SYSENTER if it's supposed to be on. */
- if (boot_cpu_has(X86_FEATURE_SEP))
- wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
-
- /*
- * If the Guest page faulted, then the cr2 register will tell us the
- * bad virtual address. We have to grab this now, because once we
- * re-enable interrupts an interrupt could fault and thus overwrite
- * cr2, or we could even move off to a different CPU.
- */
- if (cpu->regs->trapnum == 14)
- cpu->arch.last_pagefault = read_cr2();
- /*
- * Similarly, if we took a trap because the Guest used the FPU,
- * we have to restore the FPU it expects to see.
- * fpu__restore() may sleep and we may even move off to
- * a different CPU. So all the critical stuff should be done
- * before this.
- */
- else if (cpu->regs->trapnum == 7 && !fpregs_active())
- fpu__restore(¤t->thread.fpu);
-}
-
-/*H:130
- * Now we've examined the hypercall code; our Guest can make requests.
- * Our Guest is usually so well behaved; it never tries to do things it isn't
- * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual
- * infrastructure isn't quite complete, because it doesn't contain replacements
- * for the Intel I/O instructions. As a result, the Guest sometimes fumbles
- * across one during the boot process as it probes for various things which are
- * usually attached to a PC.
- *
- * When the Guest uses one of these instructions, we get a trap (General
- * Protection Fault) and come here. We queue this to be sent out to the
- * Launcher to handle.
- */
-
-/*
- * The eip contains the *virtual* address of the Guest's instruction:
- * we copy the instruction here so the Launcher doesn't have to walk
- * the page tables to decode it. We handle the case (eg. in a kernel
- * module) where the instruction is over two pages, and the pages are
- * virtually but not physically contiguous.
- *
- * The longest possible x86 instruction is 15 bytes, but we don't handle
- * anything that strange.
- */
-static void copy_from_guest(struct lg_cpu *cpu,
- void *dst, unsigned long vaddr, size_t len)
-{
- size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
- unsigned long paddr;
-
- BUG_ON(len > PAGE_SIZE);
-
- /* If it goes over a page, copy in two parts. */
- if (len > to_page_end) {
- /* But make sure the next page is mapped! */
- if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
- copy_from_guest(cpu, dst + to_page_end,
- vaddr + to_page_end,
- len - to_page_end);
- else
- /* Otherwise fill with zeroes. */
- memset(dst + to_page_end, 0, len - to_page_end);
- len = to_page_end;
- }
-
- /* This will kill the guest if it isn't mapped, but that
- * shouldn't happen. */
- __lgread(cpu, dst, guest_pa(cpu, vaddr), len);
-}
-
-
-static void setup_emulate_insn(struct lg_cpu *cpu)
-{
- cpu->pending.trap = 13;
- copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
- sizeof(cpu->pending.insn));
-}
-
-static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr)
-{
- cpu->pending.trap = 14;
- cpu->pending.addr = iomem_addr;
- copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
- sizeof(cpu->pending.insn));
-}
-
-/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
-void lguest_arch_handle_trap(struct lg_cpu *cpu)
-{
- unsigned long iomem_addr;
-
- switch (cpu->regs->trapnum) {
- case 13: /* We've intercepted a General Protection Fault. */
- /* Hand to Launcher to emulate those pesky IN and OUT insns */
- if (cpu->regs->errcode == 0) {
- setup_emulate_insn(cpu);
- return;
- }
- break;
- case 14: /* We've intercepted a Page Fault. */
- /*
- * The Guest accessed a virtual address that wasn't mapped.
- * This happens a lot: we don't actually set up most of the page
- * tables for the Guest at all when we start: as it runs it asks
- * for more and more, and we set them up as required. In this
- * case, we don't even tell the Guest that the fault happened.
- *
- * The errcode tells whether this was a read or a write, and
- * whether kernel or userspace code.
- */
- if (demand_page(cpu, cpu->arch.last_pagefault,
- cpu->regs->errcode, &iomem_addr))
- return;
-
- /* Was this an access to memory mapped IO? */
- if (iomem_addr) {
- /* Tell Launcher, let it handle it. */
- setup_iomem_insn(cpu, iomem_addr);
- return;
- }
-
- /*
- * OK, it's really not there (or not OK): the Guest needs to
- * know. We write out the cr2 value so it knows where the
- * fault occurred.
- *
- * Note that if the Guest were really messed up, this could
- * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
- * lg->lguest_data could be NULL
- */
- if (cpu->lg->lguest_data &&
- put_user(cpu->arch.last_pagefault,
- &cpu->lg->lguest_data->cr2))
- kill_guest(cpu, "Writing cr2");
- break;
- case 7: /* We've intercepted a Device Not Available fault. */
- /* No special handling is needed here. */
- break;
- case 32 ... 255:
- /* This might be a syscall. */
- if (could_be_syscall(cpu->regs->trapnum))
- break;
-
- /*
- * Other values mean a real interrupt occurred, in which case
- * the Host handler has already been run. We just do a
- * friendly check if another process should now be run, then
- * return to run the Guest again.
- */
- cond_resched();
- return;
- case LGUEST_TRAP_ENTRY:
- /*
- * Our 'struct hcall_args' maps directly over our regs: we set
- * up the pointer now to indicate a hypercall is pending.
- */
- cpu->hcall = (struct hcall_args *)cpu->regs;
- return;
- }
-
- /* We didn't handle the trap, so it needs to go to the Guest. */
- if (!deliver_trap(cpu, cpu->regs->trapnum))
- /*
- * If the Guest doesn't have a handler (either it hasn't
- * registered any yet, or it's one of the faults we don't let
- * it handle), it dies with this cryptic error message.
- */
- kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
- cpu->regs->trapnum, cpu->regs->eip,
- cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
- : cpu->regs->errcode);
-}
-
-/*
- * Now we can look at each of the routines this calls, in increasing order of
- * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
- * deliver_trap() and demand_page(). After all those, we'll be ready to
- * examine the Switcher, and our philosophical understanding of the Host/Guest
- * duality will be complete.
-:*/
-static void adjust_pge(void *on)
-{
- if (on)
- cr4_set_bits(X86_CR4_PGE);
- else
- cr4_clear_bits(X86_CR4_PGE);
-}
-
-/*H:020
- * Now the Switcher is mapped and every thing else is ready, we need to do
- * some more i386-specific initialization.
- */
-void __init lguest_arch_host_init(void)
-{
- int i;
-
- /*
- * Most of the x86/switcher_32.S doesn't care that it's been moved; on
- * Intel, jumps are relative, and it doesn't access any references to
- * external code or data.
- *
- * The only exception is the interrupt handlers in switcher.S: their
- * addresses are placed in a table (default_idt_entries), so we need to
- * update the table with the new addresses. switcher_offset() is a
- * convenience function which returns the distance between the
- * compiled-in switcher code and the high-mapped copy we just made.
- */
- for (i = 0; i < IDT_ENTRIES; i++)
- default_idt_entries[i] += switcher_offset();
-
- /*
- * Set up the Switcher's per-cpu areas.
- *
- * Each CPU gets two pages of its own within the high-mapped region
- * (aka. "struct lguest_pages"). Much of this can be initialized now,
- * but some depends on what Guest we are running (which is set up in
- * copy_in_guest_info()).
- */
- for_each_possible_cpu(i) {
- /* lguest_pages() returns this CPU's two pages. */
- struct lguest_pages *pages = lguest_pages(i);
- /* This is a convenience pointer to make the code neater. */
- struct lguest_ro_state *state = &pages->state;
-
- /*
- * The Global Descriptor Table: the Host has a different one
- * for each CPU. We keep a descriptor for the GDT which says
- * where it is and how big it is (the size is actually the last
- * byte, not the size, hence the "-1").
- */
- state->host_gdt_desc.size = GDT_SIZE-1;
- state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i);
-
- /*
- * All CPUs on the Host use the same Interrupt Descriptor
- * Table, so we just use store_idt(), which gets this CPU's IDT
- * descriptor.
- */
- store_idt(&state->host_idt_desc);
-
- /*
- * The descriptors for the Guest's GDT and IDT can be filled
- * out now, too. We copy the GDT & IDT into ->guest_gdt and
- * ->guest_idt before actually running the Guest.
- */
- state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
- state->guest_idt_desc.address = (long)&state->guest_idt;
- state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
- state->guest_gdt_desc.address = (long)&state->guest_gdt;
-
- /*
- * We know where we want the stack to be when the Guest enters
- * the Switcher: in pages->regs. The stack grows upwards, so
- * we start it at the end of that structure.
- */
- state->guest_tss.sp0 = (long)(&pages->regs + 1);
- /*
- * And this is the GDT entry to use for the stack: we keep a
- * couple of special LGUEST entries.
- */
- state->guest_tss.ss0 = LGUEST_DS;
-
- /*
- * x86 can have a finegrained bitmap which indicates what I/O
- * ports the process can use. We set it to the end of our
- * structure, meaning "none".
- */
- state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
-
- /*
- * Some GDT entries are the same across all Guests, so we can
- * set them up now.
- */
- setup_default_gdt_entries(state);
- /* Most IDT entries are the same for all Guests, too.*/
- setup_default_idt_entries(state, default_idt_entries);
-
- /*
- * The Host needs to be able to use the LGUEST segments on this
- * CPU, too, so put them in the Host GDT.
- */
- get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
- get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
- }
-
- /*
- * In the Switcher, we want the %cs segment register to use the
- * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
- * it will be undisturbed when we switch. To change %cs and jump we
- * need this structure to feed to Intel's "lcall" instruction.
- */
- lguest_entry.offset = (long)switch_to_guest + switcher_offset();
- lguest_entry.segment = LGUEST_CS;
-
- /*
- * Finally, we need to turn off "Page Global Enable". PGE is an
- * optimization where page table entries are specially marked to show
- * they never change. The Host kernel marks all the kernel pages this
- * way because it's always present, even when userspace is running.
- *
- * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
- * switch to the Guest kernel. If you don't disable this on all CPUs,
- * you'll get really weird bugs that you'll chase for two days.
- *
- * I used to turn PGE off every time we switched to the Guest and back
- * on when we return, but that slowed the Switcher down noticibly.
- */
-
- /*
- * We don't need the complexity of CPUs coming and going while we're
- * doing this.
- */
- get_online_cpus();
- if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */
- /* Remember that this was originally set (for cleanup). */
- cpu_had_pge = 1;
- /*
- * adjust_pge is a helper function which sets or unsets the PGE
- * bit on its CPU, depending on the argument (0 == unset).
- */
- on_each_cpu(adjust_pge, (void *)0, 1);
- /* Turn off the feature in the global feature set. */
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
- }
- put_online_cpus();
-}
-/*:*/
-
-void __exit lguest_arch_host_fini(void)
-{
- /* If we had PGE before we started, turn it back on now. */
- get_online_cpus();
- if (cpu_had_pge) {
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
- /* adjust_pge's argument "1" means set PGE. */
- on_each_cpu(adjust_pge, (void *)1, 1);
- }
- put_online_cpus();
-}
-
-
-/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
-int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
-{
- switch (args->arg0) {
- case LHCALL_LOAD_GDT_ENTRY:
- load_guest_gdt_entry(cpu, args->arg1, args->arg2, args->arg3);
- break;
- case LHCALL_LOAD_IDT_ENTRY:
- load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
- break;
- case LHCALL_LOAD_TLS:
- guest_load_tls(cpu, args->arg1);
- break;
- default:
- /* Bad Guest. Bad! */
- return -EIO;
- }
- return 0;
-}
-
-/*H:126 i386-specific hypercall initialization: */
-int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
-{
- u32 tsc_speed;
-
- /*
- * The pointer to the Guest's "struct lguest_data" is the only argument.
- * We check that address now.
- */
- if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
- sizeof(*cpu->lg->lguest_data)))
- return -EFAULT;
-
- /*
- * Having checked it, we simply set lg->lguest_data to point straight
- * into the Launcher's memory at the right place and then use
- * copy_to_user/from_user from now on, instead of lgread/write. I put
- * this in to show that I'm not immune to writing stupid
- * optimizations.
- */
- cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
-
- /*
- * We insist that the Time Stamp Counter exist and doesn't change with
- * cpu frequency. Some devious chip manufacturers decided that TSC
- * changes could be handled in software. I decided that time going
- * backwards might be good for benchmarks, but it's bad for users.
- *
- * We also insist that the TSC be stable: the kernel detects unreliable
- * TSCs for its own purposes, and we use that here.
- */
- if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
- tsc_speed = tsc_khz;
- else
- tsc_speed = 0;
- if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz))
- return -EFAULT;
-
- /* The interrupt code might not like the system call vector. */
- if (!check_syscall_vector(cpu->lg))
- kill_guest(cpu, "bad syscall vector");
-
- return 0;
-}
-/*:*/
-
-/*L:030
- * Most of the Guest's registers are left alone: we used get_zeroed_page() to
- * allocate the structure, so they will be 0.
- */
-void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
-{
- struct lguest_regs *regs = cpu->regs;
-
- /*
- * There are four "segment" registers which the Guest needs to boot:
- * The "code segment" register (cs) refers to the kernel code segment
- * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
- * refer to the kernel data segment __KERNEL_DS.
- *
- * The privilege level is packed into the lower bits. The Guest runs
- * at privilege level 1 (GUEST_PL).
- */
- regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
- regs->cs = __KERNEL_CS|GUEST_PL;
-
- /*
- * The "eflags" register contains miscellaneous flags. Bit 1 (0x002)
- * is supposed to always be "1". Bit 9 (0x200) controls whether
- * interrupts are enabled. We always leave interrupts enabled while
- * running the Guest.
- */
- regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
-
- /*
- * The "Extended Instruction Pointer" register says where the Guest is
- * running.
- */
- regs->eip = start;
-
- /*
- * %esi points to our boot information, at physical address 0, so don't
- * touch it.
- */
-
- /* There are a couple of GDT entries the Guest expects at boot. */
- setup_guest_gdt(cpu);
-}
+++ /dev/null
-/*P:900
- * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride
- * both the Host and Guest to do the low-level Guest<->Host switch. It is as
- * simple as it can be made, but it's naturally very specific to x86.
- *
- * You have now completed Preparation. If this has whet your appetite; if you
- * are feeling invigorated and refreshed then the next, more challenging stage
- * can be found in "make Guest".
- :*/
-
-/*M:012
- * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
- * gain at least 1% more performance. Since neither LOC nor performance can be
- * measured beforehand, it generally means implementing a feature then deciding
- * if it's worth it. And once it's implemented, who can say no?
- *
- * This is why I haven't implemented this idea myself. I want to, but I
- * haven't. You could, though.
- *
- * The main place where lguest performance sucks is Guest page faulting. When
- * a Guest userspace process hits an unmapped page we switch back to the Host,
- * walk the page tables, find it's not mapped, switch back to the Guest page
- * fault handler, which calls a hypercall to set the page table entry, then
- * finally returns to userspace. That's two round-trips.
- *
- * If we had a small walker in the Switcher, we could quickly check the Guest
- * page table and if the page isn't mapped, immediately reflect the fault back
- * into the Guest. This means the Switcher would have to know the top of the
- * Guest page table and the page fault handler address.
- *
- * For simplicity, the Guest should only handle the case where the privilege
- * level of the fault is 3 and probably only not present or write faults. It
- * should also detect recursive faults, and hand the original fault to the
- * Host (which is actually really easy).
- *
- * Two questions remain. Would the performance gain outweigh the complexity?
- * And who would write the verse documenting it?
-:*/
-
-/*M:011
- * Lguest64 handles NMI. This gave me NMI envy (until I looked at their
- * code). It's worth doing though, since it would let us use oprofile in the
- * Host when a Guest is running.
-:*/
-
-/*S:100
- * Welcome to the Switcher itself!
- *
- * This file contains the low-level code which changes the CPU to run the Guest
- * code, and returns to the Host when something happens. Understand this, and
- * you understand the heart of our journey.
- *
- * Because this is in assembler rather than C, our tale switches from prose to
- * verse. First I tried limericks:
- *
- * There once was an eax reg,
- * To which our pointer was fed,
- * It needed an add,
- * Which asm-offsets.h had
- * But this limerick is hurting my head.
- *
- * Next I tried haikus, but fitting the required reference to the seasons in
- * every stanza was quickly becoming tiresome:
- *
- * The %eax reg
- * Holds "struct lguest_pages" now:
- * Cherry blossoms fall.
- *
- * Then I started with Heroic Verse, but the rhyming requirement leeched away
- * the content density and led to some uniquely awful oblique rhymes:
- *
- * These constants are coming from struct offsets
- * For use within the asm switcher text.
- *
- * Finally, I settled for something between heroic hexameter, and normal prose
- * with inappropriate linebreaks. Anyway, it aint no Shakespeare.
- */
-
-// Not all kernel headers work from assembler
-// But these ones are needed: the ENTRY() define
-// And constants extracted from struct offsets
-// To avoid magic numbers and breakage:
-// Should they change the compiler can't save us
-// Down here in the depths of assembler code.
-#include <linux/linkage.h>
-#include <asm/asm-offsets.h>
-#include <asm/page.h>
-#include <asm/segment.h>
-#include <asm/lguest.h>
-
-// We mark the start of the code to copy
-// It's placed in .text tho it's never run here
-// You'll see the trick macro at the end
-// Which interleaves data and text to effect.
-.text
-ENTRY(start_switcher_text)
-
-// When we reach switch_to_guest we have just left
-// The safe and comforting shores of C code
-// %eax has the "struct lguest_pages" to use
-// Where we save state and still see it from the Guest
-// And %ebx holds the Guest shadow pagetable:
-// Once set we have truly left Host behind.
-ENTRY(switch_to_guest)
- // We told gcc all its regs could fade,
- // Clobbered by our journey into the Guest
- // We could have saved them, if we tried
- // But time is our master and cycles count.
-
- // Segment registers must be saved for the Host
- // We push them on the Host stack for later
- pushl %es
- pushl %ds
- pushl %gs
- pushl %fs
- // But the compiler is fickle, and heeds
- // No warning of %ebp clobbers
- // When frame pointers are used. That register
- // Must be saved and restored or chaos strikes.
- pushl %ebp
- // The Host's stack is done, now save it away
- // In our "struct lguest_pages" at offset
- // Distilled into asm-offsets.h
- movl %esp, LGUEST_PAGES_host_sp(%eax)
-
- // All saved and there's now five steps before us:
- // Stack, GDT, IDT, TSS
- // Then last of all the page tables are flipped.
-
- // Yet beware that our stack pointer must be
- // Always valid lest an NMI hits
- // %edx does the duty here as we juggle
- // %eax is lguest_pages: our stack lies within.
- movl %eax, %edx
- addl $LGUEST_PAGES_regs, %edx
- movl %edx, %esp
-
- // The Guest's GDT we so carefully
- // Placed in the "struct lguest_pages" before
- lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
-
- // The Guest's IDT we did partially
- // Copy to "struct lguest_pages" as well.
- lidt LGUEST_PAGES_guest_idt_desc(%eax)
-
- // The TSS entry which controls traps
- // Must be loaded up with "ltr" now:
- // The GDT entry that TSS uses
- // Changes type when we load it: damn Intel!
- // For after we switch over our page tables
- // That entry will be read-only: we'd crash.
- movl $(GDT_ENTRY_TSS*8), %edx
- ltr %dx
-
- // Look back now, before we take this last step!
- // The Host's TSS entry was also marked used;
- // Let's clear it again for our return.
- // The GDT descriptor of the Host
- // Points to the table after two "size" bytes
- movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
- // Clear "used" from type field (byte 5, bit 2)
- andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
-
- // Once our page table's switched, the Guest is live!
- // The Host fades as we run this final step.
- // Our "struct lguest_pages" is now read-only.
- movl %ebx, %cr3
-
- // The page table change did one tricky thing:
- // The Guest's register page has been mapped
- // Writable under our %esp (stack) --
- // We can simply pop off all Guest regs.
- popl %eax
- popl %ebx
- popl %ecx
- popl %edx
- popl %esi
- popl %edi
- popl %ebp
- popl %gs
- popl %fs
- popl %ds
- popl %es
-
- // Near the base of the stack lurk two strange fields
- // Which we fill as we exit the Guest
- // These are the trap number and its error
- // We can simply step past them on our way.
- addl $8, %esp
-
- // The last five stack slots hold return address
- // And everything needed to switch privilege
- // From Switcher's level 0 to Guest's 1,
- // And the stack where the Guest had last left it.
- // Interrupts are turned back on: we are Guest.
- iret
-
-// We tread two paths to switch back to the Host
-// Yet both must save Guest state and restore Host
-// So we put the routine in a macro.
-#define SWITCH_TO_HOST \
- /* We save the Guest state: all registers first \
- * Laid out just as "struct lguest_regs" defines */ \
- pushl %es; \
- pushl %ds; \
- pushl %fs; \
- pushl %gs; \
- pushl %ebp; \
- pushl %edi; \
- pushl %esi; \
- pushl %edx; \
- pushl %ecx; \
- pushl %ebx; \
- pushl %eax; \
- /* Our stack and our code are using segments \
- * Set in the TSS and IDT \
- * Yet if we were to touch data we'd use \
- * Whatever data segment the Guest had. \
- * Load the lguest ds segment for now. */ \
- movl $(LGUEST_DS), %eax; \
- movl %eax, %ds; \
- /* So where are we? Which CPU, which struct? \
- * The stack is our clue: our TSS starts \
- * It at the end of "struct lguest_pages". \
- * Or we may have stumbled while restoring \
- * Our Guest segment regs while in switch_to_guest, \
- * The fault pushed atop that part-unwound stack. \
- * If we round the stack down to the page start \
- * We're at the start of "struct lguest_pages". */ \
- movl %esp, %eax; \
- andl $(~(1 << PAGE_SHIFT - 1)), %eax; \
- /* Save our trap number: the switch will obscure it \
- * (In the Host the Guest regs are not mapped here) \
- * %ebx holds it safe for deliver_to_host */ \
- movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
- /* The Host GDT, IDT and stack! \
- * All these lie safely hidden from the Guest: \
- * We must return to the Host page tables \
- * (Hence that was saved in struct lguest_pages) */ \
- movl LGUEST_PAGES_host_cr3(%eax), %edx; \
- movl %edx, %cr3; \
- /* As before, when we looked back at the Host \
- * As we left and marked TSS unused \
- * So must we now for the Guest left behind. */ \
- andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
- /* Switch to Host's GDT, IDT. */ \
- lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
- lidt LGUEST_PAGES_host_idt_desc(%eax); \
- /* Restore the Host's stack where its saved regs lie */ \
- movl LGUEST_PAGES_host_sp(%eax), %esp; \
- /* Last the TSS: our Host is returned */ \
- movl $(GDT_ENTRY_TSS*8), %edx; \
- ltr %dx; \
- /* Restore now the regs saved right at the first. */ \
- popl %ebp; \
- popl %fs; \
- popl %gs; \
- popl %ds; \
- popl %es
-
-// The first path is trod when the Guest has trapped:
-// (Which trap it was has been pushed on the stack).
-// We need only switch back, and the Host will decode
-// Why we came home, and what needs to be done.
-return_to_host:
- SWITCH_TO_HOST
- iret
-
-// We are lead to the second path like so:
-// An interrupt, with some cause external
-// Has ajerked us rudely from the Guest's code
-// Again we must return home to the Host
-deliver_to_host:
- SWITCH_TO_HOST
- // But now we must go home via that place
- // Where that interrupt was supposed to go
- // Had we not been ensconced, running the Guest.
- // Here we see the trickness of run_guest_once():
- // The Host stack is formed like an interrupt
- // With EIP, CS and EFLAGS layered.
- // Interrupt handlers end with "iret"
- // And that will take us home at long long last.
-
- // But first we must find the handler to call!
- // The IDT descriptor for the Host
- // Has two bytes for size, and four for address:
- // %edx will hold it for us for now.
- movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
- // We now know the table address we need,
- // And saved the trap's number inside %ebx.
- // Yet the pointer to the handler is smeared
- // Across the bits of the table entry.
- // What oracle can tell us how to extract
- // From such a convoluted encoding?
- // I consulted gcc, and it gave
- // These instructions, which I gladly credit:
- leal (%edx,%ebx,8), %eax
- movzwl (%eax),%edx
- movl 4(%eax), %eax
- xorw %ax, %ax
- orl %eax, %edx
- // Now the address of the handler's in %edx
- // We call it now: its "iret" drops us home.
- jmp *%edx
-
-// Every interrupt can come to us here
-// But we must truly tell each apart.
-// They number two hundred and fifty six
-// And each must land in a different spot,
-// Push its number on stack, and join the stream.
-
-// And worse, a mere six of the traps stand apart
-// And push on their stack an addition:
-// An error number, thirty two bits long
-// So we punish the other two fifty
-// And make them push a zero so they match.
-
-// Yet two fifty six entries is long
-// And all will look most the same as the last
-// So we create a macro which can make
-// As many entries as we need to fill.
-
-// Note the change to .data then .text:
-// We plant the address of each entry
-// Into a (data) table for the Host
-// To know where each Guest interrupt should go.
-.macro IRQ_STUB N TARGET
- .data; .long 1f; .text; 1:
- // Trap eight, ten through fourteen and seventeen
- // Supply an error number. Else zero.
- .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
- pushl $0
- .endif
- pushl $\N
- jmp \TARGET
- ALIGN
-.endm
-
-// This macro creates numerous entries
-// Using GAS macros which out-power C's.
-.macro IRQ_STUBS FIRST LAST TARGET
- irq=\FIRST
- .rept \LAST-\FIRST+1
- IRQ_STUB irq \TARGET
- irq=irq+1
- .endr
-.endm
-
-// Here's the marker for our pointer table
-// Laid in the data section just before
-// Each macro places the address of code
-// Forming an array: each one points to text
-// Which handles interrupt in its turn.
-.data
-.global default_idt_entries
-default_idt_entries:
-.text
- // The first two traps go straight back to the Host
- IRQ_STUBS 0 1 return_to_host
- // We'll say nothing, yet, about NMI
- IRQ_STUB 2 handle_nmi
- // Other traps also return to the Host
- IRQ_STUBS 3 31 return_to_host
- // All interrupts go via their handlers
- IRQ_STUBS 32 127 deliver_to_host
- // 'Cept system calls coming from userspace
- // Are to go to the Guest, never the Host.
- IRQ_STUB 128 return_to_host
- IRQ_STUBS 129 255 deliver_to_host
-
-// The NMI, what a fabulous beast
-// Which swoops in and stops us no matter that
-// We're suspended between heaven and hell,
-// (Or more likely between the Host and Guest)
-// When in it comes! We are dazed and confused
-// So we do the simplest thing which one can.
-// Though we've pushed the trap number and zero
-// We discard them, return, and hope we live.
-handle_nmi:
- addl $8, %esp
- iret
-
-// We are done; all that's left is Mastery
-// And "make Mastery" is a journey long
-// Designed to make your fingers itch to code.
-
-// Here ends the text, the file and poem.
-ENTRY(end_switcher_text)
depends on VIRTIO
---help---
This is the virtual network driver for virtio. It can be used with
- lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
+ QEMU based VMMs (like KVM or Xen). Say Y or M.
config NLMON
tristate "Virtual netlink monitoring device"
bool
help
Generic "hypervisor virtual console" infrastructure for various
- hypervisors (pSeries, iSeries, Xen, lguest).
+ hypervisors (pSeries, iSeries, Xen).
It will automatically be selected if one of the back-end console drivers
is selected.
tristate
---help---
This option is selected by any driver which implements the virtio
- bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
- CONFIG_RPMSG or CONFIG_S390_GUEST.
+ bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG
+ or CONFIG_S390_GUEST.
menu "Virtio drivers"
+++ /dev/null
-/*
- * Things the lguest guest needs to know. Note: like all lguest interfaces,
- * this is subject to wild and random change between versions.
- */
-#ifndef _LINUX_LGUEST_H
-#define _LINUX_LGUEST_H
-
-#ifndef __ASSEMBLY__
-#include <linux/time.h>
-#include <asm/irq.h>
-#include <asm/lguest_hcall.h>
-
-#define LG_CLOCK_MIN_DELTA 100UL
-#define LG_CLOCK_MAX_DELTA ULONG_MAX
-
-/*G:031
- * The second method of communicating with the Host is to via "struct
- * lguest_data". Once the Guest's initialization hypercall tells the Host where
- * this is, the Guest and Host both publish information in it.
-:*/
-struct lguest_data {
- /*
- * 512 == enabled (same as eflags in normal hardware). The Guest
- * changes interrupts so often that a hypercall is too slow.
- */
- unsigned int irq_enabled;
- /* Fine-grained interrupt disabling by the Guest */
- DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS);
-
- /*
- * The Host writes the virtual address of the last page fault here,
- * which saves the Guest a hypercall. CR2 is the native register where
- * this address would normally be found.
- */
- unsigned long cr2;
-
- /* Wallclock time set by the Host. */
- struct timespec time;
-
- /*
- * Interrupt pending set by the Host. The Guest should do a hypercall
- * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF).
- */
- int irq_pending;
-
- /*
- * Async hypercall ring. Instead of directly making hypercalls, we can
- * place them in here for processing the next time the Host wants.
- * This batching can be quite efficient.
- */
-
- /* 0xFF == done (set by Host), 0 == pending (set by Guest). */
- u8 hcall_status[LHCALL_RING_SIZE];
- /* The actual registers for the hypercalls. */
- struct hcall_args hcalls[LHCALL_RING_SIZE];
-
-/* Fields initialized by the Host at boot: */
- /* Memory not to try to access */
- unsigned long reserve_mem;
- /* KHz for the TSC clock. */
- u32 tsc_khz;
-
-/* Fields initialized by the Guest at boot: */
- /* Instruction to suppress interrupts even if enabled */
- unsigned long noirq_iret;
- /* Address above which page tables are all identical. */
- unsigned long kernel_address;
- /* The vector to try to use for system calls (0x40 or 0x80). */
- unsigned int syscall_vec;
-};
-extern struct lguest_data lguest_data;
-#endif /* __ASSEMBLY__ */
-#endif /* _LINUX_LGUEST_H */
+++ /dev/null
-#ifndef _LINUX_LGUEST_LAUNCHER
-#define _LINUX_LGUEST_LAUNCHER
-/* Everything the "lguest" userspace program needs to know. */
-#include <linux/types.h>
-
-/*D:010
- * Drivers
- *
- * The Guest needs devices to do anything useful. Since we don't let it touch
- * real devices (think of the damage it could do!) we provide virtual devices.
- * We emulate a PCI bus with virtio devices on it; we used to have our own
- * lguest bus which was far simpler, but this tests the virtio 1.0 standard.
- *
- * Virtio devices are also used by kvm, so we can simply reuse their optimized
- * device drivers. And one day when everyone uses virtio, my plan will be
- * complete. Bwahahahah!
- */
-
-/* Write command first word is a request. */
-enum lguest_req
-{
- LHREQ_INITIALIZE, /* + base, pfnlimit, start */
- LHREQ_GETDMA, /* No longer used */
- LHREQ_IRQ, /* + irq */
- LHREQ_BREAK, /* No longer used */
- LHREQ_EVENTFD, /* No longer used. */
- LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */
- LHREQ_SETREG, /* + offset within struct pt_regs, value. */
- LHREQ_TRAP, /* + trap number to deliver to guest. */
-};
-
-/*
- * This is what read() of the lguest fd populates. trap ==
- * LGUEST_TRAP_ENTRY for an LHCALL_NOTIFY (addr is the
- * argument), 14 for a page fault in the MMIO region (addr is
- * the trap address, insn is the instruction), or 13 for a GPF
- * (insn is the instruction).
- */
-struct lguest_pending {
- __u8 trap;
- __u8 insn[7];
- __u32 addr;
-};
-#endif /* _LINUX_LGUEST_LAUNCHER */
#ifndef _UAPI_LINUX_VIRTIO_RING_H
#define _UAPI_LINUX_VIRTIO_RING_H
-/* An interface for efficient virtio implementation, currently for use by KVM
- * and lguest, but hopefully others soon. Do NOT change this since it will
+/* An interface for efficient virtio implementation, currently for use by KVM,
+ * but hopefully others soon. Do NOT change this since it will
* break existing servers and clients.
*
* This header is BSD licensed so anyone can use the definitions to implement
@echo ' iio - IIO tools'
@echo ' kvm_stat - top-like utility for displaying kvm statistics'
@echo ' leds - LEDs tools'
- @echo ' lguest - a minimal 32-bit x86 hypervisor'
@echo ' liblockdep - user-space wrapper for kernel locking-validator'
@echo ' net - misc networking tools'
@echo ' perf - Linux performance measurement and analysis tool'
kvm_stat: FORCE
$(call descend,kvm/$@)
-all: acpi cgroup cpupower gpio hv firewire lguest liblockdep \
+all: acpi cgroup cpupower gpio hv firewire liblockdep \
perf selftests turbostat usb \
virtio vm net x86_energy_perf_policy \
tmon freefall objtool kvm_stat
cpupower_install:
$(call descend,power/$(@:_install=),install)
-cgroup_install firewire_install gpio_install hv_install lguest_install perf_install usb_install virtio_install vm_install net_install objtool_install:
+cgroup_install firewire_install gpio_install hv_install perf_install usb_install virtio_install vm_install net_install objtool_install:
$(call descend,$(@:_install=),install)
liblockdep_install:
$(call descend,kvm/$(@:_install=),install)
install: acpi_install cgroup_install cpupower_install gpio_install \
- hv_install firewire_install lguest_install liblockdep_install \
+ hv_install firewire_install liblockdep_install \
perf_install selftests_install turbostat_install usb_install \
virtio_install vm_install net_install x86_energy_perf_policy_install \
tmon_install freefall_install objtool_install kvm_stat_install
cpupower_clean:
$(call descend,power/cpupower,clean)
-cgroup_clean hv_clean firewire_clean lguest_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean:
+cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean:
$(call descend,$(@:_clean=),clean)
liblockdep_clean:
build_clean:
$(call descend,build,clean)
-clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean lguest_clean \
+clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \
perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \
vm_clean net_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \
+++ /dev/null
-lguest
-include
+++ /dev/null
-# This creates the demonstration utility "lguest" which runs a Linux guest.
-CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE -Iinclude
-
-all: lguest
-
-include/linux/virtio_types.h: ../../include/uapi/linux/virtio_types.h
- mkdir -p include/linux 2>&1 || true
- ln -sf ../../../../include/uapi/linux/virtio_types.h $@
-
-lguest: include/linux/virtio_types.h
-
-clean:
- rm -f lguest
- rm -rf include
+++ /dev/null
-#! /bin/sh
-
-set -e
-
-PREFIX=$1
-shift
-
-trap 'rm -r $TMPDIR' 0
-TMPDIR=`mktemp -d`
-
-exec 3>/dev/null
-for f; do
- while IFS="
-" read -r LINE; do
- case "$LINE" in
- *$PREFIX:[0-9]*:\**)
- NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
- if [ -f $TMPDIR/$NUM ]; then
- echo "$TMPDIR/$NUM already exits prior to $f"
- exit 1
- fi
- exec 3>>$TMPDIR/$NUM
- echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
- /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3
- ;;
- *$PREFIX:[0-9]*)
- NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
- if [ -f $TMPDIR/$NUM ]; then
- echo "$TMPDIR/$NUM already exits prior to $f"
- exit 1
- fi
- exec 3>>$TMPDIR/$NUM
- echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
- /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3
- ;;
- *:\**)
- /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3
- echo >&3
- exec 3>/dev/null
- ;;
- *)
- /bin/echo "$LINE" >&3
- ;;
- esac
- done < $f
- echo >&3
- exec 3>/dev/null
-done
-
-LASTFILE=""
-for f in $TMPDIR/*; do
- if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then
- LASTFILE=$(cat $TMPDIR/.$(basename $f) )
- echo "[ $LASTFILE ]"
- fi
- cat $f
-done
-
+++ /dev/null
-/*P:100
- * This is the Launcher code, a simple program which lays out the "physical"
- * memory for the new Guest by mapping the kernel image and the virtual
- * devices, then opens /dev/lguest to tell the kernel about the Guest and
- * control it.
-:*/
-#define _LARGEFILE64_SOURCE
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <err.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <elf.h>
-#include <sys/mman.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/wait.h>
-#include <sys/eventfd.h>
-#include <fcntl.h>
-#include <stdbool.h>
-#include <errno.h>
-#include <ctype.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <time.h>
-#include <netinet/in.h>
-#include <net/if.h>
-#include <linux/sockios.h>
-#include <linux/if_tun.h>
-#include <sys/uio.h>
-#include <termios.h>
-#include <getopt.h>
-#include <assert.h>
-#include <sched.h>
-#include <limits.h>
-#include <stddef.h>
-#include <signal.h>
-#include <pwd.h>
-#include <grp.h>
-#include <sys/user.h>
-#include <linux/pci_regs.h>
-
-#ifndef VIRTIO_F_ANY_LAYOUT
-#define VIRTIO_F_ANY_LAYOUT 27
-#endif
-
-/*L:110
- * We can ignore the 43 include files we need for this program, but I do want
- * to draw attention to the use of kernel-style types.
- *
- * As Linus said, "C is a Spartan language, and so should your naming be." I
- * like these abbreviations, so we define them here. Note that u64 is always
- * unsigned long long, which works on all Linux systems: this means that we can
- * use %llu in printf for any u64.
- */
-typedef unsigned long long u64;
-typedef uint32_t u32;
-typedef uint16_t u16;
-typedef uint8_t u8;
-/*:*/
-
-#define VIRTIO_CONFIG_NO_LEGACY
-#define VIRTIO_PCI_NO_LEGACY
-#define VIRTIO_BLK_NO_LEGACY
-#define VIRTIO_NET_NO_LEGACY
-
-/* Use in-kernel ones, which defines VIRTIO_F_VERSION_1 */
-#include "../../include/uapi/linux/virtio_config.h"
-#include "../../include/uapi/linux/virtio_net.h"
-#include "../../include/uapi/linux/virtio_blk.h"
-#include "../../include/uapi/linux/virtio_console.h"
-#include "../../include/uapi/linux/virtio_rng.h"
-#include <linux/virtio_ring.h>
-#include "../../include/uapi/linux/virtio_pci.h"
-#include <asm/bootparam.h>
-#include "../../include/linux/lguest_launcher.h"
-
-#define BRIDGE_PFX "bridge:"
-#ifndef SIOCBRADDIF
-#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
-#endif
-/* We can have up to 256 pages for devices. */
-#define DEVICE_PAGES 256
-/* This will occupy 3 pages: it must be a power of 2. */
-#define VIRTQUEUE_NUM 256
-
-/*L:120
- * verbose is both a global flag and a macro. The C preprocessor allows
- * this, and although I wouldn't recommend it, it works quite nicely here.
- */
-static bool verbose;
-#define verbose(args...) \
- do { if (verbose) printf(args); } while(0)
-/*:*/
-
-/* The pointer to the start of guest memory. */
-static void *guest_base;
-/* The maximum guest physical address allowed, and maximum possible. */
-static unsigned long guest_limit, guest_max, guest_mmio;
-/* The /dev/lguest file descriptor. */
-static int lguest_fd;
-
-/* a per-cpu variable indicating whose vcpu is currently running */
-static unsigned int __thread cpu_id;
-
-/* 5 bit device number in the PCI_CONFIG_ADDR => 32 only */
-#define MAX_PCI_DEVICES 32
-
-/* This is our list of devices. */
-struct device_list {
- /* Counter to assign interrupt numbers. */
- unsigned int next_irq;
-
- /* Counter to print out convenient device numbers. */
- unsigned int device_num;
-
- /* PCI devices. */
- struct device *pci[MAX_PCI_DEVICES];
-};
-
-/* The list of Guest devices, based on command line arguments. */
-static struct device_list devices;
-
-/*
- * Just like struct virtio_pci_cfg_cap in uapi/linux/virtio_pci.h,
- * but uses a u32 explicitly for the data.
- */
-struct virtio_pci_cfg_cap_u32 {
- struct virtio_pci_cap cap;
- u32 pci_cfg_data; /* Data for BAR access. */
-};
-
-struct virtio_pci_mmio {
- struct virtio_pci_common_cfg cfg;
- u16 notify;
- u8 isr;
- u8 padding;
- /* Device-specific configuration follows this. */
-};
-
-/* This is the layout (little-endian) of the PCI config space. */
-struct pci_config {
- u16 vendor_id, device_id;
- u16 command, status;
- u8 revid, prog_if, subclass, class;
- u8 cacheline_size, lat_timer, header_type, bist;
- u32 bar[6];
- u32 cardbus_cis_ptr;
- u16 subsystem_vendor_id, subsystem_device_id;
- u32 expansion_rom_addr;
- u8 capabilities, reserved1[3];
- u32 reserved2;
- u8 irq_line, irq_pin, min_grant, max_latency;
-
- /* Now, this is the linked capability list. */
- struct virtio_pci_cap common;
- struct virtio_pci_notify_cap notify;
- struct virtio_pci_cap isr;
- struct virtio_pci_cap device;
- struct virtio_pci_cfg_cap_u32 cfg_access;
-};
-
-/* The device structure describes a single device. */
-struct device {
- /* The name of this device, for --verbose. */
- const char *name;
-
- /* Any queues attached to this device */
- struct virtqueue *vq;
-
- /* Is it operational */
- bool running;
-
- /* Has it written FEATURES_OK but not re-checked it? */
- bool wrote_features_ok;
-
- /* PCI configuration */
- union {
- struct pci_config config;
- u32 config_words[sizeof(struct pci_config) / sizeof(u32)];
- };
-
- /* Features we offer, and those accepted. */
- u64 features, features_accepted;
-
- /* Device-specific config hangs off the end of this. */
- struct virtio_pci_mmio *mmio;
-
- /* PCI MMIO resources (all in BAR0) */
- size_t mmio_size;
- u32 mmio_addr;
-
- /* Device-specific data. */
- void *priv;
-};
-
-/* The virtqueue structure describes a queue attached to a device. */
-struct virtqueue {
- struct virtqueue *next;
-
- /* Which device owns me. */
- struct device *dev;
-
- /* Name for printing errors. */
- const char *name;
-
- /* The actual ring of buffers. */
- struct vring vring;
-
- /* The information about this virtqueue (we only use queue_size on) */
- struct virtio_pci_common_cfg pci_config;
-
- /* Last available index we saw. */
- u16 last_avail_idx;
-
- /* How many are used since we sent last irq? */
- unsigned int pending_used;
-
- /* Eventfd where Guest notifications arrive. */
- int eventfd;
-
- /* Function for the thread which is servicing this virtqueue. */
- void (*service)(struct virtqueue *vq);
- pid_t thread;
-};
-
-/* Remember the arguments to the program so we can "reboot" */
-static char **main_args;
-
-/* The original tty settings to restore on exit. */
-static struct termios orig_term;
-
-/*
- * We have to be careful with barriers: our devices are all run in separate
- * threads and so we need to make sure that changes visible to the Guest happen
- * in precise order.
- */
-#define wmb() __asm__ __volatile__("" : : : "memory")
-#define rmb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
-#define mb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
-
-/* Wrapper for the last available index. Makes it easier to change. */
-#define lg_last_avail(vq) ((vq)->last_avail_idx)
-
-/*
- * The virtio configuration space is defined to be little-endian. x86 is
- * little-endian too, but it's nice to be explicit so we have these helpers.
- */
-#define cpu_to_le16(v16) (v16)
-#define cpu_to_le32(v32) (v32)
-#define cpu_to_le64(v64) (v64)
-#define le16_to_cpu(v16) (v16)
-#define le32_to_cpu(v32) (v32)
-#define le64_to_cpu(v64) (v64)
-
-/*
- * A real device would ignore weird/non-compliant driver behaviour. We
- * stop and flag it, to help debugging Linux problems.
- */
-#define bad_driver(d, fmt, ...) \
- errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__)
-#define bad_driver_vq(vq, fmt, ...) \
- errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \
- vq->name, ## __VA_ARGS__)
-
-/* Is this iovec empty? */
-static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
-{
- unsigned int i;
-
- for (i = 0; i < num_iov; i++)
- if (iov[i].iov_len)
- return false;
- return true;
-}
-
-/* Take len bytes from the front of this iovec. */
-static void iov_consume(struct device *d,
- struct iovec iov[], unsigned num_iov,
- void *dest, unsigned len)
-{
- unsigned int i;
-
- for (i = 0; i < num_iov; i++) {
- unsigned int used;
-
- used = iov[i].iov_len < len ? iov[i].iov_len : len;
- if (dest) {
- memcpy(dest, iov[i].iov_base, used);
- dest += used;
- }
- iov[i].iov_base += used;
- iov[i].iov_len -= used;
- len -= used;
- }
- if (len != 0)
- bad_driver(d, "iovec too short!");
-}
-
-/*L:100
- * The Launcher code itself takes us out into userspace, that scary place where
- * pointers run wild and free! Unfortunately, like most userspace programs,
- * it's quite boring (which is why everyone likes to hack on the kernel!).
- * Perhaps if you make up an Lguest Drinking Game at this point, it will get
- * you through this section. Or, maybe not.
- *
- * The Launcher sets up a big chunk of memory to be the Guest's "physical"
- * memory and stores it in "guest_base". In other words, Guest physical ==
- * Launcher virtual with an offset.
- *
- * This can be tough to get your head around, but usually it just means that we
- * use these trivial conversion functions when the Guest gives us its
- * "physical" addresses:
- */
-static void *from_guest_phys(unsigned long addr)
-{
- return guest_base + addr;
-}
-
-static unsigned long to_guest_phys(const void *addr)
-{
- return (addr - guest_base);
-}
-
-/*L:130
- * Loading the Kernel.
- *
- * We start with couple of simple helper routines. open_or_die() avoids
- * error-checking code cluttering the callers:
- */
-static int open_or_die(const char *name, int flags)
-{
- int fd = open(name, flags);
- if (fd < 0)
- err(1, "Failed to open %s", name);
- return fd;
-}
-
-/* map_zeroed_pages() takes a number of pages. */
-static void *map_zeroed_pages(unsigned int num)
-{
- int fd = open_or_die("/dev/zero", O_RDONLY);
- void *addr;
-
- /*
- * We use a private mapping (ie. if we write to the page, it will be
- * copied). We allocate an extra two pages PROT_NONE to act as guard
- * pages against read/write attempts that exceed allocated space.
- */
- addr = mmap(NULL, getpagesize() * (num+2),
- PROT_NONE, MAP_PRIVATE, fd, 0);
-
- if (addr == MAP_FAILED)
- err(1, "Mmapping %u pages of /dev/zero", num);
-
- if (mprotect(addr + getpagesize(), getpagesize() * num,
- PROT_READ|PROT_WRITE) == -1)
- err(1, "mprotect rw %u pages failed", num);
-
- /*
- * One neat mmap feature is that you can close the fd, and it
- * stays mapped.
- */
- close(fd);
-
- /* Return address after PROT_NONE page */
- return addr + getpagesize();
-}
-
-/* Get some bytes which won't be mapped into the guest. */
-static unsigned long get_mmio_region(size_t size)
-{
- unsigned long addr = guest_mmio;
- size_t i;
-
- if (!size)
- return addr;
-
- /* Size has to be a power of 2 (and multiple of 16) */
- for (i = 1; i < size; i <<= 1);
-
- guest_mmio += i;
-
- return addr;
-}
-
-/*
- * This routine is used to load the kernel or initrd. It tries mmap, but if
- * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
- * it falls back to reading the memory in.
- */
-static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
-{
- ssize_t r;
-
- /*
- * We map writable even though for some segments are marked read-only.
- * The kernel really wants to be writable: it patches its own
- * instructions.
- *
- * MAP_PRIVATE means that the page won't be copied until a write is
- * done to it. This allows us to share untouched memory between
- * Guests.
- */
- if (mmap(addr, len, PROT_READ|PROT_WRITE,
- MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
- return;
-
- /* pread does a seek and a read in one shot: saves a few lines. */
- r = pread(fd, addr, len, offset);
- if (r != len)
- err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
-}
-
-/*
- * This routine takes an open vmlinux image, which is in ELF, and maps it into
- * the Guest memory. ELF = Embedded Linking Format, which is the format used
- * by all modern binaries on Linux including the kernel.
- *
- * The ELF headers give *two* addresses: a physical address, and a virtual
- * address. We use the physical address; the Guest will map itself to the
- * virtual address.
- *
- * We return the starting address.
- */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
-{
- Elf32_Phdr phdr[ehdr->e_phnum];
- unsigned int i;
-
- /*
- * Sanity checks on the main ELF header: an x86 executable with a
- * reasonable number of correctly-sized program headers.
- */
- if (ehdr->e_type != ET_EXEC
- || ehdr->e_machine != EM_386
- || ehdr->e_phentsize != sizeof(Elf32_Phdr)
- || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
- errx(1, "Malformed elf header");
-
- /*
- * An ELF executable contains an ELF header and a number of "program"
- * headers which indicate which parts ("segments") of the program to
- * load where.
- */
-
- /* We read in all the program headers at once: */
- if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
- err(1, "Seeking to program headers");
- if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
- err(1, "Reading program headers");
-
- /*
- * Try all the headers: there are usually only three. A read-only one,
- * a read-write one, and a "note" section which we don't load.
- */
- for (i = 0; i < ehdr->e_phnum; i++) {
- /* If this isn't a loadable segment, we ignore it */
- if (phdr[i].p_type != PT_LOAD)
- continue;
-
- verbose("Section %i: size %i addr %p\n",
- i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
-
- /* We map this section of the file at its physical address. */
- map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
- phdr[i].p_offset, phdr[i].p_filesz);
- }
-
- /* The entry point is given in the ELF header. */
- return ehdr->e_entry;
-}
-
-/*L:150
- * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed
- * to jump into it and it will unpack itself. We used to have to perform some
- * hairy magic because the unpacking code scared me.
- *
- * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
- * a small patch to jump over the tricky bits in the Guest, so now we just read
- * the funky header so we know where in the file to load, and away we go!
- */
-static unsigned long load_bzimage(int fd)
-{
- struct boot_params boot;
- int r;
- /* Modern bzImages get loaded at 1M. */
- void *p = from_guest_phys(0x100000);
-
- /*
- * Go back to the start of the file and read the header. It should be
- * a Linux boot header (see Documentation/x86/boot.txt)
- */
- lseek(fd, 0, SEEK_SET);
- read(fd, &boot, sizeof(boot));
-
- /* Inside the setup_hdr, we expect the magic "HdrS" */
- if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
- errx(1, "This doesn't look like a bzImage to me");
-
- /* Skip over the extra sectors of the header. */
- lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
-
- /* Now read everything into memory. in nice big chunks. */
- while ((r = read(fd, p, 65536)) > 0)
- p += r;
-
- /* Finally, code32_start tells us where to enter the kernel. */
- return boot.hdr.code32_start;
-}
-
-/*L:140
- * Loading the kernel is easy when it's a "vmlinux", but most kernels
- * come wrapped up in the self-decompressing "bzImage" format. With a little
- * work, we can load those, too.
- */
-static unsigned long load_kernel(int fd)
-{
- Elf32_Ehdr hdr;
-
- /* Read in the first few bytes. */
- if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
- err(1, "Reading kernel");
-
- /* If it's an ELF file, it starts with "\177ELF" */
- if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
- return map_elf(fd, &hdr);
-
- /* Otherwise we assume it's a bzImage, and try to load it. */
- return load_bzimage(fd);
-}
-
-/*
- * This is a trivial little helper to align pages. Andi Kleen hated it because
- * it calls getpagesize() twice: "it's dumb code."
- *
- * Kernel guys get really het up about optimization, even when it's not
- * necessary. I leave this code as a reaction against that.
- */
-static inline unsigned long page_align(unsigned long addr)
-{
- /* Add upwards and truncate downwards. */
- return ((addr + getpagesize()-1) & ~(getpagesize()-1));
-}
-
-/*L:180
- * An "initial ram disk" is a disk image loaded into memory along with the
- * kernel which the kernel can use to boot from without needing any drivers.
- * Most distributions now use this as standard: the initrd contains the code to
- * load the appropriate driver modules for the current machine.
- *
- * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
- * kernels. He sent me this (and tells me when I break it).
- */
-static unsigned long load_initrd(const char *name, unsigned long mem)
-{
- int ifd;
- struct stat st;
- unsigned long len;
-
- ifd = open_or_die(name, O_RDONLY);
- /* fstat() is needed to get the file size. */
- if (fstat(ifd, &st) < 0)
- err(1, "fstat() on initrd '%s'", name);
-
- /*
- * We map the initrd at the top of memory, but mmap wants it to be
- * page-aligned, so we round the size up for that.
- */
- len = page_align(st.st_size);
- map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
- /*
- * Once a file is mapped, you can close the file descriptor. It's a
- * little odd, but quite useful.
- */
- close(ifd);
- verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
-
- /* We return the initrd size. */
- return len;
-}
-/*:*/
-
-/*
- * Simple routine to roll all the commandline arguments together with spaces
- * between them.
- */
-static void concat(char *dst, char *args[])
-{
- unsigned int i, len = 0;
-
- for (i = 0; args[i]; i++) {
- if (i) {
- strcat(dst+len, " ");
- len++;
- }
- strcpy(dst+len, args[i]);
- len += strlen(args[i]);
- }
- /* In case it's empty. */
- dst[len] = '\0';
-}
-
-/*L:185
- * This is where we actually tell the kernel to initialize the Guest. We
- * saw the arguments it expects when we looked at initialize() in lguest_user.c:
- * the base of Guest "physical" memory, the top physical page to allow and the
- * entry point for the Guest.
- */
-static void tell_kernel(unsigned long start)
-{
- unsigned long args[] = { LHREQ_INITIALIZE,
- (unsigned long)guest_base,
- guest_limit / getpagesize(), start,
- (guest_mmio+getpagesize()-1) / getpagesize() };
- verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n",
- guest_base, guest_base + guest_limit,
- guest_limit, guest_mmio);
- lguest_fd = open_or_die("/dev/lguest", O_RDWR);
- if (write(lguest_fd, args, sizeof(args)) < 0)
- err(1, "Writing to /dev/lguest");
-}
-/*:*/
-
-/*L:200
- * Device Handling.
- *
- * When the Guest gives us a buffer, it sends an array of addresses and sizes.
- * We need to make sure it's not trying to reach into the Launcher itself, so
- * we have a convenient routine which checks it and exits with an error message
- * if something funny is going on:
- */
-static void *_check_pointer(struct device *d,
- unsigned long addr, unsigned int size,
- unsigned int line)
-{
- /*
- * Check if the requested address and size exceeds the allocated memory,
- * or addr + size wraps around.
- */
- if ((addr + size) > guest_limit || (addr + size) < addr)
- bad_driver(d, "%s:%i: Invalid address %#lx",
- __FILE__, line, addr);
- /*
- * We return a pointer for the caller's convenience, now we know it's
- * safe to use.
- */
- return from_guest_phys(addr);
-}
-/* A macro which transparently hands the line number to the real function. */
-#define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__)
-
-/*
- * Each buffer in the virtqueues is actually a chain of descriptors. This
- * function returns the next descriptor in the chain, or vq->vring.num if we're
- * at the end.
- */
-static unsigned next_desc(struct device *d, struct vring_desc *desc,
- unsigned int i, unsigned int max)
-{
- unsigned int next;
-
- /* If this descriptor says it doesn't chain, we're done. */
- if (!(desc[i].flags & VRING_DESC_F_NEXT))
- return max;
-
- /* Check they're not leading us off end of descriptors. */
- next = desc[i].next;
- /* Make sure compiler knows to grab that: we don't want it changing! */
- wmb();
-
- if (next >= max)
- bad_driver(d, "Desc next is %u", next);
-
- return next;
-}
-
-/*
- * This actually sends the interrupt for this virtqueue, if we've used a
- * buffer.
- */
-static void trigger_irq(struct virtqueue *vq)
-{
- unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line };
-
- /* Don't inform them if nothing used. */
- if (!vq->pending_used)
- return;
- vq->pending_used = 0;
-
- /*
- * 2.4.7.1:
- *
- * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
- * The driver MUST set flags to 0 or 1.
- */
- if (vq->vring.avail->flags > 1)
- bad_driver_vq(vq, "avail->flags = %u\n", vq->vring.avail->flags);
-
- /*
- * 2.4.7.2:
- *
- * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
- *
- * - The device MUST ignore the used_event value.
- * - After the device writes a descriptor index into the used ring:
- * - If flags is 1, the device SHOULD NOT send an interrupt.
- * - If flags is 0, the device MUST send an interrupt.
- */
- if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
- return;
- }
-
- /*
- * 4.1.4.5.1:
- *
- * If MSI-X capability is disabled, the device MUST set the Queue
- * Interrupt bit in ISR status before sending a virtqueue notification
- * to the driver.
- */
- vq->dev->mmio->isr = 0x1;
-
- /* Send the Guest an interrupt tell them we used something up. */
- if (write(lguest_fd, buf, sizeof(buf)) != 0)
- err(1, "Triggering irq %i", vq->dev->config.irq_line);
-}
-
-/*
- * This looks in the virtqueue for the first available buffer, and converts
- * it to an iovec for convenient access. Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function waits if necessary, and returns the descriptor number found.
- */
-static unsigned wait_for_vq_desc(struct virtqueue *vq,
- struct iovec iov[],
- unsigned int *out_num, unsigned int *in_num)
-{
- unsigned int i, head, max;
- struct vring_desc *desc;
- u16 last_avail = lg_last_avail(vq);
-
- /*
- * 2.4.7.1:
- *
- * The driver MUST handle spurious interrupts from the device.
- *
- * That's why this is a while loop.
- */
-
- /* There's nothing available? */
- while (last_avail == vq->vring.avail->idx) {
- u64 event;
-
- /*
- * Since we're about to sleep, now is a good time to tell the
- * Guest about what we've used up to now.
- */
- trigger_irq(vq);
-
- /* OK, now we need to know about added descriptors. */
- vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
-
- /*
- * They could have slipped one in as we were doing that: make
- * sure it's written, then check again.
- */
- mb();
- if (last_avail != vq->vring.avail->idx) {
- vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
- break;
- }
-
- /* Nothing new? Wait for eventfd to tell us they refilled. */
- if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
- errx(1, "Event read failed?");
-
- /* We don't need to be notified again. */
- vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
- }
-
- /* Check it isn't doing very strange things with descriptor numbers. */
- if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
- bad_driver_vq(vq, "Guest moved used index from %u to %u",
- last_avail, vq->vring.avail->idx);
-
- /*
- * Make sure we read the descriptor number *after* we read the ring
- * update; don't let the cpu or compiler change the order.
- */
- rmb();
-
- /*
- * Grab the next descriptor number they're advertising, and increment
- * the index we've seen.
- */
- head = vq->vring.avail->ring[last_avail % vq->vring.num];
- lg_last_avail(vq)++;
-
- /* If their number is silly, that's a fatal mistake. */
- if (head >= vq->vring.num)
- bad_driver_vq(vq, "Guest says index %u is available", head);
-
- /* When we start there are none of either input nor output. */
- *out_num = *in_num = 0;
-
- max = vq->vring.num;
- desc = vq->vring.desc;
- i = head;
-
- /*
- * We have to read the descriptor after we read the descriptor number,
- * but there's a data dependency there so the CPU shouldn't reorder
- * that: no rmb() required.
- */
-
- do {
- /*
- * If this is an indirect entry, then this buffer contains a
- * descriptor table which we handle as if it's any normal
- * descriptor chain.
- */
- if (desc[i].flags & VRING_DESC_F_INDIRECT) {
- /* 2.4.5.3.1:
- *
- * The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
- * flag unless the VIRTIO_F_INDIRECT_DESC feature was
- * negotiated.
- */
- if (!(vq->dev->features_accepted &
- (1<<VIRTIO_RING_F_INDIRECT_DESC)))
- bad_driver_vq(vq, "vq indirect not negotiated");
-
- /*
- * 2.4.5.3.1:
- *
- * The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
- * flag within an indirect descriptor (ie. only one
- * table per descriptor).
- */
- if (desc != vq->vring.desc)
- bad_driver_vq(vq, "Indirect within indirect");
-
- /*
- * Proposed update VIRTIO-134 spells this out:
- *
- * A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
- * and VIRTQ_DESC_F_NEXT in flags.
- */
- if (desc[i].flags & VRING_DESC_F_NEXT)
- bad_driver_vq(vq, "indirect and next together");
-
- if (desc[i].len % sizeof(struct vring_desc))
- bad_driver_vq(vq,
- "Invalid size for indirect table");
- /*
- * 2.4.5.3.2:
- *
- * The device MUST ignore the write-only flag
- * (flags&VIRTQ_DESC_F_WRITE) in the descriptor that
- * refers to an indirect table.
- *
- * We ignore it here: :)
- */
-
- max = desc[i].len / sizeof(struct vring_desc);
- desc = check_pointer(vq->dev, desc[i].addr, desc[i].len);
- i = 0;
-
- /* 2.4.5.3.1:
- *
- * A driver MUST NOT create a descriptor chain longer
- * than the Queue Size of the device.
- */
- if (max > vq->pci_config.queue_size)
- bad_driver_vq(vq,
- "indirect has too many entries");
- }
-
- /* Grab the first descriptor, and check it's OK. */
- iov[*out_num + *in_num].iov_len = desc[i].len;
- iov[*out_num + *in_num].iov_base
- = check_pointer(vq->dev, desc[i].addr, desc[i].len);
- /* If this is an input descriptor, increment that count. */
- if (desc[i].flags & VRING_DESC_F_WRITE)
- (*in_num)++;
- else {
- /*
- * If it's an output descriptor, they're all supposed
- * to come before any input descriptors.
- */
- if (*in_num)
- bad_driver_vq(vq,
- "Descriptor has out after in");
- (*out_num)++;
- }
-
- /* If we've got too many, that implies a descriptor loop. */
- if (*out_num + *in_num > max)
- bad_driver_vq(vq, "Looped descriptor");
- } while ((i = next_desc(vq->dev, desc, i, max)) != max);
-
- return head;
-}
-
-/*
- * After we've used one of their buffers, we tell the Guest about it. Sometime
- * later we'll want to send them an interrupt using trigger_irq(); note that
- * wait_for_vq_desc() does that for us if it has to wait.
- */
-static void add_used(struct virtqueue *vq, unsigned int head, int len)
-{
- struct vring_used_elem *used;
-
- /*
- * The virtqueue contains a ring of used buffers. Get a pointer to the
- * next entry in that used ring.
- */
- used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
- used->id = head;
- used->len = len;
- /* Make sure buffer is written before we update index. */
- wmb();
- vq->vring.used->idx++;
- vq->pending_used++;
-}
-
-/* And here's the combo meal deal. Supersize me! */
-static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
-{
- add_used(vq, head, len);
- trigger_irq(vq);
-}
-
-/*
- * The Console
- *
- * We associate some data with the console for our exit hack.
- */
-struct console_abort {
- /* How many times have they hit ^C? */
- int count;
- /* When did they start? */
- struct timeval start;
-};
-
-/* This is the routine which handles console input (ie. stdin). */
-static void console_input(struct virtqueue *vq)
-{
- int len;
- unsigned int head, in_num, out_num;
- struct console_abort *abort = vq->dev->priv;
- struct iovec iov[vq->vring.num];
-
- /* Make sure there's a descriptor available. */
- head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
- if (out_num)
- bad_driver_vq(vq, "Output buffers in console in queue?");
-
- /* Read into it. This is where we usually wait. */
- len = readv(STDIN_FILENO, iov, in_num);
- if (len <= 0) {
- /* Ran out of input? */
- warnx("Failed to get console input, ignoring console.");
- /*
- * For simplicity, dying threads kill the whole Launcher. So
- * just nap here.
- */
- for (;;)
- pause();
- }
-
- /* Tell the Guest we used a buffer. */
- add_used_and_trigger(vq, head, len);
-
- /*
- * Three ^C within one second? Exit.
- *
- * This is such a hack, but works surprisingly well. Each ^C has to
- * be in a buffer by itself, so they can't be too fast. But we check
- * that we get three within about a second, so they can't be too
- * slow.
- */
- if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
- abort->count = 0;
- return;
- }
-
- abort->count++;
- if (abort->count == 1)
- gettimeofday(&abort->start, NULL);
- else if (abort->count == 3) {
- struct timeval now;
- gettimeofday(&now, NULL);
- /* Kill all Launcher processes with SIGINT, like normal ^C */
- if (now.tv_sec <= abort->start.tv_sec+1)
- kill(0, SIGINT);
- abort->count = 0;
- }
-}
-
-/* This is the routine which handles console output (ie. stdout). */
-static void console_output(struct virtqueue *vq)
-{
- unsigned int head, out, in;
- struct iovec iov[vq->vring.num];
-
- /* We usually wait in here, for the Guest to give us something. */
- head = wait_for_vq_desc(vq, iov, &out, &in);
- if (in)
- bad_driver_vq(vq, "Input buffers in console output queue?");
-
- /* writev can return a partial write, so we loop here. */
- while (!iov_empty(iov, out)) {
- int len = writev(STDOUT_FILENO, iov, out);
- if (len <= 0) {
- warn("Write to stdout gave %i (%d)", len, errno);
- break;
- }
- iov_consume(vq->dev, iov, out, NULL, len);
- }
-
- /*
- * We're finished with that buffer: if we're going to sleep,
- * wait_for_vq_desc() will prod the Guest with an interrupt.
- */
- add_used(vq, head, 0);
-}
-
-/*
- * The Network
- *
- * Handling output for network is also simple: we get all the output buffers
- * and write them to /dev/net/tun.
- */
-struct net_info {
- int tunfd;
-};
-
-static void net_output(struct virtqueue *vq)
-{
- struct net_info *net_info = vq->dev->priv;
- unsigned int head, out, in;
- struct iovec iov[vq->vring.num];
-
- /* We usually wait in here for the Guest to give us a packet. */
- head = wait_for_vq_desc(vq, iov, &out, &in);
- if (in)
- bad_driver_vq(vq, "Input buffers in net output queue?");
- /*
- * Send the whole thing through to /dev/net/tun. It expects the exact
- * same format: what a coincidence!
- */
- if (writev(net_info->tunfd, iov, out) < 0)
- warnx("Write to tun failed (%d)?", errno);
-
- /*
- * Done with that one; wait_for_vq_desc() will send the interrupt if
- * all packets are processed.
- */
- add_used(vq, head, 0);
-}
-
-/*
- * Handling network input is a bit trickier, because I've tried to optimize it.
- *
- * First we have a helper routine which tells is if from this file descriptor
- * (ie. the /dev/net/tun device) will block:
- */
-static bool will_block(int fd)
-{
- fd_set fdset;
- struct timeval zero = { 0, 0 };
- FD_ZERO(&fdset);
- FD_SET(fd, &fdset);
- return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
-}
-
-/*
- * This handles packets coming in from the tun device to our Guest. Like all
- * service routines, it gets called again as soon as it returns, so you don't
- * see a while(1) loop here.
- */
-static void net_input(struct virtqueue *vq)
-{
- int len;
- unsigned int head, out, in;
- struct iovec iov[vq->vring.num];
- struct net_info *net_info = vq->dev->priv;
-
- /*
- * Get a descriptor to write an incoming packet into. This will also
- * send an interrupt if they're out of descriptors.
- */
- head = wait_for_vq_desc(vq, iov, &out, &in);
- if (out)
- bad_driver_vq(vq, "Output buffers in net input queue?");
-
- /*
- * If it looks like we'll block reading from the tun device, send them
- * an interrupt.
- */
- if (vq->pending_used && will_block(net_info->tunfd))
- trigger_irq(vq);
-
- /*
- * Read in the packet. This is where we normally wait (when there's no
- * incoming network traffic).
- */
- len = readv(net_info->tunfd, iov, in);
- if (len <= 0)
- warn("Failed to read from tun (%d).", errno);
-
- /*
- * Mark that packet buffer as used, but don't interrupt here. We want
- * to wait until we've done as much work as we can.
- */
- add_used(vq, head, len);
-}
-/*:*/
-
-/* This is the helper to create threads: run the service routine in a loop. */
-static int do_thread(void *_vq)
-{
- struct virtqueue *vq = _vq;
-
- for (;;)
- vq->service(vq);
- return 0;
-}
-
-/*
- * When a child dies, we kill our entire process group with SIGTERM. This
- * also has the side effect that the shell restores the console for us!
- */
-static void kill_launcher(int signal)
-{
- kill(0, SIGTERM);
-}
-
-static void reset_vq_pci_config(struct virtqueue *vq)
-{
- vq->pci_config.queue_size = VIRTQUEUE_NUM;
- vq->pci_config.queue_enable = 0;
-}
-
-static void reset_device(struct device *dev)
-{
- struct virtqueue *vq;
-
- verbose("Resetting device %s\n", dev->name);
-
- /* Clear any features they've acked. */
- dev->features_accepted = 0;
-
- /* We're going to be explicitly killing threads, so ignore them. */
- signal(SIGCHLD, SIG_IGN);
-
- /*
- * 4.1.4.3.1:
- *
- * The device MUST present a 0 in queue_enable on reset.
- *
- * This means we set it here, and reset the saved ones in every vq.
- */
- dev->mmio->cfg.queue_enable = 0;
-
- /* Get rid of the virtqueue threads */
- for (vq = dev->vq; vq; vq = vq->next) {
- vq->last_avail_idx = 0;
- reset_vq_pci_config(vq);
- if (vq->thread != (pid_t)-1) {
- kill(vq->thread, SIGTERM);
- waitpid(vq->thread, NULL, 0);
- vq->thread = (pid_t)-1;
- }
- }
- dev->running = false;
- dev->wrote_features_ok = false;
-
- /* Now we care if threads die. */
- signal(SIGCHLD, (void *)kill_launcher);
-}
-
-static void cleanup_devices(void)
-{
- unsigned int i;
-
- for (i = 1; i < MAX_PCI_DEVICES; i++) {
- struct device *d = devices.pci[i];
- if (!d)
- continue;
- reset_device(d);
- }
-
- /* If we saved off the original terminal settings, restore them now. */
- if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
- tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
-}
-
-/*L:217
- * We do PCI. This is mainly done to let us test the kernel virtio PCI
- * code.
- */
-
-/* Linux expects a PCI host bridge: ours is a dummy, and first on the bus. */
-static struct device pci_host_bridge;
-
-static void init_pci_host_bridge(void)
-{
- pci_host_bridge.name = "PCI Host Bridge";
- pci_host_bridge.config.class = 0x06; /* bridge */
- pci_host_bridge.config.subclass = 0; /* host bridge */
- devices.pci[0] = &pci_host_bridge;
-}
-
-/* The IO ports used to read the PCI config space. */
-#define PCI_CONFIG_ADDR 0xCF8
-#define PCI_CONFIG_DATA 0xCFC
-
-/*
- * Not really portable, but does help readability: this is what the Guest
- * writes to the PCI_CONFIG_ADDR IO port.
- */
-union pci_config_addr {
- struct {
- unsigned mbz: 2;
- unsigned offset: 6;
- unsigned funcnum: 3;
- unsigned devnum: 5;
- unsigned busnum: 8;
- unsigned reserved: 7;
- unsigned enabled : 1;
- } bits;
- u32 val;
-};
-
-/*
- * We cache what they wrote to the address port, so we know what they're
- * talking about when they access the data port.
- */
-static union pci_config_addr pci_config_addr;
-
-static struct device *find_pci_device(unsigned int index)
-{
- return devices.pci[index];
-}
-
-/* PCI can do 1, 2 and 4 byte reads; we handle that here. */
-static void ioread(u16 off, u32 v, u32 mask, u32 *val)
-{
- assert(off < 4);
- assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
- *val = (v >> (off * 8)) & mask;
-}
-
-/* PCI can do 1, 2 and 4 byte writes; we handle that here. */
-static void iowrite(u16 off, u32 v, u32 mask, u32 *dst)
-{
- assert(off < 4);
- assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
- *dst &= ~(mask << (off * 8));
- *dst |= (v & mask) << (off * 8);
-}
-
-/*
- * Where PCI_CONFIG_DATA accesses depends on the previous write to
- * PCI_CONFIG_ADDR.
- */
-static struct device *dev_and_reg(u32 *reg)
-{
- if (!pci_config_addr.bits.enabled)
- return NULL;
-
- if (pci_config_addr.bits.funcnum != 0)
- return NULL;
-
- if (pci_config_addr.bits.busnum != 0)
- return NULL;
-
- if (pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config))
- return NULL;
-
- *reg = pci_config_addr.bits.offset;
- return find_pci_device(pci_config_addr.bits.devnum);
-}
-
-/*
- * We can get invalid combinations of values while they're writing, so we
- * only fault if they try to write with some invalid bar/offset/length.
- */
-static bool valid_bar_access(struct device *d,
- struct virtio_pci_cfg_cap_u32 *cfg_access)
-{
- /* We only have 1 bar (BAR0) */
- if (cfg_access->cap.bar != 0)
- return false;
-
- /* Check it's within BAR0. */
- if (cfg_access->cap.offset >= d->mmio_size
- || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size)
- return false;
-
- /* Check length is 1, 2 or 4. */
- if (cfg_access->cap.length != 1
- && cfg_access->cap.length != 2
- && cfg_access->cap.length != 4)
- return false;
-
- /*
- * 4.1.4.7.2:
- *
- * The driver MUST NOT write a cap.offset which is not a multiple of
- * cap.length (ie. all accesses MUST be aligned).
- */
- if (cfg_access->cap.offset % cfg_access->cap.length != 0)
- return false;
-
- /* Return pointer into word in BAR0. */
- return true;
-}
-
-/* Is this accessing the PCI config address port?. */
-static bool is_pci_addr_port(u16 port)
-{
- return port >= PCI_CONFIG_ADDR && port < PCI_CONFIG_ADDR + 4;
-}
-
-static bool pci_addr_iowrite(u16 port, u32 mask, u32 val)
-{
- iowrite(port - PCI_CONFIG_ADDR, val, mask,
- &pci_config_addr.val);
- verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n",
- pci_config_addr.bits.enabled ? "" : " DISABLED",
- val, mask,
- pci_config_addr.bits.busnum,
- pci_config_addr.bits.devnum,
- pci_config_addr.bits.funcnum,
- pci_config_addr.bits.offset);
- return true;
-}
-
-static void pci_addr_ioread(u16 port, u32 mask, u32 *val)
-{
- ioread(port - PCI_CONFIG_ADDR, pci_config_addr.val, mask, val);
-}
-
-/* Is this accessing the PCI config data port?. */
-static bool is_pci_data_port(u16 port)
-{
- return port >= PCI_CONFIG_DATA && port < PCI_CONFIG_DATA + 4;
-}
-
-static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask);
-
-static bool pci_data_iowrite(u16 port, u32 mask, u32 val)
-{
- u32 reg, portoff;
- struct device *d = dev_and_reg(®);
-
- /* Complain if they don't belong to a device. */
- if (!d)
- return false;
-
- /* They can do 1 byte writes, etc. */
- portoff = port - PCI_CONFIG_DATA;
-
- /*
- * PCI uses a weird way to determine the BAR size: the OS
- * writes all 1's, and sees which ones stick.
- */
- if (&d->config_words[reg] == &d->config.bar[0]) {
- int i;
-
- iowrite(portoff, val, mask, &d->config.bar[0]);
- for (i = 0; (1 << i) < d->mmio_size; i++)
- d->config.bar[0] &= ~(1 << i);
- return true;
- } else if ((&d->config_words[reg] > &d->config.bar[0]
- && &d->config_words[reg] <= &d->config.bar[6])
- || &d->config_words[reg] == &d->config.expansion_rom_addr) {
- /* Allow writing to any other BAR, or expansion ROM */
- iowrite(portoff, val, mask, &d->config_words[reg]);
- return true;
- /* We let them override latency timer and cacheline size */
- } else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) {
- /* Only let them change the first two fields. */
- if (mask == 0xFFFFFFFF)
- mask = 0xFFFF;
- iowrite(portoff, val, mask, &d->config_words[reg]);
- return true;
- } else if (&d->config_words[reg] == (void *)&d->config.command
- && mask == 0xFFFF) {
- /* Ignore command writes. */
- return true;
- } else if (&d->config_words[reg]
- == (void *)&d->config.cfg_access.cap.bar
- || &d->config_words[reg]
- == &d->config.cfg_access.cap.length
- || &d->config_words[reg]
- == &d->config.cfg_access.cap.offset) {
-
- /*
- * The VIRTIO_PCI_CAP_PCI_CFG capability
- * provides a backdoor to access the MMIO
- * regions without mapping them. Weird, but
- * useful.
- */
- iowrite(portoff, val, mask, &d->config_words[reg]);
- return true;
- } else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
- u32 write_mask;
-
- /*
- * 4.1.4.7.1:
- *
- * Upon detecting driver write access to pci_cfg_data, the
- * device MUST execute a write access at offset cap.offset at
- * BAR selected by cap.bar using the first cap.length bytes
- * from pci_cfg_data.
- */
-
- /* Must be bar 0 */
- if (!valid_bar_access(d, &d->config.cfg_access))
- return false;
-
- iowrite(portoff, val, mask, &d->config.cfg_access.pci_cfg_data);
-
- /*
- * Now emulate a write. The mask we use is set by
- * len, *not* this write!
- */
- write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1;
- verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n",
- d->config.cfg_access.pci_cfg_data, write_mask,
- d->config.cfg_access.cap.bar,
- d->config.cfg_access.cap.offset,
- d->config.cfg_access.cap.length);
-
- emulate_mmio_write(d, d->config.cfg_access.cap.offset,
- d->config.cfg_access.pci_cfg_data,
- write_mask);
- return true;
- }
-
- /*
- * 4.1.4.1:
- *
- * The driver MUST NOT write into any field of the capability
- * structure, with the exception of those with cap_type
- * VIRTIO_PCI_CAP_PCI_CFG...
- */
- return false;
-}
-
-static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask);
-
-static void pci_data_ioread(u16 port, u32 mask, u32 *val)
-{
- u32 reg;
- struct device *d = dev_and_reg(®);
-
- if (!d)
- return;
-
- /* Read through the PCI MMIO access window is special */
- if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
- u32 read_mask;
-
- /*
- * 4.1.4.7.1:
- *
- * Upon detecting driver read access to pci_cfg_data, the
- * device MUST execute a read access of length cap.length at
- * offset cap.offset at BAR selected by cap.bar and store the
- * first cap.length bytes in pci_cfg_data.
- */
- /* Must be bar 0 */
- if (!valid_bar_access(d, &d->config.cfg_access))
- bad_driver(d,
- "Invalid cfg_access to bar%u, offset %u len %u",
- d->config.cfg_access.cap.bar,
- d->config.cfg_access.cap.offset,
- d->config.cfg_access.cap.length);
-
- /*
- * Read into the window. The mask we use is set by
- * len, *not* this read!
- */
- read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1;
- d->config.cfg_access.pci_cfg_data
- = emulate_mmio_read(d,
- d->config.cfg_access.cap.offset,
- read_mask);
- verbose("Window read %#x/%#x from bar %u, offset %u len %u\n",
- d->config.cfg_access.pci_cfg_data, read_mask,
- d->config.cfg_access.cap.bar,
- d->config.cfg_access.cap.offset,
- d->config.cfg_access.cap.length);
- }
- ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val);
-}
-
-/*L:216
- * This is where we emulate a handful of Guest instructions. It's ugly
- * and we used to do it in the kernel but it grew over time.
- */
-
-/*
- * We use the ptrace syscall's pt_regs struct to talk about registers
- * to lguest: these macros convert the names to the offsets.
- */
-#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
-#define setreg(name, val) \
- setreg_off(offsetof(struct user_regs_struct, name), (val))
-
-static u32 getreg_off(size_t offset)
-{
- u32 r;
- unsigned long args[] = { LHREQ_GETREG, offset };
-
- if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
- err(1, "Getting register %u", offset);
- if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
- err(1, "Reading register %u", offset);
-
- return r;
-}
-
-static void setreg_off(size_t offset, u32 val)
-{
- unsigned long args[] = { LHREQ_SETREG, offset, val };
-
- if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
- err(1, "Setting register %u", offset);
-}
-
-/* Get register by instruction encoding */
-static u32 getreg_num(unsigned regnum, u32 mask)
-{
- /* 8 bit ops use regnums 4-7 for high parts of word */
- if (mask == 0xFF && (regnum & 0x4))
- return getreg_num(regnum & 0x3, 0xFFFF) >> 8;
-
- switch (regnum) {
- case 0: return getreg(eax) & mask;
- case 1: return getreg(ecx) & mask;
- case 2: return getreg(edx) & mask;
- case 3: return getreg(ebx) & mask;
- case 4: return getreg(esp) & mask;
- case 5: return getreg(ebp) & mask;
- case 6: return getreg(esi) & mask;
- case 7: return getreg(edi) & mask;
- }
- abort();
-}
-
-/* Set register by instruction encoding */
-static void setreg_num(unsigned regnum, u32 val, u32 mask)
-{
- /* Don't try to set bits out of range */
- assert(~(val & ~mask));
-
- /* 8 bit ops use regnums 4-7 for high parts of word */
- if (mask == 0xFF && (regnum & 0x4)) {
- /* Construct the 16 bits we want. */
- val = (val << 8) | getreg_num(regnum & 0x3, 0xFF);
- setreg_num(regnum & 0x3, val, 0xFFFF);
- return;
- }
-
- switch (regnum) {
- case 0: setreg(eax, val | (getreg(eax) & ~mask)); return;
- case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return;
- case 2: setreg(edx, val | (getreg(edx) & ~mask)); return;
- case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return;
- case 4: setreg(esp, val | (getreg(esp) & ~mask)); return;
- case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return;
- case 6: setreg(esi, val | (getreg(esi) & ~mask)); return;
- case 7: setreg(edi, val | (getreg(edi) & ~mask)); return;
- }
- abort();
-}
-
-/* Get bytes of displacement appended to instruction, from r/m encoding */
-static u32 insn_displacement_len(u8 mod_reg_rm)
-{
- /* Switch on the mod bits */
- switch (mod_reg_rm >> 6) {
- case 0:
- /* If mod == 0, and r/m == 101, 16-bit displacement follows */
- if ((mod_reg_rm & 0x7) == 0x5)
- return 2;
- /* Normally, mod == 0 means no literal displacement */
- return 0;
- case 1:
- /* One byte displacement */
- return 1;
- case 2:
- /* Four byte displacement */
- return 4;
- case 3:
- /* Register mode */
- return 0;
- }
- abort();
-}
-
-static void emulate_insn(const u8 insn[])
-{
- unsigned long args[] = { LHREQ_TRAP, 13 };
- unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
- unsigned int eax, port, mask;
- /*
- * Default is to return all-ones on IO port reads, which traditionally
- * means "there's nothing there".
- */
- u32 val = 0xFFFFFFFF;
-
- /*
- * This must be the Guest kernel trying to do something, not userspace!
- * The bottom two bits of the CS segment register are the privilege
- * level.
- */
- if ((getreg(xcs) & 3) != 0x1)
- goto no_emulate;
-
- /* Decoding x86 instructions is icky. */
-
- /*
- * Around 2.6.33, the kernel started using an emulation for the
- * cmpxchg8b instruction in early boot on many configurations. This
- * code isn't paravirtualized, and it tries to disable interrupts.
- * Ignore it, which will Mostly Work.
- */
- if (insn[insnlen] == 0xfa) {
- /* "cli", or Clear Interrupt Enable instruction. Skip it. */
- insnlen = 1;
- goto skip_insn;
- }
-
- /*
- * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out.
- */
- if (insn[insnlen] == 0x66) {
- small_operand = 1;
- /* The instruction is 1 byte so far, read the next byte. */
- insnlen = 1;
- }
-
- /* If the lower bit isn't set, it's a single byte access */
- byte_access = !(insn[insnlen] & 1);
-
- /*
- * Now we can ignore the lower bit and decode the 4 opcodes
- * we need to emulate.
- */
- switch (insn[insnlen] & 0xFE) {
- case 0xE4: /* in <next byte>,%al */
- port = insn[insnlen+1];
- insnlen += 2;
- in = 1;
- break;
- case 0xEC: /* in (%dx),%al */
- port = getreg(edx) & 0xFFFF;
- insnlen += 1;
- in = 1;
- break;
- case 0xE6: /* out %al,<next byte> */
- port = insn[insnlen+1];
- insnlen += 2;
- break;
- case 0xEE: /* out %al,(%dx) */
- port = getreg(edx) & 0xFFFF;
- insnlen += 1;
- break;
- default:
- /* OK, we don't know what this is, can't emulate. */
- goto no_emulate;
- }
-
- /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
- if (byte_access)
- mask = 0xFF;
- else if (small_operand)
- mask = 0xFFFF;
- else
- mask = 0xFFFFFFFF;
-
- /*
- * If it was an "IN" instruction, they expect the result to be read
- * into %eax, so we change %eax.
- */
- eax = getreg(eax);
-
- if (in) {
- /* This is the PS/2 keyboard status; 1 means ready for output */
- if (port == 0x64)
- val = 1;
- else if (is_pci_addr_port(port))
- pci_addr_ioread(port, mask, &val);
- else if (is_pci_data_port(port))
- pci_data_ioread(port, mask, &val);
-
- /* Clear the bits we're about to read */
- eax &= ~mask;
- /* Copy bits in from val. */
- eax |= val & mask;
- /* Now update the register. */
- setreg(eax, eax);
- } else {
- if (is_pci_addr_port(port)) {
- if (!pci_addr_iowrite(port, mask, eax))
- goto bad_io;
- } else if (is_pci_data_port(port)) {
- if (!pci_data_iowrite(port, mask, eax))
- goto bad_io;
- }
- /* There are many other ports, eg. CMOS clock, serial
- * and parallel ports, so we ignore them all. */
- }
-
- verbose("IO %s of %x to %u: %#08x\n",
- in ? "IN" : "OUT", mask, port, eax);
-skip_insn:
- /* Finally, we've "done" the instruction, so move past it. */
- setreg(eip, getreg(eip) + insnlen);
- return;
-
-bad_io:
- warnx("Attempt to %s port %u (%#x mask)",
- in ? "read from" : "write to", port, mask);
-
-no_emulate:
- /* Inject trap into Guest. */
- if (write(lguest_fd, args, sizeof(args)) < 0)
- err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
-}
-
-static struct device *find_mmio_region(unsigned long paddr, u32 *off)
-{
- unsigned int i;
-
- for (i = 1; i < MAX_PCI_DEVICES; i++) {
- struct device *d = devices.pci[i];
-
- if (!d)
- continue;
- if (paddr < d->mmio_addr)
- continue;
- if (paddr >= d->mmio_addr + d->mmio_size)
- continue;
- *off = paddr - d->mmio_addr;
- return d;
- }
- return NULL;
-}
-
-/* FIXME: Use vq array. */
-static struct virtqueue *vq_by_num(struct device *d, u32 num)
-{
- struct virtqueue *vq = d->vq;
-
- while (num-- && vq)
- vq = vq->next;
-
- return vq;
-}
-
-static void save_vq_config(const struct virtio_pci_common_cfg *cfg,
- struct virtqueue *vq)
-{
- vq->pci_config = *cfg;
-}
-
-static void restore_vq_config(struct virtio_pci_common_cfg *cfg,
- struct virtqueue *vq)
-{
- /* Only restore the per-vq part */
- size_t off = offsetof(struct virtio_pci_common_cfg, queue_size);
-
- memcpy((void *)cfg + off, (void *)&vq->pci_config + off,
- sizeof(*cfg) - off);
-}
-
-/*
- * 4.1.4.3.2:
- *
- * The driver MUST configure the other virtqueue fields before
- * enabling the virtqueue with queue_enable.
- *
- * When they enable the virtqueue, we check that their setup is valid.
- */
-static void check_virtqueue(struct device *d, struct virtqueue *vq)
-{
- /* Because lguest is 32 bit, all the descriptor high bits must be 0 */
- if (vq->pci_config.queue_desc_hi
- || vq->pci_config.queue_avail_hi
- || vq->pci_config.queue_used_hi)
- bad_driver_vq(vq, "invalid 64-bit queue address");
-
- /*
- * 2.4.1:
- *
- * The driver MUST ensure that the physical address of the first byte
- * of each virtqueue part is a multiple of the specified alignment
- * value in the above table.
- */
- if (vq->pci_config.queue_desc_lo % 16
- || vq->pci_config.queue_avail_lo % 2
- || vq->pci_config.queue_used_lo % 4)
- bad_driver_vq(vq, "invalid alignment in queue addresses");
-
- /* Initialize the virtqueue and check they're all in range. */
- vq->vring.num = vq->pci_config.queue_size;
- vq->vring.desc = check_pointer(vq->dev,
- vq->pci_config.queue_desc_lo,
- sizeof(*vq->vring.desc) * vq->vring.num);
- vq->vring.avail = check_pointer(vq->dev,
- vq->pci_config.queue_avail_lo,
- sizeof(*vq->vring.avail)
- + (sizeof(vq->vring.avail->ring[0])
- * vq->vring.num));
- vq->vring.used = check_pointer(vq->dev,
- vq->pci_config.queue_used_lo,
- sizeof(*vq->vring.used)
- + (sizeof(vq->vring.used->ring[0])
- * vq->vring.num));
-
- /*
- * 2.4.9.1:
- *
- * The driver MUST initialize flags in the used ring to 0
- * when allocating the used ring.
- */
- if (vq->vring.used->flags != 0)
- bad_driver_vq(vq, "invalid initial used.flags %#x",
- vq->vring.used->flags);
-}
-
-static void start_virtqueue(struct virtqueue *vq)
-{
- /*
- * Create stack for thread. Since the stack grows upwards, we point
- * the stack pointer to the end of this region.
- */
- char *stack = malloc(32768);
-
- /* Create a zero-initialized eventfd. */
- vq->eventfd = eventfd(0, 0);
- if (vq->eventfd < 0)
- err(1, "Creating eventfd");
-
- /*
- * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
- * we get a signal if it dies.
- */
- vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
- if (vq->thread == (pid_t)-1)
- err(1, "Creating clone");
-}
-
-static void start_virtqueues(struct device *d)
-{
- struct virtqueue *vq;
-
- for (vq = d->vq; vq; vq = vq->next) {
- if (vq->pci_config.queue_enable)
- start_virtqueue(vq);
- }
-}
-
-static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask)
-{
- struct virtqueue *vq;
-
- switch (off) {
- case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
- /*
- * 4.1.4.3.1:
- *
- * The device MUST present the feature bits it is offering in
- * device_feature, starting at bit device_feature_select ∗ 32
- * for any device_feature_select written by the driver
- */
- if (val == 0)
- d->mmio->cfg.device_feature = d->features;
- else if (val == 1)
- d->mmio->cfg.device_feature = (d->features >> 32);
- else
- d->mmio->cfg.device_feature = 0;
- goto feature_write_through32;
- case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
- if (val > 1)
- bad_driver(d, "Unexpected driver select %u", val);
- goto feature_write_through32;
- case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
- if (d->mmio->cfg.guest_feature_select == 0) {
- d->features_accepted &= ~((u64)0xFFFFFFFF);
- d->features_accepted |= val;
- } else {
- assert(d->mmio->cfg.guest_feature_select == 1);
- d->features_accepted &= 0xFFFFFFFF;
- d->features_accepted |= ((u64)val) << 32;
- }
- /*
- * 2.2.1:
- *
- * The driver MUST NOT accept a feature which the device did
- * not offer
- */
- if (d->features_accepted & ~d->features)
- bad_driver(d, "over-accepted features %#llx of %#llx",
- d->features_accepted, d->features);
- goto feature_write_through32;
- case offsetof(struct virtio_pci_mmio, cfg.device_status): {
- u8 prev;
-
- verbose("%s: device status -> %#x\n", d->name, val);
- /*
- * 4.1.4.3.1:
- *
- * The device MUST reset when 0 is written to device_status,
- * and present a 0 in device_status once that is done.
- */
- if (val == 0) {
- reset_device(d);
- goto write_through8;
- }
-
- /* 2.1.1: The driver MUST NOT clear a device status bit. */
- if (d->mmio->cfg.device_status & ~val)
- bad_driver(d, "unset of device status bit %#x -> %#x",
- d->mmio->cfg.device_status, val);
-
- /*
- * 2.1.2:
- *
- * The device MUST NOT consume buffers or notify the driver
- * before DRIVER_OK.
- */
- if (val & VIRTIO_CONFIG_S_DRIVER_OK
- && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
- start_virtqueues(d);
-
- /*
- * 3.1.1:
- *
- * The driver MUST follow this sequence to initialize a device:
- * - Reset the device.
- * - Set the ACKNOWLEDGE status bit: the guest OS has
- * notice the device.
- * - Set the DRIVER status bit: the guest OS knows how
- * to drive the device.
- * - Read device feature bits, and write the subset
- * of feature bits understood by the OS and driver
- * to the device. During this step the driver MAY
- * read (but MUST NOT write) the device-specific
- * configuration fields to check that it can
- * support the device before accepting it.
- * - Set the FEATURES_OK status bit. The driver
- * MUST not accept new feature bits after this
- * step.
- * - Re-read device status to ensure the FEATURES_OK
- * bit is still set: otherwise, the device does
- * not support our subset of features and the
- * device is unusable.
- * - Perform device-specific setup, including
- * discovery of virtqueues for the device,
- * optional per-bus setup, reading and possibly
- * writing the device’s virtio configuration
- * space, and population of virtqueues.
- * - Set the DRIVER_OK status bit. At this point the
- * device is “live”.
- */
- prev = 0;
- switch (val & ~d->mmio->cfg.device_status) {
- case VIRTIO_CONFIG_S_DRIVER_OK:
- prev |= VIRTIO_CONFIG_S_FEATURES_OK; /* fall thru */
- case VIRTIO_CONFIG_S_FEATURES_OK:
- prev |= VIRTIO_CONFIG_S_DRIVER; /* fall thru */
- case VIRTIO_CONFIG_S_DRIVER:
- prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE; /* fall thru */
- case VIRTIO_CONFIG_S_ACKNOWLEDGE:
- break;
- default:
- bad_driver(d, "unknown device status bit %#x -> %#x",
- d->mmio->cfg.device_status, val);
- }
- if (d->mmio->cfg.device_status != prev)
- bad_driver(d, "unexpected status transition %#x -> %#x",
- d->mmio->cfg.device_status, val);
-
- /* If they just wrote FEATURES_OK, we make sure they read */
- switch (val & ~d->mmio->cfg.device_status) {
- case VIRTIO_CONFIG_S_FEATURES_OK:
- d->wrote_features_ok = true;
- break;
- case VIRTIO_CONFIG_S_DRIVER_OK:
- if (d->wrote_features_ok)
- bad_driver(d, "did not re-read FEATURES_OK");
- break;
- }
- goto write_through8;
- }
- case offsetof(struct virtio_pci_mmio, cfg.queue_select):
- vq = vq_by_num(d, val);
- /*
- * 4.1.4.3.1:
- *
- * The device MUST present a 0 in queue_size if the virtqueue
- * corresponding to the current queue_select is unavailable.
- */
- if (!vq) {
- d->mmio->cfg.queue_size = 0;
- goto write_through16;
- }
- /* Save registers for old vq, if it was a valid vq */
- if (d->mmio->cfg.queue_size)
- save_vq_config(&d->mmio->cfg,
- vq_by_num(d, d->mmio->cfg.queue_select));
- /* Restore the registers for the queue they asked for */
- restore_vq_config(&d->mmio->cfg, vq);
- goto write_through16;
- case offsetof(struct virtio_pci_mmio, cfg.queue_size):
- /*
- * 4.1.4.3.2:
- *
- * The driver MUST NOT write a value which is not a power of 2
- * to queue_size.
- */
- if (val & (val-1))
- bad_driver(d, "invalid queue size %u", val);
- if (d->mmio->cfg.queue_enable)
- bad_driver(d, "changing queue size on live device");
- goto write_through16;
- case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector):
- bad_driver(d, "attempt to set MSIX vector to %u", val);
- case offsetof(struct virtio_pci_mmio, cfg.queue_enable): {
- struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select);
-
- /*
- * 4.1.4.3.2:
- *
- * The driver MUST NOT write a 0 to queue_enable.
- */
- if (val != 1)
- bad_driver(d, "setting queue_enable to %u", val);
-
- /*
- * 3.1.1:
- *
- * 7. Perform device-specific setup, including discovery of
- * virtqueues for the device, optional per-bus setup,
- * reading and possibly writing the device’s virtio
- * configuration space, and population of virtqueues.
- * 8. Set the DRIVER_OK status bit.
- *
- * All our devices require all virtqueues to be enabled, so
- * they should have done that before setting DRIVER_OK.
- */
- if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)
- bad_driver(d, "enabling vq after DRIVER_OK");
-
- d->mmio->cfg.queue_enable = val;
- save_vq_config(&d->mmio->cfg, vq);
- check_virtqueue(d, vq);
- goto write_through16;
- }
- case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off):
- bad_driver(d, "attempt to write to queue_notify_off");
- case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo):
- case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi):
- case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo):
- case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi):
- case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo):
- case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi):
- /*
- * 4.1.4.3.2:
- *
- * The driver MUST configure the other virtqueue fields before
- * enabling the virtqueue with queue_enable.
- */
- if (d->mmio->cfg.queue_enable)
- bad_driver(d, "changing queue on live device");
-
- /*
- * 3.1.1:
- *
- * The driver MUST follow this sequence to initialize a device:
- *...
- * 5. Set the FEATURES_OK status bit. The driver MUST not
- * accept new feature bits after this step.
- */
- if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK))
- bad_driver(d, "setting up vq before FEATURES_OK");
-
- /*
- * 6. Re-read device status to ensure the FEATURES_OK bit is
- * still set...
- */
- if (d->wrote_features_ok)
- bad_driver(d, "didn't re-read FEATURES_OK before setup");
-
- goto write_through32;
- case offsetof(struct virtio_pci_mmio, notify):
- vq = vq_by_num(d, val);
- if (!vq)
- bad_driver(d, "Invalid vq notification on %u", val);
- /* Notify the process handling this vq by adding 1 to eventfd */
- write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8);
- goto write_through16;
- case offsetof(struct virtio_pci_mmio, isr):
- bad_driver(d, "Unexpected write to isr");
- /* Weird corner case: write to emerg_wr of console */
- case sizeof(struct virtio_pci_mmio)
- + offsetof(struct virtio_console_config, emerg_wr):
- if (strcmp(d->name, "console") == 0) {
- char c = val;
- write(STDOUT_FILENO, &c, 1);
- goto write_through32;
- }
- /* Fall through... */
- default:
- /*
- * 4.1.4.3.2:
- *
- * The driver MUST NOT write to device_feature, num_queues,
- * config_generation or queue_notify_off.
- */
- bad_driver(d, "Unexpected write to offset %u", off);
- }
-
-feature_write_through32:
- /*
- * 3.1.1:
- *
- * The driver MUST follow this sequence to initialize a device:
- *...
- * - Set the DRIVER status bit: the guest OS knows how
- * to drive the device.
- * - Read device feature bits, and write the subset
- * of feature bits understood by the OS and driver
- * to the device.
- *...
- * - Set the FEATURES_OK status bit. The driver MUST not
- * accept new feature bits after this step.
- */
- if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
- bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER");
- if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)
- bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK");
-
- /*
- * 4.1.3.1:
- *
- * The driver MUST access each field using the “natural” access
- * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
- * 16-bit fields and 8-bit accesses for 8-bit fields.
- */
-write_through32:
- if (mask != 0xFFFFFFFF) {
- bad_driver(d, "non-32-bit write to offset %u (%#x)",
- off, getreg(eip));
- return;
- }
- memcpy((char *)d->mmio + off, &val, 4);
- return;
-
-write_through16:
- if (mask != 0xFFFF)
- bad_driver(d, "non-16-bit write to offset %u (%#x)",
- off, getreg(eip));
- memcpy((char *)d->mmio + off, &val, 2);
- return;
-
-write_through8:
- if (mask != 0xFF)
- bad_driver(d, "non-8-bit write to offset %u (%#x)",
- off, getreg(eip));
- memcpy((char *)d->mmio + off, &val, 1);
- return;
-}
-
-static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask)
-{
- u8 isr;
- u32 val = 0;
-
- switch (off) {
- case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
- case offsetof(struct virtio_pci_mmio, cfg.device_feature):
- case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
- case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
- /*
- * 3.1.1:
- *
- * The driver MUST follow this sequence to initialize a device:
- *...
- * - Set the DRIVER status bit: the guest OS knows how
- * to drive the device.
- * - Read device feature bits, and write the subset
- * of feature bits understood by the OS and driver
- * to the device.
- */
- if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
- bad_driver(d,
- "feature read before VIRTIO_CONFIG_S_DRIVER");
- goto read_through32;
- case offsetof(struct virtio_pci_mmio, cfg.msix_config):
- bad_driver(d, "read of msix_config");
- case offsetof(struct virtio_pci_mmio, cfg.num_queues):
- goto read_through16;
- case offsetof(struct virtio_pci_mmio, cfg.device_status):
- /* As they did read, any write of FEATURES_OK is now fine. */
- d->wrote_features_ok = false;
- goto read_through8;
- case offsetof(struct virtio_pci_mmio, cfg.config_generation):
- /*
- * 4.1.4.3.1:
- *
- * The device MUST present a changed config_generation after
- * the driver has read a device-specific configuration value
- * which has changed since any part of the device-specific
- * configuration was last read.
- *
- * This is simple: none of our devices change config, so this
- * is always 0.
- */
- goto read_through8;
- case offsetof(struct virtio_pci_mmio, notify):
- /*
- * 3.1.1:
- *
- * The driver MUST NOT notify the device before setting
- * DRIVER_OK.
- */
- if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
- bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK");
- goto read_through16;
- case offsetof(struct virtio_pci_mmio, isr):
- if (mask != 0xFF)
- bad_driver(d, "non-8-bit read from offset %u (%#x)",
- off, getreg(eip));
- isr = d->mmio->isr;
- /*
- * 4.1.4.5.1:
- *
- * The device MUST reset ISR status to 0 on driver read.
- */
- d->mmio->isr = 0;
- return isr;
- case offsetof(struct virtio_pci_mmio, padding):
- bad_driver(d, "read from padding (%#x)", getreg(eip));
- default:
- /* Read from device config space, beware unaligned overflow */
- if (off > d->mmio_size - 4)
- bad_driver(d, "read past end (%#x)", getreg(eip));
-
- /*
- * 3.1.1:
- * The driver MUST follow this sequence to initialize a device:
- *...
- * 3. Set the DRIVER status bit: the guest OS knows how to
- * drive the device.
- * 4. Read device feature bits, and write the subset of
- * feature bits understood by the OS and driver to the
- * device. During this step the driver MAY read (but MUST NOT
- * write) the device-specific configuration fields to check
- * that it can support the device before accepting it.
- */
- if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
- bad_driver(d,
- "config read before VIRTIO_CONFIG_S_DRIVER");
-
- if (mask == 0xFFFFFFFF)
- goto read_through32;
- else if (mask == 0xFFFF)
- goto read_through16;
- else
- goto read_through8;
- }
-
- /*
- * 4.1.3.1:
- *
- * The driver MUST access each field using the “natural” access
- * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
- * 16-bit fields and 8-bit accesses for 8-bit fields.
- */
-read_through32:
- if (mask != 0xFFFFFFFF)
- bad_driver(d, "non-32-bit read to offset %u (%#x)",
- off, getreg(eip));
- memcpy(&val, (char *)d->mmio + off, 4);
- return val;
-
-read_through16:
- if (mask != 0xFFFF)
- bad_driver(d, "non-16-bit read to offset %u (%#x)",
- off, getreg(eip));
- memcpy(&val, (char *)d->mmio + off, 2);
- return val;
-
-read_through8:
- if (mask != 0xFF)
- bad_driver(d, "non-8-bit read to offset %u (%#x)",
- off, getreg(eip));
- memcpy(&val, (char *)d->mmio + off, 1);
- return val;
-}
-
-static void emulate_mmio(unsigned long paddr, const u8 *insn)
-{
- u32 val, off, mask = 0xFFFFFFFF, insnlen = 0;
- struct device *d = find_mmio_region(paddr, &off);
- unsigned long args[] = { LHREQ_TRAP, 14 };
-
- if (!d) {
- warnx("MMIO touching %#08lx (not a device)", paddr);
- goto reinject;
- }
-
- /* Prefix makes it a 16 bit op */
- if (insn[0] == 0x66) {
- mask = 0xFFFF;
- insnlen++;
- }
-
- /* iowrite */
- if (insn[insnlen] == 0x89) {
- /* Next byte is r/m byte: bits 3-5 are register. */
- val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask);
- emulate_mmio_write(d, off, val, mask);
- insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
- } else if (insn[insnlen] == 0x8b) { /* ioread */
- /* Next byte is r/m byte: bits 3-5 are register. */
- val = emulate_mmio_read(d, off, mask);
- setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask);
- insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
- } else if (insn[0] == 0x88) { /* 8-bit iowrite */
- mask = 0xff;
- /* Next byte is r/m byte: bits 3-5 are register. */
- val = getreg_num((insn[1] >> 3) & 0x7, mask);
- emulate_mmio_write(d, off, val, mask);
- insnlen = 2 + insn_displacement_len(insn[1]);
- } else if (insn[0] == 0x8a) { /* 8-bit ioread */
- mask = 0xff;
- val = emulate_mmio_read(d, off, mask);
- setreg_num((insn[1] >> 3) & 0x7, val, mask);
- insnlen = 2 + insn_displacement_len(insn[1]);
- } else {
- warnx("Unknown MMIO instruction touching %#08lx:"
- " %02x %02x %02x %02x at %u",
- paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip));
- reinject:
- /* Inject trap into Guest. */
- if (write(lguest_fd, args, sizeof(args)) < 0)
- err(1, "Reinjecting trap 14 for fault at %#x",
- getreg(eip));
- return;
- }
-
- /* Finally, we've "done" the instruction, so move past it. */
- setreg(eip, getreg(eip) + insnlen);
-}
-
-/*L:190
- * Device Setup
- *
- * All devices need a descriptor so the Guest knows it exists, and a "struct
- * device" so the Launcher can keep track of it. We have common helper
- * routines to allocate and manage them.
- */
-static void add_pci_virtqueue(struct device *dev,
- void (*service)(struct virtqueue *),
- const char *name)
-{
- struct virtqueue **i, *vq = malloc(sizeof(*vq));
-
- /* Initialize the virtqueue */
- vq->next = NULL;
- vq->last_avail_idx = 0;
- vq->dev = dev;
- vq->name = name;
-
- /*
- * This is the routine the service thread will run, and its Process ID
- * once it's running.
- */
- vq->service = service;
- vq->thread = (pid_t)-1;
-
- /* Initialize the configuration. */
- reset_vq_pci_config(vq);
- vq->pci_config.queue_notify_off = 0;
-
- /* Add one to the number of queues */
- vq->dev->mmio->cfg.num_queues++;
-
- /*
- * Add to tail of list, so dev->vq is first vq, dev->vq->next is
- * second.
- */
- for (i = &dev->vq; *i; i = &(*i)->next);
- *i = vq;
-}
-
-/* The Guest accesses the feature bits via the PCI common config MMIO region */
-static void add_pci_feature(struct device *dev, unsigned bit)
-{
- dev->features |= (1ULL << bit);
-}
-
-/* For devices with no config. */
-static void no_device_config(struct device *dev)
-{
- dev->mmio_addr = get_mmio_region(dev->mmio_size);
-
- dev->config.bar[0] = dev->mmio_addr;
- /* Bottom 4 bits must be zero */
- assert(~(dev->config.bar[0] & 0xF));
-}
-
-/* This puts the device config into BAR0 */
-static void set_device_config(struct device *dev, const void *conf, size_t len)
-{
- /* Set up BAR 0 */
- dev->mmio_size += len;
- dev->mmio = realloc(dev->mmio, dev->mmio_size);
- memcpy(dev->mmio + 1, conf, len);
-
- /*
- * 4.1.4.6:
- *
- * The device MUST present at least one VIRTIO_PCI_CAP_DEVICE_CFG
- * capability for any device type which has a device-specific
- * configuration.
- */
- /* Hook up device cfg */
- dev->config.cfg_access.cap.cap_next
- = offsetof(struct pci_config, device);
-
- /*
- * 4.1.4.6.1:
- *
- * The offset for the device-specific configuration MUST be 4-byte
- * aligned.
- */
- assert(dev->config.cfg_access.cap.cap_next % 4 == 0);
-
- /* Fix up device cfg field length. */
- dev->config.device.length = len;
-
- /* The rest is the same as the no-config case */
- no_device_config(dev);
-}
-
-static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type,
- size_t bar_offset, size_t bar_bytes, u8 next)
-{
- cap->cap_vndr = PCI_CAP_ID_VNDR;
- cap->cap_next = next;
- cap->cap_len = caplen;
- cap->cfg_type = type;
- cap->bar = 0;
- memset(cap->padding, 0, sizeof(cap->padding));
- cap->offset = bar_offset;
- cap->length = bar_bytes;
-}
-
-/*
- * This sets up the pci_config structure, as defined in the virtio 1.0
- * standard (and PCI standard).
- */
-static void init_pci_config(struct pci_config *pci, u16 type,
- u8 class, u8 subclass)
-{
- size_t bar_offset, bar_len;
-
- /*
- * 4.1.4.4.1:
- *
- * The device MUST either present notify_off_multiplier as an even
- * power of 2, or present notify_off_multiplier as 0.
- *
- * 2.1.2:
- *
- * The device MUST initialize device status to 0 upon reset.
- */
- memset(pci, 0, sizeof(*pci));
-
- /* 4.1.2.1: Devices MUST have the PCI Vendor ID 0x1AF4 */
- pci->vendor_id = 0x1AF4;
- /* 4.1.2.1: ... PCI Device ID calculated by adding 0x1040 ... */
- pci->device_id = 0x1040 + type;
-
- /*
- * PCI have specific codes for different types of devices.
- * Linux doesn't care, but it's a good clue for people looking
- * at the device.
- */
- pci->class = class;
- pci->subclass = subclass;
-
- /*
- * 4.1.2.1:
- *
- * Non-transitional devices SHOULD have a PCI Revision ID of 1 or
- * higher
- */
- pci->revid = 1;
-
- /*
- * 4.1.2.1:
- *
- * Non-transitional devices SHOULD have a PCI Subsystem Device ID of
- * 0x40 or higher.
- */
- pci->subsystem_device_id = 0x40;
-
- /* We use our dummy interrupt controller, and irq_line is the irq */
- pci->irq_line = devices.next_irq++;
- pci->irq_pin = 0;
-
- /* Support for extended capabilities. */
- pci->status = (1 << 4);
-
- /* Link them in. */
- /*
- * 4.1.4.3.1:
- *
- * The device MUST present at least one common configuration
- * capability.
- */
- pci->capabilities = offsetof(struct pci_config, common);
-
- /* 4.1.4.3.1 ... offset MUST be 4-byte aligned. */
- assert(pci->capabilities % 4 == 0);
-
- bar_offset = offsetof(struct virtio_pci_mmio, cfg);
- bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg);
- init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG,
- bar_offset, bar_len,
- offsetof(struct pci_config, notify));
-
- /*
- * 4.1.4.4.1:
- *
- * The device MUST present at least one notification capability.
- */
- bar_offset += bar_len;
- bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify);
-
- /*
- * 4.1.4.4.1:
- *
- * The cap.offset MUST be 2-byte aligned.
- */
- assert(pci->common.cap_next % 2 == 0);
-
- /* FIXME: Use a non-zero notify_off, for per-queue notification? */
- /*
- * 4.1.4.4.1:
- *
- * The value cap.length presented by the device MUST be at least 2 and
- * MUST be large enough to support queue notification offsets for all
- * supported queues in all possible configurations.
- */
- assert(bar_len >= 2);
-
- init_cap(&pci->notify.cap, sizeof(pci->notify),
- VIRTIO_PCI_CAP_NOTIFY_CFG,
- bar_offset, bar_len,
- offsetof(struct pci_config, isr));
-
- bar_offset += bar_len;
- bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr);
- /*
- * 4.1.4.5.1:
- *
- * The device MUST present at least one VIRTIO_PCI_CAP_ISR_CFG
- * capability.
- */
- init_cap(&pci->isr, sizeof(pci->isr),
- VIRTIO_PCI_CAP_ISR_CFG,
- bar_offset, bar_len,
- offsetof(struct pci_config, cfg_access));
-
- /*
- * 4.1.4.7.1:
- *
- * The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG
- * capability.
- */
- /* This doesn't have any presence in the BAR */
- init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access),
- VIRTIO_PCI_CAP_PCI_CFG,
- 0, 0, 0);
-
- bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding);
- assert(bar_offset == sizeof(struct virtio_pci_mmio));
-
- /*
- * This gets sewn in and length set in set_device_config().
- * Some devices don't have a device configuration interface, so
- * we never expose this if we don't call set_device_config().
- */
- init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG,
- bar_offset, 0, 0);
-}
-
-/*
- * This routine does all the creation and setup of a new device, but we don't
- * actually place the MMIO region until we know the size (if any) of the
- * device-specific config. And we don't actually start the service threads
- * until later.
- *
- * See what I mean about userspace being boring?
- */
-static struct device *new_pci_device(const char *name, u16 type,
- u8 class, u8 subclass)
-{
- struct device *dev = malloc(sizeof(*dev));
-
- /* Now we populate the fields one at a time. */
- dev->name = name;
- dev->vq = NULL;
- dev->running = false;
- dev->wrote_features_ok = false;
- dev->mmio_size = sizeof(struct virtio_pci_mmio);
- dev->mmio = calloc(1, dev->mmio_size);
- dev->features = (u64)1 << VIRTIO_F_VERSION_1;
- dev->features_accepted = 0;
-
- if (devices.device_num + 1 >= MAX_PCI_DEVICES)
- errx(1, "Can only handle 31 PCI devices");
-
- init_pci_config(&dev->config, type, class, subclass);
- assert(!devices.pci[devices.device_num+1]);
- devices.pci[++devices.device_num] = dev;
-
- return dev;
-}
-
-/*
- * Our first setup routine is the console. It's a fairly simple device, but
- * UNIX tty handling makes it uglier than it could be.
- */
-static void setup_console(void)
-{
- struct device *dev;
- struct virtio_console_config conf;
-
- /* If we can save the initial standard input settings... */
- if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
- struct termios term = orig_term;
- /*
- * Then we turn off echo, line buffering and ^C etc: We want a
- * raw input stream to the Guest.
- */
- term.c_lflag &= ~(ISIG|ICANON|ECHO);
- tcsetattr(STDIN_FILENO, TCSANOW, &term);
- }
-
- dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00);
-
- /* We store the console state in dev->priv, and initialize it. */
- dev->priv = malloc(sizeof(struct console_abort));
- ((struct console_abort *)dev->priv)->count = 0;
-
- /*
- * The console needs two virtqueues: the input then the output. When
- * they put something the input queue, we make sure we're listening to
- * stdin. When they put something in the output queue, we write it to
- * stdout.
- */
- add_pci_virtqueue(dev, console_input, "input");
- add_pci_virtqueue(dev, console_output, "output");
-
- /* We need a configuration area for the emerg_wr early writes. */
- add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE);
- set_device_config(dev, &conf, sizeof(conf));
-
- verbose("device %u: console\n", devices.device_num);
-}
-/*:*/
-
-/*M:010
- * Inter-guest networking is an interesting area. Simplest is to have a
- * --sharenet=<name> option which opens or creates a named pipe. This can be
- * used to send packets to another guest in a 1:1 manner.
- *
- * More sophisticated is to use one of the tools developed for project like UML
- * to do networking.
- *
- * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
- * completely generic ("here's my vring, attach to your vring") and would work
- * for any traffic. Of course, namespace and permissions issues need to be
- * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide
- * multiple inter-guest channels behind one interface, although it would
- * require some manner of hotplugging new virtio channels.
- *
- * Finally, we could use a virtio network switch in the kernel, ie. vhost.
-:*/
-
-static u32 str2ip(const char *ipaddr)
-{
- unsigned int b[4];
-
- if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
- errx(1, "Failed to parse IP address '%s'", ipaddr);
- return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
-}
-
-static void str2mac(const char *macaddr, unsigned char mac[6])
-{
- unsigned int m[6];
- if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
- &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6)
- errx(1, "Failed to parse mac address '%s'", macaddr);
- mac[0] = m[0];
- mac[1] = m[1];
- mac[2] = m[2];
- mac[3] = m[3];
- mac[4] = m[4];
- mac[5] = m[5];
-}
-
-/*
- * This code is "adapted" from libbridge: it attaches the Host end of the
- * network device to the bridge device specified by the command line.
- *
- * This is yet another James Morris contribution (I'm an IP-level guy, so I
- * dislike bridging), and I just try not to break it.
- */
-static void add_to_bridge(int fd, const char *if_name, const char *br_name)
-{
- int ifidx;
- struct ifreq ifr;
-
- if (!*br_name)
- errx(1, "must specify bridge name");
-
- ifidx = if_nametoindex(if_name);
- if (!ifidx)
- errx(1, "interface %s does not exist!", if_name);
-
- strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
- ifr.ifr_name[IFNAMSIZ-1] = '\0';
- ifr.ifr_ifindex = ifidx;
- if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
- err(1, "can't add %s to bridge %s", if_name, br_name);
-}
-
-/*
- * This sets up the Host end of the network device with an IP address, brings
- * it up so packets will flow, the copies the MAC address into the hwaddr
- * pointer.
- */
-static void configure_device(int fd, const char *tapif, u32 ipaddr)
-{
- struct ifreq ifr;
- struct sockaddr_in sin;
-
- memset(&ifr, 0, sizeof(ifr));
- strcpy(ifr.ifr_name, tapif);
-
- /* Don't read these incantations. Just cut & paste them like I did! */
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = htonl(ipaddr);
- memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
- if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
- err(1, "Setting %s interface address", tapif);
- ifr.ifr_flags = IFF_UP;
- if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
- err(1, "Bringing interface %s up", tapif);
-}
-
-static int get_tun_device(char tapif[IFNAMSIZ])
-{
- struct ifreq ifr;
- int vnet_hdr_sz;
- int netfd;
-
- /* Start with this zeroed. Messy but sure. */
- memset(&ifr, 0, sizeof(ifr));
-
- /*
- * We open the /dev/net/tun device and tell it we want a tap device. A
- * tap device is like a tun device, only somehow different. To tell
- * the truth, I completely blundered my way through this code, but it
- * works now!
- */
- netfd = open_or_die("/dev/net/tun", O_RDWR);
- ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
- strcpy(ifr.ifr_name, "tap%d");
- if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
- err(1, "configuring /dev/net/tun");
-
- if (ioctl(netfd, TUNSETOFFLOAD,
- TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
- err(1, "Could not set features for tun device");
-
- /*
- * We don't need checksums calculated for packets coming in this
- * device: trust us!
- */
- ioctl(netfd, TUNSETNOCSUM, 1);
-
- /*
- * In virtio before 1.0 (aka legacy virtio), we added a 16-bit
- * field at the end of the network header iff
- * VIRTIO_NET_F_MRG_RXBUF was negotiated. For virtio 1.0,
- * that became the norm, but we need to tell the tun device
- * about our expanded header (which is called
- * virtio_net_hdr_mrg_rxbuf in the legacy system).
- */
- vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1);
- if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0)
- err(1, "Setting tun header size to %u", vnet_hdr_sz);
-
- memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
- return netfd;
-}
-
-/*L:195
- * Our network is a Host<->Guest network. This can either use bridging or
- * routing, but the principle is the same: it uses the "tun" device to inject
- * packets into the Host as if they came in from a normal network card. We
- * just shunt packets between the Guest and the tun device.
- */
-static void setup_tun_net(char *arg)
-{
- struct device *dev;
- struct net_info *net_info = malloc(sizeof(*net_info));
- int ipfd;
- u32 ip = INADDR_ANY;
- bool bridging = false;
- char tapif[IFNAMSIZ], *p;
- struct virtio_net_config conf;
-
- net_info->tunfd = get_tun_device(tapif);
-
- /* First we create a new network device. */
- dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00);
- dev->priv = net_info;
-
- /* Network devices need a recv and a send queue, just like console. */
- add_pci_virtqueue(dev, net_input, "rx");
- add_pci_virtqueue(dev, net_output, "tx");
-
- /*
- * We need a socket to perform the magic network ioctls to bring up the
- * tap interface, connect to the bridge etc. Any socket will do!
- */
- ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
- if (ipfd < 0)
- err(1, "opening IP socket");
-
- /* If the command line was --tunnet=bridge:<name> do bridging. */
- if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
- arg += strlen(BRIDGE_PFX);
- bridging = true;
- }
-
- /* A mac address may follow the bridge name or IP address */
- p = strchr(arg, ':');
- if (p) {
- str2mac(p+1, conf.mac);
- add_pci_feature(dev, VIRTIO_NET_F_MAC);
- *p = '\0';
- }
-
- /* arg is now either an IP address or a bridge name */
- if (bridging)
- add_to_bridge(ipfd, tapif, arg);
- else
- ip = str2ip(arg);
-
- /* Set up the tun device. */
- configure_device(ipfd, tapif, ip);
-
- /* Expect Guest to handle everything except UFO */
- add_pci_feature(dev, VIRTIO_NET_F_CSUM);
- add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
- add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
- add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
- add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN);
- add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4);
- add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6);
- add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN);
- /* We handle indirect ring entries */
- add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
- set_device_config(dev, &conf, sizeof(conf));
-
- /* We don't need the socket any more; setup is done. */
- close(ipfd);
-
- if (bridging)
- verbose("device %u: tun %s attached to bridge: %s\n",
- devices.device_num, tapif, arg);
- else
- verbose("device %u: tun %s: %s\n",
- devices.device_num, tapif, arg);
-}
-/*:*/
-
-/* This hangs off device->priv. */
-struct vblk_info {
- /* The size of the file. */
- off64_t len;
-
- /* The file descriptor for the file. */
- int fd;
-
-};
-
-/*L:210
- * The Disk
- *
- * The disk only has one virtqueue, so it only has one thread. It is really
- * simple: the Guest asks for a block number and we read or write that position
- * in the file.
- *
- * Before we serviced each virtqueue in a separate thread, that was unacceptably
- * slow: the Guest waits until the read is finished before running anything
- * else, even if it could have been doing useful work.
- *
- * We could have used async I/O, except it's reputed to suck so hard that
- * characters actually go missing from your code when you try to use it.
- */
-static void blk_request(struct virtqueue *vq)
-{
- struct vblk_info *vblk = vq->dev->priv;
- unsigned int head, out_num, in_num, wlen;
- int ret, i;
- u8 *in;
- struct virtio_blk_outhdr out;
- struct iovec iov[vq->vring.num];
- off64_t off;
-
- /*
- * Get the next request, where we normally wait. It triggers the
- * interrupt to acknowledge previously serviced requests (if any).
- */
- head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
-
- /* Copy the output header from the front of the iov (adjusts iov) */
- iov_consume(vq->dev, iov, out_num, &out, sizeof(out));
-
- /* Find and trim end of iov input array, for our status byte. */
- in = NULL;
- for (i = out_num + in_num - 1; i >= out_num; i--) {
- if (iov[i].iov_len > 0) {
- in = iov[i].iov_base + iov[i].iov_len - 1;
- iov[i].iov_len--;
- break;
- }
- }
- if (!in)
- bad_driver_vq(vq, "Bad virtblk cmd with no room for status");
-
- /*
- * For historical reasons, block operations are expressed in 512 byte
- * "sectors".
- */
- off = out.sector * 512;
-
- if (out.type & VIRTIO_BLK_T_OUT) {
- /*
- * Write
- *
- * Move to the right location in the block file. This can fail
- * if they try to write past end.
- */
- if (lseek64(vblk->fd, off, SEEK_SET) != off)
- err(1, "Bad seek to sector %llu", out.sector);
-
- ret = writev(vblk->fd, iov, out_num);
- verbose("WRITE to sector %llu: %i\n", out.sector, ret);
-
- /*
- * Grr... Now we know how long the descriptor they sent was, we
- * make sure they didn't try to write over the end of the block
- * file (possibly extending it).
- */
- if (ret > 0 && off + ret > vblk->len) {
- /* Trim it back to the correct length */
- ftruncate64(vblk->fd, vblk->len);
- /* Die, bad Guest, die. */
- bad_driver_vq(vq, "Write past end %llu+%u", off, ret);
- }
-
- wlen = sizeof(*in);
- *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
- } else if (out.type & VIRTIO_BLK_T_FLUSH) {
- /* Flush */
- ret = fdatasync(vblk->fd);
- verbose("FLUSH fdatasync: %i\n", ret);
- wlen = sizeof(*in);
- *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
- } else {
- /*
- * Read
- *
- * Move to the right location in the block file. This can fail
- * if they try to read past end.
- */
- if (lseek64(vblk->fd, off, SEEK_SET) != off)
- err(1, "Bad seek to sector %llu", out.sector);
-
- ret = readv(vblk->fd, iov + out_num, in_num);
- if (ret >= 0) {
- wlen = sizeof(*in) + ret;
- *in = VIRTIO_BLK_S_OK;
- } else {
- wlen = sizeof(*in);
- *in = VIRTIO_BLK_S_IOERR;
- }
- }
-
- /* Finished that request. */
- add_used(vq, head, wlen);
-}
-
-/*L:198 This actually sets up a virtual block device. */
-static void setup_block_file(const char *filename)
-{
- struct device *dev;
- struct vblk_info *vblk;
- struct virtio_blk_config conf;
-
- /* Create the device. */
- dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80);
-
- /* The device has one virtqueue, where the Guest places requests. */
- add_pci_virtqueue(dev, blk_request, "request");
-
- /* Allocate the room for our own bookkeeping */
- vblk = dev->priv = malloc(sizeof(*vblk));
-
- /* First we open the file and store the length. */
- vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
- vblk->len = lseek64(vblk->fd, 0, SEEK_END);
-
- /* Tell Guest how many sectors this device has. */
- conf.capacity = cpu_to_le64(vblk->len / 512);
-
- /*
- * Tell Guest not to put in too many descriptors at once: two are used
- * for the in and out elements.
- */
- add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX);
- conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
-
- set_device_config(dev, &conf, sizeof(struct virtio_blk_config));
-
- verbose("device %u: virtblock %llu sectors\n",
- devices.device_num, le64_to_cpu(conf.capacity));
-}
-
-/*L:211
- * Our random number generator device reads from /dev/urandom into the Guest's
- * input buffers. The usual case is that the Guest doesn't want random numbers
- * and so has no buffers although /dev/urandom is still readable, whereas
- * console is the reverse.
- *
- * The same logic applies, however.
- */
-struct rng_info {
- int rfd;
-};
-
-static void rng_input(struct virtqueue *vq)
-{
- int len;
- unsigned int head, in_num, out_num, totlen = 0;
- struct rng_info *rng_info = vq->dev->priv;
- struct iovec iov[vq->vring.num];
-
- /* First we need a buffer from the Guests's virtqueue. */
- head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
- if (out_num)
- bad_driver_vq(vq, "Output buffers in rng?");
-
- /*
- * Just like the console write, we loop to cover the whole iovec.
- * In this case, short reads actually happen quite a bit.
- */
- while (!iov_empty(iov, in_num)) {
- len = readv(rng_info->rfd, iov, in_num);
- if (len <= 0)
- err(1, "Read from /dev/urandom gave %i", len);
- iov_consume(vq->dev, iov, in_num, NULL, len);
- totlen += len;
- }
-
- /* Tell the Guest about the new input. */
- add_used(vq, head, totlen);
-}
-
-/*L:199
- * This creates a "hardware" random number device for the Guest.
- */
-static void setup_rng(void)
-{
- struct device *dev;
- struct rng_info *rng_info = malloc(sizeof(*rng_info));
-
- /* Our device's private info simply contains the /dev/urandom fd. */
- rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY);
-
- /* Create the new device. */
- dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0);
- dev->priv = rng_info;
-
- /* The device has one virtqueue, where the Guest places inbufs. */
- add_pci_virtqueue(dev, rng_input, "input");
-
- /* We don't have any configuration space */
- no_device_config(dev);
-
- verbose("device %u: rng\n", devices.device_num);
-}
-/* That's the end of device setup. */
-
-/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
-static void __attribute__((noreturn)) restart_guest(void)
-{
- unsigned int i;
-
- /*
- * Since we don't track all open fds, we simply close everything beyond
- * stderr.
- */
- for (i = 3; i < FD_SETSIZE; i++)
- close(i);
-
- /* Reset all the devices (kills all threads). */
- cleanup_devices();
-
- execv(main_args[0], main_args);
- err(1, "Could not exec %s", main_args[0]);
-}
-
-/*L:220
- * Finally we reach the core of the Launcher which runs the Guest, serves
- * its input and output, and finally, lays it to rest.
- */
-static void __attribute__((noreturn)) run_guest(void)
-{
- for (;;) {
- struct lguest_pending notify;
- int readval;
-
- /* We read from the /dev/lguest device to run the Guest. */
- readval = pread(lguest_fd, ¬ify, sizeof(notify), cpu_id);
- if (readval == sizeof(notify)) {
- if (notify.trap == 13) {
- verbose("Emulating instruction at %#x\n",
- getreg(eip));
- emulate_insn(notify.insn);
- } else if (notify.trap == 14) {
- verbose("Emulating MMIO at %#x\n",
- getreg(eip));
- emulate_mmio(notify.addr, notify.insn);
- } else
- errx(1, "Unknown trap %i addr %#08x\n",
- notify.trap, notify.addr);
- /* ENOENT means the Guest died. Reading tells us why. */
- } else if (errno == ENOENT) {
- char reason[1024] = { 0 };
- pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
- errx(1, "%s", reason);
- /* ERESTART means that we need to reboot the guest */
- } else if (errno == ERESTART) {
- restart_guest();
- /* Anything else means a bug or incompatible change. */
- } else
- err(1, "Running guest failed");
- }
-}
-/*L:240
- * This is the end of the Launcher. The good news: we are over halfway
- * through! The bad news: the most fiendish part of the code still lies ahead
- * of us.
- *
- * Are you ready? Take a deep breath and join me in the core of the Host, in
- * "make Host".
-:*/
-
-static struct option opts[] = {
- { "verbose", 0, NULL, 'v' },
- { "tunnet", 1, NULL, 't' },
- { "block", 1, NULL, 'b' },
- { "rng", 0, NULL, 'r' },
- { "initrd", 1, NULL, 'i' },
- { "username", 1, NULL, 'u' },
- { "chroot", 1, NULL, 'c' },
- { NULL },
-};
-static void usage(void)
-{
- errx(1, "Usage: lguest [--verbose] "
- "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
- "|--block=<filename>|--initrd=<filename>]...\n"
- "<mem-in-mb> vmlinux [args...]");
-}
-
-/*L:105 The main routine is where the real work begins: */
-int main(int argc, char *argv[])
-{
- /* Memory, code startpoint and size of the (optional) initrd. */
- unsigned long mem = 0, start, initrd_size = 0;
- /* Two temporaries. */
- int i, c;
- /* The boot information for the Guest. */
- struct boot_params *boot;
- /* If they specify an initrd file to load. */
- const char *initrd_name = NULL;
-
- /* Password structure for initgroups/setres[gu]id */
- struct passwd *user_details = NULL;
-
- /* Directory to chroot to */
- char *chroot_path = NULL;
-
- /* Save the args: we "reboot" by execing ourselves again. */
- main_args = argv;
-
- /*
- * First we initialize the device list. We remember next interrupt
- * number to use for devices (1: remember that 0 is used by the timer).
- */
- devices.next_irq = 1;
-
- /* We're CPU 0. In fact, that's the only CPU possible right now. */
- cpu_id = 0;
-
- /*
- * We need to know how much memory so we can set up the device
- * descriptor and memory pages for the devices as we parse the command
- * line. So we quickly look through the arguments to find the amount
- * of memory now.
- */
- for (i = 1; i < argc; i++) {
- if (argv[i][0] != '-') {
- mem = atoi(argv[i]) * 1024 * 1024;
- /*
- * We start by mapping anonymous pages over all of
- * guest-physical memory range. This fills it with 0,
- * and ensures that the Guest won't be killed when it
- * tries to access it.
- */
- guest_base = map_zeroed_pages(mem / getpagesize()
- + DEVICE_PAGES);
- guest_limit = mem;
- guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize();
- break;
- }
- }
-
- /* If we exit via err(), this kills all the threads, restores tty. */
- atexit(cleanup_devices);
-
- /* We always have a console device, and it's always device 1. */
- setup_console();
-
- /* The options are fairly straight-forward */
- while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
- switch (c) {
- case 'v':
- verbose = true;
- break;
- case 't':
- setup_tun_net(optarg);
- break;
- case 'b':
- setup_block_file(optarg);
- break;
- case 'r':
- setup_rng();
- break;
- case 'i':
- initrd_name = optarg;
- break;
- case 'u':
- user_details = getpwnam(optarg);
- if (!user_details)
- err(1, "getpwnam failed, incorrect username?");
- break;
- case 'c':
- chroot_path = optarg;
- break;
- default:
- warnx("Unknown argument %s", argv[optind]);
- usage();
- }
- }
- /*
- * After the other arguments we expect memory and kernel image name,
- * followed by command line arguments for the kernel.
- */
- if (optind + 2 > argc)
- usage();
-
- verbose("Guest base is at %p\n", guest_base);
-
- /* Initialize the (fake) PCI host bridge device. */
- init_pci_host_bridge();
-
- /* Now we load the kernel */
- start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
-
- /* Boot information is stashed at physical address 0 */
- boot = from_guest_phys(0);
-
- /* Map the initrd image if requested (at top of physical memory) */
- if (initrd_name) {
- initrd_size = load_initrd(initrd_name, mem);
- /*
- * These are the location in the Linux boot header where the
- * start and size of the initrd are expected to be found.
- */
- boot->hdr.ramdisk_image = mem - initrd_size;
- boot->hdr.ramdisk_size = initrd_size;
- /* The bootloader type 0xFF means "unknown"; that's OK. */
- boot->hdr.type_of_loader = 0xFF;
- }
-
- /*
- * The Linux boot header contains an "E820" memory map: ours is a
- * simple, single region.
- */
- boot->e820_entries = 1;
- boot->e820_table[0] = ((struct e820_entry) { 0, mem, E820_TYPE_RAM });
- /*
- * The boot header contains a command line pointer: we put the command
- * line after the boot header.
- */
- boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
- /* We use a simple helper to copy the arguments separated by spaces. */
- concat((char *)(boot + 1), argv+optind+2);
-
- /* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */
- boot->hdr.kernel_alignment = 0x1000000;
-
- /* Boot protocol version: 2.07 supports the fields for lguest. */
- boot->hdr.version = 0x207;
-
- /* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */
- boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST;
-
- /* Tell the entry path not to try to reload segment registers. */
- boot->hdr.loadflags |= KEEP_SEGMENTS;
-
- /* We don't support tboot: */
- boot->tboot_addr = 0;
-
- /* Ensure this is 0 to prevent APM from loading: */
- boot->apm_bios_info.version = 0;
-
- /* We tell the kernel to initialize the Guest. */
- tell_kernel(start);
-
- /* Ensure that we terminate if a device-servicing child dies. */
- signal(SIGCHLD, kill_launcher);
-
- /* If requested, chroot to a directory */
- if (chroot_path) {
- if (chroot(chroot_path) != 0)
- err(1, "chroot(\"%s\") failed", chroot_path);
-
- if (chdir("/") != 0)
- err(1, "chdir(\"/\") failed");
-
- verbose("chroot done\n");
- }
-
- /* If requested, drop privileges */
- if (user_details) {
- uid_t u;
- gid_t g;
-
- u = user_details->pw_uid;
- g = user_details->pw_gid;
-
- if (initgroups(user_details->pw_name, g) != 0)
- err(1, "initgroups failed");
-
- if (setresgid(g, g, g) != 0)
- err(1, "setresgid failed");
-
- if (setresuid(u, u, u) != 0)
- err(1, "setresuid failed");
-
- verbose("Dropping privileges completed\n");
- }
-
- /* Finally, run the Guest. This doesn't return. */
- run_guest();
-}
-/*:*/
-
-/*M:999
- * Mastery is done: you now know everything I do.
- *
- * But surely you have seen code, features and bugs in your wanderings which
- * you now yearn to attack? That is the real game, and I look forward to you
- * patching and forking lguest into the Your-Name-Here-visor.
- *
- * Farewell, and good coding!
- * Rusty Russell.
- */
+++ /dev/null
- __
- (___()'`; Rusty's Remarkably Unreliable Guide to Lguest
- /, /` - or, A Young Coder's Illustrated Hypervisor
- \\"--\\ http://lguest.ozlabs.org
-
-Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel,
-for Linux developers and users to experiment with virtualization with the
-minimum of complexity. Nonetheless, it should have sufficient features to
-make it useful for specific tasks, and, of course, you are encouraged to fork
-and enhance it (see drivers/lguest/README).
-
-Features:
-
-- Kernel module which runs in a normal kernel.
-- Simple I/O model for communication.
-- Simple program to create new guests.
-- Logo contains cute puppies: http://lguest.ozlabs.org
-
-Developer features:
-
-- Fun to hack on.
-- No ABI: being tied to a specific kernel anyway, you can change anything.
-- Many opportunities for improvement or feature implementation.
-
-Running Lguest:
-
-- The easiest way to run lguest is to use same kernel as guest and host.
- You can configure them differently, but usually it's easiest not to.
-
- You will need to configure your kernel with the following options:
-
- "Processor type and features":
- "Paravirtualized guest support" = Y
- "Lguest guest support" = Y
- "High Memory Support" = off/4GB
- "Alignment value to which kernel should be aligned" = 0x100000
- (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
- CONFIG_PHYSICAL_ALIGN=0x100000)
-
- "Device Drivers":
- "Block devices"
- "Virtio block driver" = M/Y
- "Network device support"
- "Universal TUN/TAP device driver support" = M/Y
- "Virtio network driver" = M/Y
- (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m)
-
- "Virtualization"
- "Linux hypervisor example code" = M/Y
- (CONFIG_LGUEST=m)
-
-- A tool called "lguest" is available in this directory: type "make"
- to build it. If you didn't build your kernel in-tree, use "make
- O=<builddir>".
-
-- Create or find a root disk image. There are several useful ones
- around, such as the xm-test tiny root image at
- http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
-
- For more serious work, I usually use a distribution ISO image and
- install it under qemu, then make multiple copies:
-
- dd if=/dev/zero of=rootfile bs=1M count=2048
- qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
-
- Make sure that you install a getty on /dev/hvc0 if you want to log in on the
- console!
-
-- "modprobe lg" if you built it as a module.
-
-- Run an lguest as root:
-
- tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
- --block=rootfile root=/dev/vda
-
- Explanation:
- 64: the amount of memory to use, in MB.
-
- vmlinux: the kernel image found in the top of your build directory. You
- can also use a standard bzImage.
-
- --tunnet=192.168.19.1: configures a "tap" device for networking with this
- IP address.
-
- --block=rootfile: a file or block device which becomes /dev/vda
- inside the guest.
-
- root=/dev/vda: this (and anything else on the command line) are
- kernel boot parameters.
-
-- Configuring networking. I usually have the host masquerade, using
- "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 >
- /proc/sys/net/ipv4/ip_forward". In this example, I would configure
- eth0 inside the guest at 192.168.19.2.
-
- Another method is to bridge the tap device to an external interface
- using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
- to obtain an IP address. The bridge needs to be configured first:
- this option simply adds the tap interface to it.
-
- A simple example on my system:
-
- ifconfig eth0 0.0.0.0
- brctl addbr lg0
- ifconfig lg0 up
- brctl addif lg0 eth0
- dhclient lg0
-
- Then use --tunnet=bridge:lg0 when launching the guest.
-
- See:
-
- http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge
-
- for general information on how to get bridging to work.
-
-- Random number generation. Using the --rng option will provide a
- /dev/hwrng in the guest that will read from the host's /dev/random.
- Use this option in conjunction with rng-tools (see ../hw_random.txt)
- to provide entropy to the guest kernel's /dev/random.
-
-There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
-
-Good luck!
-Rusty Russell rusty@rustcorp.com.au.