x86/lguest: Remove lguest support

author Juergen Gross <jgross@suse.com>

Wed, 16 Aug 2017 17:31:57 +0000 (19:31 +0200)

committer Ingo Molnar <mingo@kernel.org>

Thu, 24 Aug 2017 07:57:28 +0000 (09:57 +0200)
author Juergen Gross <jgross@suse.com>
Wed, 16 Aug 2017 17:31:57 +0000 (19:31 +0200)
committer Ingo Molnar <mingo@kernel.org>
Thu, 24 Aug 2017 07:57:28 +0000 (09:57 +0200)
diff --git a/MAINTAINERS b/MAINTAINERS

index 84d6a8277cbde11208b197549d12f505556b87cf..6c8b66d2adcb731aa69c68f59e5259403eb8fe34 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7640,17 +7640,6 @@ T:       git git://linuxtv.org/mkrufky/tuners.git
  S:     Maintained
  F:     drivers/media/dvb-frontends/lgdt3305.*
  
-LGUEST
-M:     Rusty Russell <rusty@rustcorp.com.au>
-L:     lguest@lists.ozlabs.org
-W:     http://lguest.ozlabs.org/
-S:     Odd Fixes
-F:     arch/x86/include/asm/lguest*.h
-F:     arch/x86/lguest/
-F:     drivers/lguest/
-F:     include/linux/lguest*.h
-F:     tools/lguest/
-
  LIBATA PATA ARASAN COMPACT FLASH CONTROLLER
  M:     Viresh Kumar <vireshk@kernel.org>
  L:     linux-ide@vger.kernel.org
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild

index 586b786b3edf9a6930bfdf05c47b4c8e0c57e718..f65a804b86f0535927f4c3fca23eb920755a6409 100644 (file)
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/
  # Hyper-V paravirtualization support
  obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/
  
-# lguest paravirtualization support
-obj-$(CONFIG_LGUEST_GUEST) += lguest/
-
  obj-y += realmode/
  obj-y += kernel/
  obj-y += mm/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig

index 9b302121584d2125f67a9d3d4a35bbc9d3eea8f5..65102171338521e517c6c29190581056045f5c9d 100644 (file)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -777,8 +777,6 @@ config KVM_DEBUG_FS
           Statistics are displayed in debugfs filesystem. Enabling this option
           may incur significant overhead.
  
-source "arch/x86/lguest/Kconfig"
-
  config PARAVIRT_TIME_ACCOUNTING
         bool "Paravirtual steal time accounting"
         depends on PARAVIRT
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h

deleted file mode 100644 (file)

index 73d0c9b..0000000
--- a/arch/x86/include/asm/lguest.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef _ASM_X86_LGUEST_H
-#define _ASM_X86_LGUEST_H
-
-#define GDT_ENTRY_LGUEST_CS    10
-#define GDT_ENTRY_LGUEST_DS    11
-#define LGUEST_CS              (GDT_ENTRY_LGUEST_CS * 8)
-#define LGUEST_DS              (GDT_ENTRY_LGUEST_DS * 8)
-
-#ifndef __ASSEMBLY__
-#include <asm/desc.h>
-
-#define GUEST_PL 1
-
-/* Page for Switcher text itself, then two pages per cpu */
-#define SWITCHER_TEXT_PAGES (1)
-#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
-#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
-
-/* Where we map the Switcher, in both Host and Guest. */
-extern unsigned long switcher_addr;
-
-/* Found in switcher.S */
-extern unsigned long default_idt_entries[];
-
-/* Declarations for definitions in arch/x86/lguest/head_32.S */
-extern char lguest_noirq_iret[];
-extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_pushf[], lgend_pushf[];
-
-extern void lguest_iret(void);
-extern void lguest_init(void);
-
-struct lguest_regs {
-       /* Manually saved part. */
-       unsigned long eax, ebx, ecx, edx;
-       unsigned long esi, edi, ebp;
-       unsigned long gs;
-       unsigned long fs, ds, es;
-       unsigned long trapnum, errcode;
-       /* Trap pushed part */
-       unsigned long eip;
-       unsigned long cs;
-       unsigned long eflags;
-       unsigned long esp;
-       unsigned long ss;
-};
-
-/* This is a guest-specific page (mapped ro) into the guest. */
-struct lguest_ro_state {
-       /* Host information we need to restore when we switch back. */
-       u32 host_cr3;
-       struct desc_ptr host_idt_desc;
-       struct desc_ptr host_gdt_desc;
-       u32 host_sp;
-
-       /* Fields which are used when guest is running. */
-       struct desc_ptr guest_idt_desc;
-       struct desc_ptr guest_gdt_desc;
-       struct x86_hw_tss guest_tss;
-       struct desc_struct guest_idt[IDT_ENTRIES];
-       struct desc_struct guest_gdt[GDT_ENTRIES];
-};
-
-struct lg_cpu_arch {
-       /* The GDT entries copied into lguest_ro_state when running. */
-       struct desc_struct gdt[GDT_ENTRIES];
-
-       /* The IDT entries: some copied into lguest_ro_state when running. */
-       struct desc_struct idt[IDT_ENTRIES];
-
-       /* The address of the last guest-visible pagefault (ie. cr2). */
-       unsigned long last_pagefault;
-};
-
-static inline void lguest_set_ts(void)
-{
-       u32 cr0;
-
-       cr0 = read_cr0();
-       if (!(cr0 & 8))
-               write_cr0(cr0 | 8);
-}
-
-/* Full 4G segment descriptors, suitable for CS and DS. */
-#define FULL_EXEC_SEGMENT \
-       ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff))
-#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff))
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_LGUEST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h

deleted file mode 100644 (file)

index 6c119cf..0000000
--- a/arch/x86/include/asm/lguest_hcall.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Architecture specific portion of the lguest hypercalls */
-#ifndef _ASM_X86_LGUEST_HCALL_H
-#define _ASM_X86_LGUEST_HCALL_H
-
-#define LHCALL_FLUSH_ASYNC     0
-#define LHCALL_LGUEST_INIT     1
-#define LHCALL_SHUTDOWN                2
-#define LHCALL_NEW_PGTABLE     4
-#define LHCALL_FLUSH_TLB       5
-#define LHCALL_LOAD_IDT_ENTRY  6
-#define LHCALL_SET_STACK       7
-#define LHCALL_SET_CLOCKEVENT  9
-#define LHCALL_HALT            10
-#define LHCALL_SET_PMD         13
-#define LHCALL_SET_PTE         14
-#define LHCALL_SET_PGD         15
-#define LHCALL_LOAD_TLS                16
-#define LHCALL_LOAD_GDT_ENTRY  18
-#define LHCALL_SEND_INTERRUPTS 19
-
-#define LGUEST_TRAP_ENTRY 0x1F
-
-/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */
-#define LGUEST_SHUTDOWN_POWEROFF       1
-#define LGUEST_SHUTDOWN_RESTART                2
-
-#ifndef __ASSEMBLY__
-#include <asm/hw_irq.h>
-
-/*G:030
- * But first, how does our Guest contact the Host to ask for privileged
- * operations?  There are two ways: the direct way is to make a "hypercall",
- * to make requests of the Host Itself.
- *
- * Our hypercall mechanism uses the highest unused trap code (traps 32 and
- * above are used by real hardware interrupts).  Seventeen hypercalls are
- * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
- * If a return value makes sense, it's returned in %eax.
- *
- * Grossly invalid calls result in Sudden Death at the hands of the vengeful
- * Host, rather than returning failure.  This reflects Winston Churchill's
- * definition of a gentleman: "someone who is only rude intentionally".
- */
-static inline unsigned long
-hcall(unsigned long call,
-      unsigned long arg1, unsigned long arg2, unsigned long arg3,
-      unsigned long arg4)
-{
-       /* "int" is the Intel instruction to trigger a trap. */
-       asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
-                    /* The call in %eax (aka "a") might be overwritten */
-                    : "=a"(call)
-                      /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
-                    : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
-                      /* "memory" means this might write somewhere in memory.
-                       * This isn't true for all calls, but it's safe to tell
-                       * gcc that it might happen so it doesn't get clever. */
-                    : "memory");
-       return call;
-}
-/*:*/
-
-/* Can't use our min() macro here: needs to be a constant */
-#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
-
-#define LHCALL_RING_SIZE 64
-struct hcall_args {
-       /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
-       unsigned long arg0, arg1, arg2, arg3, arg4;
-};
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_X86_LGUEST_HCALL_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h

index 0b03d655db7c4559c007216a2a6bcf274abadc02..abc99b9c7ffdc3f84811481d8ca4467e7085f3e9 100644 (file)
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -662,7 +662,7 @@ static inline void sync_core(void)
          * In case NMI unmasking or performance ever becomes a problem,
          * the next best option appears to be MOV-to-CR2 and an
          * unconditional jump.  That sequence also works on all CPUs,
-        * but it will fault at CPL3 (i.e. Xen PV and lguest).
+        * but it will fault at CPL3 (i.e. Xen PV).
          *
          * CPUID is the conventional way, but it's nasty: it doesn't
          * exist on some 486-like CPUs, and it usually exits to a
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h

index ddef37b16af2c92ccef2b23088763a211dab6ea6..66b8f93333d1ee14ed7d937373377bd7ed1c3db8 100644 (file)
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -201,7 +201,7 @@ struct boot_params {
   *
   * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
   *     PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
- * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest
+ * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated
   * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
   *     which start at asm startup_xen() entry point and later jump to the C
   *     xen_start_kernel() entry point. Both domU and dom0 type of guests are
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c

index 880aa093268df7a0d7db23abde984647880c47cc..710edab9e6443a0078d3b7fc91cc0500bba0ac6c 100644 (file)
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -4,9 +4,6 @@
  
  #include <asm/ucontext.h>
  
-#include <linux/lguest.h>
-#include "../../../drivers/lguest/lg.h"
-
  #define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
  static char syscalls[] = {
  #include <asm/syscalls_32.h>
@@ -62,23 +59,6 @@ void foo(void)
         OFFSET(stack_canary_offset, stack_canary, canary);
  #endif
  
-#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
-       BLANK();
-       OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
-       OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-
-       BLANK();
-       OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
-       OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
-       OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
-       OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
-       OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
-       OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
-       OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
-       OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
-       OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
-       OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
-#endif
         BLANK();
         DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
         DEFINE(NR_syscalls, sizeof(syscalls));
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S

index 0332664eb158184419ac5bf96709728174bb97d8..29da9599fec03fa78a71bc984f25ec1e9d866f5d 100644 (file)
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -155,7 +155,6 @@ ENTRY(startup_32)
         jmp *%eax
  
  .Lbad_subarch:
-WEAK(lguest_entry)
  WEAK(xen_entry)
         /* Unknown implementation; there's really
            nothing we can do at this point. */
@@ -165,7 +164,6 @@ WEAK(xen_entry)
  
  subarch_entries:
         .long .Ldefault_entry           /* normal x86/PC */
-       .long lguest_entry              /* lguest hypervisor */
         .long xen_entry                 /* Xen hypervisor */
         .long .Ldefault_entry           /* Moorestown MID */
  num_subarch_entries = (. - subarch_entries) / 4
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c

index 91271122f0dfb947e1b1b68cddeb505053fd9425..502a77d0adb0504861155484d01dfd263bf47897 100644 (file)
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void)
                 x86_platform.legacy.reserve_bios_regions = 1;
                 break;
         case X86_SUBARCH_XEN:
-       case X86_SUBARCH_LGUEST:
                 x86_platform.legacy.devices.pnpbios = 0;
                 x86_platform.legacy.rtc = 0;
                 break;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig

index 2688c7dc53234bcdd66fff189b83cde9cb0583a8..3ea624452f9327d252dda724100bd6e97f8c6e0c 100644 (file)
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -89,6 +89,5 @@ config KVM_MMU_AUDIT
  # OK, it's a little counter-intuitive to do this, but it puts it neatly under
  # the virtualization menu.
  source drivers/vhost/Kconfig
-source drivers/lguest/Kconfig
  
  endif # VIRTUALIZATION
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig

deleted file mode 100644 (file)

index 08f41ca..0000000
--- a/arch/x86/lguest/Kconfig
+++ /dev/null
@@ -1,14 +0,0 @@
-config LGUEST_GUEST
-       bool "Lguest guest support"
-       depends on X86_32 && PARAVIRT && PCI
-       select TTY
-       select VIRTUALIZATION
-       select VIRTIO
-       select VIRTIO_CONSOLE
-       help
-         Lguest is a tiny in-kernel hypervisor.  Selecting this will
-         allow your kernel to boot under lguest.  This option will increase
-         your kernel size by about 10k.  If in doubt, say N.
-
-         If you say Y here, make sure you say Y (or M) to the virtio block
-         and net drivers which lguest needs.
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile

deleted file mode 100644 (file)

index 8f38d57..0000000
--- a/arch/x86/lguest/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-obj-y          := head_32.o boot.o
-CFLAGS_boot.o  := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c

deleted file mode 100644 (file)

index 9947269..0000000
--- a/arch/x86/lguest/boot.c
+++ /dev/null
@@ -1,1558 +0,0 @@
-/*P:010
- * A hypervisor allows multiple Operating Systems to run on a single machine.
- * To quote David Wheeler: "Any problem in computer science can be solved with
- * another layer of indirection."
- *
- * We keep things simple in two ways.  First, we start with a normal Linux
- * kernel and insert a module (lg.ko) which allows us to run other Linux
- * kernels the same way we'd run processes.  We call the first kernel the Host,
- * and the others the Guests.  The program which sets up and configures Guests
- * (such as the example in tools/lguest/lguest.c) is called the Launcher.
- *
- * Secondly, we only run specially modified Guests, not normal kernels: setting
- * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
- * how to be a Guest at boot time.  This means that you can use the same kernel
- * you boot normally (ie. as a Host) as a Guest.
- *
- * These Guests know that they cannot do privileged operations, such as disable
- * interrupts, and that they have to ask the Host to do such things explicitly.
- * This file consists of all the replacements for such low-level native
- * hardware operations: these special Guest versions call the Host.
- *
- * So how does the kernel know it's a Guest?  We'll see that later, but let's
- * just say that we end up here where we replace the native functions various
- * "paravirt" structures with our Guest versions, then boot like normal.
-:*/
-
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/virtio_console.h>
-#include <linux/pm.h>
-#include <linux/export.h>
-#include <linux/pci.h>
-#include <linux/virtio_pci.h>
-#include <asm/acpi.h>
-#include <asm/apic.h>
-#include <asm/lguest.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/e820/api.h>
-#include <asm/mce.h>
-#include <asm/io.h>
-#include <asm/fpu/api.h>
-#include <asm/stackprotector.h>
-#include <asm/reboot.h>                /* for struct machine_ops */
-#include <asm/kvm_para.h>
-#include <asm/pci_x86.h>
-#include <asm/pci-direct.h>
-
-/*G:010
- * Welcome to the Guest!
- *
- * The Guest in our tale is a simple creature: identical to the Host but
- * behaving in simplified but equivalent ways.  In particular, the Guest is the
- * same kernel as the Host (or at least, built from the same source code).
-:*/
-
-struct lguest_data lguest_data = {
-       .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
-       .noirq_iret = (u32)lguest_noirq_iret,
-       .kernel_address = PAGE_OFFSET,
-       .blocked_interrupts = { 1 }, /* Block timer interrupts */
-       .syscall_vec = IA32_SYSCALL_VECTOR,
-};
-
-/*G:037
- * async_hcall() is pretty simple: I'm quite proud of it really.  We have a
- * ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall.  Each entry in the ring has 5 slots for the hypercall
- * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
- * and 255 once the Host has finished with it.
- *
- * If we come around to a slot which hasn't been finished, then the table is
- * full and we just make the hypercall directly.  This has the nice side
- * effect of causing the Host to run all the stored calls in the ring buffer
- * which empties it for next time!
- */
-static void async_hcall(unsigned long call, unsigned long arg1,
-                       unsigned long arg2, unsigned long arg3,
-                       unsigned long arg4)
-{
-       /* Note: This code assumes we're uniprocessor. */
-       static unsigned int next_call;
-       unsigned long flags;
-
-       /*
-        * Disable interrupts if not already disabled: we don't want an
-        * interrupt handler making a hypercall while we're already doing
-        * one!
-        */
-       local_irq_save(flags);
-       if (lguest_data.hcall_status[next_call] != 0xFF) {
-               /* Table full, so do normal hcall which will flush table. */
-               hcall(call, arg1, arg2, arg3, arg4);
-       } else {
-               lguest_data.hcalls[next_call].arg0 = call;
-               lguest_data.hcalls[next_call].arg1 = arg1;
-               lguest_data.hcalls[next_call].arg2 = arg2;
-               lguest_data.hcalls[next_call].arg3 = arg3;
-               lguest_data.hcalls[next_call].arg4 = arg4;
-               /* Arguments must all be written before we mark it to go */
-               wmb();
-               lguest_data.hcall_status[next_call] = 0;
-               if (++next_call == LHCALL_RING_SIZE)
-                       next_call = 0;
-       }
-       local_irq_restore(flags);
-}
-
-/*G:035
- * Notice the lazy_hcall() above, rather than hcall().  This is our first real
- * optimization trick!
- *
- * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
- * them as a batch when lazy_mode is eventually turned off.  Because hypercalls
- * are reasonably expensive, batching them up makes sense.  For example, a
- * large munmap might update dozens of page table entries: that code calls
- * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
- * lguest_leave_lazy_mode().
- *
- * So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing:
- */
-static void lazy_hcall1(unsigned long call, unsigned long arg1)
-{
-       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-               hcall(call, arg1, 0, 0, 0);
-       else
-               async_hcall(call, arg1, 0, 0, 0);
-}
-
-/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
-static void lazy_hcall2(unsigned long call,
-                       unsigned long arg1,
-                       unsigned long arg2)
-{
-       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-               hcall(call, arg1, arg2, 0, 0);
-       else
-               async_hcall(call, arg1, arg2, 0, 0);
-}
-
-static void lazy_hcall3(unsigned long call,
-                       unsigned long arg1,
-                       unsigned long arg2,
-                       unsigned long arg3)
-{
-       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-               hcall(call, arg1, arg2, arg3, 0);
-       else
-               async_hcall(call, arg1, arg2, arg3, 0);
-}
-
-#ifdef CONFIG_X86_PAE
-static void lazy_hcall4(unsigned long call,
-                       unsigned long arg1,
-                       unsigned long arg2,
-                       unsigned long arg3,
-                       unsigned long arg4)
-{
-       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-               hcall(call, arg1, arg2, arg3, arg4);
-       else
-               async_hcall(call, arg1, arg2, arg3, arg4);
-}
-#endif
-
-/*G:036
- * When lazy mode is turned off, we issue the do-nothing hypercall to
- * flush any stored calls, and call the generic helper to reset the
- * per-cpu lazy mode variable.
- */
-static void lguest_leave_lazy_mmu_mode(void)
-{
-       hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
-       paravirt_leave_lazy_mmu();
-}
-
-/*
- * We also catch the end of context switch; we enter lazy mode for much of
- * that too, so again we need to flush here.
- *
- * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
- * mode, but unlike Xen, lguest doesn't care about the difference).
- */
-static void lguest_end_context_switch(struct task_struct *next)
-{
-       hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
-       paravirt_end_context_switch(next);
-}
-
-/*G:032
- * After that diversion we return to our first native-instruction
- * replacements: four functions for interrupt control.
- *
- * The simplest way of implementing these would be to have "turn interrupts
- * off" and "turn interrupts on" hypercalls.  Unfortunately, this is too slow:
- * these are by far the most commonly called functions of those we override.
- *
- * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
- * which the Guest can update with a single instruction.  The Host knows to
- * check there before it tries to deliver an interrupt.
- */
-
-/*
- * save_flags() is expected to return the processor state (ie. "flags").  The
- * flags word contains all kind of stuff, but in practice Linux only cares
- * about the interrupt flag.  Our "save_flags()" just returns that.
- */
-asmlinkage __visible unsigned long lguest_save_fl(void)
-{
-       return lguest_data.irq_enabled;
-}
-
-/* Interrupts go off... */
-asmlinkage __visible void lguest_irq_disable(void)
-{
-       lguest_data.irq_enabled = 0;
-}
-
-/*
- * Let's pause a moment.  Remember how I said these are called so often?
- * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
- * break some rules.  In particular, these functions are assumed to save their
- * own registers if they need to: normal C functions assume they can trash the
- * eax register.  To use normal C functions, we use
- * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
- * C function, then restores it.
- */
-PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl);
-PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable);
-/*:*/
-
-/* These are in head_32.S */
-extern void lg_irq_enable(void);
-extern void lg_restore_fl(unsigned long flags);
-
-/*M:003
- * We could be more efficient in our checking of outstanding interrupts, rather
- * than using a branch.  One way would be to put the "irq_enabled" field in a
- * page by itself, and have the Host write-protect it when an interrupt comes
- * in when irqs are disabled.  There will then be a page fault as soon as
- * interrupts are re-enabled.
- *
- * A better method is to implement soft interrupt disable generally for x86:
- * instead of disabling interrupts, we set a flag.  If an interrupt does come
- * in, we then disable them for real.  This is uncommon, so we could simply use
- * a hypercall for interrupt control and not worry about efficiency.
-:*/
-
-/*G:034
- * The Interrupt Descriptor Table (IDT).
- *
- * The IDT tells the processor what to do when an interrupt comes in.  Each
- * entry in the table is a 64-bit descriptor: this holds the privilege level,
- * address of the handler, and... well, who cares?  The Guest just asks the
- * Host to make the change anyway, because the Host controls the real IDT.
- */
-static void lguest_write_idt_entry(gate_desc *dt,
-                                  int entrynum, const gate_desc *g)
-{
-       /*
-        * The gate_desc structure is 8 bytes long: we hand it to the Host in
-        * two 32-bit chunks.  The whole 32-bit kernel used to hand descriptors
-        * around like this; typesafety wasn't a big concern in Linux's early
-        * years.
-        */
-       u32 *desc = (u32 *)g;
-       /* Keep the local copy up to date. */
-       native_write_idt_entry(dt, entrynum, g);
-       /* Tell Host about this new entry. */
-       hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0);
-}
-
-/*
- * Changing to a different IDT is very rare: we keep the IDT up-to-date every
- * time it is written, so we can simply loop through all entries and tell the
- * Host about them.
- */
-static void lguest_load_idt(const struct desc_ptr *desc)
-{
-       unsigned int i;
-       struct desc_struct *idt = (void *)desc->address;
-
-       for (i = 0; i < (desc->size+1)/8; i++)
-               hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0);
-}
-
-/*
- * The Global Descriptor Table.
- *
- * The Intel architecture defines another table, called the Global Descriptor
- * Table (GDT).  You tell the CPU where it is (and its size) using the "lgdt"
- * instruction, and then several other instructions refer to entries in the
- * table.  There are three entries which the Switcher needs, so the Host simply
- * controls the entire thing and the Guest asks it to make changes using the
- * LOAD_GDT hypercall.
- *
- * This is the exactly like the IDT code.
- */
-static void lguest_load_gdt(const struct desc_ptr *desc)
-{
-       unsigned int i;
-       struct desc_struct *gdt = (void *)desc->address;
-
-       for (i = 0; i < (desc->size+1)/8; i++)
-               hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0);
-}
-
-/*
- * For a single GDT entry which changes, we simply change our copy and
- * then tell the host about it.
- */
-static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
-                                  const void *desc, int type)
-{
-       native_write_gdt_entry(dt, entrynum, desc, type);
-       /* Tell Host about this new entry. */
-       hcall(LHCALL_LOAD_GDT_ENTRY, entrynum,
-             dt[entrynum].a, dt[entrynum].b, 0);
-}
-
-/*
- * There are three "thread local storage" GDT entries which change
- * on every context switch (these three entries are how glibc implements
- * __thread variables).  As an optimization, we have a hypercall
- * specifically for this case.
- *
- * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall
- * which took a range of entries?
- */
-static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-       /*
-        * There's one problem which normal hardware doesn't have: the Host
-        * can't handle us removing entries we're currently using.  So we clear
-        * the GS register here: if it's needed it'll be reloaded anyway.
-        */
-       lazy_load_gs(0);
-       lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
-}
-
-/*G:038
- * That's enough excitement for now, back to ploughing through each of the
- * different pv_ops structures (we're about 1/3 of the way through).
- *
- * This is the Local Descriptor Table, another weird Intel thingy.  Linux only
- * uses this for some strange applications like Wine.  We don't do anything
- * here, so they'll get an informative and friendly Segmentation Fault.
- */
-static void lguest_set_ldt(const void *addr, unsigned entries)
-{
-}
-
-/*
- * This loads a GDT entry into the "Task Register": that entry points to a
- * structure called the Task State Segment.  Some comments scattered though the
- * kernel code indicate that this used for task switching in ages past, along
- * with blood sacrifice and astrology.
- *
- * Now there's nothing interesting in here that we don't get told elsewhere.
- * But the native version uses the "ltr" instruction, which makes the Host
- * complain to the Guest about a Segmentation Fault and it'll oops.  So we
- * override the native version with a do-nothing version.
- */
-static void lguest_load_tr_desc(void)
-{
-}
-
-/*
- * The "cpuid" instruction is a way of querying both the CPU identity
- * (manufacturer, model, etc) and its features.  It was introduced before the
- * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
- * As you might imagine, after a decade and a half this treatment, it is now a
- * giant ball of hair.  Its entry in the current Intel manual runs to 28 pages.
- *
- * This instruction even it has its own Wikipedia entry.  The Wikipedia entry
- * has been translated into 6 languages.  I am not making this up!
- *
- * We could get funky here and identify ourselves as "GenuineLguest", but
- * instead we just use the real "cpuid" instruction.  Then I pretty much turned
- * off feature bits until the Guest booted.  (Don't say that: you'll damage
- * lguest sales!)  Shut up, inner voice!  (Hey, just pointing out that this is
- * hardly future proof.)  No one's listening!  They don't like you anyway,
- * parenthetic weirdo!
- *
- * Replacing the cpuid so we can turn features off is great for the kernel, but
- * anyone (including userspace) can just use the raw "cpuid" instruction and
- * the Host won't even notice since it isn't privileged.  So we try not to get
- * too worked up about it.
- */
-static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
-                        unsigned int *cx, unsigned int *dx)
-{
-       int function = *ax;
-
-       native_cpuid(ax, bx, cx, dx);
-       switch (function) {
-       /*
-        * CPUID 0 gives the highest legal CPUID number (and the ID string).
-        * We futureproof our code a little by sticking to known CPUID values.
-        */
-       case 0:
-               if (*ax > 5)
-                       *ax = 5;
-               break;
-
-       /*
-        * CPUID 1 is a basic feature request.
-        *
-        * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
-        * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
-        */
-       case 1:
-               *cx &= 0x00002201;
-               *dx &= 0x07808151;
-               /*
-                * The Host can do a nice optimization if it knows that the
-                * kernel mappings (addresses above 0xC0000000 or whatever
-                * PAGE_OFFSET is set to) haven't changed.  But Linux calls
-                * flush_tlb_user() for both user and kernel mappings unless
-                * the Page Global Enable (PGE) feature bit is set.
-                */
-               *dx |= 0x00002000;
-               /*
-                * We also lie, and say we're family id 5.  6 or greater
-                * leads to a rdmsr in early_init_intel which we can't handle.
-                * Family ID is returned as bits 8-12 in ax.
-                */
-               *ax &= 0xFFFFF0FF;
-               *ax |= 0x00000500;
-               break;
-
-       /*
-        * This is used to detect if we're running under KVM.  We might be,
-        * but that's a Host matter, not us.  So say we're not.
-        */
-       case KVM_CPUID_SIGNATURE:
-               *bx = *cx = *dx = 0;
-               break;
-
-       /*
-        * 0x80000000 returns the highest Extended Function, so we futureproof
-        * like we do above by limiting it to known fields.
-        */
-       case 0x80000000:
-               if (*ax > 0x80000008)
-                       *ax = 0x80000008;
-               break;
-
-       /*
-        * PAE systems can mark pages as non-executable.  Linux calls this the
-        * NX bit.  Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
-        * Virus Protection).  We just switch it off here, since we don't
-        * support it.
-        */
-       case 0x80000001:
-               *dx &= ~(1 << 20);
-               break;
-       }
-}
-
-/*
- * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
- * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
- * it.  The Host needs to know when the Guest wants to change them, so we have
- * a whole series of functions like read_cr0() and write_cr0().
- *
- * We start with cr0.  cr0 allows you to turn on and off all kinds of basic
- * features, but the only cr0 bit that Linux ever used at runtime was the
- * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8)
- *
- * What does the TS bit do?  Well, it causes the CPU to trap (interrupt 7) if
- * the floating point unit is used.  Which allows us to restore FPU state
- * lazily after a task switch if we wanted to, but wouldn't a name like
- * "FPUTRAP bit" be a little less cryptic?
- *
- * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore
- * cr0.
- */
-static void lguest_write_cr0(unsigned long val)
-{
-}
-
-static unsigned long lguest_read_cr0(void)
-{
-       return 0;
-}
-
-/*
- * cr2 is the virtual address of the last page fault, which the Guest only ever
- * reads.  The Host kindly writes this into our "struct lguest_data", so we
- * just read it out of there.
- */
-static unsigned long lguest_read_cr2(void)
-{
-       return lguest_data.cr2;
-}
-
-/* See lguest_set_pte() below. */
-static bool cr3_changed = false;
-static unsigned long current_cr3;
-
-/*
- * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0.  Keep a local copy, and tell the Host when it changes.
- */
-static void lguest_write_cr3(unsigned long cr3)
-{
-       lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
-       current_cr3 = cr3;
-
-       /* These two page tables are simple, linear, and used during boot */
-       if (cr3 != __pa_symbol(swapper_pg_dir) &&
-           cr3 != __pa_symbol(initial_page_table))
-               cr3_changed = true;
-}
-
-static unsigned long lguest_read_cr3(void)
-{
-       return current_cr3;
-}
-
-/* cr4 is used to enable and disable PGE, but we don't care. */
-static unsigned long lguest_read_cr4(void)
-{
-       return 0;
-}
-
-static void lguest_write_cr4(unsigned long val)
-{
-}
-
-/*
- * Page Table Handling.
- *
- * Now would be a good time to take a rest and grab a coffee or similarly
- * relaxing stimulant.  The easy parts are behind us, and the trek gradually
- * winds uphill from here.
- *
- * Quick refresher: memory is divided into "pages" of 4096 bytes each.  The CPU
- * maps virtual addresses to physical addresses using "page tables".  We could
- * use one huge index of 1 million entries: each address is 4 bytes, so that's
- * 1024 pages just to hold the page tables.   But since most virtual addresses
- * are unused, we use a two level index which saves space.  The cr3 register
- * contains the physical address of the top level "page directory" page, which
- * contains physical addresses of up to 1024 second-level pages.  Each of these
- * second level pages contains up to 1024 physical addresses of actual pages,
- * or Page Table Entries (PTEs).
- *
- * Here's a diagram, where arrows indicate physical addresses:
- *
- * cr3 ---> +---------+
- *         |      --------->+---------+
- *         |         |      | PADDR1  |
- *       Mid-level   |      | PADDR2  |
- *       (PMD) page  |      |         |
- *         |         |    Lower-level |
- *         |         |    (PTE) page  |
- *         |         |      |         |
- *           ....               ....
- *
- * So to convert a virtual address to a physical address, we look up the top
- * level, which points us to the second level, which gives us the physical
- * address of that page.  If the top level entry was not present, or the second
- * level entry was not present, then the virtual address is invalid (we
- * say "the page was not mapped").
- *
- * Put another way, a 32-bit virtual address is divided up like so:
- *
- *  1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
- *    Index into top     Index into second      Offset within page
- *  page directory page    pagetable page
- *
- * Now, unfortunately, this isn't the whole story: Intel added Physical Address
- * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
- * These are held in 64-bit page table entries, so we can now only fit 512
- * entries in a page, and the neat three-level tree breaks down.
- *
- * The result is a four level page table:
- *
- * cr3 --> [ 4 Upper  ]
- *        [   Level  ]
- *        [  Entries ]
- *        [(PUD Page)]---> +---------+
- *                         |      --------->+---------+
- *                         |         |      | PADDR1  |
- *                       Mid-level   |      | PADDR2  |
- *                       (PMD) page  |      |         |
- *                         |         |    Lower-level |
- *                         |         |    (PTE) page  |
- *                         |         |      |         |
- *                           ....               ....
- *
- *
- * And the virtual address is decoded as:
- *
- *         1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- *      |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
- * Index into    Index into mid    Index into lower    Offset within page
- * top entries   directory page     pagetable page
- *
- * It's too hard to switch between these two formats at runtime, so Linux only
- * supports one or the other depending on whether CONFIG_X86_PAE is set.  Many
- * distributions turn it on, and not just for people with silly amounts of
- * memory: the larger PTE entries allow room for the NX bit, which lets the
- * kernel disable execution of pages and increase security.
- *
- * This was a problem for lguest, which couldn't run on these distributions;
- * then Matias Zabaljauregui figured it all out and implemented it, and only a
- * handful of puppies were crushed in the process!
- *
- * Back to our point: the kernel spends a lot of time changing both the
- * top-level page directory and lower-level pagetable pages.  The Guest doesn't
- * know physical addresses, so while it maintains these page tables exactly
- * like normal, it also needs to keep the Host informed whenever it makes a
- * change: the Host will create the real page tables based on the Guests'.
- */
-
-/*
- * The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space.  We tell the Host the toplevel and
- * address this corresponds to.  The Guest uses one pagetable per process, so
- * we need to tell the Host which one we're changing (mm->pgd).
- */
-static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
-                              pte_t *ptep)
-{
-#ifdef CONFIG_X86_PAE
-       /* PAE needs to hand a 64 bit page table entry, so it uses two args. */
-       lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
-                   ptep->pte_low, ptep->pte_high);
-#else
-       lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
-#endif
-}
-
-/* This is the "set and update" combo-meal-deal version. */
-static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
-                             pte_t *ptep, pte_t pteval)
-{
-       native_set_pte(ptep, pteval);
-       lguest_pte_update(mm, addr, ptep);
-}
-
-/*
- * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
- * to set a middle-level entry when PAE is activated.
- *
- * Again, we set the entry then tell the Host which page we changed,
- * and the index of the entry we changed.
- */
-#ifdef CONFIG_X86_PAE
-static void lguest_set_pud(pud_t *pudp, pud_t pudval)
-{
-       native_set_pud(pudp, pudval);
-
-       /* 32 bytes aligned pdpt address and the index. */
-       lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
-                  (__pa(pudp) & 0x1F) / sizeof(pud_t));
-}
-
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-       native_set_pmd(pmdp, pmdval);
-       lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
-                  (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#else
-
-/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-       native_set_pmd(pmdp, pmdval);
-       lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
-                  (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#endif
-
-/*
- * There are a couple of legacy places where the kernel sets a PTE, but we
- * don't know the top level any more.  This is useless for us, since we don't
- * know which pagetable is changing or what address, so we just tell the Host
- * to forget all of them.  Fortunately, this is very rare.
- *
- * ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow: 48 seconds!  So we don't even tell
- * the Host anything changed until we've done the first real page table switch,
- * which brings boot back to 4.3 seconds.
- */
-static void lguest_set_pte(pte_t *ptep, pte_t pteval)
-{
-       native_set_pte(ptep, pteval);
-       if (cr3_changed)
-               lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * With 64-bit PTE values, we need to be careful setting them: if we set 32
- * bits at a time, the hardware could see a weird half-set entry.  These
- * versions ensure we update all 64 bits at once.
- */
-static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
-       native_set_pte_atomic(ptep, pte);
-       if (cr3_changed)
-               lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
-                            pte_t *ptep)
-{
-       native_pte_clear(mm, addr, ptep);
-       lguest_pte_update(mm, addr, ptep);
-}
-
-static void lguest_pmd_clear(pmd_t *pmdp)
-{
-       lguest_set_pmd(pmdp, __pmd(0));
-}
-#endif
-
-/*
- * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
- * native page table operations.  On native hardware you can set a new page
- * table entry whenever you want, but if you want to remove one you have to do
- * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
- *
- * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only
- * called when a valid entry is written, not when it's removed (ie. marked not
- * present).  Instead, this is where we come when the Guest wants to remove a
- * page table entry: we tell the Host to set that entry to 0 (ie. the present
- * bit is zero).
- */
-static void lguest_flush_tlb_single(unsigned long addr)
-{
-       /* Simply set it to zero: if it was not, it will fault back in. */
-       lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
-}
-
-/*
- * This is what happens after the Guest has removed a large number of entries.
- * This tells the Host that any of the page table entries for userspace might
- * have changed, ie. virtual addresses below PAGE_OFFSET.
- */
-static void lguest_flush_tlb_user(void)
-{
-       lazy_hcall1(LHCALL_FLUSH_TLB, 0);
-}
-
-/*
- * This is called when the kernel page tables have changed.  That's not very
- * common (unless the Guest is using highmem, which makes the Guest extremely
- * slow), so it's worth separating this from the user flushing above.
- */
-static void lguest_flush_tlb_kernel(void)
-{
-       lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-/*
- * The Unadvanced Programmable Interrupt Controller.
- *
- * This is an attempt to implement the simplest possible interrupt controller.
- * I spent some time looking though routines like set_irq_chip_and_handler,
- * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and
- * I *think* this is as simple as it gets.
- *
- * We can tell the Host what interrupts we want blocked ready for using the
- * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as
- * simple as setting a bit.  We don't actually "ack" interrupts as such, we
- * just mask and unmask them.  I wonder if we should be cleverer?
- */
-static void disable_lguest_irq(struct irq_data *data)
-{
-       set_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-static void enable_lguest_irq(struct irq_data *data)
-{
-       clear_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-/* This structure describes the lguest IRQ controller. */
-static struct irq_chip lguest_irq_controller = {
-       .name           = "lguest",
-       .irq_mask       = disable_lguest_irq,
-       .irq_mask_ack   = disable_lguest_irq,
-       .irq_unmask     = enable_lguest_irq,
-};
-
-/*
- * Interrupt descriptors are allocated as-needed, but low-numbered ones are
- * reserved by the generic x86 code.  So we ignore irq_alloc_desc_at if it
- * tells us the irq is already used: other errors (ie. ENOMEM) we take
- * seriously.
- */
-static int lguest_setup_irq(unsigned int irq)
-{
-       struct irq_desc *desc;
-       int err;
-
-       /* Returns -ve error or vector number. */
-       err = irq_alloc_desc_at(irq, 0);
-       if (err < 0 && err != -EEXIST)
-               return err;
-
-       /*
-        * Tell the Linux infrastructure that the interrupt is
-        * controlled by our level-based lguest interrupt controller.
-        */
-       irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
-                                     handle_level_irq, "level");
-
-       /* Some systems map "vectors" to interrupts weirdly.  Not us! */
-       desc = irq_to_desc(irq);
-       __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc);
-       return 0;
-}
-
-static int lguest_enable_irq(struct pci_dev *dev)
-{
-       int err;
-       u8 line = 0;
-
-       /* We literally use the PCI interrupt line as the irq number. */
-       pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
-       err = lguest_setup_irq(line);
-       if (!err)
-               dev->irq = line;
-       return err;
-}
-
-/* We don't do hotplug PCI, so this shouldn't be called. */
-static void lguest_disable_irq(struct pci_dev *dev)
-{
-       WARN_ON(1);
-}
-
-/*
- * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls).
- */
-static void __init lguest_init_IRQ(void)
-{
-       unsigned int i;
-
-       for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
-               if (i != IA32_SYSCALL_VECTOR)
-                       set_intr_gate(i, irq_entries_start +
-                                       8 * (i - FIRST_EXTERNAL_VECTOR));
-       }
-
-       /*
-        * This call is required to set up for 4k stacks, where we have
-        * separate stacks for hard and soft interrupts.
-        */
-       irq_ctx_init(smp_processor_id());
-}
-
-/*
- * Time.
- *
- * It would be far better for everyone if the Guest had its own clock, but
- * until then the Host gives us the time on every interrupt.
- */
-static void lguest_get_wallclock(struct timespec *now)
-{
-       *now = lguest_data.time;
-}
-
-/*
- * The TSC is an Intel thing called the Time Stamp Counter.  The Host tells us
- * what speed it runs at, or 0 if it's unusable as a reliable clock source.
- * This matches what we want here: if we return 0 from this function, the x86
- * TSC clock will give up and not register itself.
- */
-static unsigned long lguest_tsc_khz(void)
-{
-       return lguest_data.tsc_khz;
-}
-
-/*
- * If we can't use the TSC, the kernel falls back to our lower-priority
- * "lguest_clock", where we read the time value given to us by the Host.
- */
-static u64 lguest_clock_read(struct clocksource *cs)
-{
-       unsigned long sec, nsec;
-
-       /*
-        * Since the time is in two parts (seconds and nanoseconds), we risk
-        * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
-        * and getting 99 and 0.  As Linux tends to come apart under the stress
-        * of time travel, we must be careful:
-        */
-       do {
-               /* First we read the seconds part. */
-               sec = lguest_data.time.tv_sec;
-               /*
-                * This read memory barrier tells the compiler and the CPU that
-                * this can't be reordered: we have to complete the above
-                * before going on.
-                */
-               rmb();
-               /* Now we read the nanoseconds part. */
-               nsec = lguest_data.time.tv_nsec;
-               /* Make sure we've done that. */
-               rmb();
-               /* Now if the seconds part has changed, try again. */
-       } while (unlikely(lguest_data.time.tv_sec != sec));
-
-       /* Our lguest clock is in real nanoseconds. */
-       return sec*1000000000ULL + nsec;
-}
-
-/* This is the fallback clocksource: lower priority than the TSC clocksource. */
-static struct clocksource lguest_clock = {
-       .name           = "lguest",
-       .rating         = 200,
-       .read           = lguest_clock_read,
-       .mask           = CLOCKSOURCE_MASK(64),
-       .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-/*
- * We also need a "struct clock_event_device": Linux asks us to set it to go
- * off some time in the future.  Actually, James Morris figured all this out, I
- * just applied the patch.
- */
-static int lguest_clockevent_set_next_event(unsigned long delta,
-                                           struct clock_event_device *evt)
-{
-       /* FIXME: I don't think this can ever happen, but James tells me he had
-        * to put this code in.  Maybe we should remove it now.  Anyone? */
-       if (delta < LG_CLOCK_MIN_DELTA) {
-               if (printk_ratelimit())
-                       printk(KERN_DEBUG "%s: small delta %lu ns\n",
-                              __func__, delta);
-               return -ETIME;
-       }
-
-       /* Please wake us this far in the future. */
-       hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0);
-       return 0;
-}
-
-static int lguest_clockevent_shutdown(struct clock_event_device *evt)
-{
-       /* A 0 argument shuts the clock down. */
-       hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
-       return 0;
-}
-
-/* This describes our primitive timer chip. */
-static struct clock_event_device lguest_clockevent = {
-       .name                   = "lguest",
-       .features               = CLOCK_EVT_FEAT_ONESHOT,
-       .set_next_event         = lguest_clockevent_set_next_event,
-       .set_state_shutdown     = lguest_clockevent_shutdown,
-       .rating                 = INT_MAX,
-       .mult                   = 1,
-       .shift                  = 0,
-       .min_delta_ns           = LG_CLOCK_MIN_DELTA,
-       .min_delta_ticks        = LG_CLOCK_MIN_DELTA,
-       .max_delta_ns           = LG_CLOCK_MAX_DELTA,
-       .max_delta_ticks        = LG_CLOCK_MAX_DELTA,
-};
-
-/*
- * This is the Guest timer interrupt handler (hardware interrupt 0).  We just
- * call the clockevent infrastructure and it does whatever needs doing.
- */
-static void lguest_time_irq(struct irq_desc *desc)
-{
-       unsigned long flags;
-
-       /* Don't interrupt us while this is running. */
-       local_irq_save(flags);
-       lguest_clockevent.event_handler(&lguest_clockevent);
-       local_irq_restore(flags);
-}
-
-/*
- * At some point in the boot process, we get asked to set up our timing
- * infrastructure.  The kernel doesn't expect timer interrupts before this, but
- * we cleverly initialized the "blocked_interrupts" field of "struct
- * lguest_data" so that timer interrupts were blocked until now.
- */
-static void lguest_time_init(void)
-{
-       /* Set up the timer interrupt (0) to go to our simple timer routine */
-       if (lguest_setup_irq(0) != 0)
-               panic("Could not set up timer irq");
-       irq_set_handler(0, lguest_time_irq);
-
-       clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
-
-       /* We can't set cpumask in the initializer: damn C limitations!  Set it
-        * here and register our timer device. */
-       lguest_clockevent.cpumask = cpumask_of(0);
-       clockevents_register_device(&lguest_clockevent);
-
-       /* Finally, we unblock the timer interrupt. */
-       clear_bit(0, lguest_data.blocked_interrupts);
-}
-
-/*
- * Miscellaneous bits and pieces.
- *
- * Here is an oddball collection of functions which the Guest needs for things
- * to work.  They're pretty simple.
- */
-
-/*
- * The Guest needs to tell the Host what stack it expects traps to use.  For
- * native hardware, this is part of the Task State Segment mentioned above in
- * lguest_load_tr_desc(), but to help hypervisors there's this special call.
- *
- * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
- * segment), the privilege level (we're privilege level 1, the Host is 0 and
- * will not tolerate us trying to use that), the stack pointer, and the number
- * of pages in the stack.
- */
-static void lguest_load_sp0(struct tss_struct *tss,
-                           struct thread_struct *thread)
-{
-       lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
-                  THREAD_SIZE / PAGE_SIZE);
-       tss->x86_tss.sp0 = thread->sp0;
-}
-
-/* Let's just say, I wouldn't do debugging under a Guest. */
-static unsigned long lguest_get_debugreg(int regno)
-{
-       /* FIXME: Implement */
-       return 0;
-}
-
-static void lguest_set_debugreg(int regno, unsigned long value)
-{
-       /* FIXME: Implement */
-}
-
-/*
- * There are times when the kernel wants to make sure that no memory writes are
- * caught in the cache (that they've all reached real hardware devices).  This
- * doesn't matter for the Guest which has virtual hardware.
- *
- * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush
- * (clflush) instruction is available and the kernel uses that.  Otherwise, it
- * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction.
- * Unlike clflush, wbinvd can only be run at privilege level 0.  So we can
- * ignore clflush, but replace wbinvd.
- */
-static void lguest_wbinvd(void)
-{
-}
-
-/*
- * If the Guest expects to have an Advanced Programmable Interrupt Controller,
- * we play dumb by ignoring writes and returning 0 for reads.  So it's no
- * longer Programmable nor Controlling anything, and I don't think 8 lines of
- * code qualifies for Advanced.  It will also never interrupt anything.  It
- * does, however, allow us to get through the Linux boot code.
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(u32 reg, u32 v)
-{
-}
-
-static u32 lguest_apic_read(u32 reg)
-{
-       return 0;
-}
-
-static u64 lguest_apic_icr_read(void)
-{
-       return 0;
-}
-
-static void lguest_apic_icr_write(u32 low, u32 id)
-{
-       /* Warn to see if there's any stray references */
-       WARN_ON(1);
-}
-
-static void lguest_apic_wait_icr_idle(void)
-{
-       return;
-}
-
-static u32 lguest_apic_safe_wait_icr_idle(void)
-{
-       return 0;
-}
-
-static void set_lguest_basic_apic_ops(void)
-{
-       apic->read = lguest_apic_read;
-       apic->write = lguest_apic_write;
-       apic->icr_read = lguest_apic_icr_read;
-       apic->icr_write = lguest_apic_icr_write;
-       apic->wait_icr_idle = lguest_apic_wait_icr_idle;
-       apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle;
-};
-#endif
-
-/* STOP!  Until an interrupt comes in. */
-static void lguest_safe_halt(void)
-{
-       hcall(LHCALL_HALT, 0, 0, 0, 0);
-}
-
-/*
- * The SHUTDOWN hypercall takes a string to describe what's happening, and
- * an argument which says whether this to restart (reboot) the Guest or not.
- *
- * Note that the Host always prefers that the Guest speak in physical addresses
- * rather than virtual addresses, so we use __pa() here.
- */
-static void lguest_power_off(void)
-{
-       hcall(LHCALL_SHUTDOWN, __pa("Power down"),
-             LGUEST_SHUTDOWN_POWEROFF, 0, 0);
-}
-
-/*
- * Panicing.
- *
- * Don't.  But if you did, this is what happens.
- */
-static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
-{
-       hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0);
-       /* The hcall won't return, but to keep gcc happy, we're "done". */
-       return NOTIFY_DONE;
-}
-
-static struct notifier_block paniced = {
-       .notifier_call = lguest_panic
-};
-
-/* Setting up memory is fairly easy. */
-static __init char *lguest_memory_setup(void)
-{
-       /*
-        * The Linux bootloader header contains an "e820" memory map: the
-        * Launcher populated the first entry with our memory limit.
-        */
-       e820__range_add(boot_params.e820_table[0].addr,
-                         boot_params.e820_table[0].size,
-                         boot_params.e820_table[0].type);
-
-       /* This string is for the boot messages. */
-       return "LGUEST";
-}
-
-/* Offset within PCI config space of BAR access capability. */
-static int console_cfg_offset = 0;
-static int console_access_cap;
-
-/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */
-static void set_cfg_window(u32 cfg_offset, u32 off)
-{
-       write_pci_config_byte(0, 1, 0,
-                             cfg_offset + offsetof(struct virtio_pci_cap, bar),
-                             0);
-       write_pci_config(0, 1, 0,
-                        cfg_offset + offsetof(struct virtio_pci_cap, length),
-                        4);
-       write_pci_config(0, 1, 0,
-                        cfg_offset + offsetof(struct virtio_pci_cap, offset),
-                        off);
-}
-
-static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val)
-{
-       /*
-        * We could set this up once, then leave it; nothing else in the *
-        * kernel should touch these registers.  But if it went wrong, that
-        * would be a horrible bug to find.
-        */
-       set_cfg_window(cfg_offset, off);
-       write_pci_config(0, 1, 0,
-                        cfg_offset + sizeof(struct virtio_pci_cap), val);
-}
-
-static void probe_pci_console(void)
-{
-       u8 cap, common_cap = 0, device_cap = 0;
-       u32 device_len;
-
-       /* Avoid recursive printk into here. */
-       console_cfg_offset = -1;
-
-       if (!early_pci_allowed()) {
-               printk(KERN_ERR "lguest: early PCI access not allowed!\n");
-               return;
-       }
-
-       /* We expect a console PCI device at BUS0, slot 1. */
-       if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) {
-               printk(KERN_ERR "lguest: PCI device is %#x!\n",
-                      read_pci_config(0, 1, 0, 0));
-               return;
-       }
-
-       /* Find the capabilities we need (must be in bar0) */
-       cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST);
-       while (cap) {
-               u8 vndr = read_pci_config_byte(0, 1, 0, cap);
-               if (vndr == PCI_CAP_ID_VNDR) {
-                       u8 type, bar;
-
-                       type = read_pci_config_byte(0, 1, 0,
-                           cap + offsetof(struct virtio_pci_cap, cfg_type));
-                       bar = read_pci_config_byte(0, 1, 0,
-                           cap + offsetof(struct virtio_pci_cap, bar));
-
-                       switch (type) {
-                       case VIRTIO_PCI_CAP_DEVICE_CFG:
-                               if (bar == 0)
-                                       device_cap = cap;
-                               break;
-                       case VIRTIO_PCI_CAP_PCI_CFG:
-                               console_access_cap = cap;
-                               break;
-                       }
-               }
-               cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT);
-       }
-       if (!device_cap || !console_access_cap) {
-               printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n",
-                      common_cap, device_cap, console_access_cap);
-               return;
-       }
-
-       /*
-        * Note that we can't check features, until we've set the DRIVER
-        * status bit.  We don't want to do that until we have a real driver,
-        * so we just check that the device-specific config has room for
-        * emerg_wr.  If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE
-        * it should ignore the access.
-        */
-       device_len = read_pci_config(0, 1, 0,
-                       device_cap + offsetof(struct virtio_pci_cap, length));
-       if (device_len < (offsetof(struct virtio_console_config, emerg_wr)
-                         + sizeof(u32))) {
-               printk(KERN_ERR "lguest: console missing emerg_wr field\n");
-               return;
-       }
-
-       console_cfg_offset = read_pci_config(0, 1, 0,
-                       device_cap + offsetof(struct virtio_pci_cap, offset));
-       printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n");
-}
-
-/*
- * We will eventually use the virtio console device to produce console output,
- * but before that is set up we use the virtio PCI console's backdoor mmio
- * access and the "emergency" write facility (which is legal even before the
- * device is configured).
- */
-static __init int early_put_chars(u32 vtermno, const char *buf, int count)
-{
-       /* If we couldn't find PCI console, forget it. */
-       if (console_cfg_offset < 0)
-               return count;
-
-       if (unlikely(!console_cfg_offset)) {
-               probe_pci_console();
-               if (console_cfg_offset < 0)
-                       return count;
-       }
-
-       write_bar_via_cfg(console_access_cap,
-                         console_cfg_offset
-                         + offsetof(struct virtio_console_config, emerg_wr),
-                         buf[0]);
-       return 1;
-}
-
-/*
- * Rebooting also tells the Host we're finished, but the RESTART flag tells the
- * Launcher to reboot us.
- */
-static void lguest_restart(char *reason)
-{
-       hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0);
-}
-
-/*G:050
- * Patching (Powerfully Placating Performance Pedants)
- *
- * We have already seen that pv_ops structures let us replace simple native
- * instructions with calls to the appropriate back end all throughout the
- * kernel.  This allows the same kernel to run as a Guest and as a native
- * kernel, but it's slow because of all the indirect branches.
- *
- * Remember that David Wheeler quote about "Any problem in computer science can
- * be solved with another layer of indirection"?  The rest of that quote is
- * "... But that usually will create another problem."  This is the first of
- * those problems.
- *
- * Our current solution is to allow the paravirt back end to optionally patch
- * over the indirect calls to replace them with something more efficient.  We
- * patch two of the simplest of the most commonly called functions: disable
- * interrupts and save interrupts.  We usually have 6 or 10 bytes to patch
- * into: the Guest versions of these operations are small enough that we can
- * fit comfortably.
- *
- * First we need assembly templates of each of the patchable Guest operations,
- * and these are in head_32.S.
- */
-
-/*G:060 We construct a table from the assembler templates: */
-static const struct lguest_insns
-{
-       const char *start, *end;
-} lguest_insns[] = {
-       [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
-       [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
-};
-
-/*
- * Now our patch routine is fairly simple (based on the native one in
- * paravirt.c).  If we have a replacement, we copy it in and return how much of
- * the available space we used.
- */
-static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
-                            unsigned long addr, unsigned len)
-{
-       unsigned int insn_len;
-
-       /* Don't do anything special if we don't have a replacement */
-       if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
-               return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
-       insn_len = lguest_insns[type].end - lguest_insns[type].start;
-
-       /* Similarly if it can't fit (doesn't happen, but let's be thorough). */
-       if (len < insn_len)
-               return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
-       /* Copy in our instructions. */
-       memcpy(ibuf, lguest_insns[type].start, insn_len);
-       return insn_len;
-}
-
-/*G:029
- * Once we get to lguest_init(), we know we're a Guest.  The various
- * pv_ops structures in the kernel provide points for (almost) every routine we
- * have to override to avoid privileged instructions.
- */
-__init void lguest_init(void)
-{
-       /* We're under lguest. */
-       pv_info.name = "lguest";
-       /* We're running at privilege level 1, not 0 as normal. */
-       pv_info.kernel_rpl = 1;
-       /* Everyone except Xen runs with this set. */
-       pv_info.shared_kernel_pmd = 1;
-
-       /*
-        * We set up all the lguest overrides for sensitive operations.  These
-        * are detailed with the operations themselves.
-        */
-
-       /* Interrupt-related operations */
-       pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl);
-       pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
-       pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable);
-       pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
-       pv_irq_ops.safe_halt = lguest_safe_halt;
-
-       /* Setup operations */
-       pv_init_ops.patch = lguest_patch;
-
-       /* Intercepts of various CPU instructions */
-       pv_cpu_ops.load_gdt = lguest_load_gdt;
-       pv_cpu_ops.cpuid = lguest_cpuid;
-       pv_cpu_ops.load_idt = lguest_load_idt;
-       pv_cpu_ops.iret = lguest_iret;
-       pv_cpu_ops.load_sp0 = lguest_load_sp0;
-       pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
-       pv_cpu_ops.set_ldt = lguest_set_ldt;
-       pv_cpu_ops.load_tls = lguest_load_tls;
-       pv_cpu_ops.get_debugreg = lguest_get_debugreg;
-       pv_cpu_ops.set_debugreg = lguest_set_debugreg;
-       pv_cpu_ops.read_cr0 = lguest_read_cr0;
-       pv_cpu_ops.write_cr0 = lguest_write_cr0;
-       pv_cpu_ops.read_cr4 = lguest_read_cr4;
-       pv_cpu_ops.write_cr4 = lguest_write_cr4;
-       pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
-       pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
-       pv_cpu_ops.wbinvd = lguest_wbinvd;
-       pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
-       pv_cpu_ops.end_context_switch = lguest_end_context_switch;
-
-       /* Pagetable management */
-       pv_mmu_ops.write_cr3 = lguest_write_cr3;
-       pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
-       pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
-       pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
-       pv_mmu_ops.set_pte = lguest_set_pte;
-       pv_mmu_ops.set_pte_at = lguest_set_pte_at;
-       pv_mmu_ops.set_pmd = lguest_set_pmd;
-#ifdef CONFIG_X86_PAE
-       pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
-       pv_mmu_ops.pte_clear = lguest_pte_clear;
-       pv_mmu_ops.pmd_clear = lguest_pmd_clear;
-       pv_mmu_ops.set_pud = lguest_set_pud;
-#endif
-       pv_mmu_ops.read_cr2 = lguest_read_cr2;
-       pv_mmu_ops.read_cr3 = lguest_read_cr3;
-       pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
-       pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
-       pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu;
-       pv_mmu_ops.pte_update = lguest_pte_update;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       /* APIC read/write intercepts */
-       set_lguest_basic_apic_ops();
-#endif
-
-       x86_init.resources.memory_setup = lguest_memory_setup;
-       x86_init.irqs.intr_init = lguest_init_IRQ;
-       x86_init.timers.timer_init = lguest_time_init;
-       x86_platform.calibrate_tsc = lguest_tsc_khz;
-       x86_platform.get_wallclock =  lguest_get_wallclock;
-
-       /*
-        * Now is a good time to look at the implementations of these functions
-        * before returning to the rest of lguest_init().
-        */
-
-       /*G:070
-        * Now we've seen all the paravirt_ops, we return to
-        * lguest_init() where the rest of the fairly chaotic boot setup
-        * occurs.
-        */
-
-       /*
-        * The stack protector is a weird thing where gcc places a canary
-        * value on the stack and then checks it on return.  This file is
-        * compiled with -fno-stack-protector, so we got this far without
-        * problems.  The value of the canary is kept at offset 20 from the
-        * %gs register, so we need to set that up before calling C functions
-        * in other files.
-        */
-       setup_stack_canary_segment(0);
-
-       /*
-        * We could just call load_stack_canary_segment(), but we might as well
-        * call switch_to_new_gdt() which loads the whole table and sets up the
-        * per-cpu segment descriptor register %fs as well.
-        */
-       switch_to_new_gdt(0);
-
-       /*
-        * The Host<->Guest Switcher lives at the top of our address space, and
-        * the Host told us how big it is when we made LGUEST_INIT hypercall:
-        * it put the answer in lguest_data.reserve_mem
-        */
-       reserve_top_address(lguest_data.reserve_mem);
-
-       /* Hook in our special panic hypercall code. */
-       atomic_notifier_chain_register(&panic_notifier_list, &paniced);
-
-       /*
-        * This is messy CPU setup stuff which the native boot code does before
-        * start_kernel, so we have to do, too:
-        */
-       cpu_detect(&new_cpu_data);
-       /* head.S usually sets up the first capability word, so do it here. */
-       new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
-
-       /* Math is always hard! */
-       set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
-
-       /* We don't have features.  We have puppies!  Puppies! */
-#ifdef CONFIG_X86_MCE
-       mca_cfg.disabled = true;
-#endif
-#ifdef CONFIG_ACPI
-       acpi_disabled = 1;
-#endif
-
-       /*
-        * We set the preferred console to "hvc".  This is the "hypervisor
-        * virtual console" driver written by the PowerPC people, which we also
-        * adapted for lguest's use.
-        */
-       add_preferred_console("hvc", 0, NULL);
-
-       /* Register our very early console. */
-       virtio_cons_early_init(early_put_chars);
-
-       /* Don't let ACPI try to control our PCI interrupts. */
-       disable_acpi();
-
-       /* We control them ourselves, by overriding these two hooks. */
-       pcibios_enable_irq = lguest_enable_irq;
-       pcibios_disable_irq = lguest_disable_irq;
-
-       /*
-        * Last of all, we set the power management poweroff hook to point to
-        * the Guest routine to power off, and the reboot hook to our restart
-        * routine.
-        */
-       pm_power_off = lguest_power_off;
-       machine_ops.restart = lguest_restart;
-
-       /*
-        * Now we're set up, call i386_start_kernel() in head32.c and we proceed
-        * to boot as normal.  It never returns.
-        */
-       i386_start_kernel();
-}
-/*
- * This marks the end of stage II of our journey, The Guest.
- *
- * It is now time for us to explore the layer of virtual drivers and complete
- * our understanding of the Guest in "make Drivers".
- */
diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S

deleted file mode 100644 (file)

index d5ae63f..0000000
--- a/arch/x86/lguest/head_32.S
+++ /dev/null
@@ -1,192 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/lguest.h>
-#include <asm/lguest_hcall.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/processor-flags.h>
-
-/*G:020
-
- * Our story starts with the bzImage: booting starts at startup_32 in
- * arch/x86/boot/compressed/head_32.S.  This merely uncompresses the real
- * kernel in place and then jumps into it: startup_32 in
- * arch/x86/kernel/head_32.S.  Both routines expect a boot header in the %esi
- * register, which is created by the bootloader (the Launcher in our case).
- *
- * The startup_32 function does very little: it clears the uninitialized global
- * C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe, and populates some initial
- * page tables.  Finally it checks the 'hardware_subarch' field.  This was
- * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
- * assigned number), then it calls us here.
- *
- * WARNING: be very careful here!  We're running at addresses equal to physical
- * addresses (around 0), not above PAGE_OFFSET as most code expects
- * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any
- * data without remembering to subtract __PAGE_OFFSET!
- *
- * The .section line puts this code in .init.text so it will be discarded after
- * boot.
- */
-.section .init.text, "ax", @progbits
-ENTRY(lguest_entry)
-       /*
-        * We make the "initialization" hypercall now to tell the Host where
-        * our lguest_data struct is.
-        */
-       movl $LHCALL_LGUEST_INIT, %eax
-       movl $lguest_data - __PAGE_OFFSET, %ebx
-       int $LGUEST_TRAP_ENTRY
-
-       /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
-       movl $LHCALL_NEW_PGTABLE, %eax
-       movl $(initial_page_table - __PAGE_OFFSET), %ebx
-       int $LGUEST_TRAP_ENTRY
-
-       /* Set up the initial stack so we can run C code. */
-       movl $(init_thread_union+THREAD_SIZE),%esp
-
-       /* Jumps are relative: we're running __PAGE_OFFSET too low. */
-       jmp lguest_init+__PAGE_OFFSET
-
-/*G:055
- * We create a macro which puts the assembler code between lgstart_ and lgend_
- * markers.  These templates are put in the .text section: they can't be
- * discarded after boot as we may need to patch modules, too.
- */
-.text
-#define LGUEST_PATCH(name, insns...)                   \
-       lgstart_##name: insns; lgend_##name:;           \
-       .globl lgstart_##name; .globl lgend_##name
-
-LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-
-/*G:033
- * But using those wrappers is inefficient (we'll see why that doesn't matter
- * for save_fl and irq_disable later).  If we write our routines carefully in
- * assembler, we can avoid clobbering any registers and avoid jumping through
- * the wrapper functions.
- *
- * I skipped over our first piece of assembler, but this one is worth studying
- * in a bit more detail so I'll describe in easy stages.  First, the routine to
- * enable interrupts:
- */
-ENTRY(lg_irq_enable)
-       /*
-        * The reverse of irq_disable, this sets lguest_data.irq_enabled to
-        * X86_EFLAGS_IF (ie. "Interrupts enabled").
-        */
-       movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
-       /*
-        * But now we need to check if the Host wants to know: there might have
-        * been interrupts waiting to be delivered, in which case it will have
-        * set lguest_data.irq_pending to X86_EFLAGS_IF.  If it's not zero, we
-        * jump to send_interrupts, otherwise we're done.
-        */
-       cmpl $0, lguest_data+LGUEST_DATA_irq_pending
-       jnz send_interrupts
-       /*
-        * One cool thing about x86 is that you can do many things without using
-        * a register.  In this case, the normal path hasn't needed to save or
-        * restore any registers at all!
-        */
-       ret
-send_interrupts:
-       /*
-        * OK, now we need a register: eax is used for the hypercall number,
-        * which is LHCALL_SEND_INTERRUPTS.
-        *
-        * We used not to bother with this pending detection at all, which was
-        * much simpler.  Sooner or later the Host would realize it had to
-        * send us an interrupt.  But that turns out to make performance 7
-        * times worse on a simple tcp benchmark.  So now we do this the hard
-        * way.
-        */
-       pushl %eax
-       movl $LHCALL_SEND_INTERRUPTS, %eax
-       /* This is the actual hypercall trap. */
-       int  $LGUEST_TRAP_ENTRY
-       /* Put eax back the way we found it. */
-       popl %eax
-       ret
-
-/*
- * Finally, the "popf" or "restore flags" routine.  The %eax register holds the
- * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
- * enabling interrupts again, if it's 0 we're leaving them off.
- */
-ENTRY(lg_restore_fl)
-       /* This is just "lguest_data.irq_enabled = flags;" */
-       movl %eax, lguest_data+LGUEST_DATA_irq_enabled
-       /*
-        * Now, if the %eax value has enabled interrupts and
-        * lguest_data.irq_pending is set, we want to tell the Host so it can
-        * deliver any outstanding interrupts.  Fortunately, both values will
-        * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
-        * instruction will AND them together for us.  If both are set, we
-        * jump to send_interrupts.
-        */
-       testl lguest_data+LGUEST_DATA_irq_pending, %eax
-       jnz send_interrupts
-       /* Again, the normal path has used no extra registers.  Clever, huh? */
-       ret
-/*:*/
-
-/* These demark the EIP where host should never deliver interrupts. */
-.global lguest_noirq_iret
-
-/*M:004
- * When the Host reflects a trap or injects an interrupt into the Guest, it
- * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
- * so the Guest iret logic does the right thing when restoring it.  However,
- * when the Host sets the Guest up for direct traps, such as system calls, the
- * processor is the one to push eflags onto the stack, and the interrupt bit
- * will be 1 (in reality, interrupts are always enabled in the Guest).
- *
- * This turns out to be harmless: the only trap which should happen under Linux
- * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
- * regions), which has to be reflected through the Host anyway.  If another
- * trap *does* go off when interrupts are disabled, the Guest will panic, and
- * we'll never get to this iret!
-:*/
-
-/*G:045
- * There is one final paravirt_op that the Guest implements, and glancing at it
- * you can see why I left it to last.  It's *cool*!  It's in *assembler*!
- *
- * The "iret" instruction is used to return from an interrupt or trap.  The
- * stack looks like this:
- *   old address
- *   old code segment & privilege level
- *   old processor flags ("eflags")
- *
- * The "iret" instruction pops those values off the stack and restores them all
- * at once.  The only problem is that eflags includes the Interrupt Flag which
- * the Guest can't change: the CPU will simply ignore it when we do an "iret".
- * So we have to copy eflags from the stack to lguest_data.irq_enabled before
- * we do the "iret".
- *
- * There are two problems with this: firstly, we can't clobber any registers
- * and secondly, the whole thing needs to be atomic.  The first problem
- * is solved by using "push memory"/"pop memory" instruction pair for copying.
- *
- * The second is harder: copying eflags to lguest_data.irq_enabled will turn
- * interrupts on before we're finished, so we could be interrupted before we
- * return to userspace or wherever.  Our solution to this is to tell the
- * Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled. (It's not necessary to protect the pop instruction, since
- * data gets updated only after it completes, so we only need to protect
- * one instruction, iret).
- */
-ENTRY(lguest_iret)
-       pushl   2*4(%esp)
-       /*
-        * Note the %ss: segment prefix here.  Normal data accesses use the
-        * "ds" segment, but that will have already been restored for whatever
-        * we're returning to (such as userspace): we can't trust it.  The %ss:
-        * prefix makes sure we use the stack segment, which is still valid.
-        */
-       popl    %ss:lguest_data+LGUEST_DATA_irq_enabled
-lguest_noirq_iret:
-       iret
diff --git a/drivers/Makefile b/drivers/Makefile

index dfdcda00bfe371c43e985337d30df27133d455c1..d90fdc413648c445d55c9727faf2d8fd15f251db 100644 (file)
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -125,7 +125,6 @@ obj-$(CONFIG_ACCESSIBILITY) += accessibility/
  obj-$(CONFIG_ISDN)             += isdn/
  obj-$(CONFIG_EDAC)             += edac/
  obj-$(CONFIG_EISA)             += eisa/
-obj-y                          += lguest/
  obj-$(CONFIG_CPU_FREQ)         += cpufreq/
  obj-$(CONFIG_CPU_IDLE)         += cpuidle/
  obj-y                          += mmc/
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig

index 8ddc98279c8f732b26c8b62541a0e1653dbba104..80aaf3420e129184c38b49040a6c6fdfdf02c0bf 100644 (file)
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -470,7 +470,7 @@ config VIRTIO_BLK
         depends on VIRTIO
         ---help---
           This is the virtual block driver for virtio.  It can be used with
-          lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
+          QEMU based VMMs (like KVM or Xen).  Say Y or M.
  
  config VIRTIO_BLK_SCSI
         bool "SCSI passthrough request for the Virtio block driver"
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig

index ccd239ab879ff326a1c7a85c2fd66fb8021d0f29..6237143446002767402ba8af1f30a968e297d8c0 100644 (file)
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -161,7 +161,7 @@ config VIRTIO_CONSOLE
         depends on VIRTIO && TTY
         select HVC_DRIVER
         help
-         Virtio console for use with lguest and other hypervisors.
+         Virtio console for use with hypervisors.
  
           Also serves as a general-purpose serial device for data
           transfer between the guest and host.  Character devices at
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c

index ad843eb02ae7be21ae7470c6cc840a30a3f6e272..4d229dde6522de2533d17d5aa9bfd0161c79dafa 100644 (file)
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -1130,7 +1130,7 @@ static const struct file_operations port_fops = {
   * We turn the characters into a scatter-gather list, add it to the
   * output queue and then kick the Host.  Then we sit here waiting for
   * it to finish: inefficient in theory, but in practice
- * implementations will do it immediately (lguest's Launcher does).
+ * implementations will do it immediately.
   */
  static int put_chars(u32 vtermno, const char *buf, int count)
  {
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig

deleted file mode 100644 (file)

index 169172d..0000000
--- a/drivers/lguest/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-config LGUEST
-       tristate "Linux hypervisor example code"
-       depends on X86_32 && EVENTFD && TTY && PCI_DIRECT
-       select HVC_DRIVER
-       ---help---
-         This is a very simple module which allows you to run
-         multiple instances of the same Linux kernel, using the
-         "lguest" command found in the tools/lguest directory.
-
-         Note that "lguest" is pronounced to rhyme with "fell quest",
-         not "rustyvisor". See tools/lguest/lguest.txt.
-
-         If unsure, say N.  If curious, say M.  If masochistic, say Y.
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile

deleted file mode 100644 (file)

index 16f52ee..0000000
--- a/drivers/lguest/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-# Host requires the other files, which can be a module.
-obj-$(CONFIG_LGUEST)   += lg.o
-lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
-       segments.o lguest_user.o
-
-lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o
-
-Preparation Preparation!: PREFIX=P
-Guest: PREFIX=G
-Drivers: PREFIX=D
-Launcher: PREFIX=L
-Host: PREFIX=H
-Switcher: PREFIX=S
-Mastery: PREFIX=M
-Beer:
-       @for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}"
-Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery:
-       @sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'`
-Puppy:
-       @clear
-       @printf "      __  \n (___()'\`;\n /,    /\`\n \\\\\\\"--\\\\\\   \n"
-       @sleep 2; clear; printf "\n\n   Sit!\n\n"; sleep 1; clear
-       @printf "    __    \n   ()'\`;  \n   /\\|\` \n  /  |  \n(/_)_|_   \n"
-       @sleep 2; clear; printf "\n\n  Stand!\n\n"; sleep 1; clear
-       @printf "    __    \n   ()'\`;  \n   /\\|\` \n  /._.= \n /| /     \n(_\_)_    \n"
-       @sleep 2; clear; printf "\n\n  Good puppy!\n\n"; sleep 1; clear
diff --git a/drivers/lguest/README b/drivers/lguest/README

deleted file mode 100644 (file)

index b7db39a..0000000
--- a/drivers/lguest/README
+++ /dev/null
@@ -1,47 +0,0 @@
-Welcome, friend reader, to lguest.
-
-Lguest is an adventure, with you, the reader, as Hero.  I can't think of many
-5000-line projects which offer both such capability and glimpses of future
-potential; it is an exciting time to be delving into the source!
-
-But be warned; this is an arduous journey of several hours or more!  And as we
-know, all true Heroes are driven by a Noble Goal.  Thus I offer a Beer (or
-equivalent) to anyone I meet who has completed this documentation.
-
-So get comfortable and keep your wits about you (both quick and humorous).
-Along your way to the Noble Goal, you will also gain masterly insight into
-lguest, and hypervisors and x86 virtualization in general.
-
-Our Quest is in seven parts: (best read with C highlighting turned on)
-
-I) Preparation
-       - In which our potential hero is flown quickly over the landscape for a
-         taste of its scope.  Suitable for the armchair coders and other such
-         persons of faint constitution.
-
-II) Guest
-       - Where we encounter the first tantalising wisps of code, and come to
-         understand the details of the life of a Guest kernel.
-
-III) Drivers
-       - Whereby the Guest finds its voice and becomes useful, and our
-         understanding of the Guest is completed.
-
-IV) Launcher
-       - Where we trace back to the creation of the Guest, and thus begin our
-         understanding of the Host.
-
-V) Host
-       - Where we master the Host code, through a long and tortuous journey.
-         Indeed, it is here that our hero is tested in the Bit of Despair.
-
-VI) Switcher
-       - Where our understanding of the intertwined nature of Guests and Hosts
-         is completed.
-
-VII) Mastery
-       - Where our fully fledged hero grapples with the Great Question:
-         "What next?"
-
-make Preparation!
-Rusty Russell.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c

deleted file mode 100644 (file)

index 395ed19..0000000
--- a/drivers/lguest/core.c
+++ /dev/null
@@ -1,398 +0,0 @@
-/*P:400
- * This contains run_guest() which actually calls into the Host<->Guest
- * Switcher and analyzes the return, such as determining if the Guest wants the
- * Host to do something.  This file also contains useful helper routines.
-:*/
-#include <linux/module.h>
-#include <linux/stringify.h>
-#include <linux/stddef.h>
-#include <linux/io.h>
-#include <linux/mm.h>
-#include <linux/sched/signal.h>
-#include <linux/vmalloc.h>
-#include <linux/cpu.h>
-#include <linux/freezer.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <asm/paravirt.h>
-#include <asm/pgtable.h>
-#include <linux/uaccess.h>
-#include <asm/poll.h>
-#include <asm/asm-offsets.h>
-#include "lg.h"
-
-unsigned long switcher_addr;
-struct page **lg_switcher_pages;
-static struct vm_struct *switcher_text_vma;
-static struct vm_struct *switcher_stacks_vma;
-
-/* This One Big lock protects all inter-guest data structures. */
-DEFINE_MUTEX(lguest_lock);
-
-/*H:010
- * We need to set up the Switcher at a high virtual address.  Remember the
- * Switcher is a few hundred bytes of assembler code which actually changes the
- * CPU to run the Guest, and then changes back to the Host when a trap or
- * interrupt happens.
- *
- * The Switcher code must be at the same virtual address in the Guest as the
- * Host since it will be running as the switchover occurs.
- *
- * Trying to map memory at a particular address is an unusual thing to do, so
- * it's not a simple one-liner.
- */
-static __init int map_switcher(void)
-{
-       int i, err;
-
-       /*
-        * Map the Switcher in to high memory.
-        *
-        * It turns out that if we choose the address 0xFFC00000 (4MB under the
-        * top virtual address), it makes setting up the page tables really
-        * easy.
-        */
-
-       /* We assume Switcher text fits into a single page. */
-       if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
-               printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
-                      end_switcher_text - start_switcher_text);
-               return -EINVAL;
-       }
-
-       /*
-        * We allocate an array of struct page pointers.  map_vm_area() wants
-        * this, rather than just an array of pages.
-        */
-       lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
-                                   * TOTAL_SWITCHER_PAGES,
-                                   GFP_KERNEL);
-       if (!lg_switcher_pages) {
-               err = -ENOMEM;
-               goto out;
-       }
-
-       /*
-        * Now we actually allocate the pages.  The Guest will see these pages,
-        * so we make sure they're zeroed.
-        */
-       for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
-               lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
-               if (!lg_switcher_pages[i]) {
-                       err = -ENOMEM;
-                       goto free_some_pages;
-               }
-       }
-
-       /*
-        * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
-        * It goes in the first page, which we map in momentarily.
-        */
-       memcpy(kmap(lg_switcher_pages[0]), start_switcher_text,
-              end_switcher_text - start_switcher_text);
-       kunmap(lg_switcher_pages[0]);
-
-       /*
-        * We place the Switcher underneath the fixmap area, which is the
-        * highest virtual address we can get.  This is important, since we
-        * tell the Guest it can't access this memory, so we want its ceiling
-        * as high as possible.
-        */
-       switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE;
-
-       /*
-        * Now we reserve the "virtual memory area"s we want.  We might
-        * not get them in theory, but in practice it's worked so far.
-        *
-        * We want the switcher text to be read-only and executable, and
-        * the stacks to be read-write and non-executable.
-        */
-       switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD,
-                                         switcher_addr,
-                                         switcher_addr + PAGE_SIZE);
-
-       if (!switcher_text_vma) {
-               err = -ENOMEM;
-               printk("lguest: could not map switcher pages high\n");
-               goto free_pages;
-       }
-
-       switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE,
-                                           VM_ALLOC|VM_NO_GUARD,
-                                           switcher_addr + PAGE_SIZE,
-                                           switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE);
-       if (!switcher_stacks_vma) {
-               err = -ENOMEM;
-               printk("lguest: could not map switcher pages high\n");
-               goto free_text_vma;
-       }
-
-       /*
-        * This code actually sets up the pages we've allocated to appear at
-        * switcher_addr.  map_vm_area() takes the vma we allocated above, the
-        * kind of pages we're mapping (kernel text pages and kernel writable
-        * pages respectively), and a pointer to our array of struct pages.
-        */
-       err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
-       if (err) {
-               printk("lguest: text map_vm_area failed: %i\n", err);
-               goto free_vmas;
-       }
-
-       err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
-                         lg_switcher_pages + SWITCHER_TEXT_PAGES);
-       if (err) {
-               printk("lguest: stacks map_vm_area failed: %i\n", err);
-               goto free_vmas;
-       }
-
-       /*
-        * Now the Switcher is mapped at the right address, we can't fail!
-        */
-       printk(KERN_INFO "lguest: mapped switcher at %p\n",
-              switcher_text_vma->addr);
-       /* And we succeeded... */
-       return 0;
-
-free_vmas:
-       /* Undoes map_vm_area and __get_vm_area */
-       vunmap(switcher_stacks_vma->addr);
-free_text_vma:
-       vunmap(switcher_text_vma->addr);
-free_pages:
-       i = TOTAL_SWITCHER_PAGES;
-free_some_pages:
-       for (--i; i >= 0; i--)
-               __free_pages(lg_switcher_pages[i], 0);
-       kfree(lg_switcher_pages);
-out:
-       return err;
-}
-/*:*/
-
-/* Cleaning up the mapping when the module is unloaded is almost... too easy. */
-static void unmap_switcher(void)
-{
-       unsigned int i;
-
-       /* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
-       vunmap(switcher_text_vma->addr);
-       vunmap(switcher_stacks_vma->addr);
-       /* Now we just need to free the pages we copied the switcher into */
-       for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
-               __free_pages(lg_switcher_pages[i], 0);
-       kfree(lg_switcher_pages);
-}
-
-/*H:032
- * Dealing With Guest Memory.
- *
- * Before we go too much further into the Host, we need to grok the routines
- * we use to deal with Guest memory.
- *
- * When the Guest gives us (what it thinks is) a physical address, we can use
- * the normal copy_from_user() & copy_to_user() on the corresponding place in
- * the memory region allocated by the Launcher.
- *
- * But we can't trust the Guest: it might be trying to access the Launcher
- * code.  We have to check that the range is below the pfn_limit the Launcher
- * gave us.  We have to make sure that addr + len doesn't give us a false
- * positive by overflowing, too.
- */
-bool lguest_address_ok(const struct lguest *lg,
-                      unsigned long addr, unsigned long len)
-{
-       return addr+len <= lg->pfn_limit * PAGE_SIZE && (addr+len >= addr);
-}
-
-/*
- * This routine copies memory from the Guest.  Here we can see how useful the
- * kill_lguest() routine we met in the Launcher can be: we return a random
- * value (all zeroes) instead of needing to return an error.
- */
-void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
-{
-       if (!lguest_address_ok(cpu->lg, addr, bytes)
-           || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
-               /* copy_from_user should do this, but as we rely on it... */
-               memset(b, 0, bytes);
-               kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
-       }
-}
-
-/* This is the write (copy into Guest) version. */
-void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
-              unsigned bytes)
-{
-       if (!lguest_address_ok(cpu->lg, addr, bytes)
-           || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
-               kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
-}
-/*:*/
-
-/*H:030
- * Let's jump straight to the the main loop which runs the Guest.
- * Remember, this is called by the Launcher reading /dev/lguest, and we keep
- * going around and around until something interesting happens.
- */
-int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
-{
-       /* If the launcher asked for a register with LHREQ_GETREG */
-       if (cpu->reg_read) {
-               if (put_user(*cpu->reg_read, user))
-                       return -EFAULT;
-               cpu->reg_read = NULL;
-               return sizeof(*cpu->reg_read);
-       }
-
-       /* We stop running once the Guest is dead. */
-       while (!cpu->lg->dead) {
-               unsigned int irq;
-               bool more;
-
-               /* First we run any hypercalls the Guest wants done. */
-               if (cpu->hcall)
-                       do_hypercalls(cpu);
-
-               /* Do we have to tell the Launcher about a trap? */
-               if (cpu->pending.trap) {
-                       if (copy_to_user(user, &cpu->pending,
-                                        sizeof(cpu->pending)))
-                               return -EFAULT;
-                       return sizeof(cpu->pending);
-               }
-
-               /*
-                * All long-lived kernel loops need to check with this horrible
-                * thing called the freezer.  If the Host is trying to suspend,
-                * it stops us.
-                */
-               try_to_freeze();
-
-               /* Check for signals */
-               if (signal_pending(current))
-                       return -ERESTARTSYS;
-
-               /*
-                * Check if there are any interrupts which can be delivered now:
-                * if so, this sets up the hander to be executed when we next
-                * run the Guest.
-                */
-               irq = interrupt_pending(cpu, &more);
-               if (irq < LGUEST_IRQS)
-                       try_deliver_interrupt(cpu, irq, more);
-
-               /*
-                * Just make absolutely sure the Guest is still alive.  One of
-                * those hypercalls could have been fatal, for example.
-                */
-               if (cpu->lg->dead)
-                       break;
-
-               /*
-                * If the Guest asked to be stopped, we sleep.  The Guest's
-                * clock timer will wake us.
-                */
-               if (cpu->halted) {
-                       set_current_state(TASK_INTERRUPTIBLE);
-                       /*
-                        * Just before we sleep, make sure no interrupt snuck in
-                        * which we should be doing.
-                        */
-                       if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
-                               set_current_state(TASK_RUNNING);
-                       else
-                               schedule();
-                       continue;
-               }
-
-               /*
-                * OK, now we're ready to jump into the Guest.  First we put up
-                * the "Do Not Disturb" sign:
-                */
-               local_irq_disable();
-
-               /* Actually run the Guest until something happens. */
-               lguest_arch_run_guest(cpu);
-
-               /* Now we're ready to be interrupted or moved to other CPUs */
-               local_irq_enable();
-
-               /* Now we deal with whatever happened to the Guest. */
-               lguest_arch_handle_trap(cpu);
-       }
-
-       /* Special case: Guest is 'dead' but wants a reboot. */
-       if (cpu->lg->dead == ERR_PTR(-ERESTART))
-               return -ERESTART;
-
-       /* The Guest is dead => "No such file or directory" */
-       return -ENOENT;
-}
-
-/*H:000
- * Welcome to the Host!
- *
- * By this point your brain has been tickled by the Guest code and numbed by
- * the Launcher code; prepare for it to be stretched by the Host code.  This is
- * the heart.  Let's begin at the initialization routine for the Host's lg
- * module.
- */
-static int __init init(void)
-{
-       int err;
-
-       /* Lguest can't run under Xen, VMI or itself.  It does Tricky Stuff. */
-       if (get_kernel_rpl() != 0) {
-               printk("lguest is afraid of being a guest\n");
-               return -EPERM;
-       }
-
-       /* First we put the Switcher up in very high virtual memory. */
-       err = map_switcher();
-       if (err)
-               goto out;
-
-       /* We might need to reserve an interrupt vector. */
-       err = init_interrupts();
-       if (err)
-               goto unmap;
-
-       /* /dev/lguest needs to be registered. */
-       err = lguest_device_init();
-       if (err)
-               goto free_interrupts;
-
-       /* Finally we do some architecture-specific setup. */
-       lguest_arch_host_init();
-
-       /* All good! */
-       return 0;
-
-free_interrupts:
-       free_interrupts();
-unmap:
-       unmap_switcher();
-out:
-       return err;
-}
-
-/* Cleaning up is just the same code, backwards.  With a little French. */
-static void __exit fini(void)
-{
-       lguest_device_remove();
-       free_interrupts();
-       unmap_switcher();
-
-       lguest_arch_host_fini();
-}
-/*:*/
-
-/*
- * The Host side of lguest can be a module.  This is a nice way for people to
- * play with it.
- */
-module_init(init);
-module_exit(fini);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c

deleted file mode 100644 (file)

index 601f81c..0000000
--- a/drivers/lguest/hypercalls.c
+++ /dev/null
@@ -1,304 +0,0 @@
-/*P:500
- * Just as userspace programs request kernel operations through a system
- * call, the Guest requests Host operations through a "hypercall".  You might
- * notice this nomenclature doesn't really follow any logic, but the name has
- * been around for long enough that we're stuck with it.  As you'd expect, this
- * code is basically a one big switch statement.
-:*/
-
-/*  Copyright (C) 2006 Rusty Russell IBM Corporation
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
-*/
-#include <linux/uaccess.h>
-#include <linux/syscalls.h>
-#include <linux/mm.h>
-#include <linux/ktime.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include "lg.h"
-
-/*H:120
- * This is the core hypercall routine: where the Guest gets what it wants.
- * Or gets killed.  Or, in the case of LHCALL_SHUTDOWN, both.
- */
-static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
-{
-       switch (args->arg0) {
-       case LHCALL_FLUSH_ASYNC:
-               /*
-                * This call does nothing, except by breaking out of the Guest
-                * it makes us process all the asynchronous hypercalls.
-                */
-               break;
-       case LHCALL_SEND_INTERRUPTS:
-               /*
-                * This call does nothing too, but by breaking out of the Guest
-                * it makes us process any pending interrupts.
-                */
-               break;
-       case LHCALL_LGUEST_INIT:
-               /*
-                * You can't get here unless you're already initialized.  Don't
-                * do that.
-                */
-               kill_guest(cpu, "already have lguest_data");
-               break;
-       case LHCALL_SHUTDOWN: {
-               char msg[128];
-               /*
-                * Shutdown is such a trivial hypercall that we do it in five
-                * lines right here.
-                *
-                * If the lgread fails, it will call kill_guest() itself; the
-                * kill_guest() with the message will be ignored.
-                */
-               __lgread(cpu, msg, args->arg1, sizeof(msg));
-               msg[sizeof(msg)-1] = '\0';
-               kill_guest(cpu, "CRASH: %s", msg);
-               if (args->arg2 == LGUEST_SHUTDOWN_RESTART)
-                       cpu->lg->dead = ERR_PTR(-ERESTART);
-               break;
-       }
-       case LHCALL_FLUSH_TLB:
-               /* FLUSH_TLB comes in two flavors, depending on the argument: */
-               if (args->arg1)
-                       guest_pagetable_clear_all(cpu);
-               else
-                       guest_pagetable_flush_user(cpu);
-               break;
-
-       /*
-        * All these calls simply pass the arguments through to the right
-        * routines.
-        */
-       case LHCALL_NEW_PGTABLE:
-               guest_new_pagetable(cpu, args->arg1);
-               break;
-       case LHCALL_SET_STACK:
-               guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
-               break;
-       case LHCALL_SET_PTE:
-#ifdef CONFIG_X86_PAE
-               guest_set_pte(cpu, args->arg1, args->arg2,
-                               __pte(args->arg3 | (u64)args->arg4 << 32));
-#else
-               guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
-#endif
-               break;
-       case LHCALL_SET_PGD:
-               guest_set_pgd(cpu->lg, args->arg1, args->arg2);
-               break;
-#ifdef CONFIG_X86_PAE
-       case LHCALL_SET_PMD:
-               guest_set_pmd(cpu->lg, args->arg1, args->arg2);
-               break;
-#endif
-       case LHCALL_SET_CLOCKEVENT:
-               guest_set_clockevent(cpu, args->arg1);
-               break;
-       case LHCALL_HALT:
-               /* Similarly, this sets the halted flag for run_guest(). */
-               cpu->halted = 1;
-               break;
-       default:
-               /* It should be an architecture-specific hypercall. */
-               if (lguest_arch_do_hcall(cpu, args))
-                       kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
-       }
-}
-
-/*H:124
- * Asynchronous hypercalls are easy: we just look in the array in the
- * Guest's "struct lguest_data" to see if any new ones are marked "ready".
- *
- * We are careful to do these in order: obviously we respect the order the
- * Guest put them in the ring, but we also promise the Guest that they will
- * happen before any normal hypercall (which is why we check this before
- * checking for a normal hcall).
- */
-static void do_async_hcalls(struct lg_cpu *cpu)
-{
-       unsigned int i;
-       u8 st[LHCALL_RING_SIZE];
-
-       /* For simplicity, we copy the entire call status array in at once. */
-       if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st)))
-               return;
-
-       /* We process "struct lguest_data"s hcalls[] ring once. */
-       for (i = 0; i < ARRAY_SIZE(st); i++) {
-               struct hcall_args args;
-               /*
-                * We remember where we were up to from last time.  This makes
-                * sure that the hypercalls are done in the order the Guest
-                * places them in the ring.
-                */
-               unsigned int n = cpu->next_hcall;
-
-               /* 0xFF means there's no call here (yet). */
-               if (st[n] == 0xFF)
-                       break;
-
-               /*
-                * OK, we have hypercall.  Increment the "next_hcall" cursor,
-                * and wrap back to 0 if we reach the end.
-                */
-               if (++cpu->next_hcall == LHCALL_RING_SIZE)
-                       cpu->next_hcall = 0;
-
-               /*
-                * Copy the hypercall arguments into a local copy of the
-                * hcall_args struct.
-                */
-               if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
-                                  sizeof(struct hcall_args))) {
-                       kill_guest(cpu, "Fetching async hypercalls");
-                       break;
-               }
-
-               /* Do the hypercall, same as a normal one. */
-               do_hcall(cpu, &args);
-
-               /* Mark the hypercall done. */
-               if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) {
-                       kill_guest(cpu, "Writing result for async hypercall");
-                       break;
-               }
-
-               /*
-                * Stop doing hypercalls if they want to notify the Launcher:
-                * it needs to service this first.
-                */
-               if (cpu->pending.trap)
-                       break;
-       }
-}
-
-/*
- * Last of all, we look at what happens first of all.  The very first time the
- * Guest makes a hypercall, we end up here to set things up:
- */
-static void initialize(struct lg_cpu *cpu)
-{
-       /*
-        * You can't do anything until you're initialized.  The Guest knows the
-        * rules, so we're unforgiving here.
-        */
-       if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
-               kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
-               return;
-       }
-
-       if (lguest_arch_init_hypercalls(cpu))
-               kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-
-       /*
-        * The Guest tells us where we're not to deliver interrupts by putting
-        * the instruction address into "struct lguest_data".
-        */
-       if (get_user(cpu->lg->noirq_iret, &cpu->lg->lguest_data->noirq_iret))
-               kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-
-       /*
-        * We write the current time into the Guest's data page once so it can
-        * set its clock.
-        */
-       write_timestamp(cpu);
-
-       /* page_tables.c will also do some setup. */
-       page_table_guest_data_init(cpu);
-
-       /*
-        * This is the one case where the above accesses might have been the
-        * first write to a Guest page.  This may have caused a copy-on-write
-        * fault, but the old page might be (read-only) in the Guest
-        * pagetable.
-        */
-       guest_pagetable_clear_all(cpu);
-}
-/*:*/
-
-/*M:013
- * If a Guest reads from a page (so creates a mapping) that it has never
- * written to, and then the Launcher writes to it (ie. the output of a virtual
- * device), the Guest will still see the old page.  In practice, this never
- * happens: why would the Guest read a page which it has never written to?  But
- * a similar scenario might one day bite us, so it's worth mentioning.
- *
- * Note that if we used a shared anonymous mapping in the Launcher instead of
- * mapping /dev/zero private, we wouldn't worry about cop-on-write.  And we
- * need that to switch the Launcher to processes (away from threads) anyway.
-:*/
-
-/*H:100
- * Hypercalls
- *
- * Remember from the Guest, hypercalls come in two flavors: normal and
- * asynchronous.  This file handles both of types.
- */
-void do_hypercalls(struct lg_cpu *cpu)
-{
-       /* Not initialized yet?  This hypercall must do it. */
-       if (unlikely(!cpu->lg->lguest_data)) {
-               /* Set up the "struct lguest_data" */
-               initialize(cpu);
-               /* Hcall is done. */
-               cpu->hcall = NULL;
-               return;
-       }
-
-       /*
-        * The Guest has initialized.
-        *
-        * Look in the hypercall ring for the async hypercalls:
-        */
-       do_async_hcalls(cpu);
-
-       /*
-        * If we stopped reading the hypercall ring because the Guest did a
-        * NOTIFY to the Launcher, we want to return now.  Otherwise we do
-        * the hypercall.
-        */
-       if (!cpu->pending.trap) {
-               do_hcall(cpu, cpu->hcall);
-               /*
-                * Tricky point: we reset the hcall pointer to mark the
-                * hypercall as "done".  We use the hcall pointer rather than
-                * the trap number to indicate a hypercall is pending.
-                * Normally it doesn't matter: the Guest will run again and
-                * update the trap number before we come back here.
-                *
-                * However, if we are signalled or the Guest sends I/O to the
-                * Launcher, the run_guest() loop will exit without running the
-                * Guest.  When it comes back it would try to re-run the
-                * hypercall.  Finding that bug sucked.
-                */
-               cpu->hcall = NULL;
-       }
-}
-
-/*
- * This routine supplies the Guest with time: it's used for wallclock time at
- * initial boot and as a rough time source if the TSC isn't available.
- */
-void write_timestamp(struct lg_cpu *cpu)
-{
-       struct timespec now;
-       ktime_get_real_ts(&now);
-       if (copy_to_user(&cpu->lg->lguest_data->time,
-                        &now, sizeof(struct timespec)))
-               kill_guest(cpu, "Writing timestamp");
-}
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c

deleted file mode 100644 (file)

index 67392b6..0000000
--- a/drivers/lguest/interrupts_and_traps.c
+++ /dev/null
@@ -1,706 +0,0 @@
-/*P:800
- * Interrupts (traps) are complicated enough to earn their own file.
- * There are three classes of interrupts:
- *
- * 1) Real hardware interrupts which occur while we're running the Guest,
- * 2) Interrupts for virtual devices attached to the Guest, and
- * 3) Traps and faults from the Guest.
- *
- * Real hardware interrupts must be delivered to the Host, not the Guest.
- * Virtual interrupts must be delivered to the Guest, but we make them look
- * just like real hardware would deliver them.  Traps from the Guest can be set
- * up to go directly back into the Guest, but sometimes the Host wants to see
- * them first, so we also have a way of "reflecting" them into the Guest as if
- * they had been delivered to it directly.
-:*/
-#include <linux/uaccess.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include "lg.h"
-
-/* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */
-static unsigned int syscall_vector = IA32_SYSCALL_VECTOR;
-module_param(syscall_vector, uint, 0444);
-
-/* The address of the interrupt handler is split into two bits: */
-static unsigned long idt_address(u32 lo, u32 hi)
-{
-       return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
-}
-
-/*
- * The "type" of the interrupt handler is a 4 bit field: we only support a
- * couple of types.
- */
-static int idt_type(u32 lo, u32 hi)
-{
-       return (hi >> 8) & 0xF;
-}
-
-/* An IDT entry can't be used unless the "present" bit is set. */
-static bool idt_present(u32 lo, u32 hi)
-{
-       return (hi & 0x8000);
-}
-
-/*
- * We need a helper to "push" a value onto the Guest's stack, since that's a
- * big part of what delivering an interrupt does.
- */
-static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
-{
-       /* Stack grows upwards: move stack then write value. */
-       *gstack -= 4;
-       lgwrite(cpu, *gstack, u32, val);
-}
-
-/*H:210
- * The push_guest_interrupt_stack() routine saves Guest state on the stack for
- * an interrupt or trap.  The mechanics of delivering traps and interrupts to
- * the Guest are the same, except some traps have an "error code" which gets
- * pushed onto the stack as well: the caller tells us if this is one.
- *
- * We set up the stack just like the CPU does for a real interrupt, so it's
- * identical for the Guest (and the standard "iret" instruction will undo
- * it).
- */
-static void push_guest_interrupt_stack(struct lg_cpu *cpu, bool has_err)
-{
-       unsigned long gstack, origstack;
-       u32 eflags, ss, irq_enable;
-       unsigned long virtstack;
-
-       /*
-        * There are two cases for interrupts: one where the Guest is already
-        * in the kernel, and a more complex one where the Guest is in
-        * userspace.  We check the privilege level to find out.
-        */
-       if ((cpu->regs->ss&0x3) != GUEST_PL) {
-               /*
-                * The Guest told us their kernel stack with the SET_STACK
-                * hypercall: both the virtual address and the segment.
-                */
-               virtstack = cpu->esp1;
-               ss = cpu->ss1;
-
-               origstack = gstack = guest_pa(cpu, virtstack);
-               /*
-                * We push the old stack segment and pointer onto the new
-                * stack: when the Guest does an "iret" back from the interrupt
-                * handler the CPU will notice they're dropping privilege
-                * levels and expect these here.
-                */
-               push_guest_stack(cpu, &gstack, cpu->regs->ss);
-               push_guest_stack(cpu, &gstack, cpu->regs->esp);
-       } else {
-               /* We're staying on the same Guest (kernel) stack. */
-               virtstack = cpu->regs->esp;
-               ss = cpu->regs->ss;
-
-               origstack = gstack = guest_pa(cpu, virtstack);
-       }
-
-       /*
-        * Remember that we never let the Guest actually disable interrupts, so
-        * the "Interrupt Flag" bit is always set.  We copy that bit from the
-        * Guest's "irq_enabled" field into the eflags word: we saw the Guest
-        * copy it back in "lguest_iret".
-        */
-       eflags = cpu->regs->eflags;
-       if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0
-           && !(irq_enable & X86_EFLAGS_IF))
-               eflags &= ~X86_EFLAGS_IF;
-
-       /*
-        * An interrupt is expected to push three things on the stack: the old
-        * "eflags" word, the old code segment, and the old instruction
-        * pointer.
-        */
-       push_guest_stack(cpu, &gstack, eflags);
-       push_guest_stack(cpu, &gstack, cpu->regs->cs);
-       push_guest_stack(cpu, &gstack, cpu->regs->eip);
-
-       /* For the six traps which supply an error code, we push that, too. */
-       if (has_err)
-               push_guest_stack(cpu, &gstack, cpu->regs->errcode);
-
-       /* Adjust the stack pointer and stack segment. */
-       cpu->regs->ss = ss;
-       cpu->regs->esp = virtstack + (gstack - origstack);
-}
-
-/*
- * This actually makes the Guest start executing the given interrupt/trap
- * handler.
- *
- * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this
- * interrupt or trap.  It's split into two parts for traditional reasons: gcc
- * on i386 used to be frightened by 64 bit numbers.
- */
-static void guest_run_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi)
-{
-       /* If we're already in the kernel, we don't change stacks. */
-       if ((cpu->regs->ss&0x3) != GUEST_PL)
-               cpu->regs->ss = cpu->esp1;
-
-       /*
-        * Set the code segment and the address to execute.
-        */
-       cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
-       cpu->regs->eip = idt_address(lo, hi);
-
-       /*
-        * Trapping always clears these flags:
-        * TF: Trap flag
-        * VM: Virtual 8086 mode
-        * RF: Resume
-        * NT: Nested task.
-        */
-       cpu->regs->eflags &=
-               ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
-
-       /*
-        * There are two kinds of interrupt handlers: 0xE is an "interrupt
-        * gate" which expects interrupts to be disabled on entry.
-        */
-       if (idt_type(lo, hi) == 0xE)
-               if (put_user(0, &cpu->lg->lguest_data->irq_enabled))
-                       kill_guest(cpu, "Disabling interrupts");
-}
-
-/* This restores the eflags word which was pushed on the stack by a trap */
-static void restore_eflags(struct lg_cpu *cpu)
-{
-       /* This is the physical address of the stack. */
-       unsigned long stack_pa = guest_pa(cpu, cpu->regs->esp);
-
-       /*
-        * Stack looks like this:
-        * Address      Contents
-        * esp          EIP
-        * esp + 4      CS
-        * esp + 8      EFLAGS
-        */
-       cpu->regs->eflags = lgread(cpu, stack_pa + 8, u32);
-       cpu->regs->eflags &=
-               ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
-}
-
-/*H:205
- * Virtual Interrupts.
- *
- * interrupt_pending() returns the first pending interrupt which isn't blocked
- * by the Guest.  It is called before every entry to the Guest, and just before
- * we go to sleep when the Guest has halted itself.
- */
-unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
-{
-       unsigned int irq;
-       DECLARE_BITMAP(blk, LGUEST_IRQS);
-
-       /* If the Guest hasn't even initialized yet, we can do nothing. */
-       if (!cpu->lg->lguest_data)
-               return LGUEST_IRQS;
-
-       /*
-        * Take our "irqs_pending" array and remove any interrupts the Guest
-        * wants blocked: the result ends up in "blk".
-        */
-       if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
-                          sizeof(blk)))
-               return LGUEST_IRQS;
-       bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
-
-       /* Find the first interrupt. */
-       irq = find_first_bit(blk, LGUEST_IRQS);
-       *more = find_next_bit(blk, LGUEST_IRQS, irq+1);
-
-       return irq;
-}
-
-/*
- * This actually diverts the Guest to running an interrupt handler, once an
- * interrupt has been identified by interrupt_pending().
- */
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
-{
-       struct desc_struct *idt;
-
-       BUG_ON(irq >= LGUEST_IRQS);
-
-       /* If they're halted, interrupts restart them. */
-       if (cpu->halted) {
-               /* Re-enable interrupts. */
-               if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled))
-                       kill_guest(cpu, "Re-enabling interrupts");
-               cpu->halted = 0;
-       } else {
-               /* Otherwise we check if they have interrupts disabled. */
-               u32 irq_enabled;
-               if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
-                       irq_enabled = 0;
-               if (!irq_enabled) {
-                       /* Make sure they know an IRQ is pending. */
-                       put_user(X86_EFLAGS_IF,
-                                &cpu->lg->lguest_data->irq_pending);
-                       return;
-               }
-       }
-
-       /*
-        * Look at the IDT entry the Guest gave us for this interrupt.  The
-        * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
-        * over them.
-        */
-       idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
-       /* If they don't have a handler (yet?), we just ignore it */
-       if (idt_present(idt->a, idt->b)) {
-               /* OK, mark it no longer pending and deliver it. */
-               clear_bit(irq, cpu->irqs_pending);
-
-               /*
-                * They may be about to iret, where they asked us never to
-                * deliver interrupts.  In this case, we can emulate that iret
-                * then immediately deliver the interrupt.  This is basically
-                * a noop: the iret would pop the interrupt frame and restore
-                * eflags, and then we'd set it up again.  So just restore the
-                * eflags word and jump straight to the handler in this case.
-                *
-                * Denys Vlasenko points out that this isn't quite right: if
-                * the iret was returning to userspace, then that interrupt
-                * would reset the stack pointer (which the Guest told us
-                * about via LHCALL_SET_STACK).  But unless the Guest is being
-                * *really* weird, that will be the same as the current stack
-                * anyway.
-                */
-               if (cpu->regs->eip == cpu->lg->noirq_iret) {
-                       restore_eflags(cpu);
-               } else {
-                       /*
-                        * set_guest_interrupt() takes a flag to say whether
-                        * this interrupt pushes an error code onto the stack
-                        * as well: virtual interrupts never do.
-                        */
-                       push_guest_interrupt_stack(cpu, false);
-               }
-               /* Actually make Guest cpu jump to handler. */
-               guest_run_interrupt(cpu, idt->a, idt->b);
-       }
-
-       /*
-        * Every time we deliver an interrupt, we update the timestamp in the
-        * Guest's lguest_data struct.  It would be better for the Guest if we
-        * did this more often, but it can actually be quite slow: doing it
-        * here is a compromise which means at least it gets updated every
-        * timer interrupt.
-        */
-       write_timestamp(cpu);
-
-       /*
-        * If there are no other interrupts we want to deliver, clear
-        * the pending flag.
-        */
-       if (!more)
-               put_user(0, &cpu->lg->lguest_data->irq_pending);
-}
-
-/* And this is the routine when we want to set an interrupt for the Guest. */
-void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
-{
-       /*
-        * Next time the Guest runs, the core code will see if it can deliver
-        * this interrupt.
-        */
-       set_bit(irq, cpu->irqs_pending);
-
-       /*
-        * Make sure it sees it; it might be asleep (eg. halted), or running
-        * the Guest right now, in which case kick_process() will knock it out.
-        */
-       if (!wake_up_process(cpu->tsk))
-               kick_process(cpu->tsk);
-}
-/*:*/
-
-/*
- * Linux uses trap 128 for system calls.  Plan9 uses 64, and Ron Minnich sent
- * me a patch, so we support that too.  It'd be a big step for lguest if half
- * the Plan 9 user base were to start using it.
- *
- * Actually now I think of it, it's possible that Ron *is* half the Plan 9
- * userbase.  Oh well.
- */
-bool could_be_syscall(unsigned int num)
-{
-       /* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */
-       return num == IA32_SYSCALL_VECTOR || num == syscall_vector;
-}
-
-/* The syscall vector it wants must be unused by Host. */
-bool check_syscall_vector(struct lguest *lg)
-{
-       u32 vector;
-
-       if (get_user(vector, &lg->lguest_data->syscall_vec))
-               return false;
-
-       return could_be_syscall(vector);
-}
-
-int init_interrupts(void)
-{
-       /* If they want some strange system call vector, reserve it now */
-       if (syscall_vector != IA32_SYSCALL_VECTOR) {
-               if (test_bit(syscall_vector, used_vectors) ||
-                   vector_used_by_percpu_irq(syscall_vector)) {
-                       printk(KERN_ERR "lg: couldn't reserve syscall %u\n",
-                                syscall_vector);
-                       return -EBUSY;
-               }
-               set_bit(syscall_vector, used_vectors);
-       }
-
-       return 0;
-}
-
-void free_interrupts(void)
-{
-       if (syscall_vector != IA32_SYSCALL_VECTOR)
-               clear_bit(syscall_vector, used_vectors);
-}
-
-/*H:220
- * Now we've got the routines to deliver interrupts, delivering traps like
- * page fault is easy.  The only trick is that Intel decided that some traps
- * should have error codes:
- */
-static bool has_err(unsigned int trap)
-{
-       return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
-}
-
-/* deliver_trap() returns true if it could deliver the trap. */
-bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
-{
-       /*
-        * Trap numbers are always 8 bit, but we set an impossible trap number
-        * for traps inside the Switcher, so check that here.
-        */
-       if (num >= ARRAY_SIZE(cpu->arch.idt))
-               return false;
-
-       /*
-        * Early on the Guest hasn't set the IDT entries (or maybe it put a
-        * bogus one in): if we fail here, the Guest will be killed.
-        */
-       if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
-               return false;
-       push_guest_interrupt_stack(cpu, has_err(num));
-       guest_run_interrupt(cpu, cpu->arch.idt[num].a,
-                           cpu->arch.idt[num].b);
-       return true;
-}
-
-/*H:250
- * Here's the hard part: returning to the Host every time a trap happens
- * and then calling deliver_trap() and re-entering the Guest is slow.
- * Particularly because Guest userspace system calls are traps (usually trap
- * 128).
- *
- * So we'd like to set up the IDT to tell the CPU to deliver traps directly
- * into the Guest.  This is possible, but the complexities cause the size of
- * this file to double!  However, 150 lines of code is worth writing for taking
- * system calls down from 1750ns to 270ns.  Plus, if lguest didn't do it, all
- * the other hypervisors would beat it up at lunchtime.
- *
- * This routine indicates if a particular trap number could be delivered
- * directly.
- *
- * Unfortunately, Linux 4.6 started using an interrupt gate instead of a
- * trap gate for syscalls, so this trick is ineffective.  See Mastery for
- * how we could do this anyway...
- */
-static bool direct_trap(unsigned int num)
-{
-       /*
-        * Hardware interrupts don't go to the Guest at all (except system
-        * call).
-        */
-       if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num))
-               return false;
-
-       /*
-        * The Host needs to see page faults (for shadow paging and to save the
-        * fault address), general protection faults (in/out emulation) and
-        * device not available (TS handling) and of course, the hypercall trap.
-        */
-       return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
-}
-/*:*/
-
-/*M:005
- * The Guest has the ability to turn its interrupt gates into trap gates,
- * if it is careful.  The Host will let trap gates can go directly to the
- * Guest, but the Guest needs the interrupts atomically disabled for an
- * interrupt gate.  The Host could provide a mechanism to register more
- * "no-interrupt" regions, and the Guest could point the trap gate at
- * instructions within that region, where it can safely disable interrupts.
- */
-
-/*M:006
- * The Guests do not use the sysenter (fast system call) instruction,
- * because it's hardcoded to enter privilege level 0 and so can't go direct.
- * It's about twice as fast as the older "int 0x80" system call, so it might
- * still be worthwhile to handle it in the Switcher and lcall down to the
- * Guest.  The sysenter semantics are hairy tho: search for that keyword in
- * entry.S
-:*/
-
-/*H:260
- * When we make traps go directly into the Guest, we need to make sure
- * the kernel stack is valid (ie. mapped in the page tables).  Otherwise, the
- * CPU trying to deliver the trap will fault while trying to push the interrupt
- * words on the stack: this is called a double fault, and it forces us to kill
- * the Guest.
- *
- * Which is deeply unfair, because (literally!) it wasn't the Guests' fault.
- */
-void pin_stack_pages(struct lg_cpu *cpu)
-{
-       unsigned int i;
-
-       /*
-        * Depending on the CONFIG_4KSTACKS option, the Guest can have one or
-        * two pages of stack space.
-        */
-       for (i = 0; i < cpu->lg->stack_pages; i++)
-               /*
-                * The stack grows *upwards*, so the address we're given is the
-                * start of the page after the kernel stack.  Subtract one to
-                * get back onto the first stack page, and keep subtracting to
-                * get to the rest of the stack pages.
-                */
-               pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE);
-}
-
-/*
- * Direct traps also mean that we need to know whenever the Guest wants to use
- * a different kernel stack, so we can change the guest TSS to use that
- * stack.  The TSS entries expect a virtual address, so unlike most addresses
- * the Guest gives us, the "esp" (stack pointer) value here is virtual, not
- * physical.
- *
- * In Linux each process has its own kernel stack, so this happens a lot: we
- * change stacks on each context switch.
- */
-void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
-{
-       /*
-        * You're not allowed a stack segment with privilege level 0: bad Guest!
-        */
-       if ((seg & 0x3) != GUEST_PL)
-               kill_guest(cpu, "bad stack segment %i", seg);
-       /* We only expect one or two stack pages. */
-       if (pages > 2)
-               kill_guest(cpu, "bad stack pages %u", pages);
-       /* Save where the stack is, and how many pages */
-       cpu->ss1 = seg;
-       cpu->esp1 = esp;
-       cpu->lg->stack_pages = pages;
-       /* Make sure the new stack pages are mapped */
-       pin_stack_pages(cpu);
-}
-
-/*
- * All this reference to mapping stacks leads us neatly into the other complex
- * part of the Host: page table handling.
- */
-
-/*H:235
- * This is the routine which actually checks the Guest's IDT entry and
- * transfers it into the entry in "struct lguest":
- */
-static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
-                    unsigned int num, u32 lo, u32 hi)
-{
-       u8 type = idt_type(lo, hi);
-
-       /* We zero-out a not-present entry */
-       if (!idt_present(lo, hi)) {
-               trap->a = trap->b = 0;
-               return;
-       }
-
-       /* We only support interrupt and trap gates. */
-       if (type != 0xE && type != 0xF)
-               kill_guest(cpu, "bad IDT type %i", type);
-
-       /*
-        * We only copy the handler address, present bit, privilege level and
-        * type.  The privilege level controls where the trap can be triggered
-        * manually with an "int" instruction.  This is usually GUEST_PL,
-        * except for system calls which userspace can use.
-        */
-       trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
-       trap->b = (hi&0xFFFFEF00);
-}
-
-/*H:230
- * While we're here, dealing with delivering traps and interrupts to the
- * Guest, we might as well complete the picture: how the Guest tells us where
- * it wants them to go.  This would be simple, except making traps fast
- * requires some tricks.
- *
- * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
- * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here.
- */
-void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
-{
-       /*
-        * Guest never handles: NMI, doublefault, spurious interrupt or
-        * hypercall.  We ignore when it tries to set them.
-        */
-       if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
-               return;
-
-       /*
-        * Mark the IDT as changed: next time the Guest runs we'll know we have
-        * to copy this again.
-        */
-       cpu->changed |= CHANGED_IDT;
-
-       /* Check that the Guest doesn't try to step outside the bounds. */
-       if (num >= ARRAY_SIZE(cpu->arch.idt))
-               kill_guest(cpu, "Setting idt entry %u", num);
-       else
-               set_trap(cpu, &cpu->arch.idt[num], num, lo, hi);
-}
-
-/*
- * The default entry for each interrupt points into the Switcher routines which
- * simply return to the Host.  The run_guest() loop will then call
- * deliver_trap() to bounce it back into the Guest.
- */
-static void default_idt_entry(struct desc_struct *idt,
-                             int trap,
-                             const unsigned long handler,
-                             const struct desc_struct *base)
-{
-       /* A present interrupt gate. */
-       u32 flags = 0x8e00;
-
-       /*
-        * Set the privilege level on the entry for the hypercall: this allows
-        * the Guest to use the "int" instruction to trigger it.
-        */
-       if (trap == LGUEST_TRAP_ENTRY)
-               flags |= (GUEST_PL << 13);
-       else if (base)
-               /*
-                * Copy privilege level from what Guest asked for.  This allows
-                * debug (int 3) traps from Guest userspace, for example.
-                */
-               flags |= (base->b & 0x6000);
-
-       /* Now pack it into the IDT entry in its weird format. */
-       idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
-       idt->b = (handler&0xFFFF0000) | flags;
-}
-
-/* When the Guest first starts, we put default entries into the IDT. */
-void setup_default_idt_entries(struct lguest_ro_state *state,
-                              const unsigned long *def)
-{
-       unsigned int i;
-
-       for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
-               default_idt_entry(&state->guest_idt[i], i, def[i], NULL);
-}
-
-/*H:240
- * We don't use the IDT entries in the "struct lguest" directly, instead
- * we copy them into the IDT which we've set up for Guests on this CPU, just
- * before we run the Guest.  This routine does that copy.
- */
-void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
-               const unsigned long *def)
-{
-       unsigned int i;
-
-       /*
-        * We can simply copy the direct traps, otherwise we use the default
-        * ones in the Switcher: they will return to the Host.
-        */
-       for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) {
-               const struct desc_struct *gidt = &cpu->arch.idt[i];
-
-               /* If no Guest can ever override this trap, leave it alone. */
-               if (!direct_trap(i))
-                       continue;
-
-               /*
-                * Only trap gates (type 15) can go direct to the Guest.
-                * Interrupt gates (type 14) disable interrupts as they are
-                * entered, which we never let the Guest do.  Not present
-                * entries (type 0x0) also can't go direct, of course.
-                *
-                * If it can't go direct, we still need to copy the priv. level:
-                * they might want to give userspace access to a software
-                * interrupt.
-                */
-               if (idt_type(gidt->a, gidt->b) == 0xF)
-                       idt[i] = *gidt;
-               else
-                       default_idt_entry(&idt[i], i, def[i], gidt);
-       }
-}
-
-/*H:200
- * The Guest Clock.
- *
- * There are two sources of virtual interrupts.  We saw one in lguest_user.c:
- * the Launcher sending interrupts for virtual devices.  The other is the Guest
- * timer interrupt.
- *
- * The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to
- * the next timer interrupt (in nanoseconds).  We use the high-resolution timer
- * infrastructure to set a callback at that time.
- *
- * 0 means "turn off the clock".
- */
-void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
-{
-       ktime_t expires;
-
-       if (unlikely(delta == 0)) {
-               /* Clock event device is shutting down. */
-               hrtimer_cancel(&cpu->hrt);
-               return;
-       }
-
-       /*
-        * We use wallclock time here, so the Guest might not be running for
-        * all the time between now and the timer interrupt it asked for.  This
-        * is almost always the right thing to do.
-        */
-       expires = ktime_add_ns(ktime_get_real(), delta);
-       hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS);
-}
-
-/* This is the function called when the Guest's timer expires. */
-static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
-{
-       struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
-
-       /* Remember the first interrupt is the timer interrupt. */
-       set_interrupt(cpu, 0);
-       return HRTIMER_NORESTART;
-}
-
-/* This sets up the timer for this Guest. */
-void init_clockdev(struct lg_cpu *cpu)
-{
-       hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
-       cpu->hrt.function = clockdev_fn;
-}
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h

deleted file mode 100644 (file)

index 2356a23..0000000
--- a/drivers/lguest/lg.h
+++ /dev/null
@@ -1,258 +0,0 @@
-#ifndef _LGUEST_H
-#define _LGUEST_H
-
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/stringify.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/wait.h>
-#include <linux/hrtimer.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-
-#include <asm/lguest.h>
-
-struct pgdir {
-       unsigned long gpgdir;
-       bool switcher_mapped;
-       int last_host_cpu;
-       pgd_t *pgdir;
-};
-
-/* We have two pages shared with guests, per cpu.  */
-struct lguest_pages {
-       /* This is the stack page mapped rw in guest */
-       char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
-       struct lguest_regs regs;
-
-       /* This is the host state & guest descriptor page, ro in guest */
-       struct lguest_ro_state state;
-} __attribute__((aligned(PAGE_SIZE)));
-
-#define CHANGED_IDT            1
-#define CHANGED_GDT            2
-#define CHANGED_GDT_TLS                4 /* Actually a subset of CHANGED_GDT */
-#define CHANGED_ALL            3
-
-struct lg_cpu {
-       unsigned int id;
-       struct lguest *lg;
-       struct task_struct *tsk;
-       struct mm_struct *mm;   /* == tsk->mm, but that becomes NULL on exit */
-
-       u32 cr2;
-       u32 esp1;
-       u16 ss1;
-
-       /* Bitmap of what has changed: see CHANGED_* above. */
-       int changed;
-
-       /* Pending operation. */
-       struct lguest_pending pending;
-
-       unsigned long *reg_read; /* register from LHREQ_GETREG */
-
-       /* At end of a page shared mapped over lguest_pages in guest. */
-       unsigned long regs_page;
-       struct lguest_regs *regs;
-
-       struct lguest_pages *last_pages;
-
-       /* Initialization mode: linear map everything. */
-       bool linear_pages;
-       int cpu_pgd; /* Which pgd this cpu is currently using */
-
-       /* If a hypercall was asked for, this points to the arguments. */
-       struct hcall_args *hcall;
-       u32 next_hcall;
-
-       /* Virtual clock device */
-       struct hrtimer hrt;
-
-       /* Did the Guest tell us to halt? */
-       int halted;
-
-       /* Pending virtual interrupts */
-       DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
-
-       struct lg_cpu_arch arch;
-};
-
-/* The private info the thread maintains about the guest. */
-struct lguest {
-       struct lguest_data __user *lguest_data;
-       struct lg_cpu cpus[NR_CPUS];
-       unsigned int nr_cpus;
-
-       /* Valid guest memory pages must be < this. */
-       u32 pfn_limit;
-
-       /* Device memory is >= pfn_limit and < device_limit. */
-       u32 device_limit;
-
-       /*
-        * This provides the offset to the base of guest-physical memory in the
-        * Launcher.
-        */
-       void __user *mem_base;
-       unsigned long kernel_address;
-
-       struct pgdir pgdirs[4];
-
-       unsigned long noirq_iret;
-
-       unsigned int stack_pages;
-       u32 tsc_khz;
-
-       /* Dead? */
-       const char *dead;
-};
-
-extern struct mutex lguest_lock;
-
-/* core.c: */
-bool lguest_address_ok(const struct lguest *lg,
-                      unsigned long addr, unsigned long len);
-void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
-void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
-extern struct page **lg_switcher_pages;
-
-/*H:035
- * Using memory-copy operations like that is usually inconvient, so we
- * have the following helper macros which read and write a specific type (often
- * an unsigned long).
- *
- * This reads into a variable of the given type then returns that.
- */
-#define lgread(cpu, addr, type)                                                \
-       ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
-
-/* This checks that the variable is of the given type, then writes it out. */
-#define lgwrite(cpu, addr, type, val)                          \
-       do {                                                    \
-               typecheck(type, val);                           \
-               __lgwrite((cpu), (addr), &(val), sizeof(val));  \
-       } while(0)
-/* (end of memory access helper routines) :*/
-
-int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
-
-/*
- * Helper macros to obtain the first 12 or the last 20 bits, this is only the
- * first step in the migration to the kernel types.  pte_pfn is already defined
- * in the kernel.
- */
-#define pgd_flags(x)   (pgd_val(x) & ~PAGE_MASK)
-#define pgd_pfn(x)     (pgd_val(x) >> PAGE_SHIFT)
-#define pmd_flags(x)    (pmd_val(x) & ~PAGE_MASK)
-#define pmd_pfn(x)     (pmd_val(x) >> PAGE_SHIFT)
-
-/* interrupts_and_traps.c: */
-unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
-void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
-bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
-void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
-                         u32 low, u32 hi);
-void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
-void pin_stack_pages(struct lg_cpu *cpu);
-void setup_default_idt_entries(struct lguest_ro_state *state,
-                              const unsigned long *def);
-void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
-               const unsigned long *def);
-void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
-bool send_notify_to_eventfd(struct lg_cpu *cpu);
-void init_clockdev(struct lg_cpu *cpu);
-bool check_syscall_vector(struct lguest *lg);
-bool could_be_syscall(unsigned int num);
-int init_interrupts(void);
-void free_interrupts(void);
-
-/* segments.c: */
-void setup_default_gdt_entries(struct lguest_ro_state *state);
-void setup_guest_gdt(struct lg_cpu *cpu);
-void load_guest_gdt_entry(struct lg_cpu *cpu, unsigned int i,
-                         u32 low, u32 hi);
-void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
-void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
-void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
-
-/* page_tables.c: */
-int init_guest_pagetable(struct lguest *lg);
-void free_guest_pagetable(struct lguest *lg);
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
-void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
-#ifdef CONFIG_X86_PAE
-void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
-#endif
-void guest_pagetable_clear_all(struct lg_cpu *cpu);
-void guest_pagetable_flush_user(struct lg_cpu *cpu);
-void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
-                  unsigned long vaddr, pte_t val);
-void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
-bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode,
-                unsigned long *iomem);
-void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
-bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr);
-unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
-void page_table_guest_data_init(struct lg_cpu *cpu);
-
-/* <arch>/core.c: */
-void lguest_arch_host_init(void);
-void lguest_arch_host_fini(void);
-void lguest_arch_run_guest(struct lg_cpu *cpu);
-void lguest_arch_handle_trap(struct lg_cpu *cpu);
-int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
-int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
-void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
-unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any);
-
-/* <arch>/switcher.S: */
-extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
-
-/* lguest_user.c: */
-int lguest_device_init(void);
-void lguest_device_remove(void);
-
-/* hypercalls.c: */
-void do_hypercalls(struct lg_cpu *cpu);
-void write_timestamp(struct lg_cpu *cpu);
-
-/*L:035
- * Let's step aside for the moment, to study one important routine that's used
- * widely in the Host code.
- *
- * There are many cases where the Guest can do something invalid, like pass crap
- * to a hypercall.  Since only the Guest kernel can make hypercalls, it's quite
- * acceptable to simply terminate the Guest and give the Launcher a nicely
- * formatted reason.  It's also simpler for the Guest itself, which doesn't
- * need to check most hypercalls for "success"; if you're still running, it
- * succeeded.
- *
- * Once this is called, the Guest will never run again, so most Host code can
- * call this then continue as if nothing had happened.  This means many
- * functions don't have to explicitly return an error code, which keeps the
- * code simple.
- *
- * It also means that this can be called more than once: only the first one is
- * remembered.  The only trick is that we still need to kill the Guest even if
- * we can't allocate memory to store the reason.  Linux has a neat way of
- * packing error codes into invalid pointers, so we use that here.
- *
- * Like any macro which uses an "if", it is safely wrapped in a run-once "do {
- * } while(0)".
- */
-#define kill_guest(cpu, fmt...)                                        \
-do {                                                           \
-       if (!(cpu)->lg->dead) {                                 \
-               (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt);   \
-               if (!(cpu)->lg->dead)                           \
-                       (cpu)->lg->dead = ERR_PTR(-ENOMEM);     \
-       }                                                       \
-} while(0)
-/* (End of aside) :*/
-
-#endif /* __ASSEMBLY__ */
-#endif /* _LGUEST_H */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c

deleted file mode 100644 (file)

index 1a6787b..0000000
--- a/drivers/lguest/lguest_user.c
+++ /dev/null
@@ -1,446 +0,0 @@
-/*P:200 This contains all the /dev/lguest code, whereby the userspace
- * launcher controls and communicates with the Guest.  For example,
- * the first write will tell us the Guest's memory layout and entry
- * point.  A read will run the Guest until something happens, such as
- * a signal or the Guest accessing a device.
-:*/
-#include <linux/uaccess.h>
-#include <linux/miscdevice.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/sched/mm.h>
-#include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include "lg.h"
-
-/*L:052
-  The Launcher can get the registers, and also set some of them.
-*/
-static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-       unsigned long which;
-
-       /* We re-use the ptrace structure to specify which register to read. */
-       if (get_user(which, input) != 0)
-               return -EFAULT;
-
-       /*
-        * We set up the cpu register pointer, and their next read will
-        * actually get the value (instead of running the guest).
-        *
-        * The last argument 'true' says we can access any register.
-        */
-       cpu->reg_read = lguest_arch_regptr(cpu, which, true);
-       if (!cpu->reg_read)
-               return -ENOENT;
-
-       /* And because this is a write() call, we return the length used. */
-       return sizeof(unsigned long) * 2;
-}
-
-static int setreg(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-       unsigned long which, value, *reg;
-
-       /* We re-use the ptrace structure to specify which register to read. */
-       if (get_user(which, input) != 0)
-               return -EFAULT;
-       input++;
-       if (get_user(value, input) != 0)
-               return -EFAULT;
-
-       /* The last argument 'false' means we can't access all registers. */
-       reg = lguest_arch_regptr(cpu, which, false);
-       if (!reg)
-               return -ENOENT;
-
-       *reg = value;
-
-       /* And because this is a write() call, we return the length used. */
-       return sizeof(unsigned long) * 3;
-}
-
-/*L:050
- * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
- * number to /dev/lguest.
- */
-static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-       unsigned long irq;
-
-       if (get_user(irq, input) != 0)
-               return -EFAULT;
-       if (irq >= LGUEST_IRQS)
-               return -EINVAL;
-
-       /*
-        * Next time the Guest runs, the core code will see if it can deliver
-        * this interrupt.
-        */
-       set_interrupt(cpu, irq);
-       return 0;
-}
-
-/*L:053
- * Deliver a trap: this is used by the Launcher if it can't emulate
- * an instruction.
- */
-static int trap(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-       unsigned long trapnum;
-
-       if (get_user(trapnum, input) != 0)
-               return -EFAULT;
-
-       if (!deliver_trap(cpu, trapnum))
-               return -EINVAL;
-
-       return 0;
-}
-
-/*L:040
- * Once our Guest is initialized, the Launcher makes it run by reading
- * from /dev/lguest.
- */
-static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
-{
-       struct lguest *lg = file->private_data;
-       struct lg_cpu *cpu;
-       unsigned int cpu_id = *o;
-
-       /* You must write LHREQ_INITIALIZE first! */
-       if (!lg)
-               return -EINVAL;
-
-       /* Watch out for arbitrary vcpu indexes! */
-       if (cpu_id >= lg->nr_cpus)
-               return -EINVAL;
-
-       cpu = &lg->cpus[cpu_id];
-
-       /* If you're not the task which owns the Guest, go away. */
-       if (current != cpu->tsk)
-               return -EPERM;
-
-       /* If the Guest is already dead, we indicate why */
-       if (lg->dead) {
-               size_t len;
-
-               /* lg->dead either contains an error code, or a string. */
-               if (IS_ERR(lg->dead))
-                       return PTR_ERR(lg->dead);
-
-               /* We can only return as much as the buffer they read with. */
-               len = min(size, strlen(lg->dead)+1);
-               if (copy_to_user(user, lg->dead, len) != 0)
-                       return -EFAULT;
-               return len;
-       }
-
-       /*
-        * If we returned from read() last time because the Guest sent I/O,
-        * clear the flag.
-        */
-       if (cpu->pending.trap)
-               cpu->pending.trap = 0;
-
-       /* Run the Guest until something interesting happens. */
-       return run_guest(cpu, (unsigned long __user *)user);
-}
-
-/*L:025
- * This actually initializes a CPU.  For the moment, a Guest is only
- * uniprocessor, so "id" is always 0.
- */
-static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
-{
-       /* We have a limited number of CPUs in the lguest struct. */
-       if (id >= ARRAY_SIZE(cpu->lg->cpus))
-               return -EINVAL;
-
-       /* Set up this CPU's id, and pointer back to the lguest struct. */
-       cpu->id = id;
-       cpu->lg = container_of(cpu, struct lguest, cpus[id]);
-       cpu->lg->nr_cpus++;
-
-       /* Each CPU has a timer it can set. */
-       init_clockdev(cpu);
-
-       /*
-        * We need a complete page for the Guest registers: they are accessible
-        * to the Guest and we can only grant it access to whole pages.
-        */
-       cpu->regs_page = get_zeroed_page(GFP_KERNEL);
-       if (!cpu->regs_page)
-               return -ENOMEM;
-
-       /* We actually put the registers at the end of the page. */
-       cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
-
-       /*
-        * Now we initialize the Guest's registers, handing it the start
-        * address.
-        */
-       lguest_arch_setup_regs(cpu, start_ip);
-
-       /*
-        * We keep a pointer to the Launcher task (ie. current task) for when
-        * other Guests want to wake this one (eg. console input).
-        */
-       cpu->tsk = current;
-
-       /*
-        * We need to keep a pointer to the Launcher's memory map, because if
-        * the Launcher dies we need to clean it up.  If we don't keep a
-        * reference, it is destroyed before close() is called.
-        */
-       cpu->mm = get_task_mm(cpu->tsk);
-
-       /*
-        * We remember which CPU's pages this Guest used last, for optimization
-        * when the same Guest runs on the same CPU twice.
-        */
-       cpu->last_pages = NULL;
-
-       /* No error == success. */
-       return 0;
-}
-
-/*L:020
- * The initialization write supplies 4 pointer sized (32 or 64 bit) values (in
- * addition to the LHREQ_INITIALIZE value).  These are, in order:
- *
- * base: The start of the Guest-physical memory inside the Launcher memory.
- *
- * pfnlimit: The highest (Guest-physical) page number the Guest should be
- * allowed to access.  The Guest memory lives inside the Launcher, so it sets
- * this to ensure the Guest can only reach its own memory.
- *
- * start: The first instruction to execute ("eip" in x86-speak).
- *
- * devicelimit: Guest page numbers from pfnlimit up to (but not including)
- * this value are treated as I/O memory rather than ordinary Guest memory.
- *
- * Returns sizeof(args) on success (this is a write() call, so that is the
- * length consumed), or a negative errno: -EBUSY if this file already has a
- * Guest, -EFAULT if the arguments can't be copied in, -ENOMEM if we can't
- * allocate, or whatever lg_cpu_start() / init_guest_pagetable() returned.
- */
-static int initialize(struct file *file, const unsigned long __user *input)
-{
-       /* "struct lguest" contains all we (the Host) know about a Guest. */
-       struct lguest *lg;
-       int err;
-       unsigned long args[4];
-
-       /*
-        * We grab the Big Lguest lock, which protects against multiple
-        * simultaneous initializations.
-        */
-       mutex_lock(&lguest_lock);
-       /* You can't initialize twice!  Close the device and start again... */
-       if (file->private_data) {
-               err = -EBUSY;
-               goto unlock;
-       }
-
-       if (copy_from_user(args, input, sizeof(args)) != 0) {
-               err = -EFAULT;
-               goto unlock;
-       }
-
-       lg = kzalloc(sizeof(*lg), GFP_KERNEL);
-       if (!lg) {
-               err = -ENOMEM;
-               goto unlock;
-       }
-
-       /* Populate the easy fields of our "struct lguest" */
-       lg->mem_base = (void __user *)args[0];
-       lg->pfn_limit = args[1];
-       lg->device_limit = args[3];
-
-       /* This is the first cpu (cpu 0) and it will start booting at args[2] */
-       err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
-       if (err)
-               goto free_lg;
-
-       /*
-        * Initialize the Guest's shadow page tables.  This allocates
-        * memory, so can fail.
-        */
-       err = init_guest_pagetable(lg);
-       if (err)
-               goto free_regs;
-
-       /* We keep our "struct lguest" in the file's private_data. */
-       file->private_data = lg;
-
-       mutex_unlock(&lguest_lock);
-
-       /* And because this is a write() call, we return the length used. */
-       return sizeof(args);
-
-free_regs:
-       /* FIXME: This should be in free_vcpu */
-       free_page(lg->cpus[0].regs_page);
-       /*
-        * lg_cpu_start() succeeded, so it took a reference on the Launcher's
-        * mm with get_task_mm().  close() will never see this Guest (we never
-        * stored it in private_data), so we must drop that reference here or
-        * it is leaked.  The NULL check is purely defensive.
-        */
-       if (lg->cpus[0].mm)
-               mmput(lg->cpus[0].mm);
-free_lg:
-       kfree(lg);
-unlock:
-       mutex_unlock(&lguest_lock);
-       return err;
-}
-
-/*L:010
- * The first operation the Launcher does must be a write.  All writes
- * start with an unsigned long number: for the first write this must be
- * LHREQ_INITIALIZE to set up the Guest.  After that the Launcher can use
- * writes of other values to send interrupts or set up receipt of notifications.
- *
- * Note that we overload the "offset" in the /dev/lguest file to indicate what
- * CPU number we're dealing with.  Currently this is always 0 since we only
- * support uniprocessor Guests, but you can see the beginnings of SMP support
- * here.
- */
-static ssize_t write(struct file *file, const char __user *in,
-                    size_t size, loff_t *off)
-{
-       /* Set by a successful LHREQ_INITIALIZE; NULL until then. */
-       struct lguest *lg = file->private_data;
-       const unsigned long __user *input = (const unsigned long __user *)in;
-       /* The file offset doubles as the CPU number. */
-       unsigned int cpu_id = *off;
-       unsigned long req;
-       struct lg_cpu *cpu;
-
-       /* Every request begins with a word saying what it is. */
-       if (get_user(req, input) != 0)
-               return -EFAULT;
-       input++;
-
-       /* Initialization is the one request which needs no existing Guest. */
-       if (req == LHREQ_INITIALIZE)
-               return initialize(file, input);
-
-       /* Everything else needs an initialized Guest and a valid CPU. */
-       if (!lg || (cpu_id >= lg->nr_cpus))
-               return -EINVAL;
-       cpu = &lg->cpus[cpu_id];
-
-       /* Once the Guest is dead, you can only read() why it died. */
-       if (lg->dead)
-               return -ENOENT;
-
-       switch (req) {
-       case LHREQ_IRQ:
-               return user_send_irq(cpu, input);
-       case LHREQ_GETREG:
-               return getreg_setup(cpu, input);
-       case LHREQ_SETREG:
-               return setreg(cpu, input);
-       case LHREQ_TRAP:
-               return trap(cpu, input);
-       default:
-               return -EINVAL;
-       }
-}
-
-/*
- * Opening /dev/lguest creates no Guest: we just make sure private_data starts
- * out NULL, which is how write() and close() recognize a file that has not
- * been initialized yet.  Always returns 0.
- */
-static int open(struct inode *inode, struct file *file)
-{
-       file->private_data = NULL;
-
-       return 0;
-}
-
-/*L:060
- * The final piece of interface code is the close() routine.  It reverses
- * everything done in initialize().  This is usually called because the
- * Launcher exited.
- *
- * Note that the close routine returns 0 or a negative error number: it can't
- * really fail, but it can whine.  I blame Sun for this wart, and K&R C for
- * letting them do it.
-:*/
-static int close(struct inode *inode, struct file *file)
-{
-       struct lguest *lg = file->private_data;
-       unsigned int n;
-
-       /* A file which was opened but never initialized owns nothing. */
-       if (lg == NULL)
-               return 0;
-
-       /*
-        * Take the big lock: it keeps out inter-guest I/O and any other
-        * Launcher which is busy initializing a Guest.
-        */
-       mutex_lock(&lguest_lock);
-
-       /* The shadow page tables go first. */
-       free_guest_pagetable(lg);
-
-       for (n = 0; n < lg->nr_cpus; n++) {
-               struct lg_cpu *cpu = &lg->cpus[n];
-
-               /* Stop the hrtimer set via LHCALL_SET_CLOCKEVENT. */
-               hrtimer_cancel(&cpu->hrt);
-               /* Give back the register page. */
-               free_page(cpu->regs_page);
-               /*
-                * With the memory cleanups done, the reference on the
-                * Launcher's mm can be dropped.
-                */
-               mmput(cpu->mm);
-       }
-
-       /*
-        * lg->dead is an error code, NULL, or a kmalloc()ed string; only the
-        * latter two may be handed to kfree().
-        */
-       if (!IS_ERR(lg->dead))
-               kfree(lg->dead);
-       /* Finally the "struct lguest" itself. */
-       kfree(lg);
-       mutex_unlock(&lguest_lock);
-
-       return 0;
-}
-
-/*L:000
- * Welcome to our journey through the Launcher!
- *
- * The Launcher is the Host userspace program which sets up, runs and services
- * the Guest.  In fact, many comments in the Drivers which refer to "the Host"
- * doing things are inaccurate: the Launcher does all the device handling for
- * the Guest, but the Guest can't know that.
- *
- * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we
- * shall see more of that later.
- *
- * We begin our understanding with the Host kernel interface which the Launcher
- * uses: reading and writing a character device called /dev/lguest.  All the
- * work happens in the read(), write() and close() routines:
- */
-static const struct file_operations lguest_fops = {
-       .owner   = THIS_MODULE,
-       /* open() only clears private_data; the Guest is created by write(). */
-       .open    = open,
-       /* close() tears down whatever initialize() set up. */
-       .release = close,
-       /* All Launcher requests, including initialization, arrive here. */
-       .write   = write,
-       .read    = read,
-       /* write() reads the file offset as the CPU number. */
-       .llseek  = default_llseek,
-};
-/*:*/
-
-/*
- * This is a textbook example of a "misc" character device.  Populate a "struct
- * miscdevice" and register it with misc_register().
- */
-static struct miscdevice lguest_dev = {
-       /* No fixed minor number: let the misc core pick one. */
-       .minor  = MISC_DYNAMIC_MINOR,
-       .name   = "lguest",
-       .fops   = &lguest_fops,
-};
-
-/* Register the "lguest" misc device; returns misc_register()'s result. */
-int __init lguest_device_init(void)
-{
-       return misc_register(&lguest_dev);
-}
-
-/* Undo lguest_device_init(): unregister the "lguest" misc device. */
-void __exit lguest_device_remove(void)
-{
-       misc_deregister(&lguest_dev);
-}
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c

deleted file mode 100644 (file)

index 0bc127e..0000000
--- a/drivers/lguest/page_tables.c
+++ /dev/null
@@ -1,1239 +0,0 @@
-/*P:700
- * The pagetable code, on the other hand, still shows the scars of
- * previous encounters.  It's functional, and as neat as it can be in the
- * circumstances, but be wary, for these things are subtle and break easily.
- * The Guest provides a virtual to physical mapping, but we can neither trust
- * it nor use it: we verify and convert it here then point the CPU to the
- * converted Guest pages when running the Guest.
-:*/
-
-/* Copyright (C) Rusty Russell IBM Corporation 2013.
- * GPL v2 and any later version */
-#include <linux/mm.h>
-#include <linux/gfp.h>
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/random.h>
-#include <linux/percpu.h>
-#include <asm/tlbflush.h>
-#include <linux/uaccess.h>
-#include "lg.h"
-
-/*M:008
- * We hold reference to pages, which prevents them from being swapped.
- * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
- * to swap out.  If we had this, and a shrinker callback to trim PTE pages, we
- * could probably consider launching Guests as non-root.
-:*/
-
-/*H:300
- * The Page Table Code
- *
- * We use two-level page tables for the Guest, or three-level with PAE.  If
- * you're not entirely comfortable with virtual addresses, physical addresses
- * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page
- * Table Handling" (with diagrams!).
- *
- * The Guest keeps page tables, but we maintain the actual ones here: these are
- * called "shadow" page tables.  Which is a very Guest-centric name: these are
- * the real page tables the CPU uses, although we keep them up to date to
- * reflect the Guest's.  (See what I mean about weird naming?  Since when do
- * shadows reflect anything?)
- *
- * Anyway, this is the most complicated part of the Host code.  There are seven
- * parts to this:
- *  (i) Looking up a page table entry when the Guest faults,
- *  (ii) Making sure the Guest stack is mapped,
- *  (iii) Setting up a page table entry when the Guest tells us one has changed,
- *  (iv) Switching page tables,
- *  (v) Flushing (throwing away) page tables,
- *  (vi) Mapping the Switcher when the Guest is about to run,
- *  (vii) Setting up the page tables initially.
-:*/
-
-/*
- * The Switcher uses the complete top PTE page.  That's 1024 PTE entries (4MB)
- * or 512 PTE entries with PAE (2MB).
- */
-#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
-
-/*
- * The flags a Guest top-level (PGD) entry is allowed to carry: check_gpgd()
- * kills the Guest if any flag outside this mask is set.  With PAE only
- * _PAGE_PRESENT is accepted; without PAE the whole _PAGE_TABLE set is.
- *
- * NOTE(review): the comment that used to sit here talked about a PMD index
- * for the Switcher's last 2MB, but no such macro is defined at this point;
- * presumably it was left behind when that macro was removed.
- */
-#ifdef CONFIG_X86_PAE
-#define CHECK_GPGD_MASK                _PAGE_PRESENT
-#else
-#define CHECK_GPGD_MASK                _PAGE_TABLE
-#endif
-
-/*H:320
- * The page table code is curly enough to need helper functions to keep it
- * clear and clean.  The kernel itself provides many of them; one advantage
- * of insisting that the Guest and Host use the same CONFIG_X86_PAE setting.
- *
- * There are two functions which return pointers to the shadow (aka "real")
- * page tables.
- *
- * spgd_addr() takes the virtual address and returns a pointer to the top-level
- * page directory entry (PGD) for that address.  Since we keep track of several
- * page tables, the "i" argument tells us which one we're interested in (it's
- * usually the current one).
- */
-static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
-{
-       /* Start from the top of the i'th shadow page table... */
-       pgd_t *top = cpu->lg->pgdirs[i].pgdir;
-
-       /* ...and step to the entry which covers vaddr. */
-       return top + pgd_index(vaddr);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * Given a shadow PGD entry (which holds the page frame of a PMD page), return
- * a pointer to the PMD entry within that page which covers vaddr.
- */
-static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
-{
-       pmd_t *pmd_page;
-
-       /* Callers must only ever hand us a present PGD entry. */
-       BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
-
-       /* Page frame number -> physical address -> kernel virtual address. */
-       pmd_page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
-
-       return pmd_page + pmd_index(vaddr);
-}
-#endif
-
-/*
- * This routine then takes the page directory entry returned above, which
- * contains the address of the page table entry (PTE) page.  It then returns a
- * pointer to the PTE entry for the given address.
- */
-static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
-{
-       pte_t *pte_page;
-#ifdef CONFIG_X86_PAE
-       /* With PAE the PTE page hangs off the mid-level entry. */
-       pmd_t *spmd = spmd_addr(cpu, spgd, vaddr);
-
-       /* Callers must only ever get here with a present PMD entry. */
-       BUG_ON(!(pmd_flags(*spmd) & _PAGE_PRESENT));
-       pte_page = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
-#else
-       /* Callers must only ever hand us a present PGD entry. */
-       BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
-       pte_page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
-#endif
-
-       return pte_page + pte_index(vaddr);
-}
-
-/*
- * These functions are just like the above, except they access the Guest
- * page tables.  Hence they return a Guest address.
- */
-static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
-{
-       /* Which top-level slot does vaddr fall in? */
-       unsigned int slot = vaddr >> (PGDIR_SHIFT);
-
-       /* Offset that many entries into the Guest's current page directory. */
-       return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + slot * sizeof(pgd_t);
-}
-
-#ifdef CONFIG_X86_PAE
-/* Guest address of the PMD entry for vaddr, reached via a Guest PGD entry. */
-static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
-{
-       unsigned long pmd_page;
-
-       BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
-       pmd_page = pgd_pfn(gpgd) << PAGE_SHIFT;
-
-       return pmd_page + pmd_index(vaddr) * sizeof(pmd_t);
-}
-
-/* Guest address of the PTE for vaddr, reached via a Guest PMD entry. */
-static unsigned long gpte_addr(struct lg_cpu *cpu,
-                              pmd_t gpmd, unsigned long vaddr)
-{
-       unsigned long pte_page;
-
-       BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
-       pte_page = pmd_pfn(gpmd) << PAGE_SHIFT;
-
-       return pte_page + pte_index(vaddr) * sizeof(pte_t);
-}
-#else
-/* Without PAE there is no mid-level: the PGD entry leads to the PTE page. */
-static unsigned long gpte_addr(struct lg_cpu *cpu,
-                               pgd_t gpgd, unsigned long vaddr)
-{
-       unsigned long pte_page;
-
-       BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
-       pte_page = pgd_pfn(gpgd) << PAGE_SHIFT;
-
-       return pte_page + pte_index(vaddr) * sizeof(pte_t);
-}
-#endif
-/*:*/
-
-/*M:007
- * get_pfn is slow: we could probably try to grab batches of pages here as
- * an optimization (ie. pre-faulting).
-:*/
-
-/*H:350
- * This routine takes a page number given by the Guest and converts it to
- * an actual, physical page number.  It can fail for several reasons: the
- * virtual address might not be mapped by the Launcher, the write flag is set
- * and the page is read-only, or the write flag was set and the page was
- * shared so had to be copied, but we ran out of memory.
- *
- * This holds a reference to the page, so release_pte() is careful to put that
- * back.
- */
-static unsigned long get_pfn(unsigned long virtpfn, int write)
-{
-       unsigned long uaddr = virtpfn << PAGE_SHIFT;
-       struct page *page;
-
-       /* Ask for exactly one page; anything but "got one" is failure. */
-       if (get_user_pages_fast(uaddr, 1, write, &page) != 1)
-               return -1UL;
-
-       /* We now hold a reference on the page: release_pte() drops it. */
-       return page_to_pfn(page);
-}
-
-/*H:340
- * Converting a Guest page table entry to a shadow (ie. real) page table
- * entry can be a little tricky.  The flags are (almost) the same, but the
- * Guest PTE contains a virtual page number: the CPU needs the real page
- * number.
- */
-static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
-{
-       /* The Guest's pages sit this many pages into the Launcher. */
-       unsigned long base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
-       /*
-        * The Guest sets the global flag because it believes it is using PGE
-        * (we only asked it to so we can tell kernel flushes from userspace
-        * ones).  We never use the bit ourselves, so strip it.
-        */
-       unsigned long flags = pte_flags(gpte) & ~_PAGE_GLOBAL;
-       unsigned long pfn;
-
-       /*
-        * get_pfn() turns the Launcher-virtual page number into a real one,
-        * or hands back -1UL if it couldn't.
-        */
-       pfn = get_pfn(base + pte_pfn(gpte), write);
-       if (pfn == -1UL) {
-               kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
-               /*
-                * Clear the flags so that release_pte(), run when the Guest is
-                * torn down, doesn't mistake this entry for a valid one.
-                */
-               flags = 0;
-       }
-
-       /* Page number plus flags makes the shadow PTE. */
-       return pfn_pte(pfn, __pgprot(flags));
-}
-
-/*H:460 And to complete the chain, release_pte() looks like this: */
-static void release_pte(pte_t pte)
-{
-       /* An entry which isn't present never pinned a page. */
-       if (!(pte_flags(pte) & _PAGE_PRESENT))
-               return;
-
-       /* Hand back the reference get_pfn() took on our behalf. */
-       put_page(pte_page(pte));
-}
-/*:*/
-
-static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte)
-{
-       /* Large pages aren't handled, so they're never I/O memory. */
-       if (pte_flags(gpte) & _PAGE_PSE)
-               return false;
-
-       /* Below pfn_limit is ordinary Guest memory. */
-       if (pte_pfn(gpte) < cpu->lg->pfn_limit)
-               return false;
-
-       /* I/O memory stops just short of device_limit. */
-       return pte_pfn(gpte) < cpu->lg->device_limit;
-}
-
-static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
-{
-       /* Acceptable: not a large page, and within the Guest's memory. */
-       if (!(pte_flags(gpte) & _PAGE_PSE) &&
-           pte_pfn(gpte) < cpu->lg->pfn_limit)
-               return true;
-
-       kill_guest(cpu, "bad page table entry");
-       return false;
-}
-
-static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
-{
-       /* Acceptable: no flags outside the mask, and within Guest memory. */
-       if (!(pgd_flags(gpgd) & ~CHECK_GPGD_MASK) &&
-           (pgd_pfn(gpgd) < cpu->lg->pfn_limit))
-               return true;
-
-       kill_guest(cpu, "bad page directory entry");
-       return false;
-}
-
-#ifdef CONFIG_X86_PAE
-static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
-{
-       /* Acceptable: only _PAGE_TABLE flags, and within Guest memory. */
-       if (!(pmd_flags(gpmd) & ~_PAGE_TABLE) &&
-           (pmd_pfn(gpmd) < cpu->lg->pfn_limit))
-               return true;
-
-       kill_guest(cpu, "bad page middle directory entry");
-       return false;
-}
-#endif
-
-/*H:331
- * This is the core routine to walk the shadow page tables and find the page
- * table entry for a specific address.
- *
- * If allocate is set, then we allocate any missing levels, setting the flags
- * on the new page directory and mid-level directories using the arguments
- * (which are copied from the Guest's page table entries).
- */
-static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
-                       int pgd_flags, int pmd_flags)
-{
-       /*
-        * Returns a pointer to the shadow PTE for vaddr in the current page
-        * table (cpu->cpu_pgd), or NULL if a level is missing and either
-        * allocate is false or the allocation failed (in which case the Guest
-        * has been killed).
-        *
-        * NOTE(review): the pgd_flags / pmd_flags parameters share their names
-        * with the pgd_flags() / pmd_flags() accessors called below;
-        * presumably this relies on how those accessors are defined - confirm
-        * before renaming or refactoring.
-        */
-       pgd_t *spgd;
-       /* Mid level for PAE. */
-#ifdef CONFIG_X86_PAE
-       pmd_t *spmd;
-#endif
-
-       /* Get top level entry. */
-       spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
-       if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
-               /* No shadow entry: allocate a new shadow PTE page. */
-               unsigned long ptepage;
-
-               /* If they didn't want us to allocate anything, stop. */
-               if (!allocate)
-                       return NULL;
-
-               ptepage = get_zeroed_page(GFP_KERNEL);
-               /*
-                * This is not really the Guest's fault, but killing it is
-                * simple for this corner case.
-                */
-               if (!ptepage) {
-                       kill_guest(cpu, "out of memory allocating pte page");
-                       return NULL;
-               }
-               /*
-                * And we copy the flags to the shadow PGD entry.  The page
-                * number in the shadow PGD is the page we just allocated.
-                */
-               set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
-       }
-
-       /*
-        * Intel's Physical Address Extension actually uses three levels of
-        * page tables, so we need to look in the mid-level.
-        */
-#ifdef CONFIG_X86_PAE
-       /* Now look at the mid-level shadow entry. */
-       spmd = spmd_addr(cpu, *spgd, vaddr);
-
-       if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
-               /* No shadow entry: allocate a new shadow PTE page. */
-               unsigned long ptepage;
-
-               /* If they didn't want us to allocate anything, stop. */
-               if (!allocate)
-                       return NULL;
-
-               ptepage = get_zeroed_page(GFP_KERNEL);
-
-               /*
-                * This is not really the Guest's fault, but killing it is
-                * simple for this corner case.
-                */
-               if (!ptepage) {
-                       kill_guest(cpu, "out of memory allocating pmd page");
-                       return NULL;
-               }
-
-               /*
-                * And we copy the flags to the shadow PMD entry.  The page
-                * number in the shadow PMD is the page we just allocated.
-                */
-               set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
-       }
-#endif
-
-       /* Get the pointer to the shadow PTE entry we're going to set. */
-       return spte_addr(cpu, *spgd, vaddr);
-}
-
-/*H:330
- * (i) Looking up a page table entry when the Guest faults.
- *
- * We saw this call in run_guest(): when we see a page fault in the Guest, we
- * come here.  That's because we only set up the shadow page tables lazily as
- * they're needed, so we get page faults all the time and quietly fix them up
- * and return to the Guest without it knowing.
- *
- * If we fixed up the fault (ie. we mapped the address), this routine returns
- * true.  Otherwise, it was a real fault and we need to tell the Guest.
- *
- * There's a corner case: they're trying to access memory between
- * pfn_limit and device_limit, which is I/O memory.  In this case, we
- * return false and set @iomem to the physical address, so the
- * Launcher can handle the instruction manually.
- */
-bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode,
-                unsigned long *iomem)
-{
-       unsigned long gpte_ptr;
-       pte_t gpte;
-       pte_t *spte;
-       pmd_t gpmd;
-       pgd_t gpgd;
-
-       /* Zero means "not an I/O memory access"; see gpte_in_iomem() below. */
-       *iomem = 0;
-
-       /* We never demand page the Switcher, so trying is a mistake. */
-       if (vaddr >= switcher_addr)
-               return false;
-
-       /* First step: get the top-level Guest page table entry. */
-       if (unlikely(cpu->linear_pages)) {
-               /* Faking up a linear mapping. */
-               gpgd = __pgd(CHECK_GPGD_MASK);
-       } else {
-               gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-               /* Toplevel not present?  We can't map it in. */
-               if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-                       return false;
-
-               /*
-                * This kills the Guest if it has weird flags or tries to
-                * refer to a "physical" address outside the bounds.
-                */
-               if (!check_gpgd(cpu, gpgd))
-                       return false;
-       }
-
-       /* This "mid-level" entry is only used for non-linear, PAE mode. */
-       gpmd = __pmd(_PAGE_TABLE);
-
-#ifdef CONFIG_X86_PAE
-       if (likely(!cpu->linear_pages)) {
-               gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-               /* Middle level not present?  We can't map it in. */
-               if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-                       return false;
-
-               /*
-                * This kills the Guest if it has weird flags or tries to
-                * refer to a "physical" address outside the bounds.
-                */
-               if (!check_gpmd(cpu, gpmd))
-                       return false;
-       }
-
-       /*
-        * OK, now we look at the lower level in the Guest page table: keep its
-        * address, because we might update it later.
-        */
-       gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
-#else
-       /*
-        * OK, now we look at the lower level in the Guest page table: keep its
-        * address, because we might update it later.
-        */
-       gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
-#endif
-
-       if (unlikely(cpu->linear_pages)) {
-               /* Linear?  Make up a PTE which points to same page. */
-               gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
-       } else {
-               /* Read the actual PTE value. */
-               gpte = lgread(cpu, gpte_ptr, pte_t);
-       }
-
-       /* If this page isn't in the Guest page tables, we can't page it in. */
-       if (!(pte_flags(gpte) & _PAGE_PRESENT))
-               return false;
-
-       /*
-        * Check they're not trying to write to a page the Guest wants
-        * read-only.  (errcode & 2, ie. bit 1 counting from zero, means the
-        * fault was a write.)
-        */
-       if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
-               return false;
-
-       /*
-        * User access to a kernel-only page?  (errcode & 4, ie. bit 2 counting
-        * from zero, means the access came from user mode.)
-        */
-       if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
-               return false;
-
-       /* If they're accessing io memory, we expect a fault. */
-       if (gpte_in_iomem(cpu, gpte)) {
-               *iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
-               return false;
-       }
-
-       /*
-        * Check that the Guest PTE flags are OK, and the page number is below
-        * the pfn_limit (ie. not mapping the Launcher binary).
-        */
-       if (!check_gpte(cpu, gpte))
-               return false;
-
-       /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
-       gpte = pte_mkyoung(gpte);
-       if (errcode & 2)
-               gpte = pte_mkdirty(gpte);
-
-       /* Get the pointer to the shadow PTE entry we're going to set. */
-       spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
-       if (!spte)
-               return false;
-
-       /*
-        * If there was a valid shadow PTE entry here before, we release it.
-        * This can happen with a write to a previously read-only entry.
-        */
-       release_pte(*spte);
-
-       /*
-        * If this is a write, we insist that the Guest page is writable (the
-        * final arg to gpte_to_spte()).
-        *
-        * NOTE(review): this branch stores the entry directly while the read
-        * branch below goes through set_pte(); confirm whether the difference
-        * is intentional.
-        */
-       if (pte_dirty(gpte))
-               *spte = gpte_to_spte(cpu, gpte, 1);
-       else
-               /*
-                * If this is a read, don't set the "writable" bit in the page
-                * table entry, even if the Guest says it's writable.  That way
-                * we will come back here when a write does actually occur, so
-                * we can update the Guest's _PAGE_DIRTY flag.
-                */
-               set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
-
-       /*
-        * Finally, we write the Guest PTE entry back: we've set the
-        * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
-        */
-       if (likely(!cpu->linear_pages))
-               lgwrite(cpu, gpte_ptr, pte_t, gpte);
-
-       /*
-        * The fault is fixed, the page table is populated, the mapping
-        * manipulated, the result returned and the code complete.  A small
-        * delay and a trace of alliteration are the only indications the Guest
-        * has that a page fault occurred at all.
-        */
-       return true;
-}
-
-/*H:360
- * (ii) Making sure the Guest stack is mapped.
- *
- * Remember that direct traps into the Guest need a mapped Guest kernel stack.
- * pin_stack_pages() calls us here: we could simply call demand_page(), but as
- * we've seen that logic is quite long, and usually the stack pages are already
- * mapped, so it's overkill.
- *
- * This is a quick version which answers the question: is this virtual address
- * mapped by the shadow page tables, and is it writable?
- */
-static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
-{
-       unsigned long flags;
-       pte_t *spte;
-
-       /* The Switcher is no place for a stack. */
-       if (vaddr >= switcher_addr)
-               return false;
-
-       /* Look, but don't allocate: no shadow PTE means not writable. */
-       spte = find_spte(cpu, vaddr, false, 0, 0);
-       if (!spte)
-               return false;
-
-       /* The entry must be both present... */
-       flags = pte_flags(*spte);
-       if (!(flags & _PAGE_PRESENT))
-               return false;
-
-       /* ...and writable. */
-       return (flags & _PAGE_RW) != 0;
-}
-
-/*
- * So, when pin_stack_pages() asks us to pin a page, we check if it's already
- * in the page tables, and if not, we call demand_page() with error code 2
- * (meaning "write").
- */
-void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
-{
-       unsigned long iomem;
-
-       /* Usual case: the shadow tables already map it writable. */
-       if (page_writable(cpu, vaddr))
-               return;
-
-       /* Otherwise fault it in as if written to (error code 2). */
-       if (demand_page(cpu, vaddr, 2, &iomem))
-               return;
-
-       kill_guest(cpu, "bad stack page %#lx", vaddr);
-}
-/*:*/
-
-#ifdef CONFIG_X86_PAE
-/* Release everything hanging off one shadow PMD entry, then clear it. */
-static void release_pmd(pmd_t *spmd)
-{
-       pte_t *ptes;
-       unsigned int n;
-
-       /* An entry which isn't present owns nothing. */
-       if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
-               return;
-
-       ptes = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
-       /* Any entry in the PTE page might hold a page reference. */
-       for (n = 0; n < PTRS_PER_PTE; n++)
-               release_pte(ptes[n]);
-       /* The page of PTEs itself can go now. */
-       free_page((long)ptes);
-       /* Clear the PMD entry so it can never be released twice. */
-       set_pmd(spmd, __pmd(0));
-}
-
-/* Release everything hanging off one shadow PGD entry, then clear it. */
-static void release_pgd(pgd_t *spgd)
-{
-       pmd_t *pmds;
-       unsigned int n;
-
-       /* An entry which isn't present owns nothing. */
-       if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
-               return;
-
-       pmds = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-       for (n = 0; n < PTRS_PER_PMD; n++)
-               release_pmd(&pmds[n]);
-
-       /* The page of PMDs itself can go now. */
-       free_page((long)pmds);
-       /* Clear the PGD entry so it can never be released twice. */
-       set_pgd(spgd, __pgd(0));
-}
-
-#else /* !CONFIG_X86_PAE */
-/*H:450
- * Chasing down release_pgd(), this is the non-PAE version.  The PAE version
- * is nearly the same, except that it calls release_pmd() (which looks much
- * like this) rather than release_pte().
- */
-static void release_pgd(pgd_t *spgd)
-{
-       pte_t *ptes;
-       unsigned int n;
-
-       /* An entry which isn't present owns nothing. */
-       if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
-               return;
-
-       /*
-        * Page number to physical address to kernel virtual address: that's
-        * where the page of PTEs lives.
-        */
-       ptes = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-       /* Any entry in the PTE page might hold a page reference. */
-       for (n = 0; n < PTRS_PER_PTE; n++)
-               release_pte(ptes[n]);
-       /* The page of PTEs itself can go now. */
-       free_page((long)ptes);
-       /* Clear the PGD entry so it can never be released twice. */
-       *spgd = __pgd(0);
-}
-#endif
-
-/*H:445
- * We saw flush_user_mappings() twice: once from the flush_user_mappings()
- * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
- * It simply releases every PTE page from 0 up to the Guest's kernel address.
- */
-static void flush_user_mappings(struct lguest *lg, int idx)
-{
-       pgd_t *top = lg->pgdirs[idx].pgdir;
-       unsigned int slot;
-
-       /* Everything below the Guest kernel's address is userspace: drop it. */
-       for (slot = 0; slot < pgd_index(lg->kernel_address); slot++)
-               release_pgd(&top[slot]);
-}
-
-/*H:440
- * (v) Flushing (throwing away) page tables,
- *
- * The Guest has a hypercall to throw away the page tables: it's used when a
- * large number of mappings have been changed.
- */
-void guest_pagetable_flush_user(struct lg_cpu *cpu)
-{
-       /*
-        * Drop the userspace part of the current page table, ie. the shadow
-        * table this CPU is using right now (cpu->cpu_pgd).
-        */
-       flush_user_mappings(cpu->lg, cpu->cpu_pgd);
-}
-/*:*/
-
-/* We walk down the guest page tables to get a guest-physical address */
-bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr)
-{
-       pgd_t gpgd;
-       pte_t gpte;
-#ifdef CONFIG_X86_PAE
-       pmd_t gpmd;
-#endif
-
-       /* Before the Guest has page tables, addresses map one-to-one. */
-       if (unlikely(cpu->linear_pages)) {
-               *paddr = vaddr;
-               return true;
-       }
-
-       /* Top level first; if it isn't present there's no translation. */
-       gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-       if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-               goto unmapped;
-
-#ifdef CONFIG_X86_PAE
-       /* PAE has a middle level to get through as well. */
-       gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-       if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-               goto unmapped;
-       gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
-#else
-       gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
-#endif
-       if (!(pte_flags(gpte) & _PAGE_PRESENT))
-               goto unmapped;
-
-       /* Page base from the PTE, offset within the page from vaddr. */
-       *paddr = (pte_pfn(gpte) * PAGE_SIZE) | (vaddr & ~PAGE_MASK);
-       return true;
-
-unmapped:
-       /* Leave an obviously bad address behind for the caller. */
-       *paddr = -1UL;
-       return false;
-}
-
-/*
- * This is the version we normally use: kills the Guest if it uses a
- * bad address
- */
-unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
-{
-       unsigned long paddr;
-
-       if (!__guest_pa(cpu, vaddr, &paddr))
-               kill_guest(cpu, "Bad address %#lx", vaddr);
-       return paddr;
-}
-
-/*
- * We keep several page tables.  This is a simple routine to find the page
- * table (if any) corresponding to this top-level address the Guest has given
- * us.
- */
-static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
-{
-       unsigned int i;
-       for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-               if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable)
-                       break;
-       return i;
-}
-
-/*H:435
- * And this is us, creating the new page directory.  If we really do
- * allocate a new one (and so the kernel parts are not there), we set
- * blank_pgdir.
- */
-static unsigned int new_pgdir(struct lg_cpu *cpu,
-                             unsigned long gpgdir,
-                             int *blank_pgdir)
-{
-       unsigned int next;
-
-       /*
-        * We pick one entry at random to throw out.  Choosing the Least
-        * Recently Used might be better, but this is easy.
-        */
-       next = prandom_u32() % ARRAY_SIZE(cpu->lg->pgdirs);
-       /* If it's never been allocated at all before, try now. */
-       if (!cpu->lg->pgdirs[next].pgdir) {
-               cpu->lg->pgdirs[next].pgdir =
-                                       (pgd_t *)get_zeroed_page(GFP_KERNEL);
-               /* If the allocation fails, just keep using the one we have */
-               if (!cpu->lg->pgdirs[next].pgdir)
-                       next = cpu->cpu_pgd;
-               else {
-                       /*
-                        * This is a blank page, so there are no kernel
-                        * mappings: caller must map the stack!
-                        */
-                       *blank_pgdir = 1;
-               }
-       }
-       /* Record which Guest toplevel this shadows. */
-       cpu->lg->pgdirs[next].gpgdir = gpgdir;
-       /* Release all the non-kernel mappings. */
-       flush_user_mappings(cpu->lg, next);
-
-       /* This hasn't run on any CPU at all. */
-       cpu->lg->pgdirs[next].last_host_cpu = -1;
-
-       return next;
-}
-
-/*H:501
- * We do need the Switcher code mapped at all times, so we allocate that
- * part of the Guest page table here.  We map the Switcher code immediately,
- * but defer mapping of the guest register page and IDT/LDT etc page until
- * just before we run the guest in map_switcher_in_guest().
- *
- * We *could* do this setup in map_switcher_in_guest(), but at that point
- * we've interrupts disabled, and allocating pages like that is fraught: we
- * can't sleep if we need to free up some memory.
- */
-static bool allocate_switcher_mapping(struct lg_cpu *cpu)
-{
-       int i;
-
-       for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
-               pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
-                                      CHECK_GPGD_MASK, _PAGE_TABLE);
-               if (!pte)
-                       return false;
-
-               /*
-                * Map the switcher page if not already there.  It might
-                * already be there because we call allocate_switcher_mapping()
-                * in guest_set_pgd() just in case it did discard our Switcher
-                * mapping, but it probably didn't.
-                */
-               if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
-                       /* Get a reference to the Switcher page. */
-                       get_page(lg_switcher_pages[0]);
-                       /* Create a read-only, exectuable, kernel-style PTE */
-                       set_pte(pte,
-                               mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
-               }
-       }
-       cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
-       return true;
-}
-
-/*H:470
- * Finally, a routine which throws away everything: all PGD entries in all
- * the shadow page tables, including the Guest's kernel mappings.  This is used
- * when we destroy the Guest.
- */
-static void release_all_pagetables(struct lguest *lg)
-{
-       unsigned int i, j;
-
-       /* Every shadow pagetable this Guest has */
-       for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
-               if (!lg->pgdirs[i].pgdir)
-                       continue;
-
-               /* Every PGD entry. */
-               for (j = 0; j < PTRS_PER_PGD; j++)
-                       release_pgd(lg->pgdirs[i].pgdir + j);
-               lg->pgdirs[i].switcher_mapped = false;
-               lg->pgdirs[i].last_host_cpu = -1;
-       }
-}
-
-/*
- * We also throw away everything when a Guest tells us it's changed a kernel
- * mapping.  Since kernel mappings are in every page table, it's easiest to
- * throw them all away.  This traps the Guest in amber for a while as
- * everything faults back in, but it's rare.
- */
-void guest_pagetable_clear_all(struct lg_cpu *cpu)
-{
-       release_all_pagetables(cpu->lg);
-       /* We need the Guest kernel stack mapped again. */
-       pin_stack_pages(cpu);
-       /* And we need Switcher allocated. */
-       if (!allocate_switcher_mapping(cpu))
-               kill_guest(cpu, "Cannot populate switcher mapping");
-}
-
-/*H:430
- * (iv) Switching page tables
- *
- * Now we've seen all the page table setting and manipulation, let's see
- * what happens when the Guest changes page tables (ie. changes the top-level
- * pgdir).  This occurs on almost every context switch.
- */
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
-{
-       int newpgdir, repin = 0;
-
-       /*
-        * The very first time they call this, we're actually running without
-        * any page tables; we've been making it up.  Throw them away now.
-        */
-       if (unlikely(cpu->linear_pages)) {
-               release_all_pagetables(cpu->lg);
-               cpu->linear_pages = false;
-               /* Force allocation of a new pgdir. */
-               newpgdir = ARRAY_SIZE(cpu->lg->pgdirs);
-       } else {
-               /* Look to see if we have this one already. */
-               newpgdir = find_pgdir(cpu->lg, pgtable);
-       }
-
-       /*
-        * If not, we allocate or mug an existing one: if it's a fresh one,
-        * repin gets set to 1.
-        */
-       if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
-               newpgdir = new_pgdir(cpu, pgtable, &repin);
-       /* Change the current pgd index to the new one. */
-       cpu->cpu_pgd = newpgdir;
-       /*
-        * If it was completely blank, we map in the Guest kernel stack and
-        * the Switcher.
-        */
-       if (repin)
-               pin_stack_pages(cpu);
-
-       if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
-               if (!allocate_switcher_mapping(cpu))
-                       kill_guest(cpu, "Cannot populate switcher mapping");
-       }
-}
-/*:*/
-
-/*M:009
- * Since we throw away all mappings when a kernel mapping changes, our
- * performance sucks for guests using highmem.  In fact, a guest with
- * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
- * usually slower than a Guest with less memory.
- *
- * This, of course, cannot be fixed.  It would take some kind of... well, I
- * don't know, but the term "puissant code-fu" comes to mind.
-:*/
-
-/*H:420
- * This is the routine which actually sets the page table entry for then
- * "idx"'th shadow page table.
- *
- * Normally, we can just throw out the old entry and replace it with 0: if they
- * use it demand_page() will put the new entry in.  We need to do this anyway:
- * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
- * is read from, and _PAGE_DIRTY when it's written to.
- *
- * But Avi Kivity pointed out that most Operating Systems (Linux included) set
- * these bits on PTEs immediately anyway.  This is done to save the CPU from
- * having to update them, but it helps us the same way: if they set
- * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
- * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
- */
-static void __guest_set_pte(struct lg_cpu *cpu, int idx,
-                      unsigned long vaddr, pte_t gpte)
-{
-       /* Look up the matching shadow page directory entry. */
-       pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
-#ifdef CONFIG_X86_PAE
-       pmd_t *spmd;
-#endif
-
-       /* If the top level isn't present, there's no entry to update. */
-       if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-#ifdef CONFIG_X86_PAE
-               spmd = spmd_addr(cpu, *spgd, vaddr);
-               if (pmd_flags(*spmd) & _PAGE_PRESENT) {
-#endif
-                       /* Otherwise, start by releasing the existing entry. */
-                       pte_t *spte = spte_addr(cpu, *spgd, vaddr);
-                       release_pte(*spte);
-
-                       /*
-                        * If they're setting this entry as dirty or accessed,
-                        * we might as well put that entry they've given us in
-                        * now.  This shaves 10% off a copy-on-write
-                        * micro-benchmark.
-                        */
-                       if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED))
-                           && !gpte_in_iomem(cpu, gpte)) {
-                               if (!check_gpte(cpu, gpte))
-                                       return;
-                               set_pte(spte,
-                                       gpte_to_spte(cpu, gpte,
-                                               pte_flags(gpte) & _PAGE_DIRTY));
-                       } else {
-                               /*
-                                * Otherwise kill it and we can demand_page()
-                                * it in later.
-                                */
-                               set_pte(spte, __pte(0));
-                       }
-#ifdef CONFIG_X86_PAE
-               }
-#endif
-       }
-}
-
-/*H:410
- * Updating a PTE entry is a little trickier.
- *
- * We keep track of several different page tables (the Guest uses one for each
- * process, so it makes sense to cache at least a few).  Each of these have
- * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
- * all processes.  So when the page table above that address changes, we update
- * all the page tables, not just the current one.  This is rare.
- *
- * The benefit is that when we have to track a new page table, we can keep all
- * the kernel mappings.  This speeds up context switch immensely.
- */
-void guest_set_pte(struct lg_cpu *cpu,
-                  unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
-{
-       /* We don't let you remap the Switcher; we need it to get back! */
-       if (vaddr >= switcher_addr) {
-               kill_guest(cpu, "attempt to set pte into Switcher pages");
-               return;
-       }
-
-       /*
-        * Kernel mappings must be changed on all top levels.  Slow, but doesn't
-        * happen often.
-        */
-       if (vaddr >= cpu->lg->kernel_address) {
-               unsigned int i;
-               for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
-                       if (cpu->lg->pgdirs[i].pgdir)
-                               __guest_set_pte(cpu, i, vaddr, gpte);
-       } else {
-               /* Is this page table one we have a shadow for? */
-               int pgdir = find_pgdir(cpu->lg, gpgdir);
-               if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
-                       /* If so, do the update. */
-                       __guest_set_pte(cpu, pgdir, vaddr, gpte);
-       }
-}
-
-/*H:400
- * (iii) Setting up a page table entry when the Guest tells us one has changed.
- *
- * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
- * with the other side of page tables while we're here: what happens when the
- * Guest asks for a page table to be updated?
- *
- * We already saw that demand_page() will fill in the shadow page tables when
- * needed, so we can simply remove shadow page table entries whenever the Guest
- * tells us they've changed.  When the Guest tries to use the new entry it will
- * fault and demand_page() will fix it up.
- *
- * So with that in mind here's our code to update a (top-level) PGD entry:
- */
-void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
-{
-       int pgdir;
-
-       if (idx > PTRS_PER_PGD) {
-               kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
-                          idx, PTRS_PER_PGD);
-               return;
-       }
-
-       /* If they're talking about a page table we have a shadow for... */
-       pgdir = find_pgdir(lg, gpgdir);
-       if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
-               /* ... throw it away. */
-               release_pgd(lg->pgdirs[pgdir].pgdir + idx);
-               /* That might have been the Switcher mapping, remap it. */
-               if (!allocate_switcher_mapping(&lg->cpus[0])) {
-                       kill_guest(&lg->cpus[0],
-                                  "Cannot populate switcher mapping");
-               }
-               lg->pgdirs[pgdir].last_host_cpu = -1;
-       }
-}
-
-#ifdef CONFIG_X86_PAE
-/* For setting a mid-level, we just throw everything away.  It's easy. */
-void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
-{
-       guest_pagetable_clear_all(&lg->cpus[0]);
-}
-#endif
-
-/*H:500
- * (vii) Setting up the page tables initially.
- *
- * When a Guest is first created, set initialize a shadow page table which
- * we will populate on future faults.  The Guest doesn't have any actual
- * pagetables yet, so we set linear_pages to tell demand_page() to fake it
- * for the moment.
- *
- * We do need the Switcher to be mapped at all times, so we allocate that
- * part of the Guest page table here.
- */
-int init_guest_pagetable(struct lguest *lg)
-{
-       struct lg_cpu *cpu = &lg->cpus[0];
-       int allocated = 0;
-
-       /* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
-       cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
-       if (!allocated)
-               return -ENOMEM;
-
-       /* We start with a linear mapping until the initialize. */
-       cpu->linear_pages = true;
-
-       /* Allocate the page tables for the Switcher. */
-       if (!allocate_switcher_mapping(cpu)) {
-               release_all_pagetables(lg);
-               return -ENOMEM;
-       }
-
-       return 0;
-}
-
-/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
-void page_table_guest_data_init(struct lg_cpu *cpu)
-{
-       /*
-        * We tell the Guest that it can't use the virtual addresses
-        * used by the Switcher.  This trick is equivalent to 4GB -
-        * switcher_addr.
-        */
-       u32 top = ~switcher_addr + 1;
-
-       /* We get the kernel address: above this is all kernel memory. */
-       if (get_user(cpu->lg->kernel_address,
-                    &cpu->lg->lguest_data->kernel_address)
-               /*
-                * We tell the Guest that it can't use the top virtual
-                * addresses (used by the Switcher).
-                */
-           || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
-               kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-               return;
-       }
-
-       /*
-        * In flush_user_mappings() we loop from 0 to
-        * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
-        * Switcher mappings, so check that now.
-        */
-       if (cpu->lg->kernel_address >= switcher_addr)
-               kill_guest(cpu, "bad kernel address %#lx",
-                                cpu->lg->kernel_address);
-}
-
-/* When a Guest dies, our cleanup is fairly simple. */
-void free_guest_pagetable(struct lguest *lg)
-{
-       unsigned int i;
-
-       /* Throw away all page table pages. */
-       release_all_pagetables(lg);
-       /* Now free the top levels: free_page() can handle 0 just fine. */
-       for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-               free_page((long)lg->pgdirs[i].pgdir);
-}
-
-/*H:481
- * This clears the Switcher mappings for cpu #i.
- */
-static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
-{
-       unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
-       pte_t *pte;
-
-       /* Clear the mappings for both pages. */
-       pte = find_spte(cpu, base, false, 0, 0);
-       release_pte(*pte);
-       set_pte(pte, __pte(0));
-
-       pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
-       release_pte(*pte);
-       set_pte(pte, __pte(0));
-}
-
-/*H:480
- * (vi) Mapping the Switcher when the Guest is about to run.
- *
- * The Switcher and the two pages for this CPU need to be visible in the Guest
- * (and not the pages for other CPUs).
- *
- * The pages for the pagetables have all been allocated before: we just need
- * to make sure the actual PTEs are up-to-date for the CPU we're about to run
- * on.
- */
-void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
-       unsigned long base;
-       struct page *percpu_switcher_page, *regs_page;
-       pte_t *pte;
-       struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
-
-       /* Switcher page should always be mapped by now! */
-       BUG_ON(!pgdir->switcher_mapped);
-
-       /* 
-        * Remember that we have two pages for each Host CPU, so we can run a
-        * Guest on each CPU without them interfering.  We need to make sure
-        * those pages are mapped correctly in the Guest, but since we usually
-        * run on the same CPU, we cache that, and only update the mappings
-        * when we move.
-        */
-       if (pgdir->last_host_cpu == raw_smp_processor_id())
-               return;
-
-       /* -1 means unknown so we remove everything. */
-       if (pgdir->last_host_cpu == -1) {
-               unsigned int i;
-               for_each_possible_cpu(i)
-                       remove_switcher_percpu_map(cpu, i);
-       } else {
-               /* We know exactly what CPU mapping to remove. */
-               remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
-       }
-
-       /*
-        * When we're running the Guest, we want the Guest's "regs" page to
-        * appear where the first Switcher page for this CPU is.  This is an
-        * optimization: when the Switcher saves the Guest registers, it saves
-        * them into the first page of this CPU's "struct lguest_pages": if we
-        * make sure the Guest's register page is already mapped there, we
-        * don't have to copy them out again.
-        */
-       /* Find the shadow PTE for this regs page. */
-       base = switcher_addr + PAGE_SIZE
-               + raw_smp_processor_id() * sizeof(struct lguest_pages);
-       pte = find_spte(cpu, base, false, 0, 0);
-       regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
-       get_page(regs_page);
-       set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
-
-       /*
-        * We map the second page of the struct lguest_pages read-only in
-        * the Guest: the IDT, GDT and other things it's not supposed to
-        * change.
-        */
-       pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
-       percpu_switcher_page
-               = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
-       get_page(percpu_switcher_page);
-       set_pte(pte, mk_pte(percpu_switcher_page,
-                           __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
-
-       pgdir->last_host_cpu = raw_smp_processor_id();
-}
-
-/*H:490
- * We've made it through the page table code.  Perhaps our tired brains are
- * still processing the details, or perhaps we're simply glad it's over.
- *
- * If nothing else, note that all this complexity in juggling shadow page tables
- * in sync with the Guest's page tables is for one reason: for most Guests this
- * page table dance determines how bad performance will be.  This is why Xen
- * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD
- * have implemented shadow page table support directly into hardware.
- *
- * There is just one file remaining in the Host.
- */
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c

deleted file mode 100644 (file)

index c4fb424..0000000
--- a/drivers/lguest/segments.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/*P:600
- * The x86 architecture has segments, which involve a table of descriptors
- * which can be used to do funky things with virtual address interpretation.
- * We originally used to use segments so the Guest couldn't alter the
- * Guest<->Host Switcher, and then we had to trim Guest segments, and restore
- * for userspace per-thread segments, but trim again for on userspace->kernel
- * transitions...  This nightmarish creation was contained within this file,
- * where we knew not to tread without heavy armament and a change of underwear.
- *
- * In these modern times, the segment handling code consists of simple sanity
- * checks, and the worst you'll experience reading this code is butterfly-rash
- * from frolicking through its parklike serenity.
-:*/
-#include "lg.h"
-
-/*H:600
- * Segments & The Global Descriptor Table
- *
- * (That title sounds like a bad Nerdcore group.  Not to suggest that there are
- * any good Nerdcore groups, but in high school a friend of mine had a band
- * called Joe Fish and the Chips, so there are definitely worse band names).
- *
- * To refresh: the GDT is a table of 8-byte values describing segments.  Once
- * set up, these segments can be loaded into one of the 6 "segment registers".
- *
- * GDT entries are passed around as "struct desc_struct"s, which like IDT
- * entries are split into two 32-bit members, "a" and "b".  One day, someone
- * will clean that up, and be declared a Hero.  (No pressure, I'm just saying).
- *
- * Anyway, the GDT entry contains a base (the start address of the segment), a
- * limit (the size of the segment - 1), and some flags.  Sounds simple, and it
- * would be, except those zany Intel engineers decided that it was too boring
- * to put the base at one end, the limit at the other, and the flags in
- * between.  They decided to shotgun the bits at random throughout the 8 bytes,
- * like so:
- *
- * 0               16                     40       48  52  56     63
- * [ limit part 1 ][     base part 1     ][ flags ][li][fl][base ]
- *                                                  mit ags part 2
- *                                                part 2
- *
- * As a result, this file contains a certain amount of magic numeracy.  Let's
- * begin.
- */
-
-/*
- * There are several entries we don't let the Guest set.  The TSS entry is the
- * "Task State Segment" which controls all kinds of delicate things.  The
- * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the
- * the Guest can't be trusted to deal with double faults.
- */
-static bool ignored_gdt(unsigned int num)
-{
-       return (num == GDT_ENTRY_TSS
-               || num == GDT_ENTRY_LGUEST_CS
-               || num == GDT_ENTRY_LGUEST_DS
-               || num == GDT_ENTRY_DOUBLEFAULT_TSS);
-}
-
-/*H:630
- * Once the Guest gave us new GDT entries, we fix them up a little.  We
- * don't care if they're invalid: the worst that can happen is a General
- * Protection Fault in the Switcher when it restores a Guest segment register
- * which tries to use that entry.  Then we kill the Guest for causing such a
- * mess: the message will be "unhandled trap 256".
- */
-static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
-{
-       unsigned int i;
-
-       for (i = start; i < end; i++) {
-               /*
-                * We never copy these ones to real GDT, so we don't care what
-                * they say
-                */
-               if (ignored_gdt(i))
-                       continue;
-
-               /*
-                * Segment descriptors contain a privilege level: the Guest is
-                * sometimes careless and leaves this as 0, even though it's
-                * running at privilege level 1.  If so, we fix it here.
-                */
-               if (cpu->arch.gdt[i].dpl == 0)
-                       cpu->arch.gdt[i].dpl |= GUEST_PL;
-
-               /*
-                * Each descriptor has an "accessed" bit.  If we don't set it
-                * now, the CPU will try to set it when the Guest first loads
-                * that entry into a segment register.  But the GDT isn't
-                * writable by the Guest, so bad things can happen.
-                */
-               cpu->arch.gdt[i].type |= 0x1;
-       }
-}
-
-/*H:610
- * Like the IDT, we never simply use the GDT the Guest gives us.  We keep
- * a GDT for each CPU, and copy across the Guest's entries each time we want to
- * run the Guest on that CPU.
- *
- * This routine is called at boot or modprobe time for each CPU to set up the
- * constant GDT entries: the ones which are the same no matter what Guest we're
- * running.
- */
-void setup_default_gdt_entries(struct lguest_ro_state *state)
-{
-       struct desc_struct *gdt = state->guest_gdt;
-       unsigned long tss = (unsigned long)&state->guest_tss;
-
-       /* The Switcher segments are full 0-4G segments, privilege level 0 */
-       gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
-       gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
-
-       /*
-        * The TSS segment refers to the TSS entry for this particular CPU.
-        */
-       gdt[GDT_ENTRY_TSS].a = 0;
-       gdt[GDT_ENTRY_TSS].b = 0;
-
-       gdt[GDT_ENTRY_TSS].limit0 = 0x67;
-       gdt[GDT_ENTRY_TSS].base0  = tss & 0xFFFF;
-       gdt[GDT_ENTRY_TSS].base1  = (tss >> 16) & 0xFF;
-       gdt[GDT_ENTRY_TSS].base2  = tss >> 24;
-       gdt[GDT_ENTRY_TSS].type   = 0x9; /* 32-bit TSS (available) */
-       gdt[GDT_ENTRY_TSS].p      = 0x1; /* Entry is present */
-       gdt[GDT_ENTRY_TSS].dpl    = 0x0; /* Privilege level 0 */
-       gdt[GDT_ENTRY_TSS].s      = 0x0; /* system segment */
-
-}
-
-/*
- * This routine sets up the initial Guest GDT for booting.  All entries start
- * as 0 (unusable).
- */
-void setup_guest_gdt(struct lg_cpu *cpu)
-{
-       /*
-        * Start with full 0-4G segments...except the Guest is allowed to use
-        * them, so set the privilege level appropriately in the flags.
-        */
-       cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
-       cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
-       cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL;
-       cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL;
-}
-
-/*H:650
- * An optimization of copy_gdt(), for just the three "thead-local storage"
- * entries.
- */
-void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
-{
-       unsigned int i;
-
-       for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
-               gdt[i] = cpu->arch.gdt[i];
-}
-
-/*H:640
- * When the Guest is run on a different CPU, or the GDT entries have changed,
- * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's
- * GDT.
- */
-void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
-{
-       unsigned int i;
-
-       /*
-        * The default entries from setup_default_gdt_entries() are not
-        * replaced.  See ignored_gdt() above.
-        */
-       for (i = 0; i < GDT_ENTRIES; i++)
-               if (!ignored_gdt(i))
-                       gdt[i] = cpu->arch.gdt[i];
-}
-
-/*H:620
- * This is where the Guest asks us to load a new GDT entry
- * (LHCALL_LOAD_GDT_ENTRY).  We tweak the entry and copy it in.
- */
-void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
-{
-       /*
-        * We assume the Guest has the same number of GDT entries as the
-        * Host, otherwise we'd have to dynamically allocate the Guest GDT.
-        */
-       if (num >= ARRAY_SIZE(cpu->arch.gdt)) {
-               kill_guest(cpu, "too many gdt entries %i", num);
-               return;
-       }
-
-       /* Set it up, then fix it. */
-       cpu->arch.gdt[num].a = lo;
-       cpu->arch.gdt[num].b = hi;
-       fixup_gdt_table(cpu, num, num+1);
-       /*
-        * Mark that the GDT changed so the core knows it has to copy it again,
-        * even if the Guest is run on the same CPU.
-        */
-       cpu->changed |= CHANGED_GDT;
-}
-
-/*
- * This is the fast-track version for just changing the three TLS entries.
- * Remember that this happens on every context switch, so it's worth
- * optimizing.  But wouldn't it be neater to have a single hypercall to cover
- * both cases?
- */
-void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
-{
-       struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
-
-       __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
-       fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
-       /* Note that just the TLS entries have changed. */
-       cpu->changed |= CHANGED_GDT_TLS;
-}
-
-/*H:660
- * With this, we have finished the Host.
- *
- * Five of the seven parts of our task are complete.  You have made it through
- * the Bit of Despair (I think that's somewhere in the page table code,
- * myself).
- *
- * Next, we examine "make Switcher".  It's short, but intense.
- */
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c

deleted file mode 100644 (file)

index b4f79b9..0000000
--- a/drivers/lguest/x86/core.c
+++ /dev/null
@@ -1,724 +0,0 @@
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-/*P:450
- * This file contains the x86-specific lguest code.  It used to be all
- * mixed in with drivers/lguest/core.c but several foolhardy code slashers
- * wrestled most of the dependencies out to here in preparation for porting
- * lguest to other architectures (see what I mean by foolhardy?).
- *
- * This also contains a couple of non-obvious setup and teardown pieces which
- * were implemented after days of debugging pain.
-:*/
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/cpu.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/lguest.h>
-#include <linux/uaccess.h>
-#include <asm/fpu/internal.h>
-#include <asm/tlbflush.h>
-#include "../lg.h"
-
-static int cpu_had_pge; /* 1 once lguest_arch_host_init() has turned PGE off */
-
-static struct {
-       unsigned long offset;   /* switch_to_guest's address in the Switcher copy */
-       unsigned short segment; /* LGUEST_CS */
-} lguest_entry; /* far-call target for the "lcall" in run_guest_once() */
-
-/*
- * How far the Switcher has moved: the address we copied it to (switcher_addr)
- * minus the address switcher.S was compiled at (start_switcher_text).
- */
-static unsigned long switcher_offset(void)
-{
-       unsigned long compiled_at = (unsigned long)start_switcher_text;
-
-       return switcher_addr - compiled_at;
-}
-
-/*
- * The per-CPU "struct lguest_pages" form an array which begins one page after
- * switcher_addr (that first page is the Switcher text).  Return @cpu's entry.
- */
-static struct lguest_pages *lguest_pages(unsigned int cpu)
-{
-       struct lguest_pages *first;
-
-       first = (struct lguest_pages *)(switcher_addr + PAGE_SIZE);
-       return first + cpu;
-}
-
-static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); /* see copy_in_guest_info() */
-
-/*S:010
- * We approach the Switcher.
- *
- * Remember that each CPU has two pages which are visible to the Guest when it
- * runs on that CPU.  This has to contain the state for that Guest: we copy the
- * state in just before we run the Guest.
- *
- * Each Guest has "changed" flags which indicate what has changed in the Guest
- * since it last ran.  We saw this set in interrupts_and_traps.c and
- * segments.c.
- *
- * @cpu is the Guest CPU about to run and @pages is the "struct lguest_pages"
- * of the Host CPU we are on.  The per-CPU lg_last_cpu and cpu->last_pages
- * record the pairing we set up last time, so we can tell when a full copy is
- * needed.
- */
-static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
-       /*
-        * Copying all this data can be quite expensive.  We usually run the
-        * same Guest we ran last time (and that Guest hasn't run anywhere else
-        * meanwhile).  If that's not the case, we pretend everything in the
-        * Guest has changed.
-        */
-       if (__this_cpu_read(lg_last_cpu) != cpu || cpu->last_pages != pages) {
-               __this_cpu_write(lg_last_cpu, cpu);
-               cpu->last_pages = pages;
-               cpu->changed = CHANGED_ALL;
-       }
-
-       /* These copies are pretty cheap, so we do them unconditionally: */
-       /* Save the current Host top-level page directory. */
-       pages->state.host_cr3 = __pa(current->mm->pgd);
-       /*
-        * Set up the Guest's page tables to see this CPU's pages (and no
-        * other CPU's pages).
-        */
-       map_switcher_in_guest(cpu, pages);
-       /*
-        * Set up the two "TSS" members which tell the CPU what stack to use
-        * for traps which go directly into the Guest (ie. traps at privilege
-        * level 1).
-        */
-       pages->state.guest_tss.sp1 = cpu->esp1;
-       pages->state.guest_tss.ss1 = cpu->ss1;
-
-       /* Copy direct-to-Guest trap entries. */
-       if (cpu->changed & CHANGED_IDT)
-               copy_traps(cpu, pages->state.guest_idt, default_idt_entries);
-
-       /* Copy all GDT entries which the Guest can change. */
-       if (cpu->changed & CHANGED_GDT)
-               copy_gdt(cpu, pages->state.guest_gdt);
-       /* If only the TLS entries have changed, copy them. */
-       else if (cpu->changed & CHANGED_GDT_TLS)
-               copy_gdt_tls(cpu, pages->state.guest_gdt);
-
-       /* Mark the Guest as unchanged for next time. */
-       cpu->changed = 0;
-}
-
-/*
- * Finally: the code to actually call into the Switcher to run the Guest.
- *
- * @cpu is the Guest CPU to run and @pages is this Host CPU's "struct
- * lguest_pages", which is filled in by copy_in_guest_info() first.
- */
-static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
-       /* This is a dummy value we need for GCC's sake. */
-       unsigned int clobber;
-
-       /*
-        * Copy the guest-specific information into this CPU's "struct
-        * lguest_pages".
-        */
-       copy_in_guest_info(cpu, pages);
-
-       /*
-        * Set the trap number to 256 (impossible value).  If we fault while
-        * switching to the Guest (bad segment registers or bug), this will
-        * cause us to abort the Guest.
-        */
-       cpu->regs->trapnum = 256;
-
-       /*
-        * Now: we push the "eflags" register on the stack, then do an "lcall".
-        * This is how we change from using the kernel code segment to using
-        * the dedicated lguest code segment, as well as jumping into the
-        * Switcher.
-        *
-        * The lcall also pushes the old code segment (KERNEL_CS) onto the
-        * stack, then the address of this call.  This stack layout happens to
-        * exactly match the stack layout created by an interrupt...
-        *
-        * Operand %4 is the last one listed below: the lguest_entry
-        * offset/segment pair, passed as a memory operand.
-        */
-       asm volatile("pushf; lcall *%4"
-                    /*
-                     * This is how we tell GCC that %eax ("a") and %ebx ("b")
-                     * are changed by this routine.  The "=" means output.
-                     */
-                    : "=a"(clobber), "=b"(clobber)
-                    /*
-                     * %eax contains the pages pointer.  ("0" refers to the
-                     * 0-th argument above, ie "a").  %ebx contains the
-                     * physical address of the Guest's top-level page
-                     * directory.
-                     */
-                    : "0"(pages), 
-                      "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)),
-                      "m"(lguest_entry)
-                    /*
-                     * We tell gcc that all these registers could change,
-                     * which means we don't have to save and restore them in
-                     * the Switcher.
-                     */
-                    : "memory", "%edx", "%ecx", "%edi", "%esi");
-}
-/*:*/
-
-/*
- * Translate @reg_off, an offset into "struct pt_regs", into a pointer to the
- * matching field of this Guest CPU's saved registers.  The general-purpose
- * registers, eip and esp are always available; the segment registers and
- * eflags are only handed out when @any is true.  Every other request gets
- * NULL.
- */
-unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any)
-{
-       struct lguest_regs *regs = cpu->regs;
-
-       switch (reg_off) {
-       case offsetof(struct pt_regs, bx):
-               return &regs->ebx;
-       case offsetof(struct pt_regs, cx):
-               return &regs->ecx;
-       case offsetof(struct pt_regs, dx):
-               return &regs->edx;
-       case offsetof(struct pt_regs, si):
-               return &regs->esi;
-       case offsetof(struct pt_regs, di):
-               return &regs->edi;
-       case offsetof(struct pt_regs, bp):
-               return &regs->ebp;
-       case offsetof(struct pt_regs, ax):
-               return &regs->eax;
-       case offsetof(struct pt_regs, ip):
-               return &regs->eip;
-       case offsetof(struct pt_regs, sp):
-               return &regs->esp;
-       default:
-               break;
-       }
-
-       /* Launcher can read the remaining ones, but we don't allow any setting. */
-       if (!any)
-               return NULL;
-
-       switch (reg_off) {
-       case offsetof(struct pt_regs, ds):
-               return &regs->ds;
-       case offsetof(struct pt_regs, es):
-               return &regs->es;
-       case offsetof(struct pt_regs, fs):
-               return &regs->fs;
-       case offsetof(struct pt_regs, gs):
-               return &regs->gs;
-       case offsetof(struct pt_regs, cs):
-               return &regs->cs;
-       case offsetof(struct pt_regs, flags):
-               return &regs->eflags;
-       case offsetof(struct pt_regs, ss):
-               return &regs->ss;
-       default:
-               break;
-       }
-
-       return NULL;
-}
-
-/*M:002
- * There are hooks in the scheduler which we can register to tell when we
- * get kicked off the CPU (preempt_notifier_register()).  This would allow us
- * to lazily disable SYSENTER which would regain some performance, and should
- * also simplify copy_in_guest_info().  Note that we'd still need to restore
- * things when we exit to Launcher userspace, but that's fairly easy.
- *
- * We could also try using these hooks for PGE, but that might be too expensive.
- *
- * The hooks were designed for KVM, but we can also put them to good use.
-:*/
-
-/*H:040
- * This is the i386-specific code to setup and run the Guest.  Interrupts
- * are disabled: we own the CPU.
- *
- * In order: turn SYSENTER off, run the Guest until it traps back out, turn
- * SYSENTER back on, then deal with the two traps which need attention right
- * away: a page fault (trap 14, save cr2) or a device-not-available fault
- * (trap 7, restore the FPU if its registers are not already active).
- */
-void lguest_arch_run_guest(struct lg_cpu *cpu)
-{
-       /*
-        * SYSENTER is an optimized way of doing system calls.  We can't allow
-        * it because it always jumps to privilege level 0.  A normal Guest
-        * won't try it because we don't advertise it in CPUID, but a malicious
-        * Guest (or malicious Guest userspace program) could, so we tell the
-        * CPU to disable it before running the Guest.
-        */
-       if (boot_cpu_has(X86_FEATURE_SEP))
-               wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
-
-       /*
-        * Now we actually run the Guest.  It will return when something
-        * interesting happens, and we can examine its registers to see what it
-        * was doing.
-        */
-       run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
-
-       /*
-        * Note that the "regs" structure contains two extra entries which are
-        * not really registers: a trap number which says what interrupt or
-        * trap made the switcher code come back, and an error code which some
-        * traps set.
-        */
-
-        /* Restore SYSENTER if it's supposed to be on. */
-        if (boot_cpu_has(X86_FEATURE_SEP))
-               wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
-
-       /*
-        * If the Guest page faulted, then the cr2 register will tell us the
-        * bad virtual address.  We have to grab this now, because once we
-        * re-enable interrupts an interrupt could fault and thus overwrite
-        * cr2, or we could even move off to a different CPU.
-        */
-       if (cpu->regs->trapnum == 14)
-               cpu->arch.last_pagefault = read_cr2();
-       /*
-        * Similarly, if we took a trap because the Guest used the FPU,
-        * we have to restore the FPU it expects to see.
-        * fpu__restore() may sleep and we may even move off to
-        * a different CPU. So all the critical stuff should be done
-        * before this.
-        */
-       else if (cpu->regs->trapnum == 7 && !fpregs_active())
-               fpu__restore(&current->thread.fpu);
-}
-
-/*H:130
- * Now we've examined the hypercall code; our Guest can make requests.
- * Our Guest is usually so well behaved; it never tries to do things it isn't
- * allowed to, and uses hypercalls instead.  Unfortunately, Linux's paravirtual
- * infrastructure isn't quite complete, because it doesn't contain replacements
- * for the Intel I/O instructions.  As a result, the Guest sometimes fumbles
- * across one during the boot process as it probes for various things which are
- * usually attached to a PC.
- *
- * When the Guest uses one of these instructions, we get a trap (General
- * Protection Fault) and come here.  We queue this to be sent out to the
- * Launcher to handle.
- */
-
-/*
- * The eip contains the *virtual* address of the Guest's instruction:
- * we copy the instruction here so the Launcher doesn't have to walk
- * the page tables to decode it.  We handle the case (eg. in a kernel
- * module) where the instruction is over two pages, and the pages are
- * virtually but not physically contiguous.
- *
- * The longest possible x86 instruction is 15 bytes, but we don't handle
- * anything that strange.
- *
- * @dst receives @len bytes (at most PAGE_SIZE, enforced by the BUG_ON) read
- * from Guest virtual address @vaddr.  If the range runs into a following page
- * which is not mapped, that part of @dst is filled with zeroes instead.
- */
-static void copy_from_guest(struct lg_cpu *cpu,
-                           void *dst, unsigned long vaddr, size_t len)
-{
-       size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
-       unsigned long paddr;
-
-       BUG_ON(len > PAGE_SIZE);
-
-       /*
-        * If it goes over a page, copy in two parts.  The second part starts
-        * page-aligned and is shorter than a page, so the recursive call
-        * below never splits again.
-        */
-       if (len > to_page_end) {
-               /* But make sure the next page is mapped! */
-               if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
-                       copy_from_guest(cpu, dst + to_page_end,
-                                       vaddr + to_page_end,
-                                       len - to_page_end);
-               else
-                       /* Otherwise fill with zeroes. */
-                       memset(dst + to_page_end, 0, len - to_page_end);
-               len = to_page_end;
-       }
-
-       /*
-        * This will kill the guest if it isn't mapped, but that
-        * shouldn't happen.
-        */
-       __lgread(cpu, dst, guest_pa(cpu, vaddr), len);
-}
-
-
-/*
- * Record a pending trap 13 (General Protection Fault) together with a copy of
- * the instruction bytes found at the Guest's eip.
- */
-static void setup_emulate_insn(struct lg_cpu *cpu)
-{
-       unsigned long insn_vaddr = cpu->regs->eip;
-
-       cpu->pending.trap = 13;
-       copy_from_guest(cpu, cpu->pending.insn, insn_vaddr,
-                       sizeof(cpu->pending.insn));
-}
-
-/*
- * Record a pending trap 14 (Page Fault) on @iomem_addr together with a copy
- * of the instruction bytes found at the Guest's eip.
- */
-static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr)
-{
-       unsigned long insn_vaddr = cpu->regs->eip;
-
-       cpu->pending.trap = 14;
-       cpu->pending.addr = iomem_addr;
-       copy_from_guest(cpu, cpu->pending.insn, insn_vaddr,
-                       sizeof(cpu->pending.insn));
-}
-
-/*H:050
- * Once we've re-enabled interrupts, we look at why the Guest exited.
- *
- * cpu->regs->trapnum says which trap brought us back.  Each case below either
- * deals with the trap completely and returns, or breaks out of the switch so
- * that the trap is delivered to the Guest itself (which is killed if it has
- * no usable handler).
- */
-void lguest_arch_handle_trap(struct lg_cpu *cpu)
-{
-       unsigned long iomem_addr;
-
-       switch (cpu->regs->trapnum) {
-       case 13: /* We've intercepted a General Protection Fault. */
-               /* Hand to Launcher to emulate those pesky IN and OUT insns */
-               if (cpu->regs->errcode == 0) {
-                       setup_emulate_insn(cpu);
-                       return;
-               }
-               break;
-       case 14: /* We've intercepted a Page Fault. */
-               /*
-                * The Guest accessed a virtual address that wasn't mapped.
-                * This happens a lot: we don't actually set up most of the page
-                * tables for the Guest at all when we start: as it runs it asks
-                * for more and more, and we set them up as required. In this
-                * case, we don't even tell the Guest that the fault happened.
-                *
-                * The errcode tells whether this was a read or a write, and
-                * whether kernel or userspace code.
-                */
-               if (demand_page(cpu, cpu->arch.last_pagefault,
-                               cpu->regs->errcode, &iomem_addr))
-                       return;
-
-               /* Was this an access to memory mapped IO? */
-               if (iomem_addr) {
-                       /* Tell Launcher, let it handle it. */
-                       setup_iomem_insn(cpu, iomem_addr);
-                       return;
-               }
-
-               /*
-                * OK, it's really not there (or not OK): the Guest needs to
-                * know.  We write out the cr2 value so it knows where the
-                * fault occurred.
-                *
-                * Note that if the Guest were really messed up, this could
-                * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
-                * lg->lguest_data could be NULL
-                */
-               if (cpu->lg->lguest_data &&
-                   put_user(cpu->arch.last_pagefault,
-                            &cpu->lg->lguest_data->cr2))
-                       kill_guest(cpu, "Writing cr2");
-               break;
-       case 7: /* We've intercepted a Device Not Available fault. */
-               /* No special handling is needed here. */
-               break;
-       case 32 ... 255:
-               /* This might be a syscall. */
-               if (could_be_syscall(cpu->regs->trapnum))
-                       break;
-
-               /*
-                * Other values mean a real interrupt occurred, in which case
-                * the Host handler has already been run. We just do a
-                * friendly check if another process should now be run, then
-                * return to run the Guest again.
-                */
-               cond_resched();
-               return;
-       case LGUEST_TRAP_ENTRY:
-               /*
-                * Our 'struct hcall_args' maps directly over our regs: we set
-                * up the pointer now to indicate a hypercall is pending.
-                */
-               cpu->hcall = (struct hcall_args *)cpu->regs;
-               return;
-       }
-
-       /* We didn't handle the trap, so it needs to go to the Guest. */
-       if (!deliver_trap(cpu, cpu->regs->trapnum))
-               /*
-                * If the Guest doesn't have a handler (either it hasn't
-                * registered any yet, or it's one of the faults we don't let
-                * it handle), it dies with this cryptic error message.
-                */
-               kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
-                          cpu->regs->trapnum, cpu->regs->eip,
-                          cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
-                          : cpu->regs->errcode);
-}
-
-/*
- * Now we can look at each of the routines this calls, in increasing order of
- * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
- * deliver_trap() and demand_page().  After all those, we'll be ready to
- * examine the Switcher, and our philosophical understanding of the Host/Guest
- * duality will be complete.
-:*/
-/*
- * Set (non-NULL @on) or clear (NULL @on) the PGE bit in cr4 on the CPU this
- * runs on.  It takes a void pointer so that it can be passed to
- * on_each_cpu().
- */
-static void adjust_pge(void *on)
-{
-       if (!on) {
-               cr4_clear_bits(X86_CR4_PGE);
-               return;
-       }
-
-       cr4_set_bits(X86_CR4_PGE);
-}
-
-/*H:020
- * Now the Switcher is mapped and everything else is ready, we need to do
- * some more i386-specific initialization.
- */
-void __init lguest_arch_host_init(void)
-{
-       int i;
-
-       /*
-        * Most of the x86/switcher_32.S doesn't care that it's been moved; on
-        * Intel, jumps are relative, and it doesn't access any references to
-        * external code or data.
-        *
-        * The only exception is the interrupt handlers in switcher.S: their
-        * addresses are placed in a table (default_idt_entries), so we need to
-        * update the table with the new addresses.  switcher_offset() is a
-        * convenience function which returns the distance between the
-        * compiled-in switcher code and the high-mapped copy we just made.
-        */
-       for (i = 0; i < IDT_ENTRIES; i++)
-               default_idt_entries[i] += switcher_offset();
-
-       /*
-        * Set up the Switcher's per-cpu areas.
-        *
-        * Each CPU gets two pages of its own within the high-mapped region
-        * (aka. "struct lguest_pages").  Much of this can be initialized now,
-        * but some depends on what Guest we are running (which is set up in
-        * copy_in_guest_info()).
-        */
-       for_each_possible_cpu(i) {
-               /* lguest_pages() returns this CPU's two pages. */
-               struct lguest_pages *pages = lguest_pages(i);
-               /* This is a convenience pointer to make the code neater. */
-               struct lguest_ro_state *state = &pages->state;
-
-               /*
-                * The Global Descriptor Table: the Host has a different one
-                * for each CPU.  We keep a descriptor for the GDT which says
-                * where it is and how big it is (the size is actually the last
-                * byte, not the size, hence the "-1").
-                */
-               state->host_gdt_desc.size = GDT_SIZE-1;
-               state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i);
-
-               /*
-                * All CPUs on the Host use the same Interrupt Descriptor
-                * Table, so we just use store_idt(), which gets this CPU's IDT
-                * descriptor.
-                */
-               store_idt(&state->host_idt_desc);
-
-               /*
-                * The descriptors for the Guest's GDT and IDT can be filled
-                * out now, too.  We copy the GDT & IDT into ->guest_gdt and
-                * ->guest_idt before actually running the Guest.
-                */
-               state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
-               state->guest_idt_desc.address = (long)&state->guest_idt;
-               state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
-               state->guest_gdt_desc.address = (long)&state->guest_gdt;
-
-               /*
-                * We know where we want the stack to be when the Guest enters
-                * the Switcher: in pages->regs.  The stack grows downwards, so
-                * we start it at the end of that structure.
-                */
-               state->guest_tss.sp0 = (long)(&pages->regs + 1);
-               /*
-                * And this is the GDT entry to use for the stack: we keep a
-                * couple of special LGUEST entries.
-                */
-               state->guest_tss.ss0 = LGUEST_DS;
-
-               /*
-                * x86 can have a finegrained bitmap which indicates what I/O
-                * ports the process can use.  We set it to the end of our
-                * structure, meaning "none".
-                */
-               state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
-
-               /*
-                * Some GDT entries are the same across all Guests, so we can
-                * set them up now.
-                */
-               setup_default_gdt_entries(state);
-               /* Most IDT entries are the same for all Guests, too.*/
-               setup_default_idt_entries(state, default_idt_entries);
-
-               /*
-                * The Host needs to be able to use the LGUEST segments on this
-                * CPU, too, so put them in the Host GDT.
-                */
-               get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
-               get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
-       }
-
-       /*
-        * In the Switcher, we want the %cs segment register to use the
-        * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
-        * it will be undisturbed when we switch.  To change %cs and jump we
-        * need this structure to feed to Intel's "lcall" instruction.
-        */
-       lguest_entry.offset = (long)switch_to_guest + switcher_offset();
-       lguest_entry.segment = LGUEST_CS;
-
-       /*
-        * Finally, we need to turn off "Page Global Enable".  PGE is an
-        * optimization where page table entries are specially marked to show
-        * they never change.  The Host kernel marks all the kernel pages this
-        * way because it's always present, even when userspace is running.
-        *
-        * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
-        * switch to the Guest kernel.  If you don't disable this on all CPUs,
-        * you'll get really weird bugs that you'll chase for two days.
-        *
-        * I used to turn PGE off every time we switched to the Guest and back
-        * on when we return, but that slowed the Switcher down noticeably.
-        */
-
-       /*
-        * We don't need the complexity of CPUs coming and going while we're
-        * doing this.
-        */
-       get_online_cpus();
-       if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */
-               /* Remember that this was originally set (for cleanup). */
-               cpu_had_pge = 1;
-               /*
-                * adjust_pge is a helper function which sets or unsets the PGE
-                * bit on its CPU, depending on the argument (0 == unset).
-                */
-               on_each_cpu(adjust_pge, (void *)0, 1);
-               /* Turn off the feature in the global feature set. */
-               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
-       }
-       put_online_cpus();
-}
-/*:*/
-
-void __exit lguest_arch_host_fini(void)
-{
-       /*
-        * This undoes the PGE change made by lguest_arch_host_init(): if we
-        * had PGE before we started (cpu_had_pge), turn it back on now.
-        */
-       get_online_cpus();
-       if (cpu_had_pge) {
-               set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
-               /* adjust_pge's argument "1" means set PGE. */
-               on_each_cpu(adjust_pge, (void *)1, 1);
-       }
-       put_online_cpus();
-}
-
-
-/*H:122
- * The i386-specific hypercalls simply farm out to the right functions.
- *
- * args->arg0 selects the hypercall and arg1..arg3 carry its parameters.
- * Returns 0 once the call has been dispatched, or -EIO for a hypercall
- * number which is not handled here.
- */
-int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
-{
-       switch (args->arg0) {
-       case LHCALL_LOAD_GDT_ENTRY:
-               load_guest_gdt_entry(cpu, args->arg1, args->arg2, args->arg3);
-               return 0;
-       case LHCALL_LOAD_IDT_ENTRY:
-               load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
-               return 0;
-       case LHCALL_LOAD_TLS:
-               guest_load_tls(cpu, args->arg1);
-               return 0;
-       }
-
-       /* Bad Guest.  Bad! */
-       return -EIO;
-}
-
-/*H:126
- * i386-specific hypercall initialization.
- *
- * The Guest's only argument (cpu->hcall->arg1) is the address of its "struct
- * lguest_data".  Returns 0 on success, or -EFAULT if that address fails the
- * lguest_address_ok() check or the structure cannot be written.
- */
-int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
-{
-       /* Stays 0 unless the TSC passes the checks further down. */
-       u32 khz = 0;
-
-       /* Check the address, and the whole structure behind it, first. */
-       if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
-                              sizeof(*cpu->lg->lguest_data)))
-               return -EFAULT;
-
-       /*
-        * Having checked it, lg->lguest_data is pointed straight into the
-        * Launcher's memory at the right place, and from now on it is
-        * accessed with copy_to_user/from_user instead of lgread/write.  An
-        * optimization of admittedly doubtful value.
-        */
-       cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
-
-       /*
-        * We only pass on a TSC speed when the Time Stamp Counter does not
-        * change with cpu frequency and the kernel has not found it to be
-        * unstable; time going backwards is bad for users.
-        */
-       if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
-               khz = tsc_khz;
-
-       if (put_user(khz, &cpu->lg->lguest_data->tsc_khz))
-               return -EFAULT;
-
-       /* The interrupt code might not like the system call vector. */
-       if (!check_syscall_vector(cpu->lg))
-               kill_guest(cpu, "bad syscall vector");
-
-       return 0;
-}
-/*:*/
-
-/*L:030
- * Give a new Guest CPU its initial registers; @start is where it begins
- * executing.  The structure was allocated with get_zeroed_page(), so every
- * register we leave alone here is 0.
- */
-void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
-{
-       struct lguest_regs *r = cpu->regs;
-
-       /*
-        * The Guest needs four segment registers to boot.  The code segment
-        * (cs) is the kernel code segment __KERNEL_CS; the data, extra and
-        * stack segments (ds, es, ss) are the kernel data segment
-        * __KERNEL_DS.  The privilege level lives in the low bits, and the
-        * Guest runs at privilege level 1 (GUEST_PL).
-        */
-       r->ss = __KERNEL_DS|GUEST_PL;
-       r->es = __KERNEL_DS|GUEST_PL;
-       r->ds = __KERNEL_DS|GUEST_PL;
-       r->cs = __KERNEL_CS|GUEST_PL;
-
-       /*
-        * In "eflags", bit 1 (0x002) is supposed to always be "1" and bit 9
-        * (0x200) enables interrupts, which we always leave enabled while
-        * running the Guest.
-        */
-       r->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
-
-       /* The "Extended Instruction Pointer": where the Guest is running. */
-       r->eip = start;
-
-       /*
-        * %esi is left at 0 on purpose: it points to our boot information,
-        * which is at physical address 0.
-        */
-
-       /* There are a couple of GDT entries the Guest expects at boot. */
-       setup_guest_gdt(cpu);
-}
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S

deleted file mode 100644 (file)

index 40634b0..0000000
--- a/drivers/lguest/x86/switcher_32.S
+++ /dev/null
@@ -1,388 +0,0 @@
-/*P:900
- * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride
- * both the Host and Guest to do the low-level Guest<->Host switch.  It is as
- * simple as it can be made, but it's naturally very specific to x86.
- *
- * You have now completed Preparation.  If this has whet your appetite; if you
- * are feeling invigorated and refreshed then the next, more challenging stage
- * can be found in "make Guest".
- :*/
-
-/*M:012
- * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
- * gain at least 1% more performance.  Since neither LOC nor performance can be
- * measured beforehand, it generally means implementing a feature then deciding
- * if it's worth it.  And once it's implemented, who can say no?
- *
- * This is why I haven't implemented this idea myself.  I want to, but I
- * haven't.  You could, though.
- *
- * The main place where lguest performance sucks is Guest page faulting.  When
- * a Guest userspace process hits an unmapped page we switch back to the Host,
- * walk the page tables, find it's not mapped, switch back to the Guest page
- * fault handler, which calls a hypercall to set the page table entry, then
- * finally returns to userspace.  That's two round-trips.
- *
- * If we had a small walker in the Switcher, we could quickly check the Guest
- * page table and if the page isn't mapped, immediately reflect the fault back
- * into the Guest.  This means the Switcher would have to know the top of the
- * Guest page table and the page fault handler address.
- *
- * For simplicity, the Guest should only handle the case where the privilege
- * level of the fault is 3 and probably only not present or write faults.  It
- * should also detect recursive faults, and hand the original fault to the
- * Host (which is actually really easy).
- *
- * Two questions remain.  Would the performance gain outweigh the complexity?
- * And who would write the verse documenting it?
-:*/
-
-/*M:011
- * Lguest64 handles NMI.  This gave me NMI envy (until I looked at their
- * code).  It's worth doing though, since it would let us use oprofile in the
- * Host when a Guest is running.
-:*/
-
-/*S:100
- * Welcome to the Switcher itself!
- *
- * This file contains the low-level code which changes the CPU to run the Guest
- * code, and returns to the Host when something happens.  Understand this, and
- * you understand the heart of our journey.
- *
- * Because this is in assembler rather than C, our tale switches from prose to
- * verse.  First I tried limericks:
- *
- *     There once was an eax reg,
- *     To which our pointer was fed,
- *     It needed an add,
- *     Which asm-offsets.h had
- *     But this limerick is hurting my head.
- *
- * Next I tried haikus, but fitting the required reference to the seasons in
- * every stanza was quickly becoming tiresome:
- *
- *     The %eax reg
- *     Holds "struct lguest_pages" now:
- *     Cherry blossoms fall.
- *
- * Then I started with Heroic Verse, but the rhyming requirement leeched away
- * the content density and led to some uniquely awful oblique rhymes:
- *
- *     These constants are coming from struct offsets
- *     For use within the asm switcher text.
- *
- * Finally, I settled for something between heroic hexameter, and normal prose
- * with inappropriate linebreaks.  Anyway, it aint no Shakespeare.
- */
-
-// Not all kernel headers work from assembler
-// But these ones are needed: the ENTRY() define
-// And constants extracted from struct offsets
-// To avoid magic numbers and breakage:
-// Should they change the compiler can't save us
-// Down here in the depths of assembler code.
-#include <linux/linkage.h>
-#include <asm/asm-offsets.h>
-#include <asm/page.h>
-#include <asm/segment.h>
-#include <asm/lguest.h>
-
-// We mark the start of the code to copy
-// It's placed in .text tho it's never run here
-// You'll see the trick macro at the end
-// Which interleaves data and text to effect.
-.text
-ENTRY(start_switcher_text)
-
-// When we reach switch_to_guest we have just left
-// The safe and comforting shores of C code
-// %eax has the "struct lguest_pages" to use
-// Where we save state and still see it from the Guest
-// And %ebx holds the Guest shadow pagetable:
-// Once set we have truly left Host behind.
-ENTRY(switch_to_guest)
-       // We told gcc all its regs could fade,
-       // Clobbered by our journey into the Guest
-       // We could have saved them, if we tried
-       // But time is our master and cycles count.
-
-       // Segment registers must be saved for the Host
-       // We push them on the Host stack for later
-       pushl   %es
-       pushl   %ds
-       pushl   %gs
-       pushl   %fs
-       // But the compiler is fickle, and heeds
-       // No warning of %ebp clobbers
-       // When frame pointers are used.  That register
-       // Must be saved and restored or chaos strikes.
-       pushl   %ebp
-       // The Host's stack is done, now save it away
-       // In our "struct lguest_pages" at offset
-       // Distilled into asm-offsets.h
-       movl    %esp, LGUEST_PAGES_host_sp(%eax)
-
-       // All saved and there's now five steps before us:
-       // Stack, GDT, IDT, TSS
-       // Then last of all the page tables are flipped.
-
-       // Yet beware that our stack pointer must be
-       // Always valid lest an NMI hits
-       // %edx does the duty here as we juggle
-       // %eax is lguest_pages: our stack lies within.
-       movl    %eax, %edx
-       addl    $LGUEST_PAGES_regs, %edx
-       movl    %edx, %esp
-
-       // The Guest's GDT we so carefully
-       // Placed in the "struct lguest_pages" before
-       lgdt    LGUEST_PAGES_guest_gdt_desc(%eax)
-
-       // The Guest's IDT we did partially
-       // Copy to "struct lguest_pages" as well.
-       lidt    LGUEST_PAGES_guest_idt_desc(%eax)
-
-       // The TSS entry which controls traps
-       // Must be loaded up with "ltr" now:
-       // The GDT entry that TSS uses 
-       // Changes type when we load it: damn Intel!
-       // For after we switch over our page tables
-       // That entry will be read-only: we'd crash.
-       movl    $(GDT_ENTRY_TSS*8), %edx
-       ltr     %dx
-
-       // Look back now, before we take this last step!
-       // The Host's TSS entry was also marked used;
-       // Let's clear it again for our return.
-       // The GDT descriptor of the Host
-       // Points to the table after two "size" bytes
-       movl    (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
-       // Clear "used" from type field (byte 5, bit 2)
-       andb    $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
-
-       // Once our page table's switched, the Guest is live!
-       // The Host fades as we run this final step.
-       // Our "struct lguest_pages" is now read-only.
-       movl    %ebx, %cr3
-
-       // The page table change did one tricky thing:
-       // The Guest's register page has been mapped
-       // Writable under our %esp (stack) --
-       // We can simply pop off all Guest regs.
-       popl    %eax
-       popl    %ebx
-       popl    %ecx
-       popl    %edx
-       popl    %esi
-       popl    %edi
-       popl    %ebp
-       popl    %gs
-       popl    %fs
-       popl    %ds
-       popl    %es
-
-       // Near the base of the stack lurk two strange fields
-       // Which we fill as we exit the Guest
-       // These are the trap number and its error
-       // We can simply step past them on our way.
-       addl    $8, %esp
-
-       // The last five stack slots hold return address
-       // And everything needed to switch privilege
-       // From Switcher's level 0 to Guest's 1,
-       // And the stack where the Guest had last left it.
-       // Interrupts are turned back on: we are Guest.
-       iret
-
-// We tread two paths to switch back to the Host
-// Yet both must save Guest state and restore Host
-// So we put the routine in a macro.
-#define SWITCH_TO_HOST                                                 \
-       /* We save the Guest state: all registers first                 \
-        * Laid out just as "struct lguest_regs" defines */             \
-       pushl   %es;                                                    \
-       pushl   %ds;                                                    \
-       pushl   %fs;                                                    \
-       pushl   %gs;                                                    \
-       pushl   %ebp;                                                   \
-       pushl   %edi;                                                   \
-       pushl   %esi;                                                   \
-       pushl   %edx;                                                   \
-       pushl   %ecx;                                                   \
-       pushl   %ebx;                                                   \
-       pushl   %eax;                                                   \
-       /* Our stack and our code are using segments                    \
-        * Set in the TSS and IDT                                       \
-        * Yet if we were to touch data we'd use                        \
-        * Whatever data segment the Guest had.                         \
-        * Load the lguest ds segment for now. */                       \
-       movl    $(LGUEST_DS), %eax;                                     \
-       movl    %eax, %ds;                                              \
-       /* So where are we?  Which CPU, which struct?                   \
-        * The stack is our clue: our TSS starts                        \
-        * It at the end of "struct lguest_pages".                      \
-        * Or we may have stumbled while restoring                      \
-        * Our Guest segment regs while in switch_to_guest,             \
-        * The fault pushed atop that part-unwound stack.               \
-        * If we round the stack down to the page start                 \
-        * We're at the start of "struct lguest_pages". */              \
-       movl    %esp, %eax;                                             \
-       andl    $(~(1 << PAGE_SHIFT - 1)), %eax;                        \
-       /* Save our trap number: the switch will obscure it             \
-        * (In the Host the Guest regs are not mapped here)             \
-        * %ebx holds it safe for deliver_to_host */                    \
-       movl    LGUEST_PAGES_regs_trapnum(%eax), %ebx;                  \
-       /* The Host GDT, IDT and stack!                                 \
-        * All these lie safely hidden from the Guest:                  \
-        * We must return to the Host page tables                       \
-        * (Hence that was saved in struct lguest_pages) */             \
-       movl    LGUEST_PAGES_host_cr3(%eax), %edx;                      \
-       movl    %edx, %cr3;                                             \
-       /* As before, when we looked back at the Host                   \
-        * As we left and marked TSS unused                             \
-        * So must we now for the Guest left behind. */                 \
-       andb    $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
-       /* Switch to Host's GDT, IDT. */                                \
-       lgdt    LGUEST_PAGES_host_gdt_desc(%eax);                       \
-       lidt    LGUEST_PAGES_host_idt_desc(%eax);                       \
-       /* Restore the Host's stack where its saved regs lie */         \
-       movl    LGUEST_PAGES_host_sp(%eax), %esp;                       \
-       /* Last the TSS: our Host is returned */                        \
-       movl    $(GDT_ENTRY_TSS*8), %edx;                               \
-       ltr     %dx;                                                    \
-       /* Restore now the regs saved right at the first. */            \
-       popl    %ebp;                                                   \
-       popl    %fs;                                                    \
-       popl    %gs;                                                    \
-       popl    %ds;                                                    \
-       popl    %es
-
-// The first path is trod when the Guest has trapped:
-// (Which trap it was has been pushed on the stack).
-// We need only switch back, and the Host will decode
-// Why we came home, and what needs to be done.
-return_to_host:
-       SWITCH_TO_HOST
-       iret
-
-// We are lead to the second path like so:
-// An interrupt, with some cause external
-// Has ajerked us rudely from the Guest's code
-// Again we must return home to the Host
-deliver_to_host:
-       SWITCH_TO_HOST
-       // But now we must go home via that place
-       // Where that interrupt was supposed to go
-       // Had we not been ensconced, running the Guest.
-       // Here we see the trickness of run_guest_once():
-       // The Host stack is formed like an interrupt
-       // With EIP, CS and EFLAGS layered.
-       // Interrupt handlers end with "iret"
-       // And that will take us home at long long last.
-
-       // But first we must find the handler to call!
-       // The IDT descriptor for the Host
-       // Has two bytes for size, and four for address:
-       // %edx will hold it for us for now.
-       movl    (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
-       // We now know the table address we need,
-       // And saved the trap's number inside %ebx.
-       // Yet the pointer to the handler is smeared
-       // Across the bits of the table entry.
-       // What oracle can tell us how to extract
-       // From such a convoluted encoding?
-       // I consulted gcc, and it gave
-       // These instructions, which I gladly credit:
-       leal    (%edx,%ebx,8), %eax
-       movzwl  (%eax),%edx
-       movl    4(%eax), %eax
-       xorw    %ax, %ax
-       orl     %eax, %edx
-       // Now the address of the handler's in %edx
-       // We call it now: its "iret" drops us home.
-       jmp     *%edx
-
-// Every interrupt can come to us here
-// But we must truly tell each apart.
-// They number two hundred and fifty six
-// And each must land in a different spot,
-// Push its number on stack, and join the stream.
-
-// And worse, a mere six of the traps stand apart
-// And push on their stack an addition:
-// An error number, thirty two bits long
-// So we punish the other two fifty
-// And make them push a zero so they match.
-
-// Yet two fifty six entries is long
-// And all will look most the same as the last
-// So we create a macro which can make
-// As many entries as we need to fill.
-
-// Note the change to .data then .text:
-// We plant the address of each entry
-// Into a (data) table for the Host
-// To know where each Guest interrupt should go.
-.macro IRQ_STUB N TARGET
-       .data; .long 1f; .text; 1:
- // Trap eight, ten through fourteen and seventeen
- // Supply an error number.  Else zero.
- .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
-       pushl   $0
- .endif
-       pushl   $\N
-       jmp     \TARGET
-       ALIGN
-.endm
-
-// This macro creates numerous entries
-// Using GAS macros which out-power C's.
-.macro IRQ_STUBS FIRST LAST TARGET
- irq=\FIRST
- .rept \LAST-\FIRST+1
-       IRQ_STUB irq \TARGET
-  irq=irq+1
- .endr
-.endm
-
-// Here's the marker for our pointer table
-// Laid in the data section just before
-// Each macro places the address of code
-// Forming an array: each one points to text
-// Which handles interrupt in its turn.
-.data
-.global default_idt_entries
-default_idt_entries:
-.text
-       // The first two traps go straight back to the Host
-       IRQ_STUBS 0 1 return_to_host
-       // We'll say nothing, yet, about NMI
-       IRQ_STUB 2 handle_nmi
-       // Other traps also return to the Host
-       IRQ_STUBS 3 31 return_to_host
-       // All interrupts go via their handlers
-       IRQ_STUBS 32 127 deliver_to_host
-       // 'Cept system calls coming from userspace
-       // Are to go to the Guest, never the Host.
-       IRQ_STUB 128 return_to_host
-       IRQ_STUBS 129 255 deliver_to_host
-
-// The NMI, what a fabulous beast
-// Which swoops in and stops us no matter that
-// We're suspended between heaven and hell,
-// (Or more likely between the Host and Guest)
-// When in it comes!  We are dazed and confused
-// So we do the simplest thing which one can.
-// Though we've pushed the trap number and zero
-// We discard them, return, and hope we live.
-handle_nmi:
-       addl    $8, %esp
-       iret
-
-// We are done; all that's left is Mastery
-// And "make Mastery" is a journey long
-// Designed to make your fingers itch to code.
-
-// Here ends the text, the file and poem.
-ENTRY(end_switcher_text)
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig

index 83a1616903f8ab05f9e70419482169de746d0a5b..aba0d652095b0406b5d73159d83f5c7ef7034ea8 100644 (file)
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -333,7 +333,7 @@ config VIRTIO_NET
         depends on VIRTIO
         ---help---
           This is the virtual network driver for virtio.  It can be used with
-         lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
+         QEMU based VMMs (like KVM or Xen).  Say Y or M.
  
  config NLMON
         tristate "Virtual netlink monitoring device"
diff --git a/drivers/tty/hvc/Kconfig b/drivers/tty/hvc/Kconfig

index b8d5ea0ae26b49e8814ac2f187a708a77a23f92f..fec457edad14800c5ed07476afe5f2a9986478c8 100644 (file)
--- a/drivers/tty/hvc/Kconfig
+++ b/drivers/tty/hvc/Kconfig
@@ -4,7 +4,7 @@ config HVC_DRIVER
         bool
         help
           Generic "hypervisor virtual console" infrastructure for various
-         hypervisors (pSeries, iSeries, Xen, lguest).
+         hypervisors (pSeries, iSeries, Xen).
           It will automatically be selected if one of the back-end console drivers
           is selected.
  
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig

index 623f72334fa55727f175ddd1f32358c3c6fc223e..cff773f15b7e2e6bbabe904c8370206e4a1714b9 100644 (file)
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -2,8 +2,8 @@ config VIRTIO
         tristate
         ---help---
           This option is selected by any driver which implements the virtio
-         bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
-         CONFIG_RPMSG or CONFIG_S390_GUEST.
+         bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG
+         or CONFIG_S390_GUEST.
  
  menu "Virtio drivers"
  
diff --git a/include/linux/lguest.h b/include/linux/lguest.h

deleted file mode 100644 (file)

index 6db19f3..0000000
--- a/include/linux/lguest.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Things the lguest guest needs to know.  Note: like all lguest interfaces,
- * this is subject to wild and random change between versions.
- */
-#ifndef _LINUX_LGUEST_H
-#define _LINUX_LGUEST_H
-
-#ifndef __ASSEMBLY__
-#include <linux/time.h>
-#include <asm/irq.h>
-#include <asm/lguest_hcall.h>
-
-#define LG_CLOCK_MIN_DELTA     100UL
-#define LG_CLOCK_MAX_DELTA     ULONG_MAX
-
-/*G:031
- * The second method of communicating with the Host is to via "struct
- * lguest_data".  Once the Guest's initialization hypercall tells the Host where
- * this is, the Guest and Host both publish information in it.
-:*/
-struct lguest_data {
-       /*
-        * 512 == enabled (same as eflags in normal hardware).  The Guest
-        * changes interrupts so often that a hypercall is too slow.
-        */
-       unsigned int irq_enabled;
-       /* Fine-grained interrupt disabling by the Guest */
-       DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS);
-
-       /*
-        * The Host writes the virtual address of the last page fault here,
-        * which saves the Guest a hypercall.  CR2 is the native register where
-        * this address would normally be found.
-        */
-       unsigned long cr2;
-
-       /* Wallclock time set by the Host. */
-       struct timespec time;
-
-       /*
-        * Interrupt pending set by the Host.  The Guest should do a hypercall
-        * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF).
-        */
-       int irq_pending;
-
-       /*
-        * Async hypercall ring.  Instead of directly making hypercalls, we can
-        * place them in here for processing the next time the Host wants.
-        * This batching can be quite efficient.
-        */
-
-       /* 0xFF == done (set by Host), 0 == pending (set by Guest). */
-       u8 hcall_status[LHCALL_RING_SIZE];
-       /* The actual registers for the hypercalls. */
-       struct hcall_args hcalls[LHCALL_RING_SIZE];
-
-/* Fields initialized by the Host at boot: */
-       /* Memory not to try to access */
-       unsigned long reserve_mem;
-       /* KHz for the TSC clock. */
-       u32 tsc_khz;
-
-/* Fields initialized by the Guest at boot: */
-       /* Instruction to suppress interrupts even if enabled */
-       unsigned long noirq_iret;
-       /* Address above which page tables are all identical. */
-       unsigned long kernel_address;
-       /* The vector to try to use for system calls (0x40 or 0x80). */
-       unsigned int syscall_vec;
-};
-extern struct lguest_data lguest_data;
-#endif /* __ASSEMBLY__ */
-#endif /* _LINUX_LGUEST_H */
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h

deleted file mode 100644 (file)

index acd5b12..0000000
--- a/include/linux/lguest_launcher.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#ifndef _LINUX_LGUEST_LAUNCHER
-#define _LINUX_LGUEST_LAUNCHER
-/* Everything the "lguest" userspace program needs to know. */
-#include <linux/types.h>
-
-/*D:010
- * Drivers
- *
- * The Guest needs devices to do anything useful.  Since we don't let it touch
- * real devices (think of the damage it could do!) we provide virtual devices.
- * We emulate a PCI bus with virtio devices on it; we used to have our own
- * lguest bus which was far simpler, but this tests the virtio 1.0 standard.
- *
- * Virtio devices are also used by kvm, so we can simply reuse their optimized
- * device drivers.  And one day when everyone uses virtio, my plan will be
- * complete.  Bwahahahah!
- */
-
-/* Write command first word is a request. */
-enum lguest_req
-{
-       LHREQ_INITIALIZE, /* + base, pfnlimit, start */
-       LHREQ_GETDMA, /* No longer used */
-       LHREQ_IRQ, /* + irq */
-       LHREQ_BREAK, /* No longer used */
-       LHREQ_EVENTFD, /* No longer used. */
-       LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */
-       LHREQ_SETREG, /* + offset within struct pt_regs, value. */
-       LHREQ_TRAP, /* + trap number to deliver to guest. */
-};
-
-/*
- * This is what read() of the lguest fd populates.  trap ==
- * LGUEST_TRAP_ENTRY for an LHCALL_NOTIFY (addr is the
- * argument), 14 for a page fault in the MMIO region (addr is
- * the trap address, insn is the instruction), or 13 for a GPF
- * (insn is the instruction).
- */
-struct lguest_pending {
-       __u8 trap;
-       __u8 insn[7];
-       __u32 addr;
-};
-#endif /* _LINUX_LGUEST_LAUNCHER */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h

index c07295969b7e134ec85bf58b72b100949a6462fa..6d5d5faa989b9247cba12e1e27ac26683dc2efca 100644 (file)
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -1,7 +1,7 @@
  #ifndef _UAPI_LINUX_VIRTIO_RING_H
  #define _UAPI_LINUX_VIRTIO_RING_H
-/* An interface for efficient virtio implementation, currently for use by KVM
- * and lguest, but hopefully others soon.  Do NOT change this since it will
+/* An interface for efficient virtio implementation, currently for use by KVM,
+ * but hopefully others soon.  Do NOT change this since it will
   * break existing servers and clients.
   *
   * This header is BSD licensed so anyone can use the definitions to implement
diff --git a/tools/Makefile b/tools/Makefile

index 221e1ce78b06bb43bb344b36e01cd6dd72b10feb..a19b176b914b0d7f1ef3182c306164402dab71f4 100644 (file)
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -18,7 +18,6 @@ help:
         @echo '  iio                    - IIO tools'
         @echo '  kvm_stat               - top-like utility for displaying kvm statistics'
         @echo '  leds                   - LEDs  tools'
-       @echo '  lguest                 - a minimal 32-bit x86 hypervisor'
         @echo '  liblockdep             - user-space wrapper for kernel locking-validator'
         @echo '  net                    - misc networking tools'
         @echo '  perf                   - Linux performance measurement and analysis tool'
@@ -90,7 +89,7 @@ freefall: FORCE
  kvm_stat: FORCE
         $(call descend,kvm/$@)
  
-all: acpi cgroup cpupower gpio hv firewire lguest liblockdep \
+all: acpi cgroup cpupower gpio hv firewire liblockdep \
                 perf selftests turbostat usb \
                 virtio vm net x86_energy_perf_policy \
                 tmon freefall objtool kvm_stat
@@ -101,7 +100,7 @@ acpi_install:
  cpupower_install:
         $(call descend,power/$(@:_install=),install)
  
-cgroup_install firewire_install gpio_install hv_install lguest_install perf_install usb_install virtio_install vm_install net_install objtool_install:
+cgroup_install firewire_install gpio_install hv_install perf_install usb_install virtio_install vm_install net_install objtool_install:
         $(call descend,$(@:_install=),install)
  
  liblockdep_install:
@@ -123,7 +122,7 @@ kvm_stat_install:
         $(call descend,kvm/$(@:_install=),install)
  
  install: acpi_install cgroup_install cpupower_install gpio_install \
-               hv_install firewire_install lguest_install liblockdep_install \
+               hv_install firewire_install liblockdep_install \
                 perf_install selftests_install turbostat_install usb_install \
                 virtio_install vm_install net_install x86_energy_perf_policy_install \
                 tmon_install freefall_install objtool_install kvm_stat_install
@@ -134,7 +133,7 @@ acpi_clean:
  cpupower_clean:
         $(call descend,power/cpupower,clean)
  
-cgroup_clean hv_clean firewire_clean lguest_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean:
+cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean:
         $(call descend,$(@:_clean=),clean)
  
  liblockdep_clean:
@@ -168,7 +167,7 @@ freefall_clean:
  build_clean:
         $(call descend,build,clean)
  
-clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean lguest_clean \
+clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \
                 perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \
                 vm_clean net_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
                 freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \
diff --git a/tools/lguest/.gitignore b/tools/lguest/.gitignore

deleted file mode 100644 (file)

index 8d9a838..0000000
--- a/tools/lguest/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-lguest
-include
diff --git a/tools/lguest/Makefile b/tools/lguest/Makefile

deleted file mode 100644 (file)

index d04599a..0000000
--- a/tools/lguest/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-# This creates the demonstration utility "lguest" which runs a Linux guest.
-CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE -Iinclude
-
-all: lguest
-
-include/linux/virtio_types.h: ../../include/uapi/linux/virtio_types.h
-       mkdir -p include/linux 2>&1 || true
-       ln -sf ../../../../include/uapi/linux/virtio_types.h $@
-
-lguest: include/linux/virtio_types.h
-
-clean:
-       rm -f lguest
-       rm -rf include
diff --git a/tools/lguest/extract b/tools/lguest/extract

deleted file mode 100644 (file)

index 7730bb6..0000000
--- a/tools/lguest/extract
+++ /dev/null
@@ -1,58 +0,0 @@
-#! /bin/sh
-
-set -e
-
-PREFIX=$1
-shift
-
-trap 'rm -r $TMPDIR' 0
-TMPDIR=`mktemp -d`
-
-exec 3>/dev/null
-for f; do
-    while IFS="
-" read -r LINE; do
-       case "$LINE" in
-           *$PREFIX:[0-9]*:\**)
-               NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
-               if [ -f $TMPDIR/$NUM ]; then
-                   echo "$TMPDIR/$NUM already exits prior to $f"
-                   exit 1
-               fi
-               exec 3>>$TMPDIR/$NUM
-               echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
-               /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3
-               ;;
-           *$PREFIX:[0-9]*)
-               NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
-               if [ -f $TMPDIR/$NUM ]; then
-                   echo "$TMPDIR/$NUM already exits prior to $f"
-                   exit 1
-               fi
-               exec 3>>$TMPDIR/$NUM
-               echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
-               /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3
-               ;;
-           *:\**)
-               /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3
-               echo >&3
-               exec 3>/dev/null
-               ;;
-           *)
-               /bin/echo "$LINE" >&3
-               ;;
-       esac
-    done < $f
-    echo >&3
-    exec 3>/dev/null
-done
-
-LASTFILE=""
-for f in $TMPDIR/*; do
-    if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then
-       LASTFILE=$(cat $TMPDIR/.$(basename $f) )
-       echo "[ $LASTFILE ]"
-    fi
-    cat $f
-done
-
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c

deleted file mode 100644 (file)

index 897cd6f..0000000
--- a/tools/lguest/lguest.c
+++ /dev/null
@@ -1,3420 +0,0 @@
-/*P:100
- * This is the Launcher code, a simple program which lays out the "physical"
- * memory for the new Guest by mapping the kernel image and the virtual
- * devices, then opens /dev/lguest to tell the kernel about the Guest and
- * control it.
-:*/
-#define _LARGEFILE64_SOURCE
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <err.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <elf.h>
-#include <sys/mman.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/wait.h>
-#include <sys/eventfd.h>
-#include <fcntl.h>
-#include <stdbool.h>
-#include <errno.h>
-#include <ctype.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <time.h>
-#include <netinet/in.h>
-#include <net/if.h>
-#include <linux/sockios.h>
-#include <linux/if_tun.h>
-#include <sys/uio.h>
-#include <termios.h>
-#include <getopt.h>
-#include <assert.h>
-#include <sched.h>
-#include <limits.h>
-#include <stddef.h>
-#include <signal.h>
-#include <pwd.h>
-#include <grp.h>
-#include <sys/user.h>
-#include <linux/pci_regs.h>
-
-#ifndef VIRTIO_F_ANY_LAYOUT
-#define VIRTIO_F_ANY_LAYOUT            27
-#endif
-
-/*L:110
- * We can ignore the 43 include files we need for this program, but I do want
- * to draw attention to the use of kernel-style types.
- *
- * As Linus said, "C is a Spartan language, and so should your naming be."  I
- * like these abbreviations, so we define them here.  Note that u64 is always
- * unsigned long long, which works on all Linux systems: this means that we can
- * use %llu in printf for any u64.
- */
-typedef unsigned long long u64;
-typedef uint32_t u32;
-typedef uint16_t u16;
-typedef uint8_t u8;
-/*:*/
-
-#define VIRTIO_CONFIG_NO_LEGACY
-#define VIRTIO_PCI_NO_LEGACY
-#define VIRTIO_BLK_NO_LEGACY
-#define VIRTIO_NET_NO_LEGACY
-
-/* Use in-kernel ones, which defines VIRTIO_F_VERSION_1 */
-#include "../../include/uapi/linux/virtio_config.h"
-#include "../../include/uapi/linux/virtio_net.h"
-#include "../../include/uapi/linux/virtio_blk.h"
-#include "../../include/uapi/linux/virtio_console.h"
-#include "../../include/uapi/linux/virtio_rng.h"
-#include <linux/virtio_ring.h>
-#include "../../include/uapi/linux/virtio_pci.h"
-#include <asm/bootparam.h>
-#include "../../include/linux/lguest_launcher.h"
-
-#define BRIDGE_PFX "bridge:"
-#ifndef SIOCBRADDIF
-#define SIOCBRADDIF    0x89a2          /* add interface to bridge      */
-#endif
-/* We can have up to 256 pages for devices. */
-#define DEVICE_PAGES 256
-/* This will occupy 3 pages: it must be a power of 2. */
-#define VIRTQUEUE_NUM 256
-
-/*L:120
- * verbose is both a global flag and a macro.  The C preprocessor allows
- * this, and although I wouldn't recommend it, it works quite nicely here.
- */
-static bool verbose;
-#define verbose(args...) \
-       do { if (verbose) printf(args); } while(0)
-/*:*/
-
-/* The pointer to the start of guest memory. */
-static void *guest_base;
-/* The maximum guest physical address allowed, and maximum possible. */
-static unsigned long guest_limit, guest_max, guest_mmio;
-/* The /dev/lguest file descriptor. */
-static int lguest_fd;
-
-/* a per-cpu variable indicating whose vcpu is currently running */
-static unsigned int __thread cpu_id;
-
-/* 5 bit device number in the PCI_CONFIG_ADDR => 32 only */
-#define MAX_PCI_DEVICES 32
-
-/* This is our list of devices. */
-struct device_list {
-       /* Counter to assign interrupt numbers. */
-       unsigned int next_irq;
-
-       /* Counter to print out convenient device numbers. */
-       unsigned int device_num;
-
-       /* PCI devices. */
-       struct device *pci[MAX_PCI_DEVICES];
-};
-
-/* The list of Guest devices, based on command line arguments. */
-static struct device_list devices;
-
-/*
- * Just like struct virtio_pci_cfg_cap in uapi/linux/virtio_pci.h,
- * but uses a u32 explicitly for the data.
- */
-struct virtio_pci_cfg_cap_u32 {
-       struct virtio_pci_cap cap;
-       u32 pci_cfg_data; /* Data for BAR access. */
-};
-
-struct virtio_pci_mmio {
-       struct virtio_pci_common_cfg cfg;
-       u16 notify;
-       u8 isr;
-       u8 padding;
-       /* Device-specific configuration follows this. */
-};
-
-/* This is the layout (little-endian) of the PCI config space. */
-struct pci_config {
-       u16 vendor_id, device_id;
-       u16 command, status;
-       u8 revid, prog_if, subclass, class;
-       u8 cacheline_size, lat_timer, header_type, bist;
-       u32 bar[6];
-       u32 cardbus_cis_ptr;
-       u16 subsystem_vendor_id, subsystem_device_id;
-       u32 expansion_rom_addr;
-       u8 capabilities, reserved1[3];
-       u32 reserved2;
-       u8 irq_line, irq_pin, min_grant, max_latency;
-
-       /* Now, this is the linked capability list. */
-       struct virtio_pci_cap common;
-       struct virtio_pci_notify_cap notify;
-       struct virtio_pci_cap isr;
-       struct virtio_pci_cap device;
-       struct virtio_pci_cfg_cap_u32 cfg_access;
-};
-
-/* The device structure describes a single device. */
-struct device {
-       /* The name of this device, for --verbose. */
-       const char *name;
-
-       /* Any queues attached to this device */
-       struct virtqueue *vq;
-
-       /* Is it operational */
-       bool running;
-
-       /* Has it written FEATURES_OK but not re-checked it? */
-       bool wrote_features_ok;
-
-       /* PCI configuration */
-       union {
-               struct pci_config config;
-               u32 config_words[sizeof(struct pci_config) / sizeof(u32)];
-       };
-
-       /* Features we offer, and those accepted. */
-       u64 features, features_accepted;
-
-       /* Device-specific config hangs off the end of this. */
-       struct virtio_pci_mmio *mmio;
-
-       /* PCI MMIO resources (all in BAR0) */
-       size_t mmio_size;
-       u32 mmio_addr;
-
-       /* Device-specific data. */
-       void *priv;
-};
-
-/* The virtqueue structure describes a queue attached to a device. */
-struct virtqueue {
-       struct virtqueue *next;
-
-       /* Which device owns me. */
-       struct device *dev;
-
-       /* Name for printing errors. */
-       const char *name;
-
-       /* The actual ring of buffers. */
-       struct vring vring;
-
-       /* The information about this virtqueue (we only use queue_size on) */
-       struct virtio_pci_common_cfg pci_config;
-
-       /* Last available index we saw. */
-       u16 last_avail_idx;
-
-       /* How many are used since we sent last irq? */
-       unsigned int pending_used;
-
-       /* Eventfd where Guest notifications arrive. */
-       int eventfd;
-
-       /* Function for the thread which is servicing this virtqueue. */
-       void (*service)(struct virtqueue *vq);
-       pid_t thread;
-};
-
-/* Remember the arguments to the program so we can "reboot" */
-static char **main_args;
-
-/* The original tty settings to restore on exit. */
-static struct termios orig_term;
-
-/*
- * We have to be careful with barriers: our devices are all run in separate
- * threads and so we need to make sure that changes visible to the Guest happen
- * in precise order.
- */
-#define wmb() __asm__ __volatile__("" : : : "memory")
-#define rmb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
-#define mb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
-
-/* Wrapper for the last available index.  Makes it easier to change. */
-#define lg_last_avail(vq)      ((vq)->last_avail_idx)
-
-/*
- * The virtio configuration space is defined to be little-endian.  x86 is
- * little-endian too, but it's nice to be explicit so we have these helpers.
- */
-#define cpu_to_le16(v16) (v16)
-#define cpu_to_le32(v32) (v32)
-#define cpu_to_le64(v64) (v64)
-#define le16_to_cpu(v16) (v16)
-#define le32_to_cpu(v32) (v32)
-#define le64_to_cpu(v64) (v64)
-
-/*
- * A real device would ignore weird/non-compliant driver behaviour.  We
- * stop and flag it, to help debugging Linux problems.
- */
-#define bad_driver(d, fmt, ...) \
-       errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__)
-#define bad_driver_vq(vq, fmt, ...)                           \
-       errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \
-            vq->name, ## __VA_ARGS__)
-
-/* Is this iovec empty? */
-static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
-{
-       unsigned int i;
-
-       for (i = 0; i < num_iov; i++)
-               if (iov[i].iov_len)
-                       return false;
-       return true;
-}
-
-/* Take len bytes from the front of this iovec. */
-static void iov_consume(struct device *d,
-                       struct iovec iov[], unsigned num_iov,
-                       void *dest, unsigned len)
-{
-       unsigned int i;
-
-       for (i = 0; i < num_iov; i++) {
-               unsigned int used;
-
-               used = iov[i].iov_len < len ? iov[i].iov_len : len;
-               if (dest) {
-                       memcpy(dest, iov[i].iov_base, used);
-                       dest += used;
-               }
-               iov[i].iov_base += used;
-               iov[i].iov_len -= used;
-               len -= used;
-       }
-       if (len != 0)
-               bad_driver(d, "iovec too short!");
-}
-
-/*L:100
- * The Launcher code itself takes us out into userspace, that scary place where
- * pointers run wild and free!  Unfortunately, like most userspace programs,
- * it's quite boring (which is why everyone likes to hack on the kernel!).
- * Perhaps if you make up an Lguest Drinking Game at this point, it will get
- * you through this section.  Or, maybe not.
- *
- * The Launcher sets up a big chunk of memory to be the Guest's "physical"
- * memory and stores it in "guest_base".  In other words, Guest physical ==
- * Launcher virtual with an offset.
- *
- * This can be tough to get your head around, but usually it just means that we
- * use these trivial conversion functions when the Guest gives us its
- * "physical" addresses:
- */
-static void *from_guest_phys(unsigned long addr)
-{
-       return guest_base + addr;
-}
-
-static unsigned long to_guest_phys(const void *addr)
-{
-       return (addr - guest_base);
-}
-
-/*L:130
- * Loading the Kernel.
- *
- * We start with couple of simple helper routines.  open_or_die() avoids
- * error-checking code cluttering the callers:
- */
-static int open_or_die(const char *name, int flags)
-{
-       int fd = open(name, flags);
-       if (fd < 0)
-               err(1, "Failed to open %s", name);
-       return fd;
-}
-
-/* map_zeroed_pages() takes a number of pages. */
-static void *map_zeroed_pages(unsigned int num)
-{
-       int fd = open_or_die("/dev/zero", O_RDONLY);
-       void *addr;
-
-       /*
-        * We use a private mapping (ie. if we write to the page, it will be
-        * copied). We allocate an extra two pages PROT_NONE to act as guard
-        * pages against read/write attempts that exceed allocated space.
-        */
-       addr = mmap(NULL, getpagesize() * (num+2),
-                   PROT_NONE, MAP_PRIVATE, fd, 0);
-
-       if (addr == MAP_FAILED)
-               err(1, "Mmapping %u pages of /dev/zero", num);
-
-       if (mprotect(addr + getpagesize(), getpagesize() * num,
-                    PROT_READ|PROT_WRITE) == -1)
-               err(1, "mprotect rw %u pages failed", num);
-
-       /*
-        * One neat mmap feature is that you can close the fd, and it
-        * stays mapped.
-        */
-       close(fd);
-
-       /* Return address after PROT_NONE page */
-       return addr + getpagesize();
-}
-
-/* Get some bytes which won't be mapped into the guest. */
-static unsigned long get_mmio_region(size_t size)
-{
-       unsigned long addr = guest_mmio;
-       size_t i;
-
-       if (!size)
-               return addr;
-
-       /* Size has to be a power of 2 (and multiple of 16) */
-       for (i = 1; i < size; i <<= 1);
-
-       guest_mmio += i;
-
-       return addr;
-}
-
-/*
- * This routine is used to load the kernel or initrd.  It tries mmap, but if
- * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
- * it falls back to reading the memory in.
- */
-static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
-{
-       ssize_t r;
-
-       /*
-        * We map writable even though for some segments are marked read-only.
-        * The kernel really wants to be writable: it patches its own
-        * instructions.
-        *
-        * MAP_PRIVATE means that the page won't be copied until a write is
-        * done to it.  This allows us to share untouched memory between
-        * Guests.
-        */
-       if (mmap(addr, len, PROT_READ|PROT_WRITE,
-                MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
-               return;
-
-       /* pread does a seek and a read in one shot: saves a few lines. */
-       r = pread(fd, addr, len, offset);
-       if (r != len)
-               err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
-}
-
-/*
- * This routine takes an open vmlinux image, which is in ELF, and maps it into
- * the Guest memory.  ELF = Embedded Linking Format, which is the format used
- * by all modern binaries on Linux including the kernel.
- *
- * The ELF headers give *two* addresses: a physical address, and a virtual
- * address.  We use the physical address; the Guest will map itself to the
- * virtual address.
- *
- * We return the starting address.
- */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
-{
-       Elf32_Phdr phdr[ehdr->e_phnum];
-       unsigned int i;
-
-       /*
-        * Sanity checks on the main ELF header: an x86 executable with a
-        * reasonable number of correctly-sized program headers.
-        */
-       if (ehdr->e_type != ET_EXEC
-           || ehdr->e_machine != EM_386
-           || ehdr->e_phentsize != sizeof(Elf32_Phdr)
-           || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
-               errx(1, "Malformed elf header");
-
-       /*
-        * An ELF executable contains an ELF header and a number of "program"
-        * headers which indicate which parts ("segments") of the program to
-        * load where.
-        */
-
-       /* We read in all the program headers at once: */
-       if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
-               err(1, "Seeking to program headers");
-       if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
-               err(1, "Reading program headers");
-
-       /*
-        * Try all the headers: there are usually only three.  A read-only one,
-        * a read-write one, and a "note" section which we don't load.
-        */
-       for (i = 0; i < ehdr->e_phnum; i++) {
-               /* If this isn't a loadable segment, we ignore it */
-               if (phdr[i].p_type != PT_LOAD)
-                       continue;
-
-               verbose("Section %i: size %i addr %p\n",
-                       i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
-
-               /* We map this section of the file at its physical address. */
-               map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
-                      phdr[i].p_offset, phdr[i].p_filesz);
-       }
-
-       /* The entry point is given in the ELF header. */
-       return ehdr->e_entry;
-}
-
-/*L:150
- * A bzImage, unlike an ELF file, is not meant to be loaded.  You're supposed
- * to jump into it and it will unpack itself.  We used to have to perform some
- * hairy magic because the unpacking code scared me.
- *
- * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
- * a small patch to jump over the tricky bits in the Guest, so now we just read
- * the funky header so we know where in the file to load, and away we go!
- */
-static unsigned long load_bzimage(int fd)
-{
-       struct boot_params boot;
-       int r;
-       /* Modern bzImages get loaded at 1M. */
-       void *p = from_guest_phys(0x100000);
-
-       /*
-        * Go back to the start of the file and read the header.  It should be
-        * a Linux boot header (see Documentation/x86/boot.txt)
-        */
-       lseek(fd, 0, SEEK_SET);
-       read(fd, &boot, sizeof(boot));
-
-       /* Inside the setup_hdr, we expect the magic "HdrS" */
-       if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
-               errx(1, "This doesn't look like a bzImage to me");
-
-       /* Skip over the extra sectors of the header. */
-       lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
-
-       /* Now read everything into memory. in nice big chunks. */
-       while ((r = read(fd, p, 65536)) > 0)
-               p += r;
-
-       /* Finally, code32_start tells us where to enter the kernel. */
-       return boot.hdr.code32_start;
-}
-
-/*L:140
- * Loading the kernel is easy when it's a "vmlinux", but most kernels
- * come wrapped up in the self-decompressing "bzImage" format.  With a little
- * work, we can load those, too.
- */
-static unsigned long load_kernel(int fd)
-{
-       Elf32_Ehdr hdr;
-
-       /* Read in the first few bytes. */
-       if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
-               err(1, "Reading kernel");
-
-       /* If it's an ELF file, it starts with "\177ELF" */
-       if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
-               return map_elf(fd, &hdr);
-
-       /* Otherwise we assume it's a bzImage, and try to load it. */
-       return load_bzimage(fd);
-}
-
-/*
- * This is a trivial little helper to align pages.  Andi Kleen hated it because
- * it calls getpagesize() twice: "it's dumb code."
- *
- * Kernel guys get really het up about optimization, even when it's not
- * necessary.  I leave this code as a reaction against that.
- */
-static inline unsigned long page_align(unsigned long addr)
-{
-       /* Add upwards and truncate downwards. */
-       return ((addr + getpagesize()-1) & ~(getpagesize()-1));
-}
-
-/*L:180
- * An "initial ram disk" is a disk image loaded into memory along with the
- * kernel which the kernel can use to boot from without needing any drivers.
- * Most distributions now use this as standard: the initrd contains the code to
- * load the appropriate driver modules for the current machine.
- *
- * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
- * kernels.  He sent me this (and tells me when I break it).
- */
-static unsigned long load_initrd(const char *name, unsigned long mem)
-{
-       int ifd;
-       struct stat st;
-       unsigned long len;
-
-       ifd = open_or_die(name, O_RDONLY);
-       /* fstat() is needed to get the file size. */
-       if (fstat(ifd, &st) < 0)
-               err(1, "fstat() on initrd '%s'", name);
-
-       /*
-        * We map the initrd at the top of memory, but mmap wants it to be
-        * page-aligned, so we round the size up for that.
-        */
-       len = page_align(st.st_size);
-       map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
-       /*
-        * Once a file is mapped, you can close the file descriptor.  It's a
-        * little odd, but quite useful.
-        */
-       close(ifd);
-       verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
-
-       /* We return the initrd size. */
-       return len;
-}
-/*:*/
-
-/*
- * Simple routine to roll all the commandline arguments together with spaces
- * between them.
- */
-static void concat(char *dst, char *args[])
-{
-       unsigned int i, len = 0;
-
-       for (i = 0; args[i]; i++) {
-               if (i) {
-                       strcat(dst+len, " ");
-                       len++;
-               }
-               strcpy(dst+len, args[i]);
-               len += strlen(args[i]);
-       }
-       /* In case it's empty. */
-       dst[len] = '\0';
-}
-
-/*L:185
- * This is where we actually tell the kernel to initialize the Guest.  We
- * saw the arguments it expects when we looked at initialize() in lguest_user.c:
- * the base of Guest "physical" memory, the top physical page to allow and the
- * entry point for the Guest.
- */
-static void tell_kernel(unsigned long start)
-{
-       unsigned long args[] = { LHREQ_INITIALIZE,
-                                (unsigned long)guest_base,
-                                guest_limit / getpagesize(), start,
-                                (guest_mmio+getpagesize()-1) / getpagesize() };
-       verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n",
-               guest_base, guest_base + guest_limit,
-               guest_limit, guest_mmio);
-       lguest_fd = open_or_die("/dev/lguest", O_RDWR);
-       if (write(lguest_fd, args, sizeof(args)) < 0)
-               err(1, "Writing to /dev/lguest");
-}
-/*:*/
-
-/*L:200
- * Device Handling.
- *
- * When the Guest gives us a buffer, it sends an array of addresses and sizes.
- * We need to make sure it's not trying to reach into the Launcher itself, so
- * we have a convenient routine which checks it and exits with an error message
- * if something funny is going on:
- */
-static void *_check_pointer(struct device *d,
-                           unsigned long addr, unsigned int size,
-                           unsigned int line)
-{
-       /*
-        * Check if the requested address and size exceeds the allocated memory,
-        * or addr + size wraps around.
-        */
-       if ((addr + size) > guest_limit || (addr + size) < addr)
-               bad_driver(d, "%s:%i: Invalid address %#lx",
-                          __FILE__, line, addr);
-       /*
-        * We return a pointer for the caller's convenience, now we know it's
-        * safe to use.
-        */
-       return from_guest_phys(addr);
-}
-/* A macro which transparently hands the line number to the real function. */
-#define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__)
-
-/*
- * Each buffer in the virtqueues is actually a chain of descriptors.  This
- * function returns the next descriptor in the chain, or vq->vring.num if we're
- * at the end.
- */
-static unsigned next_desc(struct device *d, struct vring_desc *desc,
-                         unsigned int i, unsigned int max)
-{
-       unsigned int next;
-
-       /* If this descriptor says it doesn't chain, we're done. */
-       if (!(desc[i].flags & VRING_DESC_F_NEXT))
-               return max;
-
-       /* Check they're not leading us off end of descriptors. */
-       next = desc[i].next;
-       /* Make sure compiler knows to grab that: we don't want it changing! */
-       wmb();
-
-       if (next >= max)
-               bad_driver(d, "Desc next is %u", next);
-
-       return next;
-}
-
-/*
- * This actually sends the interrupt for this virtqueue, if we've used a
- * buffer.
- */
-static void trigger_irq(struct virtqueue *vq)
-{
-       unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line };
-
-       /* Don't inform them if nothing used. */
-       if (!vq->pending_used)
-               return;
-       vq->pending_used = 0;
-
-       /*
-        * 2.4.7.1:
-        *
-        *  If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
-        *    The driver MUST set flags to 0 or 1. 
-        */
-       if (vq->vring.avail->flags > 1)
-               bad_driver_vq(vq, "avail->flags = %u\n", vq->vring.avail->flags);
-
-       /*
-        * 2.4.7.2:
-        *
-        *  If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
-        *
-        *     - The device MUST ignore the used_event value.
-        *     - After the device writes a descriptor index into the used ring:
-        *         - If flags is 1, the device SHOULD NOT send an interrupt.
-        *         - If flags is 0, the device MUST send an interrupt.
-        */
-       if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
-               return;
-       }
-
-       /*
-        * 4.1.4.5.1:
-        *
-        *  If MSI-X capability is disabled, the device MUST set the Queue
-        *  Interrupt bit in ISR status before sending a virtqueue notification
-        *  to the driver.
-        */
-       vq->dev->mmio->isr = 0x1;
-
-       /* Send the Guest an interrupt tell them we used something up. */
-       if (write(lguest_fd, buf, sizeof(buf)) != 0)
-               err(1, "Triggering irq %i", vq->dev->config.irq_line);
-}
-
-/*
- * This looks in the virtqueue for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function waits if necessary, and returns the descriptor number found.
- */
-static unsigned wait_for_vq_desc(struct virtqueue *vq,
-                                struct iovec iov[],
-                                unsigned int *out_num, unsigned int *in_num)
-{
-       unsigned int i, head, max;
-       struct vring_desc *desc;
-       u16 last_avail = lg_last_avail(vq);
-
-       /*
-        * 2.4.7.1:
-        *
-        *   The driver MUST handle spurious interrupts from the device.
-        *
-        * That's why this is a while loop.
-        */
-
-       /* There's nothing available? */
-       while (last_avail == vq->vring.avail->idx) {
-               u64 event;
-
-               /*
-                * Since we're about to sleep, now is a good time to tell the
-                * Guest about what we've used up to now.
-                */
-               trigger_irq(vq);
-
-               /* OK, now we need to know about added descriptors. */
-               vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
-
-               /*
-                * They could have slipped one in as we were doing that: make
-                * sure it's written, then check again.
-                */
-               mb();
-               if (last_avail != vq->vring.avail->idx) {
-                       vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-                       break;
-               }
-
-               /* Nothing new?  Wait for eventfd to tell us they refilled. */
-               if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
-                       errx(1, "Event read failed?");
-
-               /* We don't need to be notified again. */
-               vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-       }
-
-       /* Check it isn't doing very strange things with descriptor numbers. */
-       if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
-               bad_driver_vq(vq, "Guest moved used index from %u to %u",
-                             last_avail, vq->vring.avail->idx);
-
-       /* 
-        * Make sure we read the descriptor number *after* we read the ring
-        * update; don't let the cpu or compiler change the order.
-        */
-       rmb();
-
-       /*
-        * Grab the next descriptor number they're advertising, and increment
-        * the index we've seen.
-        */
-       head = vq->vring.avail->ring[last_avail % vq->vring.num];
-       lg_last_avail(vq)++;
-
-       /* If their number is silly, that's a fatal mistake. */
-       if (head >= vq->vring.num)
-               bad_driver_vq(vq, "Guest says index %u is available", head);
-
-       /* When we start there are none of either input nor output. */
-       *out_num = *in_num = 0;
-
-       max = vq->vring.num;
-       desc = vq->vring.desc;
-       i = head;
-
-       /*
-        * We have to read the descriptor after we read the descriptor number,
-        * but there's a data dependency there so the CPU shouldn't reorder
-        * that: no rmb() required.
-        */
-
-       do {
-               /*
-                * If this is an indirect entry, then this buffer contains a
-                * descriptor table which we handle as if it's any normal
-                * descriptor chain.
-                */
-               if (desc[i].flags & VRING_DESC_F_INDIRECT) {
-                       /* 2.4.5.3.1:
-                        *
-                        *  The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
-                        *  flag unless the VIRTIO_F_INDIRECT_DESC feature was
-                        *  negotiated.
-                        */
-                       if (!(vq->dev->features_accepted &
-                             (1<<VIRTIO_RING_F_INDIRECT_DESC)))
-                               bad_driver_vq(vq, "vq indirect not negotiated");
-
-                       /*
-                        * 2.4.5.3.1:
-                        *
-                        *   The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
-                        *   flag within an indirect descriptor (ie. only one
-                        *   table per descriptor).
-                        */
-                       if (desc != vq->vring.desc)
-                               bad_driver_vq(vq, "Indirect within indirect");
-
-                       /*
-                        * Proposed update VIRTIO-134 spells this out:
-                        *
-                        *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
-                        *   and VIRTQ_DESC_F_NEXT in flags.
-                        */
-                       if (desc[i].flags & VRING_DESC_F_NEXT)
-                               bad_driver_vq(vq, "indirect and next together");
-
-                       if (desc[i].len % sizeof(struct vring_desc))
-                               bad_driver_vq(vq,
-                                             "Invalid size for indirect table");
-                       /*
-                        * 2.4.5.3.2:
-                        *
-                        *  The device MUST ignore the write-only flag
-                        *  (flags&VIRTQ_DESC_F_WRITE) in the descriptor that
-                        *  refers to an indirect table.
-                        *
-                        * We ignore it here: :)
-                        */
-
-                       max = desc[i].len / sizeof(struct vring_desc);
-                       desc = check_pointer(vq->dev, desc[i].addr, desc[i].len);
-                       i = 0;
-
-                       /* 2.4.5.3.1:
-                        *
-                        *  A driver MUST NOT create a descriptor chain longer
-                        *  than the Queue Size of the device.
-                        */
-                       if (max > vq->pci_config.queue_size)
-                               bad_driver_vq(vq,
-                                             "indirect has too many entries");
-               }
-
-               /* Grab the first descriptor, and check it's OK. */
-               iov[*out_num + *in_num].iov_len = desc[i].len;
-               iov[*out_num + *in_num].iov_base
-                       = check_pointer(vq->dev, desc[i].addr, desc[i].len);
-               /* If this is an input descriptor, increment that count. */
-               if (desc[i].flags & VRING_DESC_F_WRITE)
-                       (*in_num)++;
-               else {
-                       /*
-                        * If it's an output descriptor, they're all supposed
-                        * to come before any input descriptors.
-                        */
-                       if (*in_num)
-                               bad_driver_vq(vq,
-                                             "Descriptor has out after in");
-                       (*out_num)++;
-               }
-
-               /* If we've got too many, that implies a descriptor loop. */
-               if (*out_num + *in_num > max)
-                       bad_driver_vq(vq, "Looped descriptor");
-       } while ((i = next_desc(vq->dev, desc, i, max)) != max);
-
-       return head;
-}
-
-/*
- * Hand a consumed buffer back to the Guest by publishing it in the used ring.
- * No interrupt is raised here; that is trigger_irq()'s job, and
- * wait_for_vq_desc() takes care of it for us whenever it has to wait.
- */
-static void add_used(struct virtqueue *vq, unsigned int head, int len)
-{
-       /* The used index, wrapped to the ring size, names the next free slot. */
-       unsigned int slot = vq->vring.used->idx % vq->vring.num;
-       struct vring_used_elem *elem = &vq->vring.used->ring[slot];
-
-       elem->id = head;
-       elem->len = len;
-
-       /* The entry must be fully written before the index publishes it. */
-       wmb();
-       vq->vring.used->idx++;
-       vq->pending_used++;
-}
-
-/*
- * Convenience wrapper: publish the buffer in the used ring with add_used(),
- * then immediately interrupt the Guest with trigger_irq().
- */
-static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
-{
-       add_used(vq, head, len);
-       trigger_irq(vq);
-}
-
-/*
- * The Console
- *
- * We associate some data with the console for our exit hack: console_input()
- * uses it to notice three ^C keypresses arriving within about a second.
- */
-struct console_abort {
-       /* How many consecutive ^C have they hit?  Reset by any other input. */
-       int count;
-       /* When did they start?  Recorded when count reaches 1. */
-       struct timeval start;
-};
-
-/*
- * Service routine for the console input queue: feed whatever arrives on stdin
- * to the Guest.
- */
-static void console_input(struct virtqueue *vq)
-{
-       struct console_abort *state = vq->dev->priv;
-       struct iovec iov[vq->vring.num];
-       unsigned int head, num_out, num_in;
-       int nread;
-
-       /* Block until the Guest offers a buffer; it must be input-only. */
-       head = wait_for_vq_desc(vq, iov, &num_out, &num_in);
-       if (num_out)
-               bad_driver_vq(vq, "Output buffers in console in queue?");
-
-       /* This readv() is where the thread normally sleeps. */
-       nread = readv(STDIN_FILENO, iov, num_in);
-       if (nread <= 0) {
-               /* stdin is exhausted or broken. */
-               warnx("Failed to get console input, ignoring console.");
-               /*
-                * A dying thread would take the whole Launcher with it, so
-                * park this one forever instead.
-                */
-               while (1)
-                       pause();
-       }
-
-       /* Give the filled buffer back and interrupt the Guest. */
-       add_used_and_trigger(vq, head, nread);
-
-       /*
-        * Exit hack: three ^C (byte value 3) within about a second kill the
-        * Launcher.  Each ^C must arrive in a read of its own, which stops
-        * them coming too fast; anything else resets the count.
-        */
-       if (nread != 1 || ((char *)iov[0].iov_base)[0] != 3) {
-               state->count = 0;
-               return;
-       }
-
-       switch (++state->count) {
-       case 1:
-               /* First ^C of a run: note when it happened. */
-               gettimeofday(&state->start, NULL);
-               break;
-       case 3: {
-               struct timeval now;
-
-               gettimeofday(&now, NULL);
-               /* Kill all Launcher processes with SIGINT, like normal ^C */
-               if (now.tv_sec <= state->start.tv_sec + 1)
-                       kill(0, SIGINT);
-               state->count = 0;
-               break;
-       }
-       }
-}
-
-/*
- * Service routine for the console output queue: copy what the Guest wrote
- * to stdout.
- */
-static void console_output(struct virtqueue *vq)
-{
-       struct iovec iov[vq->vring.num];
-       unsigned int head, num_out, num_in;
-
-       /* Normally we sleep here until the Guest has something to print. */
-       head = wait_for_vq_desc(vq, iov, &num_out, &num_in);
-       if (num_in)
-               bad_driver_vq(vq, "Input buffers in console output queue?");
-
-       /* writev() may write only part of the data, so keep going. */
-       for (;;) {
-               int written;
-
-               if (iov_empty(iov, num_out))
-                       break;
-
-               written = writev(STDOUT_FILENO, iov, num_out);
-               if (written <= 0) {
-                       warn("Write to stdout gave %i (%d)", written, errno);
-                       break;
-               }
-               iov_consume(vq->dev, iov, num_out, NULL, written);
-       }
-
-       /*
-        * Buffer done.  No interrupt here: wait_for_vq_desc() prods the
-        * Guest if we end up going to sleep.
-        */
-       add_used(vq, head, 0);
-}
-
-/*
- * The Network
- *
- * Handling output for network is also simple: we get all the output buffers
- * and write them to /dev/net/tun.
- */
-struct net_info {
-       int tunfd; /* written by net_output(), read by net_input() */
-};
-
-/* Service routine for the network transmit queue: Guest packet -> tun. */
-static void net_output(struct virtqueue *vq)
-{
-       struct net_info *tun = vq->dev->priv;
-       struct iovec iov[vq->vring.num];
-       unsigned int head, num_out, num_in;
-       ssize_t sent;
-
-       /* Normally we sleep here until the Guest hands us a packet. */
-       head = wait_for_vq_desc(vq, iov, &num_out, &num_in);
-       if (num_in)
-               bad_driver_vq(vq, "Input buffers in net output queue?");
-
-       /*
-        * The tun device takes the buffers exactly as the Guest laid them
-        * out, so pass them straight through.
-        */
-       sent = writev(tun->tunfd, iov, num_out);
-       if (sent < 0)
-               warnx("Write to tun failed (%d)?", errno);
-
-       /*
-        * Packet done.  wait_for_vq_desc() raises the interrupt once all
-        * packets are processed.
-        */
-       add_used(vq, head, 0);
-}
-
-/*
- * Handling network input is a bit trickier, because I've tried to optimize it.
- *
- * First we have a helper routine which tells us whether reading from this
- * file descriptor (ie. the /dev/net/tun device) will block:
- */
-static bool will_block(int fd)
-{
-       struct timeval no_wait = { 0, 0 };
-       fd_set readable;
-       int ready;
-
-       FD_ZERO(&readable);
-       FD_SET(fd, &readable);
-
-       /* A zero timeout makes select() return immediately. */
-       ready = select(fd + 1, &readable, NULL, NULL, &no_wait);
-       return ready != 1;
-}
-
-/*
- * This handles packets coming in from the tun device to our Guest.  Like all
- * service routines, it gets called again as soon as it returns, so you don't
- * see a while(1) loop here.
- */
-static void net_input(struct virtqueue *vq)
-{
-       int len;
-       unsigned int head, out, in;
-       struct iovec iov[vq->vring.num];
-       struct net_info *net_info = vq->dev->priv;
-
-       /*
-        * Get a descriptor to write an incoming packet into.  This will also
-        * send an interrupt if they're out of descriptors.
-        */
-       head = wait_for_vq_desc(vq, iov, &out, &in);
-       if (out)
-               bad_driver_vq(vq, "Output buffers in net input queue?");
-
-       /*
-        * If it looks like we'll block reading from the tun device, send them
-        * an interrupt.
-        */
-       if (vq->pending_used && will_block(net_info->tunfd))
-               trigger_irq(vq);
-
-       /*
-        * Read in the packet.  This is where we normally wait (when there's no
-        * incoming network traffic).
-        */
-       len = readv(net_info->tunfd, iov, in);
-       if (len <= 0) {
-               warn("Failed to read from tun (%d).", errno);
-               /*
-                * Nothing was placed in the buffer.  Report zero bytes rather
-                * than letting a negative return value be published to the
-                * Guest as the used length.
-                */
-               len = 0;
-       }
-
-       /*
-        * Mark that packet buffer as used, but don't interrupt here.  We want
-        * to wait until we've done as much work as we can.
-        */
-       add_used(vq, head, len);
-}
-/*:*/
-
-/*
- * Thread entry point handed to clone(): call the virtqueue's service routine
- * over and over.
- */
-static int do_thread(void *_vq)
-{
-       struct virtqueue *queue = _vq;
-
-       while (1)
-               queue->service(queue);
-
-       /* Not reached; present because the entry point returns int. */
-       return 0;
-}
-
-/*
- * SIGCHLD handler: a child has died, so take down our whole process group
- * with SIGTERM.  As a side effect the shell restores the console for us.
- * The signal number argument is not used.
- */
-static void kill_launcher(int signal)
-{
-       kill(0, SIGTERM);
-}
-
-/* Put a virtqueue's saved PCI config back to its reset values. */
-static void reset_vq_pci_config(struct virtqueue *vq)
-{
-       /* Disabled, with the default ring size. */
-       vq->pci_config.queue_enable = 0;
-       vq->pci_config.queue_size = VIRTQUEUE_NUM;
-}
-
-/*
- * Reset a device: drop accepted features, clear queue_enable, reset every
- * virtqueue's saved state and stop its service thread if one is running.
- */
-static void reset_device(struct device *dev)
-{
-       struct virtqueue *vq;
-
-       verbose("Resetting device %s\n", dev->name);
-
-       /* Forget whatever features the driver acknowledged. */
-       dev->features_accepted = 0;
-
-       /* We kill threads deliberately below; don't react to their deaths. */
-       signal(SIGCHLD, SIG_IGN);
-
-       /*
-        * 4.1.4.3.1:
-        *
-        *   The device MUST present a 0 in queue_enable on reset.
-        *
-        * So clear the live register here, and the saved copy in every vq.
-        */
-       dev->mmio->cfg.queue_enable = 0;
-
-       for (vq = dev->vq; vq; vq = vq->next) {
-               vq->last_avail_idx = 0;
-               reset_vq_pci_config(vq);
-
-               /* No service thread exists for this queue. */
-               if (vq->thread == (pid_t)-1)
-                       continue;
-
-               /* Stop the thread and wait for it to go away. */
-               kill(vq->thread, SIGTERM);
-               waitpid(vq->thread, NULL, 0);
-               vq->thread = (pid_t)-1;
-       }
-       dev->wrote_features_ok = false;
-       dev->running = false;
-
-       /* Now we care if threads die. */
-       signal(SIGCHLD, (void *)kill_launcher);
-}
-
-/* Reset every PCI device past slot 0 and restore the terminal settings. */
-static void cleanup_devices(void)
-{
-       unsigned int slot;
-
-       /* Slot 0 is skipped; empty slots are ignored. */
-       for (slot = 1; slot < MAX_PCI_DEVICES; slot++) {
-               if (devices.pci[slot])
-                       reset_device(devices.pci[slot]);
-       }
-
-       /* If we saved off the original terminal settings, restore them now. */
-       if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
-               tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
-}
-
-/*L:217
- * We do PCI.  This is mainly done to let us test the kernel virtio PCI
- * code.
- */
-
-/* Linux expects a PCI host bridge: ours is a dummy, and first on the bus. */
-static struct device pci_host_bridge;
-
-/* Fill in the dummy bridge and install it in slot 0 of the device table. */
-static void init_pci_host_bridge(void)
-{
-       struct device *bridge = &pci_host_bridge;
-
-       bridge->name = "PCI Host Bridge";
-       bridge->config.class = 0x06; /* bridge */
-       bridge->config.subclass = 0; /* host bridge */
-       devices.pci[0] = bridge;
-}
-
-/* The IO ports used to read the PCI config space. */
-#define PCI_CONFIG_ADDR 0xCF8
-#define PCI_CONFIG_DATA 0xCFC
-
-/*
- * Not really portable, but does help readability: this is what the Guest
- * writes to the PCI_CONFIG_ADDR IO port.  The bitfields overlay the 32-bit
- * val member, and their widths add up to exactly 32 bits.
- */
-union pci_config_addr {
-       struct {
-               unsigned mbz: 2; /* presumably "must be zero"; never checked here */
-               unsigned offset: 6; /* register index; dev_and_reg() scales it by 4 */
-               unsigned funcnum: 3; /* dev_and_reg() accepts only 0 */
-               unsigned devnum: 5; /* index handed to find_pci_device() */
-               unsigned busnum: 8; /* dev_and_reg() accepts only 0 */
-               unsigned reserved: 7;
-               unsigned enabled : 1; /* dev_and_reg() finds no device unless set */
-       } bits;
-       u32 val;
-};
-
-/*
- * We cache what they wrote to the address port, so we know what they're
- * talking about when they access the data port.
- */
-static union pci_config_addr pci_config_addr;
-
-/*
- * Look up a device by its slot in the devices.pci[] table.  The index is not
- * range-checked here; an empty slot yields whatever the table holds for it.
- */
-static struct device *find_pci_device(unsigned int index)
-{
-       struct device *found = devices.pci[index];
-
-       return found;
-}
-
-/*
- * Extract a 1, 2 or 4 byte value (selected by mask) from the 32-bit word v,
- * starting at byte offset off, and store it in *val.
- */
-static void ioread(u16 off, u32 v, u32 mask, u32 *val)
-{
-       u32 shifted;
-
-       assert(off < 4);
-       assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
-
-       /* Bring the requested byte lane down to bit 0, then trim to size. */
-       shifted = v >> (off * 8);
-       *val = shifted & mask;
-}
-
-/*
- * Store a 1, 2 or 4 byte value (selected by mask) from v into the 32-bit
- * word *dst at byte offset off, leaving the other bytes of *dst alone.
- */
-static void iowrite(u16 off, u32 v, u32 mask, u32 *dst)
-{
-       u32 lane, bits;
-
-       assert(off < 4);
-       assert(mask == 0xFF || mask == 0xFFFFFFFF || mask == 0xFFFF);
-
-       /* Where the bytes land in the word, and what goes there. */
-       lane = mask << (off * 8);
-       bits = (v & mask) << (off * 8);
-
-       *dst = (*dst & ~lane) | bits;
-}
-
-/*
- * Decode the cached PCI_CONFIG_ADDR value: on success store the selected
- * register index in *reg and return the addressed device.  NULL is returned
- * (and *reg left untouched) when the address is disabled, names a function
- * or bus other than 0, or points beyond struct pci_config.
- */
-static struct device *dev_and_reg(u32 *reg)
-{
-       if (!pci_config_addr.bits.enabled
-           || pci_config_addr.bits.funcnum != 0
-           || pci_config_addr.bits.busnum != 0
-           || pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config))
-               return NULL;
-
-       *reg = pci_config_addr.bits.offset;
-       return find_pci_device(pci_config_addr.bits.devnum);
-}
-
-/*
- * We can get invalid combinations of values while they're writing, so we
- * only fault if they try to write with some invalid bar/offset/length.
- *
- * Returns true when the access described by cfg_access targets BAR0, lies
- * entirely inside d's MMIO region, has a length of 1, 2 or 4 bytes and is
- * aligned to that length; returns false otherwise.
- */
-static bool valid_bar_access(struct device *d,
-                            struct virtio_pci_cfg_cap_u32 *cfg_access)
-{
-       /* We only have 1 bar (BAR0) */
-       if (cfg_access->cap.bar != 0)
-               return false;
-
-       /* Check it's within BAR0. */
-       if (cfg_access->cap.offset >= d->mmio_size
-           || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size)
-               return false;
-
-       /* Check length is 1, 2 or 4. */
-       if (cfg_access->cap.length != 1
-           && cfg_access->cap.length != 2
-           && cfg_access->cap.length != 4)
-               return false;
-
-       /*
-        * 4.1.4.7.2:
-        *
-        *  The driver MUST NOT write a cap.offset which is not a multiple of
-        *  cap.length (ie. all accesses MUST be aligned).
-        *
-        * The length check above guarantees the divisor here is non-zero.
-        */
-       if (cfg_access->cap.offset % cfg_access->cap.length != 0)
-               return false;
-
-       /* Every check passed: the access is acceptable. */
-       return true;
-}
-
-/* Does this port fall within the four bytes of the PCI config address port? */
-static bool is_pci_addr_port(u16 port)
-{
-       if (port < PCI_CONFIG_ADDR)
-               return false;
-
-       return port < PCI_CONFIG_ADDR + 4;
-}
-
-/*
- * The Guest wrote to the PCI config address port: merge the bytes into our
- * cached copy of the address.  Always returns true.
- */
-static bool pci_addr_iowrite(u16 port, u32 mask, u32 val)
-{
-       /* Byte position of this write within the 32-bit address register. */
-       u16 byte_off = port - PCI_CONFIG_ADDR;
-
-       iowrite(byte_off, val, mask, &pci_config_addr.val);
-
-       verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n",
-               pci_config_addr.bits.enabled ? "" : " DISABLED",
-               val, mask,
-               pci_config_addr.bits.busnum,
-               pci_config_addr.bits.devnum,
-               pci_config_addr.bits.funcnum,
-               pci_config_addr.bits.offset);
-
-       return true;
-}
-
-/* The Guest read the PCI config address port: return the cached address. */
-static void pci_addr_ioread(u16 port, u32 mask, u32 *val)
-{
-       u16 byte_off = port - PCI_CONFIG_ADDR;
-
-       ioread(byte_off, pci_config_addr.val, mask, val);
-}
-
-/* Does this port fall within the four bytes of the PCI config data port? */
-static bool is_pci_data_port(u16 port)
-{
-       if (port < PCI_CONFIG_DATA)
-               return false;
-
-       return port < PCI_CONFIG_DATA + 4;
-}
-
-static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask);
-
-static bool pci_data_iowrite(u16 port, u32 mask, u32 val)
-{
-       u32 reg, portoff;
-       struct device *d = dev_and_reg(&reg);
-
-       /*
-        * Handle a Guest write to the PCI config data port.  The config word
-        * selected by the cached address-port value is updated according to
-        * which field it is; true means the write was accepted (or quietly
-        * ignored), false means it was not allowed.
-        *
-        * Complain if they don't belong to a device.
-        */
-       if (!d)
-               return false;
-
-       /* They can do 1 byte writes, etc. */
-       portoff = port - PCI_CONFIG_DATA;
-
-       /*
-        * PCI uses a weird way to determine the BAR size: the OS
-        * writes all 1's, and sees which ones stick.  We clear every bit
-        * below mmio_size, so only the bits at or above it stick.
-        */
-       if (&d->config_words[reg] == &d->config.bar[0]) {
-               int i;
-
-               iowrite(portoff, val, mask, &d->config.bar[0]);
-               for (i = 0; (1 << i) < d->mmio_size; i++)
-                       d->config.bar[0] &= ~(1 << i);
-               return true;
-       } else if ((&d->config_words[reg] > &d->config.bar[0]
-                   && &d->config_words[reg] <= &d->config.bar[6])
-                  || &d->config_words[reg] == &d->config.expansion_rom_addr) {
-               /*
-                * Allow writing to any other BAR, or expansion ROM
-                *
-                * NOTE(review): the upper bound is inclusive of bar[6]; if
-                * bar[] has six entries this also admits the word right after
-                * the BARs - confirm against struct pci_config.
-                */
-               iowrite(portoff, val, mask, &d->config_words[reg]);
-               return true;
-               /* We let them override latency timer and cacheline size */
-       } else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) {
-               /*
-                * Only let them change the first two fields: a full 32-bit
-                * write is narrowed to its low 16 bits.
-                */
-               if (mask == 0xFFFFFFFF)
-                       mask = 0xFFFF;
-               iowrite(portoff, val, mask, &d->config_words[reg]);
-               return true;
-       } else if (&d->config_words[reg] == (void *)&d->config.command
-                  && mask == 0xFFFF) {
-               /* Ignore command writes. */
-               return true;
-       } else if (&d->config_words[reg]
-                  == (void *)&d->config.cfg_access.cap.bar
-                  || &d->config_words[reg]
-                  == &d->config.cfg_access.cap.length
-                  || &d->config_words[reg]
-                  == &d->config.cfg_access.cap.offset) {
-
-               /*
-                * The VIRTIO_PCI_CAP_PCI_CFG capability
-                * provides a backdoor to access the MMIO
-                * regions without mapping them.  Weird, but
-                * useful.
-                *
-                * These writes only set up the window (bar, length, offset);
-                * nothing is validated until pci_cfg_data is accessed.
-                */
-               iowrite(portoff, val, mask, &d->config_words[reg]);
-               return true;
-       } else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
-               u32 write_mask;
-
-               /*
-                * 4.1.4.7.1:
-                *
-                *  Upon detecting driver write access to pci_cfg_data, the
-                *  device MUST execute a write access at offset cap.offset at
-                *  BAR selected by cap.bar using the first cap.length bytes
-                *  from pci_cfg_data.
-                */
-
-               /* Must be bar 0, in range, and a properly aligned 1/2/4 bytes */
-               if (!valid_bar_access(d, &d->config.cfg_access))
-                       return false;
-
-               iowrite(portoff, val, mask, &d->config.cfg_access.pci_cfg_data);
-
-               /*
-                * Now emulate a write.  The mask we use is set by
-                * len, *not* this write!  The shift is done in 64 bits so
-                * that a length of 4 yields 0xFFFFFFFF.
-                */
-               write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1;
-               verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n",
-                       d->config.cfg_access.pci_cfg_data, write_mask,
-                       d->config.cfg_access.cap.bar,
-                       d->config.cfg_access.cap.offset,
-                       d->config.cfg_access.cap.length);
-
-               emulate_mmio_write(d, d->config.cfg_access.cap.offset,
-                                  d->config.cfg_access.pci_cfg_data,
-                                  write_mask);
-               return true;
-       }
-
-       /*
-        * 4.1.4.1:
-        *
-        *  The driver MUST NOT write into any field of the capability
-        *  structure, with the exception of those with cap_type
-        *  VIRTIO_PCI_CAP_PCI_CFG...
-        *
-        * Any other config word is read-only as far as we are concerned.
-        */
-       return false;
-}
-
-static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask);
-
-static void pci_data_ioread(u16 port, u32 mask, u32 *val)
-{
-       u32 reg;
-       struct device *d = dev_and_reg(&reg);
-
-       if (!d) /* no such device: *val is left exactly as the caller set it */
-               return;
-
-       /*
-        * Handle a Guest read of the PCI config data port: the selected
-        * config word is returned through *val, trimmed by mask.
-        *
-        * Read through the PCI MMIO access window is special
-        */
-       if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
-               u32 read_mask;
-
-               /*
-                * 4.1.4.7.1:
-                *
-                *  Upon detecting driver read access to pci_cfg_data, the
-                *  device MUST execute a read access of length cap.length at
-                *  offset cap.offset at BAR selected by cap.bar and store the
-                *  first cap.length bytes in pci_cfg_data.
-                */
-               /* Must be bar 0, in range, and a properly aligned 1/2/4 bytes */
-               if (!valid_bar_access(d, &d->config.cfg_access))
-                       bad_driver(d,
-                            "Invalid cfg_access to bar%u, offset %u len %u",
-                            d->config.cfg_access.cap.bar,
-                            d->config.cfg_access.cap.offset,
-                            d->config.cfg_access.cap.length);
-
-               /*
-                * Read into the window.  The mask we use is set by
-                * len, *not* this read!  The shift is done in 64 bits so
-                * that a length of 4 yields 0xFFFFFFFF.
-                */
-               read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1;
-               d->config.cfg_access.pci_cfg_data
-                       = emulate_mmio_read(d,
-                                           d->config.cfg_access.cap.offset,
-                                           read_mask);
-               verbose("Window read %#x/%#x from bar %u, offset %u len %u\n",
-                       d->config.cfg_access.pci_cfg_data, read_mask,
-                       d->config.cfg_access.cap.bar,
-                       d->config.cfg_access.cap.offset,
-                       d->config.cfg_access.cap.length);
-       }
-       ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val);
-}
-
-/*L:216
- * This is where we emulate a handful of Guest instructions.  It's ugly
- * and we used to do it in the kernel but it grew over time.
- */
-
-/*
- * We use the ptrace syscall's pt_regs struct to talk about registers
- * to lguest: these macros convert the names to the offsets.
- */
-#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
-#define setreg(name, val) \
-       setreg_off(offsetof(struct user_regs_struct, name), (val))
-
-/*
- * Read one Guest register through the lguest file descriptor.
- *
- * offset is the register's byte offset within struct user_regs_struct (see
- * the getreg() macro).  An LHREQ_GETREG request is written first, then the
- * 32-bit value is read back.  Any failure is fatal via err().
- */
-static u32 getreg_off(size_t offset)
-{
-       u32 r;
-       unsigned long args[] = { LHREQ_GETREG, offset };
-
-       /* offset is a size_t, so it must be printed with %zu, not %u. */
-       if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
-               err(1, "Getting register %zu", offset);
-       if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
-               err(1, "Reading register %zu", offset);
-
-       return r;
-}
-
-/*
- * Write one Guest register through the lguest file descriptor.
- *
- * offset is the register's byte offset within struct user_regs_struct (see
- * the setreg() macro); val is the new value, sent in an LHREQ_SETREG request.
- * Failure is fatal via err().
- */
-static void setreg_off(size_t offset, u32 val)
-{
-       unsigned long args[] = { LHREQ_SETREG, offset, val };
-
-       /* offset is a size_t, so it must be printed with %zu, not %u. */
-       if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
-               err(1, "Setting register %zu", offset);
-}
-
-/*
- * Fetch a Guest register chosen by its instruction-encoding number (0-7),
- * trimmed to mask.  Any other number aborts.
- */
-static u32 getreg_num(unsigned regnum, u32 mask)
-{
-       /* Register offsets, indexed by encoding number. */
-       static const size_t reg_offset[] = {
-               offsetof(struct user_regs_struct, eax),
-               offsetof(struct user_regs_struct, ecx),
-               offsetof(struct user_regs_struct, edx),
-               offsetof(struct user_regs_struct, ebx),
-               offsetof(struct user_regs_struct, esp),
-               offsetof(struct user_regs_struct, ebp),
-               offsetof(struct user_regs_struct, esi),
-               offsetof(struct user_regs_struct, edi),
-       };
-
-       /* 8 bit ops use regnums 4-7 for high parts of word */
-       if (mask == 0xFF && (regnum & 0x4))
-               return getreg_num(regnum & 0x3, 0xFFFF) >> 8;
-
-       if (regnum >= sizeof(reg_offset) / sizeof(reg_offset[0]))
-               abort();
-
-       return getreg_off(reg_offset[regnum]) & mask;
-}
-
-/*
- * Set a Guest register chosen by its instruction-encoding number (0-7).
- *
- * Only the bits covered by mask are replaced; the rest of the register keeps
- * its current value.  val must not have bits set outside mask.  With an
- * 8-bit mask, register numbers 4-7 select the high byte of registers 0-3.
- * Any other register number aborts.
- */
-static void setreg_num(unsigned regnum, u32 val, u32 mask)
-{
-       /*
-        * Don't try to set bits out of range.  (The test used to be
-        * "~(val & ~mask)", which is non-zero for almost every input and so
-        * never caught anything.)
-        */
-       assert(!(val & ~mask));
-
-       /* 8 bit ops use regnums 4-7 for high parts of word */
-       if (mask == 0xFF && (regnum & 0x4)) {
-               /* Construct the 16 bits we want. */
-               val = (val << 8) | getreg_num(regnum & 0x3, 0xFF);
-               setreg_num(regnum & 0x3, val, 0xFFFF);
-               return;
-       }
-
-       switch (regnum) {
-       case 0: setreg(eax, val | (getreg(eax) & ~mask)); return;
-       case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return;
-       case 2: setreg(edx, val | (getreg(edx) & ~mask)); return;
-       case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return;
-       case 4: setreg(esp, val | (getreg(esp) & ~mask)); return;
-       case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return;
-       case 6: setreg(esi, val | (getreg(esi) & ~mask)); return;
-       case 7: setreg(edi, val | (getreg(edi) & ~mask)); return;
-       }
-       abort();
-}
-
-/*
- * Get bytes of displacement appended to instruction, from r/m encoding.
- *
- * mod_reg_rm is the instruction's ModR/M byte; the result is the number of
- * displacement bytes that follow it under 32-bit addressing.  SIB bytes are
- * not accounted for.
- */
-static u32 insn_displacement_len(u8 mod_reg_rm)
-{
-       /* Switch on the mod bits */
-       switch (mod_reg_rm >> 6) {
-       case 0:
-               /*
-                * If mod == 0, and r/m == 101, there is no base register and
-                * a 32-bit displacement follows.  (This used to return 2,
-                * which is the 16-bit addressing rule and disagrees with the
-                * four bytes used for mod == 2 below.)
-                */
-               if ((mod_reg_rm & 0x7) == 0x5)
-                       return 4;
-               /* Normally, mod == 0 means no literal displacement */
-               return 0;
-       case 1:
-               /* One byte displacement */
-               return 1;
-       case 2:
-               /* Four byte displacement */
-               return 4;
-       case 3:
-               /* Register mode */
-               return 0;
-       }
-       abort();
-}
-
-static void emulate_insn(const u8 insn[])
-{
-       unsigned long args[] = { LHREQ_TRAP, 13 }; /* written at no_emulate */
-       unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
-       unsigned int eax, port, mask;
-       /*
-        * Emulate the Guest instruction whose bytes are in insn[]: "cli" is
-        * skipped, in/out are emulated against our fake IO ports, and
-        * anything else gets trap 13 reinjected into the Guest.
-        *
-        * Default is to return all-ones on IO port reads, which traditionally
-        * means "there's nothing there".
-        */
-       u32 val = 0xFFFFFFFF;
-
-       /*
-        * This must be the Guest kernel trying to do something, not userspace!
-        * The bottom two bits of the CS segment register are the privilege
-        * level.
-        */
-       if ((getreg(xcs) & 3) != 0x1)
-               goto no_emulate;
-
-       /* Decoding x86 instructions is icky. */
-
-       /*
-        * Around 2.6.33, the kernel started using an emulation for the
-        * cmpxchg8b instruction in early boot on many configurations.  This
-        * code isn't paravirtualized, and it tries to disable interrupts.
-        * Ignore it, which will Mostly Work.
-        */
-       if (insn[insnlen] == 0xfa) {
-               /* "cli", or Clear Interrupt Enable instruction.  Skip it. */
-               insnlen = 1;
-               goto skip_insn;
-       }
-
-       /*
-        * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
-        */
-       if (insn[insnlen] == 0x66) {
-               small_operand = 1;
-               /* The instruction is 1 byte so far, read the next byte. */
-               insnlen = 1;
-       }
-
-       /* If the lower bit isn't set, it's a single byte access */
-       byte_access = !(insn[insnlen] & 1);
-
-       /*
-        * Now we can ignore the lower bit and decode the 4 opcodes
-        * we need to emulate.  The port comes either from the byte after the
-        * opcode or from the low 16 bits of %edx.
-        */
-       switch (insn[insnlen] & 0xFE) {
-       case 0xE4: /* in     <next byte>,%al */
-               port = insn[insnlen+1];
-               insnlen += 2;
-               in = 1;
-               break;
-       case 0xEC: /* in     (%dx),%al */
-               port = getreg(edx) & 0xFFFF;
-               insnlen += 1;
-               in = 1;
-               break;
-       case 0xE6: /* out    %al,<next byte> */
-               port = insn[insnlen+1];
-               insnlen += 2;
-               break;
-       case 0xEE: /* out    %al,(%dx) */
-               port = getreg(edx) & 0xFFFF;
-               insnlen += 1;
-               break;
-       default:
-               /* OK, we don't know what this is, can't emulate. */
-               goto no_emulate;
-       }
-
-       /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
-       if (byte_access)
-               mask = 0xFF;
-       else if (small_operand)
-               mask = 0xFFFF;
-       else
-               mask = 0xFFFFFFFF;
-
-       /*
-        * If it was an "IN" instruction, they expect the result to be read
-        * into %eax, so we change %eax.  For "OUT", %eax holds the value
-        * being written.
-        */
-       eax = getreg(eax);
-
-       if (in) {
-               /* This is the PS/2 keyboard status; 1 means ready for output */
-               if (port == 0x64)
-                       val = 1;
-               else if (is_pci_addr_port(port))
-                       pci_addr_ioread(port, mask, &val);
-               else if (is_pci_data_port(port))
-                       pci_data_ioread(port, mask, &val);
-
-               /* Clear the bits we're about to read */
-               eax &= ~mask;
-               /* Copy bits in from val. */
-               eax |= val & mask;
-               /* Now update the register. */
-               setreg(eax, eax);
-       } else {
-               if (is_pci_addr_port(port)) {
-                       if (!pci_addr_iowrite(port, mask, eax))
-                               goto bad_io;
-               } else if (is_pci_data_port(port)) {
-                       if (!pci_data_iowrite(port, mask, eax))
-                               goto bad_io;
-               }
-               /* There are many other ports, eg. CMOS clock, serial
-                * and parallel ports, so we ignore them all. */
-       }
-
-       verbose("IO %s of %x to %u: %#08x\n",
-               in ? "IN" : "OUT", mask, port, eax);
-skip_insn:
-       /* Finally, we've "done" the instruction, so move past it. */
-       setreg(eip, getreg(eip) + insnlen);
-       return;
-
-bad_io: /* only reached from the OUT path above, so "in" is 0 here */
-       warnx("Attempt to %s port %u (%#x mask)",
-             in ? "read from" : "write to", port, mask);
-
-no_emulate:
-       /* Inject trap into Guest. */
-       if (write(lguest_fd, args, sizeof(args)) < 0)
-               err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
-}
-
-/*
- * Find the device whose MMIO region contains the Guest physical address
- * paddr.  On a hit, *off receives the offset into that region and the device
- * is returned; otherwise NULL is returned and *off is untouched.  Slot 0 is
- * not searched.
- */
-static struct device *find_mmio_region(unsigned long paddr, u32 *off)
-{
-       unsigned int slot;
-
-       for (slot = 1; slot < MAX_PCI_DEVICES; slot++) {
-               struct device *d = devices.pci[slot];
-
-               if (d && paddr >= d->mmio_addr
-                   && paddr < d->mmio_addr + d->mmio_size) {
-                       *off = paddr - d->mmio_addr;
-                       return d;
-               }
-       }
-       return NULL;
-}
-
-/*
- * Return the num'th virtqueue (counting from 0) on the device's linked list,
- * or NULL if the list is shorter than that.
- *
- * FIXME: Use vq array.
- */
-static struct virtqueue *vq_by_num(struct device *d, u32 num)
-{
-       struct virtqueue *vq;
-
-       for (vq = d->vq; num && vq; num--)
-               vq = vq->next;
-
-       return vq;
-}
-
-/* Stash a complete copy of the common config in the virtqueue. */
-static void save_vq_config(const struct virtio_pci_common_cfg *cfg,
-                          struct virtqueue *vq)
-{
-       memcpy(&vq->pci_config, cfg, sizeof(vq->pci_config));
-}
-
-/*
- * Copy the virtqueue's saved config back into the common config, but only
- * the per-vq part: everything from queue_size to the end of the structure.
- */
-static void restore_vq_config(struct virtio_pci_common_cfg *cfg,
-                             struct virtqueue *vq)
-{
-       const size_t start = offsetof(struct virtio_pci_common_cfg, queue_size);
-       const char *saved = (const char *)&vq->pci_config + start;
-       char *live = (char *)cfg + start;
-
-       memcpy(live, saved, sizeof(*cfg) - start);
-}
-
-/*
- * 4.1.4.3.2:
- *
- *  The driver MUST configure the other virtqueue fields before
- *  enabling the virtqueue with queue_enable.
- *
- * When they enable the virtqueue, we check that their setup is valid.  As a
- * side effect this fills in vq->vring (num, desc, avail, used) from the saved
- * PCI config.  Problems are reported through bad_driver_vq().  The d
- * argument is not referenced in the body; vq->dev is used instead.
- */
-static void check_virtqueue(struct device *d, struct virtqueue *vq)
-{
-       /* Because lguest is 32 bit, all the descriptor high bits must be 0 */
-       if (vq->pci_config.queue_desc_hi
-           || vq->pci_config.queue_avail_hi
-           || vq->pci_config.queue_used_hi)
-               bad_driver_vq(vq, "invalid 64-bit queue address");
-
-       /*
-        * 2.4.1:
-        *
-        *  The driver MUST ensure that the physical address of the first byte
-        *  of each virtqueue part is a multiple of the specified alignment
-        *  value in the above table.
-        *
-        * Here that means 16 bytes for the descriptor table, 2 for the
-        * available ring and 4 for the used ring.
-        */
-       if (vq->pci_config.queue_desc_lo % 16
-           || vq->pci_config.queue_avail_lo % 2
-           || vq->pci_config.queue_used_lo % 4)
-               bad_driver_vq(vq, "invalid alignment in queue addresses");
-
-       /*
-        * Initialize the virtqueue and check they're all in range.  Each
-        * length passed to check_pointer() is the part's header plus one
-        * entry per ring slot (the descriptor table has no header).
-        */
-       vq->vring.num = vq->pci_config.queue_size;
-       vq->vring.desc = check_pointer(vq->dev,
-                                      vq->pci_config.queue_desc_lo,
-                                      sizeof(*vq->vring.desc) * vq->vring.num);
-       vq->vring.avail = check_pointer(vq->dev,
-                                       vq->pci_config.queue_avail_lo,
-                                       sizeof(*vq->vring.avail)
-                                       + (sizeof(vq->vring.avail->ring[0])
-                                          * vq->vring.num));
-       vq->vring.used = check_pointer(vq->dev,
-                                      vq->pci_config.queue_used_lo,
-                                      sizeof(*vq->vring.used)
-                                      + (sizeof(vq->vring.used->ring[0])
-                                         * vq->vring.num));
-
-       /*
-        * 2.4.9.1:
-        *
-        *   The driver MUST initialize flags in the used ring to 0
-        *   when allocating the used ring.
-        */
-       if (vq->vring.used->flags != 0)
-               bad_driver_vq(vq, "invalid initial used.flags %#x",
-                             vq->vring.used->flags);
-}
-
-/*
- * Start the service thread for one virtqueue: allocate its stack, create its
- * eventfd and clone() a thread running do_thread().  Any failure is fatal
- * via err().
- */
-static void start_virtqueue(struct virtqueue *vq)
-{
-       const size_t stack_size = 32768;
-       /*
-        * Create stack for thread.  Since the stack grows downwards, we point
-        * the stack pointer to the end of this region.
-        */
-       char *stack = malloc(stack_size);
-
-       if (!stack)
-               err(1, "Allocating thread stack");
-
-       /* Create a zero-initialized eventfd. */
-       vq->eventfd = eventfd(0, 0);
-       if (vq->eventfd < 0)
-               err(1, "Creating eventfd");
-
-       /*
-        * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
-        * we get a signal if it dies.
-        */
-       vq->thread = clone(do_thread, stack + stack_size,
-                          CLONE_VM | SIGCHLD, vq);
-       if (vq->thread == (pid_t)-1)
-               err(1, "Creating clone");
-}
-
-static void start_virtqueues(struct device *d)
-{
-       struct virtqueue *vq;
-
-       for (vq = d->vq; vq; vq = vq->next) {
-               if (vq->pci_config.queue_enable)
-                       start_virtqueue(vq);
-       }
-}
-
-static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask)
-{
-       struct virtqueue *vq;
-
-       switch (off) {
-       case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
-               /*
-                * 4.1.4.3.1:
-                *
-                * The device MUST present the feature bits it is offering in
-                * device_feature, starting at bit device_feature_select ∗ 32
-                * for any device_feature_select written by the driver
-                */
-               if (val == 0)
-                       d->mmio->cfg.device_feature = d->features;
-               else if (val == 1)
-                       d->mmio->cfg.device_feature = (d->features >> 32);
-               else
-                       d->mmio->cfg.device_feature = 0;
-               goto feature_write_through32;
-       case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
-               if (val > 1)
-                       bad_driver(d, "Unexpected driver select %u", val);
-               goto feature_write_through32;
-       case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
-               if (d->mmio->cfg.guest_feature_select == 0) {
-                       d->features_accepted &= ~((u64)0xFFFFFFFF);
-                       d->features_accepted |= val;
-               } else {
-                       assert(d->mmio->cfg.guest_feature_select == 1);
-                       d->features_accepted &= 0xFFFFFFFF;
-                       d->features_accepted |= ((u64)val) << 32;
-               }
-               /*
-                * 2.2.1:
-                *
-                *   The driver MUST NOT accept a feature which the device did
-                *   not offer
-                */
-               if (d->features_accepted & ~d->features)
-                       bad_driver(d, "over-accepted features %#llx of %#llx",
-                                  d->features_accepted, d->features);
-               goto feature_write_through32;
-       case offsetof(struct virtio_pci_mmio, cfg.device_status): {
-               u8 prev;
-
-               verbose("%s: device status -> %#x\n", d->name, val);
-               /*
-                * 4.1.4.3.1:
-                * 
-                *  The device MUST reset when 0 is written to device_status,
-                *  and present a 0 in device_status once that is done.
-                */
-               if (val == 0) {
-                       reset_device(d);
-                       goto write_through8;
-               }
-
-               /* 2.1.1: The driver MUST NOT clear a device status bit. */
-               if (d->mmio->cfg.device_status & ~val)
-                       bad_driver(d, "unset of device status bit %#x -> %#x",
-                                  d->mmio->cfg.device_status, val);
-
-               /*
-                * 2.1.2:
-                *
-                *  The device MUST NOT consume buffers or notify the driver
-                *  before DRIVER_OK.
-                */
-               if (val & VIRTIO_CONFIG_S_DRIVER_OK
-                   && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
-                       start_virtqueues(d);
-
-               /*
-                * 3.1.1:
-                *
-                *   The driver MUST follow this sequence to initialize a device:
-                *   - Reset the device.
-                *   - Set the ACKNOWLEDGE status bit: the guest OS has
-                 *     notice the device.
-                *   - Set the DRIVER status bit: the guest OS knows how
-                 *     to drive the device.
-                *   - Read device feature bits, and write the subset
-                *     of feature bits understood by the OS and driver
-                *     to the device. During this step the driver MAY
-                *     read (but MUST NOT write) the device-specific
-                *     configuration fields to check that it can
-                *     support the device before accepting it.
-                *   - Set the FEATURES_OK status bit.  The driver
-                *     MUST not accept new feature bits after this
-                *     step.
-                *   - Re-read device status to ensure the FEATURES_OK
-                *     bit is still set: otherwise, the device does
-                *     not support our subset of features and the
-                *     device is unusable.
-                *   - Perform device-specific setup, including
-                *     discovery of virtqueues for the device,
-                *     optional per-bus setup, reading and possibly
-                *     writing the device’s virtio configuration
-                *     space, and population of virtqueues.
-                *   - Set the DRIVER_OK status bit. At this point the
-                 *     device is “live”.
-                */
-               prev = 0;
-               switch (val & ~d->mmio->cfg.device_status) {
-               case VIRTIO_CONFIG_S_DRIVER_OK:
-                       prev |= VIRTIO_CONFIG_S_FEATURES_OK; /* fall thru */
-               case VIRTIO_CONFIG_S_FEATURES_OK:
-                       prev |= VIRTIO_CONFIG_S_DRIVER; /* fall thru */
-               case VIRTIO_CONFIG_S_DRIVER:
-                       prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE; /* fall thru */
-               case VIRTIO_CONFIG_S_ACKNOWLEDGE:
-                       break;
-               default:
-                       bad_driver(d, "unknown device status bit %#x -> %#x",
-                                  d->mmio->cfg.device_status, val);
-               }
-               if (d->mmio->cfg.device_status != prev)
-                       bad_driver(d, "unexpected status transition %#x -> %#x",
-                                  d->mmio->cfg.device_status, val);
-
-               /* If they just wrote FEATURES_OK, we make sure they read */
-               switch (val & ~d->mmio->cfg.device_status) {
-               case VIRTIO_CONFIG_S_FEATURES_OK:
-                       d->wrote_features_ok = true;
-                       break;
-               case VIRTIO_CONFIG_S_DRIVER_OK:
-                       if (d->wrote_features_ok)
-                               bad_driver(d, "did not re-read FEATURES_OK");
-                       break;
-               }
-               goto write_through8;
-       }
-       case offsetof(struct virtio_pci_mmio, cfg.queue_select):
-               vq = vq_by_num(d, val);
-               /*
-                * 4.1.4.3.1:
-                *
-                *  The device MUST present a 0 in queue_size if the virtqueue
-                *  corresponding to the current queue_select is unavailable.
-                */
-               if (!vq) {
-                       d->mmio->cfg.queue_size = 0;
-                       goto write_through16;
-               }
-               /* Save registers for old vq, if it was a valid vq */
-               if (d->mmio->cfg.queue_size)
-                       save_vq_config(&d->mmio->cfg,
-                                      vq_by_num(d, d->mmio->cfg.queue_select));
-               /* Restore the registers for the queue they asked for */
-               restore_vq_config(&d->mmio->cfg, vq);
-               goto write_through16;
-       case offsetof(struct virtio_pci_mmio, cfg.queue_size):
-               /*
-                * 4.1.4.3.2:
-                *
-                *  The driver MUST NOT write a value which is not a power of 2
-                *  to queue_size.
-                */
-               if (val & (val-1))
-                       bad_driver(d, "invalid queue size %u", val);
-               if (d->mmio->cfg.queue_enable)
-                       bad_driver(d, "changing queue size on live device");
-               goto write_through16;
-       case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector):
-               bad_driver(d, "attempt to set MSIX vector to %u", val);
-       case offsetof(struct virtio_pci_mmio, cfg.queue_enable): {
-               struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select);
-
-               /*
-                * 4.1.4.3.2:
-                *
-                *  The driver MUST NOT write a 0 to queue_enable.
-                */
-               if (val != 1)
-                       bad_driver(d, "setting queue_enable to %u", val);
-
-               /*
-                * 3.1.1:
-                *
-                *  7. Perform device-specific setup, including discovery of
-                *     virtqueues for the device, optional per-bus setup,
-                *     reading and possibly writing the device’s virtio
-                *     configuration space, and population of virtqueues.
-                *  8. Set the DRIVER_OK status bit.
-                *
-                * All our devices require all virtqueues to be enabled, so
-                * they should have done that before setting DRIVER_OK.
-                */
-               if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)
-                       bad_driver(d, "enabling vq after DRIVER_OK");
-
-               d->mmio->cfg.queue_enable = val;
-               save_vq_config(&d->mmio->cfg, vq);
-               check_virtqueue(d, vq);
-               goto write_through16;
-       }
-       case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off):
-               bad_driver(d, "attempt to write to queue_notify_off");
-       case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo):
-       case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi):
-       case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo):
-       case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi):
-       case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo):
-       case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi):
-               /*
-                * 4.1.4.3.2:
-                *
-                *  The driver MUST configure the other virtqueue fields before
-                *  enabling the virtqueue with queue_enable.
-                */
-               if (d->mmio->cfg.queue_enable)
-                       bad_driver(d, "changing queue on live device");
-
-               /*
-                * 3.1.1:
-                *
-                *  The driver MUST follow this sequence to initialize a device:
-                *...
-                *  5. Set the FEATURES_OK status bit. The driver MUST not
-                *  accept new feature bits after this step.
-                */
-               if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK))
-                       bad_driver(d, "setting up vq before FEATURES_OK");
-
-               /*
-                *  6. Re-read device status to ensure the FEATURES_OK bit is
-                *     still set...
-                */
-               if (d->wrote_features_ok)
-                       bad_driver(d, "didn't re-read FEATURES_OK before setup");
-
-               goto write_through32;
-       case offsetof(struct virtio_pci_mmio, notify):
-               vq = vq_by_num(d, val);
-               if (!vq)
-                       bad_driver(d, "Invalid vq notification on %u", val);
-               /* Notify the process handling this vq by adding 1 to eventfd */
-               write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8);
-               goto write_through16;
-       case offsetof(struct virtio_pci_mmio, isr):
-               bad_driver(d, "Unexpected write to isr");
-       /* Weird corner case: write to emerg_wr of console */
-       case sizeof(struct virtio_pci_mmio)
-               + offsetof(struct virtio_console_config, emerg_wr):
-               if (strcmp(d->name, "console") == 0) {
-                       char c = val;
-                       write(STDOUT_FILENO, &c, 1);
-                       goto write_through32;
-               }
-               /* Fall through... */
-       default:
-               /*
-                * 4.1.4.3.2:
-                *
-                *   The driver MUST NOT write to device_feature, num_queues,
-                *   config_generation or queue_notify_off.
-                */
-               bad_driver(d, "Unexpected write to offset %u", off);
-       }
-
-feature_write_through32:
-       /*
-        * 3.1.1:
-        *
-        *   The driver MUST follow this sequence to initialize a device:
-        *...
-        *   - Set the DRIVER status bit: the guest OS knows how
-        *     to drive the device.
-        *   - Read device feature bits, and write the subset
-        *     of feature bits understood by the OS and driver
-        *     to the device.
-        *...
-        *   - Set the FEATURES_OK status bit. The driver MUST not
-        *     accept new feature bits after this step.
-        */
-       if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
-               bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER");
-       if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)
-               bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK");
-
-       /*
-        * 4.1.3.1:
-        *
-        *  The driver MUST access each field using the “natural” access
-        *  method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
-        *  16-bit fields and 8-bit accesses for 8-bit fields.
-        */
-write_through32:
-       if (mask != 0xFFFFFFFF) {
-               bad_driver(d, "non-32-bit write to offset %u (%#x)",
-                          off, getreg(eip));
-               return;
-       }
-       memcpy((char *)d->mmio + off, &val, 4);
-       return;
-
-write_through16:
-       if (mask != 0xFFFF)
-               bad_driver(d, "non-16-bit write to offset %u (%#x)",
-                          off, getreg(eip));
-       memcpy((char *)d->mmio + off, &val, 2);
-       return;
-
-write_through8:
-       if (mask != 0xFF)
-               bad_driver(d, "non-8-bit write to offset %u (%#x)",
-                          off, getreg(eip));
-       memcpy((char *)d->mmio + off, &val, 1);
-       return;
-}
-
-static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask)
-{
-       u8 isr;
-       u32 val = 0;
-
-       switch (off) {
-       case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
-       case offsetof(struct virtio_pci_mmio, cfg.device_feature):
-       case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
-       case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
-               /*
-                * 3.1.1:
-                *
-                *   The driver MUST follow this sequence to initialize a device:
-                *...
-                *   - Set the DRIVER status bit: the guest OS knows how
-                *     to drive the device.
-                *   - Read device feature bits, and write the subset
-                *     of feature bits understood by the OS and driver
-                *     to the device.
-                */
-               if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
-                       bad_driver(d,
-                                  "feature read before VIRTIO_CONFIG_S_DRIVER");
-               goto read_through32;
-       case offsetof(struct virtio_pci_mmio, cfg.msix_config):
-               bad_driver(d, "read of msix_config");
-       case offsetof(struct virtio_pci_mmio, cfg.num_queues):
-               goto read_through16;
-       case offsetof(struct virtio_pci_mmio, cfg.device_status):
-               /* As they did read, any write of FEATURES_OK is now fine. */
-               d->wrote_features_ok = false;
-               goto read_through8;
-       case offsetof(struct virtio_pci_mmio, cfg.config_generation):
-               /*
-                * 4.1.4.3.1:
-                *
-                *  The device MUST present a changed config_generation after
-                *  the driver has read a device-specific configuration value
-                *  which has changed since any part of the device-specific
-                *  configuration was last read.
-                *
-                * This is simple: none of our devices change config, so this
-                * is always 0.
-                */
-               goto read_through8;
-       case offsetof(struct virtio_pci_mmio, notify):
-               /*
-                * 3.1.1:
-                *
-                *   The driver MUST NOT notify the device before setting
-                *   DRIVER_OK.
-                */
-               if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
-                       bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK");
-               goto read_through16;
-       case offsetof(struct virtio_pci_mmio, isr):
-               if (mask != 0xFF)
-                       bad_driver(d, "non-8-bit read from offset %u (%#x)",
-                                  off, getreg(eip));
-               isr = d->mmio->isr;
-               /*
-                * 4.1.4.5.1:
-                *
-                *  The device MUST reset ISR status to 0 on driver read. 
-                */
-               d->mmio->isr = 0;
-               return isr;
-       case offsetof(struct virtio_pci_mmio, padding):
-               bad_driver(d, "read from padding (%#x)", getreg(eip));
-       default:
-               /* Read from device config space, beware unaligned overflow */
-               if (off > d->mmio_size - 4)
-                       bad_driver(d, "read past end (%#x)", getreg(eip));
-
-               /*
-                * 3.1.1:
-                *  The driver MUST follow this sequence to initialize a device:
-                *...
-                *  3. Set the DRIVER status bit: the guest OS knows how to
-                *  drive the device.
-                *  4. Read device feature bits, and write the subset of
-                *  feature bits understood by the OS and driver to the
-                *  device. During this step the driver MAY read (but MUST NOT
-                *  write) the device-specific configuration fields to check
-                *  that it can support the device before accepting it.
-                */
-               if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
-                       bad_driver(d,
-                                  "config read before VIRTIO_CONFIG_S_DRIVER");
-
-               if (mask == 0xFFFFFFFF)
-                       goto read_through32;
-               else if (mask == 0xFFFF)
-                       goto read_through16;
-               else
-                       goto read_through8;
-       }
-
-       /*
-        * 4.1.3.1:
-        *
-        *  The driver MUST access each field using the “natural” access
-        *  method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
-        *  16-bit fields and 8-bit accesses for 8-bit fields.
-        */
-read_through32:
-       if (mask != 0xFFFFFFFF)
-               bad_driver(d, "non-32-bit read to offset %u (%#x)",
-                          off, getreg(eip));
-       memcpy(&val, (char *)d->mmio + off, 4);
-       return val;
-
-read_through16:
-       if (mask != 0xFFFF)
-               bad_driver(d, "non-16-bit read to offset %u (%#x)",
-                          off, getreg(eip));
-       memcpy(&val, (char *)d->mmio + off, 2);
-       return val;
-
-read_through8:
-       if (mask != 0xFF)
-               bad_driver(d, "non-8-bit read to offset %u (%#x)",
-                          off, getreg(eip));
-       memcpy(&val, (char *)d->mmio + off, 1);
-       return val;
-}
-
-static void emulate_mmio(unsigned long paddr, const u8 *insn)
-{
-       u32 val, off, mask = 0xFFFFFFFF, insnlen = 0;
-       struct device *d = find_mmio_region(paddr, &off);
-       unsigned long args[] = { LHREQ_TRAP, 14 };
-
-       if (!d) {
-               warnx("MMIO touching %#08lx (not a device)", paddr);
-               goto reinject;
-       }
-
-       /* Prefix makes it a 16 bit op */
-       if (insn[0] == 0x66) {
-               mask = 0xFFFF;
-               insnlen++;
-       }
-
-       /* iowrite */
-       if (insn[insnlen] == 0x89) {
-               /* Next byte is r/m byte: bits 3-5 are register. */
-               val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask);
-               emulate_mmio_write(d, off, val, mask);
-               insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
-       } else if (insn[insnlen] == 0x8b) { /* ioread */
-               /* Next byte is r/m byte: bits 3-5 are register. */
-               val = emulate_mmio_read(d, off, mask);
-               setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask);
-               insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
-       } else if (insn[0] == 0x88) { /* 8-bit iowrite */
-               mask = 0xff;
-               /* Next byte is r/m byte: bits 3-5 are register. */
-               val = getreg_num((insn[1] >> 3) & 0x7, mask);
-               emulate_mmio_write(d, off, val, mask);
-               insnlen = 2 + insn_displacement_len(insn[1]);
-       } else if (insn[0] == 0x8a) { /* 8-bit ioread */
-               mask = 0xff;
-               val = emulate_mmio_read(d, off, mask);
-               setreg_num((insn[1] >> 3) & 0x7, val, mask);
-               insnlen = 2 + insn_displacement_len(insn[1]);
-       } else {
-               warnx("Unknown MMIO instruction touching %#08lx:"
-                    " %02x %02x %02x %02x at %u",
-                    paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip));
-       reinject:
-               /* Inject trap into Guest. */
-               if (write(lguest_fd, args, sizeof(args)) < 0)
-                       err(1, "Reinjecting trap 14 for fault at %#x",
-                           getreg(eip));
-               return;
-       }
-
-       /* Finally, we've "done" the instruction, so move past it. */
-       setreg(eip, getreg(eip) + insnlen);
-}
-
-/*L:190
- * Device Setup
- *
- * All devices need a descriptor so the Guest knows it exists, and a "struct
- * device" so the Launcher can keep track of it.  We have common helper
- * routines to allocate and manage them.
- */
-static void add_pci_virtqueue(struct device *dev,
-                             void (*service)(struct virtqueue *),
-                             const char *name)
-{
-       struct virtqueue **i, *vq = malloc(sizeof(*vq));
-
-       /* Initialize the virtqueue */
-       vq->next = NULL;
-       vq->last_avail_idx = 0;
-       vq->dev = dev;
-       vq->name = name;
-
-       /*
-        * This is the routine the service thread will run, and its Process ID
-        * once it's running.
-        */
-       vq->service = service;
-       vq->thread = (pid_t)-1;
-
-       /* Initialize the configuration. */
-       reset_vq_pci_config(vq);
-       vq->pci_config.queue_notify_off = 0;
-
-       /* Add one to the number of queues */
-       vq->dev->mmio->cfg.num_queues++;
-
-       /*
-        * Add to tail of list, so dev->vq is first vq, dev->vq->next is
-        * second.
-        */
-       for (i = &dev->vq; *i; i = &(*i)->next);
-       *i = vq;
-}
-
-/* The Guest accesses the feature bits via the PCI common config MMIO region */
-static void add_pci_feature(struct device *dev, unsigned bit)
-{
-       dev->features |= (1ULL << bit);
-}
-
-/* For devices with no config. */
-static void no_device_config(struct device *dev)
-{
-       dev->mmio_addr = get_mmio_region(dev->mmio_size);
-
-       dev->config.bar[0] = dev->mmio_addr;
-       /* Bottom 4 bits must be zero */
-       assert(~(dev->config.bar[0] & 0xF));
-}
-
-/* This puts the device config into BAR0 */
-static void set_device_config(struct device *dev, const void *conf, size_t len)
-{
-       /* Set up BAR 0 */
-       dev->mmio_size += len;
-       dev->mmio = realloc(dev->mmio, dev->mmio_size);
-       memcpy(dev->mmio + 1, conf, len);
-
-       /*
-        * 4.1.4.6:
-        *
-        *  The device MUST present at least one VIRTIO_PCI_CAP_DEVICE_CFG
-        *  capability for any device type which has a device-specific
-        *  configuration.
-        */
-       /* Hook up device cfg */
-       dev->config.cfg_access.cap.cap_next
-               = offsetof(struct pci_config, device);
-
-       /*
-        * 4.1.4.6.1:
-        *
-        *  The offset for the device-specific configuration MUST be 4-byte
-        *  aligned.
-        */
-       assert(dev->config.cfg_access.cap.cap_next % 4 == 0);
-
-       /* Fix up device cfg field length. */
-       dev->config.device.length = len;
-
-       /* The rest is the same as the no-config case */
-       no_device_config(dev);
-}
-
-static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type,
-                    size_t bar_offset, size_t bar_bytes, u8 next)
-{
-       cap->cap_vndr = PCI_CAP_ID_VNDR;
-       cap->cap_next = next;
-       cap->cap_len = caplen;
-       cap->cfg_type = type;
-       cap->bar = 0;
-       memset(cap->padding, 0, sizeof(cap->padding));
-       cap->offset = bar_offset;
-       cap->length = bar_bytes;
-}
-
-/*
- * This sets up the pci_config structure, as defined in the virtio 1.0
- * standard (and PCI standard).
- */
-static void init_pci_config(struct pci_config *pci, u16 type,
-                           u8 class, u8 subclass)
-{
-       size_t bar_offset, bar_len;
-
-       /*
-        * 4.1.4.4.1:
-        *
-        *  The device MUST either present notify_off_multiplier as an even
-        *  power of 2, or present notify_off_multiplier as 0.
-        *
-        * 2.1.2:
-        *
-        *   The device MUST initialize device status to 0 upon reset. 
-        */
-       memset(pci, 0, sizeof(*pci));
-
-       /* 4.1.2.1: Devices MUST have the PCI Vendor ID 0x1AF4 */
-       pci->vendor_id = 0x1AF4;
-       /* 4.1.2.1: ... PCI Device ID calculated by adding 0x1040 ... */
-       pci->device_id = 0x1040 + type;
-
-       /*
-        * PCI have specific codes for different types of devices.
-        * Linux doesn't care, but it's a good clue for people looking
-        * at the device.
-        */
-       pci->class = class;
-       pci->subclass = subclass;
-
-       /*
-        * 4.1.2.1:
-        *
-        *  Non-transitional devices SHOULD have a PCI Revision ID of 1 or
-        *  higher
-        */
-       pci->revid = 1;
-
-       /*
-        * 4.1.2.1:
-        *
-        *  Non-transitional devices SHOULD have a PCI Subsystem Device ID of
-        *  0x40 or higher.
-        */
-       pci->subsystem_device_id = 0x40;
-
-       /* We use our dummy interrupt controller, and irq_line is the irq */
-       pci->irq_line = devices.next_irq++;
-       pci->irq_pin = 0;
-
-       /* Support for extended capabilities. */
-       pci->status = (1 << 4);
-
-       /* Link them in. */
-       /*
-        * 4.1.4.3.1:
-        *
-        *  The device MUST present at least one common configuration
-        *  capability.
-        */
-       pci->capabilities = offsetof(struct pci_config, common);
-
-       /* 4.1.4.3.1 ... offset MUST be 4-byte aligned. */
-       assert(pci->capabilities % 4 == 0);
-
-       bar_offset = offsetof(struct virtio_pci_mmio, cfg);
-       bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg);
-       init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG,
-                bar_offset, bar_len,
-                offsetof(struct pci_config, notify));
-
-       /*
-        * 4.1.4.4.1:
-        *
-        *  The device MUST present at least one notification capability.
-        */
-       bar_offset += bar_len;
-       bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify);
-
-       /*
-        * 4.1.4.4.1:
-        *
-        *  The cap.offset MUST be 2-byte aligned.
-        */
-       assert(pci->common.cap_next % 2 == 0);
-
-       /* FIXME: Use a non-zero notify_off, for per-queue notification? */
-       /*
-        * 4.1.4.4.1:
-        *
-        *  The value cap.length presented by the device MUST be at least 2 and
-        *  MUST be large enough to support queue notification offsets for all
-        *  supported queues in all possible configurations.
-        */
-       assert(bar_len >= 2);
-
-       init_cap(&pci->notify.cap, sizeof(pci->notify),
-                VIRTIO_PCI_CAP_NOTIFY_CFG,
-                bar_offset, bar_len,
-                offsetof(struct pci_config, isr));
-
-       bar_offset += bar_len;
-       bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr);
-       /*
-        * 4.1.4.5.1:
-        *
-        *  The device MUST present at least one VIRTIO_PCI_CAP_ISR_CFG
-        *  capability.
-        */
-       init_cap(&pci->isr, sizeof(pci->isr),
-                VIRTIO_PCI_CAP_ISR_CFG,
-                bar_offset, bar_len,
-                offsetof(struct pci_config, cfg_access));
-
-       /*
-        * 4.1.4.7.1:
-        *
-        * The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG
-        * capability.
-        */
-       /* This doesn't have any presence in the BAR */
-       init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access),
-                VIRTIO_PCI_CAP_PCI_CFG,
-                0, 0, 0);
-
-       bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding);
-       assert(bar_offset == sizeof(struct virtio_pci_mmio));
-
-       /*
-        * This gets sewn in and length set in set_device_config().
-        * Some devices don't have a device configuration interface, so
-        * we never expose this if we don't call set_device_config().
-        */
-       init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG,
-                bar_offset, 0, 0);
-}
-
-/*
- * This routine does all the creation and setup of a new device, but we don't
- * actually place the MMIO region until we know the size (if any) of the
- * device-specific config.  And we don't actually start the service threads
- * until later.
- *
- * See what I mean about userspace being boring?
- */
-static struct device *new_pci_device(const char *name, u16 type,
-                                    u8 class, u8 subclass)
-{
-       struct device *dev = malloc(sizeof(*dev));
-
-       /* Now we populate the fields one at a time. */
-       dev->name = name;
-       dev->vq = NULL;
-       dev->running = false;
-       dev->wrote_features_ok = false;
-       dev->mmio_size = sizeof(struct virtio_pci_mmio);
-       dev->mmio = calloc(1, dev->mmio_size);
-       dev->features = (u64)1 << VIRTIO_F_VERSION_1;
-       dev->features_accepted = 0;
-
-       if (devices.device_num + 1 >= MAX_PCI_DEVICES)
-               errx(1, "Can only handle 31 PCI devices");
-
-       init_pci_config(&dev->config, type, class, subclass);
-       assert(!devices.pci[devices.device_num+1]);
-       devices.pci[++devices.device_num] = dev;
-
-       return dev;
-}
-
-/*
- * Our first setup routine is the console.  It's a fairly simple device, but
- * UNIX tty handling makes it uglier than it could be.
- */
-static void setup_console(void)
-{
-       struct device *dev;
-       struct virtio_console_config conf;
-
-       /* If we can save the initial standard input settings... */
-       if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
-               struct termios term = orig_term;
-               /*
-                * Then we turn off echo, line buffering and ^C etc: We want a
-                * raw input stream to the Guest.
-                */
-               term.c_lflag &= ~(ISIG|ICANON|ECHO);
-               tcsetattr(STDIN_FILENO, TCSANOW, &term);
-       }
-
-       dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00);
-
-       /* We store the console state in dev->priv, and initialize it. */
-       dev->priv = malloc(sizeof(struct console_abort));
-       ((struct console_abort *)dev->priv)->count = 0;
-
-       /*
-        * The console needs two virtqueues: the input then the output.  When
-        * they put something the input queue, we make sure we're listening to
-        * stdin.  When they put something in the output queue, we write it to
-        * stdout.
-        */
-       add_pci_virtqueue(dev, console_input, "input");
-       add_pci_virtqueue(dev, console_output, "output");
-
-       /* We need a configuration area for the emerg_wr early writes. */
-       add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE);
-       set_device_config(dev, &conf, sizeof(conf));
-
-       verbose("device %u: console\n", devices.device_num);
-}
-/*:*/
-
-/*M:010
- * Inter-guest networking is an interesting area.  Simplest is to have a
- * --sharenet=<name> option which opens or creates a named pipe.  This can be
- * used to send packets to another guest in a 1:1 manner.
- *
- * More sophisticated is to use one of the tools developed for project like UML
- * to do networking.
- *
- * Faster is to do virtio bonding in kernel.  Doing this 1:1 would be
- * completely generic ("here's my vring, attach to your vring") and would work
- * for any traffic.  Of course, namespace and permissions issues need to be
- * dealt with.  A more sophisticated "multi-channel" virtio_net.c could hide
- * multiple inter-guest channels behind one interface, although it would
- * require some manner of hotplugging new virtio channels.
- *
- * Finally, we could use a virtio network switch in the kernel, ie. vhost.
-:*/
-
-static u32 str2ip(const char *ipaddr)
-{
-       unsigned int b[4];
-
-       if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
-               errx(1, "Failed to parse IP address '%s'", ipaddr);
-       return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
-}
-
-static void str2mac(const char *macaddr, unsigned char mac[6])
-{
-       unsigned int m[6];
-       if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
-                  &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6)
-               errx(1, "Failed to parse mac address '%s'", macaddr);
-       mac[0] = m[0];
-       mac[1] = m[1];
-       mac[2] = m[2];
-       mac[3] = m[3];
-       mac[4] = m[4];
-       mac[5] = m[5];
-}
-
-/*
- * This code is "adapted" from libbridge: it attaches the Host end of the
- * network device to the bridge device specified by the command line.
- *
- * This is yet another James Morris contribution (I'm an IP-level guy, so I
- * dislike bridging), and I just try not to break it.
- */
-static void add_to_bridge(int fd, const char *if_name, const char *br_name)
-{
-       int ifidx;
-       struct ifreq ifr;
-
-       if (!*br_name)
-               errx(1, "must specify bridge name");
-
-       ifidx = if_nametoindex(if_name);
-       if (!ifidx)
-               errx(1, "interface %s does not exist!", if_name);
-
-       strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
-       ifr.ifr_name[IFNAMSIZ-1] = '\0';
-       ifr.ifr_ifindex = ifidx;
-       if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
-               err(1, "can't add %s to bridge %s", if_name, br_name);
-}
-
-/*
- * This sets up the Host end of the network device with an IP address, brings
- * it up so packets will flow, the copies the MAC address into the hwaddr
- * pointer.
- */
-static void configure_device(int fd, const char *tapif, u32 ipaddr)
-{
-       struct ifreq ifr;
-       struct sockaddr_in sin;
-
-       memset(&ifr, 0, sizeof(ifr));
-       strcpy(ifr.ifr_name, tapif);
-
-       /* Don't read these incantations.  Just cut & paste them like I did! */
-       sin.sin_family = AF_INET;
-       sin.sin_addr.s_addr = htonl(ipaddr);
-       memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
-       if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
-               err(1, "Setting %s interface address", tapif);
-       ifr.ifr_flags = IFF_UP;
-       if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
-               err(1, "Bringing interface %s up", tapif);
-}
-
-static int get_tun_device(char tapif[IFNAMSIZ])
-{
-       struct ifreq ifr;
-       int vnet_hdr_sz;
-       int netfd;
-
-       /* Start with this zeroed.  Messy but sure. */
-       memset(&ifr, 0, sizeof(ifr));
-
-       /*
-        * We open the /dev/net/tun device and tell it we want a tap device.  A
-        * tap device is like a tun device, only somehow different.  To tell
-        * the truth, I completely blundered my way through this code, but it
-        * works now!
-        */
-       netfd = open_or_die("/dev/net/tun", O_RDWR);
-       ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
-       strcpy(ifr.ifr_name, "tap%d");
-       if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
-               err(1, "configuring /dev/net/tun");
-
-       if (ioctl(netfd, TUNSETOFFLOAD,
-                 TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
-               err(1, "Could not set features for tun device");
-
-       /*
-        * We don't need checksums calculated for packets coming in this
-        * device: trust us!
-        */
-       ioctl(netfd, TUNSETNOCSUM, 1);
-
-       /*
-        * In virtio before 1.0 (aka legacy virtio), we added a 16-bit
-        * field at the end of the network header iff
-        * VIRTIO_NET_F_MRG_RXBUF was negotiated.  For virtio 1.0,
-        * that became the norm, but we need to tell the tun device
-        * about our expanded header (which is called
-        * virtio_net_hdr_mrg_rxbuf in the legacy system).
-        */
-       vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1);
-       if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0)
-               err(1, "Setting tun header size to %u", vnet_hdr_sz);
-
-       memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
-       return netfd;
-}
-
-/*L:195
- * Our network is a Host<->Guest network.  This can either use bridging or
- * routing, but the principle is the same: it uses the "tun" device to inject
- * packets into the Host as if they came in from a normal network card.  We
- * just shunt packets between the Guest and the tun device.
- */
-static void setup_tun_net(char *arg)
-{
-       struct device *dev;
-       struct net_info *net_info = malloc(sizeof(*net_info));
-       int ipfd;
-       u32 ip = INADDR_ANY;
-       bool bridging = false;
-       char tapif[IFNAMSIZ], *p;
-       struct virtio_net_config conf;
-
-       net_info->tunfd = get_tun_device(tapif);
-
-       /* First we create a new network device. */
-       dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00);
-       dev->priv = net_info;
-
-       /* Network devices need a recv and a send queue, just like console. */
-       add_pci_virtqueue(dev, net_input, "rx");
-       add_pci_virtqueue(dev, net_output, "tx");
-
-       /*
-        * We need a socket to perform the magic network ioctls to bring up the
-        * tap interface, connect to the bridge etc.  Any socket will do!
-        */
-       ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
-       if (ipfd < 0)
-               err(1, "opening IP socket");
-
-       /* If the command line was --tunnet=bridge:<name> do bridging. */
-       if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
-               arg += strlen(BRIDGE_PFX);
-               bridging = true;
-       }
-
-       /* A mac address may follow the bridge name or IP address */
-       p = strchr(arg, ':');
-       if (p) {
-               str2mac(p+1, conf.mac);
-               add_pci_feature(dev, VIRTIO_NET_F_MAC);
-               *p = '\0';
-       }
-
-       /* arg is now either an IP address or a bridge name */
-       if (bridging)
-               add_to_bridge(ipfd, tapif, arg);
-       else
-               ip = str2ip(arg);
-
-       /* Set up the tun device. */
-       configure_device(ipfd, tapif, ip);
-
-       /* Expect Guest to handle everything except UFO */
-       add_pci_feature(dev, VIRTIO_NET_F_CSUM);
-       add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
-       add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
-       add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
-       add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN);
-       add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4);
-       add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6);
-       add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN);
-       /* We handle indirect ring entries */
-       add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
-       set_device_config(dev, &conf, sizeof(conf));
-
-       /* We don't need the socket any more; setup is done. */
-       close(ipfd);
-
-       if (bridging)
-               verbose("device %u: tun %s attached to bridge: %s\n",
-                       devices.device_num, tapif, arg);
-       else
-               verbose("device %u: tun %s: %s\n",
-                       devices.device_num, tapif, arg);
-}
-/*:*/
-
-/* This hangs off device->priv. */
-struct vblk_info {
-       /* The size of the file. */
-       off64_t len;
-
-       /* The file descriptor for the file. */
-       int fd;
-
-};
-
-/*L:210
- * The Disk
- *
- * The disk only has one virtqueue, so it only has one thread.  It is really
- * simple: the Guest asks for a block number and we read or write that position
- * in the file.
- *
- * Before we serviced each virtqueue in a separate thread, that was unacceptably
- * slow: the Guest waits until the read is finished before running anything
- * else, even if it could have been doing useful work.
- *
- * We could have used async I/O, except it's reputed to suck so hard that
- * characters actually go missing from your code when you try to use it.
- */
-static void blk_request(struct virtqueue *vq)
-{
-       struct vblk_info *vblk = vq->dev->priv;
-       unsigned int head, out_num, in_num, wlen;
-       int ret, i;
-       u8 *in;
-       struct virtio_blk_outhdr out;
-       struct iovec iov[vq->vring.num];
-       off64_t off;
-
-       /*
-        * Get the next request, where we normally wait.  It triggers the
-        * interrupt to acknowledge previously serviced requests (if any).
-        */
-       head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
-
-       /* Copy the output header from the front of the iov (adjusts iov) */
-       iov_consume(vq->dev, iov, out_num, &out, sizeof(out));
-
-       /* Find and trim end of iov input array, for our status byte. */
-       in = NULL;
-       for (i = out_num + in_num - 1; i >= out_num; i--) {
-               if (iov[i].iov_len > 0) {
-                       in = iov[i].iov_base + iov[i].iov_len - 1;
-                       iov[i].iov_len--;
-                       break;
-               }
-       }
-       if (!in)
-               bad_driver_vq(vq, "Bad virtblk cmd with no room for status");
-
-       /*
-        * For historical reasons, block operations are expressed in 512 byte
-        * "sectors".
-        */
-       off = out.sector * 512;
-
-       if (out.type & VIRTIO_BLK_T_OUT) {
-               /*
-                * Write
-                *
-                * Move to the right location in the block file.  This can fail
-                * if they try to write past end.
-                */
-               if (lseek64(vblk->fd, off, SEEK_SET) != off)
-                       err(1, "Bad seek to sector %llu", out.sector);
-
-               ret = writev(vblk->fd, iov, out_num);
-               verbose("WRITE to sector %llu: %i\n", out.sector, ret);
-
-               /*
-                * Grr... Now we know how long the descriptor they sent was, we
-                * make sure they didn't try to write over the end of the block
-                * file (possibly extending it).
-                */
-               if (ret > 0 && off + ret > vblk->len) {
-                       /* Trim it back to the correct length */
-                       ftruncate64(vblk->fd, vblk->len);
-                       /* Die, bad Guest, die. */
-                       bad_driver_vq(vq, "Write past end %llu+%u", off, ret);
-               }
-
-               wlen = sizeof(*in);
-               *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
-       } else if (out.type & VIRTIO_BLK_T_FLUSH) {
-               /* Flush */
-               ret = fdatasync(vblk->fd);
-               verbose("FLUSH fdatasync: %i\n", ret);
-               wlen = sizeof(*in);
-               *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
-       } else {
-               /*
-                * Read
-                *
-                * Move to the right location in the block file.  This can fail
-                * if they try to read past end.
-                */
-               if (lseek64(vblk->fd, off, SEEK_SET) != off)
-                       err(1, "Bad seek to sector %llu", out.sector);
-
-               ret = readv(vblk->fd, iov + out_num, in_num);
-               if (ret >= 0) {
-                       wlen = sizeof(*in) + ret;
-                       *in = VIRTIO_BLK_S_OK;
-               } else {
-                       wlen = sizeof(*in);
-                       *in = VIRTIO_BLK_S_IOERR;
-               }
-       }
-
-       /* Finished that request. */
-       add_used(vq, head, wlen);
-}
-
-/*L:198 This actually sets up a virtual block device. */
-static void setup_block_file(const char *filename)
-{
-       struct device *dev;
-       struct vblk_info *vblk;
-       struct virtio_blk_config conf;
-
-       /* Create the device. */
-       dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80);
-
-       /* The device has one virtqueue, where the Guest places requests. */
-       add_pci_virtqueue(dev, blk_request, "request");
-
-       /* Allocate the room for our own bookkeeping */
-       vblk = dev->priv = malloc(sizeof(*vblk));
-
-       /* First we open the file and store the length. */
-       vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
-       vblk->len = lseek64(vblk->fd, 0, SEEK_END);
-
-       /* Tell Guest how many sectors this device has. */
-       conf.capacity = cpu_to_le64(vblk->len / 512);
-
-       /*
-        * Tell Guest not to put in too many descriptors at once: two are used
-        * for the in and out elements.
-        */
-       add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX);
-       conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
-
-       set_device_config(dev, &conf, sizeof(struct virtio_blk_config));
-
-       verbose("device %u: virtblock %llu sectors\n",
-               devices.device_num, le64_to_cpu(conf.capacity));
-}
-
-/*L:211
- * Our random number generator device reads from /dev/urandom into the Guest's
- * input buffers.  The usual case is that the Guest doesn't want random numbers
- * and so has no buffers although /dev/urandom is still readable, whereas
- * console is the reverse.
- *
- * The same logic applies, however.
- */
-struct rng_info {
-       int rfd;
-};
-
-static void rng_input(struct virtqueue *vq)
-{
-       int len;
-       unsigned int head, in_num, out_num, totlen = 0;
-       struct rng_info *rng_info = vq->dev->priv;
-       struct iovec iov[vq->vring.num];
-
-       /* First we need a buffer from the Guests's virtqueue. */
-       head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
-       if (out_num)
-               bad_driver_vq(vq, "Output buffers in rng?");
-
-       /*
-        * Just like the console write, we loop to cover the whole iovec.
-        * In this case, short reads actually happen quite a bit.
-        */
-       while (!iov_empty(iov, in_num)) {
-               len = readv(rng_info->rfd, iov, in_num);
-               if (len <= 0)
-                       err(1, "Read from /dev/urandom gave %i", len);
-               iov_consume(vq->dev, iov, in_num, NULL, len);
-               totlen += len;
-       }
-
-       /* Tell the Guest about the new input. */
-       add_used(vq, head, totlen);
-}
-
-/*L:199
- * This creates a "hardware" random number device for the Guest.
- */
-static void setup_rng(void)
-{
-       struct device *dev;
-       struct rng_info *rng_info = malloc(sizeof(*rng_info));
-
-       /* Our device's private info simply contains the /dev/urandom fd. */
-       rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY);
-
-       /* Create the new device. */
-       dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0);
-       dev->priv = rng_info;
-
-       /* The device has one virtqueue, where the Guest places inbufs. */
-       add_pci_virtqueue(dev, rng_input, "input");
-
-       /* We don't have any configuration space */
-       no_device_config(dev);
-
-       verbose("device %u: rng\n", devices.device_num);
-}
-/* That's the end of device setup. */
-
-/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
-static void __attribute__((noreturn)) restart_guest(void)
-{
-       unsigned int i;
-
-       /*
-        * Since we don't track all open fds, we simply close everything beyond
-        * stderr.
-        */
-       for (i = 3; i < FD_SETSIZE; i++)
-               close(i);
-
-       /* Reset all the devices (kills all threads). */
-       cleanup_devices();
-
-       execv(main_args[0], main_args);
-       err(1, "Could not exec %s", main_args[0]);
-}
-
-/*L:220
- * Finally we reach the core of the Launcher which runs the Guest, serves
- * its input and output, and finally, lays it to rest.
- */
-static void __attribute__((noreturn)) run_guest(void)
-{
-       for (;;) {
-               struct lguest_pending notify;
-               int readval;
-
-               /* We read from the /dev/lguest device to run the Guest. */
-               readval = pread(lguest_fd, &notify, sizeof(notify), cpu_id);
-               if (readval == sizeof(notify)) {
-                       if (notify.trap == 13) {
-                               verbose("Emulating instruction at %#x\n",
-                                       getreg(eip));
-                               emulate_insn(notify.insn);
-                       } else if (notify.trap == 14) {
-                               verbose("Emulating MMIO at %#x\n",
-                                       getreg(eip));
-                               emulate_mmio(notify.addr, notify.insn);
-                       } else
-                               errx(1, "Unknown trap %i addr %#08x\n",
-                                    notify.trap, notify.addr);
-               /* ENOENT means the Guest died.  Reading tells us why. */
-               } else if (errno == ENOENT) {
-                       char reason[1024] = { 0 };
-                       pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
-                       errx(1, "%s", reason);
-               /* ERESTART means that we need to reboot the guest */
-               } else if (errno == ERESTART) {
-                       restart_guest();
-               /* Anything else means a bug or incompatible change. */
-               } else
-                       err(1, "Running guest failed");
-       }
-}
-/*L:240
- * This is the end of the Launcher.  The good news: we are over halfway
- * through!  The bad news: the most fiendish part of the code still lies ahead
- * of us.
- *
- * Are you ready?  Take a deep breath and join me in the core of the Host, in
- * "make Host".
-:*/
-
-static struct option opts[] = {
-       { "verbose", 0, NULL, 'v' },
-       { "tunnet", 1, NULL, 't' },
-       { "block", 1, NULL, 'b' },
-       { "rng", 0, NULL, 'r' },
-       { "initrd", 1, NULL, 'i' },
-       { "username", 1, NULL, 'u' },
-       { "chroot", 1, NULL, 'c' },
-       { NULL },
-};
-static void usage(void)
-{
-       errx(1, "Usage: lguest [--verbose] "
-            "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
-            "|--block=<filename>|--initrd=<filename>]...\n"
-            "<mem-in-mb> vmlinux [args...]");
-}
-
-/*L:105 The main routine is where the real work begins: */
-int main(int argc, char *argv[])
-{
-       /* Memory, code startpoint and size of the (optional) initrd. */
-       unsigned long mem = 0, start, initrd_size = 0;
-       /* Two temporaries. */
-       int i, c;
-       /* The boot information for the Guest. */
-       struct boot_params *boot;
-       /* If they specify an initrd file to load. */
-       const char *initrd_name = NULL;
-
-       /* Password structure for initgroups/setres[gu]id */
-       struct passwd *user_details = NULL;
-
-       /* Directory to chroot to */
-       char *chroot_path = NULL;
-
-       /* Save the args: we "reboot" by execing ourselves again. */
-       main_args = argv;
-
-       /*
-        * First we initialize the device list.  We remember next interrupt
-        * number to use for devices (1: remember that 0 is used by the timer).
-        */
-       devices.next_irq = 1;
-
-       /* We're CPU 0.  In fact, that's the only CPU possible right now. */
-       cpu_id = 0;
-
-       /*
-        * We need to know how much memory so we can set up the device
-        * descriptor and memory pages for the devices as we parse the command
-        * line.  So we quickly look through the arguments to find the amount
-        * of memory now.
-        */
-       for (i = 1; i < argc; i++) {
-               if (argv[i][0] != '-') {
-                       mem = atoi(argv[i]) * 1024 * 1024;
-                       /*
-                        * We start by mapping anonymous pages over all of
-                        * guest-physical memory range.  This fills it with 0,
-                        * and ensures that the Guest won't be killed when it
-                        * tries to access it.
-                        */
-                       guest_base = map_zeroed_pages(mem / getpagesize()
-                                                     + DEVICE_PAGES);
-                       guest_limit = mem;
-                       guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize();
-                       break;
-               }
-       }
-
-       /* If we exit via err(), this kills all the threads, restores tty. */
-       atexit(cleanup_devices);
-
-       /* We always have a console device, and it's always device 1. */
-       setup_console();
-
-       /* The options are fairly straight-forward */
-       while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
-               switch (c) {
-               case 'v':
-                       verbose = true;
-                       break;
-               case 't':
-                       setup_tun_net(optarg);
-                       break;
-               case 'b':
-                       setup_block_file(optarg);
-                       break;
-               case 'r':
-                       setup_rng();
-                       break;
-               case 'i':
-                       initrd_name = optarg;
-                       break;
-               case 'u':
-                       user_details = getpwnam(optarg);
-                       if (!user_details)
-                               err(1, "getpwnam failed, incorrect username?");
-                       break;
-               case 'c':
-                       chroot_path = optarg;
-                       break;
-               default:
-                       warnx("Unknown argument %s", argv[optind]);
-                       usage();
-               }
-       }
-       /*
-        * After the other arguments we expect memory and kernel image name,
-        * followed by command line arguments for the kernel.
-        */
-       if (optind + 2 > argc)
-               usage();
-
-       verbose("Guest base is at %p\n", guest_base);
-
-       /* Initialize the (fake) PCI host bridge device. */
-       init_pci_host_bridge();
-
-       /* Now we load the kernel */
-       start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
-
-       /* Boot information is stashed at physical address 0 */
-       boot = from_guest_phys(0);
-
-       /* Map the initrd image if requested (at top of physical memory) */
-       if (initrd_name) {
-               initrd_size = load_initrd(initrd_name, mem);
-               /*
-                * These are the location in the Linux boot header where the
-                * start and size of the initrd are expected to be found.
-                */
-               boot->hdr.ramdisk_image = mem - initrd_size;
-               boot->hdr.ramdisk_size = initrd_size;
-               /* The bootloader type 0xFF means "unknown"; that's OK. */
-               boot->hdr.type_of_loader = 0xFF;
-       }
-
-       /*
-        * The Linux boot header contains an "E820" memory map: ours is a
-        * simple, single region.
-        */
-       boot->e820_entries = 1;
-       boot->e820_table[0] = ((struct e820_entry) { 0, mem, E820_TYPE_RAM });
-       /*
-        * The boot header contains a command line pointer: we put the command
-        * line after the boot header.
-        */
-       boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
-       /* We use a simple helper to copy the arguments separated by spaces. */
-       concat((char *)(boot + 1), argv+optind+2);
-
-       /* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */
-       boot->hdr.kernel_alignment = 0x1000000;
-
-       /* Boot protocol version: 2.07 supports the fields for lguest. */
-       boot->hdr.version = 0x207;
-
-       /* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */
-       boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST;
-
-       /* Tell the entry path not to try to reload segment registers. */
-       boot->hdr.loadflags |= KEEP_SEGMENTS;
-
-       /* We don't support tboot: */
-       boot->tboot_addr = 0;
-
-       /* Ensure this is 0 to prevent APM from loading: */
-       boot->apm_bios_info.version = 0;
-
-       /* We tell the kernel to initialize the Guest. */
-       tell_kernel(start);
-
-       /* Ensure that we terminate if a device-servicing child dies. */
-       signal(SIGCHLD, kill_launcher);
-
-       /* If requested, chroot to a directory */
-       if (chroot_path) {
-               if (chroot(chroot_path) != 0)
-                       err(1, "chroot(\"%s\") failed", chroot_path);
-
-               if (chdir("/") != 0)
-                       err(1, "chdir(\"/\") failed");
-
-               verbose("chroot done\n");
-       }
-
-       /* If requested, drop privileges */
-       if (user_details) {
-               uid_t u;
-               gid_t g;
-
-               u = user_details->pw_uid;
-               g = user_details->pw_gid;
-
-               if (initgroups(user_details->pw_name, g) != 0)
-                       err(1, "initgroups failed");
-
-               if (setresgid(g, g, g) != 0)
-                       err(1, "setresgid failed");
-
-               if (setresuid(u, u, u) != 0)
-                       err(1, "setresuid failed");
-
-               verbose("Dropping privileges completed\n");
-       }
-
-       /* Finally, run the Guest.  This doesn't return. */
-       run_guest();
-}
-/*:*/
-
-/*M:999
- * Mastery is done: you now know everything I do.
- *
- * But surely you have seen code, features and bugs in your wanderings which
- * you now yearn to attack?  That is the real game, and I look forward to you
- * patching and forking lguest into the Your-Name-Here-visor.
- *
- * Farewell, and good coding!
- * Rusty Russell.
- */
diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt

deleted file mode 100644 (file)

index 06e1f46..0000000
--- a/tools/lguest/lguest.txt
+++ /dev/null
@@ -1,125 +0,0 @@
-      __
- (___()'`;  Rusty's Remarkably Unreliable Guide to Lguest
- /,    /`      - or, A Young Coder's Illustrated Hypervisor
- \\"--\\    http://lguest.ozlabs.org
-
-Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel,
-for Linux developers and users to experiment with virtualization with the
-minimum of complexity.  Nonetheless, it should have sufficient features to
-make it useful for specific tasks, and, of course, you are encouraged to fork
-and enhance it (see drivers/lguest/README).
-
-Features:
-
-- Kernel module which runs in a normal kernel.
-- Simple I/O model for communication.
-- Simple program to create new guests.
-- Logo contains cute puppies: http://lguest.ozlabs.org
-
-Developer features:
-
-- Fun to hack on.
-- No ABI: being tied to a specific kernel anyway, you can change anything.
-- Many opportunities for improvement or feature implementation.
-
-Running Lguest:
-
-- The easiest way to run lguest is to use the same kernel as guest and host.
-  You can configure them differently, but usually it's easiest not to.
-
-  You will need to configure your kernel with the following options:
-
-  "Processor type and features":
-     "Paravirtualized guest support" = Y
-        "Lguest guest support" = Y
-     "High Memory Support" = off/4GB
-     "Alignment value to which kernel should be aligned" = 0x100000
-        (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
-         CONFIG_PHYSICAL_ALIGN=0x100000)
-
-  "Device Drivers":
-     "Block devices"
-        "Virtio block driver" = M/Y
-     "Network device support"
-        "Universal TUN/TAP device driver support" = M/Y
-        "Virtio network driver" = M/Y
-           (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m)
-
-  "Virtualization"
-     "Linux hypervisor example code" = M/Y
-        (CONFIG_LGUEST=m)
-
-- A tool called "lguest" is available in this directory: type "make"
-  to build it.  If you didn't build your kernel in-tree, use "make
-  O=<builddir>".
-
-- Create or find a root disk image.  There are several useful ones
-  around, such as the xm-test tiny root image at
-         http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
-
-  For more serious work, I usually use a distribution ISO image and
-  install it under qemu, then make multiple copies:
-
-         dd if=/dev/zero of=rootfile bs=1M count=2048
-         qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
-
-  Make sure that you install a getty on /dev/hvc0 if you want to log in on the
-  console!
-
-- "modprobe lg" if you built it as a module.
-
-- Run an lguest as root:
-
-      tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
-        --block=rootfile root=/dev/vda
-
-   Explanation:
-    64: the amount of memory to use, in MB.
-
-    vmlinux: the kernel image found in the top of your build directory.  You
-       can also use a standard bzImage.
-
-    --tunnet=192.168.19.1: configures a "tap" device for networking with this
-       IP address.
-
-    --block=rootfile: a file or block device which becomes /dev/vda
-       inside the guest.
-
-    root=/dev/vda: this and anything else on the command line are
-       kernel boot parameters.
-
-- Configuring networking.  I usually have the host masquerade, using
-  "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 >
-  /proc/sys/net/ipv4/ip_forward".  In this example, I would configure
-  eth0 inside the guest at 192.168.19.2.
-
-  Another method is to bridge the tap device to an external interface
-  using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
-  to obtain an IP address.  The bridge needs to be configured first:
-  this option simply adds the tap interface to it.
-
-  A simple example on my system:
-
-    ifconfig eth0 0.0.0.0
-    brctl addbr lg0
-    ifconfig lg0 up
-    brctl addif lg0 eth0
-    dhclient lg0
-
-  Then use --tunnet=bridge:lg0 when launching the guest.
-
-  See:
-  
-    http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge
-    
-  for general information on how to get bridging to work.
-
-- Random number generation. Using the --rng option will provide a
-  /dev/hwrng in the guest that will read from the host's /dev/random.
-  Use this option in conjunction with rng-tools (see ../hw_random.txt)
-  to provide entropy to the guest kernel's /dev/random.
-
-There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
-
-Good luck!
-Rusty Russell rusty@rustcorp.com.au.
author	Juergen Gross <jgross@suse.com>
	Wed, 16 Aug 2017 17:31:57 +0000 (19:31 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Thu, 24 Aug 2017 07:57:28 +0000 (09:57 +0200)
MAINTAINERS		patch \| blob \| blame \| history
arch/x86/Kbuild		patch \| blob \| blame \| history
arch/x86/Kconfig		patch \| blob \| blame \| history
arch/x86/include/asm/lguest.h	[deleted file]	patch \| blob \| blame \| history
arch/x86/include/asm/lguest_hcall.h	[deleted file]	patch \| blob \| blame \| history
arch/x86/include/asm/processor.h		patch \| blob \| blame \| history
arch/x86/include/uapi/asm/bootparam.h		patch \| blob \| blame \| history
arch/x86/kernel/asm-offsets_32.c		patch \| blob \| blame \| history
arch/x86/kernel/head_32.S		patch \| blob \| blame \| history
arch/x86/kernel/platform-quirks.c		patch \| blob \| blame \| history
arch/x86/kvm/Kconfig		patch \| blob \| blame \| history
arch/x86/lguest/Kconfig	[deleted file]	patch \| blob \| blame \| history
arch/x86/lguest/Makefile	[deleted file]	patch \| blob \| blame \| history
arch/x86/lguest/boot.c	[deleted file]	patch \| blob \| blame \| history
arch/x86/lguest/head_32.S	[deleted file]	patch \| blob \| blame \| history
drivers/Makefile		patch \| blob \| blame \| history
drivers/block/Kconfig		patch \| blob \| blame \| history
drivers/char/Kconfig		patch \| blob \| blame \| history
drivers/char/virtio_console.c		patch \| blob \| blame \| history
drivers/lguest/Kconfig	[deleted file]	patch \| blob \| blame \| history
drivers/lguest/Makefile	[deleted file]	patch \| blob \| blame \| history
drivers/lguest/README	[deleted file]	patch \| blob \| blame \| history
drivers/lguest/core.c	[deleted file]	patch \| blob \| blame \| history
drivers/lguest/hypercalls.c	[deleted file]	patch \| blob \| blame \| history
drivers/lguest/interrupts_and_traps.c	[deleted file]	patch \| blob \| blame \| history
drivers/lguest/lg.h	[deleted file]	patch \| blob \| blame \| history
drivers/lguest/lguest_user.c	[deleted file]	patch \| blob \| blame \| history
drivers/lguest/page_tables.c	[deleted file]	patch \| blob \| blame \| history
drivers/lguest/segments.c	[deleted file]	patch \| blob \| blame \| history
drivers/lguest/x86/core.c	[deleted file]	patch \| blob \| blame \| history
drivers/lguest/x86/switcher_32.S	[deleted file]	patch \| blob \| blame \| history
drivers/net/Kconfig		patch \| blob \| blame \| history
drivers/tty/hvc/Kconfig		patch \| blob \| blame \| history
drivers/virtio/Kconfig		patch \| blob \| blame \| history
include/linux/lguest.h	[deleted file]	patch \| blob \| blame \| history
include/linux/lguest_launcher.h	[deleted file]	patch \| blob \| blame \| history
include/uapi/linux/virtio_ring.h		patch \| blob \| blame \| history
tools/Makefile		patch \| blob \| blame \| history
tools/lguest/.gitignore	[deleted file]	patch \| blob \| blame \| history
tools/lguest/Makefile	[deleted file]	patch \| blob \| blame \| history
tools/lguest/extract	[deleted file]	patch \| blob \| blame \| history
tools/lguest/lguest.c	[deleted file]	patch \| blob \| blame \| history
tools/lguest/lguest.txt	[deleted file]	patch \| blob \| blame \| history