/*P:400 This contains run_guest() which actually calls into the Host<->Guest
 * Switcher and analyzes the return, such as determining if the Guest wants the
 * Host to do something.  This file also contains useful helper routines, and a
 * couple of non-obvious setup and teardown pieces which were implemented after
 * days of debugging pain. :*/
#include <linux/module.h>
#include <linux/stringify.h>
#include <linux/stddef.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
#include <asm/poll.h>
#include <asm/highmem.h>
#include <asm/asm-offsets.h>
#include <asm/i387.h>
#include "lg.h"

/* Found in switcher.S */
extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
extern unsigned long default_idt_entries[];

/* Every guest maps the core switcher code. */
#define SHARED_SWITCHER_PAGES \
	DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
/* Pages for switcher itself, then two pages per cpu */
#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)

/* We map at -4M for ease of mapping into the guest (one PTE page). */
#define SWITCHER_ADDR 0xFFC00000
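/* Why one PTE page is enough: without PAE a page of PTEs holds 1024
 * entries mapping 4096 bytes each, ie. exactly 4MB, and 0xFFC00000 is
 * 4GB - 4MB: the whole Switcher fits in the Guest's topmost PTE page. */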

static struct vm_struct *switcher_vma;
static struct page **switcher_page;

static int cpu_had_pge;
static struct {
	unsigned long offset;
	unsigned short segment;
} lguest_entry;
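/* This layout (32-bit offset, then 16-bit segment) is no accident: it's
 * the operand an indirect far call expects, and run_guest_once() below
 * enters the Switcher through exactly that: "lcall *lguest_entry". */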

/* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX(lguest_lock);
static DEFINE_PER_CPU(struct lguest *, last_guest);

/* FIXME: Make dynamic. */
#define MAX_LGUEST_GUESTS 16
struct lguest lguests[MAX_LGUEST_GUESTS];

/* Offset from where switcher.S was compiled to where we've copied it */
static unsigned long switcher_offset(void)
{
	return SWITCHER_ADDR - (unsigned long)start_switcher_text;
}

/* This cpu's struct lguest_pages. */
static struct lguest_pages *lguest_pages(unsigned int cpu)
{
	return &(((struct lguest_pages *)
		(SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
}
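/* Spelled out: the shared Switcher text fills the first
 * SHARED_SWITCHER_PAGES pages from SWITCHER_ADDR, and each cpu's pair of
 * pages follows as one array entry (struct lguest_pages is exactly two
 * pages: that's where the "2 * NR_CPUS" above comes from). */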

static __init int map_switcher(void)
{
	int i, err;
	struct page **pagep;

	switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
				GFP_KERNEL);
	if (!switcher_page) {
		err = -ENOMEM;
		goto out;
	}

	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
		unsigned long addr = get_zeroed_page(GFP_KERNEL);
		if (!addr) {
			err = -ENOMEM;
			goto free_some_pages;
		}
		switcher_page[i] = virt_to_page(addr);
	}

	switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
				     VM_ALLOC, SWITCHER_ADDR, VMALLOC_END);
	if (!switcher_vma) {
		err = -ENOMEM;
		printk("lguest: could not map switcher pages high\n");
		goto free_pages;
	}

	pagep = switcher_page;
	err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep);
	if (err) {
		printk("lguest: map_vm_area failed: %i\n", err);
		goto free_vma;
	}
	memcpy(switcher_vma->addr, start_switcher_text,
	       end_switcher_text - start_switcher_text);

	/* Fix up IDT entries to point into copied text. */
	for (i = 0; i < IDT_ENTRIES; i++)
		default_idt_entries[i] += switcher_offset();

	for_each_possible_cpu(i) {
		struct lguest_pages *pages = lguest_pages(i);
		struct lguest_ro_state *state = &pages->state;

		/* These fields are static: rest done in copy_in_guest_info */
		state->host_gdt_desc.size = GDT_SIZE-1;
		state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
		store_idt(&state->host_idt_desc);
		state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
		state->guest_idt_desc.address = (long)&state->guest_idt;
		state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
		state->guest_gdt_desc.address = (long)&state->guest_gdt;
		state->guest_tss.esp0 = (long)(&pages->regs + 1);
		state->guest_tss.ss0 = LGUEST_DS;
		/* No I/O for you! */
		state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
		setup_default_gdt_entries(state);
		setup_default_idt_entries(state, default_idt_entries);

		/* Setup LGUEST segments on all cpus */
		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
	}

	/* Initialize entry point into switcher. */
	lguest_entry.offset = (long)switch_to_guest + switcher_offset();
	lguest_entry.segment = LGUEST_CS;

	printk(KERN_INFO "lguest: mapped switcher at %p\n",
	       switcher_vma->addr);
	return 0;

free_vma:
	vunmap(switcher_vma->addr);
free_pages:
	i = TOTAL_SWITCHER_PAGES;
free_some_pages:
	for (--i; i >= 0; i--)
		__free_pages(switcher_page[i], 0);
	kfree(switcher_page);
out:
	return err;
}

static void unmap_switcher(void)
{
	unsigned int i;

	vunmap(switcher_vma->addr);
	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
		__free_pages(switcher_page[i], 0);
}

/* IN/OUT insns: enough to get us past boot-time probing. */
static int emulate_insn(struct lguest *lg)
{
	u8 insn;
	unsigned int insnlen = 0, in = 0, shift = 0;
	unsigned long physaddr = guest_pa(lg, lg->regs->eip);

	/* This only works for addresses in linear mapping... */
	if (lg->regs->eip < lg->page_offset)
		return 0;
	lgread(lg, &insn, physaddr, 1);

	/* Operand size prefix means it's actually for ax. */
	if (insn == 0x66) {
		shift = 16;
		insnlen = 1;
		lgread(lg, &insn, physaddr + insnlen, 1);
	}

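	/* Decoding notes: the mask below discards bit 0 of the opcode,
	 * which only selects the operand size (0xE4 is "in <byte>,%al",
	 * 0xE5 the %eax form).  Bit 3 picks where the port number comes
	 * from: 0xE4/0xE6 take an immediate byte (hence the extra
	 * instruction byte), while 0xEC/0xEE use %dx. */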
	switch (insn & 0xFE) {
	case 0xE4: /* in <next byte>,%al */
		insnlen += 2;
		in = 1;
		break;
	case 0xEC: /* in (%dx),%al */
		insnlen += 1;
		in = 1;
		break;
	case 0xE6: /* out %al,<next byte> */
		insnlen += 2;
		break;
	case 0xEE: /* out %al,(%dx) */
		insnlen += 1;
		break;
	default:
		return 0;
	}

	if (in) {
		/* Lower bit tells us whether it's a byte or a word/long
		 * access: stuff %eax with all ones, as real hardware does
		 * for reads from a port with nothing attached. */
		if (insn & 0x1)
			lg->regs->eax = 0xFFFFFFFF;
		else
			lg->regs->eax |= (0xFFFF << shift);
	}
	lg->regs->eip += insnlen;
	return 1;
}

/*L:305
 * Dealing With Guest Memory.
 *
 * When the Guest gives us (what it thinks is) a physical address, we can use
 * the normal copy_from_user() & copy_to_user() on that address: remember,
 * Guest physical == Launcher virtual.
 *
 * But we can't trust the Guest: it might be trying to access the Launcher
 * code.  We have to check that the range is below the pfn_limit the Launcher
 * gave us.  We have to make sure that addr + len doesn't give us a false
 * positive by overflowing, too. */
int lguest_address_ok(const struct lguest *lg,
		      unsigned long addr, unsigned long len)
{
	return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr);
}
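/* A worked example of that overflow check: with addr = 0xFFFFF000 and
 * len = 0x2000, addr+len wraps to 0x1000 on 32-bit, whose page frame is
 * far below any sane pfn_limit -- yet addr+len < addr, so we still refuse. */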

/* This is a convenient routine to get a 32-bit value from the Guest (a very
 * common operation).  Here we can see how useful the kill_guest() routine we
 * met in the Launcher can be: we return a random value (0) instead of needing
 * to return an error. */
u32 lgread_u32(struct lguest *lg, unsigned long addr)
{
	u32 val = 0;

	/* Don't let them access lguest binary. */
	if (!lguest_address_ok(lg, addr, sizeof(val))
	    || get_user(val, (u32 __user *)addr) != 0)
		kill_guest(lg, "bad read address %#lx", addr);
	return val;
}

/* Same thing for writing a value. */
void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val)
{
	if (!lguest_address_ok(lg, addr, sizeof(val))
	    || put_user(val, (u32 __user *)addr) != 0)
		kill_guest(lg, "bad write address %#lx", addr);
}

/* This routine is more generic, and copies a range of Guest bytes into a
 * buffer.  If the copy_from_user() fails, we fill the buffer with zeroes, so
 * the caller doesn't end up using uninitialized kernel memory. */
void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
{
	if (!lguest_address_ok(lg, addr, bytes)
	    || copy_from_user(b, (void __user *)addr, bytes) != 0) {
		/* copy_from_user should do this, but as we rely on it... */
		memset(b, 0, bytes);
		kill_guest(lg, "bad read address %#lx len %u", addr, bytes);
	}
}

/* Similarly, our generic routine to copy into a range of Guest bytes. */
void lgwrite(struct lguest *lg, unsigned long addr, const void *b,
	     unsigned bytes)
{
	if (!lguest_address_ok(lg, addr, bytes)
	    || copy_to_user((void __user *)addr, b, bytes) != 0)
		kill_guest(lg, "bad write address %#lx len %u", addr, bytes);
}
/* (end of memory access helper routines) :*/

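/* Bit 3 of cr0 is the TS ("task switched") flag.  Once set, the next
 * floating-point instruction faults with Device Not Available (trap 7),
 * which is how we restore the FPU lazily: see the trap 7 handling in
 * run_guest() below. */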
static void set_ts(void)
{
	u32 cr0;

	cr0 = read_cr0();
	if (!(cr0 & 8))
		write_cr0(cr0|8);
}

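/* Before each switch we pull the Guest's current state into this cpu's
 * part of the Switcher pages.  The per-cpu "last_guest" cache (paired
 * with lg->last_pages) means that when the same Guest runs on the same
 * cpu twice in a row, we only recopy what its "changed" bits admit to. */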
static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
{
	if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
		__get_cpu_var(last_guest) = lg;
		lg->last_pages = pages;
		lg->changed = CHANGED_ALL;
	}

	/* These are pretty cheap, so we do them unconditionally. */
	pages->state.host_cr3 = __pa(current->mm->pgd);
	map_switcher_in_guest(lg, pages);
	pages->state.guest_tss.esp1 = lg->esp1;
	pages->state.guest_tss.ss1 = lg->ss1;

	/* Copy direct trap entries. */
	if (lg->changed & CHANGED_IDT)
		copy_traps(lg, pages->state.guest_idt, default_idt_entries);

	/* Copy all GDT entries but the TSS. */
	if (lg->changed & CHANGED_GDT)
		copy_gdt(lg, pages->state.guest_gdt);
	/* If only the TLS entries have changed, copy them. */
	else if (lg->changed & CHANGED_GDT_TLS)
		copy_gdt_tls(lg, pages->state.guest_gdt);

	lg->changed = 0;
}

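/* A note on the asm below: %eax carries the lguest_pages pointer and %ebx
 * the physical address of the Guest's top-level pagetable, the simple
 * two-register convention the Switcher expects.  Pushing eflags first
 * leaves the stack looking like an interrupt frame (eip, cs, eflags from
 * the Switcher's point of view), so it can return to us with one iret. */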
static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
{
	unsigned int clobber;

	copy_in_guest_info(lg, pages);

	/* Put eflags on stack, lcall does rest: suitable for iret return. */
	asm volatile("pushf; lcall *lguest_entry"
		     : "=a"(clobber), "=b"(clobber)
		     : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
		     : "memory", "%edx", "%ecx", "%edi", "%esi");
}

int run_guest(struct lguest *lg, unsigned long __user *user)
{
	while (!lg->dead) {
		unsigned int cr2 = 0; /* Damn gcc */

		/* Hypercalls first: we might have been out to userspace */
		do_hypercalls(lg);
		if (lg->dma_is_pending) {
			if (put_user(lg->pending_dma, user) ||
			    put_user(lg->pending_key, user+1))
				return -EFAULT;
			return sizeof(unsigned long)*2;
		}

		if (signal_pending(current))
			return -ERESTARTSYS;

		/* If Waker set break_out, return to Launcher. */
		if (lg->break_out)
			return -EAGAIN;

		maybe_do_interrupt(lg);

		try_to_freeze();

		if (lg->dead)
			break;

		if (lg->halted) {
			set_current_state(TASK_INTERRUPTIBLE);
			schedule();
			continue;
		}

		local_irq_disable();

		/* Even if *we* don't want FPU trap, guest might... */
		if (lg->ts)
			set_ts();

		/* Don't let Guest do SYSENTER: we can't handle it. */
		if (boot_cpu_has(X86_FEATURE_SEP))
			wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);

		run_guest_once(lg, lguest_pages(raw_smp_processor_id()));

		/* Save cr2 now if we page-faulted. */
		if (lg->regs->trapnum == 14)
			cr2 = read_cr2();
		else if (lg->regs->trapnum == 7)
			math_state_restore();

		if (boot_cpu_has(X86_FEATURE_SEP))
			wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
		local_irq_enable();

		switch (lg->regs->trapnum) {
		case 13: /* We've intercepted a GPF. */
			if (lg->regs->errcode == 0) {
				if (emulate_insn(lg))
					continue;
			}
			break;
		case 14: /* We've intercepted a page fault. */
			if (demand_page(lg, cr2, lg->regs->errcode))
				continue;

			/* If lguest_data is NULL, this won't hurt. */
			if (put_user(cr2, &lg->lguest_data->cr2))
				kill_guest(lg, "Writing cr2");
			break;
		case 7: /* We've intercepted a Device Not Available fault. */
			/* If they don't want to know, just absorb it. */
			if (!lg->ts)
				continue;
			break;
		case 32 ... 255: /* Real interrupt, fall thru */
			cond_resched();
		case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
			continue;
		}

		if (deliver_trap(lg, lg->regs->trapnum))
			continue;

		kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
			   lg->regs->trapnum, lg->regs->eip,
			   lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode);
	}
	return -ENOENT;
}

int find_free_guest(void)
{
	unsigned int i;
	for (i = 0; i < MAX_LGUEST_GUESTS; i++)
		if (!lguests[i].tsk)
			return i;
	return -1;
}

static void adjust_pge(void *on)
{
	if (on)
		write_cr4(read_cr4() | X86_CR4_PGE);
	else
		write_cr4(read_cr4() & ~X86_CR4_PGE);
}
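/* Why we care about PGE: "global" pages deliberately survive cr3 reloads
 * in the TLB, so a stale Host mapping could outlive the switch into the
 * Guest's pagetables.  With PGE off, the Switcher's cr3 write flushes
 * everything: that's the "broader idea of global" init() refers to. */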

static int __init init(void)
{
	int err;

	if (paravirt_enabled()) {
		printk("lguest is afraid of %s\n", paravirt_ops.name);
		return -EPERM;
	}

	err = map_switcher();
	if (err)
		return err;

	err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
	if (err) {
		unmap_switcher();
		return err;
	}
	lguest_io_init();

	err = lguest_device_init();
	if (err) {
		free_pagetables();
		unmap_switcher();
		return err;
	}
	lock_cpu_hotplug();
	if (cpu_has_pge) { /* We have a broader idea of "global". */
		cpu_had_pge = 1;
		on_each_cpu(adjust_pge, (void *)0, 0, 1);
		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
	}
	unlock_cpu_hotplug();
	return 0;
}

static void __exit fini(void)
{
	lguest_device_remove();
	free_pagetables();
	unmap_switcher();
	lock_cpu_hotplug();
	if (cpu_had_pge) {
		set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
		on_each_cpu(adjust_pge, (void *)1, 0, 1);
	}
	unlock_cpu_hotplug();
}

module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");