x86: add might_sleep() to do_page_fault()
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / arch/x86/mm/fault.c
/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mmiotrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>	/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>	/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
#include <asm/traps.h>

/*
 * Page fault error code bits
 * bit 0 == 0 means no page found, 1 means protection fault
 * bit 1 == 0 means read, 1 means write
 * bit 2 == 0 means kernel, 1 means user-mode
 * bit 3 == 1 means use of reserved bit detected
 * bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

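/*
 * Example decodes of the bits above: a user-mode write to a present but
 * read-only page shows up as error_code == (PF_PROT | PF_WRITE | PF_USER)
 * == 0x7, while a user-mode read of an unmapped page shows up as
 * error_code == PF_USER == 0x4.
 */
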
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
#ifdef CONFIG_MMIOTRACE
        if (unlikely(is_kmmio_active()))
                if (kmmio_handler(regs, addr) == 1)
                        return -1;
#endif
        return 0;
}

static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
        int ret = 0;

        /* kprobe_running() needs smp_processor_id() */
        if (!user_mode_vm(regs)) {
                preempt_disable();
                if (kprobe_running() && kprobe_fault_handler(regs, 14))
                        ret = 1;
                preempt_enable();
        }

        return ret;
#else
        return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
                       unsigned long addr)
{
        unsigned char *instr;
        int scan_more = 1;
        int prefetch = 0;
        unsigned char *max_instr;

        /*
         * If it was an exec (instruction fetch) fault on NX page, then
         * do not ignore the fault:
         */
        if (error_code & PF_INSTR)
                return 0;

        instr = (unsigned char *)convert_ip_to_linear(current, regs);
        max_instr = instr + 15;

        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
                return 0;

        while (scan_more && instr < max_instr) {
                unsigned char opcode;
                unsigned char instr_hi;
                unsigned char instr_lo;

                if (probe_kernel_address(instr, opcode))
                        break;

                instr_hi = opcode & 0xf0;
                instr_lo = opcode & 0x0f;
                instr++;

                switch (instr_hi) {
                case 0x20:
                case 0x30:
                        /*
                         * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
                         * In X86_64 long mode, the CPU will signal invalid
                         * opcode if some of these prefixes are present so
                         * X86_64 will never get here anyway
                         */
                        scan_more = ((instr_lo & 7) == 0x6);
                        break;
#ifdef CONFIG_X86_64
                case 0x40:
                        /*
                         * In AMD64 long mode 0x40..0x4F are valid REX prefixes
                         * Need to figure out under what instruction mode the
                         * instruction was issued. Could check the LDT for lm,
                         * but for now it's good enough to assume that long
                         * mode only uses well known segments or kernel.
                         */
                        scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
                        break;
#endif
                case 0x60:
                        /* 0x64 thru 0x67 are valid prefixes in all modes. */
                        scan_more = (instr_lo & 0xC) == 0x4;
                        break;
                case 0xF0:
                        /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
                        scan_more = !instr_lo || (instr_lo>>1) == 1;
                        break;
                case 0x00:
                        /* Prefetch instruction is 0x0F0D or 0x0F18 */
                        scan_more = 0;

                        if (probe_kernel_address(instr, opcode))
                                break;
                        prefetch = (instr_lo == 0xF) &&
                                (opcode == 0x0D || opcode == 0x18);
                        break;
                default:
                        scan_more = 0;
                        break;
                }
        }
        return prefetch;
}

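/*
 * Example of what the scanner above accepts: "prefetchnta (%rax)" encodes
 * as 0x0F 0x18 0x00..., so the first byte gives instr_lo == 0xF and the
 * second opcode byte 0x18 marks it as a prefetch; the AMD 3DNow!
 * PREFETCH/PREFETCHW forms use 0x0F 0x0D and are matched the same way.
 */
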
static void force_sig_info_fault(int si_signo, int si_code,
                                 unsigned long address, struct task_struct *tsk)
{
        siginfo_t info;

        info.si_signo = si_signo;
        info.si_errno = 0;
        info.si_code = si_code;
        info.si_addr = (void __user *)address;
        force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
        unsigned long dummy;
        return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
        __typeof__(pte_val(__pte(0))) page;

        page = read_cr3();
        page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
        printk("*pdpt = %016Lx ", page);
        if ((page >> PAGE_SHIFT) < max_low_pfn
            && page & _PAGE_PRESENT) {
                page &= PAGE_MASK;
                page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
                                                         & (PTRS_PER_PMD - 1)];
                printk(KERN_CONT "*pde = %016Lx ", page);
                page &= ~_PAGE_NX;
        }
#else
        printk("*pde = %08lx ", page);
#endif

        /*
         * We must not directly access the pte in the highpte
         * case if the page table is located in highmem.
         * And let's rather not kmap-atomic the pte, just in case
         * it's allocated already.
         */
        if ((page >> PAGE_SHIFT) < max_low_pfn
            && (page & _PAGE_PRESENT)
            && !(page & _PAGE_PSE)) {
                page &= PAGE_MASK;
                page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
                                                         & (PTRS_PER_PTE - 1)];
                printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
        }

        printk("\n");
#else /* CONFIG_X86_64 */
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = (pgd_t *)read_cr3();

        pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
        pgd += pgd_index(address);
        if (bad_address(pgd)) goto bad;
        printk("PGD %lx ", pgd_val(*pgd));
        if (!pgd_present(*pgd)) goto ret;

        pud = pud_offset(pgd, address);
        if (bad_address(pud)) goto bad;
        printk("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud) || pud_large(*pud))
                goto ret;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd)) goto bad;
        printk("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte)) goto bad;
        printk("PTE %lx", pte_val(*pte));
ret:
        printk("\n");
        return;
bad:
        printk("BAD\n");
#endif
}

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
        unsigned index = pgd_index(address);
        pgd_t *pgd_k;
        pud_t *pud, *pud_k;
        pmd_t *pmd, *pmd_k;

        pgd += index;
        pgd_k = init_mm.pgd + index;

        if (!pgd_present(*pgd_k))
                return NULL;

        /*
         * set_pgd(pgd, *pgd_k); here would be useless on PAE
         * and redundant with the set_pmd() on non-PAE. As would
         * set_pud.
         */

        pud = pud_offset(pgd, address);
        pud_k = pud_offset(pgd_k, address);
        if (!pud_present(*pud_k))
                return NULL;

        pmd = pmd_offset(pud, address);
        pmd_k = pmd_offset(pud_k, address);
        if (!pmd_present(*pmd_k))
                return NULL;
        if (!pmd_present(*pmd)) {
                set_pmd(pmd, *pmd_k);
                arch_flush_lazy_mmu_mode();
        } else
                BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
        return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOS that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        static int warned;
        if (address != regs->ip)
                return 0;
        if ((address >> 32) != 0)
                return 0;
        address |= 0xffffffffUL << 32;
        if ((address >= (u64)_stext && address <= (u64)_etext) ||
            (address >= MODULES_VADDR && address <= MODULES_END)) {
                if (!warned) {
                        printk(errata93_warning);
                        warned = 1;
                }
                regs->ip = address;
                return 1;
        }
#endif
        return 0;
}
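
/*
 * Example of the erratum #93 fixup above: the CPU reports RIP with its
 * upper 32 bits cleared, so the fault address looks like a low 32-bit
 * value even though the faulting instruction lives in kernel text or
 * module space. OR-ing 0xffffffff00000000 back in and checking the result
 * against _stext/_etext and MODULES_VADDR/MODULES_END recovers the real
 * RIP, which is written back so the instruction can simply be retried.
 */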

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
 * addresses >4GB. We catch this in the page fault handler because these
 * addresses are not reachable. Just detect this case and return. Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
            (address >> 32))
                return 1;
#endif
        return 0;
}
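
/*
 * Note on the check above: bit 2 of a segment selector is the table
 * indicator (TI) bit, so "regs->cs & (1<<2)" means CS was loaded from the
 * LDT, which per the comment above implies a compatibility-mode code
 * segment; __USER32_CS is the compat-mode code segment in the GDT.
 */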

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
        unsigned long nr;
        /*
         * Pentium F0 0F C7 C8 bug workaround.
         */
        if (boot_cpu_data.f00f_bug) {
                nr = (address - idt_descr.address) >> 3;

                if (nr == 6) {
                        do_invalid_op(regs, 0);
                        return 1;
                }
        }
#endif
        return 0;
}
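
/*
 * Note on the "nr == 6" check above: with the F00F workaround in place the
 * IDT is mapped read-only, so the buggy locked access faults here instead
 * of locking up the CPU. IDT descriptors are 8 bytes on 32-bit, so
 * (address - idt_descr.address) >> 3 is the vector index, and index 6 is
 * the invalid-opcode vector, hence the redispatch to do_invalid_op().
 */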

static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
                            unsigned long address)
{
#ifdef CONFIG_X86_32
        if (!oops_may_print())
                return;
#endif

#ifdef CONFIG_X86_PAE
        if (error_code & PF_INSTR) {
                unsigned int level;
                pte_t *pte = lookup_address(address, &level);

                if (pte && pte_present(*pte) && !pte_exec(*pte))
                        printk(KERN_CRIT "kernel tried to execute "
                                "NX-protected page - exploit attempt? "
                                "(uid: %d)\n", current_uid());
        }
#endif

        printk(KERN_ALERT "BUG: unable to handle kernel ");
        if (address < PAGE_SIZE)
                printk(KERN_CONT "NULL pointer dereference");
        else
                printk(KERN_CONT "paging request");
        printk(KERN_CONT " at %p\n", (void *) address);
        printk(KERN_ALERT "IP:");
        printk_address(regs->ip, 1);
        dump_pagetable(address);
}

#ifdef CONFIG_X86_64
static noinline void pgtable_bad(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address)
{
        unsigned long flags = oops_begin();
        int sig = SIGKILL;
        struct task_struct *tsk = current;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               tsk->comm, address);
        dump_pagetable(address);
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        if (__die("Bad pagetable", regs, error_code))
                sig = 0;
        oops_end(flags, regs, sig);
}
#endif

static noinline void no_context(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address)
{
        struct task_struct *tsk = current;
#ifdef CONFIG_X86_64
        unsigned long flags;
        int sig;
#endif

        /* Are we prepared to handle this kernel fault? */
        if (fixup_exception(regs))
                return;

        /*
         * X86_32
         * Valid to do another page fault here, because if this fault
         * had been triggered by is_prefetch fixup_exception would have
         * handled it.
         *
         * X86_64
         * Hall of shame of CPU/BIOS bugs.
         */
        if (is_prefetch(regs, error_code, address))
                return;

        if (is_errata93(regs, address))
                return;

        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice.
         */
#ifdef CONFIG_X86_32
        bust_spinlocks(1);
#else
        flags = oops_begin();
#endif

        show_fault_oops(regs, error_code, address);

        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
        die("Oops", regs, error_code);
        bust_spinlocks(0);
        do_exit(SIGKILL);
#else
        sig = SIGKILL;
        if (__die("Oops", regs, error_code))
                sig = 0;
        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_EMERG "CR2: %016lx\n", address);
        oops_end(flags, regs, sig);
#endif
}

static void __bad_area_nosemaphore(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address,
                        int si_code)
{
        struct task_struct *tsk = current;

        /* User mode accesses just cause a SIGSEGV */
        if (error_code & PF_USER) {
                /*
                 * It's possible to have interrupts off here.
                 */
                local_irq_enable();

                /*
                 * Valid to do another page fault here because this one came
                 * from user space.
                 */
                if (is_prefetch(regs, error_code, address))
                        return;

                if (is_errata100(regs, address))
                        return;

                if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
                    printk_ratelimit()) {
                        printk(
                        "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
                        task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
                        tsk->comm, task_pid_nr(tsk), address,
                        (void *) regs->ip, (void *) regs->sp, error_code);
                        print_vma_addr(" in ", regs->ip);
                        printk("\n");
                }

                tsk->thread.cr2 = address;
                /* Kernel addresses are always protection faults */
                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no = 14;
                force_sig_info_fault(SIGSEGV, si_code, address, tsk);
                return;
        }

        if (is_f00f_bug(regs, address))
                return;

        no_context(regs, error_code, address);
}

static noinline void bad_area_nosemaphore(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address)
{
        __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}

static void __bad_area(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address,
                        int si_code)
{
        struct mm_struct *mm = current->mm;

        /*
         * Something tried to access memory that isn't in our memory map..
         * Fix it, but check if it's kernel or user first..
         */
        up_read(&mm->mmap_sem);

        __bad_area_nosemaphore(regs, error_code, address, si_code);
}

static noinline void bad_area(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address)
{
        __bad_area(regs, error_code, address, SEGV_MAPERR);
}

static noinline void bad_area_access_error(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address)
{
        __bad_area(regs, error_code, address, SEGV_ACCERR);
}

/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
static void out_of_memory(struct pt_regs *regs,
                unsigned long error_code, unsigned long address)
{
        /*
         * We ran out of memory, call the OOM killer, and return to userspace
         * (which will retry the fault, or kill us if we got oom-killed).
         */
        up_read(&current->mm->mmap_sem);
        pagefault_out_of_memory();
}

static void do_sigbus(struct pt_regs *regs,
                unsigned long error_code, unsigned long address)
{
        struct task_struct *tsk = current;
        struct mm_struct *mm = tsk->mm;

        up_read(&mm->mmap_sem);

        /* Kernel mode? Handle exceptions or die */
        if (!(error_code & PF_USER))
                no_context(regs, error_code, address);
#ifdef CONFIG_X86_32
        /* User space => ok to do another page fault */
        if (is_prefetch(regs, error_code, address))
                return;
#endif
        tsk->thread.cr2 = address;
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 14;
        force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

static noinline void mm_fault_error(struct pt_regs *regs,
                unsigned long error_code, unsigned long address, unsigned int fault)
{
        if (fault & VM_FAULT_OOM)
                out_of_memory(regs, error_code, address);
        else if (fault & VM_FAULT_SIGBUS)
                do_sigbus(regs, error_code, address);
        else
                BUG();
}

static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
        if ((error_code & PF_WRITE) && !pte_write(*pte))
                return 0;
        if ((error_code & PF_INSTR) && !pte_exec(*pte))
                return 0;

        return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry. This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static noinline int spurious_fault(unsigned long error_code,
                                unsigned long address)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        /* Reserved-bit violation or user access to kernel space? */
        if (error_code & (PF_USER | PF_RSVD))
                return 0;

        pgd = init_mm.pgd + pgd_index(address);
        if (!pgd_present(*pgd))
                return 0;

        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
                return 0;

        if (pud_large(*pud))
                return spurious_fault_check(error_code, (pte_t *) pud);

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return 0;

        if (pmd_large(*pmd))
                return spurious_fault_check(error_code, (pte_t *) pmd);

        pte = pte_offset_kernel(pmd, address);
        if (!pte_present(*pte))
                return 0;

        return spurious_fault_check(error_code, pte);
}
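
/*
 * Typical spurious-fault scenario handled above: a kernel mapping is
 * upgraded lazily (say RO -> RW by a set_memory_rw()-style call) without a
 * cross-CPU TLB flush; a CPU still holding the stale read-only translation
 * takes a write fault here, the walk above finds that the current PTE
 * already permits the access, and the fault is ignored so the access can
 * be retried against the up-to-date entry.
 */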

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static noinline int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
        unsigned long pgd_paddr;
        pmd_t *pmd_k;
        pte_t *pte_k;

        /* Make sure we are in vmalloc area */
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;

        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
         *
         * Do _not_ use "current" here. We might be inside
         * an interrupt in the middle of a task switch..
         */
        pgd_paddr = read_cr3();
        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
        if (!pmd_k)
                return -1;
        pte_k = pte_offset_kernel(pmd_k, address);
        if (!pte_present(*pte_k))
                return -1;
        return 0;
#else
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
        pmd_t *pmd, *pmd_ref;
        pte_t *pte, *pte_ref;

        /* Make sure we are in vmalloc area */
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;

        /* Copy kernel mappings over when needed. This can also
           happen within a race in page table update. In the latter
           case just flush. */

        pgd = pgd_offset(current->active_mm, address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;
        if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref);
        else
                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

        /* Below here mismatches are bugs because these lower tables
           are shared */

        pud = pud_offset(pgd, address);
        pud_ref = pud_offset(pgd_ref, address);
        if (pud_none(*pud_ref))
                return -1;
        if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
                BUG();
        pmd = pmd_offset(pud, address);
        pmd_ref = pmd_offset(pud_ref, address);
        if (pmd_none(*pmd_ref))
                return -1;
        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
                BUG();
        pte_ref = pte_offset_kernel(pmd_ref, address);
        if (!pte_present(*pte_ref))
                return -1;
        pte = pte_offset_kernel(pmd, address);
        /* Don't use pte_page here, because the mappings can point
           outside mem_map, and the NUMA hash lookup cannot handle
           that. */
        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
                BUG();
        return 0;
#endif
}
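
/*
 * Example of the lazy synchronization handled above: vmalloc() or a module
 * load installs new kernel page-table entries only in the reference
 * init_mm tables. The first time some other task touches that address, its
 * own top-level table may still lack the entry; the resulting fault comes
 * here and the missing entry is copied over (or, on a mismatch that cannot
 * be a legitimate race, we BUG) instead of being treated as a bad access.
 */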

int show_unhandled_signals = 1;

static inline int access_error(unsigned long error_code, int write,
                                struct vm_area_struct *vma)
{
        if (write) {
                /* write, present and write, not present */
                if (unlikely(!(vma->vm_flags & VM_WRITE)))
                        return 1;
        } else if (unlikely(error_code & PF_PROT)) {
                /* read, present */
                return 1;
        } else {
                /* read, not present */
                if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
                        return 1;
        }

        return 0;
}

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
        unsigned long address;
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        int write;
        int fault;

        tsk = current;
        mm = tsk->mm;
        prefetchw(&mm->mmap_sem);

        /* get the address */
        address = read_cr2();

        if (unlikely(notify_page_fault(regs)))
                return;
        if (unlikely(kmmio_fault(regs, address)))
                return;

        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * (error_code & 4) == 0, and that the fault was not a
         * protection error (error_code & 9) == 0.
         */
#ifdef CONFIG_X86_32
        if (unlikely(address >= TASK_SIZE)) {
#else
        if (unlikely(address >= TASK_SIZE64)) {
#endif
                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
                    vmalloc_fault(address) >= 0)
                        return;

                /* Can handle a stale RO->RW TLB */
                if (spurious_fault(error_code, address))
                        return;

                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
                 */
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /*
         * It's safe to allow irq's after cr2 has been saved and the
         * vmalloc fault has been handled.
         *
         * User-mode registers count as a user access even for any
         * potential system fault or CPU buglet.
         */
        if (user_mode_vm(regs)) {
                local_irq_enable();
                error_code |= PF_USER;
        } else if (regs->flags & X86_EFLAGS_IF)
                local_irq_enable();

#ifdef CONFIG_X86_64
        if (unlikely(error_code & PF_RSVD))
                pgtable_bad(regs, error_code, address);
#endif

        /*
         * If we're in an interrupt, have no user context or are running in an
         * atomic region then we must not take the fault.
         */
        if (unlikely(in_atomic() || !mm)) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /*
         * When running in the kernel we expect faults to occur only to
         * addresses in user space. All other faults represent errors in the
         * kernel and should generate an OOPS. Unfortunately, in the case of an
         * erroneous fault occurring in a code path which already holds mmap_sem
         * we will deadlock attempting to validate the fault against the
         * address space. Luckily the kernel only validly references user
         * space from well defined areas of code, which are listed in the
         * exceptions table.
         *
         * As the vast majority of faults will be valid we will only perform
         * the source reference check when there is a possibility of a deadlock.
         * Attempt to lock the address space, if we cannot we then validate the
         * source. If this is invalid we can skip the address space check,
         * thus avoiding the deadlock.
         */
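        /*
         * Concrete case the trylock below guards against: buggy kernel code
         * that dereferences a stray user-space pointer while already holding
         * mmap_sem would otherwise block forever in down_read(); since such
         * code is never listed in the exception tables, it is sent straight
         * to bad_area_nosemaphore() (and an oops) instead.
         */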
        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
                if ((error_code & PF_USER) == 0 &&
                    !search_exception_tables(regs->ip)) {
                        bad_area_nosemaphore(regs, error_code, address);
                        return;
                }
                down_read(&mm->mmap_sem);
        } else {
                /*
                 * The above down_read_trylock() might have succeeded in which
                 * case we'll have missed the might_sleep() from down_read().
                 */
                might_sleep();
        }

        vma = find_vma(mm, address);
        if (unlikely(!vma)) {
                bad_area(regs, error_code, address);
                return;
        }
        if (likely(vma->vm_start <= address))
                goto good_area;
        if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
                bad_area(regs, error_code, address);
                return;
        }
        if (error_code & PF_USER) {
                /*
                 * Accessing the stack below %sp is always a bug.
                 * The large cushion allows instructions like enter
                 * and pusha to work. ("enter $65535,$31" pushes
                 * 32 pointers and then decrements %sp by 65535.)
                 */
                if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
                        bad_area(regs, error_code, address);
                        return;
                }
        }
        if (unlikely(expand_stack(vma, address))) {
                bad_area(regs, error_code, address);
                return;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
good_area:
        write = error_code & PF_WRITE;
        if (unlikely(access_error(error_code, write, vma))) {
                bad_area_access_error(regs, error_code, address);
                return;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        fault = handle_mm_fault(mm, vma, address, write);
        if (unlikely(fault & VM_FAULT_ERROR)) {
                mm_fault_error(regs, error_code, address, fault);
                return;
        }
        if (fault & VM_FAULT_MAJOR)
                tsk->maj_flt++;
        else
                tsk->min_flt++;

#ifdef CONFIG_X86_32
        /*
         * Did it hit the DOS screen memory VA from vm86 mode?
         */
        if (v8086_mode(regs)) {
                unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
                if (bit < 32)
                        tsk->thread.screen_bitmap |= 1 << bit;
        }
#endif
        up_read(&mm->mmap_sem);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
        unsigned long address;

#ifdef CONFIG_X86_32
        if (SHARED_KERNEL_PMD)
                return;

        for (address = VMALLOC_START & PMD_MASK;
             address >= TASK_SIZE && address < FIXADDR_TOP;
             address += PMD_SIZE) {
                unsigned long flags;
                struct page *page;

                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
                        if (!vmalloc_sync_one(page_address(page),
                                              address))
                                break;
                }
                spin_unlock_irqrestore(&pgd_lock, flags);
        }
#else /* CONFIG_X86_64 */
        for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
             address += PGDIR_SIZE) {
                const pgd_t *pgd_ref = pgd_offset_k(address);
                unsigned long flags;
                struct page *page;

                if (pgd_none(*pgd_ref))
                        continue;
                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
                        pgd_t *pgd;
                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
                        if (pgd_none(*pgd))
                                set_pgd(pgd, *pgd_ref);
                        else
                                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
                }
                spin_unlock_irqrestore(&pgd_lock, flags);
        }
#endif
}