x86: don't destroy %rbp on kernel-mode faults
arch/x86/mm/fault.c
/*
 * Copyright (C) 1995  Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>              /* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>              /* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *      bit 0 == 0 means no page found, 1 means protection fault
 *      bit 1 == 0 means read, 1 means write
 *      bit 2 == 0 means kernel, 1 means user-mode
 *      bit 3 == 1 means use of reserved bit detected
 *      bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT         (1<<0)
#define PF_WRITE        (1<<1)
#define PF_USER         (1<<2)
#define PF_RSVD         (1<<3)
#define PF_INSTR        (1<<4)

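/*
 * A few example decodings of the hardware error code, following the bit
 * layout above:
 *
 *   error_code == 2 (PF_WRITE)            kernel write to a not-present page
 *   error_code == 5 (PF_USER|PF_PROT)     user read that violates protections
 *   error_code == 6 (PF_USER|PF_WRITE)    user write to a not-present page
 *                                         (the common demand-fault/COW case)
 */
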
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
        int ret = 0;

        /* kprobe_running() needs smp_processor_id() */
#ifdef CONFIG_X86_32
        if (!user_mode_vm(regs)) {
#else
        if (!user_mode(regs)) {
#endif
                preempt_disable();
                if (kprobe_running() && kprobe_fault_handler(regs, 14))
                        ret = 1;
                preempt_enable();
        }

        return ret;
#else
        return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
                       unsigned long error_code)
{
        unsigned char *instr;
        int scan_more = 1;
        int prefetch = 0;
        unsigned char *max_instr;

        /*
         * If it was an exec (instruction fetch) fault on an NX page, then
         * do not ignore the fault:
         */
        if (error_code & PF_INSTR)
                return 0;

        instr = (unsigned char *)convert_ip_to_linear(current, regs);
        max_instr = instr + 15;

        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
                return 0;

        while (scan_more && instr < max_instr) {
                unsigned char opcode;
                unsigned char instr_hi;
                unsigned char instr_lo;

                if (probe_kernel_address(instr, opcode))
                        break;

                instr_hi = opcode & 0xf0;
                instr_lo = opcode & 0x0f;
                instr++;

                switch (instr_hi) {
                case 0x20:
                case 0x30:
                        /*
                         * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
                         * In X86_64 long mode, the CPU will signal invalid
                         * opcode if some of these prefixes are present so
                         * X86_64 will never get here anyway
                         */
                        scan_more = ((instr_lo & 7) == 0x6);
                        break;
#ifdef CONFIG_X86_64
                case 0x40:
                        /*
                         * In AMD64 long mode 0x40..0x4F are valid REX prefixes
                         * Need to figure out under what instruction mode the
                         * instruction was issued. Could check the LDT for lm,
                         * but for now it's good enough to assume that long
                         * mode only uses well known segments or kernel.
                         */
                        scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
                        break;
#endif
                case 0x60:
                        /* 0x64 thru 0x67 are valid prefixes in all modes. */
                        scan_more = (instr_lo & 0xC) == 0x4;
                        break;
                case 0xF0:
                        /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
                        scan_more = !instr_lo || (instr_lo>>1) == 1;
                        break;
                case 0x00:
                        /* Prefetch instruction is 0x0F0D or 0x0F18 */
                        scan_more = 0;

                        if (probe_kernel_address(instr, opcode))
                                break;
                        prefetch = (instr_lo == 0xF) &&
                                (opcode == 0x0D || opcode == 0x18);
                        break;
                default:
                        scan_more = 0;
                        break;
                }
        }
        return prefetch;
}

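/*
 * A worked example of the scan above: for the byte sequence 3E 0F 18 xx
 * (a DS segment override followed by the PREFETCHh opcode 0x0F 0x18),
 * the 0x3E byte hits the 0x30 case (low nibble 0xE, so scanning continues
 * past the prefix), and the 0x0F byte hits the 0x00 case, where the
 * following 0x18 byte marks the access as a prefetch and the fault is
 * ignored.  AMD's 3DNow! PREFETCH/PREFETCHW (0x0F 0x0D) is matched the
 * same way.
 */
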
static void force_sig_info_fault(int si_signo, int si_code,
                                 unsigned long address, struct task_struct *tsk)
{
        siginfo_t info;

        info.si_signo = si_signo;
        info.si_errno = 0;
        info.si_code = si_code;
        info.si_addr = (void __user *)address;
        force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
        unsigned long dummy;
        return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
        __typeof__(pte_val(__pte(0))) page;

        page = read_cr3();
        page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
        printk("*pdpt = %016Lx ", page);
        if ((page >> PAGE_SHIFT) < max_low_pfn
            && page & _PAGE_PRESENT) {
                page &= PAGE_MASK;
                page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
                                                         & (PTRS_PER_PMD - 1)];
                printk(KERN_CONT "*pde = %016Lx ", page);
                page &= ~_PAGE_NX;
        }
#else
        printk("*pde = %08lx ", page);
#endif

        /*
         * We must not directly access the pte in the highpte
         * case if the page table is located in highmem.
         * And let's rather not kmap-atomic the pte, just in case
         * it's allocated already.
         */
        if ((page >> PAGE_SHIFT) < max_low_pfn
            && (page & _PAGE_PRESENT)
            && !(page & _PAGE_PSE)) {
                page &= PAGE_MASK;
                page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
                                                         & (PTRS_PER_PTE - 1)];
                printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
        }

        printk("\n");
#else /* CONFIG_X86_64 */
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = (pgd_t *)read_cr3();

        pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
        pgd += pgd_index(address);
        if (bad_address(pgd)) goto bad;
        printk("PGD %lx ", pgd_val(*pgd));
        if (!pgd_present(*pgd)) goto ret;

        pud = pud_offset(pgd, address);
        if (bad_address(pud)) goto bad;
        printk("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud) || pud_large(*pud))
                goto ret;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd)) goto bad;
        printk("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte)) goto bad;
        printk("PTE %lx", pte_val(*pte));
ret:
        printk("\n");
        return;
bad:
        printk("BAD\n");
#endif
}

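/*
 * On 64-bit the walk above ends up in the oops output as a single line,
 * for example (illustrative values only):
 *
 *   PGD 12345067 PUD 12346067 PMD 0
 *
 * The walk stops at the first non-present or huge entry.
 */
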
#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
        unsigned index = pgd_index(address);
        pgd_t *pgd_k;
        pud_t *pud, *pud_k;
        pmd_t *pmd, *pmd_k;

        pgd += index;
        pgd_k = init_mm.pgd + index;

        if (!pgd_present(*pgd_k))
                return NULL;

        /*
         * set_pgd(pgd, *pgd_k); here would be useless on PAE
         * and redundant with the set_pmd() on non-PAE. As would
         * set_pud.
         */

        pud = pud_offset(pgd, address);
        pud_k = pud_offset(pgd_k, address);
        if (!pud_present(*pud_k))
                return NULL;

        pmd = pmd_offset(pud, address);
        pmd_k = pmd_offset(pud_k, address);
        if (!pmd_present(*pmd_k))
                return NULL;
        if (!pmd_present(*pmd)) {
                set_pmd(pmd, *pmd_k);
                arch_flush_lazy_mmu_mode();
        } else
                BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
        return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOS that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        static int warned;
        if (address != regs->ip)
                return 0;
        if ((address >> 32) != 0)
                return 0;
        address |= 0xffffffffUL << 32;
        if ((address >= (u64)_stext && address <= (u64)_etext) ||
            (address >= MODULES_VADDR && address <= MODULES_END)) {
                if (!warned) {
                        printk(errata93_warning);
                        warned = 1;
                }
                regs->ip = address;
                return 1;
        }
#endif
        return 0;
}

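/*
 * Example of the erratum #93 fixup above: if the kernel was executing at
 * 0xffffffff80123456 and a buggy SMM handler clears the upper half of RIP,
 * the fault is reported at 0x0000000080123456.  OR-ing the upper 32 bits
 * back in reconstructs an address inside the kernel text (or module area),
 * so regs->ip is repaired and execution can resume.
 */
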
/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to
 * illegal addresses >4GB.  We catch this in the page fault handler because
 * these addresses are not reachable. Just detect this case and return.
 * Any code segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
            (address >> 32))
                return 1;
#endif
        return 0;
}

void do_invalid_op(struct pt_regs *, unsigned long);

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
        unsigned long nr;
        /*
         * Pentium F0 0F C7 C8 bug workaround.
         */
        if (boot_cpu_data.f00f_bug) {
                nr = (address - idt_descr.address) >> 3;

                if (nr == 6) {
                        do_invalid_op(regs, 0);
                        return 1;
                }
        }
#endif
        return 0;
}

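/*
 * Background on the check above: on F00F-affected Pentiums the kernel maps
 * the IDT read-only, so the buggy instruction sequence ends up faulting
 * inside the IDT rather than locking up the CPU.  An offset of six
 * descriptors (nr == 6) corresponds to the invalid-opcode vector, which is
 * why the fault is redirected to do_invalid_op().
 */
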
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
                            unsigned long address)
{
#ifdef CONFIG_X86_32
        if (!oops_may_print())
                return;
#endif

#ifdef CONFIG_X86_PAE
        if (error_code & PF_INSTR) {
                unsigned int level;
                pte_t *pte = lookup_address(address, &level);

                if (pte && pte_present(*pte) && !pte_exec(*pte))
                        printk(KERN_CRIT "kernel tried to execute "
                                "NX-protected page - exploit attempt? "
                                "(uid: %d)\n", current->uid);
        }
#endif

        printk(KERN_ALERT "BUG: unable to handle kernel ");
        if (address < PAGE_SIZE)
                printk(KERN_CONT "NULL pointer dereference");
        else
                printk(KERN_CONT "paging request");
#ifdef CONFIG_X86_32
        printk(KERN_CONT " at %08lx\n", address);
#else
        printk(KERN_CONT " at %016lx\n", address);
#endif
        printk(KERN_ALERT "IP:");
        printk_address(regs->ip, 1);
        dump_pagetable(address);
}

#ifdef CONFIG_X86_64
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
                                 unsigned long error_code)
{
        unsigned long flags = oops_begin();
        struct task_struct *tsk;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               current->comm, address);
        dump_pagetable(address);
        tsk = current;
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        if (__die("Bad pagetable", regs, error_code))
                regs = NULL;
        oops_end(flags, regs, SIGKILL);
}
#endif

static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
        if ((error_code & PF_WRITE) && !pte_write(*pte))
                return 0;
        if ((error_code & PF_INSTR) && !pte_exec(*pte))
                return 0;

        return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.  This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static int spurious_fault(unsigned long address,
                          unsigned long error_code)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        /* Reserved-bit violation or user access to kernel space? */
        if (error_code & (PF_USER | PF_RSVD))
                return 0;

        pgd = init_mm.pgd + pgd_index(address);
        if (!pgd_present(*pgd))
                return 0;

        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
                return 0;

        if (pud_large(*pud))
                return spurious_fault_check(error_code, (pte_t *) pud);

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return 0;

        if (pmd_large(*pmd))
                return spurious_fault_check(error_code, (pte_t *) pmd);

        pte = pte_offset_kernel(pmd, address);
        if (!pte_present(*pte))
                return 0;

        return spurious_fault_check(error_code, pte);
}

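/*
 * Concrete example of the case handled above: one CPU upgrades a kernel
 * mapping from RO to RW (e.g. when changing page attributes) without
 * flushing other CPUs' TLBs.  Another CPU still holding the stale
 * read-only translation then takes a write fault here; the page-table
 * walk shows the PTE already permits the write, so the fault is simply
 * ignored and the access is retried with a fresh TLB entry.
 */
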
/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
        unsigned long pgd_paddr;
        pmd_t *pmd_k;
        pte_t *pte_k;

        /* Make sure we are in vmalloc area */
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;

        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
         *
         * Do _not_ use "current" here. We might be inside
         * an interrupt in the middle of a task switch..
         */
        pgd_paddr = read_cr3();
        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
        if (!pmd_k)
                return -1;
        pte_k = pte_offset_kernel(pmd_k, address);
        if (!pte_present(*pte_k))
                return -1;
        return 0;
#else
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
        pmd_t *pmd, *pmd_ref;
        pte_t *pte, *pte_ref;

        /* Make sure we are in vmalloc area */
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;

        /* Copy kernel mappings over when needed. This can also
           happen within a race in page table update. In the latter
           case just flush. */

        pgd = pgd_offset(current->mm ?: &init_mm, address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;
        if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref);
        else
                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

        /* Below here mismatches are bugs because these lower tables
           are shared */

        pud = pud_offset(pgd, address);
        pud_ref = pud_offset(pgd_ref, address);
        if (pud_none(*pud_ref))
                return -1;
        if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
                BUG();
        pmd = pmd_offset(pud, address);
        pmd_ref = pmd_offset(pud_ref, address);
        if (pmd_none(*pmd_ref))
                return -1;
        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
                BUG();
        pte_ref = pte_offset_kernel(pmd_ref, address);
        if (!pte_present(*pte_ref))
                return -1;
        pte = pte_offset_kernel(pmd, address);
        /* Don't use pte_page here, because the mappings can point
           outside mem_map, and the NUMA hash lookup cannot handle
           that. */
        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
                BUG();
        return 0;
#endif
}

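/*
 * Example of the 64-bit path above: vmalloc() installs a new mapping in
 * init_mm's reference page tables, but a task whose pgd was copied earlier
 * has no entry for it yet.  The first touch faults, and the code above
 * copies the missing top-level (pgd) entry from the reference table; the
 * lower levels are shared, so mismatches there are genuine bugs (hence the
 * BUG() calls).
 */
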
int show_unhandled_signals = 1;

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        unsigned long address;
        int write, si_code;
        int fault;
#ifdef CONFIG_X86_64
        unsigned long flags;
#endif

        /*
         * We can fault from pretty much anywhere, with unknown IRQ state.
         */
        trace_hardirqs_fixup();

        tsk = current;
        mm = tsk->mm;
        prefetchw(&mm->mmap_sem);

        /* get the address */
        address = read_cr2();

        si_code = SEGV_MAPERR;

        if (notify_page_fault(regs))
                return;

        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * (error_code & 4) == 0, and that the fault was not a
         * protection error (error_code & 9) == 0.
         */
#ifdef CONFIG_X86_32
        if (unlikely(address >= TASK_SIZE)) {
#else
        if (unlikely(address >= TASK_SIZE64)) {
#endif
                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
                    vmalloc_fault(address) >= 0)
                        return;

                /* Can handle a stale RO->RW TLB */
                if (spurious_fault(address, error_code))
                        return;

                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
                 */
                goto bad_area_nosemaphore;
        }

#ifdef CONFIG_X86_32
        /* It's safe to allow irq's after cr2 has been saved and the vmalloc
           fault has been handled. */
        if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
                local_irq_enable();

        /*
         * If we're in an interrupt, have no user context or are running in an
         * atomic region then we must not take the fault.
         */
        if (in_atomic() || !mm)
                goto bad_area_nosemaphore;
#else /* CONFIG_X86_64 */
        if (likely(regs->flags & X86_EFLAGS_IF))
                local_irq_enable();

        if (unlikely(error_code & PF_RSVD))
                pgtable_bad(address, regs, error_code);

        /*
         * If we're in an interrupt, have no user context or are running in an
         * atomic region then we must not take the fault.
         */
        if (unlikely(in_atomic() || !mm))
                goto bad_area_nosemaphore;

        /*
         * User-mode registers count as a user access even for any
         * potential system fault or CPU buglet.
         */
        if (user_mode_vm(regs))
                error_code |= PF_USER;
again:
#endif
        /* When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in the
         * kernel and should generate an OOPS.  Unfortunately, in the case of an
         * erroneous fault occurring in a code path which already holds mmap_sem
         * we will deadlock attempting to validate the fault against the
         * address space.  Luckily the kernel only validly references user
         * space from well defined areas of code, which are listed in the
         * exceptions table.
         *
         * As the vast majority of faults will be valid we will only perform
         * the source reference check when there is a possibility of a deadlock.
         * Attempt to lock the address space, if we cannot we then validate the
         * source.  If this is invalid we can skip the address space check,
         * thus avoiding the deadlock.
         */
        if (!down_read_trylock(&mm->mmap_sem)) {
                if ((error_code & PF_USER) == 0 &&
                    !search_exception_tables(regs->ip))
                        goto bad_area_nosemaphore;
                down_read(&mm->mmap_sem);
        }

        vma = find_vma(mm, address);
        if (!vma)
                goto bad_area;
        if (vma->vm_start <= address)
                goto good_area;
        if (!(vma->vm_flags & VM_GROWSDOWN))
                goto bad_area;
        if (error_code & PF_USER) {
                /*
                 * Accessing the stack below %sp is always a bug.
                 * The large cushion allows instructions like enter
                 * and pusha to work.  ("enter $65535,$31" pushes
                 * 32 pointers and then decrements %sp by 65535.)
                 */
                if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
                        goto bad_area;
        }
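        /*
         * The cushion above works out to 65536 + 32*sizeof(unsigned long)
         * bytes below %sp: 65792 bytes on 64-bit and 65664 on 32-bit, which
         * covers the worst-case "enter" described in the comment.
         */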
        if (expand_stack(vma, address))
                goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
        si_code = SEGV_ACCERR;
        write = 0;
        switch (error_code & (PF_PROT|PF_WRITE)) {
        default:        /* 3: write, present */
                /* fall through */
        case PF_WRITE:          /* write, not present */
                if (!(vma->vm_flags & VM_WRITE))
                        goto bad_area;
                write++;
                break;
        case PF_PROT:           /* read, present */
                goto bad_area;
        case 0:                 /* read, not present */
                if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
                        goto bad_area;
        }

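        /*
         * For reference, how error_code maps onto the cases above:
         * PF_PROT|PF_WRITE (a write to a present page, e.g. COW) takes the
         * default branch, PF_WRITE alone is a write to a not-present page,
         * PF_PROT alone (a read of a present page we may not access) is
         * always bad, and 0 is a plain read of a not-present page.
         */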
#ifdef CONFIG_X86_32
survive:
#endif
        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        fault = handle_mm_fault(mm, vma, address, write);
        if (unlikely(fault & VM_FAULT_ERROR)) {
                if (fault & VM_FAULT_OOM)
                        goto out_of_memory;
                else if (fault & VM_FAULT_SIGBUS)
                        goto do_sigbus;
                BUG();
        }
        if (fault & VM_FAULT_MAJOR)
                tsk->maj_flt++;
        else
                tsk->min_flt++;

#ifdef CONFIG_X86_32
        /*
         * Did it hit the DOS screen memory VA from vm86 mode?
         */
        if (v8086_mode(regs)) {
                unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
                if (bit < 32)
                        tsk->thread.screen_bitmap |= 1 << bit;
        }
#endif
        up_read(&mm->mmap_sem);
        return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
        up_read(&mm->mmap_sem);

bad_area_nosemaphore:
        /* User mode accesses just cause a SIGSEGV */
        if (error_code & PF_USER) {
                /*
                 * It's possible to have interrupts off here.
                 */
                local_irq_enable();

                /*
                 * Valid to do another page fault here because this one came
                 * from user space.
                 */
                if (is_prefetch(regs, address, error_code))
                        return;

                if (is_errata100(regs, address))
                        return;

                if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
                    printk_ratelimit()) {
                        printk(
#ifdef CONFIG_X86_32
                        "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
#else
                        "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
#endif
                        task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
                        tsk->comm, task_pid_nr(tsk), address, regs->ip,
                        regs->sp, error_code);
                        print_vma_addr(" in ", regs->ip);
                        printk("\n");
                }

                tsk->thread.cr2 = address;
                /* Kernel addresses are always protection faults */
                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no = 14;
                force_sig_info_fault(SIGSEGV, si_code, address, tsk);
                return;
        }

        if (is_f00f_bug(regs, address))
                return;

no_context:
        /* Are we prepared to handle this kernel fault?  */
        if (fixup_exception(regs))
                return;

        /*
         * X86_32
         * Valid to do another page fault here, because if this fault
         * had been triggered by is_prefetch fixup_exception would have
         * handled it.
         *
         * X86_64
         * Hall of shame of CPU/BIOS bugs.
         */
        if (is_prefetch(regs, address, error_code))
                return;

        if (is_errata93(regs, address))
                return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
#ifdef CONFIG_X86_32
        bust_spinlocks(1);
#else
        flags = oops_begin();
#endif

        show_fault_oops(regs, error_code, address);

        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
        die("Oops", regs, error_code);
        bust_spinlocks(0);
        do_exit(SIGKILL);
#else
        if (__die("Oops", regs, error_code))
                regs = NULL;
        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_EMERG "CR2: %016lx\n", address);
        oops_end(flags, regs, SIGKILL);
#endif

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
        up_read(&mm->mmap_sem);
        if (is_global_init(tsk)) {
                yield();
#ifdef CONFIG_X86_32
                down_read(&mm->mmap_sem);
                goto survive;
#else
                goto again;
#endif
        }

        printk("VM: killing process %s\n", tsk->comm);
        if (error_code & PF_USER)
                do_group_exit(SIGKILL);
        goto no_context;

do_sigbus:
        up_read(&mm->mmap_sem);

        /* Kernel mode? Handle exceptions or die */
        if (!(error_code & PF_USER))
                goto no_context;
#ifdef CONFIG_X86_32
        /* User space => ok to do another page fault */
        if (is_prefetch(regs, address, error_code))
                return;
#endif
        tsk->thread.cr2 = address;
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 14;
        force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
#ifdef CONFIG_X86_32
        /*
         * Note that races in the updates of insync and start aren't
         * problematic: insync can only get set bits added, and updates to
         * start are only improving performance (without affecting correctness
         * if undone).
         */
        static DECLARE_BITMAP(insync, PTRS_PER_PGD);
        static unsigned long start = TASK_SIZE;
        unsigned long address;

        if (SHARED_KERNEL_PMD)
                return;

        BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
        for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
                if (!test_bit(pgd_index(address), insync)) {
                        unsigned long flags;
                        struct page *page;

                        spin_lock_irqsave(&pgd_lock, flags);
                        list_for_each_entry(page, &pgd_list, lru) {
                                if (!vmalloc_sync_one(page_address(page),
                                                      address))
                                        break;
                        }
                        spin_unlock_irqrestore(&pgd_lock, flags);
                        if (!page)
                                set_bit(pgd_index(address), insync);
                }
                if (address == start && test_bit(pgd_index(address), insync))
                        start = address + PGDIR_SIZE;
        }
#else /* CONFIG_X86_64 */
        /*
         * Note that races in the updates of insync and start aren't
         * problematic: insync can only get set bits added, and updates to
         * start are only improving performance (without affecting correctness
         * if undone).
         */
        static DECLARE_BITMAP(insync, PTRS_PER_PGD);
        static unsigned long start = VMALLOC_START & PGDIR_MASK;
        unsigned long address;

        for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
                if (!test_bit(pgd_index(address), insync)) {
                        const pgd_t *pgd_ref = pgd_offset_k(address);
                        unsigned long flags;
                        struct page *page;

                        if (pgd_none(*pgd_ref))
                                continue;
                        spin_lock_irqsave(&pgd_lock, flags);
                        list_for_each_entry(page, &pgd_list, lru) {
                                pgd_t *pgd;
                                pgd = (pgd_t *)page_address(page) + pgd_index(address);
                                if (pgd_none(*pgd))
                                        set_pgd(pgd, *pgd_ref);
                                else
                                        BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
                        }
                        spin_unlock_irqrestore(&pgd_lock, flags);
                        set_bit(pgd_index(address), insync);
                }
                if (address == start)
                        start = address + PGDIR_SIZE;
        }
#endif
}
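
/*
 * vmalloc_sync_all() propagates kernel PGD entries into every process page
 * table up front, so callers that cannot tolerate a vmalloc fault (the lazy
 * path in vmalloc_fault() above) can make the mappings visible everywhere
 * beforehand.
 */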