/*
 * Copyright (C) 1995  Linus Torvalds
 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
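
/*
 * Worked example (illustrative, not part of the original source):
 * a copy-on-write break on a private mapping arrives with
 * error_code == 0x7 == PF_PROT|PF_WRITE|PF_USER, i.e. a user-mode
 * write hit a present but read-only PTE.  A first touch of an
 * anonymous page is 0x6 (the same minus PF_PROT), because no page
 * was found at all.
 */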

static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))
			return 0;
	} else
		return 0;
#else
	/* If it was an exec (instruction fetch) fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;
#endif

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present, so
			 * X86_64 will never get here anyway.
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes.
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well-known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x0F:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
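
/*
 * Decoding sketch (illustrative, not part of the original source):
 * "prefetchnta (%rax)" assembles to the bytes 0f 18 00.  The loop above
 * first skips any leading prefix bytes, then matches opcode 0x0F
 * followed by 0x0D or 0x18, so a stray fault reported against such an
 * instruction is recognized and silently ignored.
 */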

static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

static int bad_address(void *p)
{
	unsigned long dummy;

	return probe_kernel_address((unsigned long *)p, dummy);
}

void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
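
/*
 * Example output (illustrative only):
 *
 *	PGD 203067 PUD 1fd063 PMD 0
 *
 * The walk stops at the first non-present (or huge) entry, so a
 * trailing "PMD 0" means the address has no mapping below the PUD
 * level.
 */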

static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 * A lot of BIOSes that didn't get tested properly miss this.
 * The OS sees this as a page fault with the upper 32 bits of RIP cleared.
 * Try to work around it here.
 * Note we only handle faults in kernel here.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;

	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
	return 0;
}
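
/*
 * Illustrative scenario (not from the original source): the kernel is
 * executing near ffffffff80210000 when a buggy SMM handler truncates
 * RIP to 0000000080210000.  OR-ing the upper 32 bits back in lands
 * inside the kernel text range, so the fault is treated as erratum #93
 * and regs->ip is patched up instead of oopsing.
 */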

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}

/*
 * Handle a fault on the vmalloc area.
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);

	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();

	return 0;
}
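
/*
 * Why the lazy copy is enough (sketch, assuming init_mm.pgd is the
 * reference table): vmalloc() may install a new top-level entry only in
 * init_mm.  Another task touching that range faults once, lands here,
 * copies the single PGD entry with set_pgd(), and from then on shares
 * all lower-level tables with init_mm, so it never faults there again.
 */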

int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, fault;
	unsigned long flags;
	int si_code;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}
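
	/*
	 * Illustrative decoding of the fast-path test above (not from the
	 * original source): requiring (error_code & (PF_RSVD|PF_USER|PF_PROT))
	 * == 0 means only kernel-mode accesses to a genuinely missing
	 * translation take the vmalloc path, e.g. error_code 0x0 (kernel
	 * read, not present) or 0x2 (kernel write, not present); anything
	 * user-mode or protection-related falls through to the error paths.
	 */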
	if (notify_page_fault(regs))
		return;

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;

again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well-defined areas of code, which are listed in the
	 * exception tables.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space; if we cannot, we then validate the
	 * source. If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}
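
	/*
	 * Deadlock example this avoids (illustrative, not from the original
	 * source): buggy kernel code that already holds mmap_sem
	 * dereferences a wild pointer.  A plain down_read() here would hang
	 * on the lock we ourselves hold; instead the trylock fails,
	 * search_exception_tables() finds no fixup for regs->ip, and we
	 * branch straight to the oops path without ever sleeping on the
	 * lock.
	 */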
	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/*
		 * Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
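
	/*
	 * Where the slack above comes from (illustrative): "enter $65535, $31"
	 * may push up to 32 frame-pointer-sized words (the saved frame
	 * pointer plus 31 nesting levels) and allocate up to 64KB of locals
	 * below the stack pointer, hence 65536 + 32 * sizeof(unsigned long)
	 * bytes below ->sp are still treated as valid stack growth.
	 */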
	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}
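
	/*
	 * Decode example (illustrative): (error_code & (PF_PROT|PF_WRITE))
	 * == 3 is a write to a present page; on a VM_WRITE mapping that is
	 * the classic copy-on-write break and falls through to
	 * handle_mm_fault().  PF_PROT alone (read of a present page) can
	 * only be a permission violation, hence the unconditional
	 * goto bad_area.
	 */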
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	up_read(&mm->mmap_sem);
	return;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		if (is_prefetch(regs, address, error_code))
			return;

		/*
		 * Work around K8 erratum #100: K8 in compat mode
		 * occasionally jumps to illegal addresses >4GB. We
		 * catch this here in the page fault handler because
		 * these addresses are not reachable. Just detect this
		 * case and return. Any code segment in LDT is
		 * compatibility mode.
		 */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
			tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, tsk->pid, address, regs->ip,
			regs->sp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}
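
	/*
	 * Side note (illustrative): the expression (address >= TASK_SIZE)
	 * evaluates to 0 or 1, i.e. bit 0 == PF_PROT, so a user fault on a
	 * kernel address is reported to the task as a protection fault even
	 * though the hardware flagged it as a not-present page.
	 */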
no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;
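
	/*
	 * How the fixup works (sketch, not from the original source):
	 * uaccess primitives such as __get_user() record each faulting
	 * instruction and a recovery address in the __ex_table section.
	 * fixup_exception() looks up regs->ip there and, on a hit, rewrites
	 * regs->ip to the recovery stub, which typically makes the caller
	 * return -EFAULT instead of oopsing.
	 */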
	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip, regs->bp);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);

	/*
	 * We ran out of memory, or some other thing happened to us that made
	 * us unable to handle the page fault gracefully.
	 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
	return;
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
	/*
	 * Note that races in the updates of insync and start aren't a
	 * problem: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting
	 * correctness if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}
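
/*
 * Usage note (illustrative, not part of this file): callers that may
 * run code living in vmalloc space from contexts where vmalloc_fault()
 * must not trigger (register_die_notifier() is one such caller, since
 * die notifiers can fire from NMI context) invoke vmalloc_sync_all()
 * up front so every PGD already carries the mappings.
 */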