/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
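
/*
 * For example, a user-mode write to a present but read-only page arrives
 * with error_code == (PF_PROT|PF_WRITE|PF_USER), while a user-mode read
 * of an unmapped page arrives with only PF_USER set.
 */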
static inline int notify_page_fault(struct pt_regs *regs)
	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
/*
 * Return EIP plus the CS segment base. The segment limit is also
 * adjusted, clamped to the kernel/user address space (whichever is
 * appropriate), and returned in *eip_limit.
 *
 * The segment is checked, because it might have been changed by another
 * task between the original faulting instruction and here.
 *
 * If CS is no longer a valid code segment, or if EIP is beyond the
 * limit, or if it is a kernel address when CS is not a kernel segment,
 * then the returned value will be greater than *eip_limit.
 *
 * This is slow, but is very rarely executed.
 */
static inline unsigned long get_segment_eip(struct pt_regs *regs,
					    unsigned long *eip_limit)
	unsigned long ip = regs->ip;
	unsigned seg = regs->cs & 0xffff;
	u32 seg_ar, seg_limit, base, *desc;

	/* Unlikely, but must come before segment checks. */
	if (unlikely(regs->flags & VM_MASK)) {
		*eip_limit = base + 0xffff;
		return base + (ip & 0xffff);

	/* The standard kernel/user address space limit. */
	*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;

	/* By far the most common cases. */
	if (likely(SEGMENT_IS_FLAT_CODE(seg)))

	/* Check the segment exists, is within the current LDT/GDT size,
	   that kernel/user (ring 0..3) has the appropriate privilege,
	   that it's a code segment, and get the limit. */
	__asm__("larl %3,%0; lsll %3,%1"
		: "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
	if ((~seg_ar & 0x9800) || ip > seg_limit) {
		return 1;	 /* So that returned ip > *eip_limit. */
	/* Get the GDT/LDT descriptor base.
	   When you look for races in this code remember that
	   LDT and other horrors are only used in user space. */

	/* Must lock the LDT while reading it. */
	mutex_lock(&current->mm->context.lock);
	desc = current->mm->context.ldt;
	desc = (void *)desc + (seg & ~7);

	/* Must disable preemption while reading the GDT. */
	desc = (u32 *)get_cpu_gdt_table(get_cpu());
	desc = (void *)desc + (seg & ~7);

	/* Decode the code segment base from the descriptor */
	base = get_desc_base((struct desc_struct *)desc);

	mutex_unlock(&current->mm->context.lock);
	/* Adjust EIP and segment limit, and clamp at the kernel limit.
	   It's legitimate for segments to wrap at 0xffffffff. */
	if (seg_limit < *eip_limit && seg_limit >= base)
		*eip_limit = seg_limit;
/*
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch
 * instructions; other CPUs sometimes do the same. Check for that here and
 * ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
	unsigned char *instr;
	unsigned char *max_instr;

	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))

	instr = (unsigned char *)get_segment_eip(regs, &limit);
	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);

	max_instr = instr + 15;
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (instr > (unsigned char *)limit)
		if (probe_kernel_address(instr, opcode))

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present, so
			 * X86_64 will never get here anyway.
			 */
			scan_more = ((instr_lo & 7) == 0x6);

			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes.
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well-known segments or the kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);

			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;

			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo >> 1) == 1;
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
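			/* For example, "prefetchnta (%rax)" encodes as
			   0f 18 00 and AMD's "prefetch (%rax)" as 0f 0d 00,
			   so the byte after 0x0F identifies a prefetch. */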
			if (instr > (unsigned char *)limit)
			if (probe_kernel_address(instr, opcode))
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
static int bad_address(void *p)
	return probe_kernel_address((unsigned long *)p, dummy);
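
/*
 * Walk and print the kernel page-table entries that map @address.
 * Typical output (illustrative values): "PGD 203067 PUD 203063 PMD 0"
 * for an address whose mapping stops at a not-present pmd.
 */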
void dump_pagetable(unsigned long address)
	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
/* Workaround for K8 erratum #93 and buggy BIOSes.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C-stepping K8 parts.
   Many BIOSes that weren't tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note that we only handle faults in the kernel here. */
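/*
 * Illustrative example: a fault on an instruction at 0xffffffff8024abcd
 * is reported with the address truncated to 0x000000008024abcd; OR-ing
 * the upper 32 bits back in lands inside the kernel text, which is what
 * the check below looks for.
 */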
static int is_errata93(struct pt_regs *regs, unsigned long address)
	if (address != regs->ip)
	if ((address >> 32) != 0)
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk(errata93_warning);
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
	oops_end(flags, regs, SIGKILL);
/*
 * Handle a fault on the vmalloc area.
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in a page table update; in the latter
	   case just flush. */
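	/* Example of the race: another CPU can populate this pgd slot between
	   the fault and the check below; the entry it installs must match
	   init_mm's, which is what the BUG_ON() verifies. */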
	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
	set_pgd(pgd, *pgd_ref);
	BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared. */
	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
	pte = pte_offset_kernel(pmd, address);

	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	info.si_code = SEGV_MAPERR;
	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * ((error_code & PF_USER) == 0) and that the fault was not a
	 * protection or reserved-bit fault
	 * ((error_code & (PF_PROT|PF_RSVD)) == 0).
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
		if (notify_page_fault(regs))
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;

	if (notify_page_fault(regs))
	if (likely(regs->flags & X86_EFLAGS_IF))

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context, or are running in
	 * an atomic region, then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space; if we cannot, we then validate the
	 * source. If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (likely(vma->vm_start <= address))
	if (!(vma->vm_flags & VM_GROWSDOWN))
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
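		/* For example, "enter $65535, $31" pushes 32 frame pointers
		 * and then subtracts 65535 from the stack pointer, which is
		 * the cushion the check below allows for.
		 */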
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
	if (expand_stack(vma, address))
	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it.
	 */
	info.si_code = SEGV_ACCERR;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:		/* PF_PROT|PF_WRITE: write, present */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
	case PF_PROT:		/* read, present */
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
		else if (fault & VM_FAULT_SIGBUS)
	if (fault & VM_FAULT_MAJOR)
	up_read(&mm->mmap_sem);
	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		if (is_prefetch(regs, address, error_code))
		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB. We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return. Any code segment in the LDT is
		   compatibility mode (selector bit 2 set means the
		   segment comes from the LDT). */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
			tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, tsk->pid, address, regs->ip,
			regs->sp, error_code);
		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))

	if (is_errata93(regs, address))

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	flags = oops_begin();
	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
	printk("VM: killing process %s\n", tsk->comm);
	do_group_exit(SIGKILL);

	up_read(&mm->mmap_sem);
	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
DEFINE_SPINLOCK(pgd_lock);

void vmalloc_sync_all(void)
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
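	/* For example, if two CPUs sync the same pgd index concurrently they
	   both copy the identical entry from init_mm, so repeating the work
	   is harmless. */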
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;
	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);

			if (pgd_none(*pgd_ref))
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				set_pgd(pgd, *pgd_ref);
				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		if (address == start)
			start = address + PGDIR_SIZE;
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));