2 * Copyright (C) 1995 Linus Torvalds
3 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
6 #include <linux/signal.h>
7 #include <linux/sched.h>
8 #include <linux/kernel.h>
9 #include <linux/errno.h>
10 #include <linux/string.h>
11 #include <linux/types.h>
12 #include <linux/ptrace.h>
13 #include <linux/mman.h>
15 #include <linux/smp.h>
16 #include <linux/interrupt.h>
17 #include <linux/init.h>
18 #include <linux/tty.h>
19 #include <linux/vt_kern.h> /* For unblank_screen() */
20 #include <linux/compiler.h>
21 #include <linux/vmalloc.h>
22 #include <linux/module.h>
23 #include <linux/kprobes.h>
24 #include <linux/uaccess.h>
25 #include <linux/kdebug.h>
27 #include <asm/system.h>
28 #include <asm/pgalloc.h>
30 #include <asm/tlbflush.h>
31 #include <asm/proto.h>
32 #include <asm-generic/sections.h>
/*
 * Hardware page-fault error code bits, as pushed by the CPU for
 * exception 14 (#PF).  The PF_* macros below name the bits decoded
 * by the original comment.
 */
35 * Page fault error code bits
36 * bit 0 == 0 means no page found, 1 means protection fault
37 * bit 1 == 0 means read, 1 means write
38 * bit 2 == 0 means kernel, 1 means user-mode
39 * bit 3 == 1 means use of reserved bit detected
40 * bit 4 == 1 means fault was an instruction fetch
42 #define PF_PROT (1<<0)
43 #define PF_WRITE (1<<1)
44 #define PF_USER (1<<2)
45 #define PF_RSVD (1<<3)
46 #define PF_INSTR (1<<4)
/*
 * notify_page_fault - give kprobes first crack at a kernel-mode fault.
 * Calls kprobe_fault_handler() with trap number 14 when a kprobe is
 * active and the fault did not come from user mode.
 * NOTE(review): the function tail (return statements, and any
 * preempt bracketing around kprobe_running()) is elided from this
 * chunk — confirm against the full file.
 */
48 static inline int notify_page_fault(struct pt_regs *regs)
53 /* kprobe_running() needs smp_processor_id() */
54 if (!user_mode(regs)) {
56 if (kprobe_running() && kprobe_fault_handler(regs, 14))
/*
 * is_prefetch - decide whether a reported fault was really caused by a
 * (software) prefetch instruction, which some CPUs flag spuriously.
 * Decodes up to 15 bytes of the faulting instruction stream starting
 * at the linear RIP, skipping legal prefix bytes, and checks for the
 * PREFETCH opcodes 0x0F 0x0D / 0x0F 0x18.
 * NOTE(review): this chunk elides several lines of the original
 * function — local declarations (instr, opcode, prefetch, scan_more),
 * the switch statement on instr_hi, and all return paths — so the
 * code below is not self-contained as shown.
 */
67 /* Sometimes the CPU reports invalid exceptions on prefetch.
68 Check that here and ignore.
69 Opcode checker based on code by Richard Brunner */
70 static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
71 unsigned long error_code)
76 unsigned char *max_instr;
78 /* If it was an exec (instruction-fetch) fault, ignore it */
79 if (error_code & PF_INSTR)
82 instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
/* Decode at most 15 bytes — the architectural x86 instruction limit */
83 max_instr = instr + 15;
85 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
88 while (scan_more && instr < max_instr) {
90 unsigned char instr_hi;
91 unsigned char instr_lo;
/* Safe read of the instruction byte; bail out if it faults */
93 if (probe_kernel_address(instr, opcode))
96 instr_hi = opcode & 0xf0;
97 instr_lo = opcode & 0x0f;
104 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
105 * In X86_64 long mode, the CPU will signal invalid
106 * opcode if some of these prefixes are present so
107 * X86_64 will never get here anyway
109 scan_more = ((instr_lo & 7) == 0x6);
114 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
115 * Need to figure out under what instruction mode the
116 * instruction was issued. Could check the LDT for lm,
117 * but for now it's good enough to assume that long
118 * mode only uses well known segments or kernel.
120 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
124 /* 0x64 thru 0x67 are valid prefixes in all modes. */
125 scan_more = (instr_lo & 0xC) == 0x4;
128 /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
129 scan_more = !instr_lo || (instr_lo>>1) == 1;
132 /* Prefetch instruction is 0x0F0D or 0x0F18 */
134 if (probe_kernel_address(instr, opcode))
136 prefetch = (instr_lo == 0xF) &&
137 (opcode == 0x0D || opcode == 0x18);
/*
 * bad_address - nonzero when a page-table entry pointer cannot be
 * safely dereferenced (probe_kernel_address() reads into a dummy and
 * reports failure).  Used by dump_pagetable() before each level walk.
 * NOTE(review): the declaration of `dummy` and the braces are elided
 * from this chunk.
 */
147 static int bad_address(void *p)
150 return probe_kernel_address((unsigned long *)p, dummy);
/*
 * dump_pagetable - print the PGD/PUD/PMD/PTE entries mapping `address`,
 * walking from the live CR3 value.  Each level is validated with
 * bad_address() before dereferencing; the walk stops early at a
 * non-present entry or a large (huge) PMD.
 * NOTE(review): local declarations and the `bad:`/`ret:` label bodies
 * referenced by the gotos are elided from this chunk.
 */
153 void dump_pagetable(unsigned long address)
160 pgd = (pgd_t *)read_cr3();
/* CR3 holds a physical address; mask flags and convert to virtual */
162 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
163 pgd += pgd_index(address);
164 if (bad_address(pgd)) goto bad;
165 printk("PGD %lx ", pgd_val(*pgd));
166 if (!pgd_present(*pgd)) goto ret;
168 pud = pud_offset(pgd, address);
169 if (bad_address(pud)) goto bad;
170 printk("PUD %lx ", pud_val(*pud));
171 if (!pud_present(*pud)) goto ret;
173 pmd = pmd_offset(pud, address);
174 if (bad_address(pmd)) goto bad;
175 printk("PMD %lx ", pmd_val(*pmd));
/* A large PMD maps the page directly — there is no PTE level to show */
176 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
178 pte = pte_offset_kernel(pmd, address);
179 if (bad_address(pte)) goto bad;
180 printk("PTE %lx", pte_val(*pte));
/* Multi-line KERN_ERR banner printed (once) by is_errata93() below. */
188 static const char errata93_warning[] =
189 KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
190 KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
191 KERN_ERR "******* Please consider a BIOS update.\n"
192 KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
/*
 * is_errata93 - detect and work around AMD K8 erratum #93: buggy BIOS
 * SMM code can corrupt the upper 32 bits of RIP, so a kernel fault
 * whose address equals RIP with the high half cleared is repaired by
 * sign-extending and checking against kernel text / module ranges.
 * NOTE(review): the return statements, the once-only guard around the
 * warning, and the fixup of regs->ip are elided from this chunk.
 */
194 /* Workaround for K8 erratum #93 & buggy BIOS.
195 BIOS SMM functions are required to use a specific workaround
196 to avoid corruption of the 64bit RIP register on C stepping K8.
197 A lot of BIOS that didn't get tested properly miss this.
198 The OS sees this as a page fault with the upper 32bits of RIP cleared.
199 Try to work around it here.
200 Note we only handle faults in kernel here. */
202 static int is_errata93(struct pt_regs *regs, unsigned long address)
205 if (address != regs->ip)
207 if ((address >> 32) != 0)
/* Restore the clobbered upper 32 bits and see if it lands in text */
209 address |= 0xffffffffUL << 32;
210 if ((address >= (u64)_stext && address <= (u64)_etext) ||
211 (address >= MODULES_VADDR && address <= MODULES_END)) {
213 printk(errata93_warning);
/*
 * pgtable_bad - oops path for a fault with the reserved-bit error code
 * set (corrupted page table).  Dumps the page table for the address,
 * records the fault in the task struct, and dies with SIGKILL.
 * NOTE(review): `tsk` is used below but no `tsk = current;` assignment
 * is visible in this chunk — presumably elided; confirm against the
 * full file before assuming a use-before-init bug.
 */
222 static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
223 unsigned long error_code)
225 unsigned long flags = oops_begin();
226 struct task_struct *tsk;
228 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
229 current->comm, address);
230 dump_pagetable(address);
/* Record fault details where the core-dump / debugger code expects them */
232 tsk->thread.cr2 = address;
233 tsk->thread.trap_no = 14;
234 tsk->thread.error_code = error_code;
235 if (__die("Bad pagetable", regs, error_code))
237 oops_end(flags, regs, SIGKILL);
/*
 * vmalloc_fault - lazily sync a vmalloc-area mapping from the reference
 * kernel page table (init_mm, via pgd_offset_k) into the current page
 * table.  Only the PGD level may legitimately differ; any mismatch at
 * PUD/PMD/PTE level is a bug (BUG_ON / error return).
 * NOTE(review): the return statements and some comment/brace lines are
 * elided from this chunk.
 */
241 * Handle a fault on the vmalloc area
243 * This assumes no large pages in there.
245 static int vmalloc_fault(unsigned long address)
247 pgd_t *pgd, *pgd_ref;
248 pud_t *pud, *pud_ref;
249 pmd_t *pmd, *pmd_ref;
250 pte_t *pte, *pte_ref;
252 /* Copy kernel mappings over when needed. This can also
253 happen within a race in page table update. In the later
/* A kernel thread may have no mm; fall back to init_mm's PGD */
256 pgd = pgd_offset(current->mm ?: &init_mm, address);
257 pgd_ref = pgd_offset_k(address);
258 if (pgd_none(*pgd_ref))
261 set_pgd(pgd, *pgd_ref);
263 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
265 /* Below here mismatches are bugs because these lower tables
268 pud = pud_offset(pgd, address);
269 pud_ref = pud_offset(pgd_ref, address);
270 if (pud_none(*pud_ref))
272 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
274 pmd = pmd_offset(pud, address);
275 pmd_ref = pmd_offset(pud_ref, address);
276 if (pmd_none(*pmd_ref))
278 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
280 pte_ref = pte_offset_kernel(pmd_ref, address);
281 if (!pte_present(*pte_ref))
283 pte = pte_offset_kernel(pmd, address);
284 /* Don't use pte_page here, because the mappings can point
285 outside mem_map, and the NUMA hash lookup cannot handle
/* Compare PFNs instead of struct pages for the reason above */
287 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
/*
 * When nonzero, unhandled user SIGSEGVs are logged (rate-limited) by
 * do_page_fault() below.  Presumably tunable via sysctl — confirm.
 */
292 int show_unhandled_signals = 1;
/*
 * do_page_fault - x86-64 page-fault (#PF, trap 14) entry point.
 * Reads the faulting address from CR2, handles kernel vmalloc-area
 * faults without taking locks, otherwise looks up the VMA and calls
 * handle_mm_fault(); on failure it delivers SIGSEGV/SIGBUS to user
 * tasks or takes the exception-fixup / oops path for kernel faults.
 * NOTE(review): this chunk elides many lines of the original —
 * local declarations (info, fault, flags, write), the assignments
 * `tsk = current; mm = tsk->mm;`, the labels good_area:, bad_area:,
 * out_of_memory:, do_sigbus:, and most closing braces/returns — so
 * the control flow below must be read against the full file.
 */
295 * This routine handles page faults. It determines the address,
296 * and the problem, and then passes it off to one of the appropriate
299 asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
300 unsigned long error_code)
302 struct task_struct *tsk;
303 struct mm_struct *mm;
304 struct vm_area_struct *vma;
305 unsigned long address;
311 * We can fault from pretty much anywhere, with unknown IRQ state.
313 trace_hardirqs_fixup();
/* Warm the cacheline we will almost certainly lock shortly */
317 prefetchw(&mm->mmap_sem);
319 /* get the address */
320 address = read_cr2();
322 info.si_code = SEGV_MAPERR;
326 * We fault-in kernel-space virtual memory on-demand. The
327 * 'reference' page table is init_mm.pgd.
329 * NOTE! We MUST NOT take any locks for this case. We may
330 * be in an interrupt or a critical region, and should
331 * only copy the information from the master page table,
334 * This verifies that the fault happens in kernel space
335 * (error_code & 4) == 0, and that the fault was not a
336 * protection error (error_code & 9) == 0.
338 if (unlikely(address >= TASK_SIZE64)) {
340 * Don't check for the module range here: its PML4
341 * is always initialized because it's shared with the main
342 * kernel text. Only vmalloc may need PML4 syncups.
344 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
345 ((address >= VMALLOC_START && address < VMALLOC_END))) {
346 if (vmalloc_fault(address) >= 0)
349 if (notify_page_fault(regs))
352 * Don't take the mm semaphore here. If we fixup a prefetch
353 * fault we could otherwise deadlock.
355 goto bad_area_nosemaphore;
358 if (notify_page_fault(regs))
/* Re-enable interrupts only if they were on when the fault hit */
361 if (likely(regs->flags & X86_EFLAGS_IF))
364 if (unlikely(error_code & PF_RSVD))
365 pgtable_bad(address, regs, error_code);
368 * If we're in an interrupt, have no user context or are running in an
369 * atomic region then we must not take the fault.
371 if (unlikely(in_atomic() || !mm))
372 goto bad_area_nosemaphore;
375 * User-mode registers count as a user access even for any
376 * potential system fault or CPU buglet.
378 if (user_mode_vm(regs))
379 error_code |= PF_USER;
382 /* When running in the kernel we expect faults to occur only to
383 * addresses in user space. All other faults represent errors in the
384 * kernel and should generate an OOPS. Unfortunately, in the case of an
385 * erroneous fault occurring in a code path which already holds mmap_sem
386 * we will deadlock attempting to validate the fault against the
387 * address space. Luckily the kernel only validly references user
388 * space from well defined areas of code, which are listed in the
391 * As the vast majority of faults will be valid we will only perform
392 * the source reference check when there is a possibility of a deadlock.
393 * Attempt to lock the address space, if we cannot we then validate the
394 * source. If this is invalid we can skip the address space check,
395 * thus avoiding the deadlock.
397 if (!down_read_trylock(&mm->mmap_sem)) {
398 if ((error_code & PF_USER) == 0 &&
399 !search_exception_tables(regs->ip))
400 goto bad_area_nosemaphore;
401 down_read(&mm->mmap_sem);
404 vma = find_vma(mm, address);
407 if (likely(vma->vm_start <= address))
/* Address is below the VMA: only valid for a growable stack VMA */
409 if (!(vma->vm_flags & VM_GROWSDOWN))
411 if (error_code & PF_USER) {
412 /* Allow userspace just enough access below the stack pointer
413 * to let the 'enter' instruction work.
415 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
418 if (expand_stack(vma, address))
421 * Ok, we have a good vm_area for this memory access, so
/* From here on a failure is an access violation, not a missing map */
425 info.si_code = SEGV_ACCERR;
427 switch (error_code & (PF_PROT|PF_WRITE)) {
428 default: /* 3: write, present */
430 case PF_WRITE: /* write, not present */
431 if (!(vma->vm_flags & VM_WRITE))
435 case PF_PROT: /* read, present */
437 case 0: /* read, not present */
438 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
443 * If for any reason at all we couldn't handle the fault,
444 * make sure we exit gracefully rather than endlessly redo
447 fault = handle_mm_fault(mm, vma, address, write);
448 if (unlikely(fault & VM_FAULT_ERROR)) {
449 if (fault & VM_FAULT_OOM)
451 else if (fault & VM_FAULT_SIGBUS)
/* Account the fault as major or minor for the task's counters */
455 if (fault & VM_FAULT_MAJOR)
459 up_read(&mm->mmap_sem);
463 * Something tried to access memory that isn't in our memory map..
464 * Fix it, but check if it's kernel or user first..
467 up_read(&mm->mmap_sem);
469 bad_area_nosemaphore:
470 /* User mode accesses just cause a SIGSEGV */
471 if (error_code & PF_USER) {
474 * It's possible to have interrupts off here.
/* Spurious prefetch faults are silently ignored (see is_prefetch) */
478 if (is_prefetch(regs, address, error_code))
481 /* Work around K8 erratum #100 K8 in compat mode
482 occasionally jumps to illegal addresses >4GB. We
483 catch this here in the page fault handler because
484 these addresses are not reachable. Just detect this
485 case and return. Any code segment in LDT is
486 compatibility mode. */
487 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
491 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
492 printk_ratelimit()) {
494 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
495 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
496 tsk->comm, tsk->pid, address, regs->ip,
497 regs->sp, error_code);
500 tsk->thread.cr2 = address;
501 /* Kernel addresses are always protection faults */
502 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
503 tsk->thread.trap_no = 14;
504 info.si_signo = SIGSEGV;
506 /* info.si_code has been set above */
507 info.si_addr = (void __user *)address;
508 force_sig_info(SIGSEGV, &info, tsk);
513 /* Are we prepared to handle this kernel fault? */
514 if (fixup_exception(regs))
518 * Hall of shame of CPU/BIOS bugs.
521 if (is_prefetch(regs, address, error_code))
524 if (is_errata93(regs, address))
528 * Oops. The kernel tried to access some bad page. We'll have to
529 * terminate things with extreme prejudice.
532 flags = oops_begin();
534 if (address < PAGE_SIZE)
535 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
537 printk(KERN_ALERT "Unable to handle kernel paging request");
538 printk(" at %016lx RIP: \n" KERN_ALERT, address);
539 printk_address(regs->ip);
540 dump_pagetable(address);
541 tsk->thread.cr2 = address;
542 tsk->thread.trap_no = 14;
543 tsk->thread.error_code = error_code;
544 if (__die("Oops", regs, error_code))
546 /* Executive summary in case the body of the oops scrolled away */
547 printk(KERN_EMERG "CR2: %016lx\n", address);
548 oops_end(flags, regs, SIGKILL);
551 * We ran out of memory, or some other thing happened to us that made
552 * us unable to handle the page fault gracefully.
555 up_read(&mm->mmap_sem);
/* Never OOM-kill init; presumably the elided branch retries instead */
556 if (is_global_init(current)) {
560 printk("VM: killing process %s\n", tsk->comm);
562 do_group_exit(SIGKILL);
566 up_read(&mm->mmap_sem);
568 /* Kernel mode? Handle exceptions or die */
569 if (!(error_code & PF_USER))
572 tsk->thread.cr2 = address;
573 tsk->thread.error_code = error_code;
574 tsk->thread.trap_no = 14;
575 info.si_signo = SIGBUS;
577 info.si_code = BUS_ADRERR;
578 info.si_addr = (void __user *)address;
579 force_sig_info(SIGBUS, &info, tsk);
/* Serializes traversal of pgd_list (taken by vmalloc_sync_all() below). */
583 DEFINE_SPINLOCK(pgd_lock);
/*
 * vmalloc_sync_all - propagate every populated vmalloc-range PGD entry
 * from the reference table (pgd_offset_k) into all process page tables
 * on pgd_list, under pgd_lock.  The static `insync` bitmap and `start`
 * hint skip PGD slots already synced on earlier calls.
 * NOTE(review): loop braces, `continue` paths, and the declaration of
 * `page`/`pgd` inside the loop are elided from this chunk.
 */
586 void vmalloc_sync_all(void)
588 /* Note that races in the updates of insync and start aren't
590 insync can only get set bits added, and updates to start are only
591 improving performance (without affecting correctness if undone). */
592 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
593 static unsigned long start = VMALLOC_START & PGDIR_MASK;
594 unsigned long address;
596 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
597 if (!test_bit(pgd_index(address), insync)) {
598 const pgd_t *pgd_ref = pgd_offset_k(address);
601 if (pgd_none(*pgd_ref))
603 spin_lock(&pgd_lock);
604 list_for_each_entry(page, &pgd_list, lru) {
606 pgd = (pgd_t *)page_address(page) + pgd_index(address);
608 set_pgd(pgd, *pgd_ref);
610 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
612 spin_unlock(&pgd_lock);
613 set_bit(pgd_index(address), insync);
/* Advance the hint only while syncing the leading edge of the range */
615 if (address == start)
616 start = address + PGDIR_SIZE;
618 /* Check that there is no need to do the same for the modules area. */
619 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
620 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
621 (__START_KERNEL & PGDIR_MASK)));