err.no Git - linux-2.6/blob - arch/x86/mm/fault_64.c

   1 /*
   2  *  Copyright (C) 1995  Linus Torvalds
   3  *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
   4  */
   5
   6 #include <linux/signal.h>
   7 #include <linux/sched.h>
   8 #include <linux/kernel.h>
   9 #include <linux/errno.h>
  10 #include <linux/string.h>
  11 #include <linux/types.h>
  12 #include <linux/ptrace.h>
  13 #include <linux/mman.h>
  14 #include <linux/mm.h>
  15 #include <linux/smp.h>
  16 #include <linux/interrupt.h>
  17 #include <linux/init.h>
  18 #include <linux/tty.h>
  19 #include <linux/vt_kern.h>              /* For unblank_screen() */
  20 #include <linux/compiler.h>
  21 #include <linux/vmalloc.h>
  22 #include <linux/module.h>
  23 #include <linux/kprobes.h>
  24 #include <linux/uaccess.h>
  25 #include <linux/kdebug.h>
  26
  27 #include <asm/system.h>
  28 #include <asm/pgalloc.h>
  29 #include <asm/smp.h>
  30 #include <asm/tlbflush.h>
  31 #include <asm/proto.h>
  32 #include <asm-generic/sections.h>
  33
  /*
   * Page fault error code bits
   *      bit 0 == 0 means no page found, 1 means protection fault
   *      bit 1 == 0 means read, 1 means write
   *      bit 2 == 0 means kernel, 1 means user-mode
   *      bit 3 == 1 means use of reserved bit detected
   *      bit 4 == 1 means fault was an instruction fetch
   */
  #define PF_PROT         (1 << 0)        /* protection fault, page was present */
  #define PF_WRITE        (1 << 1)        /* access was a write */
  #define PF_USER         (1 << 2)        /* access came from user mode */
  #define PF_RSVD         (1 << 3)        /* reserved bit was set in a table entry */
  #define PF_INSTR        (1 << 4)        /* access was an instruction fetch */
  47
  /*
   * Offer a page fault to the kprobes fault handler.
   *
   * Returns 1 when a kprobe is running and kprobe_fault_handler() reports
   * a nonzero result for trap 14, otherwise 0.  Faults taken in user mode
   * are never offered, and without CONFIG_KPROBES the result is always 0.
   */
  static inline int notify_page_fault(struct pt_regs *regs)
  {
  #ifdef CONFIG_KPROBES
          int handled;

          if (user_mode(regs))
                  return 0;

          /* kprobe_running() needs smp_processor_id() */
          preempt_disable();
          handled = kprobe_running() && kprobe_fault_handler(regs, 14);
          preempt_enable();

          return handled;
  #else
          return 0;
  #endif
  }
  66
  67 /*
  68  * X86_32
  69  * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
  70  * Check that here and ignore it.
  71  *
  72  * X86_64
  73  * Sometimes the CPU reports invalid exceptions on prefetch.
  74  * Check that here and ignore it.
  75  *
  76  * Opcode checker based on code by Richard Brunner
  77  */
  78 static int is_prefetch(struct pt_regs *regs, unsigned long addr,
  79                        unsigned long error_code)
  80 {
  81         unsigned char *instr;
  82         int scan_more = 1;
  83         int prefetch = 0;
  84         unsigned char *max_instr;
  85
  86 #ifdef CONFIG_X86_32
  87         if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
  88                      boot_cpu_data.x86 >= 6)) {
  89                 /* Catch an obscure case of prefetch inside an NX page. */
  90                 if (nx_enabled && (error_code & PF_INSTR))
  91                         return 0;
  92         } else {
  93                 return 0;
  94         }
  95 #else
  96         /* If it was a exec fault ignore */
  97         if (error_code & PF_INSTR)
  98                 return 0;
  99 #endif
 100
 101         instr = (unsigned char *)convert_ip_to_linear(current, regs);
 102         max_instr = instr + 15;
 103
 104         if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
 105                 return 0;
 106
 107         while (scan_more && instr < max_instr) {
 108                 unsigned char opcode;
 109                 unsigned char instr_hi;
 110                 unsigned char instr_lo;
 111
 112                 if (probe_kernel_address(instr, opcode))
 113                         break;
 114
 115                 instr_hi = opcode & 0xf0;
 116                 instr_lo = opcode & 0x0f;
 117                 instr++;
 118
 119                 switch (instr_hi) {
 120                 case 0x20:
 121                 case 0x30:
 122                         /*
 123                          * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
 124                          * In X86_64 long mode, the CPU will signal invalid
 125                          * opcode if some of these prefixes are present so
 126                          * X86_64 will never get here anyway
 127                          */
 128                         scan_more = ((instr_lo & 7) == 0x6);
 129                         break;
 130 #ifdef CONFIG_X86_64
 131                 case 0x40:
 132                         /*
 133                          * In AMD64 long mode 0x40..0x4F are valid REX prefixes
 134                          * Need to figure out under what instruction mode the
 135                          * instruction was issued. Could check the LDT for lm,
 136                          * but for now it's good enough to assume that long
 137                          * mode only uses well known segments or kernel.
 138                          */
 139                         scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
 140                         break;
 141 #endif
 142                 case 0x60:
 143                         /* 0x64 thru 0x67 are valid prefixes in all modes. */
 144                         scan_more = (instr_lo & 0xC) == 0x4;
 145                         break;
 146                 case 0xF0:
 147                         /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
 148                         scan_more = !instr_lo || (instr_lo>>1) == 1;
 149                         break;
 150                 case 0x00:
 151                         /* Prefetch instruction is 0x0F0D or 0x0F18 */
 152                         scan_more = 0;
 153
 154                         if (probe_kernel_address(instr, opcode))
 155                                 break;
 156                         prefetch = (instr_lo == 0xF) &&
 157                                 (opcode == 0x0D || opcode == 0x18);
 158                         break;
 159                 default:
 160                         scan_more = 0;
 161                         break;
 162                 }
 163         }
 164         return prefetch;
 165 }
 166
 167 static void force_sig_info_fault(int si_signo, int si_code,
 168         unsigned long address, struct task_struct *tsk)
 169 {
 170         siginfo_t info;
 171
 172         info.si_signo = si_signo;
 173         info.si_errno = 0;
 174         info.si_code = si_code;
 175         info.si_addr = (void __user *)address;
 176         force_sig_info(si_signo, &info, tsk);
 177 }
 178
 /*
  * Try to read one unsigned long at @p with probe_kernel_address() and
  * return its result; the value read is discarded.
  */
 static int bad_address(void *p)
 {
         unsigned long *word = p;
         unsigned long scratch;

         return probe_kernel_address(word, scratch);
 }
 184
 185 void dump_pagetable(unsigned long address)
 186 {
 187         pgd_t *pgd;
 188         pud_t *pud;
 189         pmd_t *pmd;
 190         pte_t *pte;
 191
 192         pgd = (pgd_t *)read_cr3();
 193
 194         pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
 195         pgd += pgd_index(address);
 196         if (bad_address(pgd)) goto bad;
 197         printk("PGD %lx ", pgd_val(*pgd));
 198         if (!pgd_present(*pgd)) goto ret;
 199
 200         pud = pud_offset(pgd, address);
 201         if (bad_address(pud)) goto bad;
 202         printk("PUD %lx ", pud_val(*pud));
 203         if (!pud_present(*pud)) goto ret;
 204
 205         pmd = pmd_offset(pud, address);
 206         if (bad_address(pmd)) goto bad;
 207         printk("PMD %lx ", pmd_val(*pmd));
 208         if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
 209
 210         pte = pte_offset_kernel(pmd, address);
 211         if (bad_address(pte)) goto bad;
 212         printk("PTE %lx", pte_val(*pte));
 213 ret:
 214         printk("\n");
 215         return;
 216 bad:
 217         printk("BAD\n");
 218 }
 219
 #ifdef CONFIG_X86_64
 /* Printed once, the first time is_errata93() applies its workaround. */
 static const char errata93_warning[] =
 KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
 KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
 KERN_ERR "******* Please consider a BIOS update.\n"
 KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

 /*
  * Workaround for K8 erratum #93 & buggy BIOS.
  * BIOS SMM functions are required to use a specific workaround
  * to avoid corruption of the 64bit RIP register on C stepping K8.
  * A lot of BIOS that didn't get tested properly miss this.
  * The OS sees this as a page fault with the upper 32bits of RIP cleared.
  * Try to work around it here.
  * Note we only handle faults in kernel here.
  *
  * Returns 1 after rewriting regs->ip with the upper 32 bits set again,
  * which is done only when @address equals regs->ip, has no upper bits
  * set, and the restored value lies in kernel text or the module range.
  * Returns 0 otherwise and leaves @regs alone.
  */
 static int is_errata93(struct pt_regs *regs, unsigned long address)
 {
         static int warned;
         unsigned long restored;
         int in_kernel_text;
         int in_modules;

         if (address != regs->ip || (address >> 32) != 0)
                 return 0;

         restored = address | (0xffffffffUL << 32);
         in_kernel_text = restored >= (u64)_stext && restored <= (u64)_etext;
         in_modules = restored >= MODULES_VADDR && restored <= MODULES_END;
         if (!in_kernel_text && !in_modules)
                 return 0;

         if (!warned) {
                 printk(errata93_warning);
                 warned = 1;
         }
         regs->ip = restored;
         return 1;
 }
 #endif
 255
 256 static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 257                                  unsigned long error_code)
 258 {
 259         unsigned long flags = oops_begin();
 260         struct task_struct *tsk;
 261
 262         printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
 263                current->comm, address);
 264         dump_pagetable(address);
 265         tsk = current;
 266         tsk->thread.cr2 = address;
 267         tsk->thread.trap_no = 14;
 268         tsk->thread.error_code = error_code;
 269         if (__die("Bad pagetable", regs, error_code))
 270                 regs = NULL;
 271         oops_end(flags, regs, SIGKILL);
 272 }
 273
 274 /*
 275  * Handle a fault on the vmalloc area
 276  *
 277  * This assumes no large pages in there.
 278  */
 279 static int vmalloc_fault(unsigned long address)
 280 {
 281         pgd_t *pgd, *pgd_ref;
 282         pud_t *pud, *pud_ref;
 283         pmd_t *pmd, *pmd_ref;
 284         pte_t *pte, *pte_ref;
 285
 286         /* Copy kernel mappings over when needed. This can also
 287            happen within a race in page table update. In the later
 288            case just flush. */
 289
 290         pgd = pgd_offset(current->mm ?: &init_mm, address);
 291         pgd_ref = pgd_offset_k(address);
 292         if (pgd_none(*pgd_ref))
 293                 return -1;
 294         if (pgd_none(*pgd))
 295                 set_pgd(pgd, *pgd_ref);
 296         else
 297                 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 298
 299         /* Below here mismatches are bugs because these lower tables
 300            are shared */
 301
 302         pud = pud_offset(pgd, address);
 303         pud_ref = pud_offset(pgd_ref, address);
 304         if (pud_none(*pud_ref))
 305                 return -1;
 306         if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
 307                 BUG();
 308         pmd = pmd_offset(pud, address);
 309         pmd_ref = pmd_offset(pud_ref, address);
 310         if (pmd_none(*pmd_ref))
 311                 return -1;
 312         if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
 313                 BUG();
 314         pte_ref = pte_offset_kernel(pmd_ref, address);
 315         if (!pte_present(*pte_ref))
 316                 return -1;
 317         pte = pte_offset_kernel(pmd, address);
 318         /* Don't use pte_page here, because the mappings can point
 319            outside mem_map, and the NUMA hash lookup cannot handle
 320            that. */
 321         if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
 322                 BUG();
 323         return 0;
 324 }
 325
 326 int show_unhandled_signals = 1;
 327
 328 /*
 329  * This routine handles page faults.  It determines the address,
 330  * and the problem, and then passes it off to one of the appropriate
 331  * routines.
 332  */
 333 asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 334                                         unsigned long error_code)
 335 {
 336         struct task_struct *tsk;
 337         struct mm_struct *mm;
 338         struct vm_area_struct *vma;
 339         unsigned long address;
 340         int write, fault;
 341         unsigned long flags;
 342         int si_code;
 343
 344         /*
 345          * We can fault from pretty much anywhere, with unknown IRQ state.
 346          */
 347         trace_hardirqs_fixup();
 348
 349         tsk = current;
 350         mm = tsk->mm;
 351         prefetchw(&mm->mmap_sem);
 352
 353         /* get the address */
 354         address = read_cr2();
 355
 356         si_code = SEGV_MAPERR;
 357
 358
 359         /*
 360          * We fault-in kernel-space virtual memory on-demand. The
 361          * 'reference' page table is init_mm.pgd.
 362          *
 363          * NOTE! We MUST NOT take any locks for this case. We may
 364          * be in an interrupt or a critical region, and should
 365          * only copy the information from the master page table,
 366          * nothing more.
 367          *
 368          * This verifies that the fault happens in kernel space
 369          * (error_code & 4) == 0, and that the fault was not a
 370          * protection error (error_code & 9) == 0.
 371          */
 372         if (unlikely(address >= TASK_SIZE64)) {
 373                 /*
 374                  * Don't check for the module range here: its PML4
 375                  * is always initialized because it's shared with the main
 376                  * kernel text. Only vmalloc may need PML4 syncups.
 377                  */
 378                 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
 379                       ((address >= VMALLOC_START && address < VMALLOC_END))) {
 380                         if (vmalloc_fault(address) >= 0)
 381                                 return;
 382                 }
 383                 if (notify_page_fault(regs))
 384                         return;
 385                 /*
 386                  * Don't take the mm semaphore here. If we fixup a prefetch
 387                  * fault we could otherwise deadlock.
 388                  */
 389                 goto bad_area_nosemaphore;
 390         }
 391
 392         if (notify_page_fault(regs))
 393                 return;
 394
 395         if (likely(regs->flags & X86_EFLAGS_IF))
 396                 local_irq_enable();
 397
 398         if (unlikely(error_code & PF_RSVD))
 399                 pgtable_bad(address, regs, error_code);
 400
 401         /*
 402          * If we're in an interrupt, have no user context or are running in an
 403          * atomic region then we must not take the fault.
 404          */
 405         if (unlikely(in_atomic() || !mm))
 406                 goto bad_area_nosemaphore;
 407
 408         /*
 409          * User-mode registers count as a user access even for any
 410          * potential system fault or CPU buglet.
 411          */
 412         if (user_mode_vm(regs))
 413                 error_code |= PF_USER;
 414
 415  again:
 416         /* When running in the kernel we expect faults to occur only to
 417          * addresses in user space.  All other faults represent errors in the
 418          * kernel and should generate an OOPS.  Unfortunately, in the case of an
 419          * erroneous fault occurring in a code path which already holds mmap_sem
 420          * we will deadlock attempting to validate the fault against the
 421          * address space.  Luckily the kernel only validly references user
 422          * space from well defined areas of code, which are listed in the
 423          * exceptions table.
 424          *
 425          * As the vast majority of faults will be valid we will only perform
 426          * the source reference check when there is a possibility of a deadlock.
 427          * Attempt to lock the address space, if we cannot we then validate the
 428          * source.  If this is invalid we can skip the address space check,
 429          * thus avoiding the deadlock.
 430          */
 431         if (!down_read_trylock(&mm->mmap_sem)) {
 432                 if ((error_code & PF_USER) == 0 &&
 433                     !search_exception_tables(regs->ip))
 434                         goto bad_area_nosemaphore;
 435                 down_read(&mm->mmap_sem);
 436         }
 437
 438         vma = find_vma(mm, address);
 439         if (!vma)
 440                 goto bad_area;
 441         if (likely(vma->vm_start <= address))
 442                 goto good_area;
 443         if (!(vma->vm_flags & VM_GROWSDOWN))
 444                 goto bad_area;
 445         if (error_code & PF_USER) {
 446                 /* Allow userspace just enough access below the stack pointer
 447                  * to let the 'enter' instruction work.
 448                  */
 449                 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
 450                         goto bad_area;
 451         }
 452         if (expand_stack(vma, address))
 453                 goto bad_area;
 454 /*
 455  * Ok, we have a good vm_area for this memory access, so
 456  * we can handle it..
 457  */
 458 good_area:
 459         si_code = SEGV_ACCERR;
 460         write = 0;
 461         switch (error_code & (PF_PROT|PF_WRITE)) {
 462         default:        /* 3: write, present */
 463                 /* fall through */
 464         case PF_WRITE:          /* write, not present */
 465                 if (!(vma->vm_flags & VM_WRITE))
 466                         goto bad_area;
 467                 write++;
 468                 break;
 469         case PF_PROT:           /* read, present */
 470                 goto bad_area;
 471         case 0:                 /* read, not present */
 472                 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 473                         goto bad_area;
 474         }
 475
 476         /*
 477          * If for any reason at all we couldn't handle the fault,
 478          * make sure we exit gracefully rather than endlessly redo
 479          * the fault.
 480          */
 481         fault = handle_mm_fault(mm, vma, address, write);
 482         if (unlikely(fault & VM_FAULT_ERROR)) {
 483                 if (fault & VM_FAULT_OOM)
 484                         goto out_of_memory;
 485                 else if (fault & VM_FAULT_SIGBUS)
 486                         goto do_sigbus;
 487                 BUG();
 488         }
 489         if (fault & VM_FAULT_MAJOR)
 490                 tsk->maj_flt++;
 491         else
 492                 tsk->min_flt++;
 493         up_read(&mm->mmap_sem);
 494         return;
 495
 496 /*
 497  * Something tried to access memory that isn't in our memory map..
 498  * Fix it, but check if it's kernel or user first..
 499  */
 500 bad_area:
 501         up_read(&mm->mmap_sem);
 502
 503 bad_area_nosemaphore:
 504         /* User mode accesses just cause a SIGSEGV */
 505         if (error_code & PF_USER) {
 506
 507                 /*
 508                  * It's possible to have interrupts off here.
 509                  */
 510                 local_irq_enable();
 511
 512                 if (is_prefetch(regs, address, error_code))
 513                         return;
 514
 515                 /* Work around K8 erratum #100 K8 in compat mode
 516                    occasionally jumps to illegal addresses >4GB.  We
 517                    catch this here in the page fault handler because
 518                    these addresses are not reachable. Just detect this
 519                    case and return.  Any code segment in LDT is
 520                    compatibility mode. */
 521                 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
 522                     (address >> 32))
 523                         return;
 524
 525                 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
 526                     printk_ratelimit()) {
 527                         printk(
 528                        "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
 529                                         tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
 530                                         tsk->comm, tsk->pid, address, regs->ip,
 531                                         regs->sp, error_code);
 532                 }
 533
 534                 tsk->thread.cr2 = address;
 535                 /* Kernel addresses are always protection faults */
 536                 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
 537                 tsk->thread.trap_no = 14;
 538
 539                 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
 540                 return;
 541         }
 542
 543 no_context:
 544         /* Are we prepared to handle this kernel fault?  */
 545         if (fixup_exception(regs))
 546                 return;
 547
 548         /*
 549          * Hall of shame of CPU/BIOS bugs.
 550          */
 551
 552         if (is_prefetch(regs, address, error_code))
 553                 return;
 554
 555         if (is_errata93(regs, address))
 556                 return;
 557
 558 /*
 559  * Oops. The kernel tried to access some bad page. We'll have to
 560  * terminate things with extreme prejudice.
 561  */
 562
 563         flags = oops_begin();
 564
 565         if (address < PAGE_SIZE)
 566                 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
 567         else
 568                 printk(KERN_ALERT "Unable to handle kernel paging request");
 569         printk(" at %016lx RIP: \n" KERN_ALERT, address);
 570         printk_address(regs->ip, regs->bp);
 571         dump_pagetable(address);
 572         tsk->thread.cr2 = address;
 573         tsk->thread.trap_no = 14;
 574         tsk->thread.error_code = error_code;
 575         if (__die("Oops", regs, error_code))
 576                 regs = NULL;
 577         /* Executive summary in case the body of the oops scrolled away */
 578         printk(KERN_EMERG "CR2: %016lx\n", address);
 579         oops_end(flags, regs, SIGKILL);
 580
 581 /*
 582  * We ran out of memory, or some other thing happened to us that made
 583  * us unable to handle the page fault gracefully.
 584  */
 585 out_of_memory:
 586         up_read(&mm->mmap_sem);
 587         if (is_global_init(current)) {
 588                 yield();
 589                 goto again;
 590         }
 591         printk("VM: killing process %s\n", tsk->comm);
 592         if (error_code & PF_USER)
 593                 do_group_exit(SIGKILL);
 594         goto no_context;
 595
 596 do_sigbus:
 597         up_read(&mm->mmap_sem);
 598
 599         /* Kernel mode? Handle exceptions or die */
 600         if (!(error_code & PF_USER))
 601                 goto no_context;
 602
 603         tsk->thread.cr2 = address;
 604         tsk->thread.error_code = error_code;
 605         tsk->thread.trap_no = 14;
 606         force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
 607         return;
 608 }
 609
 610 DEFINE_SPINLOCK(pgd_lock);
 611 LIST_HEAD(pgd_list);
 612
 613 void vmalloc_sync_all(void)
 614 {
 615         /* Note that races in the updates of insync and start aren't
 616            problematic:
 617            insync can only get set bits added, and updates to start are only
 618            improving performance (without affecting correctness if undone). */
 619         static DECLARE_BITMAP(insync, PTRS_PER_PGD);
 620         static unsigned long start = VMALLOC_START & PGDIR_MASK;
 621         unsigned long address;
 622
 623         for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
 624                 if (!test_bit(pgd_index(address), insync)) {
 625                         const pgd_t *pgd_ref = pgd_offset_k(address);
 626                         struct page *page;
 627
 628                         if (pgd_none(*pgd_ref))
 629                                 continue;
 630                         spin_lock(&pgd_lock);
 631                         list_for_each_entry(page, &pgd_list, lru) {
 632                                 pgd_t *pgd;
 633                                 pgd = (pgd_t *)page_address(page) + pgd_index(address);
 634                                 if (pgd_none(*pgd))
 635                                         set_pgd(pgd, *pgd_ref);
 636                                 else
 637                                         BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 638                         }
 639                         spin_unlock(&pgd_lock);
 640                         set_bit(pgd_index(address), insync);
 641                 }
 642                 if (address == start)
 643                         start = address + PGDIR_SIZE;
 644         }
 645         /* Check that there is no need to do the same for the modules area. */
 646         BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
 647         BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
 648                                 (__START_KERNEL & PGDIR_MASK)));
 649 }