/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

/*
 * We use this if we don't have any better idle routine.
 */
void default_idle(void)
{
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
	if (!need_resched())
		safe_halt();	/* enables interrupts racelessly */
	else
		local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
}

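/*
 * Note on the ordering above: a waker that finds TS_POLLING set skips the
 * reschedule IPI and relies on the idle task polling need_resched().  Once
 * we clear TS_POLLING we are about to halt, so the waker must IPI us; the
 * smp_mb() makes the cleared flag globally visible before we sample
 * need_resched(), so a wakeup cannot slip in between the test and the hlt.
 */
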
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick();
		while (!need_resched()) {
			void (*idle)(void);

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle.  But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

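/*
 * Convention assumed by the loop above: an idle handler is entered with
 * interrupts disabled and re-enables them itself before returning
 * (default_idle() does so via safe_halt()), so the enter_idle()/exit_idle()
 * callbacks never run with stale irq state.
 */
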
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
		regs->flags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
		regs->ax, regs->bx, regs->cx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
		regs->dx, regs->si, regs->di);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
		regs->bp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
		regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
		regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
		fs, fsindex, gs, gsindex, shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
	tsk->fpu_counter = 0;
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr	= addr,
		.limit		= 0xfffff,
		.seg_32bit	= 1,
		.limit_in_pages	= 1,
		.useable	= 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

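/*
 * For reference (assuming the standard split x86 descriptor layout):
 * get_desc_base() reassembles the base that fill_ldt() scattered across
 * the descriptor in set_32bit_tls() above, roughly
 *
 *	base0 | (base1 << 16) | (base2 << 24)
 */
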
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)	/* kernel thread: run on the child's own stack */
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

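/*
 * Illustrative layout of the child's kernel stack as set up above
 * (top of the stack page downward):
 *
 *	task_stack_page(p) + THREAD_SIZE	<- p->thread.sp0
 *	struct pt_regs (childregs)		<- p->thread.sp
 *	... remaining kernel stack ...
 */
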
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	write_pda(oldrsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = X86_EFLAGS_IF;	/* start with interrupts enabled */
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);

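/*
 * Typical caller, for illustration: after setting up the new image, the
 * ELF loader hands control over with roughly
 *
 *	start_thread(regs, elf_entry, bprm->p);
 *
 * so the return to user mode lands at the program's entry point with a
 * fresh stack and flat segments.
 */
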
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}

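/*
 * Userspace view (illustrative sketch, not part of this file): these two
 * helpers back the PR_GET_TSC/PR_SET_TSC prctl codes, so a process can
 * request SIGSEGV on rdtsc with something like
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
 *
 * which reaches set_tsc_mode() above and sets CR4.TSD for the task.
 */
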
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

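/*
 * For example, loaddebug(next, 7) expands (via token pasting) to
 * set_debugreg(next->debugreg7, 7).
 */
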
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/*
		 * We clear debugctl to make sure DS
		 * is not in use when we change it:
		 */
		debugctl = 0;
		update_debugctlmsr(0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
	}

	if (next->debugctlmsr != debugctl)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

#ifdef X86_BTS
	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
#endif
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
			     *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;

		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/*
		 * A segment register != 0 always requires a reload.
		 * Also reload when it has changed.  When the prev
		 * process used a 64bit base, always reload to avoid
		 * an information leak.
		 */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/*
			 * Check if the user used a selector != 0; if yes
			 * clear the 64bit base, since the overloaded base
			 * is always mapped to the Null selector.
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}

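	/*
	 * Note on the reload test above: OR-ing the three values together
	 * and testing once is a branch-cheap way of asking "is any of them
	 * nonzero?" - any nonzero selector or leftover 64bit base forces
	 * the reload.  The same trick is used for %gs below.
	 */
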
	{
		unsigned gsindex;

		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			/* same rule as for %fs above */
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * If the task has used the fpu in the last 5 timeslices, just do a
	 * full restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now.
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math().
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();

	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: this overwrites the user's setup and should really use two
	 * bits.  But 64bit processes have always behaved this way, so it's
	 * not too bad.  The main problem is just that 32bit children are
	 * affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid,
	  struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		       NULL, NULL);
}

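/*
 * To make the comment above concrete (illustrative reasoning, not from
 * the original source): a userspace vfork built on raw clone() would share
 * the parent's stack via CLONE_VM, so the child must not touch the caller's
 * frame before it execs or exits; the return address and any saved state
 * would have to live entirely in registers that survive the syscall, and
 * there are not enough call-clobbered ones for that.
 */
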
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack + THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack + THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp + 8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

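/*
 * The walk above leans on the x86-64 frame-pointer layout (sketch, assuming
 * frame pointers are in use):
 *
 *	[fp + 8]  return address (candidate wchan ip)
 *	[fp + 0]  saved caller %rbp (next fp in the chain)
 *
 * It gives up after 16 frames or as soon as the ip leaves the scheduler.
 */
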
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * Handle small bases via the GDT because that's faster
		 * to switch:
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/*
		 * Not strictly needed for %fs, but do it for symmetry
		 * with %gs:
		 */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * Handle small bases via the GDT because that's faster
		 * to switch:
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/*
				 * Set the selector to 0 to not confuse
				 * __switch_to:
				 */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;

		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;

		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

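/*
 * Userspace view (illustrative sketch): a 64bit threading library sets its
 * TLS base through this entry point, e.g.
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 *
 * where tls_block is a hypothetical pointer to the thread's TLS area.
 * Bases below 4GB take the faster GDT path in do_arch_prctl() above.
 */
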
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

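/*
 * Worked example: the starting stack pointer is lowered by a random
 * 0-8191 bytes (an 8KB window) and then rounded down to a 16-byte
 * boundary, which the x86-64 ABI requires for stack alignment.
 */
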
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;

	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}

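/*
 * Worked example: 0x02000000 is 32MB, so the heap start lands at a
 * page-aligned address in [mm->brk, mm->brk + 32MB).  If randomize_range()
 * fails and returns 0, the ?: fallback keeps the unrandomized mm->brk.
 */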