/*
 * linux/arch/x86-64/kernel/process.c
 *
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);
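
/*
 * Usage sketch (illustrative only; my_idle_cb and my_idle_nb are
 * hypothetical names, not part of this file): a callback on the chain
 * can tell idle entry from idle exit by the IDLE_START/IDLE_END action
 * codes that enter_idle()/__exit_idle() below pass down the chain.
 */
#if 0        /* example only, not compiled */
static int my_idle_cb(struct notifier_block *nb, unsigned long action,
                      void *data)
{
        if (action == IDLE_START)
                ;        /* this CPU is entering the idle loop */
        else if (action == IDLE_END)
                ;        /* this CPU is leaving the idle loop */
        return NOTIFY_OK;
}

static struct notifier_block my_idle_nb = {
        .notifier_call = my_idle_cb,
};
/* registered somewhere with: idle_notifier_register(&my_idle_nb); */
#endif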
void enter_idle(void)
{
        write_pda(isidle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (test_and_clear_bit_pda(0, isidle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}
/*
 * We use this if we don't have any better
 * idle routine.
 */
static void default_idle(void)
{
        current_thread_info()->status &= ~TS_POLLING;
        smp_mb__after_clear_bit();
        while (!need_resched()) {
                local_irq_disable();
                if (!need_resched())
                        safe_halt();
                else
                        local_irq_enable();
        }
        current_thread_info()->status |= TS_POLLING;
}
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
        local_irq_enable();

        asm volatile(
                "2:"
                "testl %0,%1;"
                "rep; nop;"
                "je 2b;"
                : :
                "i" (_TIF_NEED_RESCHED),
                "m" (current_thread_info()->flags));
}
void cpu_idle_wait(void)
{
        unsigned int cpu, this_cpu = get_cpu();
        cpumask_t map, tmp = current->cpus_allowed;

        set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
        put_cpu();

        cpus_clear(map);
        for_each_online_cpu(cpu) {
                per_cpu(cpu_idle_state, cpu) = 1;
                cpu_set(cpu, map);
        }

        __get_cpu_var(cpu_idle_state) = 0;

        wmb();
        do {
                ssleep(1);
                for_each_online_cpu(cpu) {
                        if (cpu_isset(cpu, map) &&
                            !per_cpu(cpu_idle_state, cpu))
                                cpu_clear(cpu, map);
                }
                cpus_and(map, map, cpu_online_map);
        } while (!cpus_empty(map));

        set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
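
/*
 * Typical use (sketch, not from this file): code that retargets pm_idle
 * at runtime calls cpu_idle_wait() afterwards so that no CPU can still
 * be executing the old idle handler, e.g.
 *
 *        pm_idle = my_new_idle;        (my_new_idle is hypothetical)
 *        cpu_idle_wait();
 */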
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
        idle_task_exit();
        wbinvd();
        mb();
        /* Ack it */
        __get_cpu_var(cpu_state) = CPU_DEAD;

        local_irq_disable();
        while (1)
                halt();
}
#else
static inline void play_dead(void)
{
        BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;
        /* endless idle loop with no priority at all */
        while (1) {
                while (!need_resched()) {
                        void (*idle)(void);

                        if (__get_cpu_var(cpu_idle_state))
                                __get_cpu_var(cpu_idle_state) = 0;

                        rmb();
                        idle = pm_idle;
                        if (!idle)
                                idle = default_idle;
                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        enter_idle();
                        idle();
                        /* In many cases the interrupt that ended idle
                           has already called exit_idle. But some idle
                           loops can be woken up without interrupt. */
                        __exit_idle();
                }

                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __mwait(eax, ecx);
        }
}
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
        local_irq_enable();
        while (!need_resched())
                mwait_idle_with_hints(0, 0);
}
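
/*
 * A non-default caller (sketch): an idle driver that wants a deeper
 * C state passes that state's MWAIT hint in eax instead of the (0, 0)
 * used above, e.g. mwait_idle_with_hints(hint, 0) where "hint" comes
 * from firmware tables; the exact hint encoding is CPU-specific.
 */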
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
        static int printed;

        if (cpu_has(c, X86_FEATURE_MWAIT)) {
                /*
                 * Skip, if setup has overridden idle.
                 * One CPU supports mwait => All CPUs support mwait
                 */
                if (!pm_idle) {
                        if (!printed) {
                                printk("using mwait in idle threads.\n");
                                printed = 1;
                        }
                        pm_idle = mwait_idle;
                }
        }
}
static int __init idle_setup(char *str)
{
        if (!strncmp(str, "poll", 4)) {
                printk("using polling idle threads.\n");
                pm_idle = poll_idle;
        }

        boot_option_idle_override = 1;
        return 1;
}

__setup("idle=", idle_setup);
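
/*
 * Usage: booting with "idle=poll" on the kernel command line selects
 * poll_idle above. boot_option_idle_override records that the user made
 * an explicit choice, so other code leaves the idle routine alone.
 */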
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        printk("\n");
        print_modules();
        printk("Pid: %d, comm: %.20s %s %s %.*s\n",
                current->pid, current->comm, print_tainted(),
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
        printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
        printk_address(regs->rip);
        printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
                regs->eflags);
        printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->rax, regs->rbx, regs->rcx);
        printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->rdx, regs->rsi, regs->rdi);
        printk("RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->rbp, regs->r8, regs->r9);
        printk("R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk("R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        asm("movq %%cr0, %0": "=r" (cr0));
        asm("movq %%cr2, %0": "=r" (cr2));
        asm("movq %%cr3, %0": "=r" (cr3));
        asm("movq %%cr4, %0": "=r" (cr4));

        printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
        printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}
void show_regs(struct pt_regs *regs)
{
        printk("CPU %d:", smp_processor_id());
        __show_regs(regs);
        show_trace(NULL, regs, (void *)(regs + 1));
}
/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;

        if (me->thread.io_bitmap_ptr) {
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

                kfree(t->io_bitmap_ptr);
                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
                /*
                 * Careful, clear this in the TSS too:
                 */
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                t->io_bitmap_max = 0;
                put_cpu();
        }
}
void flush_thread(void)
{
        struct task_struct *tsk = current;
        struct thread_info *t = current_thread_info();

        if (t->flags & _TIF_ABI_PENDING) {
                t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
                if (t->flags & _TIF_IA32)
                        current_thread_info()->status |= TS_COMPAT;
        }
        t->flags &= ~_TIF_DEBUG;

        tsk->thread.debugreg0 = 0;
        tsk->thread.debugreg1 = 0;
        tsk->thread.debugreg2 = 0;
        tsk->thread.debugreg3 = 0;
        tsk->thread.debugreg6 = 0;
        tsk->thread.debugreg7 = 0;
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        /*
         * Forget coprocessor state..
         */
        clear_fpu(tsk);
        clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                        dead_task->comm,
                                        dead_task->mm->context.ldt,
                                        dead_task->mm->context.size);
                        BUG();
                }
        }
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct n_desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        desc->a = LDT_entry_a(&ud);
        desc->b = LDT_entry_b(&ud);
}
static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        struct desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        return desc->base0 |
                (((u32)desc->base1) << 16) |
                (((u32)desc->base2) << 24);
}
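
/*
 * The 32 bit base lives scattered across the descriptor as base0
 * (bits 0-15), base1 (bits 16-23) and base2 (bits 24-31); the shifts
 * above simply reassemble it into one u32.
 */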
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
                unsigned long unused,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->rax = 0;
        childregs->rsp = rsp;
        if (rsp == ~0UL)
                childregs->rsp = (unsigned long)childregs;

        p->thread.rsp = (unsigned long) childregs;
        p->thread.rsp0 = (unsigned long) (childregs+1);
        p->thread.userrsp = me->thread.userrsp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;

        asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
        asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
        asm("mov %%es,%0" : "=m" (p->thread.es));
        asm("mov %%ds,%0" : "=m" (p->thread.ds));

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                                IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = ia32_child_tls(p, childregs);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
        return err;
}
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
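
/*
 * For example, with a struct thread_struct *next as in the function
 * below, loaddebug(next, 7) expands to set_debugreg(next->debugreg7, 7).
 */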
static inline void __switch_to_xtra(struct task_struct *prev_p,
                                    struct task_struct *next_p,
                                    struct tss_struct *tss)
{
        struct thread_struct *prev, *next;

        prev = &prev_p->thread;
        next = &next_p->thread;

        if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
                loaddebug(next, 0);
                loaddebug(next, 1);
                loaddebug(next, 2);
                loaddebug(next, 3);
                /* no 4 and 5 */
                loaddebug(next, 6);
                loaddebug(next, 7);
        }

        if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
                /*
                 * Copy the relevant range of the IO bitmap.
                 * Normally this is 128 bytes or less:
                 */
                memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                       max(prev->io_bitmap_max, next->io_bitmap_max));
        } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
                /*
                 * Clear any possible leftover bits:
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
}
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread,
                             *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);

        /* we're going to use this soon, after a few expensive things */
        if (next_p->fpu_counter > 5)
                prefetch(&next->i387.fxsave);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        tss->rsp0 = next->rsp0;

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        asm volatile("mov %%es,%0" : "=m" (prev->es));
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        load_TLS(next, cpu);

        /*
         * Switch FS and GS.
         */
        {
                unsigned fsindex;
                asm volatile("movl %%fs,%0" : "=r" (fsindex));
                /* segment register != 0 always requires a reload.
                   also reload when it has changed.
                   when prev process used 64bit base always reload
                   to avoid an information leak. */
                if (unlikely(fsindex | next->fsindex | prev->fs)) {
                        loadsegment(fs, next->fsindex);
                        /* check if the user used a selector != 0
                         * if yes clear 64bit base, since overloaded base
                         * is always mapped to the Null selector
                         */
                        if (fsindex)
                                prev->fs = 0;
                }
                /* when next process has a 64bit base use it */
                if (next->fs)
                        wrmsrl(MSR_FS_BASE, next->fs);
                prev->fsindex = fsindex;
        }
        {
                unsigned gsindex;
                asm volatile("movl %%gs,%0" : "=r" (gsindex));
                if (unlikely(gsindex | next->gsindex | prev->gs)) {
                        load_gs_index(next->gsindex);
                        if (gsindex)
                                prev->gs = 0;
                }
                if (next->gs)
                        wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
                prev->gsindex = gsindex;
        }

        /* Must be after DS reload */
        unlazy_fpu(prev_p);

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->userrsp = read_pda(oldrsp);
        write_pda(oldrsp, next->userrsp);
        write_pda(pcurrent, next_p);

        write_pda(kernelstack,
        (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
        write_pda(stack_canary, next_p->stack_canary);
        /*
         * Build time only check to make sure the stack_canary is at
         * offset 40 in the pda; this is a gcc ABI requirement
         */
        BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
            || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
                __switch_to_xtra(prev_p, next_p, tss);

        /* If the task has used fpu the last 5 timeslices, just do a full
         * restore of the math state immediately to avoid the trap; the
         * chances of needing FPU soon are obviously high now
         */
        if (next_p->fpu_counter > 5)
                math_state_restore();
        return prev_p;
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs regs)
{
        long error;
        char *filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, &regs);
        if (error == 0) {
                task_lock(current);
                current->ptrace &= ~PT_DTRACE;
                task_unlock(current);
        }
        putname(filename);
        return error;
}
void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);

        /* TBD: overwrites user setup. Should have two bits.
           But 64bit processes have always behaved this way,
           so it's not too bad. The main problem is just that
           32bit children are affected again. */
        current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
        return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->rsp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
        return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
                       NULL, NULL);
}
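
/*
 * User-space sketch (illustrative only) of the pattern the above
 * supports: the parent is suspended until the child execs or exits,
 * and the two share the address space in between.
 *
 *        pid_t pid = vfork();
 *        if (pid == 0) {
 *                execl("/bin/true", "true", (char *)NULL);
 *                _exit(127);
 *        }
 */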
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, rip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.rsp);
        do {
                if (fp < (unsigned long)stack ||
                    fp > (unsigned long)stack+THREAD_SIZE)
                        return 0;
                rip = *(u64 *)(fp+8);
                if (!in_sched_functions(rip))
                        return rip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}
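
/*
 * Frame layout assumed by the walk above (standard frame-pointer
 * chain on x86-64):
 *
 *        [fp]        saved caller %rbp, i.e. the next frame pointer
 *        [fp+8]      return address (%rip) into the caller
 */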
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                asm volatile("movl %0,%%fs" :: "r" (0));
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        asm("movl %%gs,%0" : "=r" (gsindex));
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}
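
/*
 * User-space usage sketch (illustrative only): the bases managed by
 * do_arch_prctl() above are reachable from user mode through the
 * arch_prctl syscall, e.g.
 *
 *        #include <asm/prctl.h>
 *        #include <sys/syscall.h>
 *        #include <unistd.h>
 *
 *        unsigned long base;
 *        syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 */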
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
        struct pt_regs *pp, ptregs;

        pp = task_pt_regs(tsk);

        ptregs = *pp;
        ptregs.cs &= 0xffff;
        ptregs.ss &= 0xffff;

        elf_core_copy_regs(regs, &ptregs);

        return 1;
}
unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        /* keep the stack 16-byte aligned after the random shift */
        return sp & ~0xf;
}