2 * Copyright (C) 1995 Linus Torvalds
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
10 * CPU hotplug support - ashok.raj@intel.com
14 * This file handles the architecture-dependent parts of process handling..
19 #include <linux/cpu.h>
20 #include <linux/errno.h>
21 #include <linux/sched.h>
22 #include <linux/kernel.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/module.h>
30 #include <linux/a.out.h>
31 #include <linux/interrupt.h>
32 #include <linux/delay.h>
33 #include <linux/ptrace.h>
34 #include <linux/utsname.h>
35 #include <linux/random.h>
36 #include <linux/notifier.h>
37 #include <linux/kprobes.h>
38 #include <linux/kdebug.h>
39 #include <linux/tick.h>
41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
45 #include <asm/processor.h>
47 #include <asm/mmu_context.h>
49 #include <asm/prctl.h>
51 #include <asm/proto.h>
/*
 * Idle-loop globals and module exports.
 * NOTE(review): the extraction dropped interior lines, so some comment
 * delimiters below are missing; code tokens are preserved verbatim.
 */
55 asmlinkage extern void ret_from_fork(void);
/* Default flags for kernel threads: share the VM, never auto-traced. */
57 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
/* Non-zero once "idle=" on the boot command line overrides the default. */
59 unsigned long boot_option_idle_override = 0;
60 EXPORT_SYMBOL(boot_option_idle_override);
63 * Powermanagement idle function, if any..
65 void (*pm_idle)(void);
66 EXPORT_SYMBOL(pm_idle);
/* Per-CPU handshake flag used by cpu_idle_wait() below. */
67 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
/* Chain notified on idle entry/exit with IDLE_START / IDLE_END. */
69 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
/*
 * Idle notifier plumbing (fragment: bodies of enter_idle()/exit_idle()
 * were partially dropped by the extraction).
 */
71 void idle_notifier_register(struct notifier_block *n)
73 atomic_notifier_chain_register(&idle_notifier, n);
/* Fires IDLE_START on the chain — presumably the body of enter_idle(). */
79 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
82 static void __exit_idle(void)
/* Only notify once: bit 0 of the per-CPU PDA "isidle" guards re-entry. */
84 if (test_and_clear_bit_pda(0, isidle) == 0)
86 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
89 /* Called from interrupts to signify idle end */
92 /* idle loop has pid 0 */
/*
 * default_idle — halt the CPU until the next interrupt (fragment; the
 * extraction dropped the braces and the ktime_get() calls that set t0/t1).
 */
99 * We use this if we don't have any better
102 static void default_idle(void)
/* Clear TS_POLLING so the scheduler sends an IPI instead of relying on
 * this CPU polling need_resched. */
104 current_thread_info()->status &= ~TS_POLLING;
106 * TS_POLLING-cleared state must be visible before we
111 if (!need_resched()) {
116 t0n = ktime_to_ns(t0);
117 safe_halt(); /* enables interrupts racelessly */
120 t1n = ktime_to_ns(t1);
/* Tell the scheduler clock how long we were halted. */
121 sched_clock_idle_wakeup_event(t1n - t0n);
124 current_thread_info()->status |= TS_POLLING;
/*
 * poll_idle — busy-wait on need_resched instead of halting (fragment;
 * body dropped by the extraction). Selected with the "idle=poll" option.
 */
128 * On SMP it's slightly faster (but much more power-consuming!)
129 * to poll the ->need_resched flag instead of waiting for the
130 * cross-CPU IPI to arrive. Use this option with caution.
132 static void poll_idle (void)
/* Empty IPI target used purely to wake CPUs out of a deep idle state. */
138 static void do_nothing(void *unused)
/*
 * cpu_idle_wait — wait until every online CPU has passed through the idle
 * loop at least once (used when the idle routine is changed at runtime).
 * Fragment: some loop/brace lines were dropped by the extraction.
 */
142 void cpu_idle_wait(void)
144 unsigned int cpu, this_cpu = get_cpu();
145 cpumask_t map, tmp = current->cpus_allowed;
/* Pin ourselves to this CPU so "all other CPUs" is a stable set. */
147 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
/* Arm the per-CPU flag; each CPU clears it on its next idle iteration. */
151 for_each_online_cpu(cpu) {
152 per_cpu(cpu_idle_state, cpu) = 1;
156 __get_cpu_var(cpu_idle_state) = 0;
/* Poll until every armed CPU has observed idle and cleared its flag. */
161 for_each_online_cpu(cpu) {
162 if (cpu_isset(cpu, map) &&
163 !per_cpu(cpu_idle_state, cpu))
166 cpus_and(map, map, cpu_online_map);
168 * We waited 1 sec, if a CPU still did not call idle
169 * it may be because it is in idle and not waking up
170 * because it has nothing to do.
171 * Give all the remaining CPUS a kick.
173 smp_call_function_mask(map, do_nothing, 0, 0);
174 } while (!cpus_empty(map));
/* Restore the caller's original CPU affinity. */
176 set_cpus_allowed(current, tmp);
178 EXPORT_SYMBOL_GPL(cpu_idle_wait);
/*
 * CPU-offline support: with physical hotplug the dead CPU just halts.
 * Both the hotplug and the !hotplug stub definitions are fragments here.
 */
180 #ifdef CONFIG_HOTPLUG_CPU
181 DECLARE_PER_CPU(int, cpu_state);
184 /* We halt the CPU with physical CPU hotplug */
185 static inline void play_dead(void)
/* Advertise CPU_DEAD so the CPU-up path knows this CPU is gone. */
191 __get_cpu_var(cpu_state) = CPU_DEAD;
/* !CONFIG_HOTPLUG_CPU stub — presumably BUG()s; body dropped. */
198 static inline void play_dead(void)
202 #endif /* CONFIG_HOTPLUG_CPU */
/*
 * cpu_idle — the per-CPU idle loop (fragment: the function's own
 * signature line and several statements were dropped by the extraction).
 */
205 * The idle thread. There's no useful work to be
206 * done, so just try to conserve power and have a
207 * low exit latency (ie sit in a loop waiting for
208 * somebody to say that they'd like to reschedule)
/* Start in polling mode so remote wakeups can skip the IPI. */
212 current_thread_info()->status |= TS_POLLING;
213 /* endless idle loop with no priority at all */
215 while (!need_resched()) {
/* Acknowledge cpu_idle_wait()'s handshake flag. */
218 if (__get_cpu_var(cpu_idle_state))
219 __get_cpu_var(cpu_idle_state) = 0;
221 tick_nohz_stop_sched_tick();
/* A hot-unplugged CPU parks here — presumably calls play_dead(). */
227 if (cpu_is_offline(smp_processor_id()))
230 * Idle routines should keep interrupts disabled
231 * from here on, until they go to idle.
232 * Otherwise, idle callbacks can misfire.
237 /* In many cases the interrupt that ended idle
238 has already called exit_idle. But some idle
239 loops can be woken up without interrupt. */
243 tick_nohz_restart_sched_tick();
244 preempt_enable_no_resched();
251 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
252 * which can obviate IPI to trigger checking of need_resched.
253 * We execute MONITOR against need_resched and enter optimized wait state
254 * through MWAIT. Whenever someone changes need_resched, we would be woken
255 * up from MWAIT (without an IPI).
257 * New with Core Duo processors, MWAIT can take some hints based on CPU
260 void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
262 if (!need_resched()) {
263 __monitor((void *)¤t_thread_info()->flags, 0, 0);
270 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
271 static void mwait_idle(void)
273 if (!need_resched()) {
274 __monitor((void *)¤t_thread_info()->flags, 0, 0);
/*
 * select_idle_routine — pick mwait_idle as pm_idle when the CPU supports
 * MONITOR/MWAIT (fragment; braces and the override check were dropped).
 */
285 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
288 if (cpu_has(c, X86_FEATURE_MWAIT)) {
290 * Skip, if setup has overridden idle.
291 * One CPU supports mwait => All CPUs supports mwait
295 printk(KERN_INFO "using mwait in idle threads.\n");
298 pm_idle = mwait_idle;
/*
 * idle_setup — parse the "idle=" early boot parameter (fragment; the
 * pm_idle assignment and return were dropped by the extraction).
 */
303 static int __init idle_setup (char *str)
/* "idle=poll": busy-wait idle; presumably sets pm_idle = poll_idle. */
305 if (!strcmp(str, "poll")) {
306 printk("using polling idle threads.\n");
/* "idle=mwait": force MWAIT-based idle. */
308 } else if (!strcmp(str, "mwait"))
/* Remember that the user overrode the automatic selection. */
313 boot_option_idle_override = 1;
316 early_param("idle", idle_setup);
/*
 * __show_regs — dump the full x86-64 register state for debugging:
 * pt_regs contents plus segment selectors, segment bases (from MSRs),
 * and control/debug registers. Fragment: the reads of cr0..cr4 and
 * d0..d7 were dropped by the extraction, so those locals appear unset.
 */
318 /* Prints also some state that isn't saved in the pt_regs */
319 void __show_regs(struct pt_regs * regs)
321 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
322 unsigned long d0, d1, d2, d3, d6, d7;
323 unsigned int fsindex,gsindex;
324 unsigned int ds,cs,es;
/* Identify the task and kernel version in the dump header. */
328 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
329 current->pid, current->comm, print_tainted(),
330 init_utsname()->release,
331 (int)strcspn(init_utsname()->version, " "),
332 init_utsname()->version);
333 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
334 printk_address(regs->rip);
335 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
337 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
338 regs->rax, regs->rbx, regs->rcx);
339 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
340 regs->rdx, regs->rsi, regs->rdi);
341 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
342 regs->rbp, regs->r8, regs->r9);
343 printk("R10: %016lx R11: %016lx R12: %016lx\n",
344 regs->r10, regs->r11, regs->r12);
345 printk("R13: %016lx R14: %016lx R15: %016lx\n",
346 regs->r13, regs->r14, regs->r15);
/* Read the live segment selectors directly from the CPU. */
348 asm("movl %%ds,%0" : "=r" (ds));
349 asm("movl %%cs,%0" : "=r" (cs));
350 asm("movl %%es,%0" : "=r" (es));
351 asm("movl %%fs,%0" : "=r" (fsindex));
352 asm("movl %%gs,%0" : "=r" (gsindex));
/* 64-bit FS/GS bases live in MSRs, not in the selectors. */
354 rdmsrl(MSR_FS_BASE, fs);
355 rdmsrl(MSR_GS_BASE, gs);
356 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
363 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
364 fs,fsindex,gs,gsindex,shadowgs);
365 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
366 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
371 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
375 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
/*
 * show_regs — public register dump: CPU id, registers, then a stack
 * trace starting just above the saved pt_regs (fragment; braces and the
 * __show_regs() call were dropped by the extraction).
 */
378 void show_regs(struct pt_regs *regs)
380 printk("CPU %d:", smp_processor_id());
382 show_trace(NULL, regs, (void *)(regs + 1));
/*
 * exit_thread — release per-thread architectural state at task exit.
 * Here: free the I/O permission bitmap and invalidate the TSS copy.
 * Fragment: braces and the matching put_cpu() were dropped.
 */
386 * Free current thread data structures etc..
388 void exit_thread(void)
390 struct task_struct *me = current;
391 struct thread_struct *t = &me->thread;
393 if (me->thread.io_bitmap_ptr) {
/* get_cpu() pins us so the TSS we clear is this CPU's TSS. */
394 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
396 kfree(t->io_bitmap_ptr);
397 t->io_bitmap_ptr = NULL;
398 clear_thread_flag(TIF_IO_BITMAP);
400 * Careful, clear this in the TSS too:
/* 0xff = all ports denied, up to the previously-used extent. */
402 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
403 t->io_bitmap_max = 0;
/*
 * flush_thread — reset per-thread state on exec: settle any pending
 * 32/64-bit ABI switch, clear debug registers and TLS entries.
 * Fragment: braces and the FPU-forget tail were dropped.
 */
408 void flush_thread(void)
410 struct task_struct *tsk = current;
/* A pending ABI change (set by the ELF loader) is applied here:
 * toggle TIF_IA32 to the opposite of its current state. */
412 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
413 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
414 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
415 clear_tsk_thread_flag(tsk, TIF_IA32);
417 set_tsk_thread_flag(tsk, TIF_IA32);
/* Entering compat mode: mark the thread for 32-bit syscall handling. */
418 current_thread_info()->status |= TS_COMPAT;
421 clear_tsk_thread_flag(tsk, TIF_DEBUG);
/* Fresh image gets no inherited hardware breakpoints. */
423 tsk->thread.debugreg0 = 0;
424 tsk->thread.debugreg1 = 0;
425 tsk->thread.debugreg2 = 0;
426 tsk->thread.debugreg3 = 0;
427 tsk->thread.debugreg6 = 0;
428 tsk->thread.debugreg7 = 0;
429 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
431 * Forget coprocessor state..
/*
 * release_thread — final per-task teardown; warns if the dead task still
 * owns an LDT (fragment; the release_vm86_irqs/BUG tail was dropped).
 */
437 void release_thread(struct task_struct *dead_task)
440 if (dead_task->mm->context.size) {
/* An LDT surviving to this point indicates a leak elsewhere. */
441 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
443 dead_task->mm->context.ldt,
444 dead_task->mm->context.size);
/*
 * set_32bit_tls — install a 32-bit base address as a GDT TLS descriptor
 * for the task (fragment: the user_desc initializer fields and braces
 * were dropped by the extraction).
 */
450 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
452 struct user_desc ud = {
/* Encode the user_desc into the raw two-word descriptor format. */
459 struct n_desc_struct *desc = (void *)t->thread.tls_array;
461 desc->a = LDT_entry_a(&ud);
462 desc->b = LDT_entry_b(&ud);
/* read_32bit_tls — recover the base address stored by set_32bit_tls. */
465 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
467 return get_desc_base(&t->thread.tls_array[tls]);
/*
 * prepare_to_copy — pre-fork hook; presumably unlazies the FPU so the
 * child gets a consistent copy (body dropped by the extraction).
 */
471 * This gets called before we allocate a new thread and copy
472 * the current task into it.
474 void prepare_to_copy(struct task_struct *tsk)
/*
 * copy_thread — set up the child's kernel stack, pt_regs, segment state,
 * optional I/O bitmap copy, and optional new TLS on fork/clone.
 * Fragment: braces, error-path gotos and the final return were dropped.
 */
479 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
480 unsigned long unused,
481 struct task_struct * p, struct pt_regs * regs)
484 struct pt_regs * childregs;
485 struct task_struct *me = current;
/* Child's pt_regs live at the very top of its kernel stack. */
487 childregs = ((struct pt_regs *)
488 (THREAD_SIZE + task_stack_page(p))) - 1;
/* User thread: start on the caller-supplied stack pointer. */
492 childregs->rsp = rsp;
/* Kernel thread (no user rsp): stack starts at the regs frame itself. */
494 childregs->rsp = (unsigned long)childregs;
496 p->thread.rsp = (unsigned long) childregs;
497 p->thread.rsp0 = (unsigned long) (childregs+1);
498 p->thread.userrsp = me->thread.userrsp;
/* TIF_FORK routes the first schedule through ret_from_fork. */
500 set_tsk_thread_flag(p, TIF_FORK);
/* Inherit the parent's segment bases and live selectors. */
502 p->thread.fs = me->thread.fs;
503 p->thread.gs = me->thread.gs;
505 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
506 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
507 asm("mov %%es,%0" : "=m" (p->thread.es));
508 asm("mov %%ds,%0" : "=m" (p->thread.ds));
/* The I/O permission bitmap is per-thread; duplicate it for the child. */
510 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
511 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
512 if (!p->thread.io_bitmap_ptr) {
513 p->thread.io_bitmap_max = 0;
516 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
518 set_tsk_thread_flag(p, TIF_IO_BITMAP);
522 * Set a new TLS for the child thread?
524 if (clone_flags & CLONE_SETTLS) {
525 #ifdef CONFIG_IA32_EMULATION
/* 32-bit child: TLS argument is a user_desc pointer in rsi. */
526 if (test_thread_flag(TIF_IA32))
527 err = do_set_thread_area(p, -1,
528 (struct user_desc __user *)childregs->rsi, 0);
/* 64-bit child: TLS argument is a raw FS base in r8. */
531 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
/* On error, undo the I/O bitmap allocation made above. */
537 if (err && p->thread.io_bitmap_ptr) {
538 kfree(p->thread.io_bitmap_ptr);
539 p->thread.io_bitmap_max = 0;
/*
 * __switch_to_xtra — slow-path context-switch work: reload debug
 * registers and update the TSS I/O bitmap when either task uses them.
 * Fragment: the loaddebug(...) calls and braces were dropped.
 */
545 * This special macro can be used to load a debugging register
547 #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
549 static inline void __switch_to_xtra(struct task_struct *prev_p,
550 struct task_struct *next_p,
551 struct tss_struct *tss)
553 struct thread_struct *prev, *next;
555 prev = &prev_p->thread,
556 next = &next_p->thread;
/* Incoming task uses hardware breakpoints: reload DR0-DR7. */
558 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
562 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
570 * Copy the relevant range of the IO bitmap.
571 * Normally this is 128 bytes or less:
/* Cover the union of both tasks' extents so stale grants are overwritten. */
573 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
574 max(prev->io_bitmap_max, next->io_bitmap_max));
575 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
577 * Clear any possible leftover bits:
579 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
/*
 * __switch_to — the architecture context switch: save/restore segment
 * selectors and bases, swap the PDA (per-CPU data) pointers, and defer
 * debug-register/IO-bitmap work to __switch_to_xtra().
 * Fragment: the return type line, fsindex/gsindex declarations, several
 * statements and the return were dropped by the extraction.
 */
584 * switch_to(x,y) should switch tasks from x to y.
586 * This could still be optimized:
587 * - fold all the options into a flag word and test it with a single test.
588 * - could test fs/gs bitsliced
590 * Kprobes not supported here. Set the probe on schedule instead.
593 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
595 struct thread_struct *prev = &prev_p->thread,
596 *next = &next_p->thread;
597 int cpu = smp_processor_id();
598 struct tss_struct *tss = &per_cpu(init_tss, cpu);
600 /* we're going to use this soon, after a few expensive things */
601 if (next_p->fpu_counter>5)
602 prefetch(&next->i387.fxsave);
605 * Reload esp0, LDT and the page table pointer:
607 tss->rsp0 = next->rsp0;
611 * This won't pick up thread selector changes, but I guess that is ok.
/* Save outgoing ES/DS; reload only when either side is non-zero. */
613 asm volatile("mov %%es,%0" : "=m" (prev->es));
614 if (unlikely(next->es | prev->es))
615 loadsegment(es, next->es);
617 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
618 if (unlikely(next->ds | prev->ds))
619 loadsegment(ds, next->ds);
628 asm volatile("movl %%fs,%0" : "=r" (fsindex));
629 /* segment register != 0 always requires a reload.
630 also reload when it has changed.
631 when prev process used 64bit base always reload
632 to avoid an information leak. */
633 if (unlikely(fsindex | next->fsindex | prev->fs)) {
634 loadsegment(fs, next->fsindex);
635 /* check if the user used a selector != 0
636 * if yes clear 64bit base, since overloaded base
637 * is always mapped to the Null selector
642 /* when next process has a 64bit base use it */
644 wrmsrl(MSR_FS_BASE, next->fs);
645 prev->fsindex = fsindex;
/* Same dance for GS, via load_gs_index / MSR_KERNEL_GS_BASE. */
649 asm volatile("movl %%gs,%0" : "=r" (gsindex));
650 if (unlikely(gsindex | next->gsindex | prev->gs)) {
651 load_gs_index(next->gsindex);
656 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
657 prev->gsindex = gsindex;
660 /* Must be after DS reload */
664 * Switch the PDA and FPU contexts.
666 prev->userrsp = read_pda(oldrsp);
667 write_pda(oldrsp, next->userrsp);
668 write_pda(pcurrent, next_p);
670 write_pda(kernelstack,
671 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
672 #ifdef CONFIG_CC_STACKPROTECTOR
673 write_pda(stack_canary, next_p->stack_canary);
675 * Build time only check to make sure the stack_canary is at
676 * offset 40 in the pda; this is a gcc ABI requirement
678 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
682 * Now maybe reload the debug registers and handle I/O bitmaps
684 if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
685 || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
686 __switch_to_xtra(prev_p, next_p, tss);
688 /* If the task has used fpu the last 5 timeslices, just do a full
689 * restore of the math state immediately to avoid the trap; the
690 * chances of needing FPU soon are obviously high now
692 if (next_p->fpu_counter>5)
693 math_state_restore();
698 * sys_execve() executes a new program.
701 long sys_execve(char __user *name, char __user * __user *argv,
702 char __user * __user *envp, struct pt_regs regs)
707 filename = getname(name);
708 error = PTR_ERR(filename);
709 if (IS_ERR(filename))
711 error = do_execve(filename, argv, envp, ®s);
714 current->ptrace &= ~PT_DTRACE;
715 task_unlock(current);
/*
 * set_personality_64bit — switch the current task to the native 64-bit
 * ABI on exec (fragment: braces dropped by the extraction).
 */
721 void set_personality_64bit(void)
723 /* inherit personality from parent */
725 /* Make sure to be in 64bit mode */
726 clear_thread_flag(TIF_IA32);
728 /* TBD: overwrites user setup. Should have two bits.
729 But 64bit processes have always behaved this way,
730 so it's not too bad. The main problem is just that
731 32bit childs are affected again. */
732 current->personality &= ~READ_IMPLIES_EXEC;
/*
 * fork/clone system-call entry points — thin wrappers around do_fork()
 * (fragment: braces and sys_clone's "if (!newsp) newsp = regs->rsp;"
 * style prologue were dropped by the extraction).
 */
735 asmlinkage long sys_fork(struct pt_regs *regs)
737 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
741 sys_clone(unsigned long clone_flags, unsigned long newsp,
742 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
746 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
/*
 * sys_vfork — vfork entry point; CLONE_VFORK|CLONE_VM makes the parent
 * block until the child execs or exits (fragment: the continuation line
 * with the trailing NULL, NULL arguments was dropped).
 */
750 * This is trivial, and on the face of it looks like it
751 * could equally well be done in user mode.
753 * Not so, for quite unobvious reasons - register pressure.
754 * In user mode vfork() cannot have a stack frame, and if
755 * done by calling the "clone()" system call directly, you
756 * do not have enough call-clobbered registers to hold all
757 * the information you need.
759 asmlinkage long sys_vfork(struct pt_regs *regs)
761 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
/*
 * get_wchan — walk the sleeping task's saved frame-pointer chain (at
 * most 16 frames) to find the first return address outside the
 * scheduler, i.e. where the task is blocked. Fragment: declarations of
 * stack/fp/rip/count and the do { / return lines were dropped.
 */
765 unsigned long get_wchan(struct task_struct *p)
/* Only meaningful for a task that exists, isn't us, and isn't running. */
771 if (!p || p == current || p->state==TASK_RUNNING)
773 stack = (unsigned long)task_stack_page(p);
/* Saved rsp must lie within the task's own kernel stack. */
774 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
776 fp = *(u64 *)(p->thread.rsp);
/* Validate each frame pointer before dereferencing it. */
778 if (fp < (unsigned long)stack ||
779 fp > (unsigned long)stack+THREAD_SIZE)
/* Return address sits just above the saved frame pointer. */
781 rip = *(u64 *)(fp+8);
782 if (!in_sched_functions(rip))
785 } while (count++ < 16);
/*
 * do_arch_prctl — implement ARCH_SET_FS / ARCH_SET_GS /
 * ARCH_GET_FS / ARCH_GET_GS: manage the 64-bit FS/GS segment bases.
 * Small (<4GiB) bases go through a GDT TLS slot; large bases use the
 * FS_BASE / KERNEL_GS_BASE MSRs with a null selector. "doit" guards the
 * hardware writes so remote (ptrace) callers only touch the saved state.
 * Fragment: the switch/case labels, braces and several declarations were
 * dropped by the extraction.
 */
789 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
792 int doit = task == current;
/* ARCH_SET_GS path: reject kernel addresses. */
797 if (addr >= TASK_SIZE_OF(task))
800 /* handle small bases via the GDT because that's faster to
802 if (addr <= 0xffffffff) {
803 set_32bit_tls(task, GS_TLS, addr);
805 load_TLS(&task->thread, cpu);
806 load_gs_index(GS_TLS_SEL);
808 task->thread.gsindex = GS_TLS_SEL;
/* Large base: null selector + MSR-provided base. */
811 task->thread.gsindex = 0;
812 task->thread.gs = addr;
815 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
/* ARCH_SET_FS path — mirrors the GS handling above. */
821 /* Not strictly needed for fs, but do it for symmetry
823 if (addr >= TASK_SIZE_OF(task))
826 /* handle small bases via the GDT because that's faster to
828 if (addr <= 0xffffffff) {
829 set_32bit_tls(task, FS_TLS, addr);
831 load_TLS(&task->thread, cpu);
832 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
834 task->thread.fsindex = FS_TLS_SEL;
837 task->thread.fsindex = 0;
838 task->thread.fs = addr;
840 /* set the selector to 0 to not confuse
842 asm volatile("movl %0,%%fs" :: "r" (0));
843 ret = checking_wrmsrl(MSR_FS_BASE, addr);
/* ARCH_GET_FS: read back the base from TLS slot, MSR, or saved state. */
850 if (task->thread.fsindex == FS_TLS_SEL)
851 base = read_32bit_tls(task, FS_TLS);
853 rdmsrl(MSR_FS_BASE, base);
855 base = task->thread.fs;
856 ret = put_user(base, (unsigned long __user *)addr);
/* ARCH_GET_GS: same, but must check the live GS selector first. */
862 if (task->thread.gsindex == GS_TLS_SEL)
863 base = read_32bit_tls(task, GS_TLS);
865 asm("movl %%gs,%0" : "=r" (gsindex));
867 rdmsrl(MSR_KERNEL_GS_BASE, base);
869 base = task->thread.gs;
872 base = task->thread.gs;
873 ret = put_user(base, (unsigned long __user *)addr);
/* Syscall entry point: operate on the current task. */
885 long sys_arch_prctl(int code, unsigned long addr)
887 return do_arch_prctl(current, code, addr);
/*
 * dump_task_regs — copy a task's user-space registers into an ELF core
 * gregset (fragment: the local copy into ptregs and the return were
 * dropped by the extraction).
 */
891 * Capture the user space registers if the task is not running (in user space)
893 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
895 struct pt_regs *pp, ptregs;
897 pp = task_pt_regs(tsk);
903 elf_core_copy_regs(regs, &ptregs);
/*
 * arch_align_stack — apply up to 8KiB of downward stack randomization on
 * exec unless the personality disables it (fragment: the final alignment
 * and return were dropped by the extraction).
 */
908 unsigned long arch_align_stack(unsigned long sp)
910 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
911 sp -= get_random_int() % 8192;
/*
 * arch_randomize_brk — pick a randomized heap start within 32MiB above
 * the current brk; falls back to mm->brk if randomize_range() returns 0.
 */
915 unsigned long arch_randomize_brk(struct mm_struct *mm)
917 unsigned long range_end = mm->brk + 0x02000000;
918 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;