From b460cbc581a53cc088ceba80608021dd49c63c43 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Thu, 18 Oct 2007 23:39:52 -0700 Subject: [PATCH] pid namespaces: define is_global_init() and is_container_init() is_init() is an ambiguous name for the pid==1 check. Split it into is_global_init() and is_container_init(). A cgroup init has its tsk->pid == 1. A global init also has its tsk->pid == 1 and its active pid namespace is the init_pid_ns. But rather than check the active pid namespace, compare the task structure with 'init_pid_ns.child_reaper', which is initialized during boot to the /sbin/init process and never changes. Changelog: 2.6.22-rc4-mm2-pidns1: - Use 'init_pid_ns.child_reaper' to determine if a given task is the global init (/sbin/init) process. This would improve performance and remove dependence on the task_pid(). 2.6.21-mm2-pidns2: - [Sukadev Bhattiprolu] Changed is_container_init() calls in {powerpc, ppc,avr32}/traps.c for the _exception() call to is_global_init(). This way, we kill only the cgroup if the cgroup's init has a bug rather than force a kernel panic. [akpm@linux-foundation.org: fix comment] [sukadev@us.ibm.com: Use is_global_init() in arch/m32r/mm/fault.c] [bunk@stusta.de: kernel/pid.c: remove unused exports] [sukadev@us.ibm.com: Fix capability.c to work with threaded init] Signed-off-by: Serge E. Hallyn Signed-off-by: Sukadev Bhattiprolu Acked-by: Pavel Emelianov Cc: Eric W. 
Biederman Cc: Cedric Le Goater Cc: Dave Hansen Cc: Herbert Poetzel Cc: Kirill Korotaev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/fault.c | 2 +- arch/arm/mm/fault.c | 2 +- arch/avr32/kernel/traps.c | 2 +- arch/avr32/mm/fault.c | 6 +++--- arch/ia64/mm/fault.c | 2 +- arch/m32r/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/powerpc/kernel/traps.c | 2 +- arch/powerpc/mm/fault.c | 2 +- arch/powerpc/platforms/pseries/ras.c | 2 +- arch/ppc/kernel/traps.c | 2 +- arch/ppc/mm/fault.c | 2 +- arch/s390/lib/uaccess_pt.c | 2 +- arch/s390/mm/fault.c | 2 +- arch/sh/mm/fault.c | 2 +- arch/sh64/mm/fault.c | 6 +++--- arch/um/kernel/trap.c | 2 +- arch/x86/lib/usercopy_32.c | 2 +- arch/x86/mm/fault_32.c | 2 +- arch/x86/mm/fault_64.c | 2 +- arch/xtensa/mm/fault.c | 2 +- drivers/char/sysrq.c | 2 +- include/linux/sched.h | 12 ++++++++++-- kernel/capability.c | 3 ++- kernel/exit.c | 2 +- kernel/kexec.c | 2 +- kernel/pid.c | 5 +++++ kernel/signal.c | 2 +- kernel/sysctl.c | 2 +- mm/oom_kill.c | 4 ++-- security/commoncap.c | 3 ++- 32 files changed, 52 insertions(+), 37 deletions(-) diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 25154df305..e0593e6061 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -188,7 +188,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, /* We ran out of memory, or some other thing happened to us that made us unable to handle the page fault gracefully. 
*/ out_of_memory: - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 59ed1d05b7..a8a7dab757 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -197,7 +197,7 @@ survive: return fault; out_of_memory: - if (!is_init(tsk)) + if (!is_global_init(tsk)) goto out; /* diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c index 9a73ce7eb5..8a7caf8e7b 100644 --- a/arch/avr32/kernel/traps.c +++ b/arch/avr32/kernel/traps.c @@ -89,7 +89,7 @@ void _exception(long signr, struct pt_regs *regs, int code, * generate the same exception over and over again and we get * nowhere. Better to kill it and let the kernel panic. */ - if (is_init(current)) { + if (is_global_init(current)) { __sighandler_t handler; spin_lock_irq(&current->sighand->siglock); diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index 11472f8701..6560cb18b4 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -160,7 +160,7 @@ bad_area: if (exception_trace && printk_ratelimit()) printk("%s%s[%d]: segfault at %08lx pc %08lx " "sp %08lx ecr %lu\n", - is_init(tsk) ? KERN_EMERG : KERN_INFO, + is_global_init(tsk) ? KERN_EMERG : KERN_INFO, tsk->comm, tsk->pid, address, regs->pc, regs->sp, ecr); _exception(SIGSEGV, regs, code, address); @@ -209,7 +209,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; @@ -231,7 +231,7 @@ do_sigbus: if (exception_trace) printk("%s%s[%d]: bus error at %08lx pc %08lx " "sp %08lx ecr %lu\n", - is_init(tsk) ? KERN_EMERG : KERN_INFO, + is_global_init(tsk) ? 
KERN_EMERG : KERN_INFO, tsk->comm, tsk->pid, address, regs->pc, regs->sp, ecr); diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 32f26253c4..7571076a16 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -274,7 +274,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 70a766aad3..4a71df4c1b 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -271,7 +271,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(tsk)) { + if (is_global_init(tsk)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index eaa6186811..f493f03231 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -180,7 +180,7 @@ good_area: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 5699c7713e..fa636fc6b7 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -173,7 +173,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(tsk)) { + if (is_global_init(tsk)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index bf9e39c6e2..9fb4a6849c 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -201,7 +201,7 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) * generate the same exception over and over again and we get * nowhere. Better to kill it and let the kernel panic. 
*/ - if (is_init(current)) { + if (is_global_init(current)) { __sighandler_t handler; spin_lock_irq(&current->sighand->siglock); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index ab3546c5ac..a18fda361c 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -375,7 +375,7 @@ bad_area_nosemaphore: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 3a393c7f39..a1ab25c708 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -332,7 +332,7 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err) err->disposition == RTAS_DISP_NOT_RECOVERED && err->target == RTAS_TARGET_MEMORY && err->type == RTAS_TYPE_ECC_UNCORR && - !(current->pid == 0 || is_init(current))) { + !(current->pid == 0 || is_global_init(current))) { /* Kill off a user process with an ECC error */ printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n", current->pid); diff --git a/arch/ppc/kernel/traps.c b/arch/ppc/kernel/traps.c index 3f3b292eb7..c78568905c 100644 --- a/arch/ppc/kernel/traps.c +++ b/arch/ppc/kernel/traps.c @@ -121,7 +121,7 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) * generate the same exception over and over again and we get * nowhere. Better to kill it and let the kernel panic. 
*/ - if (is_init(current)) { + if (is_global_init(current)) { __sighandler_t handler; spin_lock_irq(&current->sighand->siglock); diff --git a/arch/ppc/mm/fault.c b/arch/ppc/mm/fault.c index 94913ddcf7..254c23b755 100644 --- a/arch/ppc/mm/fault.c +++ b/arch/ppc/mm/fault.c @@ -290,7 +290,7 @@ bad_area: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c index 60604b2819..b159a9d656 100644 --- a/arch/s390/lib/uaccess_pt.c +++ b/arch/s390/lib/uaccess_pt.c @@ -64,7 +64,7 @@ out: out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 14c241ccdd..2456b52ed0 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -211,7 +211,7 @@ static int do_out_of_memory(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm = tsk->mm; up_read(&mm->mmap_sem); - if (is_init(tsk)) { + if (is_global_init(tsk)) { yield(); down_read(&mm->mmap_sem); return 1; diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 4729668ce5..f33cedb353 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -207,7 +207,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/sh64/mm/fault.c b/arch/sh64/mm/fault.c index dd81c669c7..7aea586fc3 100644 --- a/arch/sh64/mm/fault.c +++ b/arch/sh64/mm/fault.c @@ -278,7 +278,7 @@ bad_area: show_regs(regs); #endif } - if (is_init(tsk)) { + if (is_global_init(tsk)) { panic("INIT had user mode bad_area\n"); } tsk->thread.address = address; @@ -320,14 +320,14 @@ no_context: * us unable to handle the page fault gracefully. 
*/ out_of_memory: - if (is_init(current)) { + if (is_global_init(current)) { panic("INIT out of memory\n"); yield(); goto survive; } printk("fault:Out of memory\n"); up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index bd060551e6..cb3321f8e0 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -108,7 +108,7 @@ out_nosemaphore: * us unable to handle the page fault gracefully. */ out_of_memory: - if (is_init(current)) { + if (is_global_init(current)) { up_read(&mm->mmap_sem); yield(); down_read(&mm->mmap_sem); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 9f38b12b4a..8bab2b2efa 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -748,7 +748,7 @@ survive: retval = get_user_pages(current, current->mm, (unsigned long )to, 1, 1, 0, &pg, NULL); - if (retval == -ENOMEM && is_init(current)) { + if (retval == -ENOMEM && is_global_init(current)) { up_read(&current->mm->mmap_sem); congestion_wait(WRITE, HZ/50); goto survive; diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c index 6555c3d143..4fc5e400cf 100644 --- a/arch/x86/mm/fault_32.c +++ b/arch/x86/mm/fault_32.c @@ -587,7 +587,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(tsk)) { + if (is_global_init(tsk)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c index 5e0e54906c..5149ac136a 100644 --- a/arch/x86/mm/fault_64.c +++ b/arch/x86/mm/fault_64.c @@ -554,7 +554,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); goto again; } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 2f84285994..33f366be32 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -145,7 +145,7 @@ bad_area: */ out_of_memory: up_read(&mm->mmap_sem); - 
if (is_init(current)) { + if (is_global_init(current)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 78d14935f2..de60e1ea4f 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -251,7 +251,7 @@ static void send_sig_all(int sig) struct task_struct *p; for_each_process(p) { - if (p->mm && !is_init(p)) + if (p->mm && !is_global_init(p)) /* Not swapper, init nor kernel thread */ force_sig(sig, p); } diff --git a/include/linux/sched.h b/include/linux/sched.h index df6049e5e8..47cf81d620 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1237,12 +1237,20 @@ static inline int pid_alive(struct task_struct *p) } /** - * is_init - check if a task structure is init + * is_global_init - check if a task structure is init * @tsk: Task structure to be checked. * * Check if a task structure is the first user space task the kernel created. + * + * TODO: We should inline this function after some cleanups in pid_namespace.h + */ +extern int is_global_init(struct task_struct *tsk); + +/* + * is_container_init: + * check whether in the task is init in its own pid namespace. 
*/ -static inline int is_init(struct task_struct *tsk) +static inline int is_container_init(struct task_struct *tsk) { return tsk->pid == 1; } diff --git a/kernel/capability.c b/kernel/capability.c index cbc5fd60c0..f02ad47320 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -12,6 +12,7 @@ #include #include #include +#include #include /* @@ -129,7 +130,7 @@ static inline int cap_set_all(kernel_cap_t *effective, int found = 0; do_each_thread(g, target) { - if (target == current || is_init(target)) + if (target == current || is_container_init(target->group_leader)) continue; found = 1; if (security_capset_check(target, effective, inheritable, diff --git a/kernel/exit.c b/kernel/exit.c index d1eddc753f..d22aefabb1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -221,7 +221,7 @@ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignor do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p == ignored_task || p->exit_state - || is_init(p->real_parent)) + || is_global_init(p->real_parent)) continue; if (task_pgrp(p->real_parent) != pgrp && task_session(p->real_parent) == task_session(p)) { diff --git a/kernel/kexec.c b/kernel/kexec.c index e9f1b4ea50..fbffdb457c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -51,7 +51,7 @@ struct resource crashk_res = { int kexec_should_crash(struct task_struct *p) { - if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops) + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) return 1; return 0; } diff --git a/kernel/pid.c b/kernel/pid.c index 78c0dbffde..bb0785109d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -70,6 +70,11 @@ struct pid_namespace init_pid_ns = { .child_reaper = &init_task }; +int is_global_init(struct task_struct *tsk) +{ + return tsk == init_pid_ns.child_reaper; +} + /* * Note: disable interrupts while the pidmap_lock is held as an * interrupt might come in and do read_lock(&tasklist_lock). 
diff --git a/kernel/signal.c b/kernel/signal.c index 0a6d3726cb..8214ffad54 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -256,7 +256,7 @@ flush_signal_handlers(struct task_struct *t, int force_default) int unhandled_signal(struct task_struct *tsk, int sig) { - if (is_init(tsk)) + if (is_global_init(tsk)) return 1; if (tsk->ptrace & PT_PTRACED) return 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 067554bda8..44868e4df1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1888,7 +1888,7 @@ int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp, return -EPERM; } - op = is_init(current) ? OP_SET : OP_AND; + op = is_global_init(current) ? OP_SET : OP_AND; return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, do_proc_dointvec_bset_conv,&op); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a64decb5b1..b1c2d0f862 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -212,7 +212,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) if (!p->mm) continue; /* skip the init task */ - if (is_init(p)) + if (is_global_init(p)) continue; /* @@ -265,7 +265,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) */ static void __oom_kill_task(struct task_struct *p, int verbose) { - if (is_init(p)) { + if (is_global_init(p)) { WARN_ON(1); printk(KERN_WARNING "tried to kill init!\n"); return; diff --git a/security/commoncap.c b/security/commoncap.c index 48ca5b0927..43f902750a 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef CONFIG_SECURITY_FILE_CAPABILITIES /* @@ -334,7 +335,7 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) /* For init, we want to retain the capabilities set * in the init_task struct. Thus we skip the usual * capability rules */ - if (!is_init(current)) { + if (!is_global_init(current)) { current->cap_permitted = new_permitted; current->cap_effective = bprm->cap_effective ? 
new_permitted : 0; -- 2.39.5