#include <linux/init.h>
#include <linux/quotaops.h>
#include <linux/acct.h>
+#include <linux/capability.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/namespace.h>
static kmem_cache_t *mnt_cache;
static struct rw_semaphore namespace_sem;
+/* /sys/fs */
+decl_subsys(fs, NULL, NULL);
+EXPORT_SYMBOL_GPL(fs_subsys);
+
static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
INIT_LIST_HEAD(&mnt->mnt_list);
INIT_LIST_HEAD(&mnt->mnt_expire);
INIT_LIST_HEAD(&mnt->mnt_share);
+ INIT_LIST_HEAD(&mnt->mnt_slave_list);
+ INIT_LIST_HEAD(&mnt->mnt_slave);
if (name) {
int size = strlen(name) + 1;
char *newname = kmalloc(size, GFP_KERNEL);
}
/*
- * Now, lookup_mnt increments the ref count before returning
- * the vfsmount struct.
+ * find the first or last mount at @dentry on vfsmount @mnt depending on
+ * @dir. If @dir is set return the first mount else return the last mount.
+ *
+ * Returns NULL when the hash chain holds no mount whose parent is @mnt and
+ * whose mountpoint is @dentry. This helper neither takes vfsmount_lock nor
+ * bumps the reference count of the mount it returns; lookup_mnt() wraps the
+ * call in vfsmount_lock and does the mntget() itself.
*/
-struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
+ int dir)
{
struct list_head *head = mount_hashtable + hash(mnt, dentry);
struct list_head *tmp = head;
struct vfsmount *p, *found = NULL;
- spin_lock(&vfsmount_lock);
for (;;) {
- tmp = tmp->next;
+ tmp = dir ? tmp->next : tmp->prev; /* @dir != 0: walk forward, else backward */
p = NULL;
if (tmp == head)
break;
p = list_entry(tmp, struct vfsmount, mnt_hash);
if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
- found = mntget(p);
+ found = p; /* reference count deliberately left untouched here */
break;
}
}
- spin_unlock(&vfsmount_lock);
return found;
}
+/*
+ * Look up the first mount attached at @dentry on @mnt while holding
+ * vfsmount_lock. A mount that is found has its reference count raised
+ * with mntget() before the lock is dropped; NULL is returned when
+ * __lookup_mnt() finds nothing.
+ */
+struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+{
+	struct vfsmount *hit;
+
+	spin_lock(&vfsmount_lock);
+	hit = __lookup_mnt(mnt, dentry, 1);
+	if (hit != NULL)
+		mntget(hit);
+	spin_unlock(&vfsmount_lock);
+
+	return hit;
+}
+
static inline int check_mnt(struct vfsmount *mnt)
{
return mnt->mnt_namespace == current->namespace;
return list_entry(next, struct vfsmount, mnt_child);
}
+/*
+ * Starting at @p, keep stepping to the tail entry of the current mount's
+ * mnt_mounts list (entries are linked through mnt_child) until a mount
+ * whose mnt_mounts list is empty is reached, and return that mount.
+ * Returns @p itself when its own mnt_mounts list is empty.
+ *
+ * NOTE(review): the copy loop calls this before next_mnt(), presumably so
+ * the whole subtree below @p is stepped over - confirm against next_mnt().
+ */
+static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
+{
+	for (;;) {
+		struct list_head *tail = p->mnt_mounts.prev;
+
+		/* list head points back at itself: nothing below this mount */
+		if (tail == &p->mnt_mounts)
+			return p;
+		p = list_entry(tail, struct vfsmount, mnt_child);
+	}
+}
+
static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
int flag)
{
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
- if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
- list_add(&mnt->mnt_share, &old->mnt_share);
+ if (flag & CL_SLAVE) {
+ list_add(&mnt->mnt_slave, &old->mnt_slave_list);
+ mnt->mnt_master = old;
+ CLEAR_MNT_SHARED(mnt);
+ } else {
+ if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
+ list_add(&mnt->mnt_share, &old->mnt_share);
+ if (IS_MNT_SLAVE(old))
+ list_add(&mnt->mnt_slave, &old->mnt_slave);
+ mnt->mnt_master = old->mnt_master;
+ }
if (flag & CL_MAKE_SHARED)
set_mnt_shared(mnt);
{ MS_SYNCHRONOUS, ",sync" },
{ MS_DIRSYNC, ",dirsync" },
{ MS_MANDLOCK, ",mand" },
- { MS_NOATIME, ",noatime" },
- { MS_NODIRATIME, ",nodiratime" },
{ 0, NULL }
};
static struct proc_fs_info mnt_info[] = {
{ MNT_NOSUID, ",nosuid" },
{ MNT_NODEV, ",nodev" },
{ MNT_NOEXEC, ",noexec" },
+ { MNT_NOATIME, ",noatime" },
+ { MNT_NODIRATIME, ",nodiratime" },
{ 0, NULL }
};
struct proc_fs_info *fs_infop;
*/
int may_umount(struct vfsmount *mnt)
{
- if (atomic_read(&mnt->mnt_count) > 2)
- return -EBUSY;
- return 0;
+ int ret = 0; /* stays 0 unless the busy check below fires */
+ spin_lock(&vfsmount_lock); /* the busy check runs under vfsmount_lock */
+ if (propagate_mount_busy(mnt, 2)) /* NOTE(review): 2 mirrors the old "mnt_count > 2" threshold - confirm against propagate_mount_busy() */
+ ret = -EBUSY;
+ spin_unlock(&vfsmount_lock);
+ return ret; /* 0 or -EBUSY */
}
EXPORT_SYMBOL(may_umount);
void release_mounts(struct list_head *head)
{
struct vfsmount *mnt;
- while(!list_empty(head)) {
+ while (!list_empty(head)) {
mnt = list_entry(head->next, struct vfsmount, mnt_hash);
list_del_init(&mnt->mnt_hash);
if (mnt->mnt_parent != mnt) {
}
}
-void umount_tree(struct vfsmount *mnt, struct list_head *kill)
+void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
{
struct vfsmount *p;
list_add(&p->mnt_hash, kill);
}
+ if (propagate)
+ propagate_umount(kill);
+
list_for_each_entry(p, kill, mnt_hash) {
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
p->mnt_namespace = NULL;
list_del_init(&p->mnt_child);
if (p->mnt_parent != p)
- mnt->mnt_mountpoint->d_mounted--;
+ p->mnt_mountpoint->d_mounted--;
+ change_mnt_propagation(p, MS_PRIVATE);
}
}
event++;
retval = -EBUSY;
- if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
+ if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
if (!list_empty(&mnt->mnt_list))
- umount_tree(mnt, &umount_list);
+ umount_tree(mnt, 1, &umount_list);
retval = 0;
}
spin_unlock(&vfsmount_lock);
if (current->uid != nd->dentry->d_inode->i_uid)
return -EPERM;
}
- if (permission(nd->dentry->d_inode, MAY_WRITE, nd))
+ if (vfs_permission(nd, MAY_WRITE))
return -EPERM;
return 0;
#endif
struct vfsmount *res, *p, *q, *r, *s;
struct nameidata nd;
+ if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
+ return NULL;
+
res = q = clone_mnt(mnt, dentry, flag);
if (!q)
goto Enomem;
continue;
for (s = r; s; s = next_mnt(s, r)) {
+ if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) {
+ s = skip_mnt_tree(s);
+ continue;
+ }
while (p != s->mnt_parent) {
p = p->mnt_parent;
q = q->mnt_parent;
if (res) {
LIST_HEAD(umount_list);
spin_lock(&vfsmount_lock);
- umount_tree(res, &umount_list);
+ umount_tree(res, 0, &umount_list);
spin_unlock(&vfsmount_lock);
release_mounts(&umount_list);
}
/*
* @source_mnt : mount tree to be attached
- * @nd : place the mount tree @source_mnt is attached
+ * @nd : place the mount tree @source_mnt is attached
+ * @parent_nd : if non-null, detach the source_mnt from its parent and
+ * store the parent mount and mountpoint dentry.
+ * (done when source_mnt is moved)
*
* NOTE: in the table below explains the semantics when a source mount
* of a given type is attached to a destination mount of a given type.
- * ---------------------------------------------
- * | BIND MOUNT OPERATION |
- * |********************************************
- * | source-->| shared | private |
- * | dest | | |
- * | | | | |
- * | v | | |
- * |********************************************
- * | shared | shared (++) | shared (+) |
- * | | | |
- * |non-shared| shared (+) | private |
- * *********************************************
+ * ---------------------------------------------------------------------------
+ * | BIND MOUNT OPERATION |
+ * |**************************************************************************
+ * | source-->| shared | private | slave | unbindable |
+ * | dest | | | | |
+ * | | | | | | |
+ * | v | | | | |
+ * |**************************************************************************
+ * | shared | shared (++) | shared (+) | shared(+++)| invalid |
+ * | | | | | |
+ * |non-shared| shared (+) | private | slave (*) | invalid |
+ * ***************************************************************************
* A bind operation clones the source mount and mounts the clone on the
* destination mount.
*
* (+) the cloned mount is created under the destination mount and is marked
* as shared. The cloned mount is added to the peer group of the source
* mount.
+ * (+++) the mount is propagated to all the mounts in the propagation tree
+ * of the destination mount and the cloned mount is made a slave
+ * of the same master as that of the source mount. The cloned mount
+ * is marked as 'shared and slave'.
+ * (*) the cloned mount is made a slave of the same master as that of the
+ * source mount.
+ *
+ * ---------------------------------------------------------------------------
+ * | MOVE MOUNT OPERATION |
+ * |**************************************************************************
+ * | source-->| shared | private | slave | unbindable |
+ * | dest | | | | |
+ * | | | | | | |
+ * | v | | | | |
+ * |**************************************************************************
+ * | shared | shared (+) | shared (+) | shared(+++) | invalid |
+ * | | | | | |
+ * |non-shared| shared (+*) | private | slave (*) | unbindable |
+ * ***************************************************************************
+ *
+ * (+) the mount is moved to the destination and is then propagated to
+ * all the mounts in the propagation tree of the destination mount.
+ * (+*) the mount is moved to the destination.
+ * (+++) the mount is moved to the destination and is then propagated to
+ * all the mounts belonging to the destination mount's propagation tree.
+ * the mount is marked as 'shared and slave'.
+ * (*) the mount continues to be a slave at the new location.
*
* if the source mount is a tree, the operations explained above is
* applied to each mount in the tree.
* in allocations.
*/
static int attach_recursive_mnt(struct vfsmount *source_mnt,
- struct nameidata *nd)
+ struct nameidata *nd, struct nameidata *parent_nd)
{
LIST_HEAD(tree_list);
struct vfsmount *dest_mnt = nd->mnt;
}
spin_lock(&vfsmount_lock);
- mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
- commit_tree(source_mnt);
+ if (parent_nd) {
+ detach_mnt(source_mnt, parent_nd);
+ attach_mnt(source_mnt, nd);
+ touch_namespace(current->namespace);
+ } else {
+ mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
+ commit_tree(source_mnt);
+ }
list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
list_del_init(&child->mnt_hash);
return -ENOTDIR;
err = -ENOENT;
- down(&nd->dentry->d_inode->i_sem);
+ mutex_lock(&nd->dentry->d_inode->i_mutex);
if (IS_DEADDIR(nd->dentry->d_inode))
goto out_unlock;
err = -ENOENT;
if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry))
- err = attach_recursive_mnt(mnt, nd);
+ err = attach_recursive_mnt(mnt, nd, NULL);
out_unlock:
- up(&nd->dentry->d_inode->i_sem);
+ mutex_unlock(&nd->dentry->d_inode->i_mutex);
if (!err)
security_sb_post_addmount(mnt, nd);
return err;
down_write(&namespace_sem);
err = -EINVAL;
+ if (IS_MNT_UNBINDABLE(old_nd.mnt))
+ goto out;
+
if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
goto out;
if (err) {
LIST_HEAD(umount_list);
spin_lock(&vfsmount_lock);
- umount_tree(mnt, &umount_list);
+ umount_tree(mnt, 0, &umount_list);
spin_unlock(&vfsmount_lock);
release_mounts(&umount_list);
}
return err;
}
+/*
+ * Visit @mnt and every mount that next_mnt() yields for the tree rooted
+ * at @mnt. Returns 1 as soon as one of them is marked unbindable,
+ * 0 when the walk ends without finding any.
+ */
+static inline int tree_contains_unbindable(struct vfsmount *mnt)
+{
+	struct vfsmount *cur = mnt;
+
+	while (cur) {
+		if (IS_MNT_UNBINDABLE(cur))
+			return 1;
+		cur = next_mnt(cur, mnt);
+	}
+
+	return 0;
+}
+
static int do_move_mount(struct nameidata *nd, char *old_name)
{
struct nameidata old_nd, parent_nd;
goto out;
err = -ENOENT;
- down(&nd->dentry->d_inode->i_sem);
+ mutex_lock(&nd->dentry->d_inode->i_mutex);
if (IS_DEADDIR(nd->dentry->d_inode))
goto out1;
- spin_lock(&vfsmount_lock);
if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
- goto out2;
+ goto out1;
err = -EINVAL;
if (old_nd.dentry != old_nd.mnt->mnt_root)
- goto out2;
+ goto out1;
if (old_nd.mnt == old_nd.mnt->mnt_parent)
- goto out2;
+ goto out1;
if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
S_ISDIR(old_nd.dentry->d_inode->i_mode))
- goto out2;
-
+ goto out1;
+ /*
+ * Don't move a mount residing in a shared parent.
+ */
+ if (old_nd.mnt->mnt_parent && IS_MNT_SHARED(old_nd.mnt->mnt_parent))
+ goto out1;
+ /*
+ * Don't move a mount tree containing unbindable mounts to a destination
+ * mount which is shared.
+ */
+ if (IS_MNT_SHARED(nd->mnt) && tree_contains_unbindable(old_nd.mnt))
+ goto out1;
err = -ELOOP;
for (p = nd->mnt; p->mnt_parent != p; p = p->mnt_parent)
if (p == old_nd.mnt)
- goto out2;
- err = 0;
+ goto out1;
- detach_mnt(old_nd.mnt, &parent_nd);
- attach_mnt(old_nd.mnt, nd);
- touch_namespace(current->namespace);
+ if ((err = attach_recursive_mnt(old_nd.mnt, nd, &parent_nd)))
+ goto out1;
+ spin_lock(&vfsmount_lock);
/* if the mount is moved, it should no longer be expire
* automatically */
list_del_init(&old_nd.mnt->mnt_expire);
-out2:
spin_unlock(&vfsmount_lock);
out1:
- up(&nd->dentry->d_inode->i_sem);
+ mutex_unlock(&nd->dentry->d_inode->i_mutex);
out:
up_write(&namespace_sem);
if (!err)
* Check that it is still dead: the count should now be 2 - as
* contributed by the vfsmount parent and the mntget above
*/
- if (atomic_read(&mnt->mnt_count) == 2) {
+ if (!propagate_mount_busy(mnt, 2)) {
/* delete from the namespace */
touch_namespace(mnt->mnt_namespace);
list_del_init(&mnt->mnt_list);
mnt->mnt_namespace = NULL;
- umount_tree(mnt, umounts);
+ umount_tree(mnt, 1, umounts);
spin_unlock(&vfsmount_lock);
} else {
/*
mnt_flags |= MNT_NODEV;
if (flags & MS_NOEXEC)
mnt_flags |= MNT_NOEXEC;
- flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE);
+ if (flags & MS_NOATIME)
+ mnt_flags |= MNT_NOATIME;
+ if (flags & MS_NODIRATIME)
+ mnt_flags |= MNT_NODIRATIME;
+
+ flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
+ MS_NOATIME | MS_NODIRATIME);
/* ... and get the mountpoint */
retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
data_page);
else if (flags & MS_BIND)
retval = do_loopback(&nd, dev_name, flags & MS_REC);
- else if (flags & (MS_SHARED | MS_PRIVATE))
+ else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&nd, flags);
else if (flags & MS_MOVE)
retval = do_move_mount(&nd, dev_name);
return retval;
}
-int copy_namespace(int flags, struct task_struct *tsk)
+/*
+ * Allocate a new namespace structure and populate it with contents
+ * copied from the namespace of the passed in task structure.
+ */
+struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs)
{
struct namespace *namespace = tsk->namespace;
struct namespace *new_ns;
struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
- struct fs_struct *fs = tsk->fs;
struct vfsmount *p, *q;
- if (!namespace)
- return 0;
-
- get_namespace(namespace);
-
- if (!(flags & CLONE_NEWNS))
- return 0;
-
- if (!capable(CAP_SYS_ADMIN)) {
- put_namespace(namespace);
- return -EPERM;
- }
-
new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL);
if (!new_ns)
goto out;
down_write(&namespace_sem);
/* First pass: copy the tree topology */
new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root,
- CL_EXPIRE);
+ CL_COPY_ALL | CL_EXPIRE);
if (!new_ns->root) {
up_write(&namespace_sem);
kfree(new_ns);
}
up_write(&namespace_sem);
- tsk->namespace = new_ns;
-
if (rootmnt)
mntput(rootmnt);
if (pwdmnt)
if (altrootmnt)
mntput(altrootmnt);
- put_namespace(namespace);
- return 0;
+out:
+ return new_ns;
+}
+
+/*
+ * Give @tsk its namespace according to the clone @flags.
+ *
+ * Without CLONE_NEWNS the current namespace is kept and the reference
+ * taken by get_namespace() stays in place. With CLONE_NEWNS the caller
+ * must have CAP_SYS_ADMIN; dup_namespace() then builds a copy that
+ * replaces tsk->namespace, and the reference taken on the old namespace
+ * is dropped again.
+ *
+ * Returns 0 on success (also when @tsk has no namespace at all), -EPERM
+ * when the capability is missing, -ENOMEM when dup_namespace() returns
+ * NULL.
+ */
+int copy_namespace(int flags, struct task_struct *tsk)
+{
+ struct namespace *namespace = tsk->namespace;
+ struct namespace *new_ns;
+ int err = 0;
+
+ if (!namespace)
+ return 0;
+
+ get_namespace(namespace);
+
+ if (!(flags & CLONE_NEWNS))
+ return 0; /* namespace kept as is; the reference taken above is not dropped */
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ new_ns = dup_namespace(tsk, tsk->fs);
+ if (!new_ns) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ tsk->namespace = new_ns; /* success falls through to out: to drop the old namespace's reference */
out:
put_namespace(namespace);
- return -ENOMEM;
+ return err; /* 0, -EPERM or -ENOMEM */
}
asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
* pointed to by put_old must yield the same directory as new_root. No other
* file system may be mounted on put_old. After all, new_root is a mountpoint.
*
+ * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
+ * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
+ * in this situation.
+ *
* Notes:
* - we don't move root/cwd if they are not at the root (reason: if something
* cared enough to change them, it's probably wrong to force them elsewhere)
user_nd.dentry = dget(current->fs->root);
read_unlock(¤t->fs->lock);
down_write(&namespace_sem);
- down(&old_nd.dentry->d_inode->i_sem);
+ mutex_lock(&old_nd.dentry->d_inode->i_mutex);
error = -EINVAL;
+ if (IS_MNT_SHARED(old_nd.mnt) ||
+ IS_MNT_SHARED(new_nd.mnt->mnt_parent) ||
+ IS_MNT_SHARED(user_nd.mnt->mnt_parent))
+ goto out2;
if (!check_mnt(user_nd.mnt))
goto out2;
error = -ENOENT;
path_release(&root_parent);
path_release(&parent_nd);
out2:
- up(&old_nd.dentry->d_inode->i_sem);
+ mutex_unlock(&old_nd.dentry->d_inode->i_mutex);
up_write(&namespace_sem);
path_release(&user_nd);
path_release(&old_nd);
i--;
} while (i);
sysfs_init();
+ subsystem_register(&fs_subsys);
init_rootfs();
init_mount_tree();
}
spin_unlock(&vfsmount_lock);
down_write(&namespace_sem);
spin_lock(&vfsmount_lock);
- umount_tree(root, &umount_list);
+ umount_tree(root, 0, &umount_list);
spin_unlock(&vfsmount_lock);
up_write(&namespace_sem);
release_mounts(&umount_list);