X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Fnamespace.c;h=88058de59c7c2ca9edf14453f1fbeeaa4017bfa8;hb=36e15263aa5dcf3b72f1f88437e69497782b7ab8;hp=0505fb61aa740182cfabe890cc668724378c5791;hpb=97e7e0f71d6d948c25f11f0a33878d9356d9579e;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/namespace.c b/fs/namespace.c index 0505fb6..88058de 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -23,11 +22,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include "pnode.h" @@ -42,6 +43,8 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); static int event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); +static int mnt_id_start = 0; +static int mnt_group_start = 1; static struct list_head *mount_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; @@ -69,7 +72,9 @@ static int mnt_alloc_id(struct vfsmount *mnt) retry: ida_pre_get(&mnt_id_ida, GFP_KERNEL); spin_lock(&vfsmount_lock); - res = ida_get_new(&mnt_id_ida, &mnt->mnt_id); + res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); + if (!res) + mnt_id_start = mnt->mnt_id + 1; spin_unlock(&vfsmount_lock); if (res == -EAGAIN) goto retry; @@ -79,8 +84,11 @@ retry: static void mnt_free_id(struct vfsmount *mnt) { + int id = mnt->mnt_id; spin_lock(&vfsmount_lock); - ida_remove(&mnt_id_ida, mnt->mnt_id); + ida_remove(&mnt_id_ida, id); + if (mnt_id_start > id) + mnt_id_start = id; spin_unlock(&vfsmount_lock); } @@ -91,10 +99,18 @@ static void mnt_free_id(struct vfsmount *mnt) */ static int mnt_alloc_group_id(struct vfsmount *mnt) { + int res; + if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL)) return -ENOMEM; - return ida_get_new_above(&mnt_group_ida, 1, &mnt->mnt_group_id); + res = ida_get_new_above(&mnt_group_ida, + mnt_group_start, + &mnt->mnt_group_id); + if (!res) + mnt_group_start = mnt->mnt_group_id + 1; + + return res; } 
/* @@ -102,7 +118,10 @@ static int mnt_alloc_group_id(struct vfsmount *mnt) */ void mnt_release_group_id(struct vfsmount *mnt) { - ida_remove(&mnt_group_ida, mnt->mnt_group_id); + int id = mnt->mnt_group_id; + ida_remove(&mnt_group_ida, id); + if (mnt_group_start > id) + mnt_group_start = id; mnt->mnt_group_id = 0; } @@ -113,9 +132,13 @@ struct vfsmount *alloc_vfsmnt(const char *name) int err; err = mnt_alloc_id(mnt); - if (err) { - kmem_cache_free(mnt_cache, mnt); - return NULL; + if (err) + goto out_free_cache; + + if (name) { + mnt->mnt_devname = kstrdup(name, GFP_KERNEL); + if (!mnt->mnt_devname) + goto out_free_id; } atomic_set(&mnt->mnt_count, 1); @@ -127,17 +150,25 @@ struct vfsmount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); - atomic_set(&mnt->__mnt_writers, 0); - if (name) { - int size = strlen(name) + 1; - char *newname = kmalloc(size, GFP_KERNEL); - if (newname) { - memcpy(newname, name, size); - mnt->mnt_devname = newname; - } - } +#ifdef CONFIG_SMP + mnt->mnt_writers = alloc_percpu(int); + if (!mnt->mnt_writers) + goto out_free_devname; +#else + mnt->mnt_writers = 0; +#endif } return mnt; + +#ifdef CONFIG_SMP +out_free_devname: + kfree(mnt->mnt_devname); +#endif +out_free_id: + mnt_free_id(mnt); +out_free_cache: + kmem_cache_free(mnt_cache, mnt); + return NULL; } /* @@ -169,65 +200,38 @@ int __mnt_is_readonly(struct vfsmount *mnt) } EXPORT_SYMBOL_GPL(__mnt_is_readonly); -struct mnt_writer { - /* - * If holding multiple instances of this lock, they - * must be ordered by cpu number. 
- */ - spinlock_t lock; - struct lock_class_key lock_class; /* compiles out with !lockdep */ - unsigned long count; - struct vfsmount *mnt; -} ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct mnt_writer, mnt_writers); +static inline void inc_mnt_writers(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++; +#else + mnt->mnt_writers++; +#endif +} -static int __init init_mnt_writers(void) +static inline void dec_mnt_writers(struct vfsmount *mnt) { - int cpu; - for_each_possible_cpu(cpu) { - struct mnt_writer *writer = &per_cpu(mnt_writers, cpu); - spin_lock_init(&writer->lock); - lockdep_set_class(&writer->lock, &writer->lock_class); - writer->count = 0; - } - return 0; +#ifdef CONFIG_SMP + (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--; +#else + mnt->mnt_writers--; +#endif } -fs_initcall(init_mnt_writers); -static void unlock_mnt_writers(void) +static unsigned int count_mnt_writers(struct vfsmount *mnt) { +#ifdef CONFIG_SMP + unsigned int count = 0; int cpu; - struct mnt_writer *cpu_writer; for_each_possible_cpu(cpu) { - cpu_writer = &per_cpu(mnt_writers, cpu); - spin_unlock(&cpu_writer->lock); + count += *per_cpu_ptr(mnt->mnt_writers, cpu); } -} -static inline void __clear_mnt_count(struct mnt_writer *cpu_writer) -{ - if (!cpu_writer->mnt) - return; - /* - * This is in case anyone ever leaves an invalid, - * old ->mnt and a count of 0. 
- */ - if (!cpu_writer->count) - return; - atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers); - cpu_writer->count = 0; -} - /* - * must hold cpu_writer->lock - */ -static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer, - struct vfsmount *mnt) -{ - if (cpu_writer->mnt == mnt) - return; - __clear_mnt_count(cpu_writer); - cpu_writer->mnt = mnt; + return count; +#else + return mnt->mnt_writers; +#endif } /* @@ -251,75 +255,74 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer, int mnt_want_write(struct vfsmount *mnt) { int ret = 0; - struct mnt_writer *cpu_writer; - cpu_writer = &get_cpu_var(mnt_writers); - spin_lock(&cpu_writer->lock); + preempt_disable(); + inc_mnt_writers(mnt); + /* + * The store to inc_mnt_writers must be visible before we pass + * MNT_WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set MNT_WRITE_HOLD. + */ + smp_mb(); + while (mnt->mnt_flags & MNT_WRITE_HOLD) + cpu_relax(); + /* + * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will + * be set to match its requirements. So we must not load that until + * MNT_WRITE_HOLD is cleared. 
+ */ + smp_rmb(); if (__mnt_is_readonly(mnt)) { + dec_mnt_writers(mnt); ret = -EROFS; goto out; } - use_cpu_writer_for_mount(cpu_writer, mnt); - cpu_writer->count++; out: - spin_unlock(&cpu_writer->lock); - put_cpu_var(mnt_writers); + preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); -static void lock_mnt_writers(void) -{ - int cpu; - struct mnt_writer *cpu_writer; - - for_each_possible_cpu(cpu) { - cpu_writer = &per_cpu(mnt_writers, cpu); - spin_lock(&cpu_writer->lock); - __clear_mnt_count(cpu_writer); - cpu_writer->mnt = NULL; - } +/** + * mnt_clone_write - get write access to a mount + * @mnt: the mount on which to take a write + * + * This is effectively like mnt_want_write, except + * it must only be used to take an extra write reference + * on a mountpoint that we already know has a write reference + * on it. This allows some optimisation. + * + * After finished, mnt_drop_write must be called as usual to + * drop the reference. + */ +int mnt_clone_write(struct vfsmount *mnt) +{ + /* superblock may be r/o */ + if (__mnt_is_readonly(mnt)) + return -EROFS; + preempt_disable(); + inc_mnt_writers(mnt); + preempt_enable(); + return 0; } +EXPORT_SYMBOL_GPL(mnt_clone_write); -/* - * These per-cpu write counts are not guaranteed to have - * matched increments and decrements on any given cpu. - * A file open()ed for write on one cpu and close()d on - * another cpu will imbalance this count. Make sure it - * does not get too far out of whack. 
+/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already */ -static void handle_write_count_underflow(struct vfsmount *mnt) +int mnt_want_write_file(struct file *file) { - if (atomic_read(&mnt->__mnt_writers) >= - MNT_WRITER_UNDERFLOW_LIMIT) - return; - /* - * It isn't necessary to hold all of the locks - * at the same time, but doing it this way makes - * us share a lot more code. - */ - lock_mnt_writers(); - /* - * vfsmount_lock is for mnt_flags. - */ - spin_lock(&vfsmount_lock); - /* - * If coalescing the per-cpu writer counts did not - * get us back to a positive writer count, we have - * a bug. - */ - if ((atomic_read(&mnt->__mnt_writers) < 0) && - !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) { - printk(KERN_DEBUG "leak detected on mount(%p) writers " - "count: %d\n", - mnt, atomic_read(&mnt->__mnt_writers)); - WARN_ON(1); - /* use the flag to keep the dmesg spam down */ - mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT; - } - spin_unlock(&vfsmount_lock); - unlock_mnt_writers(); + struct inode *inode = file->f_dentry->d_inode; + if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) + return mnt_want_write(file->f_path.mnt); + else + return mnt_clone_write(file->f_path.mnt); } +EXPORT_SYMBOL_GPL(mnt_want_write_file); /** * mnt_drop_write - give up write access to a mount @@ -331,37 +334,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt) */ void mnt_drop_write(struct vfsmount *mnt) { - int must_check_underflow = 0; - struct mnt_writer *cpu_writer; - - cpu_writer = &get_cpu_var(mnt_writers); - spin_lock(&cpu_writer->lock); - - use_cpu_writer_for_mount(cpu_writer, mnt); - if (cpu_writer->count > 0) { - cpu_writer->count--; - } else { - must_check_underflow = 1; - atomic_dec(&mnt->__mnt_writers); - } - - spin_unlock(&cpu_writer->lock); - /* - * 
Logically, we could call this each time, - * but the __mnt_writers cacheline tends to - * be cold, and makes this expensive. - */ - if (must_check_underflow) - handle_write_count_underflow(mnt); - /* - * This could be done right after the spinlock - * is taken because the spinlock keeps us on - * the cpu, and disables preemption. However, - * putting it here bounds the amount that - * __mnt_writers can underflow. Without it, - * we could theoretically wrap __mnt_writers. - */ - put_cpu_var(mnt_writers); + preempt_disable(); + dec_mnt_writers(mnt); + preempt_enable(); } EXPORT_SYMBOL_GPL(mnt_drop_write); @@ -369,24 +344,41 @@ static int mnt_make_readonly(struct vfsmount *mnt) { int ret = 0; - lock_mnt_writers(); + spin_lock(&vfsmount_lock); + mnt->mnt_flags |= MNT_WRITE_HOLD; /* - * With all the locks held, this value is stable + * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * should be visible before we do. */ - if (atomic_read(&mnt->__mnt_writers) > 0) { - ret = -EBUSY; - goto out; - } + smp_mb(); + /* - * nobody can do a successful mnt_want_write() with all - * of the counts in MNT_DENIED_WRITE and the locks held. + * With writers on hold, if this value is zero, then there are + * definitely no active writers (although held writers may subsequently + * increment the count, they'll have to wait, and decrement it after + * seeing MNT_READONLY). + * + * It is OK to have counter incremented on one CPU and decremented on + * another: the sum will add up correctly. The danger would be when we + * sum up each counter, if we read a counter before it is incremented, + * but then read another CPU's count which it has been subsequently + * decremented from -- we would see more decrements than we should. + * MNT_WRITE_HOLD protects against this scenario, because + * mnt_want_write first increments count, then smp_mb, then spins on + * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * we're counting up here. 
*/ - spin_lock(&vfsmount_lock); - if (!ret) + if (count_mnt_writers(mnt) > 0) + ret = -EBUSY; + else mnt->mnt_flags |= MNT_READONLY; + /* + * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * that become unheld will see MNT_READONLY. + */ + smp_wmb(); + mnt->mnt_flags &= ~MNT_WRITE_HOLD; spin_unlock(&vfsmount_lock); -out: - unlock_mnt_writers(); return ret; } @@ -397,11 +389,10 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt) spin_unlock(&vfsmount_lock); } -int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) +void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) { mnt->mnt_sb = sb; mnt->mnt_root = dget(sb->s_root); - return 0; } EXPORT_SYMBOL(simple_set_mnt); @@ -410,6 +401,9 @@ void free_vfsmnt(struct vfsmount *mnt) { kfree(mnt->mnt_devname); mnt_free_id(mnt); +#ifdef CONFIG_SMP + free_percpu(mnt->mnt_writers); +#endif kmem_cache_free(mnt_cache, mnt); } @@ -442,11 +436,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, * lookup_mnt increments the ref count before returning * the vfsmount struct. 
*/ -struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) +struct vfsmount *lookup_mnt(struct path *path) { struct vfsmount *child_mnt; spin_lock(&vfsmount_lock); - if ((child_mnt = __lookup_mnt(mnt, dentry, 1))) + if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) mntget(child_mnt); spin_unlock(&vfsmount_lock); return child_mnt; @@ -579,7 +573,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, mnt->mnt_master = old; CLEAR_MNT_SHARED(mnt); } else if (!(flag & CL_PRIVATE)) { - if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old)) + if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) list_add(&mnt->mnt_share, &old->mnt_share); if (IS_MNT_SLAVE(old)) list_add(&mnt->mnt_slave, &old->mnt_slave); @@ -604,36 +598,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, static inline void __mntput(struct vfsmount *mnt) { - int cpu; struct super_block *sb = mnt->mnt_sb; /* - * We don't have to hold all of the locks at the - * same time here because we know that we're the - * last reference to mnt and that no new writers - * can come in. - */ - for_each_possible_cpu(cpu) { - struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); - if (cpu_writer->mnt != mnt) - continue; - spin_lock(&cpu_writer->lock); - atomic_add(cpu_writer->count, &mnt->__mnt_writers); - cpu_writer->count = 0; - /* - * Might as well do this so that no one - * ever sees the pointer and expects - * it to be valid. - */ - cpu_writer->mnt = NULL; - spin_unlock(&cpu_writer->lock); - } - /* * This probably indicates that somebody messed * up a mnt_want/drop_write() pair. If this * happens, the filesystem was probably unable * to make r/w->r/o transitions. */ - WARN_ON(atomic_read(&mnt->__mnt_writers)); + /* + * atomic_dec_and_lock() used to deal with ->mnt_count decrements + * provides barriers, so count_mnt_writers() below is safe. 
AV + */ + WARN_ON(count_mnt_writers(mnt)); dput(mnt->mnt_root); free_vfsmnt(mnt); deactivate_super(sb); @@ -652,7 +628,6 @@ repeat: mnt->mnt_pinned = 0; spin_unlock(&vfsmount_lock); acct_auto_close_mnt(mnt); - security_sb_umount_close(mnt); goto repeat; } } @@ -693,12 +668,16 @@ static inline void mangle(struct seq_file *m, const char *s) */ int generic_show_options(struct seq_file *m, struct vfsmount *mnt) { - const char *options = mnt->mnt_sb->s_options; + const char *options; + + rcu_read_lock(); + options = rcu_dereference(mnt->mnt_sb->s_options); if (options != NULL && options[0]) { seq_putc(m, ','); mangle(m, options); } + rcu_read_unlock(); return 0; } @@ -719,11 +698,22 @@ EXPORT_SYMBOL(generic_show_options); */ void save_mount_options(struct super_block *sb, char *options) { - kfree(sb->s_options); - sb->s_options = kstrdup(options, GFP_KERNEL); + BUG_ON(sb->s_options); + rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); } EXPORT_SYMBOL(save_mount_options); +void replace_mount_options(struct super_block *sb, char *options) +{ + char *old = sb->s_options; + rcu_assign_pointer(sb->s_options, options); + if (old) { + synchronize_rcu(); + kfree(old); + } +} +EXPORT_SYMBOL(replace_mount_options); + #ifdef CONFIG_PROC_FS /* iterator */ static void *m_start(struct seq_file *m, loff_t *pos) @@ -746,12 +736,27 @@ static void m_stop(struct seq_file *m, void *v) up_read(&namespace_sem); } +int mnt_had_events(struct proc_mounts *p) +{ + struct mnt_namespace *ns = p->ns; + int res = 0; + + spin_lock(&vfsmount_lock); + if (p->event != ns->event) { + p->event = ns->event; + res = 1; + } + spin_unlock(&vfsmount_lock); + + return res; +} + struct proc_fs_info { int flag; const char *str; }; -static void show_sb_opts(struct seq_file *m, struct super_block *sb) +static int show_sb_opts(struct seq_file *m, struct super_block *sb) { static const struct proc_fs_info fs_info[] = { { MS_SYNCHRONOUS, ",sync" }, @@ -765,6 +770,8 @@ static void show_sb_opts(struct 
seq_file *m, struct super_block *sb) if (sb->s_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } + + return security_sb_show_options(m, sb); } static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) @@ -776,6 +783,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) { MNT_NOATIME, ",noatime" }, { MNT_NODIRATIME, ",nodiratime" }, { MNT_RELATIME, ",relatime" }, + { MNT_STRICTATIME, ",strictatime" }, { 0, NULL } }; const struct proc_fs_info *fs_infop; @@ -807,11 +815,14 @@ static int show_vfsmnt(struct seq_file *m, void *v) seq_putc(m, ' '); show_type(m, mnt->mnt_sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); - show_sb_opts(m, mnt->mnt_sb); + err = show_sb_opts(m, mnt->mnt_sb); + if (err) + goto out; show_mnt_opts(m, mnt); if (mnt->mnt_sb->s_op->show_options) err = mnt->mnt_sb->s_op->show_options(m, mnt); seq_puts(m, " 0 0\n"); +out: return err; } @@ -866,10 +877,13 @@ static int show_mountinfo(struct seq_file *m, void *v) seq_putc(m, ' '); mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); - show_sb_opts(m, sb); + err = show_sb_opts(m, sb); + if (err) + goto out; if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt); seq_putc(m, '\n'); +out: return err; } @@ -965,10 +979,12 @@ EXPORT_SYMBOL(may_umount_tree); int may_umount(struct vfsmount *mnt) { int ret = 1; + down_read(&namespace_sem); spin_lock(&vfsmount_lock); if (propagate_mount_busy(mnt, 2)) ret = 0; spin_unlock(&vfsmount_lock); + up_read(&namespace_sem); return ret; } @@ -1061,10 +1077,9 @@ static int do_umount(struct vfsmount *mnt, int flags) * about for the moment. 
*/ - lock_kernel(); - if (sb->s_op->umount_begin) - sb->s_op->umount_begin(mnt, flags); - unlock_kernel(); + if (flags & MNT_FORCE && sb->s_op->umount_begin) { + sb->s_op->umount_begin(sb); + } /* * No sense to grab the lock for this test, but test itself looks @@ -1081,12 +1096,8 @@ static int do_umount(struct vfsmount *mnt, int flags) * we just try to remount it readonly. */ down_write(&sb->s_umount); - if (!(sb->s_flags & MS_RDONLY)) { - lock_kernel(); - DQUOT_OFF(sb); + if (!(sb->s_flags & MS_RDONLY)) retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); - unlock_kernel(); - } up_write(&sb->s_umount); return retval; } @@ -1105,8 +1116,6 @@ static int do_umount(struct vfsmount *mnt, int flags) retval = 0; } spin_unlock(&vfsmount_lock); - if (retval) - security_sb_umount_busy(mnt); up_write(&namespace_sem); release_mounts(&umount_list); return retval; @@ -1120,29 +1129,36 @@ static int do_umount(struct vfsmount *mnt, int flags) * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD */ -asmlinkage long sys_umount(char __user * name, int flags) +SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { - struct nameidata nd; + struct path path; int retval; + int lookup_flags = 0; + + if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) + return -EINVAL; + + if (!(flags & UMOUNT_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; - retval = __user_walk(name, LOOKUP_FOLLOW, &nd); + retval = user_path_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; retval = -EINVAL; - if (nd.path.dentry != nd.path.mnt->mnt_root) + if (path.dentry != path.mnt->mnt_root) goto dput_and_out; - if (!check_mnt(nd.path.mnt)) + if (!check_mnt(path.mnt)) goto dput_and_out; retval = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto dput_and_out; - retval = do_umount(nd.path.mnt, flags); + retval = do_umount(path.mnt, flags); dput_and_out: /* we mustn't call path_put() as that would clear mnt_expiry_mark */ - dput(nd.path.dentry); - mntput_no_expire(nd.path.mnt); + 
dput(path.dentry); + mntput_no_expire(path.mnt); out: return retval; } @@ -1152,42 +1168,31 @@ out: /* * The 2.0 compatible umount. No flags. */ -asmlinkage long sys_oldumount(char __user * name) +SYSCALL_DEFINE1(oldumount, char __user *, name) { return sys_umount(name, 0); } #endif -static int mount_is_safe(struct nameidata *nd) +static int mount_is_safe(struct path *path) { if (capable(CAP_SYS_ADMIN)) return 0; return -EPERM; #ifdef notyet - if (S_ISLNK(nd->path.dentry->d_inode->i_mode)) + if (S_ISLNK(path->dentry->d_inode->i_mode)) return -EPERM; - if (nd->path.dentry->d_inode->i_mode & S_ISVTX) { - if (current->uid != nd->path.dentry->d_inode->i_uid) + if (path->dentry->d_inode->i_mode & S_ISVTX) { + if (current_uid() != path->dentry->d_inode->i_uid) return -EPERM; } - if (vfs_permission(nd, MAY_WRITE)) + if (inode_permission(path->dentry->d_inode, MAY_WRITE)) return -EPERM; return 0; #endif } -static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry) -{ - while (1) { - if (d == dentry) - return 1; - if (d == NULL || d == d->d_parent) - return 0; - d = d->d_parent; - } -} - struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, int flag) { @@ -1204,7 +1209,7 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, p = mnt; list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { - if (!lives_below_in_same_fs(r->mnt_mountpoint, dentry)) + if (!is_subdir(r->mnt_mountpoint, dentry)) continue; for (s = r; s; s = next_mnt(s, r)) { @@ -1240,11 +1245,11 @@ Enomem: return NULL; } -struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry) +struct vfsmount *collect_mounts(struct path *path) { struct vfsmount *tree; down_write(&namespace_sem); - tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE); + tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE); up_write(&namespace_sem); return tree; } @@ -1260,6 +1265,21 @@ void drop_collected_mounts(struct vfsmount *mnt) 
release_mounts(&umount_list); } +int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, + struct vfsmount *root) +{ + struct vfsmount *mnt; + int res = f(root, arg); + if (res) + return res; + list_for_each_entry(mnt, &root->mnt_list, mnt_list) { + res = f(mnt, arg); + if (res) + return res; + } + return 0; +} + static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) { struct vfsmount *p; @@ -1368,16 +1388,16 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (err) goto out_cleanup_ids; + spin_lock(&vfsmount_lock); + if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } - - spin_lock(&vfsmount_lock); if (parent_path) { detach_mnt(source_mnt, parent_path); attach_mnt(source_mnt, path); - touch_mnt_namespace(current->nsproxy->mnt_ns); + touch_mnt_namespace(parent_path->mnt->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); commit_tree(source_mnt); @@ -1409,30 +1429,22 @@ static int graft_tree(struct vfsmount *mnt, struct path *path) err = -ENOENT; mutex_lock(&path->dentry->d_inode->i_mutex); - if (IS_DEADDIR(path->dentry->d_inode)) + if (cant_mount(path->dentry)) goto out_unlock; - err = security_sb_check_sb(mnt, path); - if (err) - goto out_unlock; - - err = -ENOENT; - if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry)) + if (!d_unlinked(path->dentry)) err = attach_recursive_mnt(mnt, path, NULL); out_unlock: mutex_unlock(&path->dentry->d_inode->i_mutex); - if (!err) - security_sb_post_addmount(mnt, path); return err; } /* * recursively change the type of the mountpoint. - * noinline this do_mount helper to save do_mount stack space. 
*/ -static noinline int do_change_type(struct nameidata *nd, int flag) +static int do_change_type(struct path *path, int flag) { - struct vfsmount *m, *mnt = nd->path.mnt; + struct vfsmount *m, *mnt = path->mnt; int recurse = flag & MS_REC; int type = flag & ~MS_REC; int err = 0; @@ -1440,7 +1452,7 @@ static noinline int do_change_type(struct nameidata *nd, int flag) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (nd->path.dentry != nd->path.mnt->mnt_root) + if (path->dentry != path->mnt->mnt_root) return -EINVAL; down_write(&namespace_sem); @@ -1462,40 +1474,39 @@ static noinline int do_change_type(struct nameidata *nd, int flag) /* * do loopback mount. - * noinline this do_mount helper to save do_mount stack space. */ -static noinline int do_loopback(struct nameidata *nd, char *old_name, +static int do_loopback(struct path *path, char *old_name, int recurse) { - struct nameidata old_nd; + struct path old_path; struct vfsmount *mnt = NULL; - int err = mount_is_safe(nd); + int err = mount_is_safe(path); if (err) return err; if (!old_name || !*old_name) return -EINVAL; - err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); if (err) return err; down_write(&namespace_sem); err = -EINVAL; - if (IS_MNT_UNBINDABLE(old_nd.path.mnt)) + if (IS_MNT_UNBINDABLE(old_path.mnt)) goto out; - if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt)) + if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) goto out; err = -ENOMEM; if (recurse) - mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, 0); + mnt = copy_tree(old_path.mnt, old_path.dentry, 0); else - mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, 0); + mnt = clone_mnt(old_path.mnt, old_path.dentry, 0); if (!mnt) goto out; - err = graft_tree(mnt, &nd->path); + err = graft_tree(mnt, path); if (err) { LIST_HEAD(umount_list); spin_lock(&vfsmount_lock); @@ -1506,7 +1517,7 @@ static noinline int do_loopback(struct nameidata *nd, char *old_name, out: 
up_write(&namespace_sem); - path_put(&old_nd.path); + path_put(&old_path); return err; } @@ -1531,33 +1542,39 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) * change filesystem flags. dir should be a physical root of filesystem. * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. - * noinline this do_mount helper to save do_mount stack space. */ -static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, +static int do_remount(struct path *path, int flags, int mnt_flags, void *data) { int err; - struct super_block *sb = nd->path.mnt->mnt_sb; + struct super_block *sb = path->mnt->mnt_sb; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!check_mnt(nd->path.mnt)) + if (!check_mnt(path->mnt)) return -EINVAL; - if (nd->path.dentry != nd->path.mnt->mnt_root) + if (path->dentry != path->mnt->mnt_root) return -EINVAL; down_write(&sb->s_umount); if (flags & MS_BIND) - err = change_mount_flags(nd->path.mnt, flags); + err = change_mount_flags(path->mnt, flags); else err = do_remount_sb(sb, flags, data, 0); - if (!err) - nd->path.mnt->mnt_flags = mnt_flags; + if (!err) { + spin_lock(&vfsmount_lock); + mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK; + path->mnt->mnt_flags = mnt_flags; + spin_unlock(&vfsmount_lock); + } up_write(&sb->s_umount); - if (!err) - security_sb_post_remount(nd->path.mnt, flags, data); + if (!err) { + spin_lock(&vfsmount_lock); + touch_mnt_namespace(path->mnt->mnt_ns); + spin_unlock(&vfsmount_lock); + } return err; } @@ -1571,130 +1588,129 @@ static inline int tree_contains_unbindable(struct vfsmount *mnt) return 0; } -/* - * noinline this do_mount helper to save do_mount stack space. 
- */ -static noinline int do_move_mount(struct nameidata *nd, char *old_name) +static int do_move_mount(struct path *path, char *old_name) { - struct nameidata old_nd; - struct path parent_path; + struct path old_path, parent_path; struct vfsmount *p; int err = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!old_name || !*old_name) return -EINVAL; - err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); if (err) return err; down_write(&namespace_sem); - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path.mnt, &nd->path.dentry)) + while (d_mountpoint(path->dentry) && + follow_down(path)) ; err = -EINVAL; - if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt)) + if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) goto out; err = -ENOENT; - mutex_lock(&nd->path.dentry->d_inode->i_mutex); - if (IS_DEADDIR(nd->path.dentry->d_inode)) + mutex_lock(&path->dentry->d_inode->i_mutex); + if (cant_mount(path->dentry)) goto out1; - if (!IS_ROOT(nd->path.dentry) && d_unhashed(nd->path.dentry)) + if (d_unlinked(path->dentry)) goto out1; err = -EINVAL; - if (old_nd.path.dentry != old_nd.path.mnt->mnt_root) + if (old_path.dentry != old_path.mnt->mnt_root) goto out1; - if (old_nd.path.mnt == old_nd.path.mnt->mnt_parent) + if (old_path.mnt == old_path.mnt->mnt_parent) goto out1; - if (S_ISDIR(nd->path.dentry->d_inode->i_mode) != - S_ISDIR(old_nd.path.dentry->d_inode->i_mode)) + if (S_ISDIR(path->dentry->d_inode->i_mode) != + S_ISDIR(old_path.dentry->d_inode->i_mode)) goto out1; /* * Don't move a mount residing in a shared parent. */ - if (old_nd.path.mnt->mnt_parent && - IS_MNT_SHARED(old_nd.path.mnt->mnt_parent)) + if (old_path.mnt->mnt_parent && + IS_MNT_SHARED(old_path.mnt->mnt_parent)) goto out1; /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. 
*/ - if (IS_MNT_SHARED(nd->path.mnt) && - tree_contains_unbindable(old_nd.path.mnt)) + if (IS_MNT_SHARED(path->mnt) && + tree_contains_unbindable(old_path.mnt)) goto out1; err = -ELOOP; - for (p = nd->path.mnt; p->mnt_parent != p; p = p->mnt_parent) - if (p == old_nd.path.mnt) + for (p = path->mnt; p->mnt_parent != p; p = p->mnt_parent) + if (p == old_path.mnt) goto out1; - err = attach_recursive_mnt(old_nd.path.mnt, &nd->path, &parent_path); + err = attach_recursive_mnt(old_path.mnt, path, &parent_path); if (err) goto out1; /* if the mount is moved, it should no longer be expire * automatically */ - list_del_init(&old_nd.path.mnt->mnt_expire); + list_del_init(&old_path.mnt->mnt_expire); out1: - mutex_unlock(&nd->path.dentry->d_inode->i_mutex); + mutex_unlock(&path->dentry->d_inode->i_mutex); out: up_write(&namespace_sem); if (!err) path_put(&parent_path); - path_put(&old_nd.path); + path_put(&old_path); return err; } /* * create a new mount for userspace and request it to be added into the * namespace's tree - * noinline this do_mount helper to save do_mount stack space. */ -static noinline int do_new_mount(struct nameidata *nd, char *type, int flags, +static int do_new_mount(struct path *path, char *type, int flags, int mnt_flags, char *name, void *data) { struct vfsmount *mnt; - if (!type || !memchr(type, 0, PAGE_SIZE)) + if (!type) return -EINVAL; /* we need capabilities... 
*/ if (!capable(CAP_SYS_ADMIN)) return -EPERM; + lock_kernel(); mnt = do_kern_mount(type, flags, name, data); + unlock_kernel(); if (IS_ERR(mnt)) return PTR_ERR(mnt); - return do_add_mount(mnt, nd, mnt_flags, NULL); + return do_add_mount(mnt, path, mnt_flags, NULL); } /* * add a mount into a namespace's mount tree * - provide the option of adding the new mount to an expiration list */ -int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, +int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags, struct list_head *fslist) { int err; + mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); + down_write(&namespace_sem); /* Something was mounted here while we slept */ - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path.mnt, &nd->path.dentry)) + while (d_mountpoint(path->dentry) && + follow_down(path)) ; err = -EINVAL; - if (!check_mnt(nd->path.mnt)) + if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) goto unlock; /* Refuse the same filesystem on the same mount point */ err = -EBUSY; - if (nd->path.mnt->mnt_sb == newmnt->mnt_sb && - nd->path.mnt->mnt_root == nd->path.dentry) + if (path->mnt->mnt_sb == newmnt->mnt_sb && + path->mnt->mnt_root == path->dentry) goto unlock; err = -EINVAL; @@ -1702,7 +1718,7 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, goto unlock; newmnt->mnt_flags = mnt_flags; - if ((err = graft_tree(newmnt, &nd->path))) + if ((err = graft_tree(newmnt, path))) goto unlock; if (fslist) /* add to the specified expiration list */ @@ -1821,8 +1837,8 @@ static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts) while (!list_empty(&graveyard)) { m = list_first_entry(&graveyard, struct vfsmount, mnt_expire); - touch_mnt_namespace(mnt->mnt_ns); - umount_tree(mnt, 1, umounts); + touch_mnt_namespace(m->mnt_ns); + umount_tree(m, 1, umounts); } } } @@ -1888,6 +1904,23 @@ int copy_mount_options(const void __user * data, unsigned long *where) return 0; } +int 
copy_mount_string(const void __user *data, char **where) +{ + char *tmp; + + if (!data) { + *where = NULL; + return 0; + } + + tmp = strndup_user(data, PAGE_SIZE); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + *where = tmp; + return 0; +} + /* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). @@ -1905,7 +1938,7 @@ int copy_mount_options(const void __user * data, unsigned long *where) long do_mount(char *dev_name, char *dir_name, char *type_page, unsigned long flags, void *data_page) { - struct nameidata nd; + struct path path; int retval = 0; int mnt_flags = 0; @@ -1917,12 +1950,24 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) return -EINVAL; - if (dev_name && !memchr(dev_name, 0, PAGE_SIZE)) - return -EINVAL; if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; + /* ... and get the mountpoint */ + retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); + if (retval) + return retval; + + retval = security_sb_mount(dev_name, &path, + type_page, flags, data_page); + if (retval) + goto dput_out; + + /* Default to relatime unless overridden */ + if (!(flags & MS_NOATIME)) + mnt_flags |= MNT_RELATIME; + /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; @@ -1934,60 +1979,61 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, mnt_flags |= MNT_NOATIME; if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; - if (flags & MS_RELATIME) - mnt_flags |= MNT_RELATIME; + if (flags & MS_STRICTATIME) + mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | - MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); - - /* ... 
and get the mountpoint */ - retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd); - if (retval) - return retval; - - retval = security_sb_mount(dev_name, &nd.path, - type_page, flags, data_page); - if (retval) - goto dput_out; + MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | + MS_STRICTATIME); if (flags & MS_REMOUNT) - retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, + retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags & MS_REC); + retval = do_loopback(&path, dev_name, flags & MS_REC); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) - retval = do_change_type(&nd, flags); + retval = do_change_type(&path, flags); else if (flags & MS_MOVE) - retval = do_move_mount(&nd, dev_name); + retval = do_move_mount(&path, dev_name); else - retval = do_new_mount(&nd, type_page, flags, mnt_flags, + retval = do_new_mount(&path, type_page, flags, mnt_flags, dev_name, data_page); dput_out: - path_put(&nd.path); + path_put(&path); return retval; } -/* - * Allocate a new namespace structure and populate it with contents - * copied from the namespace of the passed in task structure. - */ -static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, - struct fs_struct *fs) +static struct mnt_namespace *alloc_mnt_ns(void) { struct mnt_namespace *new_ns; - struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; - struct vfsmount *p, *q; new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return ERR_PTR(-ENOMEM); - atomic_set(&new_ns->count, 1); + new_ns->root = NULL; INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); new_ns->event = 0; + return new_ns; +} + +/* + * Allocate a new namespace structure and populate it with contents + * copied from the namespace of the passed in task structure. 
+ */ +static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, + struct fs_struct *fs) +{ + struct mnt_namespace *new_ns; + struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; + struct vfsmount *p, *q; + + new_ns = alloc_mnt_ns(); + if (IS_ERR(new_ns)) + return new_ns; down_write(&namespace_sem); /* First pass: copy the tree topology */ @@ -1996,7 +2042,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, if (!new_ns->root) { up_write(&namespace_sem); kfree(new_ns); - return ERR_PTR(-ENOMEM);; + return ERR_PTR(-ENOMEM); } spin_lock(&vfsmount_lock); list_add_tail(&new_ns->list, &new_ns->root->mnt_list); @@ -2020,10 +2066,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, pwdmnt = p; fs->pwd.mnt = mntget(q); } - if (p == fs->altroot.mnt) { - altrootmnt = p; - fs->altroot.mnt = mntget(q); - } } p = next_mnt(p, mnt_ns->root); q = next_mnt(q, new_ns->root); @@ -2034,8 +2076,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, mntput(rootmnt); if (pwdmnt) mntput(pwdmnt); - if (altrootmnt) - mntput(altrootmnt); return new_ns; } @@ -2057,106 +2097,63 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, return new_ns; } -asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name, - char __user * type, unsigned long flags, - void __user * data) +/** + * create_mnt_ns - creates a private namespace and adds a root filesystem + * @mnt: pointer to the new root filesystem mountpoint + */ +struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) { - int retval; - unsigned long data_page; - unsigned long type_page; - unsigned long dev_page; - char *dir_page; - - retval = copy_mount_options(type, &type_page); - if (retval < 0) - return retval; - - dir_page = getname(dir_name); - retval = PTR_ERR(dir_page); - if (IS_ERR(dir_page)) - goto out1; - - retval = copy_mount_options(dev_name, &dev_page); - if (retval < 0) - goto out2; - - retval = 
copy_mount_options(data, &data_page); - if (retval < 0) - goto out3; - - lock_kernel(); - retval = do_mount((char *)dev_page, dir_page, (char *)type_page, - flags, (void *)data_page); - unlock_kernel(); - free_page(data_page); + struct mnt_namespace *new_ns; -out3: - free_page(dev_page); -out2: - putname(dir_page); -out1: - free_page(type_page); - return retval; + new_ns = alloc_mnt_ns(); + if (!IS_ERR(new_ns)) { + mnt->mnt_ns = new_ns; + new_ns->root = mnt; + list_add(&new_ns->list, &new_ns->root->mnt_list); + } + return new_ns; } +EXPORT_SYMBOL(create_mnt_ns); -/* - * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. - */ -void set_fs_root(struct fs_struct *fs, struct path *path) +SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, + char __user *, type, unsigned long, flags, void __user *, data) { - struct path old_root; + int ret; + char *kernel_type; + char *kernel_dir; + char *kernel_dev; + unsigned long data_page; - write_lock(&fs->lock); - old_root = fs->root; - fs->root = *path; - path_get(path); - write_unlock(&fs->lock); - if (old_root.dentry) - path_put(&old_root); -} + ret = copy_mount_string(type, &kernel_type); + if (ret < 0) + goto out_type; -/* - * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. 
- */ -void set_fs_pwd(struct fs_struct *fs, struct path *path) -{ - struct path old_pwd; + kernel_dir = getname(dir_name); + if (IS_ERR(kernel_dir)) { + ret = PTR_ERR(kernel_dir); + goto out_dir; + } - write_lock(&fs->lock); - old_pwd = fs->pwd; - fs->pwd = *path; - path_get(path); - write_unlock(&fs->lock); + ret = copy_mount_string(dev_name, &kernel_dev); + if (ret < 0) + goto out_dev; - if (old_pwd.dentry) - path_put(&old_pwd); -} + ret = copy_mount_options(data, &data_page); + if (ret < 0) + goto out_data; -static void chroot_fs_refs(struct path *old_root, struct path *new_root) -{ - struct task_struct *g, *p; - struct fs_struct *fs; + ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags, + (void *) data_page); - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - fs = p->fs; - if (fs) { - atomic_inc(&fs->count); - task_unlock(p); - if (fs->root.dentry == old_root->dentry - && fs->root.mnt == old_root->mnt) - set_fs_root(fs, new_root); - if (fs->pwd.dentry == old_root->dentry - && fs->pwd.mnt == old_root->mnt) - set_fs_pwd(fs, new_root); - put_fs_struct(fs); - } else - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + free_page(data_page); +out_data: + kfree(kernel_dev); +out_dev: + putname(kernel_dir); +out_dir: + kfree(kernel_type); +out_type: + return ret; } /* @@ -2184,32 +2181,30 @@ static void chroot_fs_refs(struct path *old_root, struct path *new_root) * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root * first. 
*/ -asmlinkage long sys_pivot_root(const char __user * new_root, - const char __user * put_old) +SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, + const char __user *, put_old) { struct vfsmount *tmp; - struct nameidata new_nd, old_nd; - struct path parent_path, root_parent, root; + struct path new, old, parent_path, root_parent, root; int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, - &new_nd); + error = user_path_dir(new_root, &new); if (error) goto out0; error = -EINVAL; - if (!check_mnt(new_nd.path.mnt)) + if (!check_mnt(new.mnt)) goto out1; - error = __user_walk(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_nd); + error = user_path_dir(put_old, &old); if (error) goto out1; - error = security_sb_pivotroot(&old_nd.path, &new_nd.path); + error = security_sb_pivotroot(&old, &new); if (error) { - path_put(&old_nd.path); + path_put(&old); goto out1; } @@ -2218,69 +2213,68 @@ asmlinkage long sys_pivot_root(const char __user * new_root, path_get(&current->fs->root); read_unlock(&current->fs->lock); down_write(&namespace_sem); - mutex_lock(&old_nd.path.dentry->d_inode->i_mutex); + mutex_lock(&old.dentry->d_inode->i_mutex); error = -EINVAL; - if (IS_MNT_SHARED(old_nd.path.mnt) || - IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) || + if (IS_MNT_SHARED(old.mnt) || + IS_MNT_SHARED(new.mnt->mnt_parent) || IS_MNT_SHARED(root.mnt->mnt_parent)) goto out2; if (!check_mnt(root.mnt)) goto out2; error = -ENOENT; - if (IS_DEADDIR(new_nd.path.dentry->d_inode)) + if (cant_mount(old.dentry)) goto out2; - if (d_unhashed(new_nd.path.dentry) && !IS_ROOT(new_nd.path.dentry)) + if (d_unlinked(new.dentry)) goto out2; - if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry)) + if (d_unlinked(old.dentry)) goto out2; error = -EBUSY; - if (new_nd.path.mnt == root.mnt || - old_nd.path.mnt == root.mnt) + if (new.mnt == root.mnt || + old.mnt == root.mnt) goto out2; /* loop, on the same file system */ error = -EINVAL; 
if (root.mnt->mnt_root != root.dentry) goto out2; /* not a mountpoint */ if (root.mnt->mnt_parent == root.mnt) goto out2; /* not attached */ - if (new_nd.path.mnt->mnt_root != new_nd.path.dentry) + if (new.mnt->mnt_root != new.dentry) goto out2; /* not a mountpoint */ - if (new_nd.path.mnt->mnt_parent == new_nd.path.mnt) + if (new.mnt->mnt_parent == new.mnt) goto out2; /* not attached */ /* make sure we can reach put_old from new_root */ - tmp = old_nd.path.mnt; + tmp = old.mnt; spin_lock(&vfsmount_lock); - if (tmp != new_nd.path.mnt) { + if (tmp != new.mnt) { for (;;) { if (tmp->mnt_parent == tmp) goto out3; /* already mounted on put_old */ - if (tmp->mnt_parent == new_nd.path.mnt) + if (tmp->mnt_parent == new.mnt) break; tmp = tmp->mnt_parent; } - if (!is_subdir(tmp->mnt_mountpoint, new_nd.path.dentry)) + if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) goto out3; - } else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry)) + } else if (!is_subdir(old.dentry, new.dentry)) goto out3; - detach_mnt(new_nd.path.mnt, &parent_path); + detach_mnt(new.mnt, &parent_path); detach_mnt(root.mnt, &root_parent); /* mount old root on put_old */ - attach_mnt(root.mnt, &old_nd.path); + attach_mnt(root.mnt, &old); /* mount new_root on / */ - attach_mnt(new_nd.path.mnt, &root_parent); + attach_mnt(new.mnt, &root_parent); touch_mnt_namespace(current->nsproxy->mnt_ns); spin_unlock(&vfsmount_lock); - chroot_fs_refs(&root, &new_nd.path); - security_sb_post_pivotroot(&root, &new_nd.path); + chroot_fs_refs(&root, &new); error = 0; path_put(&root_parent); path_put(&parent_path); out2: - mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex); + mutex_unlock(&old.dentry->d_inode->i_mutex); up_write(&namespace_sem); path_put(&root); - path_put(&old_nd.path); + path_put(&old); out1: - path_put(&new_nd.path); + path_put(&new); out0: return error; out3: @@ -2297,16 +2291,9 @@ static void __init init_mount_tree(void) mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); if (IS_ERR(mnt)) 
panic("Can't create rootfs"); - ns = kmalloc(sizeof(*ns), GFP_KERNEL); - if (!ns) + ns = create_mnt_ns(mnt); + if (IS_ERR(ns)) panic("Can't allocate initial namespace"); - atomic_set(&ns->count, 1); - INIT_LIST_HEAD(&ns->list); - init_waitqueue_head(&ns->poll); - ns->event = 0; - list_add(&mnt->mnt_list, &ns->list); - ns->root = mnt; - mnt->mnt_ns = ns; init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); @@ -2341,25 +2328,26 @@ void __init mnt_init(void) err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", - __FUNCTION__, err); + __func__, err); fs_kobj = kobject_create_and_add("fs", NULL); if (!fs_kobj) - printk(KERN_WARNING "%s: kobj create error\n", __FUNCTION__); + printk(KERN_WARNING "%s: kobj create error\n", __func__); init_rootfs(); init_mount_tree(); } -void __put_mnt_ns(struct mnt_namespace *ns) +void put_mnt_ns(struct mnt_namespace *ns) { - struct vfsmount *root = ns->root; LIST_HEAD(umount_list); - ns->root = NULL; - spin_unlock(&vfsmount_lock); + + if (!atomic_dec_and_test(&ns->count)) + return; down_write(&namespace_sem); spin_lock(&vfsmount_lock); - umount_tree(root, 0, &umount_list); + umount_tree(ns->root, 0, &umount_list); spin_unlock(&vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); kfree(ns); } +EXPORT_SYMBOL(put_mnt_ns);