RCU'd vfsmounts

author Al Viro <viro@zeniv.linux.org.uk>
Mon, 30 Sep 2013 02:06:07 +0000 (22:06 -0400)

committer Al Viro <viro@zeniv.linux.org.uk>
Sat, 9 Nov 2013 05:16:19 +0000 (00:16 -0500)
diff --git a/fs/dcache.c b/fs/dcache.c

index eb0978da1bd44fc8e41abf292e2a6cc0bec9ed9c..aafa2a146434fefa48ab35eb1c4cd3d149eaa010 100644 (file)
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2887,24 +2887,28 @@ static int prepend_path(const struct path *path,
         struct vfsmount *vfsmnt = path->mnt;
         struct mount *mnt = real_mount(vfsmnt);
         int error = 0;
-       unsigned seq = 0;
+       unsigned seq, m_seq = 0;
         char *bptr;
         int blen;
  
-       br_read_lock(&vfsmount_lock);
         rcu_read_lock();
+restart_mnt:
+       read_seqbegin_or_lock(&mount_lock, &m_seq);
+       seq = 0;
  restart:
         bptr = *buffer;
         blen = *buflen;
+       error = 0;
         read_seqbegin_or_lock(&rename_lock, &seq);
         while (dentry != root->dentry || vfsmnt != root->mnt) {
                 struct dentry * parent;
  
                 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
+                       struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
                         /* Global root? */
-                       if (mnt_has_parent(mnt)) {
-                               dentry = mnt->mnt_mountpoint;
-                               mnt = mnt->mnt_parent;
+                       if (mnt != parent) {
+                               dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
+                               mnt = parent;
                                 vfsmnt = &mnt->mnt;
                                 continue;
                         }
@@ -2938,7 +2942,11 @@ restart:
                 goto restart;
         }
         done_seqretry(&rename_lock, seq);
-       br_read_unlock(&vfsmount_lock);
+       if (need_seqretry(&mount_lock, m_seq)) {
+               m_seq = 1;
+               goto restart_mnt;
+       }
+       done_seqretry(&mount_lock, m_seq);
  
         if (error >= 0 && bptr == *buffer) {
                 if (--blen < 0)
diff --git a/fs/mount.h b/fs/mount.h

index f0866076de6ebf889505166d910c2fddcf18b6fb..d64c594be6c47baf29a70ffe63671f307cba4b7e 100644 (file)
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -1,7 +1,6 @@
  #include <linux/mount.h>
  #include <linux/seq_file.h>
  #include <linux/poll.h>
-#include <linux/lglock.h>
  
  struct mnt_namespace {
         atomic_t                count;
@@ -30,6 +29,7 @@ struct mount {
         struct mount *mnt_parent;
         struct dentry *mnt_mountpoint;
         struct vfsmount mnt;
+       struct rcu_head mnt_rcu;
  #ifdef CONFIG_SMP
         struct mnt_pcp __percpu *mnt_pcp;
  #else
@@ -80,21 +80,23 @@ static inline int is_mounted(struct vfsmount *mnt)
  extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
  extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
  
+extern bool legitimize_mnt(struct vfsmount *, unsigned);
+
  static inline void get_mnt_ns(struct mnt_namespace *ns)
  {
         atomic_inc(&ns->count);
  }
  
-extern struct lglock vfsmount_lock;
+extern seqlock_t mount_lock;
  
  static inline void lock_mount_hash(void)
  {
-       br_write_lock(&vfsmount_lock);
+       write_seqlock(&mount_lock);
  }
  
  static inline void unlock_mount_hash(void)
  {
-       br_write_unlock(&vfsmount_lock);
+       write_sequnlock(&mount_lock);
  }
  
  struct proc_mounts {
diff --git a/fs/namei.c b/fs/namei.c

index 1f844fbfce7286f4b65e260a3a6603baaf329477..cb0ebae07e529228e9ddaaa59999fc758e2f221e 100644 (file)
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -484,14 +484,12 @@ EXPORT_SYMBOL(path_put);
  
  static inline void lock_rcu_walk(void)
  {
-       br_read_lock(&vfsmount_lock);
         rcu_read_lock();
  }
  
  static inline void unlock_rcu_walk(void)
  {
         rcu_read_unlock();
-       br_read_unlock(&vfsmount_lock);
  }
  
  /**
@@ -512,26 +510,23 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
         BUG_ON(!(nd->flags & LOOKUP_RCU));
  
         /*
-        * Get a reference to the parent first: we're
-        * going to make "path_put(nd->path)" valid in
-        * non-RCU context for "terminate_walk()".
-        *
-        * If this doesn't work, return immediately with
-        * RCU walking still active (and then we will do
-        * the RCU walk cleanup in terminate_walk()).
+        * After legitimizing the bastards, terminate_walk()
+        * will do the right thing for non-RCU mode, and all our
+        * subsequent exit cases should rcu_read_unlock()
+        * before returning.  Do vfsmount first; if dentry
+        * can't be legitimized, just set nd->path.dentry to NULL
+        * and rely on dput(NULL) being a no-op.
          */
-       if (!lockref_get_not_dead(&parent->d_lockref))
+       if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
                 return -ECHILD;
-
-       /*
-        * After the mntget(), we terminate_walk() will do
-        * the right thing for non-RCU mode, and all our
-        * subsequent exit cases should unlock_rcu_walk()
-        * before returning.
-        */
-       mntget(nd->path.mnt);
         nd->flags &= ~LOOKUP_RCU;
  
+       if (!lockref_get_not_dead(&parent->d_lockref)) {
+               nd->path.dentry = NULL; 
+               unlock_rcu_walk();
+               return -ECHILD;
+       }
+
         /*
          * For a negative lookup, the lookup sequence point is the parents
          * sequence point, and it only needs to revalidate the parent dentry.
@@ -608,16 +603,21 @@ static int complete_walk(struct nameidata *nd)
                 if (!(nd->flags & LOOKUP_ROOT))
                         nd->root.mnt = NULL;
  
+               if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
+                       unlock_rcu_walk();
+                       return -ECHILD;
+               }
                 if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
                         unlock_rcu_walk();
+                       mntput(nd->path.mnt);
                         return -ECHILD;
                 }
                 if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
                         unlock_rcu_walk();
                         dput(dentry);
+                       mntput(nd->path.mnt);
                         return -ECHILD;
                 }
-               mntget(nd->path.mnt);
                 unlock_rcu_walk();
         }
  
@@ -909,15 +909,15 @@ int follow_up(struct path *path)
         struct mount *parent;
         struct dentry *mountpoint;
  
-       br_read_lock(&vfsmount_lock);
+       read_seqlock_excl(&mount_lock);
         parent = mnt->mnt_parent;
         if (parent == mnt) {
-               br_read_unlock(&vfsmount_lock);
+               read_sequnlock_excl(&mount_lock);
                 return 0;
         }
         mntget(&parent->mnt);
         mountpoint = dget(mnt->mnt_mountpoint);
-       br_read_unlock(&vfsmount_lock);
+       read_sequnlock_excl(&mount_lock);
         dput(path->dentry);
         path->dentry = mountpoint;
         mntput(path->mnt);
@@ -1048,8 +1048,8 @@ static int follow_managed(struct path *path, unsigned flags)
  
                         /* Something is mounted on this dentry in another
                          * namespace and/or whatever was mounted there in this
-                        * namespace got unmounted before we managed to get the
-                        * vfsmount_lock */
+                        * namespace got unmounted before lookup_mnt() could
+                        * get it */
                 }
  
                 /* Handle an automount point */
@@ -1864,6 +1864,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
                 if (flags & LOOKUP_RCU) {
                         lock_rcu_walk();
                         nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+                       nd->m_seq = read_seqbegin(&mount_lock);
                 } else {
                         path_get(&nd->path);
                 }
@@ -1872,6 +1873,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
  
         nd->root.mnt = NULL;
  
+       nd->m_seq = read_seqbegin(&mount_lock);
         if (*name=='/') {
                 if (flags & LOOKUP_RCU) {
                         lock_rcu_walk();
diff --git a/fs/namespace.c b/fs/namespace.c

index 500202ce10dbf0a7b10ebd0b5d418fe1e52470c6..ac2ce8a766e1a9c6250cdac616f9a951dddee45e 100644 (file)
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -53,7 +53,7 @@ EXPORT_SYMBOL_GPL(fs_kobj);
   * It should be taken for write in all cases where the vfsmount
   * tree or hash is modified or when a vfsmount structure is modified.
   */
-DEFINE_BRLOCK(vfsmount_lock);
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
  
  static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
  {
@@ -547,16 +547,38 @@ static void free_vfsmnt(struct mount *mnt)
         kmem_cache_free(mnt_cache, mnt);
  }
  
+/* call under rcu_read_lock */
+bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+{
+       struct mount *mnt;
+       if (read_seqretry(&mount_lock, seq))
+               return false;
+       if (bastard == NULL)
+               return true;
+       mnt = real_mount(bastard);
+       mnt_add_count(mnt, 1);
+       if (likely(!read_seqretry(&mount_lock, seq)))
+               return true;
+       if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
+               mnt_add_count(mnt, -1);
+               return false;
+       }
+       rcu_read_unlock();
+       mntput(bastard);
+       rcu_read_lock();
+       return false;
+}
+
  /*
   * find the first mount at @dentry on vfsmount @mnt.
- * vfsmount_lock must be held for read or write.
+ * call under rcu_read_lock()
   */
  struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
  {
         struct list_head *head = mount_hashtable + hash(mnt, dentry);
         struct mount *p;
  
-       list_for_each_entry(p, head, mnt_hash)
+       list_for_each_entry_rcu(p, head, mnt_hash)
                 if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
                         return p;
         return NULL;
@@ -564,7 +586,7 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
  
  /*
   * find the last mount at @dentry on vfsmount @mnt.
- * vfsmount_lock must be held for read or write.
+ * mount_lock must be held.
   */
  struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
  {
@@ -596,17 +618,17 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
  struct vfsmount *lookup_mnt(struct path *path)
  {
         struct mount *child_mnt;
+       struct vfsmount *m;
+       unsigned seq;
  
-       br_read_lock(&vfsmount_lock);
-       child_mnt = __lookup_mnt(path->mnt, path->dentry);
-       if (child_mnt) {
-               mnt_add_count(child_mnt, 1);
-               br_read_unlock(&vfsmount_lock);
-               return &child_mnt->mnt;
-       } else {
-               br_read_unlock(&vfsmount_lock);
-               return NULL;
-       }
+       rcu_read_lock();
+       do {
+               seq = read_seqbegin(&mount_lock);
+               child_mnt = __lookup_mnt(path->mnt, path->dentry);
+               m = child_mnt ? &child_mnt->mnt : NULL;
+       } while (!legitimize_mnt(m, seq));
+       rcu_read_unlock();
+       return m;
  }
  
  static struct mountpoint *new_mountpoint(struct dentry *dentry)
@@ -874,38 +896,46 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
         return ERR_PTR(err);
  }
  
+static void delayed_free(struct rcu_head *head)
+{
+       struct mount *mnt = container_of(head, struct mount, mnt_rcu);
+       kfree(mnt->mnt_devname);
+#ifdef CONFIG_SMP
+       free_percpu(mnt->mnt_pcp);
+#endif
+       kmem_cache_free(mnt_cache, mnt);
+}
+
  static void mntput_no_expire(struct mount *mnt)
  {
  put_again:
-#ifdef CONFIG_SMP
-       br_read_lock(&vfsmount_lock);
-       if (likely(mnt->mnt_ns)) {
-               /* shouldn't be the last one */
-               mnt_add_count(mnt, -1);
-               br_read_unlock(&vfsmount_lock);
+       rcu_read_lock();
+       mnt_add_count(mnt, -1);
+       if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
+               rcu_read_unlock();
                 return;
         }
-       br_read_unlock(&vfsmount_lock);
-
         lock_mount_hash();
-       mnt_add_count(mnt, -1);
         if (mnt_get_count(mnt)) {
+               rcu_read_unlock();
                 unlock_mount_hash();
                 return;
         }
-#else
-       mnt_add_count(mnt, -1);
-       if (likely(mnt_get_count(mnt)))
-               return;
-       lock_mount_hash();
-#endif
         if (unlikely(mnt->mnt_pinned)) {
                 mnt_add_count(mnt, mnt->mnt_pinned + 1);
                 mnt->mnt_pinned = 0;
+               rcu_read_unlock();
                 unlock_mount_hash();
                 acct_auto_close_mnt(&mnt->mnt);
                 goto put_again;
         }
+       if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
+               rcu_read_unlock();
+               unlock_mount_hash();
+               return;
+       }
+       mnt->mnt.mnt_flags |= MNT_DOOMED;
+       rcu_read_unlock();
  
         list_del(&mnt->mnt_instance);
         unlock_mount_hash();
@@ -924,7 +954,8 @@ put_again:
         fsnotify_vfsmount_delete(&mnt->mnt);
         dput(mnt->mnt.mnt_root);
         deactivate_super(mnt->mnt.mnt_sb);
-       free_vfsmnt(mnt);
+       mnt_free_id(mnt);
+       call_rcu(&mnt->mnt_rcu, delayed_free);
  }
  
  void mntput(struct vfsmount *mnt)
@@ -1137,6 +1168,8 @@ static void namespace_unlock(void)
         list_splice_init(&unmounted, &head);
         up_write(&namespace_sem);
  
+       synchronize_rcu();
+
         while (!list_empty(&head)) {
                 mnt = list_first_entry(&head, struct mount, mnt_hash);
                 list_del_init(&mnt->mnt_hash);
@@ -1152,10 +1185,13 @@ static inline void namespace_lock(void)
  }
  
  /*
- * vfsmount lock must be held for write
+ * mount_lock must be held
   * namespace_sem must be held for write
+ * how = 0 => just this tree, don't propagate
+ * how = 1 => propagate; we know that nobody else has reference to any victims
+ * how = 2 => lazy umount
   */
-void umount_tree(struct mount *mnt, int propagate)
+void umount_tree(struct mount *mnt, int how)
  {
         LIST_HEAD(tmp_list);
         struct mount *p;
@@ -1163,7 +1199,7 @@ void umount_tree(struct mount *mnt, int propagate)
         for (p = mnt; p; p = next_mnt(p, mnt))
                 list_move(&p->mnt_hash, &tmp_list);
  
-       if (propagate)
+       if (how)
                 propagate_umount(&tmp_list);
  
         list_for_each_entry(p, &tmp_list, mnt_hash) {
@@ -1171,6 +1207,8 @@ void umount_tree(struct mount *mnt, int propagate)
                 list_del_init(&p->mnt_list);
                 __touch_mnt_namespace(p->mnt_ns);
                 p->mnt_ns = NULL;
+               if (how < 2)
+                       p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
                 list_del_init(&p->mnt_child);
                 if (mnt_has_parent(p)) {
                         put_mountpoint(p->mnt_mp);
@@ -1262,14 +1300,18 @@ static int do_umount(struct mount *mnt, int flags)
         lock_mount_hash();
         event++;
  
-       if (!(flags & MNT_DETACH))
-               shrink_submounts(mnt);
-
-       retval = -EBUSY;
-       if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
+       if (flags & MNT_DETACH) {
                 if (!list_empty(&mnt->mnt_list))
-                       umount_tree(mnt, 1);
+                       umount_tree(mnt, 2);
                 retval = 0;
+       } else {
+               shrink_submounts(mnt);
+               retval = -EBUSY;
+               if (!propagate_mount_busy(mnt, 2)) {
+                       if (!list_empty(&mnt->mnt_list))
+                               umount_tree(mnt, 1);
+                       retval = 0;
+               }
         }
         unlock_mount_hash();
         namespace_unlock();
@@ -1955,7 +1997,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
         struct mount *parent;
         int err;
  
-       mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
+       mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT);
  
         mp = lock_mount(path);
         if (IS_ERR(mp))
@@ -2172,7 +2214,7 @@ resume:
   * process a list of expirable mountpoints with the intent of discarding any
   * submounts of a specific parent mountpoint
   *
- * vfsmount_lock must be held for write
+ * mount_lock must be held for write
   */
  static void shrink_submounts(struct mount *mnt)
  {
@@ -2558,7 +2600,7 @@ out_type:
  /*
   * Return true if path is reachable from root
   *
- * namespace_sem or vfsmount_lock is held
+ * namespace_sem or mount_lock is held
   */
  bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
                          const struct path *root)
@@ -2573,9 +2615,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
  int path_is_under(struct path *path1, struct path *path2)
  {
         int res;
-       br_read_lock(&vfsmount_lock);
+       read_seqlock_excl(&mount_lock);
         res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
-       br_read_unlock(&vfsmount_lock);
+       read_sequnlock_excl(&mount_lock);
         return res;
  }
  EXPORT_SYMBOL(path_is_under);
@@ -2748,8 +2790,6 @@ void __init mnt_init(void)
         for (u = 0; u < HASH_SIZE; u++)
                 INIT_LIST_HEAD(&mountpoint_hashtable[u]);
  
-       br_lock_init(&vfsmount_lock);
-
         err = sysfs_init();
         if (err)
                 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2788,9 +2828,8 @@ void kern_unmount(struct vfsmount *mnt)
  {
         /* release long term mount so mount point can be released */
         if (!IS_ERR_OR_NULL(mnt)) {
-               lock_mount_hash();
                 real_mount(mnt)->mnt_ns = NULL;
-               unlock_mount_hash();
+               synchronize_rcu();      /* yecchhh... */
                 mntput(mnt);
         }
  }
diff --git a/include/linux/mount.h b/include/linux/mount.h

index 38cd98f112a0e4ccb62e32d94a2d1065ce76b2dd..371d346fa270dbfe7d8d3ac4a7849ab9cf6b5f77 100644 (file)
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -49,6 +49,8 @@ struct mnt_namespace;
  
  #define MNT_LOCK_READONLY      0x400000
  #define MNT_LOCKED             0x800000
+#define MNT_DOOMED             0x1000000
+#define MNT_SYNC_UMOUNT                0x2000000
  
  struct vfsmount {
         struct dentry *mnt_root;        /* root of the mounted tree */
diff --git a/include/linux/namei.h b/include/linux/namei.h

index 8e47bc7a1665b8b7e6ca06392584328aab702d82..492de72560fab98591ca7ca617380b7d581e5c13 100644 (file)
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -16,7 +16,7 @@ struct nameidata {
         struct path     root;
         struct inode    *inode; /* path.dentry.d_inode */
         unsigned int    flags;
-       unsigned        seq;
+       unsigned        seq, m_seq;
         int             last_type;
         unsigned        depth;
         char *saved_names[MAX_NESTED_LINKS + 1];
author	Al Viro <viro@zeniv.linux.org.uk>
	Mon, 30 Sep 2013 02:06:07 +0000 (22:06 -0400)
committer	Al Viro <viro@zeniv.linux.org.uk>
	Sat, 9 Nov 2013 05:16:19 +0000 (00:16 -0500)
fs/dcache.c		patch | blob | blame | history
fs/mount.h		patch | blob | blame | history
fs/namei.c		patch | blob | blame | history
fs/namespace.c		patch | blob | blame | history
include/linux/mount.h		patch | blob | blame | history
include/linux/namei.h		patch | blob | blame | history