ceph: make sure syncfs flushes all cap snaps
author Yan, Zheng <zyan@redhat.com>
Tue, 5 May 2015 13:22:13 +0000 (21:22 +0800)
committer Ilya Dryomov <idryomov@gmail.com>
Thu, 25 Jun 2015 08:49:29 +0000 (11:49 +0300)
Signed-off-by: Yan, Zheng <zyan@redhat.com>
fs/ceph/caps.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/snap.c

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 900c05fd77d86db82d7152b7c4d731186807fa83..bbd969e16a016cb5a46e0a8402711100b19469a6 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1259,14 +1259,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
  * asynchronously back to the MDS once sync writes complete and dirty
  * data is written out.
  *
- * Unless @again is true, skip cap_snaps that were already sent to
+ * Unless @kick is true, skip cap_snaps that were already sent to
  * the MDS (i.e., during this session).
  *
  * Called under i_ceph_lock.  Takes s_mutex as needed.
  */
 void __ceph_flush_snaps(struct ceph_inode_info *ci,
                        struct ceph_mds_session **psession,
-                       int again)
+                       int kick)
                __releases(ci->i_ceph_lock)
                __acquires(ci->i_ceph_lock)
 {
@@ -1307,7 +1307,7 @@ retry:
                }
 
                /* only flush each capsnap once */
-               if (!again && !list_empty(&capsnap->flushing_item)) {
+               if (!kick && !list_empty(&capsnap->flushing_item)) {
                        dout("already flushed %p, skipping\n", capsnap);
                        continue;
                }
@@ -1317,6 +1317,9 @@ retry:
 
                if (session && session->s_mds != mds) {
                        dout("oops, wrong session %p mutex\n", session);
+                       if (kick)
+                               goto out;
+
                        mutex_unlock(&session->s_mutex);
                        ceph_put_mds_session(session);
                        session = NULL;
@@ -1342,10 +1345,9 @@ retry:
 
                capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
                atomic_inc(&capsnap->nref);
-               if (!list_empty(&capsnap->flushing_item))
-                       list_del_init(&capsnap->flushing_item);
-               list_add_tail(&capsnap->flushing_item,
-                             &session->s_cap_snaps_flushing);
+               if (list_empty(&capsnap->flushing_item))
+                       list_add_tail(&capsnap->flushing_item,
+                                     &session->s_cap_snaps_flushing);
                spin_unlock(&ci->i_ceph_lock);
 
                dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
@@ -2876,6 +2878,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
                                     struct ceph_mds_session *session)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        u64 follows = le64_to_cpu(m->snap_follows);
        struct ceph_cap_snap *capsnap;
        int drop = 0;
@@ -2899,6 +2902,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
                        list_del(&capsnap->ci_item);
                        list_del(&capsnap->flushing_item);
                        ceph_put_cap_snap(capsnap);
+                       wake_up_all(&mdsc->cap_flushing_wq);
                        drop = 1;
                        break;
                } else {
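diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 88010f9a254da69d08c0801674afa42c627a9ea0..2bb9264b9225260cc1e1f4ee39e4d274b41d67e1 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c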
@@ -1488,17 +1488,22 @@ out_unlocked:
        return err;
 }
 
-static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
+static int check_cap_flush(struct ceph_inode_info *ci,
+                          u64 want_flush_seq, u64 want_snap_seq)
 {
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       int ret;
+       int ret1 = 1, ret2 = 1;
        spin_lock(&ci->i_ceph_lock);
-       if (ci->i_flushing_caps)
-               ret = ci->i_cap_flush_seq >= want_flush_seq;
-       else
-               ret = 1;
+       if (want_flush_seq > 0 && ci->i_flushing_caps)
+               ret1 = ci->i_cap_flush_seq >= want_flush_seq;
+
+       if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
+               struct ceph_cap_snap *capsnap =
+                       list_first_entry(&ci->i_cap_snaps,
+                                        struct ceph_cap_snap, ci_item);
+               ret2 = capsnap->follows >= want_snap_seq;
+       }
        spin_unlock(&ci->i_ceph_lock);
-       return ret;
+       return ret1 && ret2;
 }
 
 /*
@@ -1506,45 +1511,72 @@ static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
  *
  * returns true if we've flushed through want_flush_seq
  */
-static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+static void wait_caps_flush(struct ceph_mds_client *mdsc,
+                           u64 want_flush_seq, u64 want_snap_seq)
 {
        int mds;
 
        dout("check_cap_flush want %lld\n", want_flush_seq);
        mutex_lock(&mdsc->mutex);
-       for (mds = 0; mds < mdsc->max_sessions; mds++) {
+       for (mds = 0; mds < mdsc->max_sessions; ) {
                struct ceph_mds_session *session = mdsc->sessions[mds];
-               struct inode *inode = NULL;
+               struct inode *inode1 = NULL, *inode2 = NULL;
 
-               if (!session)
+               if (!session) {
+                       mds++;
                        continue;
+               }
                get_session(session);
                mutex_unlock(&mdsc->mutex);
 
                mutex_lock(&session->s_mutex);
                if (!list_empty(&session->s_cap_flushing)) {
                        struct ceph_inode_info *ci =
-                               list_entry(session->s_cap_flushing.next,
-                                          struct ceph_inode_info,
-                                          i_flushing_item);
+                               list_first_entry(&session->s_cap_flushing,
+                                                struct ceph_inode_info,
+                                                i_flushing_item);
 
-                       if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) {
+                       if (!check_cap_flush(ci, want_flush_seq, 0)) {
                                dout("check_cap_flush still flushing %p "
                                     "seq %lld <= %lld to mds%d\n",
                                     &ci->vfs_inode, ci->i_cap_flush_seq,
-                                    want_flush_seq, session->s_mds);
-                               inode = igrab(&ci->vfs_inode);
+                                    want_flush_seq, mds);
+                               inode1 = igrab(&ci->vfs_inode);
+                       }
+               }
+               if (!list_empty(&session->s_cap_snaps_flushing)) {
+                       struct ceph_cap_snap *capsnap =
+                               list_first_entry(&session->s_cap_snaps_flushing,
+                                                struct ceph_cap_snap,
+                                                flushing_item);
+                       struct ceph_inode_info *ci = capsnap->ci;
+                       if (!check_cap_flush(ci, 0, want_snap_seq)) {
+                               dout("check_cap_flush still flushing snap %p "
+                                    "follows %lld <= %lld to mds%d\n",
+                                    &ci->vfs_inode, capsnap->follows,
+                                    want_snap_seq, mds);
+                               inode2 = igrab(&ci->vfs_inode);
                        }
                }
                mutex_unlock(&session->s_mutex);
                ceph_put_mds_session(session);
 
-               if (inode) {
+               if (inode1) {
                        wait_event(mdsc->cap_flushing_wq,
-                                  check_cap_flush(inode, want_flush_seq));
-                       iput(inode);
+                                  check_cap_flush(ceph_inode(inode1),
+                                                  want_flush_seq, 0));
+                       iput(inode1);
+               }
+               if (inode2) {
+                       wait_event(mdsc->cap_flushing_wq,
+                                  check_cap_flush(ceph_inode(inode2),
+                                                  0, want_snap_seq));
+                       iput(inode2);
                }
 
+               if (!inode1 && !inode2)
+                       mds++;
+
                mutex_lock(&mdsc->mutex);
        }
 
@@ -3391,6 +3423,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
        atomic_set(&mdsc->num_sessions, 0);
        mdsc->max_sessions = 0;
        mdsc->stopping = 0;
+       mdsc->last_snap_seq = 0;
        init_rwsem(&mdsc->snap_rwsem);
        mdsc->snap_realms = RB_ROOT;
        INIT_LIST_HEAD(&mdsc->snap_empty);
@@ -3517,7 +3550,7 @@ restart:
 
 void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
-       u64 want_tid, want_flush;
+       u64 want_tid, want_flush, want_snap;
 
        if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
                return;
@@ -3532,10 +3565,15 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
        want_flush = mdsc->cap_flush_seq;
        spin_unlock(&mdsc->cap_dirty_lock);
 
-       dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
+       down_read(&mdsc->snap_rwsem);
+       want_snap = mdsc->last_snap_seq;
+       up_read(&mdsc->snap_rwsem);
+
+       dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
+            want_tid, want_flush, want_snap);
 
        wait_unsafe_requests(mdsc, want_tid);
-       wait_caps_flush(mdsc, want_flush);
+       wait_caps_flush(mdsc, want_flush, want_snap);
 }
 
 /*
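diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d474141c034ab9e3059fa00bbc623a022e59705f..bf24d88cfeb2bc50a3690e717fd8b708acc21468 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h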
@@ -290,6 +290,7 @@ struct ceph_mds_client {
         * references (implying they contain no inodes with caps) that
         * should be destroyed.
         */
+       u64                     last_snap_seq;
        struct rw_semaphore     snap_rwsem;
        struct rb_root          snap_realms;
        struct list_head        snap_empty;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index ba708017d60bbec0322f204c3fff5aff6fbdc1ce..233d906aec02b7c4508fd2488908bdb95a130aa4 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -730,6 +730,8 @@ more:
 
                /* queue realm for cap_snap creation */
                list_add(&realm->dirty_item, &dirty_realms);
+               if (realm->seq > mdsc->last_snap_seq)
+                       mdsc->last_snap_seq = realm->seq;
 
                invalidate = 1;
        } else if (!realm->cached_context) {