ceph: improve reference tracking for snaprealm
authorYan, Zheng <zyan@redhat.com>
Tue, 23 Dec 2014 07:30:54 +0000 (15:30 +0800)
committerIlya Dryomov <idryomov@gmail.com>
Thu, 19 Feb 2015 10:31:38 +0000 (13:31 +0300)
When snaprealm is created, its initial reference count is zero.
But in some rare cases, the newly created snaprealm is not referenced
by anyone. This causes snaprealm with zero reference count not freed.

The fix is set reference count of newly snaprealm to 1. The reference
is return the function who requests to create the snaprealm. When the
function finishes its job, it releases the reference.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
fs/ceph/caps.c
fs/ceph/mds_client.c
fs/ceph/snap.c
fs/ceph/super.h

index d0618e8412fd1e7ba96866bc8c148c7166224d48..8ed1192606d9cd95d97c04bf74704d8082b70586 100644 (file)
@@ -577,7 +577,6 @@ void ceph_add_cap(struct inode *inode,
                struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
                                                               realmino);
                if (realm) {
-                       ceph_get_snap_realm(mdsc, realm);
                        spin_lock(&realm->inodes_with_caps_lock);
                        ci->i_snap_realm = realm;
                        list_add(&ci->i_snap_realm_item,
@@ -2447,13 +2446,13 @@ static void invalidate_aliases(struct inode *inode)
  */
 static void handle_cap_grant(struct ceph_mds_client *mdsc,
                             struct inode *inode, struct ceph_mds_caps *grant,
-                            void *snaptrace, int snaptrace_len,
                             u64 inline_version,
                             void *inline_data, int inline_len,
                             struct ceph_buffer *xattr_buf,
                             struct ceph_mds_session *session,
                             struct ceph_cap *cap, int issued)
        __releases(ci->i_ceph_lock)
+       __releases(mdsc->snap_rwsem)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        int mds = session->s_mds;
@@ -2654,10 +2653,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
        spin_unlock(&ci->i_ceph_lock);
 
        if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
-               down_write(&mdsc->snap_rwsem);
-               ceph_update_snap_trace(mdsc, snaptrace,
-                                      snaptrace + snaptrace_len, false);
-               downgrade_write(&mdsc->snap_rwsem);
                kick_flushing_inode_caps(mdsc, session, inode);
                up_read(&mdsc->snap_rwsem);
                if (newcaps & ~issued)
@@ -3067,6 +3062,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        struct ceph_cap *cap;
        struct ceph_mds_caps *h;
        struct ceph_mds_cap_peer *peer = NULL;
+       struct ceph_snap_realm *realm;
        int mds = session->s_mds;
        int op, issued;
        u32 seq, mseq;
@@ -3168,11 +3164,23 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                goto done_unlocked;
 
        case CEPH_CAP_OP_IMPORT:
+               realm = NULL;
+               if (snaptrace_len) {
+                       down_write(&mdsc->snap_rwsem);
+                       ceph_update_snap_trace(mdsc, snaptrace,
+                                              snaptrace + snaptrace_len,
+                                              false, &realm);
+                       downgrade_write(&mdsc->snap_rwsem);
+               } else {
+                       down_read(&mdsc->snap_rwsem);
+               }
                handle_cap_import(mdsc, inode, h, peer, session,
                                  &cap, &issued);
-               handle_cap_grant(mdsc, inode, h,  snaptrace, snaptrace_len,
+               handle_cap_grant(mdsc, inode, h,
                                 inline_version, inline_data, inline_len,
                                 msg->middle, session, cap, issued);
+               if (realm)
+                       ceph_put_snap_realm(mdsc, realm);
                goto done_unlocked;
        }
 
@@ -3192,7 +3200,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        case CEPH_CAP_OP_GRANT:
                __ceph_caps_issued(ci, &issued);
                issued |= __ceph_caps_dirty(ci);
-               handle_cap_grant(mdsc, inode, h, NULL, 0,
+               handle_cap_grant(mdsc, inode, h,
                                 inline_version, inline_data, inline_len,
                                 msg->middle, session, cap, issued);
                goto done_unlocked;
index c6c33b411a2f7f7679af20038bcecf5cad5ed4a6..85c67ae03e46481ba5610b5bb24f96fe80029355 100644 (file)
@@ -2286,6 +2286,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        struct ceph_mds_request *req;
        struct ceph_mds_reply_head *head = msg->front.iov_base;
        struct ceph_mds_reply_info_parsed *rinfo;  /* parsed reply info */
+       struct ceph_snap_realm *realm;
        u64 tid;
        int err, result;
        int mds = session->s_mds;
@@ -2401,11 +2402,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        }
 
        /* snap trace */
+       realm = NULL;
        if (rinfo->snapblob_len) {
                down_write(&mdsc->snap_rwsem);
                ceph_update_snap_trace(mdsc, rinfo->snapblob,
-                              rinfo->snapblob + rinfo->snapblob_len,
-                              le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
+                               rinfo->snapblob + rinfo->snapblob_len,
+                               le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
+                               &realm);
                downgrade_write(&mdsc->snap_rwsem);
        } else {
                down_read(&mdsc->snap_rwsem);
@@ -2423,6 +2426,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        mutex_unlock(&req->r_fill_mutex);
 
        up_read(&mdsc->snap_rwsem);
+       if (realm)
+               ceph_put_snap_realm(mdsc, realm);
 out_err:
        mutex_lock(&mdsc->mutex);
        if (!req->r_aborted) {
index ce35fbd4ba5d3fef3f5a05abc314df300fa4ff49..a97e39f09ba683349bb5f97e44f0d229b3a88936 100644 (file)
@@ -70,13 +70,11 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
         * safe.  we do need to protect against concurrent empty list
         * additions, however.
         */
-       if (atomic_read(&realm->nref) == 0) {
+       if (atomic_inc_return(&realm->nref) == 1) {
                spin_lock(&mdsc->snap_empty_lock);
                list_del_init(&realm->empty_item);
                spin_unlock(&mdsc->snap_empty_lock);
        }
-
-       atomic_inc(&realm->nref);
 }
 
 static void __insert_snap_realm(struct rb_root *root,
@@ -116,7 +114,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
        if (!realm)
                return ERR_PTR(-ENOMEM);
 
-       atomic_set(&realm->nref, 0);    /* tree does not take a ref */
+       atomic_set(&realm->nref, 1);    /* for caller */
        realm->ino = ino;
        INIT_LIST_HEAD(&realm->children);
        INIT_LIST_HEAD(&realm->child_item);
@@ -134,8 +132,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
  *
  * caller must hold snap_rwsem for write.
  */
-struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
-                                              u64 ino)
+static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
+                                                  u64 ino)
 {
        struct rb_node *n = mdsc->snap_realms.rb_node;
        struct ceph_snap_realm *r;
@@ -154,6 +152,16 @@ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
        return NULL;
 }
 
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+                                              u64 ino)
+{
+       struct ceph_snap_realm *r;
+       r = __lookup_snap_realm(mdsc, ino);
+       if (r)
+               ceph_get_snap_realm(mdsc, r);
+       return r;
+}
+
 static void __put_snap_realm(struct ceph_mds_client *mdsc,
                             struct ceph_snap_realm *realm);
 
@@ -273,7 +281,6 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
        }
        realm->parent_ino = parentino;
        realm->parent = parent;
-       ceph_get_snap_realm(mdsc, parent);
        list_add(&realm->child_item, &parent->children);
        return 1;
 }
@@ -631,12 +638,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
  * Caller must hold snap_rwsem for write.
  */
 int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
-                          void *p, void *e, bool deletion)
+                          void *p, void *e, bool deletion,
+                          struct ceph_snap_realm **realm_ret)
 {
        struct ceph_mds_snap_realm *ri;    /* encoded */
        __le64 *snaps;                     /* encoded */
        __le64 *prior_parent_snaps;        /* encoded */
-       struct ceph_snap_realm *realm;
+       struct ceph_snap_realm *realm = NULL;
+       struct ceph_snap_realm *first_realm = NULL;
        int invalidate = 0;
        int err = -ENOMEM;
        LIST_HEAD(dirty_realms);
@@ -704,13 +713,18 @@ more:
        dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
             realm, invalidate, p, e);
 
-       if (p < e)
-               goto more;
-
        /* invalidate when we reach the _end_ (root) of the trace */
-       if (invalidate)
+       if (invalidate && p >= e)
                rebuild_snap_realms(realm);
 
+       if (!first_realm)
+               first_realm = realm;
+       else
+               ceph_put_snap_realm(mdsc, realm);
+
+       if (p < e)
+               goto more;
+
        /*
         * queue cap snaps _after_ we've built the new snap contexts,
         * so that i_head_snapc can be set appropriately.
@@ -721,12 +735,21 @@ more:
                queue_realm_cap_snaps(realm);
        }
 
+       if (realm_ret)
+               *realm_ret = first_realm;
+       else
+               ceph_put_snap_realm(mdsc, first_realm);
+
        __cleanup_empty_realms(mdsc);
        return 0;
 
 bad:
        err = -EINVAL;
 fail:
+       if (realm && !IS_ERR(realm))
+               ceph_put_snap_realm(mdsc, realm);
+       if (first_realm)
+               ceph_put_snap_realm(mdsc, first_realm);
        pr_err("update_snap_trace error %d\n", err);
        return err;
 }
@@ -844,7 +867,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
                        if (IS_ERR(realm))
                                goto out;
                }
-               ceph_get_snap_realm(mdsc, realm);
 
                dout("splitting snap_realm %llx %p\n", realm->ino, realm);
                for (i = 0; i < num_split_inos; i++) {
@@ -905,7 +927,7 @@ skip_inode:
                /* we may have taken some of the old realm's children. */
                for (i = 0; i < num_split_realms; i++) {
                        struct ceph_snap_realm *child =
-                               ceph_lookup_snap_realm(mdsc,
+                               __lookup_snap_realm(mdsc,
                                           le64_to_cpu(split_realms[i]));
                        if (!child)
                                continue;
@@ -918,7 +940,7 @@ skip_inode:
         * snap, we can avoid queueing cap_snaps.
         */
        ceph_update_snap_trace(mdsc, p, e,
-                              op == CEPH_SNAP_OP_DESTROY);
+                              op == CEPH_SNAP_OP_DESTROY, NULL);
 
        if (op == CEPH_SNAP_OP_SPLIT)
                /* we took a reference when we created the realm, above */
index e1aa32d0759d12c3709fb3d66c03982756d55599..72bc05a73b69a5b09bcf8b18b8f01eb9b76f1444 100644 (file)
@@ -693,7 +693,8 @@ extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
 extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
                                struct ceph_snap_realm *realm);
 extern int ceph_update_snap_trace(struct ceph_mds_client *m,
-                                 void *p, void *e, bool deletion);
+                                 void *p, void *e, bool deletion,
+                                 struct ceph_snap_realm **realm_ret);
 extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
                             struct ceph_mds_session *session,
                             struct ceph_msg *msg);