libceph: revamp subs code, switch to SUBSCRIBE2 protocol
author Ilya Dryomov <idryomov@gmail.com>
Tue, 19 Jan 2016 15:19:06 +0000 (16:19 +0100)
committer Ilya Dryomov <idryomov@gmail.com>
Fri, 25 Mar 2016 17:51:38 +0000 (18:51 +0100)
It is currently hard-coded in the mon_client that mdsmap and monmap
subs are continuous, while osdmap sub is always "onetime".  To better
handle full clusters/pools in the osd_client, we need to be able to
issue continuous osdmap subs.  Revamp subs code to allow us to specify
for each sub whether it should be continuous or not.

Although not strictly required for the above, switch to SUBSCRIBE2
protocol while at it, eliminating the ambiguity between a request for
"every map since X" and a request for "just the latest" when we don't
have a map yet (i.e. have epoch 0).  SUBSCRIBE2 feature bit is now
required - it's been supported since pre-argonaut (2010).

Move the "got mdsmap" call to the end of ceph_mdsc_handle_map() - calling
it before we validate the epoch and successfully install the new map
can mess up mon_client sub state.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
fs/ceph/mds_client.c
fs/ceph/super.c
include/linux/ceph/ceph_features.h
include/linux/ceph/ceph_fs.h
include/linux/ceph/mon_client.h
net/ceph/debugfs.c
net/ceph/mon_client.c
net/ceph/osd_client.c

index 911d64d865f176aa4ce28832207abb8651a4fec8..b43399d22e23ec7c49eda6d7e8d211614d38072a 100644 (file)
@@ -3764,7 +3764,6 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
        dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
 
        /* do we need it? */
-       ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
        mutex_lock(&mdsc->mutex);
        if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
                dout("handle_map epoch %u <= our %u\n",
@@ -3791,6 +3790,8 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
        mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
 
        __wake_requests(mdsc, &mdsc->waiting_for_map);
+       ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
+                         mdsc->mdsmap->m_epoch);
 
        mutex_unlock(&mdsc->mutex);
        schedule_delayed(mdsc);
index ca4d5e8457f1e5340330bb225b4da84c88727dc5..c941fd1a8eb806ec0b67c6e816708fb65c85779f 100644 (file)
@@ -530,7 +530,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
                goto fail;
        }
        fsc->client->extra_mon_dispatch = extra_mon_dispatch;
-       fsc->client->monc.want_mdsmap = 1;
+       ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
 
        fsc->mount_options = fsopt;
 
index 15151f3c41202c6b6229475affb654ae0bab272d..ae2f66833762cd7d63bd77eafb9aed13b6ab2308 100644 (file)
@@ -105,6 +105,7 @@ static inline u64 ceph_sanitize_features(u64 features)
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT                \
        (CEPH_FEATURE_NOSRCADDR |               \
+        CEPH_FEATURE_SUBSCRIBE2 |              \
         CEPH_FEATURE_RECONNECT_SEQ |           \
         CEPH_FEATURE_PGID64 |                  \
         CEPH_FEATURE_PGPOOL3 |                 \
@@ -127,6 +128,7 @@ static inline u64 ceph_sanitize_features(u64 features)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
        (CEPH_FEATURE_NOSRCADDR |        \
+        CEPH_FEATURE_SUBSCRIBE2 |       \
         CEPH_FEATURE_RECONNECT_SEQ |    \
         CEPH_FEATURE_PGID64 |           \
         CEPH_FEATURE_PGPOOL3 |          \
index d7d072a25c2717734139e98b01dd2724f2c43844..bf74005eedec75834330ce43162f767385f064ed 100644 (file)
@@ -198,8 +198,8 @@ struct ceph_client_mount {
 #define CEPH_SUBSCRIBE_ONETIME    1  /* i want only 1 update after have */
 
 struct ceph_mon_subscribe_item {
-       __le64 have_version;    __le64 have;
-       __u8 onetime;
+       __le64 start;
+       __u8 flags;
 } __attribute__ ((packed));
 
 struct ceph_mon_subscribe_ack {
index 81810dc21f061ce1acf5129f5dea5e6f7f758e65..8b2d2f0b659e6064a2d03f915c9f4a949a122a7a 100644 (file)
@@ -68,7 +68,8 @@ struct ceph_mon_client {
 
        bool hunting;
        int cur_mon;                       /* last monitor i contacted */
-       unsigned long sub_sent, sub_renew_after;
+       unsigned long sub_renew_after;
+       unsigned long sub_renew_sent;
        struct ceph_connection con;
 
        /* pending generic requests */
@@ -76,10 +77,12 @@ struct ceph_mon_client {
        int num_generic_requests;
        u64 last_tid;
 
-       /* mds/osd map */
-       int want_mdsmap;
-       int want_next_osdmap; /* 1 = want, 2 = want+asked */
-       u32 have_osdmap, have_mdsmap;
+       /* subs, indexed with CEPH_SUB_* */
+       struct {
+               struct ceph_mon_subscribe_item item;
+               bool want;
+               u32 have; /* epoch */
+       } subs[3];
 
 #ifdef CONFIG_DEBUG_FS
        struct dentry *debugfs_file;
@@ -93,14 +96,23 @@ extern int ceph_monmap_contains(struct ceph_monmap *m,
 extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
 extern void ceph_monc_stop(struct ceph_mon_client *monc);
 
+enum {
+       CEPH_SUB_MDSMAP = 0,
+       CEPH_SUB_MONMAP,
+       CEPH_SUB_OSDMAP,
+};
+
+extern const char *ceph_sub_str[];
+
 /*
  * The model here is to indicate that we need a new map of at least
- * epoch @want, and also call in when we receive a map.  We will
+ * epoch @epoch, and also call in when we receive a map.  We will
  * periodically rerequest the map from the monitor cluster until we
  * get what we want.
  */
-extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
-extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
+bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
+                       bool continuous);
+void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
 
 extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
 extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
index 593dc2eabcc803342187c1e57355f31d76b17f49..b902fbc7863ef893222401a5f2b9bdcc8db87431 100644 (file)
@@ -112,15 +112,20 @@ static int monc_show(struct seq_file *s, void *p)
        struct ceph_mon_generic_request *req;
        struct ceph_mon_client *monc = &client->monc;
        struct rb_node *rp;
+       int i;
 
        mutex_lock(&monc->mutex);
 
-       if (monc->have_mdsmap)
-               seq_printf(s, "have mdsmap %u\n", (unsigned int)monc->have_mdsmap);
-       if (monc->have_osdmap)
-               seq_printf(s, "have osdmap %u\n", (unsigned int)monc->have_osdmap);
-       if (monc->want_next_osdmap)
-               seq_printf(s, "want next osdmap\n");
+       for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
+               seq_printf(s, "have %s %u", ceph_sub_str[i],
+                          monc->subs[i].have);
+               if (monc->subs[i].want)
+                       seq_printf(s, " want %llu%s",
+                                  le64_to_cpu(monc->subs[i].item.start),
+                                  (monc->subs[i].item.flags &
+                                       CEPH_SUBSCRIBE_ONETIME ?  "" : "+"));
+               seq_putc(s, '\n');
+       }
 
        for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
                __u16 op;
index d6af6ca26e8d841997eaf1c7e114f71e2af94cbf..89029916315c4f2078dfe45f4dda7f58e26ed8f8 100644 (file)
@@ -140,9 +140,8 @@ static int __open_session(struct ceph_mon_client *monc)
                monc->cur_mon = r % monc->monmap->num_mon;
                dout("open_session num=%d r=%d -> mon%d\n",
                     monc->monmap->num_mon, r, monc->cur_mon);
-               monc->sub_sent = 0;
                monc->sub_renew_after = jiffies;  /* i.e., expired */
-               monc->want_next_osdmap = !!monc->want_next_osdmap;
+               monc->sub_renew_sent = 0;
 
                dout("open_session mon%d opening\n", monc->cur_mon);
                ceph_con_open(&monc->con,
@@ -189,59 +188,58 @@ static void __schedule_delayed(struct ceph_mon_client *monc)
                              round_jiffies_relative(delay));
 }
 
+const char *ceph_sub_str[] = {
+       [CEPH_SUB_MDSMAP] = "mdsmap",
+       [CEPH_SUB_MONMAP] = "monmap",
+       [CEPH_SUB_OSDMAP] = "osdmap",
+};
+
 /*
- * Send subscribe request for mdsmap and/or osdmap.
+ * Send subscribe request for one or more maps, according to
+ * monc->subs.
  */
 static void __send_subscribe(struct ceph_mon_client *monc)
 {
-       dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
-            (unsigned int)monc->sub_sent, __sub_expired(monc),
-            monc->want_next_osdmap);
-       if ((__sub_expired(monc) && !monc->sub_sent) ||
-           monc->want_next_osdmap == 1) {
-               struct ceph_msg *msg = monc->m_subscribe;
-               struct ceph_mon_subscribe_item *i;
-               void *p, *end;
-               int num;
-
-               p = msg->front.iov_base;
-               end = p + msg->front_alloc_len;
-
-               num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
-               ceph_encode_32(&p, num);
-
-               if (monc->want_next_osdmap) {
-                       dout("__send_subscribe to 'osdmap' %u\n",
-                            (unsigned int)monc->have_osdmap);
-                       ceph_encode_string(&p, end, "osdmap", 6);
-                       i = p;
-                       i->have = cpu_to_le64(monc->have_osdmap);
-                       i->onetime = 1;
-                       p += sizeof(*i);
-                       monc->want_next_osdmap = 2;  /* requested */
-               }
-               if (monc->want_mdsmap) {
-                       dout("__send_subscribe to 'mdsmap' %u+\n",
-                            (unsigned int)monc->have_mdsmap);
-                       ceph_encode_string(&p, end, "mdsmap", 6);
-                       i = p;
-                       i->have = cpu_to_le64(monc->have_mdsmap);
-                       i->onetime = 0;
-                       p += sizeof(*i);
-               }
-               ceph_encode_string(&p, end, "monmap", 6);
-               i = p;
-               i->have = 0;
-               i->onetime = 0;
-               p += sizeof(*i);
-
-               msg->front.iov_len = p - msg->front.iov_base;
-               msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-               ceph_msg_revoke(msg);
-               ceph_con_send(&monc->con, ceph_msg_get(msg));
-
-               monc->sub_sent = jiffies | 1;  /* never 0 */
+       struct ceph_msg *msg = monc->m_subscribe;
+       void *p = msg->front.iov_base;
+       void *const end = p + msg->front_alloc_len;
+       int num = 0;
+       int i;
+
+       dout("%s sent %lu\n", __func__, monc->sub_renew_sent);
+
+       BUG_ON(monc->cur_mon < 0);
+
+       if (!monc->sub_renew_sent)
+               monc->sub_renew_sent = jiffies | 1; /* never 0 */
+
+       msg->hdr.version = cpu_to_le16(2);
+
+       for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
+               if (monc->subs[i].want)
+                       num++;
        }
+       BUG_ON(num < 1); /* monmap sub is always there */
+       ceph_encode_32(&p, num);
+       for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
+               const char *s = ceph_sub_str[i];
+
+               if (!monc->subs[i].want)
+                       continue;
+
+               dout("%s %s start %llu flags 0x%x\n", __func__, s,
+                    le64_to_cpu(monc->subs[i].item.start),
+                    monc->subs[i].item.flags);
+               ceph_encode_string(&p, end, s, strlen(s));
+               memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
+               p += sizeof(monc->subs[i].item);
+       }
+
+       BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19));
+       msg->front.iov_len = p - msg->front.iov_base;
+       msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+       ceph_msg_revoke(msg);
+       ceph_con_send(&monc->con, ceph_msg_get(msg));
 }
 
 static void handle_subscribe_ack(struct ceph_mon_client *monc,
@@ -255,9 +253,16 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
        seconds = le32_to_cpu(h->duration);
 
        mutex_lock(&monc->mutex);
-       dout("handle_subscribe_ack after %d seconds\n", seconds);
-       monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
-       monc->sub_sent = 0;
+       if (monc->sub_renew_sent) {
+               monc->sub_renew_after = monc->sub_renew_sent +
+                                           (seconds >> 1) * HZ - 1;
+               dout("%s sent %lu duration %d renew after %lu\n", __func__,
+                    monc->sub_renew_sent, seconds, monc->sub_renew_after);
+               monc->sub_renew_sent = 0;
+       } else {
+               dout("%s sent %lu renew after %lu, ignoring\n", __func__,
+                    monc->sub_renew_sent, monc->sub_renew_after);
+       }
        mutex_unlock(&monc->mutex);
        return;
 bad:
@@ -266,36 +271,82 @@ bad:
 }
 
 /*
- * Keep track of which maps we have
+ * Register interest in a map
+ *
+ * @sub: one of CEPH_SUB_*
+ * @epoch: X for "every map since X", or 0 for "just the latest"
  */
-int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
+static bool __ceph_monc_want_map(struct ceph_mon_client *monc, int sub,
+                                u32 epoch, bool continuous)
+{
+       __le64 start = cpu_to_le64(epoch);
+       u8 flags = !continuous ? CEPH_SUBSCRIBE_ONETIME : 0;
+
+       dout("%s %s epoch %u continuous %d\n", __func__, ceph_sub_str[sub],
+            epoch, continuous);
+
+       if (monc->subs[sub].want &&
+           monc->subs[sub].item.start == start &&
+           monc->subs[sub].item.flags == flags)
+               return false;
+
+       monc->subs[sub].item.start = start;
+       monc->subs[sub].item.flags = flags;
+       monc->subs[sub].want = true;
+
+       return true;
+}
+
+bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
+                       bool continuous)
 {
+       bool need_request;
+
        mutex_lock(&monc->mutex);
-       monc->have_mdsmap = got;
+       need_request = __ceph_monc_want_map(monc, sub, epoch, continuous);
        mutex_unlock(&monc->mutex);
-       return 0;
+
+       return need_request;
 }
-EXPORT_SYMBOL(ceph_monc_got_mdsmap);
+EXPORT_SYMBOL(ceph_monc_want_map);
 
-int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
+/*
+ * Keep track of which maps we have
+ *
+ * @sub: one of CEPH_SUB_*
+ */
+static void __ceph_monc_got_map(struct ceph_mon_client *monc, int sub,
+                               u32 epoch)
+{
+       dout("%s %s epoch %u\n", __func__, ceph_sub_str[sub], epoch);
+
+       if (monc->subs[sub].want) {
+               if (monc->subs[sub].item.flags & CEPH_SUBSCRIBE_ONETIME)
+                       monc->subs[sub].want = false;
+               else
+                       monc->subs[sub].item.start = cpu_to_le64(epoch + 1);
+       }
+
+       monc->subs[sub].have = epoch;
+}
+
+void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
 {
        mutex_lock(&monc->mutex);
-       monc->have_osdmap = got;
-       monc->want_next_osdmap = 0;
+       __ceph_monc_got_map(monc, sub, epoch);
        mutex_unlock(&monc->mutex);
-       return 0;
 }
+EXPORT_SYMBOL(ceph_monc_got_map);
 
 /*
  * Register interest in the next osdmap
  */
 void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
 {
-       dout("request_next_osdmap have %u\n", monc->have_osdmap);
+       dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
        mutex_lock(&monc->mutex);
-       if (!monc->want_next_osdmap)
-               monc->want_next_osdmap = 1;
-       if (monc->want_next_osdmap < 2)
+       if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
+                                monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
                __send_subscribe(monc);
        mutex_unlock(&monc->mutex);
 }
@@ -314,15 +365,15 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
        long ret;
 
        mutex_lock(&monc->mutex);
-       while (monc->have_osdmap < epoch) {
+       while (monc->subs[CEPH_SUB_OSDMAP].have < epoch) {
                mutex_unlock(&monc->mutex);
 
                if (timeout && time_after_eq(jiffies, started + timeout))
                        return -ETIMEDOUT;
 
                ret = wait_event_interruptible_timeout(monc->client->auth_wq,
-                                               monc->have_osdmap >= epoch,
-                                               ceph_timeout_jiffies(timeout));
+                                    monc->subs[CEPH_SUB_OSDMAP].have >= epoch,
+                                    ceph_timeout_jiffies(timeout));
                if (ret < 0)
                        return ret;
 
@@ -335,11 +386,14 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
 EXPORT_SYMBOL(ceph_monc_wait_osdmap);
 
 /*
- *
+ * Open a session with a random monitor.  Request monmap and osdmap,
+ * which are waited upon in __ceph_open_session().
  */
 int ceph_monc_open_session(struct ceph_mon_client *monc)
 {
        mutex_lock(&monc->mutex);
+       __ceph_monc_want_map(monc, CEPH_SUB_MONMAP, 0, true);
+       __ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 0, false);
        __open_session(monc);
        __schedule_delayed(monc);
        mutex_unlock(&monc->mutex);
@@ -375,6 +429,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
        client->monc.monmap = monmap;
        kfree(old);
 
+       __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch);
        client->have_fsid = true;
 
 out:
@@ -725,8 +780,14 @@ static void delayed_work(struct work_struct *work)
                        __validate_auth(monc);
                }
 
-               if (is_auth)
-                       __send_subscribe(monc);
+               if (is_auth) {
+                       unsigned long now = jiffies;
+
+                       dout("%s renew subs? now %lu renew after %lu\n",
+                            __func__, now, monc->sub_renew_after);
+                       if (time_after_eq(now, monc->sub_renew_after))
+                               __send_subscribe(monc);
+               }
        }
        __schedule_delayed(monc);
        mutex_unlock(&monc->mutex);
@@ -815,16 +876,13 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
        monc->cur_mon = -1;
        monc->hunting = true;
        monc->sub_renew_after = jiffies;
-       monc->sub_sent = 0;
+       monc->sub_renew_sent = 0;
 
        INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
        monc->generic_request_tree = RB_ROOT;
        monc->num_generic_requests = 0;
        monc->last_tid = 0;
 
-       monc->have_mdsmap = 0;
-       monc->have_osdmap = 0;
-       monc->want_next_osdmap = 1;
        return 0;
 
 out_auth_reply:
index 5bc053778feddd0e35fe73480317be25345da61c..3309112e23d05475694c2bbae300ddb58866034b 100644 (file)
@@ -2187,7 +2187,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
                goto bad;
 done:
        downgrade_write(&osdc->map_sem);
-       ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
+       ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+                         osdc->osdmap->epoch);
 
        /*
         * subscribe to subsequent osdmap updates if full to ensure