ceph: resubmit requests on pg mapping change (not just primary change)
authorSage Weil <sage@newdream.net>
Mon, 10 May 2010 17:24:48 +0000 (10:24 -0700)
committerSage Weil <sage@newdream.net>
Tue, 11 May 2010 16:53:56 +0000 (09:53 -0700)
OSD requests need to be resubmitted on any pg mapping change, not just when
the pg primary changes.  Resending only when the primary changes results in
occasional 'hung' requests during osd cluster recovery or rebalancing.

Signed-off-by: Sage Weil <sage@newdream.net>
fs/ceph/osd_client.c
fs/ceph/osd_client.h
fs/ceph/osdmap.c
fs/ceph/osdmap.h
fs/ceph/rados.h

index 8128082a028eb2fe40601e058b5b47ef443b7076..3514f71ff85f79a866fb57699ade4f831d02d859 100644 (file)
@@ -565,7 +565,8 @@ static int __map_osds(struct ceph_osd_client *osdc,
 {
        struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
        struct ceph_pg pgid;
-       int o = -1;
+       int acting[CEPH_PG_MAX_SIZE];
+       int o = -1, num = 0;
        int err;
 
        dout("map_osds %p tid %lld\n", req, req->r_tid);
@@ -576,10 +577,16 @@ static int __map_osds(struct ceph_osd_client *osdc,
        pgid = reqhead->layout.ol_pgid;
        req->r_pgid = pgid;
 
-       o = ceph_calc_pg_primary(osdc->osdmap, pgid);
+       err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
+       if (err > 0) {
+               o = acting[0];
+               num = err;
+       }
 
        if ((req->r_osd && req->r_osd->o_osd == o &&
-            req->r_sent >= req->r_osd->o_incarnation) ||
+            req->r_sent >= req->r_osd->o_incarnation &&
+            req->r_num_pg_osds == num &&
+            memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
            (req->r_osd == NULL && o == -1))
                return 0;  /* no change */
 
@@ -587,6 +594,10 @@ static int __map_osds(struct ceph_osd_client *osdc,
             req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
             req->r_osd ? req->r_osd->o_osd : -1);
 
+       /* record full pg acting set */
+       memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
+       req->r_num_pg_osds = num;
+
        if (req->r_osd) {
                __cancel_request(req);
                list_del_init(&req->r_osd_item);
@@ -612,7 +623,7 @@ static int __map_osds(struct ceph_osd_client *osdc,
                __remove_osd_from_lru(req->r_osd);
                list_add(&req->r_osd_item, &req->r_osd->o_requests);
        }
-       err = 1;   /* osd changed */
+       err = 1;   /* osd or pg changed */
 
 out:
        return err;
index c5191d62f243c342c2129dc764ac1bde1915f518..ce776989ef6a76d73d1475ce2335ce558b653e2d 100644 (file)
@@ -48,6 +48,8 @@ struct ceph_osd_request {
        struct list_head r_osd_item;
        struct ceph_osd *r_osd;
        struct ceph_pg   r_pgid;
+       int              r_pg_osds[CEPH_PG_MAX_SIZE];
+       int              r_num_pg_osds;
 
        struct ceph_connection *r_con_filling_msg;
 
index 2e2c15eed82ad31fa985409100dbe07389b9df94..cfdd8f4388b7a44a0419e34ead12e9368b35b935 100644 (file)
@@ -1040,13 +1040,34 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
        return osds;
 }
 
+/*
+ * Return acting set for given pgid.
+ */
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+                       int *acting)
+{
+       int rawosds[CEPH_PG_MAX_SIZE], *osds;
+       int i, o, num = CEPH_PG_MAX_SIZE;
+
+       osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+       if (!osds)
+               return -1;
+
+       /* primary is first up osd */
+       o = 0;
+       for (i = 0; i < num; i++)
+               if (ceph_osd_is_up(osdmap, osds[i]))
+                       acting[o++] = osds[i];
+       return o;
+}
+
 /*
  * Return primary osd for given pgid, or -1 if none.
  */
 int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
 {
-       int rawosds[10], *osds;
-       int i, num = ARRAY_SIZE(rawosds);
+       int rawosds[CEPH_PG_MAX_SIZE], *osds;
+       int i, num = CEPH_PG_MAX_SIZE;
 
        osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
        if (!osds)
@@ -1054,9 +1075,7 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
 
        /* primary is first up osd */
        for (i = 0; i < num; i++)
-               if (ceph_osd_is_up(osdmap, osds[i])) {
+               if (ceph_osd_is_up(osdmap, osds[i]))
                        return osds[i];
-                       break;
-               }
        return -1;
 }
index 8bc9f1e4f562f7187393feb6bb278564931137aa..970b547e510dbd0edb8ba8c2ec95c45932be6106 100644 (file)
@@ -120,6 +120,8 @@ extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
                                   const char *oid,
                                   struct ceph_file_layout *fl,
                                   struct ceph_osdmap *osdmap);
+extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+                              int *acting);
 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
                                struct ceph_pg pgid);
 
index a1fc1d017b585d01ba68861bf6518cc0b1c2059a..fd56451a871f4b506263f6299feeff889cd6389c 100644 (file)
@@ -58,6 +58,7 @@ struct ceph_timespec {
 #define CEPH_PG_LAYOUT_LINEAR 2
 #define CEPH_PG_LAYOUT_HYBRID 3
 
+#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
 
 /*
  * placement group.