ceph: reset osd connections after fault
author: Sage Weil <sage@newdream.net>
Mon, 15 Feb 2010 20:11:51 +0000 (12:11 -0800)
committer: Sage Weil <sage@newdream.net>
Mon, 15 Feb 2010 20:11:51 +0000 (12:11 -0800)
A single osd connection fault (e.g. tcp disconnect) wasn't
reopening the connection, which caused all current and future
requests for that osd to hang.

Signed-off-by: Sage Weil <sage@newdream.net>
fs/ceph/osd_client.c

index 7f8a26fdcc2c06dfb0eaa7739c9593c0b575e7dd..fa0f73703954b4f1c8a568d841155cda99ced260 100644 (file)
@@ -369,7 +369,6 @@ static void osd_reset(struct ceph_connection *con)
                return;
        dout("osd_reset osd%d\n", osd->o_osd);
        osdc = osd->o_osdc;
-       osd->o_incarnation++;
        down_read(&osdc->map_sem);
        kick_requests(osdc, osd);
        up_read(&osdc->map_sem);
@@ -921,7 +920,9 @@ static void kick_requests(struct ceph_osd_client *osdc,
 
        dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
        mutex_lock(&osdc->request_mutex);
-       if (!kickosd) {
+       if (kickosd) {
+               __reset_osd(osdc, kickosd);
+       } else {
                for (p = rb_first(&osdc->osds); p; p = n) {
                        struct ceph_osd *osd =
                                rb_entry(p, struct ceph_osd, o_node);