ceph: choose readdir frag based on previous readdir reply
authorYan, Zheng <zyan@redhat.com>
Mon, 24 Apr 2017 03:56:50 +0000 (11:56 +0800)
committerIlya Dryomov <idryomov@gmail.com>
Thu, 4 May 2017 07:19:24 +0000 (09:19 +0200)
The dirfragtree is lazily updated, it's not always accurate. Infinite
loops happens in following circumstance.

- client send request to read frag A
- frag A has been fragmented into frag B and C. So mds fills the reply
  with contents of frag B
- client wants to read next frag C. ceph_choose_frag(frag value of C)
  return frag A.

The fix is using previous readdir reply to calculate next readdir frag
when possible.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
fs/ceph/dir.c

index ae61cdf7d4891530d6a05cde73825f3b6b9b6481..e071d23f61481bf23fc4407d72ea75c9d1ec2430 100644 (file)
@@ -294,7 +294,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
        struct ceph_mds_client *mdsc = fsc->mdsc;
        int i;
        int err;
-       u32 ftype;
+       unsigned frag = -1;
        struct ceph_mds_reply_info_parsed *rinfo;
 
        dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
@@ -341,7 +341,6 @@ more:
        /* do we have the correct frag content buffered? */
        if (need_send_readdir(fi, ctx->pos)) {
                struct ceph_mds_request *req;
-               unsigned frag;
                int op = ceph_snap(inode) == CEPH_SNAPDIR ?
                        CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
 
@@ -352,8 +351,11 @@ more:
                }
 
                if (is_hash_order(ctx->pos)) {
-                       frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
-                                               NULL, NULL);
+                       /* fragtree isn't always accurate. choose frag
+                        * based on previous reply when possible. */
+                       if (frag == (unsigned)-1)
+                               frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
+                                                       NULL, NULL);
                } else {
                        frag = fpos_frag(ctx->pos);
                }
@@ -480,6 +482,7 @@ more:
                struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
                struct ceph_vino vino;
                ino_t ino;
+               u32 ftype;
 
                BUG_ON(rde->offset < ctx->pos);
 
@@ -502,15 +505,17 @@ more:
                ctx->pos++;
        }
 
+       ceph_mdsc_put_request(fi->last_readdir);
+       fi->last_readdir = NULL;
+
        if (fi->next_offset > 2) {
-               ceph_mdsc_put_request(fi->last_readdir);
-               fi->last_readdir = NULL;
+               frag = fi->frag;
                goto more;
        }
 
        /* more frags? */
        if (!ceph_frag_is_rightmost(fi->frag)) {
-               unsigned frag = ceph_frag_next(fi->frag);
+               frag = ceph_frag_next(fi->frag);
                if (is_hash_order(ctx->pos)) {
                        loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
                                                        fi->next_offset, true);